Coverage for src/toolbox_pyspark/datetime.py: 100%

78 statements  


1# ============================================================================ # 

2# # 

3# Title : Datetime # 

4# Purpose : Fixing column names that contain datetime data, adding # 

5# conversions to local datetimes, and splitting a column into # 

6# its date and time components. # 

7# # 

8# ============================================================================ # 

9 

10 

11# ---------------------------------------------------------------------------- # 

12# # 

13# Overview #### 

14# # 

15# ---------------------------------------------------------------------------- # 

16 

17 

18# ---------------------------------------------------------------------------- # 

19# Description #### 

20# ---------------------------------------------------------------------------- # 

21 

22 

23""" 

24!!! note "Summary" 

25 The `datetime` module is used for fixing column names that contain datetime data, adding conversions to local datetimes, and splitting a column into its date and time components. 
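
A minimal end-to-end sketch of the typical flow (illustrative only; it assumes an active Spark session and a `df` containing `*date` timestamp columns plus a `TIMEZONE_LOCATION` column):

```{.py .python linenums="1" title="Typical usage"}
>>> from toolbox_pyspark.datetime import (
...     rename_datetime_columns,
...     add_local_datetime_columns,
...     split_datetime_columns,
... )
>>>
>>> df = rename_datetime_columns(df, "all")  # `*date` -> `*dateTIME`
>>> df = add_local_datetime_columns(df)  # adds `*_LOCAL` columns
>>> df = split_datetime_columns(df)  # adds `*_DATE` and `*_TIME` columns
```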

26""" 

27 

28 

29# ---------------------------------------------------------------------------- # 

30# # 

31# Setup #### 

32# # 

33# ---------------------------------------------------------------------------- # 

34 

35 

36# ---------------------------------------------------------------------------- # 

37# Imports #### 

38# ---------------------------------------------------------------------------- # 

39 

40 

41# ## Python StdLib Imports ---- 

42from typing import Optional, Union 

43 

44# ## Python Third Party Imports ---- 

45from pyspark.sql import Column, DataFrame as psDataFrame, functions as F 

46from toolbox_python.checkers import is_all_in, is_in, is_type 

47from toolbox_python.collection_types import str_collection, str_list 

48from typeguard import typechecked 

49 

50# ## Local First Party Imports ---- 

51from toolbox_pyspark.checks import assert_column_exists, assert_columns_exists 

52from toolbox_pyspark.columns import get_columns 

53 

54 

55# ---------------------------------------------------------------------------- # 

56# Exports #### 

57# ---------------------------------------------------------------------------- # 

58 

59 

60__all__: str_list = [ 

61 "rename_datetime_column", 

62 "rename_datetime_columns", 

63 "add_local_datetime_column", 

64 "add_local_datetime_columns", 

65 "split_datetime_column", 

66 "split_datetime_columns", 

67] 

68 

69 

70# ---------------------------------------------------------------------------- # 

71# # 

72# Functions #### 

73# # 

74# ---------------------------------------------------------------------------- # 

75 

76 

77# ---------------------------------------------------------------------------- # 

78# Renaming #### 

79# ---------------------------------------------------------------------------- # 

80 

81 

82@typechecked 

83def rename_datetime_column( 

84 dataframe: psDataFrame, 

85 column: str, 

86) -> psDataFrame: 

87 """ 

88 !!! note "Summary" 

89 For a given column in a DataFrame, if there is no existing column with the same name plus `TIME` appended to the end, then rename the column to append `TIME` to it. 

90 

91 Params: 

92 dataframe (psDataFrame): 

93 The DataFrame to update. 

94 column (str): 

95 The column to check. 

96 

97 Raises: 

98 TypeError: 

99 If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator. 

100 ColumnDoesNotExistError: 

101 If the `#!py column` does not exist within `#!py dataframe.columns`. 

102 

103 Returns: 

104 (psDataFrame): 

105 The updated DataFrame. 

106 

107 ???+ example "Examples" 

108 

109 ```{.py .python linenums="1" title="Set up"} 

110 >>> # Imports 

111 >>> import pandas as pd 

112 >>> from pyspark.sql import SparkSession 

113 >>> from toolbox_pyspark.datetime import rename_datetime_column 

114 >>> 

115 >>> # Instantiate Spark 

116 >>> spark = SparkSession.builder.getOrCreate() 

117 >>> 

118 >>> # Create data 

119 >>> df = spark.createDataFrame( 

120 ... pd.DataFrame( 

121 ... { 

122 ... "a": [1, 2, 3, 4], 

123 ... "b": ["a", "b", "c", "d"], 

124 ... "c_date": pd.date_range(start="2022-01-01", periods=4, freq="h"), 

125 ... "d_date": pd.date_range(start="2022-02-01", periods=4, freq="h"), 

126 ... } 

127 ... ) 

128 ... ) 

129 >>> 

130 >>> # Check 

131 >>> df.show() 

132 ``` 

133 <div class="result" markdown> 

134 ```{.txt .text title="Terminal"} 

135 +---+---+---------------------+---------------------+ 

136 | a | b | c_date | d_date | 

137 +---+---+---------------------+---------------------+ 

138 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 

139 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 

140 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 

141 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 

142 +---+---+---------------------+---------------------+ 

143 ``` 

144 </div> 

145 

146 ```{.py .python linenums="1" title="Example 1: Update column"} 

147 >>> rename_datetime_column(df, "c_date").show() 

148 ``` 

149 <div class="result" markdown> 

150 ```{.txt .text title="Terminal"} 

151 +---+---+---------------------+---------------------+ 

152 | a | b | c_dateTIME | d_date | 

153 +---+---+---------------------+---------------------+ 

154 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 

155 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 

156 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 

157 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 

158 +---+---+---------------------+---------------------+ 

159 ``` 

160 !!! success "Conclusion: Successfully renamed column." 

161 </div> 

162 

163 ```{.py .python linenums="1" title="Example 2: Missing column"} 

164 >>> rename_datetime_column(df, "fff") 

165 ``` 

166 <div class="result" markdown> 

167 ```{.txt .text title="Terminal"} 

168 ColumnDoesNotExistError: Column "fff" does not exist in "dataframe". 

169 Try one of: ["a", "b", "c_date", "d_date"]. 

170 ``` 

171 !!! failure "Conclusion: Column does not exist." 

172 </div> 

173 

174 ??? tip "See Also" 

175 - [`assert_column_exists()`][toolbox_pyspark.checks.assert_column_exists] 

176 - [`rename_datetime_columns()`][toolbox_pyspark.datetime.rename_datetime_columns] 

177 """ 

178 assert_column_exists(dataframe=dataframe, column=column, match_case=True) 

179 if f"{column}TIME" not in dataframe.columns: 

180 return dataframe.withColumnRenamed(column, f"{column}TIME") 

181 else: 

182 return dataframe 

183 

184 

185@typechecked 

186def rename_datetime_columns( 

187 dataframe: psDataFrame, 

188 columns: Optional[Union[str_collection, str]] = None, 

189) -> psDataFrame: 

190 """ 

191 !!! note "Summary" 

192 Fix the column names for the date-time columns. 

193 

194 ???+ abstract "Details" 

195 This is necessary because in NGW, there are some columns which have `datetime` data types, but whose names only contain `date`. 

196 So, this function will fix that. 

197 

198 Params: 

199 dataframe (psDataFrame): 

200 The DataFrame to update. 

201 columns (Optional[Union[str_collection, str]], optional): 

202 An optional list of columns to update. 

203 If this is not provided, or is the value `#!py None` or `#!py "all"`, then the function will automatically determine which columns to update based on the following logic (see the sketch after this list): 

204 

205 1. Loop through each column on `dataframe` to fetch the name and dtype using the method: `dataframe.dtypes`. 

206 1. If the column name ends with `#!py "date"` 

207 2. **AND** the column type is `#!py "timestamp"` 

208 3. **AND** there is **NOT** already a column in `dataframe.columns` with the name: `f"{column}TIME"` 

209 4. **THEN** rename the column to have the name: `f"{column}TIME"` 

210 2. Next column. 

211 

212 Default: `None`. 
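
As a rough sketch (illustrative only; the actual implementation uses `get_columns()` internally rather than `dataframe.dtypes` directly), the auto-detection logic above is equivalent to:

```{.py .python linenums="1" title="Auto-detection sketch"}
>>> columns = [
...     name
...     for name, dtype in dataframe.dtypes
...     if name.lower().endswith("date")
...     and dtype == "timestamp"
...     and f"{name}TIME" not in dataframe.columns
... ]
```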

213 

214 Raises: 

215 TypeError: 

216 If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator. 

217 ColumnDoesNotExistError: 

218 If any of the `#!py columns` do not exist within `#!py dataframe.columns`. 

219 

220 Returns: 

221 (psDataFrame): 

222 The updated DataFrame. 

223 

224 ???+ example "Examples" 

225 

226 ```{.py .python linenums="1" title="Set up"} 

227 >>> # Imports 

228 >>> import pandas as pd 

229 >>> from pyspark.sql import SparkSession 

230 >>> from toolbox_pyspark.datetime import rename_datetime_columns 

231 >>> 

232 >>> # Instantiate Spark 

233 >>> spark = SparkSession.builder.getOrCreate() 

234 >>> 

235 >>> # Create data 

236 >>> df = spark.createDataFrame( 

237 ... pd.DataFrame( 

238 ... { 

239 ... "a": [1, 2, 3, 4], 

240 ... "b": ["a", "b", "c", "d"], 

241 ... "c_date": pd.date_range(start="2022-01-01", periods=4, freq="h"), 

242 ... "d_date": pd.date_range(start="2022-02-01", periods=4, freq="h"), 

243 ... } 

244 ... ) 

245 ... ) 

246 >>> 

247 >>> # Check 

248 >>> df.show() 

249 ``` 

250 <div class="result" markdown> 

251 ```{.txt .text title="Terminal"} 

252 +---+---+---------------------+---------------------+ 

253 | a | b | c_date | d_date | 

254 +---+---+---------------------+---------------------+ 

255 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 

256 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 

257 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 

258 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 

259 +---+---+---------------------+---------------------+ 

260 ``` 

261 </div> 

262 

263 ```{.py .python linenums="1" title="Example 1: One column"} 

264 >>> rename_datetime_columns(df, ["c_date"]).show() 

265 ``` 

266 <div class="result" markdown> 

267 ```{.txt .text title="Terminal"} 

268 +---+---+---------------------+---------------------+ 

269 | a | b | c_dateTIME | d_date | 

270 +---+---+---------------------+---------------------+ 

271 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 

272 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 

273 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 

274 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 

275 +---+---+---------------------+---------------------+ 

276 ``` 

277 !!! success "Conclusion: Successfully renamed column." 

278 </div> 

279 

280 ```{.py .python linenums="1" title="Example 2: One column `str`"} 

281 >>> rename_datetime_columns(df, "c_date").show() 

282 ``` 

283 <div class="result" markdown> 

284 ```{.txt .text title="Terminal"} 

285 +---+---+---------------------+---------------------+ 

286 | a | b | c_dateTIME | d_date | 

287 +---+---+---------------------+---------------------+ 

288 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 

289 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 

290 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 

291 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 

292 +---+---+---------------------+---------------------+ 

293 ``` 

294 !!! success "Conclusion: Successfully renamed column." 

295 </div> 

296 

297 ```{.py .python linenums="1" title="Example 3: All columns"} 

298 >>> rename_datetime_columns(df).show() 

299 ``` 

300 <div class="result" markdown> 

301 ```{.txt .text title="Terminal"} 

302 +---+---+---------------------+---------------------+ 

303 | a | b | c_dateTIME | d_dateTIME | 

304 +---+---+---------------------+---------------------+ 

305 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 

306 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 

307 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 

308 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 

309 +---+---+---------------------+---------------------+ 

310 ``` 

311 !!! success "Conclusion: Successfully renamed columns." 

312 </div> 

313 

314 ```{.py .python linenums="1" title="Example 4: All columns using 'all'"} 

315 >>> rename_datetime_columns(df, "all").show() 

316 ``` 

317 <div class="result" markdown> 

318 ```{.txt .text title="Terminal"} 

319 +---+---+---------------------+---------------------+ 

320 | a | b | c_dateTIME | d_dateTIME | 

321 +---+---+---------------------+---------------------+ 

322 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 

323 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 

324 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 

325 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 

326 +---+---+---------------------+---------------------+ 

327 ``` 

328 !!! success "Conclusion: Successfully renamed columns." 

329 </div> 

330 

331 ```{.py .python linenums="1" title="Example 5: Missing column"} 

332 >>> rename_datetime_columns(df, ["fff", "ggg"]) 

333 ``` 

334 <div class="result" markdown> 

335 ```{.txt .text title="Terminal"} 

336 ColumnDoesNotExistError: Columns ["fff", "ggg"] do not exist in "dataframe". 

337 Try one of: ["a", "b", "c_date", "d_date"]. 

338 ``` 

339 !!! failure "Conclusion: Columns do not exist." 

340 </div> 

341 

342 ??? tip "See Also" 

343 - [`assert_columns_exists()`][toolbox_pyspark.checks.assert_columns_exists] 

344 - [`rename_datetime_column()`][toolbox_pyspark.datetime.rename_datetime_column] 

345 """ 

346 if columns is None or columns == "all": 

347 datetime_cols: str_list = [ 

348 col 

349 for col in get_columns(dataframe, "all_datetime") 

350 if col.lower().endswith("date") 

351 ] 

352 columns = [ 

353 col 

354 for col in datetime_cols 

355 if f"{col}TIME" not in dataframe.columns 

356 ] 

357 elif is_type(columns, str): 

358 columns = [columns] 

359 assert_columns_exists(dataframe, columns, True) 

360 for column in columns: 

361 dataframe = rename_datetime_column(dataframe, column) 

362 return dataframe 

363 

364 

365# ---------------------------------------------------------------------------- # 

366# Add Locals #### 

367# ---------------------------------------------------------------------------- # 

368 

369 

370@typechecked 

371def add_local_datetime_column( 

372 dataframe: psDataFrame, 

373 column: str, 

374 from_timezone: Optional[str] = None, 

375 column_with_target_timezone: str = "timezone_location".upper(), 

376) -> psDataFrame: 

377 """ 

378 !!! note "Summary" 

379 For the given `column`, add a new column with the suffix `_LOCAL` which is a conversion of the datetime values from `column` to the desired timezone. 

380 

381 Params: 

382 dataframe (psDataFrame): 

383 The DataFrame to be fixed 

384 column (str): 

385 The name of the column to do the conversion for. Must exist in `#!py dataframe.columns`, and must be type `timestamp`. 

386 from_timezone (Optional[str], optional): 

387 The timezone to convert from. Must be a valid TimeZoneID; for more info, see: [TimeZoneID](https://docs.oracle.com/middleware/12211/wcs/tag-ref/MISC/TimeZones.html).<br> 

388 If not given, `from_timezone` will default to UTC.<br> 

389 Default: `#!py None`. 

390 column_with_target_timezone (str, optional): 

391 The column containing the target timezone value. By default will be the column `#!py "timezone_location"`.<br> 

392 Defaults to `#!py "timezone_location".upper()`. 

393 

394 Raises: 

395 TypeError: 

396 If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator. 

397 ColumnDoesNotExistError: 

398 If `#!py column` or `#!py column_with_target_timezone` does not exist within `#!py dataframe.columns`. 

399 ValueError: 

400 If the `from_timezone` or `column_with_target_timezone` is not a valid timezone. 

401 

402 Returns: 

403 (psDataFrame): 

404 The updated DataFrame. 

405 

406 ???+ example "Examples" 

407 

408 ```{.py .python linenums="1" title="Set up"} 

409 >>> # Imports 

410 >>> import pandas as pd 

411 >>> from pyspark.sql import SparkSession 

412 >>> from toolbox_pyspark.datetime import add_local_datetime_column 

413 >>> 

414 >>> # Instantiate Spark 

415 >>> spark = SparkSession.builder.getOrCreate() 

416 >>> 

417 >>> # Create data 

418 >>> df = spark.createDataFrame( 

419 ... pd.DataFrame( 

420 ... { 

421 ... "a": [1, 2, 3, 4], 

422 ... "b": ["a", "b", "c", "d"], 

423 ... "c": pd.date_range(start="2022-01-01", periods=4, freq="D"), 

424 ... "d": pd.date_range(start="2022-02-01", periods=4, freq="D"), 

425 ... "e": pd.date_range(start="2022-03-01", periods=4, freq="D"), 

426 ... "target": ["Asia/Singapore"] * 4, 

427 ... "TIMEZONE_LOCATION": ["Australia/Perth"] * 4, 

428 ... } 

429 ... ) 

430 ... ) 

431 >>> 

432 >>> # Check 

433 >>> df.show() 

434 ``` 

435 <div class="result" markdown> 

436 ```{.txt .text title="Terminal"} 

437 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+ 

438 | a | b | c | d | e | target | TIMEZONE_LOCATION | 

439 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+ 

440 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth | 

441 | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth | 

442 | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth | 

443 | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth | 

444 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+ 

445 ``` 

446 </div> 

447 

448 ```{.py .python linenums="1" title="Example 1: Converting from UTC time"} 

449 >>> add_local_datetime_column(df, "c").show() 

450 ``` 

451 <div class="result" markdown> 

452 ```{.txt .text title="Terminal"} 

453 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+ 

454 | a | b | c | d | e | target | TIMEZONE_LOCATION | c_LOCAL | 

455 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+ 

456 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-01 08:00:00 | 

457 | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-02 08:00:00 | 

458 | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-03 08:00:00 | 

459 | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-04 08:00:00 | 

460 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+ 

461 ``` 

462 !!! success "Conclusion: Successfully converted from UTC." 

463 </div> 

464 

465 ```{.py .python linenums="1" title="Example 2: Converting from specific timezone, with custom column containing target timezone"} 

466 >>> add_local_datetime_column( 

467 ... dataframe=df, 

468 ... column="c", 

469 ... from_timezone="Australia/Sydney", 

470 ... column_with_target_timezone="target", 

471 ... ).show() 

472 ``` 

473 <div class="result" markdown> 

474 ```{.txt .text title="Terminal"} 

475 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

476 | a | b | c | d | e | target | TIMEZONE_LOCATION | c_UTC | c_LOCAL | 

477 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

478 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth | 2021-12-31 13:00:00 | 2021-12-31 21:00:00 | 

479 | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-01 13:00:00 | 2022-01-01 21:00:00 | 

480 | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-02 13:00:00 | 2022-01-02 21:00:00 | 

481 | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-03 13:00:00 | 2022-01-03 21:00:00 | 

482 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

483 ``` 

484 !!! success "Conclusion: Successfully converted timezone." 

485 </div> 

486 

487 ```{.py .python linenums="1" title="Example 3: Invalid column name"} 

488 >>> add_local_datetime_column(df, "invalid_column") 

489 ``` 

490 <div class="result" markdown> 

491 ```{.txt .text title="Terminal"} 

492 ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe". 

493 Try one of: ["a", "b", "c", "d", "e", "target", "TIMEZONE_LOCATION"]. 

494 ``` 

495 !!! failure "Conclusion: Column does not exist." 

496 </div> 

497 

498 ```{.py .python linenums="1" title="Example 4: Invalid timezone"} 

499 >>> add_local_datetime_column(df, "c", from_timezone="Invalid/Timezone") 

500 ``` 

501 <div class="result" markdown> 

502 ```{.txt .text title="Terminal"} 

503 ValueError: The timezone "Invalid/Timezone" is not a valid timezone. 

504 ``` 

505 !!! failure "Conclusion: Invalid timezone." 

506 </div> 

507 

508 ??? info "Notes" 

509 - If `#!py from_timezone is None`, then it is assumed that the datetime data in `column` is _already_ in UTC timezone.<br> 

510 - If `#!py from_timezone is not None`, then a new column will first be added with the name `#!py {column}_UTC`, and then another with the name `#!py {column}_LOCAL`. This is necessary because PySpark cannot convert directly from one timezone to another; it must first convert from the `from_timezone` value _to_ UTC, then perform a second conversion _from_ UTC to whichever timezone is defined in the column `column_with_target_timezone` (see the sketch below).<br> 

511 - The reason why this function uses multiple [`.withColumn()`][withColumn] methods, instead of a single [`.withColumns()`][withColumns] expression is because to add the `#!py {column}_LOCAL` column, it is first necessary for the `#!py {column}_UTC` column to exist on the `dataframe`. Therefore, we need to call [`.withColumn()`][withColumn] first to add `#!py {column}_UTC`, then we need to call [`.withColumn()`][withColumn] a second time to add `#!py {column}_LOCAL`. 

512 [withColumn]: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumn.html 

513 [withColumns]: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html 
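
A minimal sketch of this two-step conversion (illustrative only; the column names `c` and `target` and the source timezone are assumptions):

```{.py .python linenums="1" title="Two-step conversion sketch"}
>>> df = (
...     df
...     # Step 1: interpret `c` as "Australia/Sydney" time and convert it to UTC
...     .withColumn(
...         "c_UTC",
...         F.to_utc_timestamp(F.col("c").cast("timestamp"), "Australia/Sydney"),
...     )
...     # Step 2: convert the UTC value to the timezone named in the `target` column
...     .withColumn(
...         "c_LOCAL",
...         F.from_utc_timestamp(F.col("c_UTC").cast("timestamp"), F.col("target")),
...     )
... )
```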

514 

515 ??? tip "See Also" 

516 - [`assert_columns_exists()`][toolbox_pyspark.checks.assert_columns_exists] 

517 - [`add_local_datetime_columns()`][toolbox_pyspark.datetime.add_local_datetime_columns] 

518 """ 

519 assert_columns_exists(dataframe=dataframe, columns=[column, column_with_target_timezone]) 

520 require_utc: bool = f"{column}_UTC" not in dataframe.columns 

521 require_local: bool = f"{column}_LOCAL" not in dataframe.columns 

522 if from_timezone is not None: 

523 if require_utc: 

524 dataframe = dataframe.withColumn( 

525 f"{column}_UTC", 

526 F.to_utc_timestamp( 

527 F.col(column).cast("timestamp"), 

528 from_timezone.title(), 

529 ), 

530 ) 

531 if require_local: 

532 dataframe = dataframe.withColumn( 

533 f"{column}_LOCAL", 

534 F.from_utc_timestamp( 

535 F.col(f"{column}_UTC").cast("timestamp"), 

536 F.col(column_with_target_timezone), 

537 ), 

538 ) 

539 else: 

540 if require_local: 

541 dataframe = dataframe.withColumn( 

542 f"{column}_LOCAL", 

543 F.from_utc_timestamp( 

544 F.col(column).cast("timestamp"), 

545 F.col(column_with_target_timezone), 

546 ), 

547 ) 

548 return dataframe 

549 

550 

551@typechecked 

552def add_local_datetime_columns( 

553 dataframe: psDataFrame, 

554 columns: Optional[Union[str, str_collection]] = None, 

555 from_timezone: Optional[str] = None, 

556 column_with_target_timezone: str = "timezone_location".upper(), 

557) -> psDataFrame: 

558 """ 

559 !!! note "Summary" 

560 For each of the `date` or `datetime` columns in `dataframe`, add a new column which converts it to the local timezone. 

561 

562 ???+ abstract "Details" 

563 Under the hood, this function will call [`add_local_datetime_column()`][toolbox_pyspark.datetime.add_local_datetime_column] for each `column` in `columns`. 

564 

565 Params: 

566 dataframe (psDataFrame): 

567 The DataFrame to update. 

568 columns (Optional[Union[str, str_collection]], optional): 

569 The columns to update. If not provided, it will use all of the columns whose names end with `datetime`.<br> 

570 Defaults to `#!py None`. 

571 from_timezone (Optional[str], optional): 

572 The timezone to convert from. If not given, `from_timezone` will default to UTC.<br> 

573 Defaults to `#!py None`. 

574 column_with_target_timezone (str, optional): 

575 The column containing the target timezone value. By default will be the column `#!py "timezone_location"`.<br> 

576 Defaults to `#!py "timezone_location".upper()`. 

577 

578 Raises: 

579 TypeError: 

580 If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator. 

581 ColumnDoesNotExistError: 

582 If any of the `#!py columns` do not exist within `#!py dataframe.columns`. 

583 ValueError: 

584 If the `from_timezone` or `column_with_target_timezone` is not a valid timezone. 

585 

586 Returns: 

587 (psDataFrame): 

588 The updated DataFrame. 

589 

590 ???+ example "Examples" 

591 

592 ```{.py .python linenums="1" title="Set up"} 

593 >>> # Imports 

594 >>> import pandas as pd 

595 >>> from pyspark.sql import SparkSession 

596 >>> from toolbox_pyspark.datetime import add_local_datetime_columns 

597 >>> 

598 >>> # Instantiate Spark 

599 >>> spark = SparkSession.builder.getOrCreate() 

600 >>> 

601 >>> # Create data 

602 >>> df = spark.createDataFrame( 

603 ... pd.DataFrame( 

604 ... { 

605 ... "a": [1, 2, 3, 4], 

606 ... "b": ["a", "b", "c", "d"], 

607 ... "c": pd.date_range(start="2022-01-01", periods=4, freq="D"), 

608 ... "d_datetime": pd.date_range(start="2022-02-01", periods=4, freq="D"), 

609 ... "e_datetime": pd.date_range(start="2022-03-01", periods=4, freq="D"), 

610 ... "target": ["Asia/Singapore"] * 4, 

611 ... "TIMEZONE_LOCATION": ["Australia/Perth"] * 4, 

612 ... } 

613 ... ) 

614 ... ) 

615 >>> 

616 >>> # Check 

617 >>> df.show() 

618 ``` 

619 <div class="result" markdown> 

620 ```{.txt .text title="Terminal"} 

621 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+ 

622 | a | b | c | d_datetime | e_datetime | target | TIMEZONE_LOCATION | 

623 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+ 

624 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth | 

625 | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth | 

626 | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth | 

627 | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth | 

628 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+ 

629 ``` 

630 </div> 

631 

632 ```{.py .python linenums="1" title="Example 1: Default config"} 

633 >>> add_local_datetime_columns(df).show() 

634 ``` 

635 <div class="result" markdown> 

636 ```{.txt .text title="Terminal"} 

637 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

638 | a | b | c | d_datetime | e_datetime | target | TIMEZONE_LOCATION | d_datetime_LOCAL | e_datetime_LOCAL | 

639 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

640 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth | 2022-02-01 08:00:00 | 2022-03-01 08:00:00 | 

641 | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth | 2022-02-02 08:00:00 | 2022-03-02 08:00:00 | 

642 | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth | 2022-02-03 08:00:00 | 2022-03-03 08:00:00 | 

643 | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth | 2022-02-04 08:00:00 | 2022-03-04 08:00:00 | 

644 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

645 ``` 

646 !!! success "Conclusion: Successfully converted columns to local timezone." 

647 </div> 

648 

649 ```{.py .python linenums="1" title="Example 2: Semi-custom config"} 

650 >>> add_local_datetime_columns(df, ["c", "d_datetime"]).show() 

651 ``` 

652 <div class="result" markdown> 

653 ```{.txt .text title="Terminal"} 

654 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

655 | a | b | c | d_datetime | e_datetime | target | TIMEZONE_LOCATION | c_LOCAL | d_datetime_LOCAL | 

656 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

657 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-01 08:00:00 | 2022-02-01 08:00:00 | 

658 | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-02 08:00:00 | 2022-02-02 08:00:00 | 

659 | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-03 08:00:00 | 2022-02-03 08:00:00 | 

660 | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-04 08:00:00 | 2022-02-04 08:00:00 | 

661 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

662 ``` 

663 !!! success "Conclusion: Successfully converted columns to local timezone." 

664 </div> 

665 

666 ```{.py .python linenums="1" title="Example 3: Full-custom config"} 

667 >>> add_local_datetime_columns( 

668 ... dataframe=df, 

669 ... columns=["c", "d_datetime"], 

670 ... from_timezone="Australia/Sydney", 

671 ... column_with_target_timezone="target", 

672 ... ).show() 

673 ``` 

674 <div class="result" markdown> 

675 ```{.txt .text title="Terminal"} 

676 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+ 

677 | a | b | c | d_datetime | e_datetime | target | TIMEZONE_LOCATION | c_UTC | c_LOCAL | d_datetime_UTC | d_datetime_LOCAL | 

678 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+ 

679 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth | 2021-12-31 13:00:00 | 2021-12-31 21:00:00 | 2022-01-31 13:00:00 | 2022-01-31 21:00:00 | 

680 | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-01 13:00:00 | 2022-01-01 21:00:00 | 2022-02-01 13:00:00 | 2022-02-01 21:00:00 | 

681 | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-02 13:00:00 | 2022-01-02 21:00:00 | 2022-02-02 13:00:00 | 2022-02-02 21:00:00 | 

682 | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-03 13:00:00 | 2022-01-03 21:00:00 | 2022-02-03 13:00:00 | 2022-02-03 21:00:00 | 

683 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+ 

684 ``` 

685 !!! success "Conclusion: Successfully converted columns to the local time zone, from another custom time zone." 

686 </div> 

687 

688 ```{.py .python linenums="1" title="Example 4: Single column"} 

689 >>> add_local_datetime_columns( 

690 ... dataframe=df, 

691 ... columns="c", 

692 ... from_timezone="Australia/Sydney", 

693 ... column_with_target_timezone="target", 

694 ... ).show() 

695 ``` 

696 <div class="result" markdown> 

697 ```{.txt .text title="Terminal"} 

698 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

699 | a | b | c | d_datetime | e_datetime | target | TIMEZONE_LOCATION | c_UTC | c_LOCAL | 

700 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

701 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth | 2021-12-31 13:00:00 | 2021-12-31 21:00:00 | 

702 | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-01 13:00:00 | 2022-01-01 21:00:00 | 

703 | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-02 13:00:00 | 2022-01-02 21:00:00 | 

704 | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-03 13:00:00 | 2022-01-03 21:00:00 | 

705 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+ 

706 ``` 

707 !!! success "Conclusion: Successfully converted a single column from another time zone to the local time zone." 

708 </div> 

709 

710 ```{.py .python linenums="1" title="Example 5: All columns"} 

711 >>> add_local_datetime_columns( 

712 ... dataframe=df, 

713 ... columns="all", 

714 ... from_timezone="Australia/Sydney", 

715 ... column_with_target_timezone="target", 

716 ... ).show() 

717 ``` 

718 <div class="result" markdown> 

719 ```{.txt .text title="Terminal"} 

720 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+ 

721 | a | b | c | d_datetime | e_datetime | target | TIMEZONE_LOCATION | d_datetime_UTC | d_datetime_LOCAL | e_datetime_UTC | e_datetime_LOCAL | 

722 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+ 

723 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth | 2022-01-31 13:00:00 | 2022-01-31 21:00:00 | 2022-02-28 13:00:00 | 2022-02-28 21:00:00 | 

724 | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth | 2022-02-01 13:00:00 | 2022-02-01 21:00:00 | 2022-03-01 13:00:00 | 2022-03-01 21:00:00 | 

725 | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth | 2022-02-02 13:00:00 | 2022-02-02 21:00:00 | 2022-03-02 13:00:00 | 2022-03-02 21:00:00 | 

726 | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth | 2022-02-03 13:00:00 | 2022-02-03 21:00:00 | 2022-03-03 13:00:00 | 2022-03-03 21:00:00 | 

727 +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+ 

728 ``` 

729 !!! success "Conclusion: Successfully converted all datetime columns from another time zone to the local time zone." 

730 </div> 

731 

732 ```{.py .python linenums="1" title="Example 6: Invalid column name"} 

733 >>> add_local_datetime_columns(df, "invalid_column") 

734 ``` 

735 <div class="result" markdown> 

736 ```{.txt .text title="Terminal"} 

737 ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe". 

738 Try one of: ["a", "b", "c", "d_datetime", "e_datetime", "target", "TIMEZONE_LOCATION"]. 

739 ``` 

740 !!! failure "Conclusion: Column does not exist." 

741 </div> 

742 

743 ```{.py .python linenums="1" title="Example 7: Invalid timezone"} 

744 >>> add_local_datetime_columns(df, "c", from_timezone="Invalid/Timezone") 

745 ``` 

746 <div class="result" markdown> 

747 ```{.txt .text title="Terminal"} 

748 ValueError: The timezone "Invalid/Timezone" is not a valid timezone. 

749 ``` 

750 !!! failure "Conclusion: Invalid timezone." 

751 </div> 

752 

753 ??? tip "See Also" 

754 - [`add_local_datetime_column()`][toolbox_pyspark.datetime.add_local_datetime_column] 

755 """ 

756 if columns is None or columns in ["all"]: 

757 columns = [col for col in dataframe.columns if col.lower().endswith("datetime")] 

758 elif is_type(columns, str): 

759 columns = [columns] 

760 assert_columns_exists(dataframe, list(columns) + [column_with_target_timezone]) 

761 for column in columns: 

762 dataframe = add_local_datetime_column( 

763 dataframe=dataframe, 

764 column=column, 

765 from_timezone=from_timezone, 

766 column_with_target_timezone=column_with_target_timezone, 

767 ) 

768 return dataframe 

769 

770 

771# ---------------------------------------------------------------------------- # 

772# Splitting #### 

773# ---------------------------------------------------------------------------- # 

774 

775 

776@typechecked 

777def split_datetime_column( 

778 dataframe: psDataFrame, 

779 column: str, 

780) -> psDataFrame: 

781 """ 

782 !!! note "Summary" 

783 Take the column `column`, which should be a `timestamp` type, and split it into its respective `date` and `time` components. 

784 

785 Params: 

786 dataframe (psDataFrame): 

787 The DataFrame to update. 

788 column (str): 

789 The column to split. 

790 

791 Raises: 

792 TypeError: 

793 If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator. 

794 ColumnDoesNotExistError: 

795 If the `#!py column` does not exist within `#!py dataframe.columns`. 

796 TypeError: 

797 If the `column` is not type `timestamp` or `datetime`. 

798 

799 Returns: 

800 (psDataFrame): 

801 The updated DataFrame. 

802 

803 ???+ example "Examples" 

804 

805 ```{.py .python linenums="1" title="Set up"} 

806 >>> # Imports 

807 >>> import pandas as pd 

808 >>> from pyspark.sql import SparkSession 

809 >>> from toolbox_pyspark.datetime import split_datetime_column 

810 >>> 

811 >>> # Instantiate Spark 

812 >>> spark = SparkSession.builder.getOrCreate() 

813 >>> 

814 >>> # Create data 

815 >>> df = spark.createDataFrame( 

816 ... pd.DataFrame( 

817 ... { 

818 ... "a": [1, 2, 3, 4], 

819 ... "b": ["a", "b", "c", "d"], 

820 ... "c_datetime": pd.date_range(start="2022-01-01", periods=4, freq="h"), 

821 ... "d_datetime": pd.date_range(start="2022-02-01", periods=4, freq="h"), 

822 ... "e_datetime": pd.date_range(start="2022-03-01", periods=4, freq="h"), 

823 ... "TIMEZONE_LOCATION": ["Australia/Perth"] * 4, 

824 ... } 

825 ... ) 

826 ... ) 

827 >>> 

828 >>> # Check 

829 >>> df.show() 

830 ``` 

831 <div class="result" markdown> 

832 ```{.txt .text title="Terminal"} 

833 +---+---+---------------------+---------------------+---------------------+-------------------+ 

834 | a | b | c_datetime | d_datetime | e_datetime | TIMEZONE_LOCATION | 

835 +---+---+---------------------+---------------------+---------------------+-------------------+ 

836 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth | 

837 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth | 

838 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth | 

839 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth | 

840 +---+---+---------------------+---------------------+---------------------+-------------------+ 

841 ``` 

842 </div> 

843 

844 ```{.py .python linenums="1" title="Example 1: Default config"} 

845 >>> split_datetime_column(df, "c_datetime").show() 

846 ``` 

847 <div class="result" markdown> 

848 ```{.txt .text title="Terminal"} 

849 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+ 

850 | a | b | c_datetime | d_datetime | e_datetime | TIMEZONE_LOCATION | C_DATE | C_TIME | 

851 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+ 

852 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth | 2022-01-01 | 00:00:00 | 

853 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth | 2022-01-01 | 01:00:00 | 

854 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth | 2022-01-01 | 02:00:00 | 

855 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth | 2022-01-01 | 03:00:00 | 

856 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+ 

857 ``` 

858 !!! success "Conclusion: Successfully split the column into its Date and Time constituents." 

859 </div> 

860 

861 ```{.py .python linenums="1" title="Example 2: Invalid column name"} 

862 >>> split_datetime_column(df, "invalid_column") 

863 ``` 

864 <div class="result" markdown> 

865 ```{.txt .text title="Terminal"} 

866 ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe". 

867 Try one of: ["a", "b", "c_datetime", "d_datetime", "e_datetime", "TIMEZONE_LOCATION"]. 

868 ``` 

869 !!! failure "Conclusion: Column does not exist." 

870 </div> 

871 

872 ```{.py .python linenums="1" title="Example 3: Invalid column type"} 

873 >>> split_datetime_column(df, "b") 

874 ``` 

875 <div class="result" markdown> 

876 ```{.txt .text title="Terminal"} 

877 TypeError: Column must be type 'timestamp' or 'datetime'. 

878 Current type: [('b', 'string')] 

879 ``` 

880 !!! failure "Conclusion: Column is not the correct type for splitting." 

881 </div> 

882 

883 ??? tip "See Also" 

884 - [`split_datetime_columns()`][toolbox_pyspark.datetime.split_datetime_columns] 

885 """ 

886 assert_column_exists(dataframe, column) 

887 datetime_cols: str_list = get_columns(dataframe, "all_datetime") 

888 if not is_in(column, datetime_cols): 

889 raise TypeError( 

890 "Column must be type 'timestamp' or 'datetime'.\n" 

891 f"Current type: {[(col,typ) for col,typ in dataframe.dtypes if col == column]}" 

892 ) 

893 col_date_name: str = column.upper().replace("DATETIME", "DATE") 

894 col_time_name: str = column.upper().replace("DATETIME", "TIME") 

895 col_date_value: Column = F.date_format(column, "yyyy-MM-dd").cast("string").cast("date") 

896 col_time_value: Column = F.date_format(column, "HH:mm:ss").cast("string") 

897 return dataframe.withColumns( 

898 { 

899 col_date_name: col_date_value, 

900 col_time_name: col_time_value, 

901 } 

902 ) 

903 

904 

905@typechecked 

906def split_datetime_columns( 

907 dataframe: psDataFrame, 

908 columns: Optional[Union[str, str_collection]] = None, 

909) -> psDataFrame: 

910 """ 

911 !!! note "Summary" 

912 For all the columns listed in `columns`, split each of them into its respective `date` and `time` components. 

913 

914 ???+ abstract "Details" 

915 The reason why this function is structured this way, rather than re-calling [`split_datetime_column()`][toolbox_pyspark.datetime.split_datetime_column] for each element of `columns`, is due to `#!py pyspark` RDD complexity. More specifically, if it _were_ to call [`split_datetime_column()`][toolbox_pyspark.datetime.split_datetime_column] each time, the RDD would become incredibly and unnecessarily complicated. By instead using the [`.withColumns()`](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html) method, the SQL expression is projected down to the underlying dataframe **once**, not multiple times. The underlying SQL execution plan is therefore much less complicated, albeit that the DRY principle is not strictly followed here; see the sketch below. 
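
A condensed sketch of that single-projection approach (illustrative only; the two column names are assumptions, and the expressions mirror the implementation below):

```{.py .python linenums="1" title="Single-projection sketch"}
>>> cols_exprs: dict[str, Column] = {}
>>> for column in ["c_datetime", "d_datetime"]:
...     # Build one DATE and one TIME expression per column...
...     cols_exprs[column.upper().replace("DATETIME", "DATE")] = (
...         F.date_format(column, "yyyy-MM-dd").cast("string").cast("date")
...     )
...     cols_exprs[column.upper().replace("DATETIME", "TIME")] = (
...         F.date_format(column, "HH:mm:ss").cast("string")
...     )
>>> # ...then apply them all in a single projection
>>> df = df.withColumns(cols_exprs)
```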

916 

917 Params: 

918 dataframe (psDataFrame): 

919 The DataFrame to update. 

920 columns (Optional[Union[str, str_collection]], optional): 

921 The list of columns to update. If not given, the list will be generated from those of `#!py dataframe.columns` which contain the text `datetime`.<br> 

922 Defaults to `#!py None`. 

923 

924 Raises: 

925 TypeError: 

926 If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator. 

927 ColumnDoesNotExistError: 

928 If any of the `#!py columns` do not exist within `#!py dataframe.columns`. 

929 TypeError: 

930 If any of the columns in `columns` are not type `timestamp` or `datetime`. 

931 

932 Returns: 

933 (psDataFrame): 

934 The updated DataFrame. 

935 

936 ???+ example "Examples" 

937 

938 ```{.py .python linenums="1" title="Set up"} 

939 >>> # Imports 

940 >>> import pandas as pd 

941 >>> from pyspark.sql import SparkSession 

942 >>> from toolbox_pyspark.datetime import split_datetime_columns 

943 >>> 

944 >>> # Instantiate Spark 

945 >>> spark = SparkSession.builder.getOrCreate() 

946 >>> 

947 >>> # Create data 

948 >>> df = spark.createDataFrame( 

949 ... pd.DataFrame( 

950 ... { 

951 ... "a": [1, 2, 3, 4], 

952 ... "b": ["a", "b", "c", "d"], 

953 ... "c_datetime": pd.date_range(start="2022-01-01", periods=4, freq="h"), 

954 ... "d_datetime": pd.date_range(start="2022-02-01", periods=4, freq="h"), 

955 ... "e_datetime": pd.date_range(start="2022-03-01", periods=4, freq="h"), 

956 ... "TIMEZONE_LOCATION": ["Australia/Perth"] * 4, 

957 ... } 

958 ... ) 

959 ... ) 

960 >>> 

961 >>> # Check 

962 >>> df.show() 

963 ``` 

964 <div class="result" markdown> 

965 ```{.txt .text title="Terminal"} 

966 +---+---+---------------------+---------------------+---------------------+-------------------+ 

967 | a | b | c_datetime | d_datetime | e_datetime | TIMEZONE_LOCATION | 

968 +---+---+---------------------+---------------------+---------------------+-------------------+ 

969 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth | 

970 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth | 

971 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth | 

972 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth | 

973 +---+---+---------------------+---------------------+---------------------+-------------------+ 

974 ``` 

975 </div> 

976 

977 ```{.py .python linenums="1" title="Example 1: Default config"} 

978 >>> split_datetime_columns(df).show() 

979 ``` 

980 <div class="result" markdown> 

981 ```{.txt .text title="Terminal"} 

982 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+ 

983 | a | b | c_datetime | d_datetime | e_datetime | TIMEZONE_LOCATION | C_DATE | C_TIME | D_DATE | D_TIME | E_DATE | E_TIME | 

984 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+ 

985 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth | 2022-01-01 | 00:00:00 | 2022-02-01 | 00:00:00 | 2022-03-01 | 00:00:00 | 

986 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth | 2022-01-01 | 01:00:00 | 2022-02-01 | 01:00:00 | 2022-03-01 | 01:00:00 | 

987 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth | 2022-01-01 | 02:00:00 | 2022-02-01 | 02:00:00 | 2022-03-01 | 02:00:00 | 

988 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth | 2022-01-01 | 03:00:00 | 2022-02-01 | 03:00:00 | 2022-03-01 | 03:00:00 | 

989 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+ 

990 ``` 

991 !!! success "Conclusion: Successfully split all DateTime columns into their Date and Time constituents." 

992 </div> 

993 

994 ```{.py .python linenums="1" title="Example 2: Custom config"} 

995 >>> split_datetime_columns(df, ["c_datetime", "d_datetime"]).show() 

996 ``` 

997 <div class="result" markdown> 

998 ```{.txt .text title="Terminal"} 

999 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+ 

1000 | a | b | c_datetime | d_datetime | e_datetime | TIMEZONE_LOCATION | C_DATE | C_TIME | D_DATE | D_TIME | 

1001 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+ 

1002 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth | 2022-01-01 | 00:00:00 | 2022-02-01 | 00:00:00 | 

1003 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth | 2022-01-01 | 01:00:00 | 2022-02-01 | 01:00:00 | 

1004 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth | 2022-01-01 | 02:00:00 | 2022-02-01 | 02:00:00 | 

1005 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth | 2022-01-01 | 03:00:00 | 2022-02-01 | 03:00:00 | 

1006 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+ 

1007 ``` 

1008 !!! success "Conclusion: Successfully split two columns into their Date and Time constituents." 

1009 </div> 

1010 

1011 ```{.py .python linenums="1" title="Example 3: All columns"} 

1012 >>> split_datetime_columns(df, "all").show() 

1013 ``` 

1014 <div class="result" markdown> 

1015 ```{.txt .text title="Terminal"} 

1016 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+ 

1017 | a | b | c_datetime | d_datetime | e_datetime | TIMEZONE_LOCATION | C_DATE | C_TIME | D_DATE | D_TIME | E_DATE | E_TIME | 

1018 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+ 

1019 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth | 2022-01-01 | 00:00:00 | 2022-02-01 | 00:00:00 | 2022-03-01 | 00:00:00 | 

1020 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth | 2022-01-01 | 01:00:00 | 2022-02-01 | 01:00:00 | 2022-03-01 | 01:00:00 | 

1021 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth | 2022-01-01 | 02:00:00 | 2022-02-01 | 02:00:00 | 2022-03-01 | 02:00:00 | 

1022 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth | 2022-01-01 | 03:00:00 | 2022-02-01 | 03:00:00 | 2022-03-01 | 03:00:00 | 

1023 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+ 

1024 ``` 

1025 !!! success "Conclusion: Successfully split all DateTime columns into their Date and Time constituents." 

1026 </div> 

1027 

1028 ```{.py .python linenums="1" title="Example 4: Single column"} 

1029 >>> split_datetime_columns(df, "c_datetime").show() 

1030 ``` 

1031 <div class="result" markdown> 

1032 ```{.txt .text title="Terminal"} 

1033 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+ 

1034 | a | b | c_datetime | d_datetime | e_datetime | TIMEZONE_LOCATION | C_DATE | C_TIME | 

1035 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+ 

1036 | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth | 2022-01-01 | 00:00:00 | 

1037 | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth | 2022-01-01 | 01:00:00 | 

1038 | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth | 2022-01-01 | 02:00:00 | 

1039 | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth | 2022-01-01 | 03:00:00 | 

1040 +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+ 

1041 ``` 

1042 !!! success "Conclusion: Successfully split a single column into its Date and Time constituents." 

1043 </div> 

1044 

1045 ```{.py .python linenums="1" title="Example 5: Invalid column name"} 

1046 >>> split_datetime_columns(df, "invalid_column") 

1047 ``` 

1048 <div class="result" markdown> 

1049 ```{.txt .text title="Terminal"} 

1050 ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe". 

1051 Try one of: ["a", "b", "c_datetime", "d_datetime", "e_datetime", "TIMEZONE_LOCATION"]. 

1052 ``` 

1053 !!! failure "Conclusion: Column does not exist." 

1054 </div> 

1055 

1056 ```{.py .python linenums="1" title="Example 6: Invalid column type"} 

1057 >>> split_datetime_columns(df, "b") 

1058 ``` 

1059 <div class="result" markdown> 

1060 ```{.txt .text title="Terminal"} 

1061 TypeError: Column must be type 'timestamp' or 'datetime'. 

1062 Current type: [('b', 'string')] 

1063 ``` 

1064 !!! failure "Conclusion: Column is not the correct type for splitting." 

1065 </div> 

1066 

1067 ??? tip "See Also" 

1068 - [`split_datetime_column()`][toolbox_pyspark.datetime.split_datetime_column] 

1069 """ 

1070 if columns is None or columns in ["all"]: 

1071 columns = [col for col in dataframe.columns if "datetime" in col.lower()] 

1072 elif is_type(columns, str): 

1073 columns = [columns] 

1074 assert_columns_exists(dataframe, columns) 

1075 datetime_cols: str_list = get_columns(dataframe, "all_datetime") 

1076 if not is_all_in(columns, datetime_cols): 

1077 raise TypeError( 

1078 "Columns to split must be type 'timestamp' or 'datetime'.\n" 

1079 f"Current types: {[(col,typ) for col,typ in dataframe.dtypes if col in columns]}" 

1080 ) 

1081 cols_exprs: dict[str, Column] = {} 

1082 for column in columns: 

1083 col_date_name: str = column.upper().replace("DATETIME", "DATE") 

1084 col_time_name: str = column.upper().replace("DATETIME", "TIME") 

1085 col_date_value: Column = ( 

1086 F.date_format(column, "yyyy-MM-dd").cast("string").cast("date") 

1087 ) 

1088 col_time_value: Column = F.date_format(column, "HH:mm:ss").cast("string") 

1089 cols_exprs[col_date_name] = col_date_value 

1090 cols_exprs[col_time_name] = col_time_value 

1091 return dataframe.withColumns(cols_exprs)