Coverage for src/toolbox_pyspark/datetime.py: 100%
78 statements
coverage.py v7.6.10, created at 2025-01-25 23:08 +0000
# ============================================================================ #
#                                                                              #
#     Title   : Datetime                                                       #
#     Purpose : Fixing column names that contain datetime data, adding        #
#               conversions to local datetimes, and splitting a column into   #
#               its date and time components.                                 #
#                                                                              #
# ============================================================================ #


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Overview                                                              ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#  Description                                                              ####
# ---------------------------------------------------------------------------- #


"""
!!! note "Summary"
    The `datetime` module is used for fixing column names that contain datetime data, adding conversions to local datetimes, and splitting a column into its date and time components.
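
A minimal end-to-end sketch (assuming a running Spark session and a `df` holding a `c_date` timestamp column):

```{.py .python linenums="1" title="Quick start"}
>>> from toolbox_pyspark.datetime import rename_datetime_columns, split_datetime_columns
>>> df = rename_datetime_columns(df, "all")  # "c_date" becomes "c_dateTIME"
>>> df = split_datetime_columns(df, "all")   # adds "C_DATE" and "C_TIME" columns
```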
26"""
29# ---------------------------------------------------------------------------- #
30# #
31# Setup ####
32# #
33# ---------------------------------------------------------------------------- #
36# ---------------------------------------------------------------------------- #
37# Imports ####
38# ---------------------------------------------------------------------------- #
41# ## Python StdLib Imports ----
42from typing import Optional, Union
44# ## Python Third Party Imports ----
45from pyspark.sql import Column, DataFrame as psDataFrame, functions as F
46from toolbox_python.checkers import is_all_in, is_in, is_type
47from toolbox_python.collection_types import str_collection, str_list
48from typeguard import typechecked
50# ## Local First Party Imports ----
51from toolbox_pyspark.checks import assert_column_exists, assert_columns_exists
52from toolbox_pyspark.columns import get_columns


# ---------------------------------------------------------------------------- #
#  Exports                                                                  ####
# ---------------------------------------------------------------------------- #


__all__: str_list = [
    "rename_datetime_column",
    "rename_datetime_columns",
    "add_local_datetime_column",
    "add_local_datetime_columns",
    "split_datetime_column",
    "split_datetime_columns",
]


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Functions                                                             ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#  Renaming                                                                 ####
# ---------------------------------------------------------------------------- #

@typechecked
def rename_datetime_column(
    dataframe: psDataFrame,
    column: str,
) -> psDataFrame:
    """
    !!! note "Summary"
        For a given `column` in a DataFrame, if there is no existing column with the same name plus a `TIME` suffix, rename the column to append `TIME` to it.

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        column (str):
            The column to check.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If the `#!py column` does not exist within `#!py dataframe.columns`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import rename_datetime_column
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c_date": pd.date_range(start="2022-01-01", periods=4, freq="h"),
        ...             "d_date": pd.date_range(start="2022-02-01", periods=4, freq="h"),
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_date              | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Update column"}
        >>> rename_datetime_column(df, "c_date").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed column."
        </div>

        ```{.py .python linenums="1" title="Example 2: Missing column"}
        >>> rename_datetime_column(df, "fff")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "fff" does not exist in "dataframe".
        Try one of: ["a", "b", "c_date", "d_date"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

    ??? tip "See Also"
        - [`assert_column_exists()`][toolbox_pyspark.checks.assert_column_exists]
        - [`rename_datetime_columns()`][toolbox_pyspark.datetime.rename_datetime_columns]
    """
    assert_column_exists(dataframe=dataframe, column=column, match_case=True)
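    # Only rename when the `TIME`-suffixed name is not already taken; this
    # keeps the function idempotent when called multiple times.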
179 if f"{column}TIME" not in dataframe.columns:
180 return dataframe.withColumnRenamed(column, f"{column}TIME")
181 else:
182 return dataframe


@typechecked
def rename_datetime_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str_collection, str]] = None,
) -> psDataFrame:
    """
    !!! note "Summary"
        Fix the column names for the date-time columns.

    ???+ abstract "Details"
        This is necessary because, in NGW, there are some columns which hold `datetime` data but whose names only contain `date`. This function fixes those names, using the detection rule sketched below.
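
        A minimal sketch of the detection rule (hypothetical `df`; in the function itself, the candidate columns come from `get_columns()`):

        ```{.py .python linenums="1" title="Detection rule"}
        >>> candidates = [
        ...     col
        ...     for col, typ in df.dtypes
        ...     if typ == "timestamp"
        ...     and col.lower().endswith("date")
        ...     and f"{col}TIME" not in df.columns
        ... ]
        ```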

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        columns (Optional[Union[str_collection, str]], optional):
            An optional list of columns to update.
            If this is not provided, or is the value `#!py None` or `#!py "all"`, then the function will automatically determine which columns to update based on the following logic:

            1. Loop through each column on `dataframe` to fetch the name and dtype, using the method: `dataframe.dtypes`.
                1. If the column name ends with `#!py "date"`,
                2. **AND** the column type is `#!py "timestamp"`,
                3. **AND** there is **NOT** already a column in `dataframe.columns` with the name: `f"{column}TIME"`,
                4. **THEN** rename the column to have the name: `f"{column}TIME"`.
            2. Next column.

            Default: `None`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the `#!py columns` do not exist within `#!py dataframe.columns`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import rename_datetime_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c_date": pd.date_range(start="2022-01-01", periods=4, freq="h"),
        ...             "d_date": pd.date_range(start="2022-02-01", periods=4, freq="h"),
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_date              | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: One column"}
        >>> rename_datetime_columns(df, ["c_date"]).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed column."
        </div>

        ```{.py .python linenums="1" title="Example 2: One column `str`"}
        >>> rename_datetime_columns(df, "c_date").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed column."
        </div>

        ```{.py .python linenums="1" title="Example 3: All columns"}
        >>> rename_datetime_columns(df).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_dateTIME          |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed columns."
        </div>

        ```{.py .python linenums="1" title="Example 4: All columns using 'all'"}
        >>> rename_datetime_columns(df, "all").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_dateTIME          |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed columns."
        </div>

        ```{.py .python linenums="1" title="Example 5: Missing columns"}
        >>> rename_datetime_columns(df, ["fff", "ggg"])
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Columns ["fff", "ggg"] do not exist in "dataframe".
        Try one of: ["a", "b", "c_date", "d_date"].
        ```
        !!! failure "Conclusion: Columns do not exist."
        </div>

    ??? tip "See Also"
        - [`assert_columns_exists()`][toolbox_pyspark.checks.assert_columns_exists]
        - [`rename_datetime_column()`][toolbox_pyspark.datetime.rename_datetime_column]
    """
    if columns is None or columns == "all":
        columns = [
            col
            for col in get_columns(dataframe, "all_datetime")
            if col.lower().endswith("date") and f"{col}TIME" not in dataframe.columns
        ]
    elif is_type(columns, str):
        columns = [columns]
    assert_columns_exists(dataframe, columns, True)
    for column in columns:
        dataframe = rename_datetime_column(dataframe, column)
    return dataframe


# ---------------------------------------------------------------------------- #
#  Add Locals                                                               ####
# ---------------------------------------------------------------------------- #

@typechecked
def add_local_datetime_column(
    dataframe: psDataFrame,
    column: str,
    from_timezone: Optional[str] = None,
    column_with_target_timezone: str = "timezone_location".upper(),
) -> psDataFrame:
    """
    !!! note "Summary"
        For the given `column`, add a new column with the suffix `_LOCAL`, containing the datetime values of `column` converted to the target timezone.

    Params:
        dataframe (psDataFrame):
            The DataFrame to be fixed.
        column (str):
            The name of the column to do the conversion for. Must exist in `#!py dataframe.columns`, and must be type `timestamp`.
        from_timezone (str, optional):
            The timezone which will be converted from. Must be a valid TimeZoneID; for more info, see: [TimeZoneID](https://docs.oracle.com/middleware/12211/wcs/tag-ref/MISC/TimeZones.html).<br>
            If not given, the `from_timezone` will default to UTC.<br>
            Default: `#!py None`.
        column_with_target_timezone (str, optional):
            The column containing the target timezone value. By default will be the column `#!py "timezone_location"`.<br>
            Defaults to `#!py "timezone_location".upper()`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If `#!py column` or `#!py column_with_target_timezone` does not exist within `#!py dataframe.columns`.
        ValueError:
            If the `from_timezone` or `column_with_target_timezone` is not a valid timezone.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import add_local_datetime_column
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": pd.date_range(start="2022-01-01", periods=4, freq="D"),
        ...             "d": pd.date_range(start="2022-02-01", periods=4, freq="D"),
        ...             "e": pd.date_range(start="2022-03-01", periods=4, freq="D"),
        ...             "target": ["Asia/Singapore"] * 4,
        ...             "TIMEZONE_LOCATION": ["Australia/Perth"] * 4,
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        | a | b | c                   | d                   | e                   | target         | TIMEZONE_LOCATION |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Converting from UTC time"}
        >>> add_local_datetime_column(df, "c").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+
        | a | b | c                   | d                   | e                   | target         | TIMEZONE_LOCATION | c_LOCAL             |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted from UTC."
        </div>

        ```{.py .python linenums="1" title="Example 2: Converting from specific timezone, with custom column containing target timezone"}
        >>> add_local_datetime_column(
        ...     dataframe=df,
        ...     column="c",
        ...     from_timezone="Australia/Sydney",
        ...     column_with_target_timezone="target",
        ... ).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | a | b | c                   | d                   | e                   | target         | TIMEZONE_LOCATION | c_UTC               | c_LOCAL             |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2021-12-31 13:00:00 | 2021-12-31 21:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 13:00:00 | 2022-01-01 21:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 13:00:00 | 2022-01-02 21:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 13:00:00 | 2022-01-03 21:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted timezone."
        </div>

        ```{.py .python linenums="1" title="Example 3: Invalid column name"}
        >>> add_local_datetime_column(df, "invalid_column")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe".
        Try one of: ["a", "b", "c", "d", "e", "target", "TIMEZONE_LOCATION"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 4: Invalid timezone"}
        >>> add_local_datetime_column(df, "c", from_timezone="Invalid/Timezone")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ValueError: The timezone "Invalid/Timezone" is not a valid timezone.
        ```
        !!! failure "Conclusion: Invalid timezone."
        </div>

    ??? info "Notes"
        - If `#!py from_timezone is None`, then the datetime data in `column` is assumed to _already_ be in the UTC timezone.<br>
        - If `#!py from_timezone is not None`, then a new column will be added with the name `#!py {column}_UTC`, and then another column added with the name `#!py {column}_LOCAL`. This is necessary because PySpark cannot convert directly from one arbitrary timezone to another; it must first convert from the `from_timezone` value _to_ UTC, then convert a second time _from_ UTC to whichever timezone is defined in the column `column_with_target_timezone`.<br>
        - The reason why this function uses multiple [`.withColumn()`][withColumn] methods, instead of a single [`.withColumns()`][withColumns] expression, is that the `#!py {column}_UTC` column must already exist on the `dataframe` before the `#!py {column}_LOCAL` column can be derived from it. Therefore, [`.withColumn()`][withColumn] is called once to add `#!py {column}_UTC`, then a second time to add `#!py {column}_LOCAL`.

        [withColumn]: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumn.html
        [withColumns]: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html
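
        A minimal sketch of the two-step conversion in raw PySpark (hypothetical `df` with a `ts` timestamp column and a `tz` column holding target timezones):

        ```{.py .python linenums="1" title="Two-step conversion"}
        >>> from pyspark.sql import functions as F
        >>> df = df.withColumn("ts_UTC", F.to_utc_timestamp(F.col("ts"), "Australia/Sydney"))
        >>> df = df.withColumn("ts_LOCAL", F.from_utc_timestamp(F.col("ts_UTC"), F.col("tz")))
        ```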

    ??? tip "See Also"
        - [`assert_columns_exists()`][toolbox_pyspark.checks.assert_columns_exists]
        - [`add_local_datetime_columns()`][toolbox_pyspark.datetime.add_local_datetime_columns]
    """
    assert_columns_exists(dataframe=dataframe, columns=[column, column_with_target_timezone])
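    # Idempotency guards: only add the `_UTC` / `_LOCAL` columns if they are
    # not already present on the dataframe.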
    require_utc: bool = f"{column}_UTC" not in dataframe.columns
    require_local: bool = f"{column}_LOCAL" not in dataframe.columns
    if from_timezone is not None:
        if require_utc:
            dataframe = dataframe.withColumn(
                f"{column}_UTC",
                F.to_utc_timestamp(
                    F.col(column).cast("timestamp"),
                    from_timezone.title(),
                ),
            )
        if require_local:
            dataframe = dataframe.withColumn(
                f"{column}_LOCAL",
                F.from_utc_timestamp(
                    F.col(f"{column}_UTC").cast("timestamp"),
                    F.col(column_with_target_timezone),
                ),
            )
    else:
        if require_local:
            dataframe = dataframe.withColumn(
                f"{column}_LOCAL",
                F.from_utc_timestamp(
                    F.col(column).cast("timestamp"),
                    F.col(column_with_target_timezone),
                ),
            )
    return dataframe


@typechecked
def add_local_datetime_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str, str_collection]] = None,
    from_timezone: Optional[str] = None,
    column_with_target_timezone: str = "timezone_location".upper(),
) -> psDataFrame:
    """
    !!! note "Summary"
        For each of the `date` or `datetime` columns in `dataframe`, add a new column which converts it to the timezone of the local datetime.

    ???+ abstract "Details"
        Under the hood, this function will call [`add_local_datetime_column()`][toolbox_pyspark.datetime.add_local_datetime_column] for each `column` in `columns`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        columns (Optional[Union[str, str_collection]], optional):
            The columns to check. If not provided, it will use all of the columns whose names end with the text `#!py "datetime"`.<br>
            Defaults to `#!py None`.
        from_timezone (Optional[str], optional):
            The timezone which will be converted from. If not given, the from-timezone will default to UTC.<br>
            Defaults to `#!py None`.
        column_with_target_timezone (str, optional):
            The column containing the target timezone value. By default will be the column `#!py "timezone_location"`.<br>
            Defaults to `#!py "timezone_location".upper()`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the `#!py columns` do not exist within `#!py dataframe.columns`.
        ValueError:
            If the `from_timezone` or `column_with_target_timezone` is not a valid timezone.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import add_local_datetime_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": pd.date_range(start="2022-01-01", periods=4, freq="D"),
        ...             "d_datetime": pd.date_range(start="2022-02-01", periods=4, freq="D"),
        ...             "e_datetime": pd.date_range(start="2022-03-01", periods=4, freq="D"),
        ...             "target": ["Asia/Singapore"] * 4,
        ...             "TIMEZONE_LOCATION": ["Australia/Perth"] * 4,
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Default config"}
        >>> add_local_datetime_columns(df).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | d_datetime_LOCAL    | e_datetime_LOCAL    |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-01 08:00:00 | 2022-03-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-02 08:00:00 | 2022-03-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-03 08:00:00 | 2022-03-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-04 08:00:00 | 2022-03-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted columns to local timezone."
        </div>

        ```{.py .python linenums="1" title="Example 2: Semi-custom config"}
        >>> add_local_datetime_columns(df, ["c", "d_datetime"]).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | c_LOCAL             | d_datetime_LOCAL    |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 08:00:00 | 2022-02-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 08:00:00 | 2022-02-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 08:00:00 | 2022-02-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-04 08:00:00 | 2022-02-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted columns to local timezone."
        </div>

        ```{.py .python linenums="1" title="Example 3: Full-custom config"}
        >>> add_local_datetime_columns(
        ...     dataframe=df,
        ...     columns=["c", "d_datetime"],
        ...     from_timezone="Australia/Sydney",
        ...     column_with_target_timezone="target",
        ... ).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | c_UTC               | c_LOCAL             | d_datetime_UTC      | d_datetime_LOCAL    |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2021-12-31 13:00:00 | 2021-12-31 21:00:00 | 2022-01-31 13:00:00 | 2022-02-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 13:00:00 | 2022-01-01 21:00:00 | 2022-02-01 13:00:00 | 2022-02-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 13:00:00 | 2022-01-02 21:00:00 | 2022-02-02 13:00:00 | 2022-02-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 13:00:00 | 2022-01-03 21:00:00 | 2022-02-03 13:00:00 | 2022-02-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted columns to local time zone, from other custom time zone."
        </div>

        ```{.py .python linenums="1" title="Example 4: Single column"}
        >>> add_local_datetime_columns(
        ...     dataframe=df,
        ...     columns="c",
        ...     from_timezone="Australia/Sydney",
        ...     column_with_target_timezone="target",
        ... ).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | c_UTC               | c_LOCAL             |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2021-12-31 13:00:00 | 2021-12-31 21:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 13:00:00 | 2022-01-01 21:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 13:00:00 | 2022-01-02 21:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 13:00:00 | 2022-01-03 21:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted single column from other time zone to local time zone."
        </div>

        ```{.py .python linenums="1" title="Example 5: All columns"}
        >>> add_local_datetime_columns(
        ...     dataframe=df,
        ...     columns="all",
        ...     from_timezone="Australia/Sydney",
        ...     column_with_target_timezone="target",
        ... ).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | d_datetime_UTC      | d_datetime_LOCAL    | e_datetime_UTC      | e_datetime_LOCAL    |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-31 13:00:00 | 2022-02-01 08:00:00 | 2022-02-28 13:00:00 | 2022-03-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-01 13:00:00 | 2022-02-02 08:00:00 | 2022-03-01 13:00:00 | 2022-03-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-02 13:00:00 | 2022-02-03 08:00:00 | 2022-03-02 13:00:00 | 2022-03-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-03 13:00:00 | 2022-02-04 08:00:00 | 2022-03-03 13:00:00 | 2022-03-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted all date time columns from other time zone to local time zone."
        </div>

        ```{.py .python linenums="1" title="Example 6: Invalid column name"}
        >>> add_local_datetime_columns(df, "invalid_column")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe".
        Try one of: ["a", "b", "c", "d_datetime", "e_datetime", "target", "TIMEZONE_LOCATION"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 7: Invalid timezone"}
        >>> add_local_datetime_columns(df, "c", from_timezone="Invalid/Timezone")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ValueError: The timezone "Invalid/Timezone" is not a valid timezone.
        ```
        !!! failure "Conclusion: Invalid timezone."
        </div>

    ??? tip "See Also"
        - [`add_local_datetime_column()`][toolbox_pyspark.datetime.add_local_datetime_column]
    """
    if columns is None or columns == "all":
        columns = [col for col in dataframe.columns if col.lower().endswith("datetime")]
    elif is_type(columns, str):
        columns = [columns]
    assert_columns_exists(dataframe, list(columns) + [column_with_target_timezone])
    for column in columns:
        dataframe = add_local_datetime_column(
            dataframe=dataframe,
            column=column,
            from_timezone=from_timezone,
            column_with_target_timezone=column_with_target_timezone,
        )
    return dataframe


# ---------------------------------------------------------------------------- #
#  Splitting                                                                ####
# ---------------------------------------------------------------------------- #

@typechecked
def split_datetime_column(
    dataframe: psDataFrame,
    column: str,
) -> psDataFrame:
    """
    !!! note "Summary"
        Take the column `column`, which should be a `timestamp` type, and split it into its respective `date` and `time` components.

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        column (str):
            The column to split.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If the `#!py column` does not exist within `#!py dataframe.columns`.
        TypeError:
            If the `column` is not type `timestamp` or `datetime`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import split_datetime_column
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c_datetime": pd.date_range(start="2022-01-01", periods=4, freq="h"),
        ...             "d_datetime": pd.date_range(start="2022-02-01", periods=4, freq="h"),
        ...             "e_datetime": pd.date_range(start="2022-03-01", periods=4, freq="h"),
        ...             "TIMEZONE_LOCATION": ["Australia/Perth"] * 4,
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION |
        +---+---+---------------------+---------------------+---------------------+-------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   |
        +---+---+---------------------+---------------------+---------------------+-------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Default config"}
        >>> split_datetime_column(df, "c_datetime").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split the column into its Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 2: Invalid column name"}
        >>> split_datetime_column(df, "invalid_column")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe".
        Try one of: ["a", "b", "c_datetime", "d_datetime", "e_datetime", "TIMEZONE_LOCATION"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 3: Invalid column type"}
        >>> split_datetime_column(df, "b")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        TypeError: Column must be type 'timestamp' or 'datetime'.
        Current type: [('b', 'string')]
        ```
        !!! failure "Conclusion: Column is not the correct type for splitting."
        </div>

    ??? tip "See Also"
        - [`split_datetime_columns()`][toolbox_pyspark.datetime.split_datetime_columns]
    """
    assert_column_exists(dataframe, column)
    datetime_cols: str_list = get_columns(dataframe, "all_datetime")
    if not is_in(column, datetime_cols):
        raise TypeError(
            "Column must be type 'timestamp' or 'datetime'.\n"
            f"Current type: {[(col, typ) for col, typ in dataframe.dtypes if col == column]}"
        )
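    # Derive the new column names from the original, e.g. "c_datetime" becomes
    # "C_DATE" and "C_TIME".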
    col_date_name: str = column.upper().replace("DATETIME", "DATE")
    col_time_name: str = column.upper().replace("DATETIME", "TIME")
    col_date_value: Column = F.date_format(column, "yyyy-MM-dd").cast("string").cast("date")
    col_time_value: Column = F.date_format(column, "HH:mm:ss").cast("string")
    return dataframe.withColumns(
        {
            col_date_name: col_date_value,
            col_time_name: col_time_value,
        }
    )


@typechecked
def split_datetime_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str, str_collection]] = None,
) -> psDataFrame:
    """
    !!! note "Summary"
        For all the columns listed in `columns`, split each of them into their respective `date` and `time` components.

    ???+ abstract "Details"
        The reason why this function is structured this way, rather than re-calling [`split_datetime_column()`][toolbox_pyspark.datetime.split_datetime_column] for each element of `columns`, is due to `#!py pyspark` RDD complexity. More specifically, if it _were_ to call [`split_datetime_column()`][toolbox_pyspark.datetime.split_datetime_column] each time, the underlying lineage would become incredibly, and unnecessarily, complicated. By instead collecting all of the new expressions and applying them with a single [`.withColumns()`](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html) call, the SQL expression is projected down to the underlying dataframe **once**, not multiple times. The underlying SQL execution plan is therefore much simpler, albeit that the DRY coding principle is not strictly followed here. The pattern is sketched below.
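
        A minimal sketch of this single-projection pattern (hypothetical `df` with timestamp columns `c_datetime` and `d_datetime`; `F.to_date()` is used here for brevity, whereas the function itself uses `F.date_format()` casts):

        ```{.py .python linenums="1" title="Single projection"}
        >>> from pyspark.sql import functions as F
        >>> exprs = {
        ...     col.upper().replace("DATETIME", "DATE"): F.to_date(F.col(col))
        ...     for col in ["c_datetime", "d_datetime"]
        ... }
        >>> df = df.withColumns(exprs)  # one projection, not one per column
        ```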

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        columns (Optional[Union[str, str_collection]], optional):
            The list of columns to update. If not given, it will generate the list of columns from the `#!py dataframe.columns` which contain the text `datetime`.<br>
            Defaults to `#!py None`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the `#!py columns` do not exist within `#!py dataframe.columns`.
        TypeError:
            If any of the columns in `columns` are not type `timestamp` or `datetime`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import split_datetime_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c_datetime": pd.date_range(start="2022-01-01", periods=4, freq="h"),
        ...             "d_datetime": pd.date_range(start="2022-02-01", periods=4, freq="h"),
        ...             "e_datetime": pd.date_range(start="2022-03-01", periods=4, freq="h"),
        ...             "TIMEZONE_LOCATION": ["Australia/Perth"] * 4,
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION |
        +---+---+---------------------+---------------------+---------------------+-------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   |
        +---+---+---------------------+---------------------+---------------------+-------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Default config"}
        >>> split_datetime_columns(df).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   | D_DATE     | D_TIME   | E_DATE     | E_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 | 2022-02-01 | 00:00:00 | 2022-03-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 | 2022-02-01 | 01:00:00 | 2022-03-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 | 2022-02-01 | 02:00:00 | 2022-03-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 | 2022-02-01 | 03:00:00 | 2022-03-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split all DateTime columns into their Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 2: Custom config"}
        >>> split_datetime_columns(df, ["c_datetime", "d_datetime"]).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   | D_DATE     | D_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 | 2022-02-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 | 2022-02-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 | 2022-02-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 | 2022-02-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split two columns into their Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 3: All columns"}
        >>> split_datetime_columns(df, "all").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   | D_DATE     | D_TIME   | E_DATE     | E_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 | 2022-02-01 | 00:00:00 | 2022-03-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 | 2022-02-01 | 01:00:00 | 2022-03-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 | 2022-02-01 | 02:00:00 | 2022-03-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 | 2022-02-01 | 03:00:00 | 2022-03-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split all DateTime columns into their Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 4: Single column"}
        >>> split_datetime_columns(df, "c_datetime").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split a single column into its Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 5: Invalid column name"}
        >>> split_datetime_columns(df, "invalid_column")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe".
        Try one of: ["a", "b", "c_datetime", "d_datetime", "e_datetime", "TIMEZONE_LOCATION"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 6: Invalid column type"}
        >>> split_datetime_columns(df, "b")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        TypeError: Column must be type 'timestamp' or 'datetime'.
        Current type: [('b', 'string')]
        ```
        !!! failure "Conclusion: Column is not the correct type for splitting."
        </div>

    ??? tip "See Also"
        - [`split_datetime_column()`][toolbox_pyspark.datetime.split_datetime_column]
    """
    if columns is None or columns == "all":
        columns = [col for col in dataframe.columns if "datetime" in col.lower()]
    elif is_type(columns, str):
        columns = [columns]
    assert_columns_exists(dataframe, columns)
    datetime_cols: str_list = get_columns(dataframe, "all_datetime")
    if not is_all_in(columns, datetime_cols):
        raise TypeError(
            "Columns to split must be type 'timestamp' or 'datetime'.\n"
            f"Current types: {[(col, typ) for col, typ in dataframe.dtypes if col in columns]}"
        )
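    # Build every date/time expression first, then apply them all in a single
    # `.withColumns()` projection (see the Details note in the docstring).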
    cols_exprs: dict[str, Column] = {}
    for column in columns:
        col_date_name: str = column.upper().replace("DATETIME", "DATE")
        col_time_name: str = column.upper().replace("DATETIME", "TIME")
        col_date_value: Column = (
            F.date_format(column, "yyyy-MM-dd").cast("string").cast("date")
        )
        col_time_value: Column = F.date_format(column, "HH:mm:ss").cast("string")
        cols_exprs[col_date_name] = col_date_value
        cols_exprs[col_time_name] = col_time_value
    return dataframe.withColumns(cols_exprs)