Coverage for src/toolbox_pyspark/datetime.py: 100%
78 statements
coverage.py v7.6.10, created at 2025-01-25 23:08 +0000
# ============================================================================ #
#                                                                              #
#     Title   : Datetime                                                       #
#     Purpose : Fixing column names that contain datetime data, adding        #
#               conversions to local datetimes, and splitting a column into   #
#               its date and time components.                                 #
#                                                                              #
# ============================================================================ #


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Overview                                                              ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#  Description                                                              ####
# ---------------------------------------------------------------------------- #


"""
!!! note "Summary"
    The `datetime` module is used for fixing column names that contain datetime data, adding conversions to local datetimes, and splitting a column into its date and time components.
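
A minimal end-to-end sketch (assuming a running Spark session and a `df` holding a `c_date` timestamp column):

```{.py .python linenums="1" title="Quick start"}
>>> from toolbox_pyspark.datetime import rename_datetime_columns, split_datetime_columns
>>> df = rename_datetime_columns(df, "all")  # "c_date" becomes "c_dateTIME"
>>> df = split_datetime_columns(df, "all")   # adds "C_DATE" and "C_TIME" columns
```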
26"""
29# ---------------------------------------------------------------------------- #
30# #
31# Setup ####
32# #
33# ---------------------------------------------------------------------------- #
36# ---------------------------------------------------------------------------- #
37# Imports ####
38# ---------------------------------------------------------------------------- #
41# ## Python StdLib Imports ----
42from typing import Optional, Union
44# ## Python Third Party Imports ----
45from pyspark.sql import Column, DataFrame as psDataFrame, functions as F
46from toolbox_python.checkers import is_all_in, is_in, is_type
47from toolbox_python.collection_types import str_collection, str_list
48from typeguard import typechecked
50# ## Local First Party Imports ----
51from toolbox_pyspark.checks import assert_column_exists, assert_columns_exists
52from toolbox_pyspark.columns import get_columns


# ---------------------------------------------------------------------------- #
#  Exports                                                                  ####
# ---------------------------------------------------------------------------- #


__all__: str_list = [
    "rename_datetime_column",
    "rename_datetime_columns",
    "add_local_datetime_column",
    "add_local_datetime_columns",
    "split_datetime_column",
    "split_datetime_columns",
]


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Functions                                                             ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#  Renaming                                                                 ####
# ---------------------------------------------------------------------------- #

@typechecked
def rename_datetime_column(
    dataframe: psDataFrame,
    column: str,
) -> psDataFrame:
    """
    !!! note "Summary"
        For a given `column` in a DataFrame, if there is no existing column with the same name plus a `TIME` suffix, rename the column to append `TIME` to it.

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        column (str):
            The column to check.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If the `#!py column` does not exist within `#!py dataframe.columns`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import rename_datetime_column
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c_date": pd.date_range(start="2022-01-01", periods=4, freq="h"),
        ...             "d_date": pd.date_range(start="2022-02-01", periods=4, freq="h"),
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_date              | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Update column"}
        >>> rename_datetime_column(df, "c_date").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed column."
        </div>

        ```{.py .python linenums="1" title="Example 2: Missing column"}
        >>> rename_datetime_column(df, "fff")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "fff" does not exist in "dataframe".
        Try one of: ["a", "b", "c_date", "d_date"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

    ??? tip "See Also"
        - [`assert_column_exists()`][toolbox_pyspark.checks.assert_column_exists]
        - [`rename_datetime_columns()`][toolbox_pyspark.datetime.rename_datetime_columns]
    """
    assert_column_exists(dataframe=dataframe, column=column, match_case=True)
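    # Only rename when the `TIME`-suffixed name is not already taken; this
    # keeps the function idempotent when called multiple times.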
179 if f"{column}TIME" not in dataframe.columns:
180 return dataframe.withColumnRenamed(column, f"{column}TIME")
181 else:
182 return dataframe


@typechecked
def rename_datetime_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str_collection, str]] = None,
) -> psDataFrame:
    """
    !!! note "Summary"
        Fix the column names for the date-time columns.

    ???+ abstract "Details"
        This is necessary because, in NGW, there are some columns which hold `datetime` data but whose names only contain `date`. This function fixes those names, using the detection rule sketched below.
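
        A minimal sketch of the detection rule (hypothetical `df`; in the function itself, the candidate columns come from `get_columns()`):

        ```{.py .python linenums="1" title="Detection rule"}
        >>> candidates = [
        ...     col
        ...     for col, typ in df.dtypes
        ...     if typ == "timestamp"
        ...     and col.lower().endswith("date")
        ...     and f"{col}TIME" not in df.columns
        ... ]
        ```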

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        columns (Optional[Union[str_collection, str]], optional):
            An optional list of columns to update.
            If this is not provided, or is the value `#!py None` or `#!py "all"`, then the function will automatically determine which columns to update based on the following logic:

            1. Loop through each column on `dataframe` to fetch the name and dtype, using the method: `dataframe.dtypes`.
                1. If the column name ends with `#!py "date"`,
                2. **AND** the column type is `#!py "timestamp"`,
                3. **AND** there is **NOT** already a column in `dataframe.columns` with the name: `f"{column}TIME"`,
                4. **THEN** rename the column to have the name: `f"{column}TIME"`.
            2. Next column.

            Default: `None`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the `#!py columns` do not exist within `#!py dataframe.columns`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import rename_datetime_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c_date": pd.date_range(start="2022-01-01", periods=4, freq="h"),
        ...             "d_date": pd.date_range(start="2022-02-01", periods=4, freq="h"),
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_date              | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: One column"}
        >>> rename_datetime_columns(df, ["c_date"]).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed column."
        </div>

        ```{.py .python linenums="1" title="Example 2: One column `str`"}
        >>> rename_datetime_columns(df, "c_date").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_date              |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed column."
        </div>

        ```{.py .python linenums="1" title="Example 3: All columns"}
        >>> rename_datetime_columns(df).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_dateTIME          |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed columns."
        </div>

        ```{.py .python linenums="1" title="Example 4: All columns using 'all'"}
        >>> rename_datetime_columns(df, "all").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+
        | a | b | c_dateTIME          | d_dateTIME          |
        +---+---+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 |
        +---+---+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully renamed columns."
        </div>

        ```{.py .python linenums="1" title="Example 5: Missing columns"}
        >>> rename_datetime_columns(df, ["fff", "ggg"])
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Columns ["fff", "ggg"] do not exist in "dataframe".
        Try one of: ["a", "b", "c_date", "d_date"].
        ```
        !!! failure "Conclusion: Columns do not exist."
        </div>

    ??? tip "See Also"
        - [`assert_columns_exists()`][toolbox_pyspark.checks.assert_columns_exists]
        - [`rename_datetime_column()`][toolbox_pyspark.datetime.rename_datetime_column]
    """
    if columns is None or columns == "all":
        columns = [
            col
            for col in get_columns(dataframe, "all_datetime")
            if col.lower().endswith("date") and f"{col}TIME" not in dataframe.columns
        ]
    elif is_type(columns, str):
        columns = [columns]
    assert_columns_exists(dataframe, columns, True)
    for column in columns:
        dataframe = rename_datetime_column(dataframe, column)
    return dataframe


# ---------------------------------------------------------------------------- #
#  Add Locals                                                               ####
# ---------------------------------------------------------------------------- #

@typechecked
def add_local_datetime_column(
    dataframe: psDataFrame,
    column: str,
    from_timezone: Optional[str] = None,
    column_with_target_timezone: str = "timezone_location".upper(),
) -> psDataFrame:
    """
    !!! note "Summary"
        For the given `column`, add a new column with the suffix `_LOCAL`, containing the datetime values of `column` converted to the target timezone.

    Params:
        dataframe (psDataFrame):
            The DataFrame to be fixed.
        column (str):
            The name of the column to do the conversion for. Must exist in `#!py dataframe.columns`, and must be type `timestamp`.
        from_timezone (str, optional):
            The timezone which will be converted from. Must be a valid TimeZoneID; for more info, see: [TimeZoneID](https://docs.oracle.com/middleware/12211/wcs/tag-ref/MISC/TimeZones.html).<br>
            If not given, the `from_timezone` will default to UTC.<br>
            Default: `#!py None`.
        column_with_target_timezone (str, optional):
            The column containing the target timezone value. By default will be the column `#!py "timezone_location"`.<br>
            Defaults to `#!py "timezone_location".upper()`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If `#!py column` or `#!py column_with_target_timezone` does not exist within `#!py dataframe.columns`.
        ValueError:
            If the `from_timezone` or `column_with_target_timezone` is not a valid timezone.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import add_local_datetime_column
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": pd.date_range(start="2022-01-01", periods=4, freq="D"),
        ...             "d": pd.date_range(start="2022-02-01", periods=4, freq="D"),
        ...             "e": pd.date_range(start="2022-03-01", periods=4, freq="D"),
        ...             "target": ["Asia/Singapore"] * 4,
        ...             "TIMEZONE_LOCATION": ["Australia/Perth"] * 4,
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        | a | b | c                   | d                   | e                   | target         | TIMEZONE_LOCATION |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Converting from UTC time"}
        >>> add_local_datetime_column(df, "c").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+
        | a | b | c                   | d                   | e                   | target         | TIMEZONE_LOCATION | c_LOCAL             |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted from UTC."
        </div>

        ```{.py .python linenums="1" title="Example 2: Converting from specific timezone, with custom column containing target timezone"}
        >>> add_local_datetime_column(
        ...     dataframe=df,
        ...     column="c",
        ...     from_timezone="Australia/Sydney",
        ...     column_with_target_timezone="target",
        ... ).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | a | b | c                   | d                   | e                   | target         | TIMEZONE_LOCATION | c_UTC               | c_LOCAL             |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2021-12-31 13:00:00 | 2021-12-31 21:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 13:00:00 | 2022-01-01 21:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 13:00:00 | 2022-01-02 21:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 13:00:00 | 2022-01-03 21:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted timezone."
        </div>

        ```{.py .python linenums="1" title="Example 3: Invalid column name"}
        >>> add_local_datetime_column(df, "invalid_column")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe".
        Try one of: ["a", "b", "c", "d", "e", "target", "TIMEZONE_LOCATION"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 4: Invalid timezone"}
        >>> add_local_datetime_column(df, "c", from_timezone="Invalid/Timezone")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ValueError: The timezone "Invalid/Timezone" is not a valid timezone.
        ```
        !!! failure "Conclusion: Invalid timezone."
        </div>

    ??? info "Notes"
        - If `#!py from_timezone is None`, then the datetime data in `column` is assumed to _already_ be in the UTC timezone.<br>
        - If `#!py from_timezone is not None`, then a new column will be added with the name `#!py {column}_UTC`, and then another column added with the name `#!py {column}_LOCAL`. This is necessary because PySpark cannot convert directly from one arbitrary timezone to another; it must first convert from the `from_timezone` value _to_ UTC, then convert a second time _from_ UTC to whichever timezone is defined in the column `column_with_target_timezone`.<br>
        - The reason why this function uses multiple [`.withColumn()`][withColumn] methods, instead of a single [`.withColumns()`][withColumns] expression, is that the `#!py {column}_UTC` column must already exist on the `dataframe` before the `#!py {column}_LOCAL` column can be derived from it. Therefore, [`.withColumn()`][withColumn] is called once to add `#!py {column}_UTC`, then a second time to add `#!py {column}_LOCAL`.

        [withColumn]: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumn.html
        [withColumns]: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html
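
        A minimal sketch of the two-step conversion in raw PySpark (hypothetical `df` with a `ts` timestamp column and a `tz` column holding target timezones):

        ```{.py .python linenums="1" title="Two-step conversion"}
        >>> from pyspark.sql import functions as F
        >>> df = df.withColumn("ts_UTC", F.to_utc_timestamp(F.col("ts"), "Australia/Sydney"))
        >>> df = df.withColumn("ts_LOCAL", F.from_utc_timestamp(F.col("ts_UTC"), F.col("tz")))
        ```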

    ??? tip "See Also"
        - [`assert_columns_exists()`][toolbox_pyspark.checks.assert_columns_exists]
        - [`add_local_datetime_columns()`][toolbox_pyspark.datetime.add_local_datetime_columns]
    """
    assert_columns_exists(dataframe=dataframe, columns=[column, column_with_target_timezone])
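    # Idempotency guards: only add the `_UTC` / `_LOCAL` columns if they are
    # not already present on the dataframe.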
    require_utc: bool = f"{column}_UTC" not in dataframe.columns
    require_local: bool = f"{column}_LOCAL" not in dataframe.columns
    if from_timezone is not None:
        if require_utc:
            dataframe = dataframe.withColumn(
                f"{column}_UTC",
                F.to_utc_timestamp(
                    F.col(column).cast("timestamp"),
                    from_timezone.title(),
                ),
            )
        if require_local:
            dataframe = dataframe.withColumn(
                f"{column}_LOCAL",
                F.from_utc_timestamp(
                    F.col(f"{column}_UTC").cast("timestamp"),
                    F.col(column_with_target_timezone),
                ),
            )
    else:
        if require_local:
            dataframe = dataframe.withColumn(
                f"{column}_LOCAL",
                F.from_utc_timestamp(
                    F.col(column).cast("timestamp"),
                    F.col(column_with_target_timezone),
                ),
            )
    return dataframe


@typechecked
def add_local_datetime_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str, str_collection]] = None,
    from_timezone: Optional[str] = None,
    column_with_target_timezone: str = "timezone_location".upper(),
) -> psDataFrame:
    """
    !!! note "Summary"
        For each of the `date` or `datetime` columns in `dataframe`, add a new column which converts it to the timezone of the local datetime.

    ???+ abstract "Details"
        Under the hood, this function will call [`add_local_datetime_column()`][toolbox_pyspark.datetime.add_local_datetime_column] for each `column` in `columns`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        columns (Optional[Union[str, str_collection]], optional):
            The columns to check. If not provided, it will use all of the columns whose names end with the text `#!py "datetime"`.<br>
            Defaults to `#!py None`.
        from_timezone (Optional[str], optional):
            The timezone which will be converted from. If not given, the from-timezone will default to UTC.<br>
            Defaults to `#!py None`.
        column_with_target_timezone (str, optional):
            The column containing the target timezone value. By default will be the column `#!py "timezone_location"`.<br>
            Defaults to `#!py "timezone_location".upper()`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the `#!py columns` do not exist within `#!py dataframe.columns`.
        ValueError:
            If the `from_timezone` or `column_with_target_timezone` is not a valid timezone.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import add_local_datetime_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": pd.date_range(start="2022-01-01", periods=4, freq="D"),
        ...             "d_datetime": pd.date_range(start="2022-02-01", periods=4, freq="D"),
        ...             "e_datetime": pd.date_range(start="2022-03-01", periods=4, freq="D"),
        ...             "target": ["Asia/Singapore"] * 4,
        ...             "TIMEZONE_LOCATION": ["Australia/Perth"] * 4,
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Default config"}
        >>> add_local_datetime_columns(df).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | d_datetime_LOCAL    | e_datetime_LOCAL    |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-01 08:00:00 | 2022-03-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-02 08:00:00 | 2022-03-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-03 08:00:00 | 2022-03-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-04 08:00:00 | 2022-03-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted columns to local timezone."
        </div>

        ```{.py .python linenums="1" title="Example 2: Semi-custom config"}
        >>> add_local_datetime_columns(df, ["c", "d_datetime"]).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | c_LOCAL             | d_datetime_LOCAL    |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 08:00:00 | 2022-02-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 08:00:00 | 2022-02-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 08:00:00 | 2022-02-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-04 08:00:00 | 2022-02-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted columns to local timezone."
        </div>

        ```{.py .python linenums="1" title="Example 3: Full-custom config"}
        >>> add_local_datetime_columns(
        ...     dataframe=df,
        ...     columns=["c", "d_datetime"],
        ...     from_timezone="Australia/Sydney",
        ...     column_with_target_timezone="target",
        ... ).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | c_UTC               | c_LOCAL             | d_datetime_UTC      | d_datetime_LOCAL    |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2021-12-31 13:00:00 | 2021-12-31 21:00:00 | 2022-01-31 13:00:00 | 2022-02-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 13:00:00 | 2022-01-01 21:00:00 | 2022-02-01 13:00:00 | 2022-02-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 13:00:00 | 2022-01-02 21:00:00 | 2022-02-02 13:00:00 | 2022-02-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 13:00:00 | 2022-01-03 21:00:00 | 2022-02-03 13:00:00 | 2022-02-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted columns to local time zone, from other custom time zone."
        </div>

        ```{.py .python linenums="1" title="Example 4: Single column"}
        >>> add_local_datetime_columns(
        ...     dataframe=df,
        ...     columns="c",
        ...     from_timezone="Australia/Sydney",
        ...     column_with_target_timezone="target",
        ... ).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | c_UTC               | c_LOCAL             |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2021-12-31 13:00:00 | 2021-12-31 21:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-01 13:00:00 | 2022-01-01 21:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-02 13:00:00 | 2022-01-02 21:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-03 13:00:00 | 2022-01-03 21:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted single column from other time zone to local time zone."
        </div>

        ```{.py .python linenums="1" title="Example 5: All columns"}
        >>> add_local_datetime_columns(
        ...     dataframe=df,
        ...     columns="all",
        ...     from_timezone="Australia/Sydney",
        ...     column_with_target_timezone="target",
        ... ).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        | a | b | c                   | d_datetime          | e_datetime          | target         | TIMEZONE_LOCATION | d_datetime_UTC      | d_datetime_LOCAL    | e_datetime_UTC      | e_datetime_LOCAL    |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-01-31 13:00:00 | 2022-02-01 08:00:00 | 2022-02-28 13:00:00 | 2022-03-01 08:00:00 |
        | 2 | b | 2022-01-02 00:00:00 | 2022-02-02 00:00:00 | 2022-03-02 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-01 13:00:00 | 2022-02-02 08:00:00 | 2022-03-01 13:00:00 | 2022-03-02 08:00:00 |
        | 3 | c | 2022-01-03 00:00:00 | 2022-02-03 00:00:00 | 2022-03-03 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-02 13:00:00 | 2022-02-03 08:00:00 | 2022-03-02 13:00:00 | 2022-03-03 08:00:00 |
        | 4 | d | 2022-01-04 00:00:00 | 2022-02-04 00:00:00 | 2022-03-04 00:00:00 | Asia/Singapore | Australia/Perth   | 2022-02-03 13:00:00 | 2022-02-04 08:00:00 | 2022-03-03 13:00:00 | 2022-03-04 08:00:00 |
        +---+---+---------------------+---------------------+---------------------+----------------+-------------------+---------------------+---------------------+---------------------+---------------------+
        ```
        !!! success "Conclusion: Successfully converted all date time columns from other time zone to local time zone."
        </div>

        ```{.py .python linenums="1" title="Example 6: Invalid column name"}
        >>> add_local_datetime_columns(df, "invalid_column")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe".
        Try one of: ["a", "b", "c", "d_datetime", "e_datetime", "target", "TIMEZONE_LOCATION"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 7: Invalid timezone"}
        >>> add_local_datetime_columns(df, "c", from_timezone="Invalid/Timezone")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ValueError: The timezone "Invalid/Timezone" is not a valid timezone.
        ```
        !!! failure "Conclusion: Invalid timezone."
        </div>

    ??? tip "See Also"
        - [`add_local_datetime_column()`][toolbox_pyspark.datetime.add_local_datetime_column]
    """
    if columns is None or columns == "all":
        columns = [col for col in dataframe.columns if col.lower().endswith("datetime")]
    elif is_type(columns, str):
        columns = [columns]
    assert_columns_exists(dataframe, list(columns) + [column_with_target_timezone])
    for column in columns:
        dataframe = add_local_datetime_column(
            dataframe=dataframe,
            column=column,
            from_timezone=from_timezone,
            column_with_target_timezone=column_with_target_timezone,
        )
    return dataframe


# ---------------------------------------------------------------------------- #
#  Splitting                                                                ####
# ---------------------------------------------------------------------------- #

@typechecked
def split_datetime_column(
    dataframe: psDataFrame,
    column: str,
) -> psDataFrame:
    """
    !!! note "Summary"
        Take the column `column`, which should be a `timestamp` type, and split it into its respective `date` and `time` components.

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        column (str):
            The column to split.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If the `#!py column` does not exist within `#!py dataframe.columns`.
        TypeError:
            If the `column` is not type `timestamp` or `datetime`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import split_datetime_column
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c_datetime": pd.date_range(start="2022-01-01", periods=4, freq="h"),
        ...             "d_datetime": pd.date_range(start="2022-02-01", periods=4, freq="h"),
        ...             "e_datetime": pd.date_range(start="2022-03-01", periods=4, freq="h"),
        ...             "TIMEZONE_LOCATION": ["Australia/Perth"] * 4,
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION |
        +---+---+---------------------+---------------------+---------------------+-------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   |
        +---+---+---------------------+---------------------+---------------------+-------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Default config"}
        >>> split_datetime_column(df, "c_datetime").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split the column into its Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 2: Invalid column name"}
        >>> split_datetime_column(df, "invalid_column")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe".
        Try one of: ["a", "b", "c_datetime", "d_datetime", "e_datetime", "TIMEZONE_LOCATION"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 3: Invalid column type"}
        >>> split_datetime_column(df, "b")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        TypeError: Column must be type 'timestamp' or 'datetime'.
        Current type: [('b', 'string')]
        ```
        !!! failure "Conclusion: Column is not the correct type for splitting."
        </div>

    ??? tip "See Also"
        - [`split_datetime_columns()`][toolbox_pyspark.datetime.split_datetime_columns]
    """
    assert_column_exists(dataframe, column)
    datetime_cols: str_list = get_columns(dataframe, "all_datetime")
    if not is_in(column, datetime_cols):
        raise TypeError(
            "Column must be type 'timestamp' or 'datetime'.\n"
            f"Current type: {[(col, typ) for col, typ in dataframe.dtypes if col == column]}"
        )
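    # Derive the new column names from the original, e.g. "c_datetime" becomes
    # "C_DATE" and "C_TIME".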
    col_date_name: str = column.upper().replace("DATETIME", "DATE")
    col_time_name: str = column.upper().replace("DATETIME", "TIME")
    col_date_value: Column = F.date_format(column, "yyyy-MM-dd").cast("string").cast("date")
    col_time_value: Column = F.date_format(column, "HH:mm:ss").cast("string")
    return dataframe.withColumns(
        {
            col_date_name: col_date_value,
            col_time_name: col_time_value,
        }
    )


@typechecked
def split_datetime_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str, str_collection]] = None,
) -> psDataFrame:
    """
    !!! note "Summary"
        For all the columns listed in `columns`, split each of them into their respective `date` and `time` components.

    ???+ abstract "Details"
        The reason why this function is structured this way, rather than re-calling [`split_datetime_column()`][toolbox_pyspark.datetime.split_datetime_column] for each element of `columns`, is due to `#!py pyspark` RDD complexity. More specifically, if it _were_ to call [`split_datetime_column()`][toolbox_pyspark.datetime.split_datetime_column] each time, the underlying lineage would become incredibly, and unnecessarily, complicated. By instead collecting all of the new expressions and applying them with a single [`.withColumns()`](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html) call, the SQL expression is projected down to the underlying dataframe **once**, not multiple times. The underlying SQL execution plan is therefore much simpler, albeit that the DRY coding principle is not strictly followed here. The pattern is sketched below.
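
        A minimal sketch of this single-projection pattern (hypothetical `df` with timestamp columns `c_datetime` and `d_datetime`; `F.to_date()` is used here for brevity, whereas the function itself uses `F.date_format()` casts):

        ```{.py .python linenums="1" title="Single projection"}
        >>> from pyspark.sql import functions as F
        >>> exprs = {
        ...     col.upper().replace("DATETIME", "DATE"): F.to_date(F.col(col))
        ...     for col in ["c_datetime", "d_datetime"]
        ... }
        >>> df = df.withColumns(exprs)  # one projection, not one per column
        ```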

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        columns (Optional[Union[str, str_collection]], optional):
            The list of columns to update. If not given, it will generate the list of columns from the `#!py dataframe.columns` which contain the text `datetime`.<br>
            Defaults to `#!py None`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the `#!py columns` do not exist within `#!py dataframe.columns`.
        TypeError:
            If any of the columns in `columns` are not type `timestamp` or `datetime`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.datetime import split_datetime_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c_datetime": pd.date_range(start="2022-01-01", periods=4, freq="h"),
        ...             "d_datetime": pd.date_range(start="2022-02-01", periods=4, freq="h"),
        ...             "e_datetime": pd.date_range(start="2022-03-01", periods=4, freq="h"),
        ...             "TIMEZONE_LOCATION": ["Australia/Perth"] * 4,
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION |
        +---+---+---------------------+---------------------+---------------------+-------------------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   |
        +---+---+---------------------+---------------------+---------------------+-------------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Default config"}
        >>> split_datetime_columns(df).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   | D_DATE     | D_TIME   | E_DATE     | E_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 | 2022-02-01 | 00:00:00 | 2022-03-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 | 2022-02-01 | 01:00:00 | 2022-03-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 | 2022-02-01 | 02:00:00 | 2022-03-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 | 2022-02-01 | 03:00:00 | 2022-03-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split all DateTime columns into their Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 2: Custom config"}
        >>> split_datetime_columns(df, ["c_datetime", "d_datetime"]).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   | D_DATE     | D_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 | 2022-02-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 | 2022-02-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 | 2022-02-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 | 2022-02-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split two columns into their Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 3: All columns"}
        >>> split_datetime_columns(df, "all").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   | D_DATE     | D_TIME   | E_DATE     | E_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 | 2022-02-01 | 00:00:00 | 2022-03-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 | 2022-02-01 | 01:00:00 | 2022-03-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 | 2022-02-01 | 02:00:00 | 2022-03-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 | 2022-02-01 | 03:00:00 | 2022-03-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+------------+----------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split all DateTime columns into their Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 4: Single column"}
        >>> split_datetime_columns(df, "c_datetime").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        | a | b | c_datetime          | d_datetime          | e_datetime          | TIMEZONE_LOCATION | C_DATE     | C_TIME   |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        | 1 | a | 2022-01-01 00:00:00 | 2022-02-01 00:00:00 | 2022-03-01 00:00:00 | Australia/Perth   | 2022-01-01 | 00:00:00 |
        | 2 | b | 2022-01-01 01:00:00 | 2022-02-01 01:00:00 | 2022-03-01 01:00:00 | Australia/Perth   | 2022-01-01 | 01:00:00 |
        | 3 | c | 2022-01-01 02:00:00 | 2022-02-01 02:00:00 | 2022-03-01 02:00:00 | Australia/Perth   | 2022-01-01 | 02:00:00 |
        | 4 | d | 2022-01-01 03:00:00 | 2022-02-01 03:00:00 | 2022-03-01 03:00:00 | Australia/Perth   | 2022-01-01 | 03:00:00 |
        +---+---+---------------------+---------------------+---------------------+-------------------+------------+----------+
        ```
        !!! success "Conclusion: Successfully split a single column into its Date and Time constituents."
        </div>

        ```{.py .python linenums="1" title="Example 5: Invalid column name"}
        >>> split_datetime_columns(df, "invalid_column")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column "invalid_column" does not exist in "dataframe".
        Try one of: ["a", "b", "c_datetime", "d_datetime", "e_datetime", "TIMEZONE_LOCATION"].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 6: Invalid column type"}
        >>> split_datetime_columns(df, "b")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        TypeError: Column must be type 'timestamp' or 'datetime'.
        Current type: [('b', 'string')]
        ```
        !!! failure "Conclusion: Column is not the correct type for splitting."
        </div>

    ??? tip "See Also"
        - [`split_datetime_column()`][toolbox_pyspark.datetime.split_datetime_column]
    """
    if columns is None or columns == "all":
        columns = [col for col in dataframe.columns if "datetime" in col.lower()]
    elif is_type(columns, str):
        columns = [columns]
    assert_columns_exists(dataframe, columns)
    datetime_cols: str_list = get_columns(dataframe, "all_datetime")
    if not is_all_in(columns, datetime_cols):
        raise TypeError(
            "Columns to split must be type 'timestamp' or 'datetime'.\n"
            f"Current types: {[(col, typ) for col, typ in dataframe.dtypes if col in columns]}"
        )
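    # Build every date/time expression first, then apply them all in a single
    # `.withColumns()` projection (see the Details note in the docstring).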
    cols_exprs: dict[str, Column] = {}
    for column in columns:
        col_date_name: str = column.upper().replace("DATETIME", "DATE")
        col_time_name: str = column.upper().replace("DATETIME", "TIME")
        col_date_value: Column = (
            F.date_format(column, "yyyy-MM-dd").cast("string").cast("date")
        )
        col_time_value: Column = F.date_format(column, "HH:mm:ss").cast("string")
        cols_exprs[col_date_name] = col_date_value
        cols_exprs[col_time_name] = col_time_value
    return dataframe.withColumns(cols_exprs)