Coverage for src/toolbox_pyspark/columns.py: 100%
94 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-25 23:08 +0000
1# ============================================================================ #
2# #
3# Title : Dataframe Cleaning #
4# Purpose : Fetch columns from a given DataFrame using convenient syntax. #
5# #
6# ============================================================================ #
9# ---------------------------------------------------------------------------- #
10# #
11# Overview ####
12# #
13# ---------------------------------------------------------------------------- #
16# ---------------------------------------------------------------------------- #
17# Description ####
18# ---------------------------------------------------------------------------- #
21"""
22!!! note "Summary"
23 The `columns` module is used to fetch columns from a given DataFrame using convenient syntax.
24"""
27# ---------------------------------------------------------------------------- #
28# #
29# Setup ####
30# #
31# ---------------------------------------------------------------------------- #
34# ---------------------------------------------------------------------------- #
35# Imports ####
36# ---------------------------------------------------------------------------- #
39# ## Python StdLib Imports ----
40from typing import Literal, Optional, Union
42# ## Python Third Party Imports ----
43from pyspark.sql import DataFrame as psDataFrame
44from toolbox_python.checkers import is_type
45from toolbox_python.collection_types import str_collection, str_list
46from typeguard import typechecked
48# ## Local First Party Imports ----
49from toolbox_pyspark.checks import (
50 assert_columns_exists,
51 warn_columns_missing,
52)
55# ---------------------------------------------------------------------------- #
56# Exports ####
57# ---------------------------------------------------------------------------- #
# Public API of this module: the only names re-exported by `from ... import *`.
# Keep this list in sync with the function definitions below.
__all__: str_list = [
    "get_columns",
    "get_columns_by_likeness",
    "rename_columns",
    "reorder_columns",
    "delete_columns",
]
69# ---------------------------------------------------------------------------- #
70# #
71# Functions ####
72# #
73# ---------------------------------------------------------------------------- #
76# ---------------------------------------------------------------------------- #
77# Selecting ####
78# ---------------------------------------------------------------------------- #
@typechecked
def get_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str, str_collection]] = None,
) -> str_list:
    """
    !!! note "Summary"
        Get a list of column names from a DataFrame based on optional filter criteria.

    Params:
        dataframe (psDataFrame):
            The DataFrame from which to retrieve column names.
        columns (Optional[Union[str, str_collection]], optional):
            Optional filter criteria for selecting columns.<br>
            If a string is provided, it can be one of the following options:

            | Value | Description |
            |-------|-------------|
            | `#!py "all"` | Return all columns in the DataFrame.
            | `#!py "all_str"` | Return columns of string type.
            | `#!py "all_int"` | Return columns of integer type.
            | `#!py "all_numeric"` | Return columns of numeric types (integers, floats, doubles, longs, decimals).
            | `#!py "all_float"`, `#!py "all_double"` or `#!py "all_decimal"` | Return columns of floating-point types.
            | `#!py "all_datetime"` or `#!py "all_timestamp"` | Return columns of datetime or timestamp type.
            | `#!py "all_date"` | Return columns of date type.
            | Any other string | Return a single-element list containing that exact column name.

            If a list or tuple of column names is provided, return only those columns.<br>
            Defaults to `#!py None` (which returns all columns).

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (str_list):
            The selected column names from the DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Examples (df has columns a..h of mixed types)"}
        >>> print(get_columns(df))                  # ["a", "b", "c", "d", "e", "f", "g", "h"]
        >>> print(get_columns(df, ["a", "b", "c"])) # ["a", "b", "c"]
        >>> print(get_columns(df, "a"))             # ["a"]
        >>> print(get_columns(df, "all"))           # ["a", "b", "c", "d", "e", "f", "g", "h"]
        >>> print(get_columns(df, "all_str"))       # ["b", "d"]
        >>> print(get_columns(df, "all_int"))       # ["c"]
        >>> print(get_columns(df, "all_decimal"))   # ["e", "f"]
        >>> print(get_columns(df, "all_numeric"))   # ["c", "e", "f"]
        >>> print(get_columns(df, "all_date"))      # ["g"]
        >>> print(get_columns(df, "all_datetime"))  # ["h"]
        ```
        !!! success "Conclusion: Success."

        Note: `get_columns` returns a plain `#!py list` of names, so print the
        return value directly — do not access a `.columns` attribute on it.
    """
    if columns is None:
        return dataframe.columns
    elif is_type(columns, str):
        if "all" in columns:
            # NOTE: "datetime" must be checked before "date", since "date" is a
            # substring of "datetime"; the branch order below is deliberate.
            if "str" in columns:
                return [col for col, typ in dataframe.dtypes if typ in ("str", "string")]
            elif "int" in columns:
                return [col for col, typ in dataframe.dtypes if typ in ("int", "integer")]
            elif "numeric" in columns:
                return [
                    col
                    for col, typ in dataframe.dtypes
                    if typ in ("int", "integer", "float", "double", "long") or "decimal" in typ
                ]
            elif "float" in columns or "double" in columns or "decimal" in columns:
                return [
                    col
                    for col, typ in dataframe.dtypes
                    if typ in ("float", "double", "long") or "decimal" in typ
                ]
            elif "datetime" in columns or "timestamp" in columns:
                return [
                    col for col, typ in dataframe.dtypes if typ in ("datetime", "timestamp")
                ]
            elif "date" in columns:
                return [col for col, typ in dataframe.dtypes if typ in ["date"]]
            else:
                return dataframe.columns
        else:
            return [columns]
    else:
        return list(columns)
@typechecked
def get_columns_by_likeness(
    dataframe: psDataFrame,
    starts_with: Optional[str] = None,
    contains: Optional[str] = None,
    ends_with: Optional[str] = None,
    match_case: bool = False,
    operator: Literal["and", "or", "and not", "or not"] = "and",
) -> str_list:
    """
    !!! note "Summary"
        Extract the column names from a given `dataframe` based on text that the column name contains.

    ???+ abstract "Details"
        You can use any combination of `starts_with`, `contains`, and `ends_with`. Under the hood, these will be implemented with a number of internal `#!py lambda` functions to determine matches.

        The `operator` parameter determines how the conditions (`starts_with`, `contains`, `ends_with`) are combined:

        | Value | Description |
        |-------|-------------|
        | `"and"` | All conditions must be true.
        | `"or"` | At least one condition must be true.
        | `"and not"` | The first condition must be true and the second condition must be false.
        | `"or not"` | The first condition must be true, or the second condition must be false.

    Params:
        dataframe (psDataFrame):
            The `dataframe` from which to extract the column names.
        starts_with (Optional[str], optional):
            Extract any columns that starts with this `#!py str`.<br>
            Determined by using the `#!py str.startswith()` method.<br>
            Defaults to `#!py None`.
        contains (Optional[str], optional):
            Extract any columns that contains this `#!py str` anywhere within it.<br>
            Determined by using the `#!py in` keyword.<br>
            Defaults to `#!py None`.
        ends_with (Optional[str], optional):
            Extract any columns that ends with this `#!py str`.<br>
            Determined by using the `#!py str.endswith()` method.<br>
            Defaults to `#!py None`.
        match_case (bool, optional):
            If `#!py True`, matching is case-sensitive; if `#!py False`, matching is
            case-insensitive. Either way, the *original* column names are returned.<br>
            Defaults to `#!py False`.
        operator (Literal["and", "or", "and not", "or not"], optional):
            The logical operator to place between the functions.<br>
            Only used when there are multiple values parsed to the parameters: `#!py starts_with`, `#!py contains`, `#!py ends_with`.<br>
            Defaults to `#!py "and"`.

    Returns:
        (str_list):
            The list of columns which match the criteria specified.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Examples (df has columns: aaa, aab, aac, afa, afb, afc, bac)"}
        >>> print(get_columns_by_likeness(df, starts_with="a"))
        ... # ["aaa", "aab", "aac", "afa", "afb", "afc"]
        >>> print(get_columns_by_likeness(df, contains="f"))
        ... # ["afa", "afb", "afc"]
        >>> print(get_columns_by_likeness(df, ends_with="c"))
        ... # ["aac", "afc", "bac"]
        >>> print(get_columns_by_likeness(df, starts_with="a", contains="c"))
        ... # ["aac", "afc"]
        >>> print(get_columns_by_likeness(df, starts_with="a", operator="or", contains="f"))
        ... # ["aaa", "aab", "aac", "afa", "afb", "afc"]
        >>> print(get_columns_by_likeness(df, starts_with="a", operator="and not", contains="f"))
        ... # ["aaa", "aab", "aac"]
        ```
        !!! success "Conclusion: Success."
    """

    # Columns.
    # Keep the original names separate from the text used for comparison, so that
    # case-insensitive matching (match_case=False) still returns the dataframe's
    # actual column names rather than uppercased copies of them.
    original_cols: str_list = dataframe.columns
    compare_cols: str_list = list(original_cols)
    if not match_case:
        compare_cols = [col.upper() for col in compare_cols]
        starts_with = starts_with.upper() if starts_with is not None else None
        contains = contains.upper() if contains is not None else None
        ends_with = ends_with.upper() if ends_with is not None else None

    # Parameters
    o_: Literal["and", "or", "and not", "or not"] = operator
    s_: bool = starts_with is not None
    c_: bool = contains is not None
    e_: bool = ends_with is not None

    # Functions
    _ops = {
        "and": lambda x, y: x and y,
        "or": lambda x, y: x or y,
        "and not": lambda x, y: x and not y,
        "or not": lambda x, y: x or not y,
    }
    _s = lambda col, s: col.startswith(s)
    _c = lambda col, c: c in col
    _e = lambda col, e: col.endswith(e)
    _sc = lambda col, s, c: _ops[o_](_s(col, s), _c(col, c))
    _se = lambda col, s, e: _ops[o_](_s(col, s), _e(col, e))
    _ce = lambda col, c, e: _ops[o_](_c(col, c), _e(col, e))
    _sce = lambda col, s, c, e: _ops[o_](_ops[o_](_s(col, s), _c(col, c)), _e(col, e))

    # Logic — filter on the comparison text, return the original name.
    pairs = list(zip(original_cols, compare_cols))
    if s_ and not c_ and not e_:
        return [orig for orig, comp in pairs if _s(comp, starts_with)]
    elif c_ and not s_ and not e_:
        return [orig for orig, comp in pairs if _c(comp, contains)]
    elif e_ and not s_ and not c_:
        return [orig for orig, comp in pairs if _e(comp, ends_with)]
    elif s_ and c_ and not e_:
        return [orig for orig, comp in pairs if _sc(comp, starts_with, contains)]
    elif s_ and e_ and not c_:
        return [orig for orig, comp in pairs if _se(comp, starts_with, ends_with)]
    elif c_ and e_ and not s_:
        return [orig for orig, comp in pairs if _ce(comp, contains, ends_with)]
    elif s_ and c_ and e_:
        return [orig for orig, comp in pairs if _sce(comp, starts_with, contains, ends_with)]
    else:
        return original_cols
581# ---------------------------------------------------------------------------- #
582# Renaming ####
583# ---------------------------------------------------------------------------- #
@typechecked
def rename_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str, str_collection]] = None,
    string_function: str = "upper",
) -> psDataFrame:
    """
    !!! note "Summary"
        Use one of the common Python string functions to be applied to one or multiple columns.

    ???+ abstract "Details"
        The `string_function` must be a valid string method. For more info on available functions, see: https://docs.python.org/3/library/stdtypes.html#string-methods

    Params:
        dataframe (psDataFrame):
            The DataFrame to be updated.
        columns (Optional[Union[str, str_collection]], optional):
            The columns to be updated.<br>
            Must be a valid column on `dataframe`.<br>
            If not provided, will be applied to all columns.<br>
            It is also possible to parse the values `"all"`, which will also apply this function to all columns in `dataframe`.<br>
            Defaults to `None`.
        string_function (str, optional):
            The string function to be applied. May be a bare method name (`"upper"`)
            or a full method-call expression (`"replace('b', 'test')"`).<br>
            Defaults to `"upper"`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Examples (df has columns: a, b, c, d)"}
        >>> print(rename_columns(df, "a").columns)
        ... # ["A", "b", "c", "d"]
        >>> print(rename_columns(df, "a", "upper").columns)
        ... # ["A", "b", "c", "d"]
        >>> print(rename_columns(df, ["a", "b"]).columns)
        ... # ["A", "B", "c", "d"]
        >>> print(rename_columns(df).columns)
        ... # ["A", "B", "C", "D"]
        >>> print(rename_columns(df, ["a", "b"], "replace('b', 'test')").columns)
        ... # ["a", "test", "c", "d"]
        ```
        !!! success "Conclusion: Success."

        Note: the string function is applied only to the *selected* columns, so
        `#!py rename_columns(df, "a", "replace('b', 'test')")` leaves the columns
        unchanged (`"a".replace('b', 'test')` is still `"a"`).

    ??? tip "See Also"
        - [`assert_columns_exists()`][toolbox_pyspark.checks.assert_columns_exists]
        - [`assert_column_exists()`][toolbox_pyspark.checks.assert_column_exists]
    """
    columns = get_columns(dataframe, columns)
    assert_columns_exists(dataframe=dataframe, columns=columns, match_case=True)
    # NOTE(security): `string_function` is eval'd verbatim to support arbitrary
    # method-call expressions. Only ever pass trusted, developer-supplied strings
    # here — never user input.
    cols_exprs: dict[str, str] = {
        col: eval(
            f"'{col}'.{string_function}{'()' if not string_function.endswith(')') else ''}"
        )
        for col in columns
    }
    return dataframe.withColumnsRenamed(cols_exprs)
729# ---------------------------------------------------------------------------- #
730# Reordering ####
731# ---------------------------------------------------------------------------- #
@typechecked
def reorder_columns(
    dataframe: psDataFrame,
    new_order: Optional[str_collection] = None,
    missing_columns_last: bool = True,
    key_columns_position: Optional[Literal["first", "last"]] = "first",
) -> psDataFrame:
    """
    !!! note "Summary"
        Reorder the columns in a given DataFrame in to a custom order, or to put the `key_` columns at the end (that is, to the far right) of the dataframe.

    ???+ abstract "Details"
        The decision flow chart is as follows:

        ```mermaid
        graph TD
            a([begin])
            z([end])
            b{{new_order}}
            c{{missing_cols_last}}
            d{{key_cols_position}}
            g[cols = dataframe.columns]
            h[cols = new_order]
            i[cols += missing_cols]
            j[cols = non_key_cols + key_cols]
            k[cols = key_cols + non_key_cols]
            l["return dataframe.select(cols)"]
            a --> b
            b --is not None--> h --> c
            b --is None--> g --> d
            c --False--> l
            c --True--> i ----> l
            d --"first"--> k ---> l
            d --"last"---> j --> l
            d --None--> l
            l --> z
        ```

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        new_order (Optional[str_collection], optional):
            The custom order for the columns on the order.<br>
            Defaults to `#!py None`.
        missing_columns_last (bool, optional):
            For any columns existing on `#!py dataframe.columns`, but missing from `#!py new_order`, if `#!py missing_columns_last=True`, then include those missing columns to the right of the dataframe, in the same order that they originally appear.<br>
            Defaults to `#!py True`.
        key_columns_position (Optional[Literal["first", "last"]], optional):
            Where should the `#!py "key_*"` columns be located?.<br>

            - If `#!py "first"`, then they will be relocated to the start of the dataframe, before all other columns.
            - If `#!py "last"`, then they will be relocated to the end of the dataframe, after all other columns.
            - If `#!py None`, they will remain in their original order.

            Regardless of their position, their original order will be maintained.
            Only used when `#!py new_order` is `#!py None`.
            Defaults to `#!py "first"`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Examples (df has columns: a, b, key_a, c, d, key_c, key_e)"}
        >>> reorder_columns(df).columns
        ... # ["key_a", "key_c", "key_e", "a", "b", "c", "d"]
        >>> reorder_columns(df, new_order=["key_a", "key_c", "b", "key_e", "a", "c", "d"]).columns
        ... # ["key_a", "key_c", "b", "key_e", "a", "c", "d"]
        >>> reorder_columns(df, new_order=["key_a", "key_c", "a", "b"], missing_columns_last=True).columns
        ... # ["key_a", "key_c", "a", "b", "key_e", "c", "d"]
        >>> reorder_columns(df, new_order=["key_a", "key_c", "a", "b"], missing_columns_last=False).columns
        ... # ["key_a", "key_c", "a", "b"]
        >>> reorder_columns(df, key_columns_position="last").columns
        ... # ["a", "b", "c", "d", "key_a", "key_c", "key_e"]
        ```
        !!! success "Conclusion: Success."
    """
    df_cols: str_list = dataframe.columns
    if new_order is not None:
        cols: str_list = get_columns(dataframe, new_order)
        if missing_columns_last:
            # Check membership against the *resolved* `cols` list, not `new_order`:
            # when `new_order` is a single string, `col not in new_order` would be a
            # substring test and could wrongly drop columns whose names happen to
            # appear inside it (e.g. column "e" with new_order="key_e").
            cols += [col for col in df_cols if col not in cols]
    else:
        non_key_cols: str_list = [col for col in df_cols if not col.lower().startswith("key_")]
        key_cols: str_list = [col for col in df_cols if col.lower().startswith("key_")]
        if key_columns_position == "first":
            cols = key_cols + non_key_cols
        elif key_columns_position == "last":
            cols = non_key_cols + key_cols
        else:
            cols = df_cols
    return dataframe.select(cols)
983# ---------------------------------------------------------------------------- #
984# Deleting ####
985# ---------------------------------------------------------------------------- #
@typechecked
def delete_columns(
    dataframe: psDataFrame,
    columns: Union[str, str_collection],
    missing_column_handler: Literal["raise", "warn", "pass"] = "pass",
) -> psDataFrame:
    """
    !!! note "Summary"
        For a given `#!py dataframe`, delete the columns listed in `columns`.

    ???+ abstract "Details"
        You can use `#!py missing_column_handler` to specify how to handle missing columns. Columns in `#!py columns` which do exist on `#!py dataframe` are always deleted; the handler only controls what happens (an error, a warning, or nothing) when some of the requested columns are absent.

    Params:
        dataframe (psDataFrame):
            The dataframe from which to delete the columns.
        columns (Union[str, str_collection]):
            The list of columns to delete. A single string is treated as one column name.
        missing_column_handler (Literal["raise", "warn", "pass"], optional):
            How to handle any columns which are missing from `#!py dataframe.columns`.

            If _any_ columns in `columns` are missing from `#!py dataframe.columns`, then the following will happen for each option:

            | Option | Result |
            |--------|--------|
            | `#!py "raise"` | An `#!py ColumnDoesNotExistError` exception will be raised
            | `#!py "warn"` | An `#!py ColumnDoesNotExistWarning` warning will be raised
            | `#!py "pass"` | Nothing will be raised

            Defaults to `#!py "pass"`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If `#!py missing_column_handler="raise"` and any of the `#!py columns` do not exist within `#!py dataframe.columns`.

    Returns:
        (psDataFrame):
            The updated `#!py dataframe`, with the columns listed in `#!py columns` having been removed.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.columns import delete_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [0, 1, 2, 3],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": ["c", "c", "c", "c"],
        ...             "d": ["d", "d", "d", "d"],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---+---+
        | a | b | c | d |
        +---+---+---+---+
        | 0 | a | c | d |
        | 1 | b | c | d |
        | 2 | c | c | d |
        | 3 | d | c | d |
        +---+---+---+---+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Single column"}
        >>> df.transform(delete_columns, "a").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---+
        | b | c | d |
        +---+---+---+
        | a | c | d |
        | b | c | d |
        | c | c | d |
        | d | c | d |
        +---+---+---+
        ```
        !!! success "Conclusion: Success."
        </div>

        ```{.py .python linenums="1" title="Example 2: Multiple columns"}
        >>> df.transform(delete_columns, ["a", "b"]).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+
        | c | d |
        +---+---+
        | c | d |
        | c | d |
        | c | d |
        | c | d |
        +---+---+
        ```
        !!! success "Conclusion: Success."
        </div>

        ```{.py .python linenums="1" title="Example 3: Missing column, raises error"}
        >>> (
        ...     df.transform(
        ...         delete_columns,
        ...         columns=["a", "b", "z"],
        ...         missing_column_handler="raise",
        ...     )
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Columns ["z"] do not exist in "dataframe".
        Try one of: ["a", "b", "c", "d"]
        ```
        !!! success "Conclusion: Success."
        </div>

        ```{.py .python linenums="1" title="Example 4: Missing column, raises warning"}
        >>> (
        ...     df.transform(
        ...         delete_columns,
        ...         columns=["a", "b", "z"],
        ...         missing_column_handler="warn",
        ...     )
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistWarning: Columns missing from "dataframe": ["z"].
        Will still proceed to delete columns that do exist.
        ```
        ```{.txt .text title="Terminal"}
        +---+---+
        | c | d |
        +---+---+
        | c | d |
        | c | d |
        | c | d |
        | c | d |
        +---+---+
        ```
        !!! success "Conclusion: Success."
        </div>

        ```{.py .python linenums="1" title="Example 5: Missing column, nothing raised"}
        >>> (
        ...     df.transform(
        ...         delete_columns,
        ...         columns=["a", "b", "z"],
        ...         missing_column_handler="pass",
        ...     )
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+
        | c | d |
        +---+---+
        | c | d |
        | c | d |
        | c | d |
        | c | d |
        +---+---+
        ```
        !!! success "Conclusion: Success."
        </div>
    """
    # Normalise `columns` to a list of column names (handles single-string input
    # and any convenience aliases supported by `get_columns`).
    columns = get_columns(dataframe, columns)
    if missing_column_handler == "raise":
        # Fail fast when any requested column is absent.
        assert_columns_exists(dataframe=dataframe, columns=columns)
    elif missing_column_handler == "warn":
        # Emit a warning for absent columns, but still delete the ones that exist.
        warn_columns_missing(dataframe=dataframe, columns=columns)
    # `"pass"` needs no branch: missing columns are silently ignored by the
    # select below, which keeps only the columns NOT listed for deletion.
    return dataframe.select([col for col in dataframe.columns if col not in columns])