Coverage for src/toolbox_pyspark/formatting.py: 100%
24 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-25 23:08 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-25 23:08 +0000
1# ============================================================================ #
2# #
3# Title: Formatting #
4# Purpose: This module provides functions for formatting and displaying #
5# intermediary Spark DataFrames. #
6# #
7# ============================================================================ #
10# ---------------------------------------------------------------------------- #
11# #
12# Overview ####
13# #
14# ---------------------------------------------------------------------------- #
17# ---------------------------------------------------------------------------- #
18# Description ####
19# ---------------------------------------------------------------------------- #
22"""
23!!! note "Summary"
24 The `formatting` module provides functions for formatting and displaying intermediary Spark DataFrames.
25"""
28# ---------------------------------------------------------------------------- #
29# #
30# Setup ####
31# #
32# ---------------------------------------------------------------------------- #
35## --------------------------------------------------------------------------- #
36## Imports ####
37## --------------------------------------------------------------------------- #
40# ## Python Third Party Imports ----
41from pyspark.sql import DataFrame as psDataFrame, functions as F
42from toolbox_python.collection_types import str_list
43from typeguard import typechecked
46## --------------------------------------------------------------------------- #
47## Exports ####
48## --------------------------------------------------------------------------- #
# Public API of this module: the names made available by `from ... import *`.
__all__: str_list = [
    "format_numbers",
    "display_intermediary_table",
    "display_intermediary_schema",
    "display_intermediary_columns",
]
59# ---------------------------------------------------------------------------- #
60# #
61# Main Section ####
62# #
63# ---------------------------------------------------------------------------- #
@typechecked
def format_numbers(dataframe: psDataFrame) -> psDataFrame:
    """
    !!! note "Summary"
        Format the numeric columns of a Spark DataFrame as human-readable strings.

    ??? abstract "Details"
        Every column whose Spark data type is `int`, `tinyint`, `smallint` or `bigint` is rendered with thousands separators and no decimal places. Every column whose data type is `float` or `double` is rendered with thousands separators and exactly two decimal places. The formatted columns become **string** columns. Columns of any other type are returned unchanged.

        All replacement columns are applied in a single projection, rather than one `.withColumn()` call per column, so the query plan does not grow with the number of numeric columns.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame to format.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator; the exact exception class raised depends on the installed `typeguard` version.

    Returns:
        (psDataFrame):
            The formatted Spark DataFrame. If there are no columns to format, the input DataFrame is returned as-is.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.formatting import format_numbers
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...             "e": [1000, 10000, 100000, 1000000],
        ...             "f": [1111.11, 22222.22, 333333.33, 4444444.44],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+---------+------------+
        | a | b | c   | d   | e       | f          |
        +---+---+-----+-----+---------+------------+
        | 1 | a | 1.0 | 1.1 | 1000    | 1111.11    |
        | 2 | b | 2.0 | 2.2 | 10000   | 22222.22   |
        | 3 | c | 3.0 | 3.3 | 100000  | 333333.33  |
        | 4 | d | 4.0 | 4.4 | 1000000 | 4444444.44 |
        +---+---+-----+-----+---------+------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Format Numbers by function"}
        >>> format_numbers(df).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+------+------+-----------+--------------+
        | a | b | c    | d    | e         | f            |
        +---+---+------+------+-----------+--------------+
        | 1 | a | 1.00 | 1.10 | 1,000     | 1,111.11     |
        | 2 | b | 2.00 | 2.20 | 10,000    | 22,222.22    |
        | 3 | c | 3.00 | 3.30 | 100,000   | 333,333.33   |
        | 4 | d | 4.00 | 4.40 | 1,000,000 | 4,444,444.44 |
        +---+---+------+------+-----------+--------------+
        ```
        !!! success "Conclusion: Successfully formatted dataframe."
        </div>

        ```{.py .python linenums="1" title="Example 2: Format Numbers by method"}
        >>> df.transform(format_numbers).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+------+------+-----------+--------------+
        | a | b | c    | d    | e         | f            |
        +---+---+------+------+-----------+--------------+
        | 1 | a | 1.00 | 1.10 | 1,000     | 1,111.11     |
        | 2 | b | 2.00 | 2.20 | 10,000    | 22,222.22    |
        | 3 | c | 3.00 | 3.30 | 100,000   | 333,333.33   |
        | 4 | d | 4.00 | 4.40 | 1,000,000 | 4,444,444.44 |
        +---+---+------+------+-----------+--------------+
        ```
        !!! success "Conclusion: Successfully formatted dataframe."
        </div>
    """
    integer_types = ("int", "tinyint", "smallint", "bigint")
    fractional_types = ("float", "double")

    # Collect every replacement first so they can be applied in one projection.
    # Chaining `.withColumn()` in a loop adds one projection per column, which
    # the PySpark documentation warns can produce very large query plans.
    replacements = {}
    for col, typ in dataframe.dtypes:
        if typ in integer_types:
            replacements[col] = F.format_number(col, 0)
        elif typ in fractional_types:
            replacements[col] = F.format_number(col, 2)

    # Nothing to format: hand back the very same object, as the loop-based
    # implementation did.
    if not replacements:
        return dataframe

    return dataframe.withColumns(replacements)
@typechecked
def display_intermediary_table(
    dataframe: psDataFrame, reformat_numbers: bool = True, num_rows: int = 20
) -> psDataFrame:
    """
    !!! note "Summary"
        Print a preview of a Spark DataFrame part-way through a pipeline, then pass the DataFrame on untouched.

    ???+ abstract "Details"
        The function shows up to `num_rows` rows of `dataframe` without truncating cell values. When `reformat_numbers` is `True`, the preview is produced from a copy that has been passed through `format_numbers()`, so numbers are easier to read. The formatting only affects what is printed: the DataFrame that is returned is always the original one, which makes the function safe to drop into a `.transform()` chain.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame to display.
        reformat_numbers (bool):
            Whether to format numbers in the displayed preview. Default is `True`.
        num_rows (int):
            The number of rows to display. Default is `20`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator; the exact exception class raised depends on the installed `typeguard` version.

    Returns:
        (psDataFrame):
            The original Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.formatting import display_intermediary_table
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Display intermediary table with no subsequent formatting"}
        >>> (
        ...     df
        ...     .transform(display_intermediary_table, reformat_numbers=False, num_rows=2)
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        +---+---+-----+-----+
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary table with no subsequent formatting."
        </div>

        ```{.py .python linenums="1" title="Example 2: Display intermediary table with subsequent formatting"}
        >>> (
        ...     df
        ...     .transform(display_intermediary_table, reformat_numbers=True)
        ...     .withColumn("c", F.expr("c * 2"))
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+------+------+
        | a | b | c    | d    |
        +---+---+------+------+
        | 1 | a | 1.00 | 1.10 |
        | 2 | b | 2.00 | 2.20 |
        | 3 | c | 3.00 | 3.30 |
        | 4 | d | 4.00 | 4.40 |
        +---+---+------+------+
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 2.0 | 1.1 |
        | 2 | b | 4.0 | 2.2 |
        | 3 | c | 6.0 | 3.3 |
        | 4 | d | 8.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary table with subsequent formatting."
        </div>
    """
    # Choose what to print; the formatted version is only ever used for display.
    preview = dataframe.transform(format_numbers) if reformat_numbers else dataframe
    preview.show(n=num_rows, truncate=False)

    # Always hand back the caller's DataFrame so the pipeline continues unchanged.
    return dataframe
def display_intermediary_schema(dataframe: psDataFrame) -> psDataFrame:
    """
    !!! note "Summary"
        Print the schema of a Spark DataFrame part-way through a pipeline, then pass the DataFrame on untouched.

    ??? abstract "Details"
        The function calls `.printSchema()` on `dataframe` and returns the same DataFrame. Because the input is returned unmodified, the function can be dropped into a `.transform()` chain to inspect column types at any step.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame whose schema will be displayed.

    Returns:
        (psDataFrame):
            The original Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.formatting import display_intermediary_schema
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        >>> df.printSchema()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        ```{.txt .text title="Terminal"}
        root
         |-- a: long (nullable = true)
         |-- b: string (nullable = true)
         |-- c: double (nullable = true)
         |-- d: double (nullable = true)
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Display intermediary schema"}
        >>> df.transform(display_intermediary_schema).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        root
         |-- a: long (nullable = true)
         |-- b: string (nullable = true)
         |-- c: double (nullable = true)
         |-- d: double (nullable = true)
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary schema."
        </div>

        ```{.py .python linenums="1" title="Example 2: Display intermediary schema with subsequent formatting"}
        >>> df.transform(display_intermediary_schema).withColumn("e", F.expr("c * 2")).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        root
         |-- a: long (nullable = true)
         |-- b: string (nullable = true)
         |-- c: double (nullable = true)
         |-- d: double (nullable = true)
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+-----+
        | a | b | c   | d   | e   |
        +---+---+-----+-----+-----+
        | 1 | a | 1.0 | 1.1 | 2.0 |
        | 2 | b | 2.0 | 2.2 | 4.0 |
        | 3 | c | 3.0 | 3.3 | 6.0 |
        | 4 | d | 4.0 | 4.4 | 8.0 |
        +---+---+-----+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary schema."
        </div>
    """
    # Side effect only: the schema is printed before any later step is applied.
    dataframe.printSchema()

    # Pass the input straight through so the call can sit inside a pipeline.
    return dataframe
def display_intermediary_columns(dataframe: psDataFrame) -> psDataFrame:
    """
    !!! note "Summary"
        Print the column names of a Spark DataFrame part-way through a pipeline, then pass the DataFrame on untouched.

    ??? abstract "Details"
        The function prints the list held in `dataframe.columns` and returns the same DataFrame. Because the input is returned unmodified, the function can be dropped into a `.transform()` chain to inspect the available columns at any step.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame whose column names will be displayed.

    Returns:
        (psDataFrame):
            The original Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.formatting import display_intermediary_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Display intermediary columns"}
        >>> df.transform(display_intermediary_columns).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ['a', 'b', 'c', 'd']
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary columns."
        </div>

        ```{.py .python linenums="1" title="Example 2: Display intermediary columns with subsequent formatting"}
        >>> df.transform(display_intermediary_columns).withColumn("e", F.expr("c * 2")).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ['a', 'b', 'c', 'd']
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+-----+
        | a | b | c   | d   | e   |
        +---+---+-----+-----+-----+
        | 1 | a | 1.0 | 1.1 | 2.0 |
        | 2 | b | 2.0 | 2.2 | 4.0 |
        | 3 | c | 3.0 | 3.3 | 6.0 |
        | 4 | d | 4.0 | 4.4 | 8.0 |
        +---+---+-----+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary columns."
        </div>
    """
    # Side effect only: the names are printed before any later step is applied.
    column_names = dataframe.columns
    print(column_names)

    # Pass the input straight through so the call can sit inside a pipeline.
    return dataframe