Coverage for src/toolbox_pyspark/schema.py: 100%
144 statements
coverage.py v7.6.10, created at 2025-01-25 23:08 +0000
1# ============================================================================ #
2# #
3# Title : Schema #
4# Purpose : Checking, validating, and viewing any schema differences #
5# between two different tables, either from in-memory variables, #
6# or pointing to locations on disk. #
7# #
8# ============================================================================ #
11# ---------------------------------------------------------------------------- #
12# #
13# Overview ####
14# #
15# ---------------------------------------------------------------------------- #
18# ---------------------------------------------------------------------------- #
19# Description ####
20# ---------------------------------------------------------------------------- #
23"""
24!!! note "Summary"
25 The `schema` module is used for checking, validating, and viewing any schema differences between two different tables, either from in-memory variables, or pointing to locations on disk.
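???+ example "Quick start"
A minimal, hedged sketch (assuming an active `SparkSession` and two in-memory `DataFrame`s named `df1` and `df2`):
```{.py .python title="Quick start"}
>>> from toolbox_pyspark.schema import check_schemas_match, view_schema_differences
>>> check_schemas_match(method="table_table", left_table=df1, right_table=df2)  # returns a bool by default
>>> view_schema_differences(method="table_table", left_table=df1, right_table=df2)  # pretty-prints any differences
```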
26"""
29# ---------------------------------------------------------------------------- #
30# #
31# Setup ####
32# #
33# ---------------------------------------------------------------------------- #
36# ---------------------------------------------------------------------------- #
37# Imports ####
38# ---------------------------------------------------------------------------- #
41# ## Python StdLib Imports ----
42from pprint import pprint
43from typing import Literal, NamedTuple, Optional, Union
45# ## Python Third Party Imports ----
46from pyspark.sql import DataFrame as psDataFrame, SparkSession
47from pyspark.sql.types import StructField
48from toolbox_python.checkers import is_type
49from toolbox_python.collection_types import str_list, str_set
50from typeguard import typechecked
52# ## Local First Party Imports ----
53from toolbox_pyspark.io import read_from_path
56# ---------------------------------------------------------------------------- #
57# Exports ####
58# ---------------------------------------------------------------------------- #
61__all__: str_list = [
62 "view_schema_differences",
63 "check_schemas_match",
64]
67# ---------------------------------------------------------------------------- #
68# #
69# Functions ####
70# #
71# ---------------------------------------------------------------------------- #
74## --------------------------------------------------------------------------- #
75## Classes ####
76## --------------------------------------------------------------------------- #
79class ValidMethods(NamedTuple):
80 """
81 ```py
82 by_table_and_table: str_set
83 by_table_and_path: str_set
84 by_path_and_table: str_set
85 by_path_and_path: str_set
86 ```
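A hedged sketch of how these alias sets are used to resolve a user-supplied `method` string (illustrative only):
```py
methods = ValidMethods()
"tables" in methods.by_table_and_table  # True
"path_table" in methods.by_path_and_table  # True
```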
87 """
89 by_table_and_table: str_set = {
90 "table",
91 "table_table",
92 "tables",
93 "by_table",
94 "by_table_and_table",
95 "table_and_table",
96 }
97 """
98 ```py
99 {
100 "table",
101 "table_table",
102 "tables",
103 "by_table",
104 "by_table_and_table",
105 "table_and_table",
106 }
107 ```
108 """
109 by_table_and_path: str_set = {
110 "table_and_path",
111 "table_path",
112 "by_table_and_path",
113 }
114 """
115 ```py
116 {
117 "table_and_path",
118 "table_path",
119 "by_table_and_path",
120 }
121 ```
122 """
123 by_path_and_table: str_set = {
124 "path_and_table",
125 "path_table",
126 "by_path_and_table",
127 }
128 """
129 ```py
130 {
131 "path_and_table",
132 "path_table",
133 "by_path_and_table",
134 }
135 ```
136 """
137 by_path_and_path: str_set = {
138 "path_and_path",
139 "path_path",
140 "by_path_and_path",
141 "path",
142 "paths",
143 }
144 """
145 ```py
146 {
147 "path_and_path",
148 "path_path",
149 "by_path_and_path",
150 "path",
151 "paths",
152 }
153 ```
154 """
157# ---------------------------------------------------------------------------- #
158# Check Matching ####
159# ---------------------------------------------------------------------------- #
162@typechecked
163def _check_schemas_match_by_table_and_table(
164 left_table: psDataFrame,
165 right_table: psDataFrame,
166 include_change_field: bool = True,
167 include_add_field: bool = True,
168 include_remove_field: bool = True,
169 include_change_nullable: bool = False,
170 return_object: Literal["results", "check"] = "check",
171) -> Union[list, bool]:
173 # Set up
174 left_schema: dict = left_table.schema.__dict__
175 left_names: str_list = left_schema["names"]
176 left_fields: list[StructField] = left_schema["fields"]
177 right_schema: dict = right_table.schema.__dict__
178 right_names: str_list = right_schema["names"]
179 right_fields: list[StructField] = right_schema["fields"]
180 results = list()
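# Each entry appended below is a (difference_kind, {side: StructField}) tuple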
182 # Loop for any additions
183 if include_add_field:
184 for left_field in left_fields:
185 if left_field.name not in right_names:
186 results.append(("add", {"left": left_field}))
188 # Loop for any removals
189 if include_remove_field:
190 for right_field in right_fields:
191 if right_field.name not in left_names:
192 results.append(("remove", {"right": right_field}))
194 # Loop for any changes
195 if include_change_field:
196 for left_field in left_fields:
197 if left_field.name not in right_names:
198 continue
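# Safe lookup: left_field.name is known to exist in right_names at this point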
199 right_field: StructField = [
200 field for field in right_fields if field.name == left_field.name
201 ][0]
202 if left_field.dataType != right_field.dataType:
203 results.append(("change_type", {"left": left_field, "right": right_field}))
204 if include_change_nullable:
205 if left_field.nullable != right_field.nullable:
206 results.append(
207 ("change_nullable", {"left": left_field, "right": right_field})
208 )
210 # Return
211 if len(results) > 0:
212 if return_object == "results":
213 return results
214 elif return_object == "check":
215 return False
216 return True
219@typechecked
220def _check_schemas_match_by_table_and_path(
221 left_table: psDataFrame,
222 right_table_path: str,
223 right_table_name: str,
224 spark_session: SparkSession,
225 right_table_format: str = "delta",
226 include_change_field: bool = True,
227 include_add_field: bool = True,
228 include_remove_field: bool = True,
229 include_change_nullable: bool = False,
230 return_object: Literal["results", "check"] = "check",
231) -> Union[list, bool]:
232 right_table: psDataFrame = read_from_path(
233 name=right_table_name,
234 path=right_table_path,
235 spark_session=spark_session,
236 data_format=right_table_format,
237 )
238 return _check_schemas_match_by_table_and_table(
239 left_table=left_table,
240 right_table=right_table,
241 include_change_field=include_change_field,
242 include_add_field=include_add_field,
243 include_remove_field=include_remove_field,
244 include_change_nullable=include_change_nullable,
245 return_object=return_object,
246 )
249@typechecked
250def _check_schemas_match_by_path_and_table(
251 left_table_path: str,
252 left_table_name: str,
253 right_table: psDataFrame,
254 spark_session: SparkSession,
255 left_table_format: str = "delta",
256 include_change_field: bool = True,
257 include_add_field: bool = True,
258 include_remove_field: bool = True,
259 include_change_nullable: bool = False,
260 return_object: Literal["results", "check"] = "check",
261) -> Union[list, bool]:
262 left_table: psDataFrame = read_from_path(
263 name=left_table_name,
264 path=left_table_path,
265 spark_session=spark_session,
266 data_format=left_table_format,
267 )
268 return _check_schemas_match_by_table_and_table(
269 left_table=left_table,
270 right_table=right_table,
271 include_change_field=include_change_field,
272 include_add_field=include_add_field,
273 include_remove_field=include_remove_field,
274 include_change_nullable=include_change_nullable,
275 return_object=return_object,
276 )
279@typechecked
280def _check_schemas_match_by_path_and_path(
281 left_table_path: str,
282 left_table_name: str,
283 right_table_path: str,
284 right_table_name: str,
285 spark_session: SparkSession,
286 left_table_format: str = "delta",
287 right_table_format: str = "delta",
288 include_change_field: bool = True,
289 include_add_field: bool = True,
290 include_remove_field: bool = True,
291 include_change_nullable: bool = False,
292 return_object: Literal["results", "check"] = "check",
293) -> Union[list, bool]:
294 left_table: psDataFrame = read_from_path(
295 name=left_table_name,
296 path=left_table_path,
297 spark_session=spark_session,
298 data_format=left_table_format,
299 )
300 right_table: psDataFrame = read_from_path(
301 name=right_table_name,
302 path=right_table_path,
303 spark_session=spark_session,
304 data_format=right_table_format,
305 )
306 return _check_schemas_match_by_table_and_table(
307 left_table=left_table,
308 right_table=right_table,
309 include_change_field=include_change_field,
310 include_add_field=include_add_field,
311 include_remove_field=include_remove_field,
312 include_change_nullable=include_change_nullable,
313 return_object=return_object,
314 )
317@typechecked
318def check_schemas_match(
319 method: str = "by_table_and_table",
320 left_table: Optional[psDataFrame] = None,
321 right_table: Optional[psDataFrame] = None,
322 left_table_path: Optional[str] = None,
323 left_table_name: Optional[str] = None,
324 right_table_path: Optional[str] = None,
325 right_table_name: Optional[str] = None,
326 spark_session: Optional[SparkSession] = None,
327 left_table_format: str = "delta",
328 right_table_format: str = "delta",
329 include_change_field: bool = True,
330 include_add_field: bool = True,
331 include_remove_field: bool = True,
332 include_change_nullable: bool = False,
333 return_object: Literal["results", "check"] = "check",
334) -> Union[list[tuple[str, dict[str, StructField]]], bool]:
335 """
336 !!! note "Summary"
337 Check the schemas between two different tables.
339 ???+ abstract "Details"
340 This function is heavily inspired by other packages which check and validate schema differences for `pyspark` tables. This function just streamlines it a bit, and adds additional functionality for whether the table on either the `left` or `right` side is already in memory or sitting in a directory somewhere else.
342 Params:
343 method (str, optional):
344 The method to use for the comparison. That is, is either side a table in memory, or is it a `table` sitting on a `path`? Check the Notes section for all options available for this parameter.<br>
345 Defaults to `#!py "by_table_and_table"`.
346 spark_session (Optional[SparkSession], optional):
347 The `SparkSession` to use if either the `left` or `right` tables are sitting on a `path` somewhere.<br>
348 Defaults to `#!py None`.
349 left_table (Optional[psDataFrame], optional):
350 If `method` defines the `left` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br>
351 Defaults to `#!py None`.
352 left_table_path (Optional[str], optional):
353 If `method` defines the `left` table as a `path`, then this parameter is the actual path location where the table can be found.<br>
354 Defaults to `#!py None`.
355 left_table_name (Optional[str], optional):
356 If `method` defines the `left` table as a `path`, then this parameter is the name of the table found at the given `left_table_path` location.<br>
357 Defaults to `#!py None`.
358 left_table_format (str, optional):
359 If `method` defines the `left` table as a `path`, then this parameter is the format of the table found at the given `left_table_path` location.<br>
360 Defaults to `#!py "delta"`.
361 right_table (Optional[psDataFrame], optional):
362 If `method` defines the `right` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br>
363 Defaults to `#!py None`.
364 right_table_path (Optional[str], optional):
365 If `method` defines the `right` table as a `path`, then this parameter is the actual path location where the table can be found.<br>
366 Defaults to `#!py None`.
367 right_table_name (Optional[str], optional):
368 If `method` defines the `right` table as a `path`, then this parameter is the name of the table found at the given `right_table_path` location.<br>
369 Defaults to `#!py None`.
370 right_table_format (str, optional):
371 If `method` defines the `right` table as a `path`, then this parameter is the format of the table found at the given `right_table_path` location.<br>
372 Defaults to `#!py "delta"`.
373 include_change_field (bool, optional):
374 When doing the schema validations, do you want to include any fields where the data-type on the right-hand side is different from the left-hand side?<br>
375 This can be read as: "What fields have had their data type _changed **between**_ the left-hand side and the right-hand side?"<br>
376 Defaults to `#!py True`.
377 include_add_field (bool, optional):
378 When doing the schema validations, do you want to include any fields that have been added to the left-hand side, when compared to the right-hand side?<br>
379 This can be read as: "What fields have been _added **to**_ the left-hand side?"<br>
380 Defaults to `#!py True`.
381 include_remove_field (bool, optional):
382 When doing the schema validations, do you want to include any fields which are missing from the left-hand side and only exist on the right-hand side?<br>
383 This can be read as: "What fields have been _removed **from**_ the left-hand side?"<br>
384 Defaults to `#!py True`.
385 include_change_nullable (bool, optional):
386 When doing the schema validations, do you want to include any fields which have had their nullability metadata changed on the right-hand side, when compared to the left-hand side?<br>
387 This can be read as: "What fields had their nullability _changed **between**_ the left-hand side and the right-hand side?"<br>
388 Defaults to `#!py False`.
389 return_object (Literal["results", "check"], optional):
390 After having checked the schema, how do you want the results to be returned? If `#!py "check"`, then it will only return a `#!py bool` value: `#!py True` if the schemas actually match, `#!py False` if there are any differences. If `#!py "results"`, then the actual schema differences will be returned. Check the Notes section for more information on the structure of this object.<br>
391 Defaults to `#!py "check"`.
393 Raises:
394 TypeError:
395 If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
396 AttributeError:
397 If the value passed to `method` is not a valid option.
399 Returns:
400 (Union[list[tuple[str, dict[str, StructField]]], bool]):
401 If `return_object` is `#!py "results"`, then this will be a `#!py list` of `#!py tuple`'s of `#!py dict`'s containing the details of the schema differences. If `return_object` is `#!py "check"`, then it will only be a `#!py bool` object indicating whether or not the schemas match.
403 ???+ example "Examples"
405 ```{.py .python linenums="1" title="Set up"}
406 >>> # Imports
407 >>> from pprint import pprint
408 >>> import pandas as pd
409 >>> from pyspark.sql import SparkSession, functions as F
410 >>> from toolbox_pyspark.schema import check_schemas_match
411 >>> from toolbox_pyspark.io import write_to_path
412 >>> from toolbox_pyspark.checks import table_exists
413 >>>
414 >>> # Instantiate Spark
415 >>> spark = SparkSession.builder.getOrCreate()
416 >>>
417 >>> # Create data
418 >>> df1 = spark.createDataFrame(
419 ... pd.DataFrame(
420 ... {
421 ... "a": [0, 1, 2, 3],
422 ... "b": ["a", "b", "c", "d"],
423 ... "c": ["1", "1", "1", "1"],
424 ... "d": ["2", "2", "2", "2"],
425 ... "e": ["3", "3", "3", "3"],
426 ... "f": ["4", "4", "4", "4"],
427 ... }
428 ... )
429 ... )
430 >>> df2 = (
431 ... df1.withColumn("c", F.col("c").cast("int"))
432 ... .withColumn("g", F.lit("a"))
433 ... .withColumn("d", F.lit("null"))
434 ... .drop("e")
435 ... )
436 >>> write_to_path(
437 ... table=df1,
438 ... name="left",
439 ... path="./test",
440 ... data_format="parquet",
441 ... mode="overwrite",
442 ... write_options={"overwriteSchema": "true"},
443 ... )
444 >>> write_to_path(
445 ... table=df2,
446 ... name="right",
447 ... path="./test",
448 ... data_format="parquet",
449 ... mode="overwrite",
450 ... write_options={"overwriteSchema": "true"},
451 ... )
452 >>>
453 >>> # Check
454 >>> pprint(df1.dtypes)
455 >>> print(df1.show())
456 >>> print(table_exists("left", "./test", "parquet", spark))
457 >>> pprint(df2.dtypes)
458 >>> print(df2.show())
459 >>> print(table_exists("right", "./test", "parquet", spark))
460 ```
461 <div class="result" markdown>
462 ```{.sh .shell title="Terminal"}
463 [
464 ("a", "bigint"),
465 ("b", "string"),
466 ("c", "string"),
467 ("d", "string"),
468 ("e", "string"),
469 ("f", "string"),
470 ]
471 ```
472 ```{.txt .text title="Terminal"}
473 +---+---+---+---+---+---+
474 | a | b | c | d | e | f |
475 +---+---+---+---+---+---+
476 | 0 | a | 1 | 2 | 3 | 4 |
477 | 1 | b | 1 | 2 | 3 | 4 |
478 | 2 | c | 1 | 2 | 3 | 4 |
479 | 3 | d | 1 | 2 | 3 | 4 |
480 +---+---+---+---+---+---+
481 ```
482 ```{.sh .shell title="Terminal"}
483 True
484 ```
485 ```{.sh .shell title="Terminal"}
486 [
487 ("a", "bigint"),
488 ("b", "string"),
489 ("c", "int"),
490 ("d", "string"),
491 ("f", "string"),
492 ("g", "string"),
493 ]
494 ```
495 ```{.txt .text title="Terminal"}
496 +---+---+---+------+---+---+
497 | a | b | c | d | f | g |
498 +---+---+---+------+---+---+
499 | 0 | a | 1 | null | 4 | a |
500 | 1 | b | 1 | null | 4 | a |
501 | 2 | c | 1 | null | 4 | a |
502 | 3 | d | 1 | null | 4 | a |
503 +---+---+---+------+---+---+
504 ```
505 ```{.sh .shell title="Terminal"}
506 True
507 ```
508 </div>
510 ```{.py .python linenums="1" title="Example 1: Check matching"}
511 >>> diff = check_schemas_match(
512 ... method="table_table",
513 ... left_table=df1,
514 ... right_table=df1,
515 ... include_add_field=True,
516 ... include_change_field=True,
517 ... include_remove_field=True,
518 ... include_change_nullable=True,
519 ... return_object="check",
520 ... )
521 >>> print(diff)
522 ```
523 <div class="result" markdown>
524 ```{.sh .shell title="Terminal"}
525 True
526 ```
527 !!! success "Conclusion: Schemas match."
528 </div>
530 ```{.py .python linenums="1" title="Example 2: Check not matching"}
531 >>> diff = check_schemas_match(
532 ... method="table_table",
533 ... left_table=df1,
534 ... right_table=df2,
535 ... include_add_field=True,
536 ... include_change_field=True,
537 ... include_remove_field=True,
538 ... include_change_nullable=True,
539 ... return_object="check",
540 ... )
541 >>> print(diff)
542 ```
543 <div class="result" markdown>
544 ```{.sh .shell title="Terminal"}
545 False
546 ```
547 !!! failure "Conclusion: Schemas do not match."
548 </div>
550 ```{.py .python linenums="1" title="Example 3: Show only `add`"}
551 >>> diff = check_schemas_match(
552 ... method="table_table",
553 ... left_table=df1,
554 ... right_table=df2,
555 ... include_add_field=True,
556 ... include_change_field=False,
557 ... include_remove_field=False,
558 ... include_change_nullable=False,
559 ... return_object="results",
560 ... )
561 >>> print(diff)
562 ```
563 <div class="result" markdown>
564 ```{.sh .shell title="Terminal"}
565 [
566 (
567 "add",
568 {"left": T.StructField("e", T.StringType(), False)},
569 ),
570 ]
571 ```
572 !!! failure "Conclusion: Schemas do not match because the `e` field was added."
573 </div>
575 ```{.py .python linenums="1" title="Example 4: Show `add` and `remove`"}
576 >>> diff = check_schemas_match(
577 ... method="table_table",
578 ... left_table=df1,
579 ... right_table=df2,
580 ... include_add_field=True,
581 ... include_change_field=False,
582 ... include_remove_field=True,
583 ... include_change_nullable=False,
584 ... return_object="results",
585 ... )
586 >>> print(diff)
587 ```
588 <div class="result" markdown>
589 ```{.sh .shell title="Terminal"}
590 [
591 (
592 "add",
593 {"left": T.StructField("e", T.StringType(), False)},
594 ),
595 (
596 "remove",
597 {"right": T.StructField("g", T.StringType(), False)},
598 ),
599 ]
600 ```
601 !!! failure "Conclusion: Schemas do not match because the `e` field was added and the `g` field was removed."
602 </div>
604 ```{.py .python linenums="1" title="Example 5: Show all changes"}
605 >>> diff = check_schemas_match(
606 ... method="table_table",
607 ... left_table=df1,
608 ... right_table=df2,
609 ... include_add_field=True,
610 ... include_change_field=True,
611 ... include_remove_field=True,
612 ... include_change_nullable=True,
613 ... return_object="results",
614 ... )
615 >>> print(diff)
616 ```
617 <div class="result" markdown>
618 ```{.sh .shell title="Terminal"}
619 [
620 (
621 "add",
622 {"left": T.StructField("e", T.StringType(), False)},
623 ),
624 (
625 "remove",
626 {"right": T.StructField("g", T.StringType(), False)},
627 ),
628 (
629 "change_type",
630 {
631 "left": T.StructField("c", T.StringType(), False),
632 "right": T.StructField("c", T.IntegerType(), True),
633 },
634 ),
635 (
636 "change_nullable",
637 {
638 "left": T.StructField("c", T.StringType(), False),
639 "right": T.StructField("c", T.IntegerType(), True),
640 },
641 ),
642 ]
643 ```
644 !!! failure "Conclusion: Schemas do not match because the `e` field was added, the `g` field was removed, the `c` field had its data type changed, and the `c` field had its nullability changed."
645 </div>
647 ```{.py .python linenums="1" title="Example 6: Check where right-hand side is a `path`"}
648 >>> diff = check_schemas_match(
649 ... method="table_path",
650 ... spark_session=spark,
651 ... left_table=df1,
652 ... right_table_path="./test",
653 ... right_table_name="right",
654 ... right_table_format="parquet",
655 ... include_add_field=True,
656 ... include_change_field=False,
657 ... include_remove_field=False,
658 ... include_change_nullable=False,
659 ... return_object="results",
660 ... )
661 >>> print(diff)
662 ```
663 <div class="result" markdown>
664 ```{.sh .shell title="Terminal"}
665 [
666 (
667 "add",
668 {"left": T.StructField("e", T.StringType(), False)},
669 ),
670 ]
671 ```
672 !!! failure "Conclusion: Schemas do not match because the `e` field was added."
673 </div>
675 ```{.py .python linenums="1" title="Example 7: Check where both sides are a `path`"}
676 >>> diff = check_schemas_match(
677 ... method="path_path",
678 ... spark_session=spark,
679 ... left_table_path="./test",
680 ... left_table_name="left",
681 ... left_table_format="parquet",
682 ... right_table_path="./test",
683 ... right_table_name="right",
684 ... right_table_format="parquet",
685 ... include_add_field=False,
686 ... include_change_field=True,
687 ... include_remove_field=False,
688 ... include_change_nullable=False,
689 ... return_object="results",
690 ... )
691 >>> print(diff)
692 ```
693 <div class="result" markdown>
694 ```{.sh .shell title="Terminal"}
695 [
696 (
697 "remove",
698 {"right": T.StructField("g", T.StringType(), True)},
699 ),
700 ]
701 ```
702 !!! failure "Conclusion: Schemas do not match because the `g` field was removed."
703 </div>
705 ```{.py .python linenums="1" title="Example 8: Invalid `method` parameter"}
706 >>> diff = check_schemas_match(
707 ... method="invalid",
708 ... left_table=df1,
709 ... right_table=df2,
710 ... include_add_field=True,
711 ... include_change_field=True,
712 ... include_remove_field=True,
713 ... include_change_nullable=True,
714 ... return_object="check",
715 ... )
716 ```
717 <div class="result" markdown>
718 ```{.sh .shell title="Terminal"}
719 AttributeError: Invalid value for `method`: 'invalid'
720 Please use one of the following options:
721 - For `by_table_and_table`, use one of the following values: ['table', 'table_table', 'tables', 'by_table', 'by_table_and_table', 'table_and_table']
722 - For `by_table_and_path`, use one of the following values: ['table_and_path', 'table_path', 'by_table_and_path']
723 - For `by_path_and_table`, use one of the following values: ['path_and_table', 'path_table', 'by_path_and_table']
724 - For `by_path_and_path`, use one of the following values: ['path_and_path', 'path_path', 'by_path_and_path', 'path', 'paths']
725 ```
726 !!! failure "Conclusion: Invalid `method` parameter."
727 </div>
729 ???+ info "Notes"
731 ???+ info "Options available in the `method` parameter"
733 The options available in the `method` parameter include (see the sketch after this list):
735 - If the objects on both the left-hand side and the right-hand side are both `dataframes` already loaded to memory, use one of the following values:
736 <div class="mdx-three-columns" markdown>
737 - `#!py "table"`
738 - `#!py "table_table"`
739 - `#!py "tables"`
740 - `#!py "by_table"`
741 - `#!py "by_table_and_table"`
742 - `#!py "table_and_table"`
743 </div>
744 - If the object on the left-hand side is a `dataframe` already loaded to memory, but the object on the right-hand side is a table sitting on a path somewhere, use one of the following values:
745 <div class="mdx-three-columns" markdown>
746 - `#!py "table_and_path"`
747 - `#!py "table_path"`
748 - `#!py "by_table_and_path"`
749 </div>
750 - If the object on the left-hand side is a table sitting on a path somewhere, but the object on the right-hand side is a `dataframe` already loaded to memory, use one of the following values:
751 <div class="mdx-three-columns" markdown>
752 - `#!py "path_and_table"`
753 - `#!py "path_table"`
754 - `#!py "by_path_and_table"`
755 </div>
756 - If the objects on both the left-hand side and the right-hand side are both tables sitting on a path somewhere, then use one of the following values:
757 <div class="mdx-three-columns" markdown>
758 - `#!py "path_and_path"`
759 - `#!py "path_path"`
760 - `#!py "by_path_and_path"`
761 - `#!py "path"`
762 - `#!py "paths"`
763 </div>
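For instance, two equivalent calls resolve to the same comparison (a minimal sketch, assuming in-memory dataframes `df1` and `df2`):
```{.py .python title="Equivalent `method` aliases"}
>>> check_schemas_match(method="tables", left_table=df1, right_table=df2)
>>> check_schemas_match(method="by_table_and_table", left_table=df1, right_table=df2)
```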
765 ???+ info "Details about the return object when we set the parameter `#!py return_object="results"`"
767 - When we set the parameter `#!py return_object="results"`, then we will get an object returned from this function.
768 - That object will be a `#!py list` of `#!py tuple`'s, each `#!py tuple` is only two-elements long, where the first element is a `#!py str` object, and the second is a `#!py dict` where the keys are `#!py str` and the values are a `#!py StructField` object.
769 - For each of the `#!py tuple` elements, the first element (the `#!py str` object) describes what the `#!py tuple` is there for. It will be one of four words: `#!py "add"`, `#!py "remove"`, `#!py "change_type"`, or `#!py "change_nullable"`.
770 - You can change whether these options are included in the schema check by changing the other parameters: `#!py include_change_field`, `#!py include_add_field`, `#!py include_remove_field`, `#!py include_change_nullable`.
771 - The structure of the list will look like this (a short consumption sketch follows the annotations):
773 ```{.py .python title="The structure of the returned object"}
774 [
775 (
776 "add", # (1)!
777 {"left": T.StructField("e", T.StringType(), False)}, # (2)!
778 ),
779 (
780 "add", # (3)!
781 {"left": T.StructField("h", T.StringType(), False)},
782 ),
783 (
784 "remove", # (4)!
785 {"right": T.StructField("g", T.StringType(), False)}, # (5)!
786 ),
787 (
788 "change_type", # (6)!
789 {
790 "left": T.StructField("c", T.StringType(), False), # (7)!
791 "right": T.StructField("c", T.IntegerType(), True),
792 },
793 ),
794 (
795 "change_nullable", # (8)!
796 {
797 "left": T.StructField("c", T.StringType(), False), # (9)!
798 "right": T.StructField("c", T.IntegerType(), True),
799 },
800 ),
801 ]
802 ```
804 1. When `#!py include_add_field=True`, then the `add` section will always appear first.<br>
805 If `#!py include_add_field=False`, then this section is omitted.
806 2. The second element of the `#!py tuple` is a `#!py dict` that has only one `key`-`value` pair.<br>
807 The `key` will _always_ be the value `#!py "left"`, because these are fields which have been added to the table on the left-hand side and not found on the right-hand side.
808 3. When there are multiple fields added to the table on the left-hand side, they will appear like this.
809 4. When `#!py include_remove_field=True`, then the `remove` section will always appear next.<br>
810 If `#!py include_remove_field=False`, then this section is omitted.
811 5. The second element of the `#!py tuple` is a `#!py dict` that has only one `key`-`value` pair.<br>
812 The `key` will _always_ be the value `#!py "right"`, because these are fields which have been removed from the left-hand side and only visible on the right-hand side.
813 6. When `#!py include_change_field=True`, then the `change_type` section will always appear next.<br>
814 If `#!py include_change_field=False`, then this section is omitted.
815 7. The second element of the `#!py tuple` is a `#!py dict` that has two `key`-`value` pairs.<br>
816 The `key`'s will _always_ be the values `#!py "left"` then `#!py "right"`, because these are fields where the data type has changed between the left-hand side and the right-hand side, and therefore you need to see both to see exactly what has changed.
817 8. When `#!py include_change_nullable=True`, then the `change_nullable` section will always appear next.<br>
818 If `#!py include_change_nullable=False`, then this section is omitted.
819 9. The second element of the `#!py tuple` is a `#!py dict` that has two `key`-`value` pairs.<br>
820 The `key`'s will _always_ be the values `#!py "left"` then `#!py "right"`, because these are fields where the nullability of the fields has changed between the left-hand side and the right-hand side, and therefore you need to see both to see exactly what has changed.
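As a hedged sketch of consuming this object (assuming `diff` holds the list returned with `#!py return_object="results"`):
```{.py .python title="Consuming the results object"}
>>> added = [fields["left"].name for kind, fields in diff if kind == "add"]
>>> removed = [fields["right"].name for kind, fields in diff if kind == "remove"]
```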
821 """
823 valid_methods = ValidMethods()
824 msg: str = "If using the '{meth}' method, then '{name}' cannot be 'None'."
826 if method in valid_methods.by_table_and_table:
827 assert left_table is not None, msg.format(meth=method, name="left_table")
828 assert right_table is not None, msg.format(meth=method, name="right_table")
829 return _check_schemas_match_by_table_and_table(
830 left_table=left_table,
831 right_table=right_table,
832 include_change_field=include_change_field,
833 include_add_field=include_add_field,
834 include_remove_field=include_remove_field,
835 include_change_nullable=include_change_nullable,
836 return_object=return_object,
837 )
838 elif method in valid_methods.by_table_and_path:
839 assert left_table is not None, msg.format(meth=method, name="left_table")
840 assert right_table_path is not None, msg.format(meth=method, name="right_table_path")
841 assert right_table_name is not None, msg.format(meth=method, name="right_table_name")
842 assert spark_session is not None, msg.format(meth=method, name="spark_session")
843 return _check_schemas_match_by_table_and_path(
844 left_table=left_table,
845 right_table_path=right_table_path,
846 right_table_name=right_table_name,
847 right_table_format=right_table_format,
848 spark_session=spark_session,
849 include_change_field=include_change_field,
850 include_add_field=include_add_field,
851 include_remove_field=include_remove_field,
852 include_change_nullable=include_change_nullable,
853 return_object=return_object,
854 )
855 elif method in valid_methods.by_path_and_table:
856 assert left_table_path is not None, msg.format(meth=method, name="left_table_path")
857 assert left_table_name is not None, msg.format(meth=method, name="left_table_name")
858 assert right_table is not None, msg.format(meth=method, name="right_table")
859 assert spark_session is not None, msg.format(meth=method, name="spark_session")
860 return _check_schemas_match_by_path_and_table(
861 left_table_path=left_table_path,
862 left_table_name=left_table_name,
863 right_table=right_table,
864 spark_session=spark_session,
865 left_table_format=left_table_format,
866 include_change_field=include_change_field,
867 include_add_field=include_add_field,
868 include_remove_field=include_remove_field,
869 include_change_nullable=include_change_nullable,
870 return_object=return_object,
871 )
872 elif method in valid_methods.by_path_and_path:
873 assert left_table_path is not None, msg.format(meth=method, name="left_table_path")
874 assert left_table_name is not None, msg.format(meth=method, name="left_table_name")
875 assert right_table_path is not None, msg.format(meth=method, name="right_table_path")
876 assert right_table_name is not None, msg.format(meth=method, name="right_table_name")
877 assert spark_session is not None, msg.format(meth=method, name="spark_session")
878 return _check_schemas_match_by_path_and_path(
879 left_table_path=left_table_path,
880 left_table_name=left_table_name,
881 left_table_format=left_table_format,
882 right_table_path=right_table_path,
883 right_table_name=right_table_name,
884 right_table_format=right_table_format,
885 spark_session=spark_session,
886 include_change_field=include_change_field,
887 include_add_field=include_add_field,
888 include_remove_field=include_remove_field,
889 include_change_nullable=include_change_nullable,
890 return_object=return_object,
891 )
892 else:
893 raise AttributeError(
894 f"Invalid value for `method`: '{method}'\n"
895 f"Please use one of the following options:\n"
896 f"- For `by_table_and_table`, use one of: {valid_methods.by_table_and_table}\n"
897 f"- For `by_table_and_path`, use one of: {valid_methods.by_table_and_path}\n"
898 f"- For `by_path_and_table`, use one of: {valid_methods.by_path_and_table}\n"
899 f"- For `by_path_and_path`, use one of: {valid_methods.by_path_and_path}\n"
900 )
903# ---------------------------------------------------------------------------- #
904# View Differences ####
905# ---------------------------------------------------------------------------- #
908@typechecked
909def _view_schema_differences_by_table_and_table(
910 left_table: psDataFrame,
911 right_table: psDataFrame,
912 include_change_field: bool = True,
913 include_add_field: bool = True,
914 include_remove_field: bool = True,
915 include_change_nullable: bool = False,
916 view_type: Literal["print", "pprint", "return"] = "pprint",
917) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
918 schema_differences: Union[list[tuple[str, dict[str, StructField]]], bool] = (
919 check_schemas_match(
920 method="table_table",
921 left_table=left_table,
922 right_table=right_table,
923 include_change_field=include_change_field,
924 include_add_field=include_add_field,
925 include_remove_field=include_remove_field,
926 include_change_nullable=include_change_nullable,
927 return_object="results",
928 )
929 )
930 if is_type(schema_differences, list) and len(schema_differences) > 0:
931 if view_type == "print":
932 print(schema_differences)
933 elif view_type == "pprint":
934 pprint(schema_differences)
935 elif view_type == "return":
936 return schema_differences
937 return None
940@typechecked
941def _view_schema_differences_by_path_and_path(
942 left_table_path: str,
943 left_table_name: str,
944 right_table_path: str,
945 right_table_name: str,
946 spark_session: SparkSession,
947 left_table_format: str = "delta",
948 right_table_format: str = "delta",
949 include_change_field: bool = True,
950 include_add_field: bool = True,
951 include_remove_field: bool = True,
952 include_change_nullable: bool = False,
953 view_type: Literal["print", "pprint", "return"] = "pprint",
954) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
955 left_table: psDataFrame = read_from_path(
956 name=left_table_name,
957 path=left_table_path,
958 spark_session=spark_session,
959 data_format=left_table_format,
960 )
961 right_table: psDataFrame = read_from_path(
962 name=right_table_name,
963 path=right_table_path,
964 spark_session=spark_session,
965 data_format=right_table_format,
966 )
967 return _view_schema_differences_by_table_and_table(
968 left_table=left_table,
969 right_table=right_table,
970 include_change_field=include_change_field,
971 include_add_field=include_add_field,
972 include_remove_field=include_remove_field,
973 include_change_nullable=include_change_nullable,
974 view_type=view_type,
975 )
978@typechecked
979def _view_schema_differences_by_table_and_path(
980 left_table: psDataFrame,
981 right_table_path: str,
982 right_table_name: str,
983 spark_session: SparkSession,
984 right_table_format: str = "delta",
985 include_change_field: bool = True,
986 include_add_field: bool = True,
987 include_remove_field: bool = True,
988 include_change_nullable: bool = False,
989 view_type: Literal["print", "pprint", "return"] = "pprint",
990) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
991 right_table: psDataFrame = read_from_path(
992 name=right_table_name,
993 path=right_table_path,
994 spark_session=spark_session,
995 data_format=right_table_format,
996 )
997 return _view_schema_differences_by_table_and_table(
998 left_table=left_table,
999 right_table=right_table,
1000 include_change_field=include_change_field,
1001 include_add_field=include_add_field,
1002 include_remove_field=include_remove_field,
1003 include_change_nullable=include_change_nullable,
1004 view_type=view_type,
1005 )
1008@typechecked
1009def _view_schema_differences_by_path_and_table(
1010 left_table_path: str,
1011 left_table_name: str,
1012 right_table: psDataFrame,
1013 spark_session: SparkSession,
1014 left_table_format: str = "delta",
1015 include_change_field: bool = True,
1016 include_add_field: bool = True,
1017 include_remove_field: bool = True,
1018 include_change_nullable: bool = False,
1019 view_type: Literal["print", "pprint", "return"] = "pprint",
1020) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
1021 left_table: psDataFrame = read_from_path(
1022 name=left_table_name,
1023 path=left_table_path,
1024 spark_session=spark_session,
1025 data_format=left_table_format,
1026 )
1027 return _view_schema_differences_by_table_and_table(
1028 left_table=left_table,
1029 right_table=right_table,
1030 include_change_field=include_change_field,
1031 include_add_field=include_add_field,
1032 include_remove_field=include_remove_field,
1033 include_change_nullable=include_change_nullable,
1034 view_type=view_type,
1035 )
1038@typechecked
1039def view_schema_differences(
1040 method: str = "by_table_and_table",
1041 spark_session: Optional[SparkSession] = None,
1042 left_table: Optional[psDataFrame] = None,
1043 left_table_path: Optional[str] = None,
1044 left_table_name: Optional[str] = None,
1045 left_table_format: str = "delta",
1046 right_table: Optional[psDataFrame] = None,
1047 right_table_path: Optional[str] = None,
1048 right_table_name: Optional[str] = None,
1049 right_table_format: str = "delta",
1050 include_change_field: bool = True,
1051 include_add_field: bool = True,
1052 include_remove_field: bool = True,
1053 include_change_nullable: bool = False,
1054 view_type: Literal["print", "pprint", "return"] = "pprint",
1055) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
1056 """
1057 !!! note "Summary"
1058 View the schemas between two different tables.
1060 ???+ abstract "Details"
1061 The primary difference between [`check_schemas_match()`][toolbox_pyspark.schema.check_schemas_match] and [`view_schema_differences()`][toolbox_pyspark.schema.view_schema_differences] is that [`check_...()`][toolbox_pyspark.schema.check_schemas_match] returns either a `#!py bool` result or the actual details of the schema differences, whilst [`view_...()`][toolbox_pyspark.schema.view_schema_differences] may also return the details object, but will additionally print the result to the terminal for you to review.<br>
1062 For full details of all the parameters and all the options, including nuances and detailed explanations and thorough examples, please check the [`check_schemas_match()`][toolbox_pyspark.schema.check_schemas_match] function.
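A minimal sketch of the contrast (assuming two in-memory dataframes `df1` and `df2`):
```{.py .python title="check vs view"}
>>> matched = check_schemas_match(method="tables", left_table=df1, right_table=df2)  # -> bool
>>> view_schema_differences(method="tables", left_table=df1, right_table=df2)  # pretty-prints the differences
```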
1064 Params:
1065 method (str, optional):
1066 The method to use for the comparison. That is, is either side a table in memory, or is it a `table` sitting on a `path`? Check the Notes section for all options available for this parameter.<br>
1067 Defaults to `#!py "by_table_and_table"`.
1068 spark_session (Optional[SparkSession], optional):
1069 The `SparkSession` to use if either the `left` or `right` tables are sitting on a `path` somewhere.<br>
1070 Defaults to `#!py None`.
1071 left_table (Optional[psDataFrame], optional):
1072 If `method` defines the `left` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br>
1073 Defaults to `#!py None`.
1074 left_table_path (Optional[str], optional):
1075 If `method` defines the `left` table as a `path`, then this parameter is the actual path location where the table can be found.<br>
1076 Defaults to `#!py None`.
1077 left_table_name (Optional[str], optional):
1078 If `method` defines the `left` table as a `path`, then this parameter is the name of the table found at the given `left_table_path` location.<br>
1079 Defaults to `#!py None`.
1080 left_table_format (str, optional):
1081 If `method` defines the `left` table as a `path`, then this parameter is the format of the table found at the given `left_table_path` location.<br>
1082 Defaults to `#!py "delta"`.
1083 right_table (Optional[psDataFrame], optional):
1084 If `method` defines the `right` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br>
1085 Defaults to `#!py None`.
1086 right_table_path (Optional[str], optional):
1087 If `method` defines the `right` table as a `path`, then this parameter is the actual path location where the table can be found.<br>
1088 Defaults to `#!py None`.
1089 right_table_name (Optional[str], optional):
1090 If `method` defines the `right` table as a `path`, then this parameter is the name of the table found at the given `right_table_path` location.<br>
1091 Defaults to `#!py None`.
1092 right_table_format (str, optional):
1093 If `method` defines the `right` table as a `path`, then this parameter is the format of the table found at the given `right_table_path` location.<br>
1094 Defaults to `#!py "delta"`.
1095 include_change_field (bool, optional):
1096 When doing the schema validations, do you want to include any fields where the data-type on the right-hand side is different from the left-hand side?<br>
1097 This can be read as: "What fields have had their data type _changed **between**_ the left-hand side and the right-hand side?"<br>
1098 Defaults to `#!py True`.
1099 include_add_field (bool, optional):
1100 When doing the schema validations, do you want to include any fields that have been added to the left-hand side, when compared to the right-hand side?<br>
1101 This can be read as: "What fields have been _added **to**_ the left-hand side?"<br>
1102 Defaults to `#!py True`.
1103 include_remove_field (bool, optional):
1104 When doing the schema validations, do you want to include any fields which are missing from the left-hand side and only exist on the right-hand side?<br>
1105 This can be read as: "What fields have been _removed **from**_ the left-hand side?"<br>
1106 Defaults to `#!py True`.
1107 include_change_nullable (bool, optional):
1108 When doing the schema validations, do you want to include any fields which have had their nullability metadata changed on the right-hand side, when compared to the left-hand side?<br>
1109 This can be read as: "What fields had their nullability _changed **between**_ the left-hand side and the right-hand side?"<br>
1110 Defaults to `#!py False`.
1111 view_type (Literal["print", "pprint", "return"], optional):
1112 When returning the output from this function, how do you want it to be displayed? Must be one of `#!py ["print", "pprint", "return"]`.<br>
1113 Logically, the difference is that `#!py "print"` will display a text value to the terminal that is not formatted in any way; `#!py "pprint"` will display a pretty-printed text value to the terminal; and `#!py "return"` will return the schema differences which can then be assigned to another variable.<br>
1114 Defaults to `#!py "pprint"`.
1116 Raises:
1117 TypeError:
1118 If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
1119 AttributeError:
1120 If the value passed to `method` is not a valid option.
1122 Returns:
1123 (Optional[list[tuple[str, dict[str, StructField]]]]):
1124 If `#!py view_type="return"`, then this will be a `#!py list` of `#!py tuple`'s of `#!py dict`'s containing the details of the schema differences. If `#!py view_type!="return"` (or if `#!py view_type="return"`, but there are actually no differences in the schema), then nothing is returned; any differences are only printed to the terminal.
1126 ???+ example "Examples"
1128 ```{.py .python linenums="1" title="Set up"}
1129 >>> # Imports
1130 >>> from pprint import pprint
1131 >>> import pandas as pd
1132 >>> from pyspark.sql import SparkSession, functions as F
1133 >>> from toolbox_pyspark.schema import view_schema_differences
1134 >>> from toolbox_pyspark.io import write_to_path
1135 >>> from toolbox_pyspark.checks import table_exists
1136 >>>
1137 >>> # Instantiate Spark
1138 >>> spark = SparkSession.builder.getOrCreate()
1139 >>>
1140 >>> # Create data
1141 >>> df1 = spark.createDataFrame(
1142 ... pd.DataFrame(
1143 ... {
1144 ... "a": [0, 1, 2, 3],
1145 ... "b": ["a", "b", "c", "d"],
1146 ... "c": ["1", "1", "1", "1"],
1147 ... "d": ["2", "2", "2", "2"],
1148 ... "e": ["3", "3", "3", "3"],
1149 ... "f": ["4", "4", "4", "4"],
1150 ... }
1151 ... )
1152 ... )
1153 >>> df2 = (
1154 ... df1.withColumn("c", F.col("c").cast("int"))
1155 ... .withColumn("g", F.lit("a"))
1156 ... .withColumn("d", F.lit("null"))
1157 ... .drop("e")
1158 ... )
1159 >>> write_to_path(
1160 ... table=df1,
1161 ... name="left",
1162 ... path="./test",
1163 ... data_format="parquet",
1164 ... mode="overwrite",
1165 ... write_options={"overwriteSchema": "true"},
1166 ... )
1167 >>> write_to_path(
1168 ... table=df2,
1169 ... name="right",
1170 ... path="./test",
1171 ... data_format="parquet",
1172 ... mode="overwrite",
1173 ... write_options={"overwriteSchema": "true"},
1174 ... )
1175 >>>
1176 >>> # Check
1177 >>> pprint(df1.dtypes)
1178 >>> print(df1.show())
1179 >>> print(table_exists("left", "./test", "parquet", spark))
1180 >>> pprint(df2.dtypes)
1181 >>> print(df2.show())
1182 >>> print(table_exists("right", "./test", "parquet", spark))
1183 ```
1184 <div class="result" markdown>
1185 ```{.sh .shell title="Terminal"}
1186 [
1187 ("a", "bigint"),
1188 ("b", "string"),
1189 ("c", "string"),
1190 ("d", "string"),
1191 ("e", "string"),
1192 ("f", "string"),
1193 ]
1194 ```
1195 ```{.txt .text title="Terminal"}
1196 +---+---+---+---+---+---+
1197 | a | b | c | d | e | f |
1198 +---+---+---+---+---+---+
1199 | 0 | a | 1 | 2 | 3 | 4 |
1200 | 1 | b | 1 | 2 | 3 | 4 |
1201 | 2 | c | 1 | 2 | 3 | 4 |
1202 | 3 | d | 1 | 2 | 3 | 4 |
1203 +---+---+---+---+---+---+
1204 ```
1205 ```{.sh .shell title="Terminal"}
1206 True
1207 ```
1208 ```{.sh .shell title="Terminal"}
1209 [
1210 ("a", "bigint"),
1211 ("b", "string"),
1212 ("c", "int"),
1213 ("d", "string"),
1214 ("f", "string"),
1215 ("g", "string"),
1216 ]
1217 ```
1218 ```{.txt .text title="Terminal"}
1219 +---+---+---+------+---+---+
1220 | a | b | c | d | f | g |
1221 +---+---+---+------+---+---+
1222 | 0 | a | 1 | null | 4 | a |
1223 | 1 | b | 1 | null | 4 | a |
1224 | 2 | c | 1 | null | 4 | a |
1225 | 3 | d | 1 | null | 4 | a |
1226 +---+---+---+------+---+---+
1227 ```
1228 ```{.sh .shell title="Terminal"}
1229 True
1230 ```
1231 </div>
1233 ```{.py .python linenums="1" title="Example 1: Check matching"}
1234 >>> diff = view_schema_differences(
1235 ... method="table_table",
1236 ... left_table=df1,
1237 ... right_table=df1,
1238 ... include_add_field=True,
1239 ... include_change_field=True,
1240 ... include_remove_field=True,
1241 ... include_change_nullable=True,
1242 ... view_type="return",
1243 ... )
1244 >>> print(diff)
1245 ```
1246 <div class="result" markdown>
1247 ```{.sh .shell title="Terminal"}
1248 None
1249 ```
1250 !!! success "Conclusion: Schemas match."
1251 </div>
1253 ```{.py .python linenums="1" title="Example 2: Check print"}
1254 >>> view_schema_differences(
1255 ... method="table_table",
1256 ... left_table=df1,
1257 ... right_table=df2,
1258 ... include_add_field=True,
1259 ... include_change_field=False,
1260 ... include_remove_field=False,
1261 ... include_change_nullable=False,
1262 ... view_type="print",
1263 ... )
1264 ```
1265 <div class="result" markdown>
1266 ```{.sh .shell title="Terminal"}
1267 [('add', {'left': StructField('e', StringType(), True)})]
1268 ```
1269 !!! failure "Conclusion: Schemas do not match because the `e` field was added."
1270 </div>
1272 ```{.py .python linenums="1" title="Example 3: Check pprint"}
1273 >>> view_schema_differences(
1274 ... method="table_table",
1275 ... left_table=df1,
1276 ... right_table=df2,
1277 ... include_add_field=True,
1278 ... include_change_field=True,
1279 ... include_remove_field=True,
1280 ... include_change_nullable=True,
1281 ... view_type="pprint",
1282 ... )
1283 ```
1284 <div class="result" markdown>
1285 ```{.sh .shell title="Terminal"}
1286 [('add', {'left': StructField('e', StringType(), False)}),
1287 ('remove', {'right': StructField('g', StringType(), False)}),
1288 ('change_type',
1289 {'left': StructField('c', StringType(), False),
1290 'right': StructField('c', IntegerType(), True)}),
1291 ('change_nullable',
1292 {'left': StructField('c', StringType(), False),
1293 'right': StructField('c', IntegerType(), True)})]
1294 ```
1295 !!! failure "Conclusion: Schemas do not match because the `e` field was added, the `g` field was removed, the `c` field had its data type changed, and the `c` field had its nullability changed."
1296 </div>
1298 ```{.py .python linenums="1" title="Example 4: Check with right-hand side as a `path`"}
1299 >>> view_schema_differences(
1300 ... method="table_path",
1301 ... spark_session=spark,
1302 ... left_table=df1,
1303 ... right_table_path="./test",
1304 ... right_table_name="right",
1305 ... right_table_format="parquet",
1306 ... include_add_field=True,
1307 ... include_change_field=False,
1308 ... include_remove_field=False,
1309 ... include_change_nullable=False,
1310 ... view_type="pprint",
1311 ... )
1312 ```
1313 <div class="result" markdown>
1314 ```{.sh .shell title="Terminal"}
1315 [('add', {'left': StructField('e', StringType(), True)})]
1316 ```
1317 !!! failure "Conclusion: Schemas do not match because the `e` field was added."
1318 </div>
1320 ```{.py .python linenums="1" title="Example 5: Check with both sides being a `path`"}
1321 >>> view_schema_differences(
1322 ... method="path_path",
1323 ... spark_session=spark,
1324 ... left_table_path="./test",
1325 ... left_table_name="left",
1326 ... left_table_format="parquet",
1327 ... right_table_path="./test",
1328 ... right_table_name="right",
1329 ... right_table_format="parquet",
1330 ... include_add_field=False,
1331 ... include_change_field=False,
1332 ... include_remove_field=True,
1333 ... include_change_nullable=False,
1334 ... view_type="pprint",
1335 ... )
1336 ```
1337 <div class="result" markdown>
1338 ```{.sh .shell title="Terminal"}
1339 [('remove', {'right': StructField('g', StringType(), True)})]
1340 ```
1341 !!! failure "Conclusion: Schemas do not match because the `g` field was removed."
1342 </div>
1344 ```{.py .python linenums="1" title="Example 6: Invalid `method` parameter"}
1345 >>> view_schema_differences(
1346 ... method="table_table_table",
1347 ... left_table=df1,
1348 ... right_table=df2,
1349 ... include_add_field=True,
1350 ... include_change_field=True,
1351 ... include_remove_field=True,
1352 ... include_change_nullable=True,
1353 ... view_type="return",
1354 ... )
1355 ```
1356 <div class="result" markdown>
1357 ```{.sh .shell title="Terminal"}
1358 AttributeError: Invalid value for `method`: 'table_table_table'
1359 Please use one of the following options:
1360 - For `by_table_and_table`, use one of the following values: ['table', 'table_table', 'tables', 'by_table', 'by_table_and_table', 'table_and_table']
1361 - For `by_table_and_path`, use one of the following values: ['table_and_path', 'table_path', 'by_table_and_path']
1362 - For `by_path_and_table`, use one of the following values: ['path_and_table', 'path_table', 'by_path_and_table']
1363 - For `by_path_and_path`, use one of the following values: ['path_and_path', 'path_path', 'by_path_and_path', 'path', 'paths']
1364 ```
1365 !!! failure "Conclusion: Invalid `method` parameter."
1366 </div>
1368 ??? tip "See Also"
1369 - [`check_schemas_match()`][toolbox_pyspark.schema.check_schemas_match]
1370 """
1372 valid_methods: ValidMethods = ValidMethods()
1373 msg: str = "If using the '{meth}' method, then '{name}' cannot be 'None'."
1375 if method in valid_methods.by_table_and_table:
1376 assert left_table is not None, msg.format(meth=method, name="left_table")
1377 assert right_table is not None, msg.format(meth=method, name="right_table")
1378 return _view_schema_differences_by_table_and_table(
1379 left_table=left_table,
1380 right_table=right_table,
1381 include_change_field=include_change_field,
1382 include_add_field=include_add_field,
1383 include_remove_field=include_remove_field,
1384 include_change_nullable=include_change_nullable,
1385 view_type=view_type,
1386 )
1387 elif method in valid_methods.by_table_and_path:
1388 assert left_table is not None, msg.format(meth=method, name="left_table")
1389 assert right_table_path is not None, msg.format(meth=method, name="right_table_path")
1390 assert right_table_name is not None, msg.format(meth=method, name="right_table_name")
1391 assert spark_session is not None, msg.format(meth=method, name="spark_session")
1392 return _view_schema_differences_by_table_and_path(
1393 left_table=left_table,
1394 right_table_path=right_table_path,
1395 right_table_name=right_table_name,
1396 right_table_format=right_table_format,
1397 spark_session=spark_session,
1398 include_change_field=include_change_field,
1399 include_add_field=include_add_field,
1400 include_remove_field=include_remove_field,
1401 include_change_nullable=include_change_nullable,
1402 view_type=view_type,
1403 )
1404 elif method in valid_methods.by_path_and_table:
1405 assert left_table_path is not None, msg.format(meth=method, name="left_table_path")
1406 assert left_table_name is not None, msg.format(meth=method, name="left_table_name")
1407 assert right_table is not None, msg.format(meth=method, name="right_table")
1408 assert spark_session is not None, msg.format(meth=method, name="spark_session")
1409 return _view_schema_differences_by_path_and_table(
1410 left_table_path=left_table_path,
1411 left_table_name=left_table_name,
1412 left_table_format=left_table_format,
1413 right_table=right_table,
1414 spark_session=spark_session,
1415 include_change_field=include_change_field,
1416 include_add_field=include_add_field,
1417 include_remove_field=include_remove_field,
1418 include_change_nullable=include_change_nullable,
1419 view_type=view_type,
1420 )
1421 elif method in valid_methods.by_path_and_path:
1422 assert left_table_path is not None, msg.format(meth=method, name="left_table_path")
1423 assert left_table_name is not None, msg.format(meth=method, name="left_table_name")
1424 assert right_table_path is not None, msg.format(meth=method, name="right_table_path")
1425 assert right_table_name is not None, msg.format(meth=method, name="right_table_name")
1426 assert spark_session is not None, msg.format(meth=method, name="spark_session")
1427 return _view_schema_differences_by_path_and_path(
1428 left_table_path=left_table_path,
1429 left_table_name=left_table_name,
1430 left_table_format=left_table_format,
1431 right_table_path=right_table_path,
1432 right_table_name=right_table_name,
1433 right_table_format=right_table_format,
1434 spark_session=spark_session,
1435 include_change_field=include_change_field,
1436 include_add_field=include_add_field,
1437 include_remove_field=include_remove_field,
1438 include_change_nullable=include_change_nullable,
1439 view_type=view_type,
1440 )
1441 else:
1442 raise AttributeError(
1443 f"Invalid value for `method`: '{method}'\n"
1444 f"Please use one of the following options:\n"
1445 f"- For `by_table_and_table`, use one of: {valid_methods.by_table_and_table}\n"
1446 f"- For `by_table_and_path`, use one of: {valid_methods.by_table_and_path}\n"
1447 f"- For `by_path_and_table`, use one of: {valid_methods.by_path_and_table}\n"
1448 f"- For `by_path_and_path`, use one of: {valid_methods.by_path_and_path}\n"
1449 )