Coverage for src/toolbox_pyspark/schema.py: 100%

144 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-25 23:08 +0000

1# ============================================================================ # 

2# # 

3# Title : Schema # 

4# Purpose : Checking, validating, and viewing any schema differences # 

5# between two different tables, either from in-memory variables, # 

6# or pointing to locations on disk. # 

7# # 

8# ============================================================================ # 

9 

10 

11# ---------------------------------------------------------------------------- # 

12# # 

13# Overview #### 

14# # 

15# ---------------------------------------------------------------------------- # 

16 

17 

18# ---------------------------------------------------------------------------- # 

19# Description #### 

20# ---------------------------------------------------------------------------- # 

21 

22 

23""" 

24!!! note "Summary" 

25 The `schema` module is used for checking, validating, and viewing any schema differences between two different tables, either from in-memory variables, or pointing to locations on disk. 

26""" 

27 

28 

29# ---------------------------------------------------------------------------- # 

30# # 

31# Setup #### 

32# # 

33# ---------------------------------------------------------------------------- # 

34 

35 

36# ---------------------------------------------------------------------------- # 

37# Imports #### 

38# ---------------------------------------------------------------------------- # 

39 

40 

41# ## Python StdLib Imports ---- 

42from pprint import pprint 

43from typing import Literal, NamedTuple, Optional, Union 

44 

45# ## Python Third Party Imports ---- 

46from pyspark.sql import DataFrame as psDataFrame, SparkSession 

47from pyspark.sql.types import StructField 

48from toolbox_python.checkers import is_type 

49from toolbox_python.collection_types import str_list, str_set 

50from typeguard import typechecked 

51 

52# ## Local First Party Imports ---- 

53from toolbox_pyspark.io import read_from_path 

54 

55 

56# ---------------------------------------------------------------------------- # 

57# Exports #### 

58# ---------------------------------------------------------------------------- # 

59 

60 

# Public API of this module.
__all__: str_list = ["view_schema_differences", "check_schemas_match"]

65 

66 

67# ---------------------------------------------------------------------------- # 

68# # 

69# Functions #### 

70# # 

71# ---------------------------------------------------------------------------- # 

72 

73 

74## --------------------------------------------------------------------------- # 

75## Classes #### 

76## --------------------------------------------------------------------------- # 

77 

78 

class ValidMethods(NamedTuple):
    """
    !!! note "Summary"
        Named container holding the alias values accepted by the `method`
        parameter of the schema-checking functions. Each attribute is a
        `#!py set` of `#!py str` aliases, all of which select the same
        comparison strategy.

    ```py
    by_table_and_table: str_set
    by_table_and_path: str_set
    by_path_and_table: str_set
    by_path_and_path: str_set
    ```
    """

    by_table_and_table: str_set = {
        "table",
        "table_table",
        "tables",
        "by_table",
        "by_table_and_table",
        "table_and_table",
    }
    """
    Aliases for comparing two in-memory `dataframe` objects:
    ```py
    {
        "table",
        "table_table",
        "tables",
        "by_table",
        "by_table_and_table",
        "table_and_table",
    }
    ```
    """
    by_table_and_path: str_set = {
        "table_and_path",
        "table_path",
        "by_table_and_path",
    }
    """
    Aliases for comparing an in-memory `dataframe` (left) against a table on a path (right):
    ```py
    {
        "table_and_path",
        "table_path",
        "by_table_and_path",
    }
    ```
    """
    by_path_and_table: str_set = {
        "path_and_table",
        "path_table",
        "by_path_and_table",
    }
    """
    Aliases for comparing a table on a path (left) against an in-memory `dataframe` (right):
    ```py
    {
        "path_and_table",
        "path_table",
        "by_path_and_table",
    }
    ```
    """
    by_path_and_path: str_set = {
        "path_and_path",
        "path_path",
        "by_path_and_path",
        "path",
        "paths",
    }
    """
    Aliases for comparing two tables both sitting on paths:
    ```py
    {
        "path_and_path",
        "path_path",
        "by_path_and_path",
        "path",
        "paths",
    }
    ```
    """

155 

156 

157# ---------------------------------------------------------------------------- # 

158# Check Matching #### 

159# ---------------------------------------------------------------------------- # 

160 

161 

@typechecked
def _check_schemas_match_by_table_and_table(
    left_table: psDataFrame,
    right_table: psDataFrame,
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    return_object: Literal["results", "check"] = "check",
) -> Union[list, bool]:
    """
    !!! note "Summary"
        Compare the schemas of two in-memory `pyspark` dataframes.

    Params:
        left_table (psDataFrame):
            The left-hand side `dataframe` to compare.
        right_table (psDataFrame):
            The right-hand side `dataframe` to compare.
        include_change_field (bool, optional):
            Include fields whose data type differs between the two sides.<br>
            Defaults to `#!py True`.
        include_add_field (bool, optional):
            Include fields present on the left but missing on the right.<br>
            Defaults to `#!py True`.
        include_remove_field (bool, optional):
            Include fields present on the right but missing on the left.<br>
            Defaults to `#!py True`.
        include_change_nullable (bool, optional):
            Include fields whose nullability differs between the two sides.<br>
            Defaults to `#!py False`.
        return_object (Literal["results", "check"], optional):
            If `#!py "results"`, return the list of differences; if
            `#!py "check"`, return a `#!py bool`.<br>
            Defaults to `#!py "check"`.

    Returns:
        (Union[list, bool]):
            The list of differences when `#!py return_object="results"` and
            differences exist; otherwise a `#!py bool`. Note: when the
            schemas match, `#!py True` is returned regardless of
            `return_object`.
    """

    # Set up: use the public `StructType.names` / `StructType.fields`
    # attributes rather than poking into `__dict__`.
    left_names: str_list = left_table.schema.names
    left_fields: list[StructField] = list(left_table.schema.fields)
    right_names: str_list = right_table.schema.names
    right_fields: list[StructField] = list(right_table.schema.fields)
    results: list = []

    # Fields added on the left (missing from the right)
    if include_add_field:
        results.extend(
            ("add", {"left": left_field})
            for left_field in left_fields
            if left_field.name not in right_names
        )

    # Fields removed from the left (only present on the right)
    if include_remove_field:
        results.extend(
            ("remove", {"right": right_field})
            for right_field in right_fields
            if right_field.name not in left_names
        )

    # Fields present on both sides whose type or nullability changed.
    # NOTE: previously the nullability check was nested under
    # `include_change_field`, so `include_change_nullable=True` was silently
    # ignored when `include_change_field=False`; the two flags are now
    # independent, as documented.
    if include_change_field or include_change_nullable:
        for left_field in left_fields:
            if left_field.name not in right_names:
                continue
            right_field: StructField = next(
                field for field in right_fields if field.name == left_field.name
            )
            if include_change_field and left_field.dataType != right_field.dataType:
                results.append(("change_type", {"left": left_field, "right": right_field}))
            if include_change_nullable and left_field.nullable != right_field.nullable:
                results.append(
                    ("change_nullable", {"left": left_field, "right": right_field})
                )

    # Return: honour `return_object` only when differences were found;
    # matching schemas always yield `True` (preserved historical behaviour).
    if results:
        return results if return_object == "results" else False
    return True

217 

218 

@typechecked
def _check_schemas_match_by_table_and_path(
    left_table: psDataFrame,
    right_table_path: str,
    right_table_name: str,
    spark_session: SparkSession,
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    return_object: Literal["results", "check"] = "check",
) -> Union[list, bool]:
    """
    !!! note "Summary"
        Check schemas where the `left` object is an in-memory `dataframe`
        and the `right` object is a table sitting on a path.

    Loads the `right` table from disk, then delegates the comparison to
    `_check_schemas_match_by_table_and_table()`.
    """
    return _check_schemas_match_by_table_and_table(
        left_table=left_table,
        right_table=read_from_path(
            name=right_table_name,
            path=right_table_path,
            spark_session=spark_session,
            data_format=right_table_format,
        ),
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        return_object=return_object,
    )

247 

248 

@typechecked
def _check_schemas_match_by_path_and_table(
    left_table_path: str,
    left_table_name: str,
    right_table: psDataFrame,
    spark_session: SparkSession,
    left_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    return_object: Literal["results", "check"] = "check",
) -> Union[list, bool]:
    """
    !!! note "Summary"
        Check schemas where the `left` object is a table sitting on a path
        and the `right` object is an in-memory `dataframe`.

    Loads the `left` table from disk, then delegates the comparison to
    `_check_schemas_match_by_table_and_table()`.
    """
    return _check_schemas_match_by_table_and_table(
        left_table=read_from_path(
            name=left_table_name,
            path=left_table_path,
            spark_session=spark_session,
            data_format=left_table_format,
        ),
        right_table=right_table,
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        return_object=return_object,
    )

277 

278 

@typechecked
def _check_schemas_match_by_path_and_path(
    left_table_path: str,
    left_table_name: str,
    right_table_path: str,
    right_table_name: str,
    spark_session: SparkSession,
    left_table_format: str = "delta",
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    return_object: Literal["results", "check"] = "check",
) -> Union[list, bool]:
    """
    !!! note "Summary"
        Check schemas where both the `left` and `right` objects are tables
        sitting on paths.

    Loads both tables from disk, then delegates the comparison to
    `_check_schemas_match_by_table_and_table()`.
    """
    return _check_schemas_match_by_table_and_table(
        left_table=read_from_path(
            name=left_table_name,
            path=left_table_path,
            spark_session=spark_session,
            data_format=left_table_format,
        ),
        right_table=read_from_path(
            name=right_table_name,
            path=right_table_path,
            spark_session=spark_session,
            data_format=right_table_format,
        ),
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        return_object=return_object,
    )

315 

316 

317@typechecked 

318def check_schemas_match( 

319 method: str = "by_table_and_table", 

320 left_table: Optional[psDataFrame] = None, 

321 right_table: Optional[psDataFrame] = None, 

322 left_table_path: Optional[str] = None, 

323 left_table_name: Optional[str] = None, 

324 right_table_path: Optional[str] = None, 

325 right_table_name: Optional[str] = None, 

326 spark_session: Optional[SparkSession] = None, 

327 left_table_format: str = "delta", 

328 right_table_format: str = "delta", 

329 include_change_field: bool = True, 

330 include_add_field: bool = True, 

331 include_remove_field: bool = True, 

332 include_change_nullable: bool = False, 

333 return_object: Literal["results", "check"] = "check", 

334) -> Union[list[tuple[str, dict[str, StructField]]], bool]: 

335 """ 

336 !!! note "Summary" 

337 Check the schemas between two different tables. 

338 

339 ???+ abstract "Details" 

340 This function is heavily inspired by other packages which check and validate schema differences for `pyspark` tables. This function just streamlines it a bit, and adds additional functionality for whether or not table on either `left` or `right` side is already in-memory or sitting on a directory somewhere else. 

341 

342 Params: 

343 method (str, optional): 

344 The method to use for the comparison. That is, is either side a table in memory or is it a `table` sitting on a `path`?. Check the Notes section for all options available for this parameter.<br> 

345 Defaults to `#!py "by_table_and_table"`. 

346 spark_session (Optional[SparkSession], optional): 

347 The `SparkSession` to use if either the `left` or `right` tables are sitting on a `path` somewhere.<br> 

348 Defaults to `#!py None`. 

349 left_table (Optional[psDataFrame], optional): 

350 If `method` defines the `left` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br> 

351 Defaults to `#!py None`. 

352 left_table_path (Optional[str], optional): 

353 If `method` defines the `left` table as a `path`, then this parameter is the actual path location where the table can be found.<br> 

354 Defaults to `#!py None`. 

355 left_table_name (Optional[str], optional): 

356 If `method` defines the `left` table as a `path`, then this parameter is the name of the table found at the given `left_table_path` location.<br> 

357 Defaults to `#!py None`. 

358 left_table_format (str, optional): 

359 If `method` defines the `left` table as a `path`, then this parameter is the format of the table found at the given `left_table_path` location.<br> 

360 Defaults to `#!py "delta"`. 

361 right_table (Optional[psDataFrame], optional): 

362 If `method` defines the `right` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br> 

363 Defaults to `#!py None`. 

364 right_table_path (Optional[str], optional): 

365 If `method` defines the `right` table as a `path`, then this parameter is the actual path location where the table can be found.<br> 

366 Defaults to `#!py None`. 

367 right_table_name (Optional[str], optional): 

368 If `method` defines the `right` table as a `path`, then this parameter is the name of the table found at the given `right_table_path` location.<br> 

369 Defaults to `#!py None`. 

370 right_table_format (str, optional): 

371 If `method` defines the `right` table as a `path`, then this parameter is the format of the table found at the given `right_table_path` location.<br> 

372 Defaults to `#!py "delta"`. 

373 include_change_field (bool, optional): 

374 When doing the schema validations, do you want to include any fields where the data-type on the right-hand side is different from the left-hand side?<br> 

375 This can be read as: "What fields have had their data type _changed **between**_ the left-hand side and the right-hand side?"<br> 

376 Defaults to `#!py True`. 

377 include_add_field (bool, optional): 

378 When doing the schema validations, do you want to include any fields that have had any additional fields added to the left-hand side, when compared to the right-hand side?<br> 

379 This can be read as: "What fields have been _added **to**_ the left-hand side?"<br> 

380 Defaults to `#!py True`. 

381 include_remove_field (bool, optional): 

382 When doing the schema validations, do you want to include any fields which are missing from the left-hand side and only existing on the right-hand side?<br> 

383 This can be read as: "What fields been _removed **from**_ the left-hand side?"<br> 

384 Defaults to `#!py True`. 

385 include_change_nullable (bool, optional): 

386 When doing the schema validations, do you want to include any fields which have had their nullability metadata changed on the right-hand side, when compared to the left-hand side?.<br> 

387 This can be read as: "What fields had their nullability _changed **between**_ the left-hand side and the right-hand side?"<br> 

388 Defaults to `#!py False`. 

389 return_object (Literal["results", "check"], optional): 

390 After having checked the schema, how do you want the results to be returned? If `#!py "check"`, then will only return a `#!py bool` value: `#!py True` if the schemas actually match, `#!py False` if there are any differences. If `#!py "results"`, then the actual schema differences will be returned. Check the Notes section for more information on the structure of this object.<br> 

391 Defaults to `#!py "check"`. 

392 

393 Raises: 

394 TypeError: 

395 If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator. 

396 AttributeError: 

397 If the value passed to `method` is not a valid option. 

398 

399 Returns: 

400 (Union[list[tuple[str, dict[str, StructField]]], bool]): 

401 If `return_object` is `#!py "results"`, then this will be a `#!py list` of `#!py tuple`'s of `#!py dict`'s containing the details of the schema differences. If `return_object` is `#!py "check"`, then it will only be a `#!py bool` object about whether the schemas match or not. 

402 

403 ???+ example "Examples" 

404 

405 ```{.py .python linenums="1" title="Set up"} 

406 >>> # Imports 

407 >>> from pprint import pprint 

408 >>> import pandas as pd 

409 >>> from pyspark.sql import SparkSession, functions as F 

410 >>> from toolbox_pyspark.schema import check_schemas_match 

411 >>> from toolbox_pyspark.io import write_to_path 

412 >>> from toolbox_pyspark.checks import table_exists 

413 >>> 

414 >>> # Instantiate Spark 

415 >>> spark = SparkSession.builder.getOrCreate() 

416 >>> 

417 >>> # Create data 

418 >>> df1 = spark.createDataFrame( 

419 ... pd.DataFrame( 

420 ... { 

421 ... "a": [0, 1, 2, 3], 

422 ... "b": ["a", "b", "c", "d"], 

423 ... "c": ["1", "1", "1", "1"], 

424 ... "d": ["2", "2", "2", "2"], 

425 ... "e": ["3", "3", "3", "3"], 

426 ... "f": ["4", "4", "4", "4"], 

427 ... } 

428 ... ) 

429 ... ) 

430 >>> df2 = ( 

431 ... df1.withColumn("c", F.col("c").cast("int")) 

432 ... .withColumn("g", F.lit("a")) 

433 ... .withColumn("d", F.lit("null")) 

434 ... .drop("e") 

435 ... ) 

436 >>> write_to_path( 

437 ... table=df1, 

438 ... name="left", 

439 ... path="./test", 

440 ... data_format="parquet", 

441 ... mode="overwrite", 

442 ... write_options={"overwriteSchema": "true"}, 

443 ... ) 

444 >>> write_to_path( 

445 ... table=df2, 

446 ... name="right", 

447 ... path="./test", 

448 ... data_format="parquet", 

449 ... mode="overwrite", 

450 ... write_options={"overwriteSchema": "true"}, 

451 ... ) 

452 >>> 

453 >>> # Check 

454 >>> pprint(df1.dtypes) 

455 >>> print(df1.show()) 

456 >>> print(table_exists("left", "./test", "parquet", spark)) 

457 >>> pprint(df2.dtypes) 

458 >>> print(df2.show()) 

459 >>> print(table_exists("right", "./test", "parquet", spark)) 

460 ``` 

461 <div class="result" markdown> 

462 ```{.sh .shell title="Terminal"} 

463 [ 

464 ("a", "bigint"), 

465 ("b", "string"), 

466 ("c", "string"), 

467 ("d", "string"), 

468 ("e", "string"), 

469 ("f", "string"), 

470 ] 

471 ``` 

472 ```{.txt .text title="Terminal"} 

473 +---+---+---+---+---+---+ 

474 | a | b | c | d | e | f | 

475 +---+---+---+---+---+---+ 

476 | 0 | a | 1 | 2 | 3 | 4 | 

477 | 1 | b | 1 | 2 | 3 | 4 | 

478 | 2 | c | 1 | 2 | 3 | 4 | 

479 | 3 | d | 1 | 2 | 3 | 4 | 

480 +---+---+---+---+---+---+ 

481 ``` 

482 ```{.sh .shell title="Terminal"} 

483 True 

484 ``` 

485 ```{.sh .shell title="Terminal"} 

486 [ 

487 ("a", "bigint"), 

488 ("b", "string"), 

489 ("c", "int"), 

490 ("d", "string"), 

491 ("f", "string"), 

492 ("g", "string"), 

493 ] 

494 ``` 

495 ```{.txt .text title="Terminal"} 

496 +---+---+---+------+---+---+ 

497 | a | b | c | d | f | g | 

498 +---+---+---+------+---+---+ 

499 | 0 | a | 1 | null | 4 | 2 | 

500 | 1 | b | 1 | null | 4 | 2 | 

501 | 2 | c | 1 | null | 4 | 2 | 

502 | 3 | d | 1 | null | 4 | 2 | 

503 +---+---+---+------+---+---+ 

504 ``` 

505 ```{.sh .shell title="Terminal"} 

506 True 

507 ``` 

508 </div> 

509 

510 ```{.py .python linenums="1" title="Example 1: Check matching"} 

511 >>> diff = check_schemas_match( 

512 ... method="table_table", 

513 ... left_table=df1, 

514 ... right_table=df1, 

515 ... include_add_field=True, 

516 ... include_change_field=True, 

517 ... include_remove_field=True, 

518 ... include_change_nullable=True, 

519 ... return_object="check", 

520 ... ) 

521 >>> print(diff) 

522 ``` 

523 <div class="result" markdown> 

524 ```{.sh .shell title="Terminal"} 

525 True 

526 ``` 

527 !!! success "Conclusion: Schemas match." 

528 </div> 

529 

530 ```{.py .python linenums="1" title="Example 2: Check not matching"} 

531 >>> diff = check_schemas_match( 

532 ... method="table_table", 

533 ... left_table=df1, 

534 ... right_table=df2, 

535 ... include_add_field=True, 

536 ... include_change_field=True, 

537 ... include_remove_field=True, 

538 ... include_change_nullable=True, 

539 ... return_object="check", 

540 ... ) 

541 >>> print(diff) 

542 ``` 

543 <div class="result" markdown> 

544 ```{.sh .shell title="Terminal"} 

545 False 

546 ``` 

547 !!! failure "Conclusion: Schemas do not match." 

548 </div> 

549 

550 ```{.py .python linenums="1" title="Example 3: Show only `add`"} 

551 >>> diff = check_schemas_match( 

552 ... method="table_table", 

553 ... left_table=df1, 

554 ... right_table=df2, 

555 ... include_add_field=True, 

556 ... include_change_field=False, 

557 ... include_remove_field=False, 

558 ... include_change_nullable=False, 

559 ... return_object="results", 

560 ... ) 

561 >>> print(diff) 

562 ``` 

563 <div class="result" markdown> 

564 ```{.sh .shell title="Terminal"} 

565 [ 

566 ( 

567 "add", 

568 {"left": T.StructField("e", T.StringType(), False)}, 

569 ), 

570 ] 

571 ``` 

572 !!! failure "Conclusion: Schemas do not match because the `e` field was added." 

573 </div> 

574 

575 ```{.py .python linenums="1" title="Example 4: Show `add` and `remove`"} 

576 >>> diff = check_schemas_match( 

577 ... method="table_table", 

578 ... left_table=df1, 

579 ... right_table=df2, 

580 ... include_add_field=True, 

581 ... include_change_field=False, 

582 ... include_remove_field=True, 

583 ... include_change_nullable=False, 

584 ... return_object="results", 

585 ... ) 

586 >>> print(diff) 

587 ``` 

588 <div class="result" markdown> 

589 ```{.sh .shell title="Terminal"} 

590 [ 

591 ( 

592 "add", 

593 {"left": T.StructField("e", T.StringType(), False)}, 

594 ), 

595 ( 

596 "remove", 

597 {"right": T.StructField("g", T.StringType(), False)}, 

598 ), 

599 ] 

600 ``` 

601 !!! failure "Conclusion: Schemas do not match because the `e` field was added and the `g` field was removed." 

602 </div> 

603 

604 ```{.py .python linenums="1" title="Example 5: Show all changes"} 

605 >>> diff = check_schemas_match( 

606 ... method="table_table", 

607 ... left_table=df1, 

608 ... right_table=df2, 

609 ... include_add_field=True, 

610 ... include_change_field=True, 

611 ... include_remove_field=True, 

612 ... include_change_nullable=True, 

613 ... return_object="results", 

614 ... ) 

615 >>> print(diff) 

616 ``` 

617 <div class="result" markdown> 

618 ```{.sh .shell title="Terminal"} 

619 [ 

620 ( 

621 "add", 

622 {"left": T.StructField("e", T.StringType(), False)}, 

623 ), 

624 ( 

625 "remove", 

626 {"right": T.StructField("g", T.StringType(), False)}, 

627 ), 

628 ( 

629 "change_type", 

630 { 

631 "left": T.StructField("c", T.StringType(), False), 

632 "right": T.StructField("c", T.IntegerType(), True), 

633 }, 

634 ), 

635 ( 

636 "change_nullable", 

637 { 

638 "left": T.StructField("c", T.StringType(), False), 

639 "right": T.StructField("c", T.IntegerType(), True), 

640 }, 

641 ), 

642 ] 

643 ``` 

644 !!! failure "Conclusion: Schemas do not match because the `e` field was added, the `g` field was removed, the `c` field had its data type changed, and the `c` field had its nullability changed." 

645 </div> 

646 

647 ```{.py .python linenums="1" title="Example 6: Check where right-hand side is a `path`"} 

648 >>> diff = check_schemas_match( 

649 ... method="path_table", 

650 ... spark_session=spark, 

651 ... left_table=df1, 

652 ... right_table_path="./test", 

653 ... right_table_name="right", 

654 ... right_table_format="parquet", 

655 ... include_add_field=True, 

656 ... include_change_field=False, 

657 ... include_remove_field=False, 

658 ... include_change_nullable=False, 

659 ... return_object="results", 

660 ... ) 

661 >>> print(diff) 

662 ``` 

663 <div class="result" markdown> 

664 ```{.sh .shell title="Terminal"} 

665 [ 

666 ( 

667 "add", 

668 {"left": T.StructField("e", T.StringType(), False)}, 

669 ), 

670 ] 

671 ``` 

672 !!! failure "Conclusion: Schemas do not match because the `e` field was added." 

673 </div> 

674 

675 ```{.py .python linenums="1" title="Example 7: Check where both sides are a `path`"} 

676 >>> diff = check_schemas_match( 

677 ... method="path_path", 

678 ... spark_session=spark, 

679 ... left_table_path="./test", 

680 ... left_table_name="left", 

681 ... left_table_format="parquet", 

682 ... right_table_path="./test", 

683 ... right_table_name="right", 

684 ... right_table_format="parquet", 

685 ... include_add_field=False, 

686 ... include_change_field=True, 

687 ... include_remove_field=False, 

688 ... include_change_nullable=False, 

689 ... return_object="results", 

690 ... ) 

691 >>> print(diff) 

692 ``` 

693 <div class="result" markdown> 

694 ```{.sh .shell title="Terminal"} 

695 [ 

696 ( 

697 "remove", 

698 {"right": T.StructField("g", T.StringType(), True)}, 

699 ), 

700 ] 

701 ``` 

702 !!! failure "Conclusion: Schemas do not match because the `g` field was removed." 

703 </div> 

704 

705 ```{.py .python linenums="1" title="Example 8: Invalid `method` parameter"} 

706 >>> diff = check_schemas_match( 

707 ... method="invalid", 

708 ... left_table=df1, 

709 ... right_table=df2, 

710 ... include_add_field=True, 

711 ... include_change_field=True, 

712 ... include_remove_field=True, 

713 ... include_change_nullable=True, 

714 ... return_object="check", 

715 ... ) 

716 ``` 

717 <div class="result" markdown> 

718 ```{.py .python .title="Terminal"} 

719 AttributeError: Invalid value for `method`: 'invalid' 

720 Please use one of the following options: 

721 - For `by_table_and_table`, use one of the following values: ['table', 'table_table', 'tables', 'by_table', 'by_table_and_table', 'table_and_table'] 

722 - For `by_table_and_path`, use one of the following values: ['table_and_path', 'table_path', 'by_table_and_path'] 

723 - For `by_path_and_table`, use one of the following values: ['path_and_table', 'path_table', 'by_path_and_table'] 

724 - For `by_path_and_path`, use one of the following values: ['path_and_path', 'path_path', 'by_path_and_path', 'path', 'paths'] 

725 ``` 

726 !!! failure "Conclusion: Invalid `method` parameter." 

727 </div> 

728 

729 ???+ info "Notes" 

730 

731 ???+ info "Options available in the `method` parameter" 

732 

733 The options available in the `method` parameter include: 

734 

735 - If the objects on both the left-hand side and the right-hand side are both `dataframes` already loaded to memory, use one of the following values: 

736 <div class="mdx-three-columns" markdown> 

737 - `#!py "table"` 

738 - `#!py "table_table"` 

739 - `#!py "tables"` 

740 - `#!py "by_table"` 

741 - `#!py "by_table_and_table"` 

742 - `#!py "table_and_table"` 

743 </div> 

744 - If the object on the left-hand side is a `dataframe` already loaded to memory, but the object on the right-hand side is a table sitting on a path somewhere, use one of the following values: 

745 <div class="mdx-three-columns" markdown> 

746 - `#!py "table_and_path"` 

747 - `#!py "table_path"` 

748 - `#!py "by_table_and_path"` 

749 </div> 

750 - If the object on the left-hand side is a table sitting on a path somewhere, but the object on the right-hand side is a `dataframe` already loaded to memory, use one of the following values: 

751 <div class="mdx-three-columns" markdown> 

752 - `#!py "path_and_table"` 

753 - `#!py "path_table"` 

754 - `#!py "by_path_and_table"` 

755 </div> 

756 - If the objects on both the left-hand side and the right-hand side are both tables sitting on a path somewhere, then use one of the following values: 

757 <div class="mdx-three-columns" markdown> 

758 - `#!py "path_and_path"` 

759 - `#!py "path_path"` 

760 - `#!py "by_path_and_path"` 

761 - `#!py "path"` 

762 - `#!py "paths"` 

763 </div> 

764 

765 ???+ info "Details about the return object when we set the parameter `#!py return_object="results"`" 

766 

767 - When we set the parameter `#!py return_object="results"`, then we will get an object returned from this function. 

768 - That object will be a `#!py list` of `#!py tuple`'s, each `#!py tuple` is only two-elements long, where the first element is a `#!py str` object, and the second is a `#!py dict` where the keys are `#!py str` and the values are a `#!py StructField` object. 

769 - For each of the `#!py tuple` elements, the first element (the `#!py str` object) describes what the `#!py tuple` is there for. It will be one of four words: `#!py "add"`, `#!py "remove"`, `#!py "change_type"`, or `#!py "change_nullable"`. 

770 - You can change whether these options are included in the schema check by changing the other parameters: `#!py include_change_field`, `#!py include_add_field`, `#!py include_remove_field`, `#!py include_change_nullable`. 

771 - The structure of the list will look like this: 

772 

773 ```{.py .python .title="The structure of the returned object"} 

774 [ 

775 ( 

776 "add", # (1)! 

777 {"left": T.StructField("e", T.StringType(), False)}, # (2)! 

778 ), 

779 ( 

780 "add", # (3)! 

781 {"left": T.StructField("h", T.StringType(), False)}, 

782 ), 

783 ( 

784 "remove", # (4)! 

785 {"right": T.StructField("g", T.StringType(), False)}, # (5)! 

786 ), 

787 ( 

788 "change_type", # (6)! 

789 { 

790 "left": T.StructField("c", T.StringType(), False), # (7)! 

791 "right": T.StructField("c", T.IntegerType(), True), 

792 }, 

793 ), 

794 ( 

795 "change_nullable", # (8)! 

796 { 

797 "left": T.StructField("c", T.StringType(), False), # (9)! 

798 "right": T.StructField("c", T.IntegerType(), True), 

799 }, 

800 ), 

801 ] 

802 ``` 

803 

804 1. When `#!py include_add_field=True`, then the `add` section will always appear first.<br> 

805 If `#!py include_add_field=False`, then this section is omitted. 

806 2. The second element of the `#!py tuple` is a `#!py dict` that has only one `key`-`value` pair.<br> 

807 The `key` will _always_ be the value `#!py "left"`, because these are fields which have been added to the table on the left-hand side and not found on the right-hand side. 

808 3. When there are multiple fields added to the table on the left-hand side, they will appear like this. 

809 4. When `#!py include_remove_field=True`, then the `remove` section will always appear next.<br> 

810 If `#!py include_remove_field=False`, then this section is omitted. 

811 5. The second element of the `#!py tuple` is a `#!py dict` that has only one `key`-`value` pair.<br> 

812 The `key` will _always_ be the value `#!py "right"`, because these are fields which have been removed from the left-hand side and only visible on the right-hand side. 

813 6. When `#!py include_change_field=True`, then the `change_type` section will always appear next.<br> 

814 If `#!py include_change_field=False`, then this section is omitted. 

815 7. The second element of the `#!py tuple` is a `#!py dict` that has two `key`-`value` pairs.<br> 

816 The `key`'s will _always_ be the values `#!py "left"` then `#!py "right"`, because these are fields where the data type has changed between the left-hand side and the right-hand side, and therefore you need to see both to see exactly what has changed. 

817 8. When `#!py include_change_nullable=True`, then the `change_nullable` section will always appear next.<br> 

818 If `#!py include_change_nullable=False`, then this section is omitted. 

819 9. The second element of the `#!py tuple` is a `#!py dict` that has two `key`-`value` pairs.<br> 

820 The `key`'s will _always_ be the values `#!py "left"` then `#!py "right"`, because these are fields where the nullability has changed between the left-hand side and the right-hand side, and therefore you need to see both to see exactly what has changed. 

821 """ 

822 

823 valid_methods = ValidMethods() 

824 msg: str = "If using the '{meth}' method, then '{name}' cannot be 'None'." 

825 

826 if method in valid_methods.by_table_and_table: 

827 assert left_table is not None, msg.format(meth=method, name="left_table") 

828 assert right_table is not None, msg.format(meth=method, name="right_table") 

829 return _check_schemas_match_by_table_and_table( 

830 left_table=left_table, 

831 right_table=right_table, 

832 include_change_field=include_change_field, 

833 include_add_field=include_add_field, 

834 include_remove_field=include_remove_field, 

835 include_change_nullable=include_change_nullable, 

836 return_object=return_object, 

837 ) 

838 elif method in valid_methods.by_table_and_path: 

839 assert left_table is not None, msg.format(meth=method, name="left_table") 

840 assert right_table_path is not None, msg.format(meth=method, name="right_table_path") 

841 assert right_table_name is not None, msg.format(meth=method, name="right_table_name") 

842 assert spark_session is not None, msg.format(meth=method, name="spark_session") 

843 return _check_schemas_match_by_table_and_path( 

844 left_table=left_table, 

845 right_table_path=right_table_path, 

846 right_table_name=right_table_name, 

847 right_table_format=right_table_format, 

848 spark_session=spark_session, 

849 include_change_field=include_change_field, 

850 include_add_field=include_add_field, 

851 include_remove_field=include_remove_field, 

852 include_change_nullable=include_change_nullable, 

853 return_object=return_object, 

854 ) 

855 elif method in valid_methods.by_path_and_table: 

856 assert left_table_path is not None, msg.format(meth=method, name="left_table_path") 

857 assert left_table_name is not None, msg.format(meth=method, name="left_table_name") 

858 assert right_table is not None, msg.format(meth=method, name="right_table") 

859 assert spark_session is not None, msg.format(meth=method, name="spark_session") 

860 return _check_schemas_match_by_path_and_table( 

861 left_table_path=left_table_path, 

862 left_table_name=left_table_name, 

863 right_table=right_table, 

864 spark_session=spark_session, 

865 left_table_format=left_table_format, 

866 include_change_field=include_change_field, 

867 include_add_field=include_add_field, 

868 include_remove_field=include_remove_field, 

869 include_change_nullable=include_change_nullable, 

870 return_object=return_object, 

871 ) 

872 elif method in valid_methods.by_path_and_path: 

873 assert left_table_path is not None, msg.format(meth=method, name="left_table_path") 

874 assert left_table_name is not None, msg.format(meth=method, name="left_table_name") 

875 assert right_table_path is not None, msg.format(meth=method, name="right_table_path") 

876 assert right_table_name is not None, msg.format(meth=method, name="right_table_name") 

877 assert spark_session is not None, msg.format(meth=method, name="spark_session") 

878 return _check_schemas_match_by_path_and_path( 

879 left_table_path=left_table_path, 

880 left_table_name=left_table_name, 

881 left_table_format=left_table_format, 

882 right_table_path=right_table_path, 

883 right_table_name=right_table_name, 

884 right_table_format=right_table_format, 

885 spark_session=spark_session, 

886 include_change_field=include_change_field, 

887 include_add_field=include_add_field, 

888 include_remove_field=include_remove_field, 

889 include_change_nullable=include_change_nullable, 

890 return_object=return_object, 

891 ) 

892 else: 

893 raise AttributeError( 

894 f"Invalid value for `method`: '{method}'\n" 

895 f"Please use one of the following options:\n" 

896 f"- For `by_table_and_table`, use one of: {valid_methods.by_table_and_table}\n" 

897 f"- For `by_table_and_path`, use one of: {valid_methods.by_table_and_path}\n" 

898 f"- For `by_path_and_table`, use one of: {valid_methods.by_path_and_table}\n" 

899 f"- For `by_path_and_path`, use one of: {valid_methods.by_path_and_path}\n" 

900 ) 

901 

902 

903# ---------------------------------------------------------------------------- # 

904# View Differences #### 

905# ---------------------------------------------------------------------------- # 

906 

907 

@typechecked
def _view_schema_differences_by_table_and_table(
    left_table: psDataFrame,
    right_table: psDataFrame,
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        Compare the schemas of two in-memory `dataframe`s, then either display the differences to the terminal or return them, depending on `view_type`.
    """
    differences: Union[list[tuple[str, dict[str, StructField]]], bool] = check_schemas_match(
        method="table_table",
        left_table=left_table,
        right_table=right_table,
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        return_object="results",
    )
    # Nothing to show when the schemas match: the result is either a `bool`
    # or an empty list of differences.
    if not is_type(differences, list) or len(differences) == 0:
        return None
    if view_type == "return":
        return differences
    # Otherwise display to the terminal, either plain or pretty-printed.
    displayer = print if view_type == "print" else pprint
    displayer(differences)
    return None

938 

939 

@typechecked
def _view_schema_differences_by_path_and_path(
    left_table_path: str,
    left_table_name: str,
    right_table_path: str,
    right_table_name: str,
    spark_session: SparkSession,
    left_table_format: str = "delta",
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        Load both the `left` and `right` tables from disk, then delegate the comparison to the table-and-table implementation.
    """
    # Read each side into memory first; the actual comparison logic lives in
    # `_view_schema_differences_by_table_and_table()`.
    loaded: dict[str, psDataFrame] = {
        "left": read_from_path(
            name=left_table_name,
            path=left_table_path,
            spark_session=spark_session,
            data_format=left_table_format,
        ),
        "right": read_from_path(
            name=right_table_name,
            path=right_table_path,
            spark_session=spark_session,
            data_format=right_table_format,
        ),
    }
    return _view_schema_differences_by_table_and_table(
        left_table=loaded["left"],
        right_table=loaded["right"],
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        view_type=view_type,
    )

976 

977 

@typechecked
def _view_schema_differences_by_table_and_path(
    left_table: psDataFrame,
    right_table_path: str,
    right_table_name: str,
    spark_session: SparkSession,
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        Load the `right` table from disk, then delegate the comparison to the table-and-table implementation.
    """
    # Only the right-hand side needs to be materialised from disk here.
    loaded_right: psDataFrame = read_from_path(
        name=right_table_name,
        path=right_table_path,
        spark_session=spark_session,
        data_format=right_table_format,
    )
    return _view_schema_differences_by_table_and_table(
        left_table=left_table,
        right_table=loaded_right,
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        view_type=view_type,
    )

1006 

1007 

@typechecked
def _view_schema_differences_by_path_and_table(
    left_table_path: str,
    left_table_name: str,
    right_table: psDataFrame,
    spark_session: SparkSession,
    left_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        Load the `left` table from disk, then delegate the comparison to the table-and-table implementation.
    """
    # Only the left-hand side needs to be materialised from disk here.
    loaded_left: psDataFrame = read_from_path(
        name=left_table_name,
        path=left_table_path,
        spark_session=spark_session,
        data_format=left_table_format,
    )
    return _view_schema_differences_by_table_and_table(
        left_table=loaded_left,
        right_table=right_table,
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        view_type=view_type,
    )

1036 

1037 

@typechecked
def view_schema_differences(
    method: str = "by_table_and_table",
    spark_session: Optional[SparkSession] = None,
    left_table: Optional[psDataFrame] = None,
    left_table_path: Optional[str] = None,
    left_table_name: Optional[str] = None,
    left_table_format: str = "delta",
    right_table: Optional[psDataFrame] = None,
    right_table_path: Optional[str] = None,
    right_table_name: Optional[str] = None,
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        View the schemas between two different tables.

    ???+ abstract "Details"
        The primary differences between [`check_schemas_match()`][toolbox_pyspark.schema.check_schemas_match] and [`view_schema_differences()`][toolbox_pyspark.schema.view_schema_differences] is that [`check_...()`][toolbox_pyspark.schema.check_schemas_match] returns either a `#!py bool` result, or the actual details of the schema differences, whilst [`view_...()`][toolbox_pyspark.schema.view_schema_differences] may also return the actual details object, but it will also print the result to the terminal for you to review.<br>
        For full details of all the parameters and all the options, including nuances and detailed explanations and thorough examples, please check the [`check_schemas_match()`][toolbox_pyspark.schema.check_schemas_match] function.

    Params:
        method (str, optional):
            The method to use for the comparison. That is, is either side a table in memory or is it a `table` sitting on a `path`?. Check the Notes section for all options available for this parameter.<br>
            Defaults to `#!py "by_table_and_table"`.
        spark_session (Optional[SparkSession], optional):
            The `SparkSession` to use if either the `left` or `right` tables are sitting on a `path` somewhere.<br>
            Defaults to `#!py None`.
        left_table (Optional[psDataFrame], optional):
            If `method` defines the `left` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br>
            Defaults to `#!py None`.
        left_table_path (Optional[str], optional):
            If `method` defines the `left` table as a `path`, then this parameter is the actual path location where the table can be found.<br>
            Defaults to `#!py None`.
        left_table_name (Optional[str], optional):
            If `method` defines the `left` table as a `path`, then this parameter is the name of the table found at the given `left_table_path` location.<br>
            Defaults to `#!py None`.
        left_table_format (str, optional):
            If `method` defines the `left` table as a `path`, then this parameter is the format of the table found at the given `left_table_path` location.<br>
            Defaults to `#!py "delta"`.
        right_table (Optional[psDataFrame], optional):
            If `method` defines the `right` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br>
            Defaults to `#!py None`.
        right_table_path (Optional[str], optional):
            If `method` defines the `right` table as a `path`, then this parameter is the actual path location where the table can be found.<br>
            Defaults to `#!py None`.
        right_table_name (Optional[str], optional):
            If `method` defines the `right` table as a `path`, then this parameter is the name of the table found at the given `right_table_path` location.<br>
            Defaults to `#!py None`.
        right_table_format (str, optional):
            If `method` defines the `right` table as a `path`, then this parameter is the format of the table found at the given `right_table_path` location.<br>
            Defaults to `#!py "delta"`.
        include_change_field (bool, optional):
            When doing the schema validations, do you want to include any fields where the data-type on the right-hand side is different from the left-hand side?<br>
            This can be read as: "What fields have had their data type _changed **between**_ the left-hand side and the right-hand side?"<br>
            Defaults to `#!py True`.
        include_add_field (bool, optional):
            When doing the schema validations, do you want to include any fields that have had any additional fields added to the left-hand side, when compared to the right-hand side?<br>
            This can be read as: "What fields have been _added **to**_ the left-hand side?"<br>
            Defaults to `#!py True`.
        include_remove_field (bool, optional):
            When doing the schema validations, do you want to include any fields which are missing from the left-hand side and only existing on the right-hand side?<br>
            This can be read as: "What fields been _removed **from**_ the left-hand side?"<br>
            Defaults to `#!py True`.
        include_change_nullable (bool, optional):
            When doing the schema validations, do you want to include any fields which have had their nullability metadata changed on the right-hand side, when compared to the left-hand side?.<br>
            This can be read as: "What fields had their nullability _changed **between**_ the left-hand side and the right-hand side?"<br>
            Defaults to `#!py False`.
        view_type (Literal["print", "pprint", "return"], optional):
            When returning the output from this function, how do you want it to be displayed? Must be one of `#!py ["print", "pprint", "return"]`.<br>
            Logically, the difference is that `#!py "print"` will display a text value to the terminal that is not formatted in any way; `#!py "pprint"` will display a pretty-printed text value to the terminal; and `#!py "return"` will return the schema differences which can then be assigned to another variable.<br>
            Defaults to `#!py "pprint"`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        AttributeError:
            If the value parsed to `method` is not a valid option.

    Returns:
        (Optional[list[tuple[str, dict[str, StructField]]]]):
            If `#!py view_type="return"`, then this will be a `#!py list` of `#!py tuple`'s of `#!py dict`'s containing the details of the schema differences. If `#!py view_type!="return"` (or if `#!py view_type="return"`, but there are actually no differences in the schema), then nothing is returned; only printed to terminal.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> from pprint import pprint
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.schema import view_schema_differences
        >>> from toolbox_pyspark.io import write_to_path
        >>> from toolbox_pyspark.checks import table_exists
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df1 = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [0, 1, 2, 3],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": ["1", "1", "1", "1"],
        ...             "d": ["2", "2", "2", "2"],
        ...             "e": ["3", "3", "3", "3"],
        ...             "f": ["4", "4", "4", "4"],
        ...         }
        ...     )
        ... )
        >>> df2 = (
        ...     df1.withColumn("c", F.col("c").cast("int"))
        ...     .withColumn("g", F.lit("a"))
        ...     .withColumn("d", F.lit("null"))
        ...     .drop("e")
        ... )
        >>> write_to_path(
        ...     table=df1,
        ...     name="left",
        ...     path="./test",
        ...     data_format="parquet",
        ...     mode="overwrite",
        ...     write_options={"overwriteSchema": "true"},
        ... )
        >>> write_to_path(
        ...     table=df2,
        ...     name="right",
        ...     path="./test",
        ...     data_format="parquet",
        ...     mode="overwrite",
        ...     write_options={"overwriteSchema": "true"},
        ... )
        >>>
        >>> # Check
        >>> pprint(df1.dtypes)
        >>> print(df1.show())
        >>> print(table_exists("left", "./test", "parquet", spark))
        >>> pprint(df2.dtypes)
        >>> print(df2.show())
        >>> print(table_exists("right", "./test", "parquet", spark))
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [
            ("a", "bigint"),
            ("b", "string"),
            ("c", "string"),
            ("d", "string"),
            ("e", "string"),
            ("f", "string"),
        ]
        ```
        ```{.txt .text title="Terminal"}
        +---+---+---+---+---+---+
        | a | b | c | d | e | f |
        +---+---+---+---+---+---+
        | 0 | a | 1 | 2 | 3 | 4 |
        | 1 | b | 1 | 2 | 3 | 4 |
        | 2 | c | 1 | 2 | 3 | 4 |
        | 3 | d | 1 | 2 | 3 | 4 |
        +---+---+---+---+---+---+
        ```
        ```{.sh .shell title="Terminal"}
        True
        ```
        ```{.sh .shell title="Terminal"}
        [
            ("a", "bigint"),
            ("b", "string"),
            ("c", "int"),
            ("d", "string"),
            ("f", "string"),
            ("g", "string"),
        ]
        ```
        ```{.txt .text title="Terminal"}
        +---+---+---+------+---+---+
        | a | b | c | d    | f | g |
        +---+---+---+------+---+---+
        | 0 | a | 1 | null | 4 | 2 |
        | 1 | b | 1 | null | 4 | 2 |
        | 2 | c | 1 | null | 4 | 2 |
        | 3 | d | 1 | null | 4 | 2 |
        +---+---+---+------+---+---+
        ```
        ```{.sh .shell title="Terminal"}
        True
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Check matching"}
        >>> diff = view_schema_differences(
        ...     method="table_table",
        ...     left_table=df1,
        ...     right_table=df1,
        ...     include_add_field=True,
        ...     include_change_field=True,
        ...     include_remove_field=True,
        ...     include_change_nullable=True,
        ...     view_type="return",
        ... )
        >>> print(diff)
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        None
        ```
        !!! success "Conclusion: Schemas match."
        </div>

        ```{.py .python linenums="1" title="Example 2: Check print"}
        >>> view_schema_differences(
        ...     method="table_table",
        ...     left_table=df1,
        ...     right_table=df2,
        ...     include_add_field=True,
        ...     include_change_field=False,
        ...     include_remove_field=False,
        ...     include_change_nullable=False,
        ...     view_type="print",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [('add', {'left': StructField('e', StringType(), True)})]
        ```
        !!! failure "Conclusion: Schemas do not match because the `e` field was added."
        </div>

        ```{.py .python linenums="1" title="Example 3: Check pprint"}
        >>> view_schema_differences(
        ...     method="table_table",
        ...     left_table=df1,
        ...     right_table=df2,
        ...     include_add_field=True,
        ...     include_change_field=True,
        ...     include_remove_field=True,
        ...     include_change_nullable=True,
        ...     view_type="pprint",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [('add', {'left': StructField('e', StringType(), False)}),
         ('remove', {'right': StructField('g', StringType(), False)}),
         ('change_type',
          {'left': StructField('c', StringType(), False),
           'right': StructField('c', IntegerType(), True)}),
         ('change_nullable',
          {'left': StructField('c', StringType(), False),
           'right': StructField('c', IntegerType(), True)})]
        ```
        !!! failure "Conclusion: Schemas do not match because the `e` field was added, the `g` field was removed, the `c` field had its data type changed, and the `c` field had its nullability changed."
        </div>

        ```{.py .python linenums="1" title="Example 4: Check with right-hand side as a `path`"}
        >>> view_schema_differences(
        ...     method="table_and_path",
        ...     spark_session=spark,
        ...     left_table=df1,
        ...     right_table_path="./test",
        ...     right_table_name="right",
        ...     right_table_format="parquet",
        ...     include_add_field=True,
        ...     include_change_field=False,
        ...     include_remove_field=False,
        ...     include_change_nullable=False,
        ...     view_type="pprint",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [('add', {'left': StructField('e', StringType(), True)})]
        ```
        !!! failure "Conclusion: Schemas do not match because the `e` field was added."
        </div>

        ```{.py .python linenums="1" title="Example 5: Check with both sides being a `path`"}
        >>> view_schema_differences(
        ...     method="path_path",
        ...     spark_session=spark,
        ...     left_table_path="./test",
        ...     left_table_name="left",
        ...     left_table_format="parquet",
        ...     right_table_path="./test",
        ...     right_table_name="right",
        ...     right_table_format="parquet",
        ...     include_add_field=False,
        ...     include_change_field=False,
        ...     include_remove_field=True,
        ...     include_change_nullable=False,
        ...     view_type="pprint",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [('remove', {'right': StructField('g', StringType(), True)})]
        ```
        !!! failure "Conclusion: Schemas do not match because the `g` field was removed."
        </div>

        ```{.py .python linenums="1" title="Example 6: Invalid `method` parameter"}
        >>> view_schema_differences(
        ...     method="table_table_table",
        ...     left_table=df1,
        ...     right_table=df2,
        ...     include_add_field=True,
        ...     include_change_field=True,
        ...     include_remove_field=True,
        ...     include_change_nullable=True,
        ...     view_type="return",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        AttributeError: Invalid value for `method`: 'table_table_table'
        Please use one of the following options:
        - For `by_table_and_table`, use one of: ['table', 'table_table', 'tables', 'by_table', 'by_table_and_table', 'table_and_table']
        - For `by_table_and_path`, use one of: ['table_and_path', 'table_path', 'by_table_and_path']
        - For `by_path_and_table`, use one of: ['path_and_table', 'path_table', 'by_path_and_table']
        - For `by_path_and_path`, use one of: ['path_and_path', 'path_path', 'by_path_and_path', 'path', 'paths']
        ```
        !!! failure "Conclusion: Invalid `method` parameter."
        </div>

    ??? tip "See Also"
        - [`check_schemas_match()`][toolbox_pyspark.schema.check_schemas_match]
    """

    valid_methods: ValidMethods = ValidMethods()
    msg: str = "If using the '{meth}' method, then '{name}' cannot be 'None'."

    # Dispatch to the correct private implementation, first asserting that the
    # parameters required by the chosen `method` have actually been provided.
    if method in valid_methods.by_table_and_table:
        assert left_table is not None, msg.format(meth=method, name="left_table")
        assert right_table is not None, msg.format(meth=method, name="right_table")
        return _view_schema_differences_by_table_and_table(
            left_table=left_table,
            right_table=right_table,
            include_change_field=include_change_field,
            include_add_field=include_add_field,
            include_remove_field=include_remove_field,
            include_change_nullable=include_change_nullable,
            view_type=view_type,
        )
    elif method in valid_methods.by_table_and_path:
        assert left_table is not None, msg.format(meth=method, name="left_table")
        assert right_table_path is not None, msg.format(meth=method, name="right_table_path")
        assert right_table_name is not None, msg.format(meth=method, name="right_table_name")
        assert spark_session is not None, msg.format(meth=method, name="spark_session")
        return _view_schema_differences_by_table_and_path(
            left_table=left_table,
            right_table_path=right_table_path,
            right_table_name=right_table_name,
            right_table_format=right_table_format,
            spark_session=spark_session,
            include_change_field=include_change_field,
            include_add_field=include_add_field,
            include_remove_field=include_remove_field,
            include_change_nullable=include_change_nullable,
            view_type=view_type,
        )
    elif method in valid_methods.by_path_and_table:
        assert left_table_path is not None, msg.format(meth=method, name="left_table_path")
        assert left_table_name is not None, msg.format(meth=method, name="left_table_name")
        assert right_table is not None, msg.format(meth=method, name="right_table")
        assert spark_session is not None, msg.format(meth=method, name="spark_session")
        return _view_schema_differences_by_path_and_table(
            left_table_path=left_table_path,
            left_table_name=left_table_name,
            left_table_format=left_table_format,
            right_table=right_table,
            spark_session=spark_session,
            include_change_field=include_change_field,
            include_add_field=include_add_field,
            include_remove_field=include_remove_field,
            include_change_nullable=include_change_nullable,
            view_type=view_type,
        )
    elif method in valid_methods.by_path_and_path:
        assert left_table_path is not None, msg.format(meth=method, name="left_table_path")
        assert left_table_name is not None, msg.format(meth=method, name="left_table_name")
        assert right_table_path is not None, msg.format(meth=method, name="right_table_path")
        assert right_table_name is not None, msg.format(meth=method, name="right_table_name")
        assert spark_session is not None, msg.format(meth=method, name="spark_session")
        return _view_schema_differences_by_path_and_path(
            left_table_path=left_table_path,
            left_table_name=left_table_name,
            left_table_format=left_table_format,
            right_table_path=right_table_path,
            right_table_name=right_table_name,
            right_table_format=right_table_format,
            spark_session=spark_session,
            include_change_field=include_change_field,
            include_add_field=include_add_field,
            include_remove_field=include_remove_field,
            include_change_nullable=include_change_nullable,
            view_type=view_type,
        )
    else:
        raise AttributeError(
            f"Invalid value for `method`: '{method}'\n"
            f"Please use one of the following options:\n"
            f"- For `by_table_and_table`, use one of: {valid_methods.by_table_and_table}\n"
            f"- For `by_table_and_path`, use one of: {valid_methods.by_table_and_path}\n"
            f"- For `by_path_and_table`, use one of: {valid_methods.by_path_and_table}\n"
            f"- For `by_path_and_path`, use one of: {valid_methods.by_path_and_path}\n"
        )