Coverage for src/toolbox_pyspark/schema.py: 100%

144 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-25 23:08 +0000

1# ============================================================================ # 

2# # 

3# Title : Schema # 

4# Purpose : Checking, validating, and viewing any schema differences # 

5# between two different tables, either from in-memory variables, # 

6# or pointing to locations on disk. # 

7# # 

8# ============================================================================ # 

9 

10 

11# ---------------------------------------------------------------------------- # 

12# # 

13# Overview #### 

14# # 

15# ---------------------------------------------------------------------------- # 

16 

17 

18# ---------------------------------------------------------------------------- # 

19# Description #### 

20# ---------------------------------------------------------------------------- # 

21 

22 

23""" 

24!!! note "Summary" 

25 The `schema` module is used for checking, validating, and viewing any schema differences between two different tables, either from in-memory variables, or pointing to locations on disk. 

26""" 

27 

28 

29# ---------------------------------------------------------------------------- # 

30# # 

31# Setup #### 

32# # 

33# ---------------------------------------------------------------------------- # 

34 

35 

36# ---------------------------------------------------------------------------- # 

37# Imports #### 

38# ---------------------------------------------------------------------------- # 

39 

40 

41# ## Python StdLib Imports ---- 

42from pprint import pprint 

43from typing import Literal, NamedTuple, Optional, Union 

44 

45# ## Python Third Party Imports ---- 

46from pyspark.sql import DataFrame as psDataFrame, SparkSession 

47from pyspark.sql.types import StructField 

48from toolbox_python.checkers import is_type 

49from toolbox_python.collection_types import str_list, str_set 

50from typeguard import typechecked 

51 

52# ## Local First Party Imports ---- 

53from toolbox_pyspark.io import read_from_path 

54 

55 

56# ---------------------------------------------------------------------------- # 

57# Exports #### 

58# ---------------------------------------------------------------------------- # 

59 

60 

# Public API of this module.
__all__: str_list = ["view_schema_differences", "check_schemas_match"]

65 

66 

67# ---------------------------------------------------------------------------- # 

68# # 

69# Functions #### 

70# # 

71# ---------------------------------------------------------------------------- # 

72 

73 

74## --------------------------------------------------------------------------- # 

75## Classes #### 

76## --------------------------------------------------------------------------- # 

77 

78 

class ValidMethods(NamedTuple):
    """
    !!! note "Summary"
        Named container holding the alias values accepted by the `method`
        parameter of the schema-checking functions. Each attribute is a
        `#!py set` of `#!py str` aliases, all of which select the same
        comparison strategy.

    ```py
    by_table_and_table: str_set
    by_table_and_path: str_set
    by_path_and_table: str_set
    by_path_and_path: str_set
    ```
    """

    by_table_and_table: str_set = {
        "table",
        "table_table",
        "tables",
        "by_table",
        "by_table_and_table",
        "table_and_table",
    }
    """
    Aliases for comparing two in-memory `dataframe` objects:
    ```py
    {
        "table",
        "table_table",
        "tables",
        "by_table",
        "by_table_and_table",
        "table_and_table",
    }
    ```
    """
    by_table_and_path: str_set = {
        "table_and_path",
        "table_path",
        "by_table_and_path",
    }
    """
    Aliases for comparing an in-memory `dataframe` (left) against a table on a path (right):
    ```py
    {
        "table_and_path",
        "table_path",
        "by_table_and_path",
    }
    ```
    """
    by_path_and_table: str_set = {
        "path_and_table",
        "path_table",
        "by_path_and_table",
    }
    """
    Aliases for comparing a table on a path (left) against an in-memory `dataframe` (right):
    ```py
    {
        "path_and_table",
        "path_table",
        "by_path_and_table",
    }
    ```
    """
    by_path_and_path: str_set = {
        "path_and_path",
        "path_path",
        "by_path_and_path",
        "path",
        "paths",
    }
    """
    Aliases for comparing two tables both sitting on paths:
    ```py
    {
        "path_and_path",
        "path_path",
        "by_path_and_path",
        "path",
        "paths",
    }
    ```
    """

155 

156 

157# ---------------------------------------------------------------------------- # 

158# Check Matching #### 

159# ---------------------------------------------------------------------------- # 

160 

161 

@typechecked
def _check_schemas_match_by_table_and_table(
    left_table: psDataFrame,
    right_table: psDataFrame,
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    return_object: Literal["results", "check"] = "check",
) -> Union[list, bool]:
    """
    !!! note "Summary"
        Compare the schemas of two in-memory `pyspark` dataframes.

    Params:
        left_table (psDataFrame):
            The left-hand side `dataframe` to compare.
        right_table (psDataFrame):
            The right-hand side `dataframe` to compare.
        include_change_field (bool, optional):
            Include fields whose data type differs between the two sides.<br>
            Defaults to `#!py True`.
        include_add_field (bool, optional):
            Include fields present on the left but missing on the right.<br>
            Defaults to `#!py True`.
        include_remove_field (bool, optional):
            Include fields present on the right but missing on the left.<br>
            Defaults to `#!py True`.
        include_change_nullable (bool, optional):
            Include fields whose nullability differs between the two sides.<br>
            Defaults to `#!py False`.
        return_object (Literal["results", "check"], optional):
            If `#!py "results"`, return the list of differences; if
            `#!py "check"`, return a `#!py bool`.<br>
            Defaults to `#!py "check"`.

    Returns:
        (Union[list, bool]):
            The list of differences when `#!py return_object="results"` and
            differences exist; otherwise a `#!py bool`. Note: when the
            schemas match, `#!py True` is returned regardless of
            `return_object`.
    """

    # Set up: use the public `StructType.names` / `StructType.fields`
    # attributes rather than poking into `__dict__`.
    left_names: str_list = left_table.schema.names
    left_fields: list[StructField] = list(left_table.schema.fields)
    right_names: str_list = right_table.schema.names
    right_fields: list[StructField] = list(right_table.schema.fields)
    results: list = []

    # Fields added on the left (missing from the right)
    if include_add_field:
        results.extend(
            ("add", {"left": left_field})
            for left_field in left_fields
            if left_field.name not in right_names
        )

    # Fields removed from the left (only present on the right)
    if include_remove_field:
        results.extend(
            ("remove", {"right": right_field})
            for right_field in right_fields
            if right_field.name not in left_names
        )

    # Fields present on both sides whose type or nullability changed.
    # NOTE: previously the nullability check was nested under
    # `include_change_field`, so `include_change_nullable=True` was silently
    # ignored when `include_change_field=False`; the two flags are now
    # independent, as documented.
    if include_change_field or include_change_nullable:
        for left_field in left_fields:
            if left_field.name not in right_names:
                continue
            right_field: StructField = next(
                field for field in right_fields if field.name == left_field.name
            )
            if include_change_field and left_field.dataType != right_field.dataType:
                results.append(("change_type", {"left": left_field, "right": right_field}))
            if include_change_nullable and left_field.nullable != right_field.nullable:
                results.append(
                    ("change_nullable", {"left": left_field, "right": right_field})
                )

    # Return: honour `return_object` only when differences were found;
    # matching schemas always yield `True` (preserved historical behaviour).
    if results:
        return results if return_object == "results" else False
    return True

217 

218 

@typechecked
def _check_schemas_match_by_table_and_path(
    left_table: psDataFrame,
    right_table_path: str,
    right_table_name: str,
    spark_session: SparkSession,
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    return_object: Literal["results", "check"] = "check",
) -> Union[list, bool]:
    """
    !!! note "Summary"
        Check schemas where the `left` object is an in-memory `dataframe`
        and the `right` object is a table sitting on a path.

    Loads the `right` table from disk, then delegates the comparison to
    `_check_schemas_match_by_table_and_table()`.
    """
    return _check_schemas_match_by_table_and_table(
        left_table=left_table,
        right_table=read_from_path(
            name=right_table_name,
            path=right_table_path,
            spark_session=spark_session,
            data_format=right_table_format,
        ),
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        return_object=return_object,
    )

247 

248 

@typechecked
def _check_schemas_match_by_path_and_table(
    left_table_path: str,
    left_table_name: str,
    right_table: psDataFrame,
    spark_session: SparkSession,
    left_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    return_object: Literal["results", "check"] = "check",
) -> Union[list, bool]:
    """
    !!! note "Summary"
        Check schemas where the `left` object is a table sitting on a path
        and the `right` object is an in-memory `dataframe`.

    Loads the `left` table from disk, then delegates the comparison to
    `_check_schemas_match_by_table_and_table()`.
    """
    return _check_schemas_match_by_table_and_table(
        left_table=read_from_path(
            name=left_table_name,
            path=left_table_path,
            spark_session=spark_session,
            data_format=left_table_format,
        ),
        right_table=right_table,
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        return_object=return_object,
    )

277 

278 

@typechecked
def _check_schemas_match_by_path_and_path(
    left_table_path: str,
    left_table_name: str,
    right_table_path: str,
    right_table_name: str,
    spark_session: SparkSession,
    left_table_format: str = "delta",
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    return_object: Literal["results", "check"] = "check",
) -> Union[list, bool]:
    """
    !!! note "Summary"
        Check schemas where both the `left` and `right` objects are tables
        sitting on paths.

    Loads both tables from disk, then delegates the comparison to
    `_check_schemas_match_by_table_and_table()`.
    """
    return _check_schemas_match_by_table_and_table(
        left_table=read_from_path(
            name=left_table_name,
            path=left_table_path,
            spark_session=spark_session,
            data_format=left_table_format,
        ),
        right_table=read_from_path(
            name=right_table_name,
            path=right_table_path,
            spark_session=spark_session,
            data_format=right_table_format,
        ),
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        return_object=return_object,
    )

315 

316 

317@typechecked 

318def check_schemas_match( 

319 method: str = "by_table_and_table", 

320 left_table: Optional[psDataFrame] = None, 

321 right_table: Optional[psDataFrame] = None, 

322 left_table_path: Optional[str] = None, 

323 left_table_name: Optional[str] = None, 

324 right_table_path: Optional[str] = None, 

325 right_table_name: Optional[str] = None, 

326 spark_session: Optional[SparkSession] = None, 

327 left_table_format: str = "delta", 

328 right_table_format: str = "delta", 

329 include_change_field: bool = True, 

330 include_add_field: bool = True, 

331 include_remove_field: bool = True, 

332 include_change_nullable: bool = False, 

333 return_object: Literal["results", "check"] = "check", 

334) -> Union[list[tuple[str, dict[str, StructField]]], bool]: 

335 """ 

336 !!! note "Summary" 

337 Check the schemas between two different tables. 

338 

339 ???+ abstract "Details" 

340 This function is heavily inspired by other packages which check and validate schema differences for `pyspark` tables. This function just streamlines it a bit, and adds additional functionality for whether or not table on either `left` or `right` side is already in-memory or sitting on a directory somewhere else. 

341 

342 Params: 

343 method (str, optional): 

344 The method to use for the comparison. That is, is either side a table in memory or is it a `table` sitting on a `path`?. Check the Notes section for all options available for this parameter.<br> 

345 Defaults to `#!py "by_table_and_table"`. 

346 spark_session (Optional[SparkSession], optional): 

347 The `SparkSession` to use if either the `left` or `right` tables are sitting on a `path` somewhere.<br> 

348 Defaults to `#!py None`. 

349 left_table (Optional[psDataFrame], optional): 

350 If `method` defines the `left` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br> 

351 Defaults to `#!py None`. 

352 left_table_path (Optional[str], optional): 

353 If `method` defines the `left` table as a `path`, then this parameter is the actual path location where the table can be found.<br> 

354 Defaults to `#!py None`. 

355 left_table_name (Optional[str], optional): 

356 If `method` defines the `left` table as a `path`, then this parameter is the name of the table found at the given `left_table_path` location.<br> 

357 Defaults to `#!py None`. 

358 left_table_format (str, optional): 

359 If `method` defines the `left` table as a `path`, then this parameter is the format of the table found at the given `left_table_path` location.<br> 

360 Defaults to `#!py "delta"`. 

361 right_table (Optional[psDataFrame], optional): 

362 If `method` defines the `right` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br> 

363 Defaults to `#!py None`. 

364 right_table_path (Optional[str], optional): 

365 If `method` defines the `right` table as a `path`, then this parameter is the actual path location where the table can be found.<br> 

366 Defaults to `#!py None`. 

367 right_table_name (Optional[str], optional): 

368 If `method` defines the `right` table as a `path`, then this parameter is the name of the table found at the given `right_table_path` location.<br> 

369 Defaults to `#!py None`. 

370 right_table_format (str, optional): 

371 If `method` defines the `right` table as a `path`, then this parameter is the format of the table found at the given `right_table_path` location.<br> 

372 Defaults to `#!py "delta"`. 

373 include_change_field (bool, optional): 

374 When doing the schema validations, do you want to include any fields where the data-type on the right-hand side is different from the left-hand side?<br> 

375 This can be read as: "What fields have had their data type _changed **between**_ the left-hand side and the right-hand side?"<br> 

376 Defaults to `#!py True`. 

377 include_add_field (bool, optional): 

378 When doing the schema validations, do you want to include any fields that have had any additional fields added to the left-hand side, when compared to the right-hand side?<br> 

379 This can be read as: "What fields have been _added **to**_ the left-hand side?"<br> 

380 Defaults to `#!py True`. 

381 include_remove_field (bool, optional): 

382 When doing the schema validations, do you want to include any fields which are missing from the left-hand side and only existing on the right-hand side?<br> 

383 This can be read as: "What fields been _removed **from**_ the left-hand side?"<br> 

384 Defaults to `#!py True`. 

385 include_change_nullable (bool, optional): 

386 When doing the schema validations, do you want to include any fields which have had their nullability metadata changed on the right-hand side, when compared to the left-hand side?.<br> 

387 This can be read as: "What fields had their nullability _changed **between**_ the left-hand side and the right-hand side?"<br> 

388 Defaults to `#!py False`. 

389 return_object (Literal["results", "check"], optional): 

390 After having checked the schema, how do you want the results to be returned? If `#!py "check"`, then will only return a `#!py bool` value: `#!py True` if the schemas actually match, `#!py False` if there are any differences. If `#!py "results"`, then the actual schema differences will be returned. Check the Notes section for more information on the structure of this object.<br> 

391 Defaults to `#!py "check"`. 

392 

393 Raises: 

394 TypeError: 

395 If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator. 

396 AttributeError: 

397 If the value passed to `method` is not a valid option. 

398 

399 Returns: 

400 (Union[list[tuple[str, dict[str, StructField]]], bool]): 

401 If `return_object` is `#!py "results"`, then this will be a `#!py list` of `#!py tuple`'s of `#!py dict`'s containing the details of the schema differences. If `return_object` is `#!py "check"`, then it will only be a `#!py bool` object about whether the schemas match or not. 

402 

403 ???+ example "Examples" 

404 

405 ```{.py .python linenums="1" title="Set up"} 

406 >>> # Imports 

407 >>> from pprint import pprint 

408 >>> import pandas as pd 

409 >>> from pyspark.sql import SparkSession, functions as F 

410 >>> from toolbox_pyspark.schema import check_schemas_match 

411 >>> from toolbox_pyspark.io import write_to_path 

412 >>> from toolbox_pyspark.checks import table_exists 

413 >>> 

414 >>> # Instantiate Spark 

415 >>> spark = SparkSession.builder.getOrCreate() 

416 >>> 

417 >>> # Create data 

418 >>> df1 = spark.createDataFrame( 

419 ... pd.DataFrame( 

420 ... { 

421 ... "a": [0, 1, 2, 3], 

422 ... "b": ["a", "b", "c", "d"], 

423 ... "c": ["1", "1", "1", "1"], 

424 ... "d": ["2", "2", "2", "2"], 

425 ... "e": ["3", "3", "3", "3"], 

426 ... "f": ["4", "4", "4", "4"], 

427 ... } 

428 ... ) 

429 ... ) 

430 >>> df2 = ( 

431 ... df1.withColumn("c", F.col("c").cast("int")) 

432 ... .withColumn("g", F.lit("a")) 

433 ... .withColumn("d", F.lit("null")) 

434 ... .drop("e") 

435 ... ) 

436 >>> write_to_path( 

437 ... table=df1, 

438 ... name="left", 

439 ... path="./test", 

440 ... data_format="parquet", 

441 ... mode="overwrite", 

442 ... write_options={"overwriteSchema": "true"}, 

443 ... ) 

444 >>> write_to_path( 

445 ... table=df2, 

446 ... name="right", 

447 ... path="./test", 

448 ... data_format="parquet", 

449 ... mode="overwrite", 

450 ... write_options={"overwriteSchema": "true"}, 

451 ... ) 

452 >>> 

453 >>> # Check 

454 >>> pprint(df1.dtypes) 

455 >>> print(df1.show()) 

456 >>> print(table_exists("left", "./test", "parquet", spark)) 

457 >>> pprint(df2.dtypes) 

458 >>> print(df2.show()) 

459 >>> print(table_exists("right", "./test", "parquet", spark)) 

460 ``` 

461 <div class="result" markdown> 

462 ```{.sh .shell title="Terminal"} 

463 [ 

464 ("a", "bigint"), 

465 ("b", "string"), 

466 ("c", "string"), 

467 ("d", "string"), 

468 ("e", "string"), 

469 ("f", "string"), 

470 ] 

471 ``` 

472 ```{.txt .text title="Terminal"} 

473 +---+---+---+---+---+---+ 

474 | a | b | c | d | e | f | 

475 +---+---+---+---+---+---+ 

476 | 0 | a | 1 | 2 | 3 | 4 | 

477 | 1 | b | 1 | 2 | 3 | 4 | 

478 | 2 | c | 1 | 2 | 3 | 4 | 

479 | 3 | d | 1 | 2 | 3 | 4 | 

480 +---+---+---+---+---+---+ 

481 ``` 

482 ```{.sh .shell title="Terminal"} 

483 True 

484 ``` 

485 ```{.sh .shell title="Terminal"} 

486 [ 

487 ("a", "bigint"), 

488 ("b", "string"), 

489 ("c", "int"), 

490 ("d", "string"), 

491 ("f", "string"), 

492 ("g", "string"), 

493 ] 

494 ``` 

495 ```{.txt .text title="Terminal"} 

496 +---+---+---+------+---+---+ 

497 | a | b | c | d | f | g | 

498 +---+---+---+------+---+---+ 

499 | 0 | a | 1 | null | 4 | 2 | 

500 | 1 | b | 1 | null | 4 | 2 | 

501 | 2 | c | 1 | null | 4 | 2 | 

502 | 3 | d | 1 | null | 4 | 2 | 

503 +---+---+---+------+---+---+ 

504 ``` 

505 ```{.sh .shell title="Terminal"} 

506 True 

507 ``` 

508 </div> 

509 

510 ```{.py .python linenums="1" title="Example 1: Check matching"} 

511 >>> diff = check_schemas_match( 

512 ... method="table_table", 

513 ... left_table=df1, 

514 ... right_table=df1, 

515 ... include_add_field=True, 

516 ... include_change_field=True, 

517 ... include_remove_field=True, 

518 ... include_change_nullable=True, 

519 ... return_object="check", 

520 ... ) 

521 >>> print(diff) 

522 ``` 

523 <div class="result" markdown> 

524 ```{.sh .shell title="Terminal"} 

525 True 

526 ``` 

527 !!! success "Conclusion: Schemas match." 

528 </div> 

529 

530 ```{.py .python linenums="1" title="Example 2: Check not matching"} 

531 >>> diff = check_schemas_match( 

532 ... method="table_table", 

533 ... left_table=df1, 

534 ... right_table=df2, 

535 ... include_add_field=True, 

536 ... include_change_field=True, 

537 ... include_remove_field=True, 

538 ... include_change_nullable=True, 

539 ... return_object="check", 

540 ... ) 

541 >>> print(diff) 

542 ``` 

543 <div class="result" markdown> 

544 ```{.sh .shell title="Terminal"} 

545 False 

546 ``` 

547 !!! failure "Conclusion: Schemas do not match." 

548 </div> 

549 

550 ```{.py .python linenums="1" title="Example 3: Show only `add`"} 

551 >>> diff = check_schemas_match( 

552 ... method="table_table", 

553 ... left_table=df1, 

554 ... right_table=df2, 

555 ... include_add_field=True, 

556 ... include_change_field=False, 

557 ... include_remove_field=False, 

558 ... include_change_nullable=False, 

559 ... return_object="results", 

560 ... ) 

561 >>> print(diff) 

562 ``` 

563 <div class="result" markdown> 

564 ```{.sh .shell title="Terminal"} 

565 [ 

566 ( 

567 "add", 

568 {"left": T.StructField("e", T.StringType(), False)}, 

569 ), 

570 ] 

571 ``` 

572 !!! failure "Conclusion: Schemas do not match because the `e` field was added." 

573 </div> 

574 

575 ```{.py .python linenums="1" title="Example 4: Show `add` and `remove`"} 

576 >>> diff = check_schemas_match( 

577 ... method="table_table", 

578 ... left_table=df1, 

579 ... right_table=df2, 

580 ... include_add_field=True, 

581 ... include_change_field=False, 

582 ... include_remove_field=True, 

583 ... include_change_nullable=False, 

584 ... return_object="results", 

585 ... ) 

586 >>> print(diff) 

587 ``` 

588 <div class="result" markdown> 

589 ```{.sh .shell title="Terminal"} 

590 [ 

591 ( 

592 "add", 

593 {"left": T.StructField("e", T.StringType(), False)}, 

594 ), 

595 ( 

596 "remove", 

597 {"right": T.StructField("g", T.StringType(), False)}, 

598 ), 

599 ] 

600 ``` 

601 !!! failure "Conclusion: Schemas do not match because the `e` field was added and the `g` field was removed." 

602 </div> 

603 

604 ```{.py .python linenums="1" title="Example 5: Show all changes"} 

605 >>> diff = check_schemas_match( 

606 ... method="table_table", 

607 ... left_table=df1, 

608 ... right_table=df2, 

609 ... include_add_field=True, 

610 ... include_change_field=True, 

611 ... include_remove_field=True, 

612 ... include_change_nullable=True, 

613 ... return_object="results", 

614 ... ) 

615 >>> print(diff) 

616 ``` 

617 <div class="result" markdown> 

618 ```{.sh .shell title="Terminal"} 

619 [ 

620 ( 

621 "add", 

622 {"left": T.StructField("e", T.StringType(), False)}, 

623 ), 

624 ( 

625 "remove", 

626 {"right": T.StructField("g", T.StringType(), False)}, 

627 ), 

628 ( 

629 "change_type", 

630 { 

631 "left": T.StructField("c", T.StringType(), False), 

632 "right": T.StructField("c", T.IntegerType(), True), 

633 }, 

634 ), 

635 ( 

636 "change_nullable", 

637 { 

638 "left": T.StructField("c", T.StringType(), False), 

639 "right": T.StructField("c", T.IntegerType(), True), 

640 }, 

641 ), 

642 ] 

643 ``` 

644 !!! failure "Conclusion: Schemas do not match because the `e` field was added, the `g` field was removed, the `c` field had its data type changed, and the `c` field had its nullability changed." 

645 </div> 

646 

647 ```{.py .python linenums="1" title="Example 6: Check where right-hand side is a `path`"} 

648 >>> diff = check_schemas_match( 

649 ... method="path_table", 

650 ... spark_session=spark, 

651 ... left_table=df1, 

652 ... right_table_path="./test", 

653 ... right_table_name="right", 

654 ... right_table_format="parquet", 

655 ... include_add_field=True, 

656 ... include_change_field=False, 

657 ... include_remove_field=False, 

658 ... include_change_nullable=False, 

659 ... return_object="results", 

660 ... ) 

661 >>> print(diff) 

662 ``` 

663 <div class="result" markdown> 

664 ```{.sh .shell title="Terminal"} 

665 [ 

666 ( 

667 "add", 

668 {"left": T.StructField("e", T.StringType(), False)}, 

669 ), 

670 ] 

671 ``` 

672 !!! failure "Conclusion: Schemas do not match because the `e` field was added." 

673 </div> 

674 

675 ```{.py .python linenums="1" title="Example 7: Check where both sides are a `path`"} 

676 >>> diff = check_schemas_match( 

677 ... method="path_path", 

678 ... spark_session=spark, 

679 ... left_table_path="./test", 

680 ... left_table_name="left", 

681 ... left_table_format="parquet", 

682 ... right_table_path="./test", 

683 ... right_table_name="right", 

684 ... right_table_format="parquet", 

685 ... include_add_field=False, 

686 ... include_change_field=True, 

687 ... include_remove_field=False, 

688 ... include_change_nullable=False, 

689 ... return_object="results", 

690 ... ) 

691 >>> print(diff) 

692 ``` 

693 <div class="result" markdown> 

694 ```{.sh .shell title="Terminal"} 

695 [ 

696 ( 

697 "remove", 

698 {"right": T.StructField("g", T.StringType(), True)}, 

699 ), 

700 ] 

701 ``` 

702 !!! failure "Conclusion: Schemas do not match because the `g` field was removed." 

703 </div> 

704 

705 ```{.py .python linenums="1" title="Example 8: Invalid `method` parameter"} 

706 >>> diff = check_schemas_match( 

707 ... method="invalid", 

708 ... left_table=df1, 

709 ... right_table=df2, 

710 ... include_add_field=True, 

711 ... include_change_field=True, 

712 ... include_remove_field=True, 

713 ... include_change_nullable=True, 

714 ... return_object="check", 

715 ... ) 

716 ``` 

717 <div class="result" markdown> 

718 ```{.py .python .title="Terminal"} 

719 AttributeError: Invalid value for `method`: 'invalid' 

720 Please use one of the following options: 

721 - For `by_table_and_table`, use one of the following values: ['table', 'table_table', 'tables', 'by_table', 'by_table_and_table', 'table_and_table'] 

722 - For `by_table_and_path`, use one of the following values: ['table_and_path', 'table_path', 'by_table_and_path'] 

723 - For `by_path_and_table`, use one of the following values: ['path_and_table', 'path_table', 'by_path_and_table'] 

724 - For `by_path_and_path`, use one of the following values: ['path_and_path', 'path_path', 'by_path_and_path', 'path', 'paths'] 

725 ``` 

726 !!! failure "Conclusion: Invalid `method` parameter." 

727 </div> 

728 

729 ???+ info "Notes" 

730 

731 ???+ info "Options available in the `method` parameter" 

732 

733 The options available in the `method` parameter include: 

734 

735 - If the objects on both the left-hand side and the right-hand side are both `dataframes` already loaded to memory, use one of the following values: 

736 <div class="mdx-three-columns" markdown> 

737 - `#!py "table"` 

738 - `#!py "table_table"` 

739 - `#!py "tables"` 

740 - `#!py "by_table"` 

741 - `#!py "by_table_and_table"` 

742 - `#!py "table_and_table"` 

743 </div> 

744 - If the object on the left-hand side is a `dataframe` already loaded to memory, but the object on the right-hand side is a table sitting on a path somewhere, use one of the following values: 

745 <div class="mdx-three-columns" markdown> 

746 - `#!py "table_and_path"` 

747 - `#!py "table_path"` 

748 - `#!py "by_table_and_path"` 

749 </div> 

750 - If the object on the left-hand side is a table sitting on a path somewhere, but the object on the right-hand side is a `dataframe` already loaded to memory, use one of the following values: 

751 <div class="mdx-three-columns" markdown> 

752 - `#!py "path_and_table"` 

753 - `#!py "path_table"` 

754 - `#!py "by_path_and_table"` 

755 </div> 

756 - If the objects on both the left-hand side and the right-hand side are both tables sitting on a path somewhere, then use one of the following values: 

757 <div class="mdx-three-columns" markdown> 

758 - `#!py "path_and_path"` 

759 - `#!py "path_path"` 

760 - `#!py "by_path_and_path"` 

761 - `#!py "path"` 

762 - `#!py "paths"` 

763 </div> 

764 

765 ???+ info "Details about the return object when we set the parameter `#!py return_object="results"`" 

766 

767 - When we set the parameter `#!py return_object="results"`, then we will get an object returned from this function. 

768 - That object will be a `#!py list` of `#!py tuple`'s, each `#!py tuple` is only two-elements long, where the first element is a `#!py str` object, and the second is a `#!py dict` where the keys are `#!py str` and the values are a `#!py StructField` object. 

769 - For each of the `#!py tuple` elements, the first element (the `#!py str` object) describes what the `#!py tuple` is there for. It will be one of four words: `#!py "add"`, `#!py "remove"`, `#!py "change_type"`, or `#!py "change_nullable"`. 

770 - You can change whether these options are included in the schema check by changing the other parameters: `#!py include_change_field`, `#!py include_add_field`, `#!py include_remove_field`, `#!py include_change_nullable`. 

771 - The structure of the list will look like this: 

772 

773 ```{.py .python .title="The structure of the returned object"} 

774 [ 

775 ( 

776 "add", # (1)! 

777 {"left": T.StructField("e", T.StringType(), False)}, # (2)! 

778 ), 

779 ( 

780 "add", # (3)! 

781 {"left": T.StructField("h", T.StringType(), False)}, 

782 ), 

783 ( 

784 "remove", # (4)! 

785 {"right": T.StructField("g", T.StringType(), False)}, # (5)! 

786 ), 

787 ( 

788 "change_type", # (6)! 

789 { 

790 "left": T.StructField("c", T.StringType(), False), # (7)! 

791 "right": T.StructField("c", T.IntegerType(), True), 

792 }, 

793 ), 

794 ( 

795 "change_nullable", # (8)! 

796 { 

797 "left": T.StructField("c", T.StringType(), False), # (9)! 

798 "right": T.StructField("c", T.IntegerType(), True), 

799 }, 

800 ), 

801 ] 

802 ``` 

803 

804 1. When `#!py include_add_field=True`, then the `add` section will always appear first.<br> 

805 If `#!py include_add_field=False`, then this section is omitted. 

806 2. The second element of the `#!py tuple` is a `#!py dict` that has only one `key`-`value` pair.<br> 

807 The `key` will _always_ be the value `#!py "left"`, because these are fields which have been added to the table on the left-hand side and not found on the right-hand side. 

808 3. When there are multiple fields added to the table on the left-hand side, they will appear like this. 

809 4. When `#!py include_remove_field=True`, then the `remove` section will always appear next.<br> 

810 If `#!py include_remove_field=False`, then this section is omitted. 

811 5. The second element of the `#!py tuple` is a `#!py dict` that has only one `key`-`value` pair.<br> 

812 The `key` will _always_ be the value `#!py "right"`, because these are fields which have been removed from the left-hand side and only visible on the right-hand side. 

813 6. When `#!py include_change_field=True`, then the `change_type` section will always appear next.<br> 

814 If `#!py include_change_field=False`, then this section is omitted. 

815 7. The second element of the `#!py tuple` is a `#!py dict` that has two `key`-`value` pairs.<br> 

816 The `key`'s will _always_ be the values `#!py "left"` then `#!py "right"`, because these are fields where the data type has changed between the left-hand side and the right-hand side, and therefore you need to see both to see exactly what has changed. 

817 8. When `#!py include_change_nullable=True`, then the `change_nullable` section will always appear next.<br> 

818 If `#!py include_change_nullable=False`, then this section is omitted. 

819 9. The second element of the `#!py tuple` is a `#!py dict` that has two `key`-`value` pairs.<br> 

820 The `key`'s will _always_ be the values `#!py "left"` then `#!py "right"`, because these are fields where the nullability has changed between the left-hand side and the right-hand side, and therefore you need to see both to see exactly what has changed. 

821 """ 

822 

823 valid_methods = ValidMethods() 

824 msg: str = "If using the '{meth}' method, then '{name}' cannot be 'None'." 

825 

826 if method in valid_methods.by_table_and_table: 

827 assert left_table is not None, msg.format(meth=method, name="left_table") 

828 assert right_table is not None, msg.format(meth=method, name="right_table") 

829 return _check_schemas_match_by_table_and_table( 

830 left_table=left_table, 

831 right_table=right_table, 

832 include_change_field=include_change_field, 

833 include_add_field=include_add_field, 

834 include_remove_field=include_remove_field, 

835 include_change_nullable=include_change_nullable, 

836 return_object=return_object, 

837 ) 

838 elif method in valid_methods.by_table_and_path: 

839 assert left_table is not None, msg.format(meth=method, name="left_table") 

840 assert right_table_path is not None, msg.format(meth=method, name="right_table_path") 

841 assert right_table_name is not None, msg.format(meth=method, name="right_table_name") 

842 assert spark_session is not None, msg.format(meth=method, name="spark_session") 

843 return _check_schemas_match_by_table_and_path( 

844 left_table=left_table, 

845 right_table_path=right_table_path, 

846 right_table_name=right_table_name, 

847 right_table_format=right_table_format, 

848 spark_session=spark_session, 

849 include_change_field=include_change_field, 

850 include_add_field=include_add_field, 

851 include_remove_field=include_remove_field, 

852 include_change_nullable=include_change_nullable, 

853 return_object=return_object, 

854 ) 

855 elif method in valid_methods.by_path_and_table: 

856 assert left_table_path is not None, msg.format(meth=method, name="left_table_path") 

857 assert left_table_name is not None, msg.format(meth=method, name="left_table_name") 

858 assert right_table is not None, msg.format(meth=method, name="right_table") 

859 assert spark_session is not None, msg.format(meth=method, name="spark_session") 

860 return _check_schemas_match_by_path_and_table( 

861 left_table_path=left_table_path, 

862 left_table_name=left_table_name, 

863 right_table=right_table, 

864 spark_session=spark_session, 

865 left_table_format=left_table_format, 

866 include_change_field=include_change_field, 

867 include_add_field=include_add_field, 

868 include_remove_field=include_remove_field, 

869 include_change_nullable=include_change_nullable, 

870 return_object=return_object, 

871 ) 

872 elif method in valid_methods.by_path_and_path: 

873 assert left_table_path is not None, msg.format(meth=method, name="left_table_path") 

874 assert left_table_name is not None, msg.format(meth=method, name="left_table_name") 

875 assert right_table_path is not None, msg.format(meth=method, name="right_table_path") 

876 assert right_table_name is not None, msg.format(meth=method, name="right_table_name") 

877 assert spark_session is not None, msg.format(meth=method, name="spark_session") 

878 return _check_schemas_match_by_path_and_path( 

879 left_table_path=left_table_path, 

880 left_table_name=left_table_name, 

881 left_table_format=left_table_format, 

882 right_table_path=right_table_path, 

883 right_table_name=right_table_name, 

884 right_table_format=right_table_format, 

885 spark_session=spark_session, 

886 include_change_field=include_change_field, 

887 include_add_field=include_add_field, 

888 include_remove_field=include_remove_field, 

889 include_change_nullable=include_change_nullable, 

890 return_object=return_object, 

891 ) 

892 else: 

893 raise AttributeError( 

894 f"Invalid value for `method`: '{method}'\n" 

895 f"Please use one of the following options:\n" 

896 f"- For `by_table_and_table`, use one of: {valid_methods.by_table_and_table}\n" 

897 f"- For `by_table_and_path`, use one of: {valid_methods.by_table_and_path}\n" 

898 f"- For `by_path_and_table`, use one of: {valid_methods.by_path_and_table}\n" 

899 f"- For `by_path_and_path`, use one of: {valid_methods.by_path_and_path}\n" 

900 ) 

901 

902 

903# ---------------------------------------------------------------------------- # 

904# View Differences #### 

905# ---------------------------------------------------------------------------- # 

906 

907 

@typechecked
def _view_schema_differences_by_table_and_table(
    left_table: psDataFrame,
    right_table: psDataFrame,
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        Compare the schemas of two in-memory `dataframe`s, then either display the differences to the terminal or return them, depending on `view_type`.
    """
    differences: Union[list[tuple[str, dict[str, StructField]]], bool] = check_schemas_match(
        method="table_table",
        left_table=left_table,
        right_table=right_table,
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        return_object="results",
    )
    # Nothing to show when the schemas match: the result is either a `bool`
    # or an empty list of differences.
    if not is_type(differences, list) or len(differences) == 0:
        return None
    if view_type == "return":
        return differences
    # Otherwise display to the terminal, either plain or pretty-printed.
    displayer = print if view_type == "print" else pprint
    displayer(differences)
    return None

938 

939 

@typechecked
def _view_schema_differences_by_path_and_path(
    left_table_path: str,
    left_table_name: str,
    right_table_path: str,
    right_table_name: str,
    spark_session: SparkSession,
    left_table_format: str = "delta",
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        Load both the `left` and `right` tables from disk, then delegate the comparison to the table-and-table implementation.
    """
    # Read each side into memory first; the actual comparison logic lives in
    # `_view_schema_differences_by_table_and_table()`.
    loaded: dict[str, psDataFrame] = {
        "left": read_from_path(
            name=left_table_name,
            path=left_table_path,
            spark_session=spark_session,
            data_format=left_table_format,
        ),
        "right": read_from_path(
            name=right_table_name,
            path=right_table_path,
            spark_session=spark_session,
            data_format=right_table_format,
        ),
    }
    return _view_schema_differences_by_table_and_table(
        left_table=loaded["left"],
        right_table=loaded["right"],
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        view_type=view_type,
    )

976 

977 

@typechecked
def _view_schema_differences_by_table_and_path(
    left_table: psDataFrame,
    right_table_path: str,
    right_table_name: str,
    spark_session: SparkSession,
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        Load the `right` table from disk, then delegate the comparison to the table-and-table implementation.
    """
    # Only the right-hand side needs to be materialised from disk here.
    loaded_right: psDataFrame = read_from_path(
        name=right_table_name,
        path=right_table_path,
        spark_session=spark_session,
        data_format=right_table_format,
    )
    return _view_schema_differences_by_table_and_table(
        left_table=left_table,
        right_table=loaded_right,
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        view_type=view_type,
    )

1006 

1007 

@typechecked
def _view_schema_differences_by_path_and_table(
    left_table_path: str,
    left_table_name: str,
    right_table: psDataFrame,
    spark_session: SparkSession,
    left_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        Load the `left` table from disk, then delegate the comparison to the table-and-table implementation.
    """
    # Only the left-hand side needs to be materialised from disk here.
    loaded_left: psDataFrame = read_from_path(
        name=left_table_name,
        path=left_table_path,
        spark_session=spark_session,
        data_format=left_table_format,
    )
    return _view_schema_differences_by_table_and_table(
        left_table=loaded_left,
        right_table=right_table,
        include_change_field=include_change_field,
        include_add_field=include_add_field,
        include_remove_field=include_remove_field,
        include_change_nullable=include_change_nullable,
        view_type=view_type,
    )

1036 

1037 

@typechecked
def view_schema_differences(
    method: str = "by_table_and_table",
    spark_session: Optional[SparkSession] = None,
    left_table: Optional[psDataFrame] = None,
    left_table_path: Optional[str] = None,
    left_table_name: Optional[str] = None,
    left_table_format: str = "delta",
    right_table: Optional[psDataFrame] = None,
    right_table_path: Optional[str] = None,
    right_table_name: Optional[str] = None,
    right_table_format: str = "delta",
    include_change_field: bool = True,
    include_add_field: bool = True,
    include_remove_field: bool = True,
    include_change_nullable: bool = False,
    view_type: Literal["print", "pprint", "return"] = "pprint",
) -> Optional[Union[list[tuple[str, dict[str, StructField]]], bool]]:
    """
    !!! note "Summary"
        View the schemas between two different tables.

    ???+ abstract "Details"
        The primary differences between [`check_schemas_match()`][toolbox_pyspark.schema.check_schemas_match] and [`view_schema_differences()`][toolbox_pyspark.schema.view_schema_differences] is that [`check_...()`][toolbox_pyspark.schema.check_schemas_match] returns either a `#!py bool` result, or the actual details of the schema differences, whilst [`view_...()`][toolbox_pyspark.schema.view_schema_differences] may also return the actual details object, but it will also print the result to the terminal for you to review.<br>
        For full details of all the parameters and all the options, including nuances and detailed explanations and thorough examples, please check the [`check_schemas_match()`][toolbox_pyspark.schema.check_schemas_match] function.

    Params:
        method (str, optional):
            The method to use for the comparison. That is, is either side a table in memory or is it a `table` sitting on a `path`?. Check the Notes section for all options available for this parameter.<br>
            Defaults to `#!py "by_table_and_table"`.
        spark_session (Optional[SparkSession], optional):
            The `SparkSession` to use if either the `left` or `right` tables are sitting on a `path` somewhere.<br>
            Defaults to `#!py None`.
        left_table (Optional[psDataFrame], optional):
            If `method` defines the `left` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br>
            Defaults to `#!py None`.
        left_table_path (Optional[str], optional):
            If `method` defines the `left` table as a `path`, then this parameter is the actual path location where the table can be found.<br>
            Defaults to `#!py None`.
        left_table_name (Optional[str], optional):
            If `method` defines the `left` table as a `path`, then this parameter is the name of the table found at the given `left_table_path` location.<br>
            Defaults to `#!py None`.
        left_table_format (str, optional):
            If `method` defines the `left` table as a `path`, then this parameter is the format of the table found at the given `left_table_path` location.<br>
            Defaults to `#!py "delta"`.
        right_table (Optional[psDataFrame], optional):
            If `method` defines the `right` table as a `table`, then this parameter is the actual `dataframe` to do the checking against.<br>
            Defaults to `#!py None`.
        right_table_path (Optional[str], optional):
            If `method` defines the `right` table as a `path`, then this parameter is the actual path location where the table can be found.<br>
            Defaults to `#!py None`.
        right_table_name (Optional[str], optional):
            If `method` defines the `right` table as a `path`, then this parameter is the name of the table found at the given `right_table_path` location.<br>
            Defaults to `#!py None`.
        right_table_format (str, optional):
            If `method` defines the `right` table as a `path`, then this parameter is the format of the table found at the given `right_table_path` location.<br>
            Defaults to `#!py "delta"`.
        include_change_field (bool, optional):
            When doing the schema validations, do you want to include any fields where the data-type on the right-hand side is different from the left-hand side?<br>
            This can be read as: "What fields have had their data type _changed **between**_ the left-hand side and the right-hand side?"<br>
            Defaults to `#!py True`.
        include_add_field (bool, optional):
            When doing the schema validations, do you want to include any fields that have had any additional fields added to the left-hand side, when compared to the right-hand side?<br>
            This can be read as: "What fields have been _added **to**_ the left-hand side?"<br>
            Defaults to `#!py True`.
        include_remove_field (bool, optional):
            When doing the schema validations, do you want to include any fields which are missing from the left-hand side and only existing on the right-hand side?<br>
            This can be read as: "What fields been _removed **from**_ the left-hand side?"<br>
            Defaults to `#!py True`.
        include_change_nullable (bool, optional):
            When doing the schema validations, do you want to include any fields which have had their nullability metadata changed on the right-hand side, when compared to the left-hand side?.<br>
            This can be read as: "What fields had their nullability _changed **between**_ the left-hand side and the right-hand side?"<br>
            Defaults to `#!py False`.
        view_type (Literal["print", "pprint", "return"], optional):
            When returning the output from this function, how do you want it to be displayed? Must be one of `#!py ["print", "pprint", "return"]`.<br>
            Logically, the difference is that `#!py "print"` will display a text value to the terminal that is not formatted in any way; `#!py "pprint"` will display a pretty-printed text value to the terminal; and `#!py "return"` will return the schema differences which can then be assigned to another variable.<br>
            Defaults to `#!py "pprint"`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        AttributeError:
            If the value parsed to `method` is not a valid option.

    Returns:
        (Optional[list[tuple[str, dict[str, StructField]]]]):
            If `#!py view_type="return"`, then this will be a `#!py list` of `#!py tuple`'s of `#!py dict`'s containing the details of the schema differences. If `#!py view_type!="return"` (or if `#!py view_type="return"`, but there are actually no differences in the schema), then nothing is returned; only printed to terminal.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> from pprint import pprint
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.schema import view_schema_differences
        >>> from toolbox_pyspark.io import write_to_path
        >>> from toolbox_pyspark.checks import table_exists
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df1 = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [0, 1, 2, 3],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": ["1", "1", "1", "1"],
        ...             "d": ["2", "2", "2", "2"],
        ...             "e": ["3", "3", "3", "3"],
        ...             "f": ["4", "4", "4", "4"],
        ...         }
        ...     )
        ... )
        >>> df2 = (
        ...     df1.withColumn("c", F.col("c").cast("int"))
        ...     .withColumn("g", F.lit("a"))
        ...     .withColumn("d", F.lit("null"))
        ...     .drop("e")
        ... )
        >>> write_to_path(
        ...     table=df1,
        ...     name="left",
        ...     path="./test",
        ...     data_format="parquet",
        ...     mode="overwrite",
        ...     write_options={"overwriteSchema": "true"},
        ... )
        >>> write_to_path(
        ...     table=df2,
        ...     name="right",
        ...     path="./test",
        ...     data_format="parquet",
        ...     mode="overwrite",
        ...     write_options={"overwriteSchema": "true"},
        ... )
        >>>
        >>> # Check
        >>> pprint(df1.dtypes)
        >>> print(df1.show())
        >>> print(table_exists("left", "./test", "parquet", spark))
        >>> pprint(df2.dtypes)
        >>> print(df2.show())
        >>> print(table_exists("right", "./test", "parquet", spark))
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [
            ("a", "bigint"),
            ("b", "string"),
            ("c", "string"),
            ("d", "string"),
            ("e", "string"),
            ("f", "string"),
        ]
        ```
        ```{.txt .text title="Terminal"}
        +---+---+---+---+---+---+
        | a | b | c | d | e | f |
        +---+---+---+---+---+---+
        | 0 | a | 1 | 2 | 3 | 4 |
        | 1 | b | 1 | 2 | 3 | 4 |
        | 2 | c | 1 | 2 | 3 | 4 |
        | 3 | d | 1 | 2 | 3 | 4 |
        +---+---+---+---+---+---+
        ```
        ```{.sh .shell title="Terminal"}
        True
        ```
        ```{.sh .shell title="Terminal"}
        [
            ("a", "bigint"),
            ("b", "string"),
            ("c", "int"),
            ("d", "string"),
            ("f", "string"),
            ("g", "string"),
        ]
        ```
        ```{.txt .text title="Terminal"}
        +---+---+---+------+---+---+
        | a | b | c | d    | f | g |
        +---+---+---+------+---+---+
        | 0 | a | 1 | null | 4 | 2 |
        | 1 | b | 1 | null | 4 | 2 |
        | 2 | c | 1 | null | 4 | 2 |
        | 3 | d | 1 | null | 4 | 2 |
        +---+---+---+------+---+---+
        ```
        ```{.sh .shell title="Terminal"}
        True
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Check matching"}
        >>> diff = view_schema_differences(
        ...     method="table_table",
        ...     left_table=df1,
        ...     right_table=df1,
        ...     include_add_field=True,
        ...     include_change_field=True,
        ...     include_remove_field=True,
        ...     include_change_nullable=True,
        ...     view_type="return",
        ... )
        >>> print(diff)
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        None
        ```
        !!! success "Conclusion: Schemas match."
        </div>

        ```{.py .python linenums="1" title="Example 2: Check print"}
        >>> view_schema_differences(
        ...     method="table_table",
        ...     left_table=df1,
        ...     right_table=df2,
        ...     include_add_field=True,
        ...     include_change_field=False,
        ...     include_remove_field=False,
        ...     include_change_nullable=False,
        ...     view_type="print",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [('add', {'left': StructField('e', StringType(), True)})]
        ```
        !!! failure "Conclusion: Schemas do not match because the `e` field was added."
        </div>

        ```{.py .python linenums="1" title="Example 3: Check pprint"}
        >>> view_schema_differences(
        ...     method="table_table",
        ...     left_table=df1,
        ...     right_table=df2,
        ...     include_add_field=True,
        ...     include_change_field=True,
        ...     include_remove_field=True,
        ...     include_change_nullable=True,
        ...     view_type="pprint",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [('add', {'left': StructField('e', StringType(), False)}),
         ('remove', {'right': StructField('g', StringType(), False)}),
         ('change_type',
          {'left': StructField('c', StringType(), False),
           'right': StructField('c', IntegerType(), True)}),
         ('change_nullable',
          {'left': StructField('c', StringType(), False),
           'right': StructField('c', IntegerType(), True)})]
        ```
        !!! failure "Conclusion: Schemas do not match because the `e` field was added, the `g` field was removed, the `c` field had its data type changed, and the `c` field had its nullability changed."
        </div>

        ```{.py .python linenums="1" title="Example 4: Check with right-hand side as a `path`"}
        >>> view_schema_differences(
        ...     method="table_and_path",
        ...     spark_session=spark,
        ...     left_table=df1,
        ...     right_table_path="./test",
        ...     right_table_name="right",
        ...     right_table_format="parquet",
        ...     include_add_field=True,
        ...     include_change_field=False,
        ...     include_remove_field=False,
        ...     include_change_nullable=False,
        ...     view_type="pprint",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [('add', {'left': StructField('e', StringType(), True)})]
        ```
        !!! failure "Conclusion: Schemas do not match because the `e` field was added."
        </div>

        ```{.py .python linenums="1" title="Example 5: Check with both sides being a `path`"}
        >>> view_schema_differences(
        ...     method="path_path",
        ...     spark_session=spark,
        ...     left_table_path="./test",
        ...     left_table_name="left",
        ...     left_table_format="parquet",
        ...     right_table_path="./test",
        ...     right_table_name="right",
        ...     right_table_format="parquet",
        ...     include_add_field=False,
        ...     include_change_field=False,
        ...     include_remove_field=True,
        ...     include_change_nullable=False,
        ...     view_type="pprint",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [('remove', {'right': StructField('g', StringType(), True)})]
        ```
        !!! failure "Conclusion: Schemas do not match because the `g` field was removed."
        </div>

        ```{.py .python linenums="1" title="Example 6: Invalid `method` parameter"}
        >>> view_schema_differences(
        ...     method="table_table_table",
        ...     left_table=df1,
        ...     right_table=df2,
        ...     include_add_field=True,
        ...     include_change_field=True,
        ...     include_remove_field=True,
        ...     include_change_nullable=True,
        ...     view_type="return",
        ... )
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        AttributeError: Invalid value for `method`: 'table_table_table'
        Please use one of the following options:
        - For `by_table_and_table`, use one of: ['table', 'table_table', 'tables', 'by_table', 'by_table_and_table', 'table_and_table']
        - For `by_table_and_path`, use one of: ['table_and_path', 'table_path', 'by_table_and_path']
        - For `by_path_and_table`, use one of: ['path_and_table', 'path_table', 'by_path_and_table']
        - For `by_path_and_path`, use one of: ['path_and_path', 'path_path', 'by_path_and_path', 'path', 'paths']
        ```
        !!! failure "Conclusion: Invalid `method` parameter."
        </div>

    ??? tip "See Also"
        - [`check_schemas_match()`][toolbox_pyspark.schema.check_schemas_match]
    """

    valid_methods: ValidMethods = ValidMethods()
    msg: str = "If using the '{meth}' method, then '{name}' cannot be 'None'."

    # Dispatch to the correct private implementation, first asserting that the
    # parameters required by the chosen `method` have actually been provided.
    if method in valid_methods.by_table_and_table:
        assert left_table is not None, msg.format(meth=method, name="left_table")
        assert right_table is not None, msg.format(meth=method, name="right_table")
        return _view_schema_differences_by_table_and_table(
            left_table=left_table,
            right_table=right_table,
            include_change_field=include_change_field,
            include_add_field=include_add_field,
            include_remove_field=include_remove_field,
            include_change_nullable=include_change_nullable,
            view_type=view_type,
        )
    elif method in valid_methods.by_table_and_path:
        assert left_table is not None, msg.format(meth=method, name="left_table")
        assert right_table_path is not None, msg.format(meth=method, name="right_table_path")
        assert right_table_name is not None, msg.format(meth=method, name="right_table_name")
        assert spark_session is not None, msg.format(meth=method, name="spark_session")
        return _view_schema_differences_by_table_and_path(
            left_table=left_table,
            right_table_path=right_table_path,
            right_table_name=right_table_name,
            right_table_format=right_table_format,
            spark_session=spark_session,
            include_change_field=include_change_field,
            include_add_field=include_add_field,
            include_remove_field=include_remove_field,
            include_change_nullable=include_change_nullable,
            view_type=view_type,
        )
    elif method in valid_methods.by_path_and_table:
        assert left_table_path is not None, msg.format(meth=method, name="left_table_path")
        assert left_table_name is not None, msg.format(meth=method, name="left_table_name")
        assert right_table is not None, msg.format(meth=method, name="right_table")
        assert spark_session is not None, msg.format(meth=method, name="spark_session")
        return _view_schema_differences_by_path_and_table(
            left_table_path=left_table_path,
            left_table_name=left_table_name,
            left_table_format=left_table_format,
            right_table=right_table,
            spark_session=spark_session,
            include_change_field=include_change_field,
            include_add_field=include_add_field,
            include_remove_field=include_remove_field,
            include_change_nullable=include_change_nullable,
            view_type=view_type,
        )
    elif method in valid_methods.by_path_and_path:
        assert left_table_path is not None, msg.format(meth=method, name="left_table_path")
        assert left_table_name is not None, msg.format(meth=method, name="left_table_name")
        assert right_table_path is not None, msg.format(meth=method, name="right_table_path")
        assert right_table_name is not None, msg.format(meth=method, name="right_table_name")
        assert spark_session is not None, msg.format(meth=method, name="spark_session")
        return _view_schema_differences_by_path_and_path(
            left_table_path=left_table_path,
            left_table_name=left_table_name,
            left_table_format=left_table_format,
            right_table_path=right_table_path,
            right_table_name=right_table_name,
            right_table_format=right_table_format,
            spark_session=spark_session,
            include_change_field=include_change_field,
            include_add_field=include_add_field,
            include_remove_field=include_remove_field,
            include_change_nullable=include_change_nullable,
            view_type=view_type,
        )
    else:
        raise AttributeError(
            f"Invalid value for `method`: '{method}'\n"
            f"Please use one of the following options:\n"
            f"- For `by_table_and_table`, use one of: {valid_methods.by_table_and_table}\n"
            f"- For `by_table_and_path`, use one of: {valid_methods.by_table_and_path}\n"
            f"- For `by_path_and_table`, use one of: {valid_methods.by_path_and_table}\n"
            f"- For `by_path_and_path`, use one of: {valid_methods.by_path_and_path}\n"
        )