
# ============================================================================ #
#                                                                              #
#     Title   : Checks                                                         #
#     Purpose : Check and validate various attributes of a given `pyspark`    #
#               `dataframe`.                                                   #
#                                                                              #
# ============================================================================ #


# ---------------------------------------------------------------------------- #
#                                                                               #
#     Overview                                                              ####
#                                                                               #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#     Description                                                           ####
# ---------------------------------------------------------------------------- #


"""
!!! note "Summary"

    The `checks` module is used to check and validate various attributes of a given `pyspark` dataframe.
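
For example, a minimal sketch of a typical check (assuming an active `#!py SparkSession` named `spark`; the tiny `df` here is purely illustrative):

```{.py .python linenums="1" title="Quick start"}
>>> from toolbox_pyspark.checks import column_exists
>>> df = spark.createDataFrame([(1, "a")], ["id", "val"])
>>> column_exists(df, "id")
True
```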

25""" 


# ---------------------------------------------------------------------------- #
#                                                                               #
#     Setup                                                                 ####
#                                                                               #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#     Imports                                                               ####
# ---------------------------------------------------------------------------- #

# ## Python StdLib Imports ----
from dataclasses import dataclass, fields
from typing import Union
from warnings import warn

# ## Python Third Party Imports ----
from pyspark.sql import (
    DataFrame as psDataFrame,
    SparkSession,
    functions as F,
    types as T,
)
from toolbox_python.checkers import is_type
from toolbox_python.collection_types import str_collection, str_list
from typeguard import typechecked

# ## Local First Party Imports ----
from toolbox_pyspark.constants import ALL_PYSPARK_TYPES, VALID_PYSPARK_TYPE_NAMES
from toolbox_pyspark.io import SPARK_FORMATS, read_from_path
from toolbox_pyspark.utils.exceptions import (
    ColumnDoesNotExistError,
    InvalidPySparkDataTypeError,
    TableDoesNotExistError,
)
from toolbox_pyspark.utils.warnings import (
    ColumnDoesNotExistWarning,
    InvalidPySparkDataTypeWarning,
)


# ---------------------------------------------------------------------------- #
#     Exports                                                               ####
# ---------------------------------------------------------------------------- #


__all__: str_list = [
    "ColumnExistsResult",
    "column_exists",
    "columns_exists",
    "assert_column_exists",
    "assert_columns_exists",
    "warn_column_missing",
    "warn_columns_missing",
    "is_vaid_spark_type",
    "assert_valid_spark_type",
    "ColumnsAreTypeResult",
    "column_is_type",
    "columns_are_type",
    "assert_column_is_type",
    "assert_columns_are_type",
    "warn_column_invalid_type",
    "warn_columns_invalid_type",
    "table_exists",
    "assert_table_exists",
    "column_contains_value",
]


# ---------------------------------------------------------------------------- #
#                                                                               #
#     Functions                                                             ####
#                                                                               #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#     Column Existence                                                      ####
# ---------------------------------------------------------------------------- #


@dataclass
class ColumnExistsResult:
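    """
    !!! note "Summary"
        Result object returned by the column-existence checks: `result` is the overall pass/fail flag, and `missing_cols` lists any requested columns not found on the dataframe.

        The class defines `__iter__`, so it can be unpacked like a tuple. A minimal sketch (the field values here are illustrative):

        ```{.py .python linenums="1" title="Unpacking"}
        >>> exists, missing = ColumnExistsResult(False, ["c"])
        >>> print(missing)
        ['c']
        ```
    """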

    result: bool
    missing_cols: str_list

    def __iter__(self):
        for field in fields(self):
            yield getattr(self, field.name)


@typechecked
def _columns_exists(
    dataframe: psDataFrame,
    columns: str_collection,
    match_case: bool = False,
) -> ColumnExistsResult:
    # Normalise both sides to upper-case when the check is case-insensitive
    cols: str_collection = columns if match_case else [col.upper() for col in columns]
    df_cols: str_list = (
        dataframe.columns if match_case else [df_col.upper() for df_col in dataframe.columns]
    )
    # Report any missing columns using the names exactly as the caller supplied them
    missing_cols: str_list = [
        col for col, check_col in zip(columns, cols) if check_col not in df_cols
    ]
    return ColumnExistsResult(len(missing_cols) == 0, missing_cols)


@typechecked
def column_exists(
    dataframe: psDataFrame,
    column: str,
    match_case: bool = False,
) -> bool:
    """
    !!! note "Summary"
        Check whether a given `#!py column` exists as a valid column within `#!py dataframe.columns`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        column (str):
            The column to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            If `#!py False`, the comparison is done using `#!py column.upper()`.<br>
            Default: `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (bool):
            `#!py True` if exists or `#!py False` otherwise.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import column_exists
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: Column Exists"}
        >>> result = column_exists(df, "a")
        >>> print(result)
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        True
        ```
        !!! success "Conclusion: Column exists."
        </div>

        ```{.py .python linenums="1" title="Example 2: Column Missing"}
        >>> result = column_exists(df, "c")
        >>> print(result)
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        False
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

    ??? tip "See Also"
        - [`column_exists`][toolbox_pyspark.checks.column_exists]
        - [`columns_exists`][toolbox_pyspark.checks.columns_exists]
        - [`assert_column_exists`][toolbox_pyspark.checks.assert_column_exists]
        - [`assert_columns_exists`][toolbox_pyspark.checks.assert_columns_exists]
        - [`warn_column_missing`][toolbox_pyspark.checks.warn_column_missing]
        - [`warn_columns_missing`][toolbox_pyspark.checks.warn_columns_missing]
    """
    return _columns_exists(dataframe, [column], match_case).result


@typechecked
def columns_exists(
    dataframe: psDataFrame,
    columns: str_collection,
    match_case: bool = False,
) -> bool:
    """
    !!! note "Summary"
        Check whether all of the values in `#!py columns` exist in `#!py dataframe.columns`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        columns (Union[str_list, str_tuple, str_set]):
            The columns to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            If `#!py False`, the comparison is done using `#!py [col.upper() for col in columns]`.<br>
            Default: `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (bool):
            `#!py True` if all columns exist or `#!py False` otherwise.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import columns_exists
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: Columns exist"}
        >>> columns_exists(df, ["a", "b"])
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        True
        ```
        !!! success "Conclusion: All columns exist."
        </div>

        ```{.py .python linenums="1" title="Example 2: One column missing"}
        >>> columns_exists(df, ["b", "d"])
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        False
        ```
        !!! failure "Conclusion: One column is missing."
        </div>

        ```{.py .python linenums="1" title="Example 3: All columns missing"}
        >>> columns_exists(df, ["c", "d"])
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        False
        ```
        !!! failure "Conclusion: All columns are missing."
        </div>

    ??? tip "See Also"
        - [`column_exists`][toolbox_pyspark.checks.column_exists]
        - [`columns_exists`][toolbox_pyspark.checks.columns_exists]
        - [`assert_column_exists`][toolbox_pyspark.checks.assert_column_exists]
        - [`assert_columns_exists`][toolbox_pyspark.checks.assert_columns_exists]
        - [`warn_column_missing`][toolbox_pyspark.checks.warn_column_missing]
        - [`warn_columns_missing`][toolbox_pyspark.checks.warn_columns_missing]
    """
    return _columns_exists(dataframe, columns, match_case).result


@typechecked
def assert_column_exists(
    dataframe: psDataFrame,
    column: str,
    match_case: bool = False,
) -> None:
    """
    !!! note "Summary"
        Assert that a given `#!py column` exists as a valid column within `#!py dataframe.columns`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        column (str):
            The column to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            If `#!py False`, the comparison is done using `#!py column.upper()`.<br>
            Default: `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If the `#!py column` does not exist within `#!py dataframe.columns`.

    Returns:
        (type(None)):
            Nothing is returned. Either a `#!py ColumnDoesNotExistError` exception is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import assert_column_exists
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: No error"}
        >>> assert_column_exists(df, "a")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        None
        ```
        !!! success "Conclusion: Column exists."
        </div>

        ```{.py .python linenums="1" title="Example 2: Error raised"}
        >>> assert_column_exists(df, "c")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column 'c' does not exist in 'dataframe'.
        Try one of: ['a', 'b'].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

    ??? tip "See Also"
        - [`column_exists`][toolbox_pyspark.checks.column_exists]
        - [`columns_exists`][toolbox_pyspark.checks.columns_exists]
        - [`assert_column_exists`][toolbox_pyspark.checks.assert_column_exists]
        - [`assert_columns_exists`][toolbox_pyspark.checks.assert_columns_exists]
        - [`warn_column_missing`][toolbox_pyspark.checks.warn_column_missing]
        - [`warn_columns_missing`][toolbox_pyspark.checks.warn_columns_missing]
    """
    if not column_exists(dataframe, column, match_case):
        raise ColumnDoesNotExistError(
            f"Column '{column}' does not exist in 'dataframe'.\n"
            f"Try one of: {dataframe.columns}."
        )


@typechecked
def assert_columns_exists(
    dataframe: psDataFrame,
    columns: Union[str, str_collection],
    match_case: bool = False,
) -> None:
    """
    !!! note "Summary"
        Assert that all of the values in `#!py columns` exist in `#!py dataframe.columns`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        columns (Union[str, str_collection]):
            The columns to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            If `#!py False`, the comparison is done using `#!py [col.upper() for col in columns]`.<br>
            Default: `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the `#!py columns` do not exist within `#!py dataframe.columns`.

    Returns:
        (type(None)):
            Nothing is returned. Either a `#!py ColumnDoesNotExistError` exception is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import assert_columns_exists
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: No error"}
        >>> assert_columns_exists(df, ["a", "b"])
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        None
        ```
        !!! success "Conclusion: Columns exist."
        </div>

        ```{.py .python linenums="1" title="Example 2: One column missing"}
        >>> assert_columns_exists(df, ["b", "c"])
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Columns ['c'] do not exist in 'dataframe'.
        Try one of: ['a', 'b'].
        ```
        !!! failure "Conclusion: Column 'c' does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 3: Multiple columns missing"}
        >>> assert_columns_exists(df, ["b", "c", "d"])
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Columns ['c', 'd'] do not exist in 'dataframe'.
        Try one of: ['a', 'b'].
        ```
        !!! failure "Conclusion: Columns 'c' and 'd' do not exist."
        </div>

    ??? tip "See Also"
        - [`column_exists`][toolbox_pyspark.checks.column_exists]
        - [`columns_exists`][toolbox_pyspark.checks.columns_exists]
        - [`assert_column_exists`][toolbox_pyspark.checks.assert_column_exists]
        - [`assert_columns_exists`][toolbox_pyspark.checks.assert_columns_exists]
        - [`warn_column_missing`][toolbox_pyspark.checks.warn_column_missing]
        - [`warn_columns_missing`][toolbox_pyspark.checks.warn_columns_missing]
    """
    columns = [columns] if is_type(columns, str) else columns
    (exist, missing_cols) = _columns_exists(dataframe, columns, match_case)
    if not exist:
        raise ColumnDoesNotExistError(
            f"Columns {missing_cols} do not exist in 'dataframe'.\n"
            f"Try one of: {dataframe.columns}."
        )


@typechecked
def warn_column_missing(
    dataframe: psDataFrame,
    column: str,
    match_case: bool = False,
) -> None:
    """
    !!! note "Summary"
        Check whether a given `#!py column` exists as a valid column within `#!py dataframe.columns`, and raise a warning if it does not.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        column (str):
            The column to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            Defaults to `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (type(None)):
            Nothing is returned. Either a `#!py ColumnDoesNotExistWarning` warning is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import warn_column_missing
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: No warning"}
        >>> warn_column_missing(df, "a")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        None
        ```
        !!! success "Conclusion: Column exists."
        </div>

        ```{.py .python linenums="1" title="Example 2: Warning raised"}
        >>> warn_column_missing(df, "c")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistWarning: Column 'c' does not exist in 'dataframe'.
        Try one of: ['a', 'b'].
        ```
        !!! failure "Conclusion: Column does not exist."
        </div>

    ??? tip "See Also"
        - [`column_exists`][toolbox_pyspark.checks.column_exists]
        - [`columns_exists`][toolbox_pyspark.checks.columns_exists]
        - [`assert_column_exists`][toolbox_pyspark.checks.assert_column_exists]
        - [`assert_columns_exists`][toolbox_pyspark.checks.assert_columns_exists]
        - [`warn_column_missing`][toolbox_pyspark.checks.warn_column_missing]
        - [`warn_columns_missing`][toolbox_pyspark.checks.warn_columns_missing]
    """
    if not column_exists(dataframe, column, match_case):
        warn(
            f"Column '{column}' does not exist in 'dataframe'.\n"
            f"Try one of: {dataframe.columns}.",
            ColumnDoesNotExistWarning,
        )


@typechecked
def warn_columns_missing(
    dataframe: psDataFrame,
    columns: Union[str, str_collection],
    match_case: bool = False,
) -> None:
    """
    !!! note "Summary"
        Check whether all of the values in `#!py columns` exist in `#!py dataframe.columns`, and raise a warning if any do not.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        columns (Union[str, str_collection]):
            The columns to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            Defaults to `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (type(None)):
            Nothing is returned. Either a `#!py ColumnDoesNotExistWarning` warning is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import warn_columns_missing
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: No warning"}
        >>> warn_columns_missing(df, ["a", "b"])
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        None
        ```
        !!! success "Conclusion: Columns exist."
        </div>

        ```{.py .python linenums="1" title="Example 2: One column missing"}
        >>> warn_columns_missing(df, ["b", "c"])
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistWarning: Columns ['c'] do not exist in 'dataframe'.
        Try one of: ['a', 'b'].
        ```
        !!! failure "Conclusion: Column 'c' does not exist."
        </div>

        ```{.py .python linenums="1" title="Example 3: Multiple columns missing"}
        >>> warn_columns_missing(df, ["b", "c", "d"])
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistWarning: Columns ['c', 'd'] do not exist in 'dataframe'.
        Try one of: ['a', 'b'].
        ```
        !!! failure "Conclusion: Columns 'c' and 'd' do not exist."
        </div>

    ??? tip "See Also"
        - [`column_exists`][toolbox_pyspark.checks.column_exists]
        - [`columns_exists`][toolbox_pyspark.checks.columns_exists]
        - [`assert_column_exists`][toolbox_pyspark.checks.assert_column_exists]
        - [`assert_columns_exists`][toolbox_pyspark.checks.assert_columns_exists]
        - [`warn_column_missing`][toolbox_pyspark.checks.warn_column_missing]
        - [`warn_columns_missing`][toolbox_pyspark.checks.warn_columns_missing]
    """
    columns = [columns] if is_type(columns, str) else columns
    (exist, missing_cols) = _columns_exists(dataframe, columns, match_case)
    if not exist:
        warn(
            f"Columns {missing_cols} do not exist in 'dataframe'.\n"
            f"Try one of: {dataframe.columns}.",
            ColumnDoesNotExistWarning,
        )


# ---------------------------------------------------------------------------- #
#     Type checks                                                           ####
# ---------------------------------------------------------------------------- #


@typechecked
def is_vaid_spark_type(datatype: str) -> bool:
    """
    !!! note "Summary"
        Check whether a given `#!py datatype` is a correct and valid `#!py pyspark` data type.

    Params:
        datatype (str):
            The name of the data type to check.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (bool):
            `#!py True` if the datatype is valid, `#!py False` otherwise.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> from toolbox_pyspark.checks import is_vaid_spark_type
        ```

        ```{.py .python linenums="1" title="Example 1: Loop through all valid types"}
        >>> type_names = ["string", "char", "varchar", "binary", "boolean", "decimal", "float", "double", "byte", "short", "integer", "long", "date", "timestamp", "timestamp_ntz", "void"]
        >>> all(is_vaid_spark_type(type_name) for type_name in type_names)
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        True
        ```
        !!! success "Conclusion: They're all valid."
        </div>

        ```{.py .python linenums="1" title="Example 2: Check some invalid types"}
        >>> type_names = ["np.ndarray", "pd.DataFrame", "dict"]
        >>> [is_vaid_spark_type(type_name) for type_name in type_names]
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        [False, False, False]
        ```
        !!! failure "Conclusion: All of these types are invalid."
        </div>

    ??? tip "See Also"
        - [`assert_valid_spark_type`][toolbox_pyspark.checks.assert_valid_spark_type]
    """
    return datatype in VALID_PYSPARK_TYPE_NAMES


@typechecked
def assert_valid_spark_type(datatype: str) -> None:
    """
    !!! note "Summary"
        Assert that a given `#!py datatype` is a correct and valid `#!py pyspark` data type.

    Params:
        datatype (str):
            The name of the data type to check.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        InvalidPySparkDataTypeError:
            If the given `#!py datatype` is not a valid `#!py pyspark` data type.

    Returns:
        (type(None)):
            Nothing is returned. Either an `#!py InvalidPySparkDataTypeError` exception is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> from toolbox_pyspark.checks import assert_valid_spark_type
        ```

        ```{.py .python linenums="1" title="Example 1: Valid type"}
        >>> assert_valid_spark_type("string")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        None
        ```
        !!! success "Conclusion: Valid type."
        </div>

        ```{.py .python linenums="1" title="Example 2: Invalid type"}
        >>> assert_valid_spark_type("invalid_type")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        InvalidPySparkDataTypeError: DataType 'invalid_type' is not valid.
        Must be one of: ['binary', 'bool', 'boolean', 'byte', 'char', 'date', 'decimal', 'double', 'float', 'int', 'integer', 'long', 'short', 'str', 'string', 'timestamp', 'timestamp_ntz', 'varchar', 'void']
        ```
        !!! failure "Conclusion: Invalid type."
        </div>

    ??? tip "See Also"
        - [`is_vaid_spark_type`][toolbox_pyspark.checks.is_vaid_spark_type]
    """
    if not is_vaid_spark_type(datatype):
        raise InvalidPySparkDataTypeError(
            f"DataType '{datatype}' is not valid.\n"
            f"Must be one of: {VALID_PYSPARK_TYPE_NAMES}"
        )


# ---------------------------------------------------------------------------- #
#     Column Types                                                          ####
# ---------------------------------------------------------------------------- #


@dataclass
class ColumnsAreTypeResult:
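    """
    !!! note "Summary"
        Result object returned by the column-type checks: `result` is the overall pass/fail flag, and `invalid_types` lists each `(column, actual_type)` pair that did not match the required type.

        The class defines `__iter__`, so it can be unpacked like a tuple. A minimal sketch (the field values here are illustrative):

        ```{.py .python linenums="1" title="Unpacking"}
        >>> result, invalid_types = ColumnsAreTypeResult(False, [("b", "string")])
        >>> print(result)
        False
        ```
    """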

    result: bool
    invalid_types: list[tuple[str, str]]

    def __iter__(self):
        for field in fields(self):
            yield getattr(self, field.name)


def _validate_pyspark_datatype(
    datatype: Union[str, type, T.DataType],
) -> ALL_PYSPARK_TYPES:
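    """
    !!! note "Summary"
        Normalise a data type given as a string, a Python type, or a `#!py pyspark` type object into the corresponding `#!py pyspark` type instance.

        A minimal sketch of the mapping (outputs inferred from the conversion rules below; note that `#!py pyspark` maps the Python `#!py int` type to `#!py LongType`):

        ```{.py .python linenums="1" title="Normalisation"}
        >>> _validate_pyspark_datatype("str").simpleString()
        'string'
        >>> _validate_pyspark_datatype(int).simpleString()
        'bigint'
        ```
    """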

    # `float` needs special-casing: `pyspark` maps the Python `float` type to
    # `DoubleType`, whereas this module treats it as `FloatType`
    datatype = T.FloatType() if datatype == "float" or datatype is float else datatype
    if is_type(datatype, str):
        # Map common Python type names onto their Spark equivalents
        datatype = "string" if datatype == "str" else datatype
        datatype = "boolean" if datatype == "bool" else datatype
        datatype = "integer" if datatype == "int" else datatype
        datatype = "timestamp" if datatype == "datetime" else datatype
        try:
            # A name that resolves in this namespace (e.g. a Python builtin) is evaluated
            datatype = eval(datatype)
        except NameError:
            # Otherwise, let Spark parse simple type strings such as "string" or "bigint"
            datatype = T._parse_datatype_string(s=datatype)  # type:ignore
    if type(datatype).__name__ == "type":
        # A bare Python type (e.g. `int`) is mapped through Spark's internal type mappings
        datatype = T._type_mappings.get(datatype)()  # type:ignore
    return datatype


@typechecked
def _columns_are_type(
    dataframe: psDataFrame,
    columns: Union[str, str_collection],
    datatype: str,
    match_case: bool = False,
) -> ColumnsAreTypeResult:
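    """
    !!! note "Summary"
        Check whether the given `columns` are all of the given `datatype`, returning a `ColumnsAreTypeResult` whose `invalid_types` field lists each offending `(column, actual_type)` pair.

        A minimal sketch (assuming `df` is a `pyspark` dataframe with a string column `b`):

        ```{.py .python linenums="1" title="Helper result"}
        >>> result, invalid_types = _columns_are_type(df, "b", "integer")
        >>> print(invalid_types)
        [('b', 'string')]
        ```
    """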

    columns = [columns] if is_type(columns, str) else columns
    assert_columns_exists(dataframe, columns, match_case)
    assert_valid_spark_type(datatype)
    target_type: ALL_PYSPARK_TYPES = _validate_pyspark_datatype(datatype)
    df_dtypes: list[tuple[str, str]] = dataframe.dtypes
    df_types: list[tuple[str, ALL_PYSPARK_TYPES]] = [
        (col, _validate_pyspark_datatype(dtype)) for col, dtype in df_dtypes
    ]
    # Normalise the column names to upper-case when the check is case-insensitive,
    # then collect any requested columns whose parsed type differs from the target
    invalid_cols: list[tuple[str, str]] = [
        (col, dtype.simpleString())
        for col, dtype in df_types
        if (col if match_case else col.upper())
        in [check_col if match_case else check_col.upper() for check_col in columns]
        and dtype != target_type
    ]
    return ColumnsAreTypeResult(len(invalid_cols) == 0, invalid_cols)


@typechecked
def column_is_type(
    dataframe: psDataFrame,
    column: str,
    datatype: str,
    match_case: bool = False,
) -> bool:
    """
    !!! note "Summary"
        Check whether a given `#!py column` is of a given `#!py datatype` in `#!py dataframe`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        column (str):
            The column to check.
        datatype (str):
            The data type to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            Defaults to `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If the `#!py column` does not exist within `#!py dataframe.columns`.
        InvalidPySparkDataTypeError:
            If the `#!py datatype` is not a valid `#!py pyspark` data type.

    Returns:
        (bool):
            `#!py True` if the column is of the given `#!py datatype`, `#!py False` otherwise.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import column_is_type
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: Column is of type"}
        >>> column_is_type(df, "a", "long")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        True
        ```
        !!! success "Conclusion: Column is the correct type."
        </div>

        ```{.py .python linenums="1" title="Example 2: Column is not of type"}
        >>> column_is_type(df, "b", "integer")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        False
        ```
        !!! failure "Conclusion: Column is not the correct type."
        </div>

    ??? tip "See Also"
        - [`column_is_type`][toolbox_pyspark.checks.column_is_type]
        - [`columns_are_type`][toolbox_pyspark.checks.columns_are_type]
        - [`assert_column_is_type`][toolbox_pyspark.checks.assert_column_is_type]
        - [`assert_columns_are_type`][toolbox_pyspark.checks.assert_columns_are_type]
        - [`warn_column_invalid_type`][toolbox_pyspark.checks.warn_column_invalid_type]
        - [`warn_columns_invalid_type`][toolbox_pyspark.checks.warn_columns_invalid_type]
    """
    return _columns_are_type(dataframe, column, datatype, match_case).result


@typechecked
def columns_are_type(
    dataframe: psDataFrame,
    columns: Union[str, str_collection],
    datatype: str,
    match_case: bool = False,
) -> bool:
    """
    !!! note "Summary"
        Check whether the given `#!py columns` are of a given `#!py datatype` in `#!py dataframe`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        columns (Union[str, str_collection]):
            The columns to check.
        datatype (str):
            The data type to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            Defaults to `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the `#!py columns` do not exist within `#!py dataframe.columns`.
        InvalidPySparkDataTypeError:
            If the `#!py datatype` is not a valid `#!py pyspark` data type.

    Returns:
        (bool):
            `#!py True` if all the columns are of the given `#!py datatype`, `#!py False` otherwise.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import columns_are_type
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.1, 2.2, 3.3, 4.4],
        ...             "d": [1.5, 2.5, 3.5, 4.5],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: Columns are of type"}
        >>> columns_are_type(df, ["c", "d"], "double")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        True
        ```
        !!! success "Conclusion: Columns are the correct type."
        </div>

        ```{.py .python linenums="1" title="Example 2: Columns are not of type"}
        >>> columns_are_type(df, ["a", "b"], "double")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        False
        ```
        !!! failure "Conclusion: Columns are not the correct type."
        </div>

        ```{.py .python linenums="1" title="Example 3: Single column is of type"}
        >>> columns_are_type(df, "a", "long")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        True
        ```
        !!! success "Conclusion: Column is the correct type."
        </div>

        ```{.py .python linenums="1" title="Example 4: Single column is not of type"}
        >>> columns_are_type(df, "b", "integer")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        False
        ```
        !!! failure "Conclusion: Column is not the correct type."
        </div>

    ??? tip "See Also"
        - [`column_is_type`][toolbox_pyspark.checks.column_is_type]
        - [`columns_are_type`][toolbox_pyspark.checks.columns_are_type]
        - [`assert_column_is_type`][toolbox_pyspark.checks.assert_column_is_type]
        - [`assert_columns_are_type`][toolbox_pyspark.checks.assert_columns_are_type]
        - [`warn_column_invalid_type`][toolbox_pyspark.checks.warn_column_invalid_type]
        - [`warn_columns_invalid_type`][toolbox_pyspark.checks.warn_columns_invalid_type]
    """
    return _columns_are_type(dataframe, columns, datatype, match_case).result


@typechecked
def assert_column_is_type(
    dataframe: psDataFrame,
    column: str,
    datatype: str,
    match_case: bool = False,
) -> None:
    """
    !!! note "Summary"
        Assert that a given `#!py column` is of a given `#!py datatype` in `#!py dataframe`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        column (str):
            The column to check.
        datatype (str):
            The data type to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            Defaults to `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If the `#!py column` does not exist within `#!py dataframe.columns`.
        InvalidPySparkDataTypeError:
            If the given `#!py column` is not of the given `#!py datatype`.

    Returns:
        (type(None)):
            Nothing is returned. Either an `#!py InvalidPySparkDataTypeError` exception is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import assert_column_is_type
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: No error"}
        >>> assert_column_is_type(df, "a", "long")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        None
        ```
        !!! success "Conclusion: Column is of type."
        </div>

        ```{.py .python linenums="1" title="Example 2: Error raised"}
        >>> assert_column_is_type(df, "b", "integer")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        InvalidPySparkDataTypeError: Column 'b' is type 'string', which is not the required type: 'integer'.
        ```
        !!! failure "Conclusion: Column is not of type."
        </div>

    ??? tip "See Also"
        - [`column_is_type`][toolbox_pyspark.checks.column_is_type]
        - [`columns_are_type`][toolbox_pyspark.checks.columns_are_type]
        - [`assert_column_is_type`][toolbox_pyspark.checks.assert_column_is_type]
        - [`assert_columns_are_type`][toolbox_pyspark.checks.assert_columns_are_type]
        - [`warn_column_invalid_type`][toolbox_pyspark.checks.warn_column_invalid_type]
        - [`warn_columns_invalid_type`][toolbox_pyspark.checks.warn_columns_invalid_type]
    """
    result, invalid_types = _columns_are_type(dataframe, column, datatype, match_case)
    if not result:
        raise InvalidPySparkDataTypeError(
            f"Column '{column}' is type '{invalid_types[0][1]}', "
            f"which is not the required type: '{datatype}'."
        )


@typechecked
def assert_columns_are_type(
    dataframe: psDataFrame,
    columns: Union[str, str_collection],
    datatype: str,
    match_case: bool = False,
) -> None:
    """
    !!! note "Summary"
        Assert that the given `#!py columns` are of a given `#!py datatype` in `#!py dataframe`.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        columns (Union[str, str_collection]):
            The columns to check.
        datatype (str):
            The data type to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            Defaults to `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the `#!py columns` do not exist within `#!py dataframe.columns`.
        InvalidPySparkDataTypeError:
            If any of the given `#!py columns` are not of the given `#!py datatype`.

    Returns:
        (type(None)):
            Nothing is returned. Either an `#!py InvalidPySparkDataTypeError` exception is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import assert_columns_are_type
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.1, 2.2, 3.3, 4.4],
        ...             "d": [1.5, 2.5, 3.5, 4.5],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: No error"}
        >>> assert_columns_are_type(df, ["c", "d"], "double")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        None
        ```
        !!! success "Conclusion: Columns are of type."
        </div>

        ```{.py .python linenums="1" title="Example 2: Error raised"}
        >>> assert_columns_are_type(df, ["a", "b"], "double")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        InvalidPySparkDataTypeError: Columns ['a', 'b'] are types ['bigint', 'string'], which are not the required type: 'double'.
        ```
        !!! failure "Conclusion: Columns are not of type."
        </div>

        ```{.py .python linenums="1" title="Example 3: Single column is of type"}
        >>> assert_columns_are_type(df, "a", "long")
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        None
        ```
        !!! success "Conclusion: Column is of type."
        </div>

        ```{.py .python linenums="1" title="Example 4: Single column is not of type"}
        >>> assert_columns_are_type(df, "b", "integer")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        InvalidPySparkDataTypeError: Columns ['b'] are types ['string'], which are not the required type: 'integer'.
        ```
        !!! failure "Conclusion: Column is not of type."
        </div>

    ??? tip "See Also"
        - [`column_is_type`][toolbox_pyspark.checks.column_is_type]
        - [`columns_are_type`][toolbox_pyspark.checks.columns_are_type]
        - [`assert_column_is_type`][toolbox_pyspark.checks.assert_column_is_type]
        - [`assert_columns_are_type`][toolbox_pyspark.checks.assert_columns_are_type]
        - [`warn_column_invalid_type`][toolbox_pyspark.checks.warn_column_invalid_type]
        - [`warn_columns_invalid_type`][toolbox_pyspark.checks.warn_columns_invalid_type]
    """
    result, invalid_types = _columns_are_type(dataframe, columns, datatype, match_case)
    if not result:
        raise InvalidPySparkDataTypeError(
            f"Columns {[col for col, _ in invalid_types]} are types {[typ for _, typ in invalid_types]}, "
            f"which are not the required type: '{datatype}'."
        )


@typechecked
def warn_column_invalid_type(
    dataframe: psDataFrame,
    column: str,
    datatype: str,
    match_case: bool = False,
) -> None:
    """
    !!! note "Summary"
        Check whether a given `#!py column` is of a given `#!py datatype` in `#!py dataframe`, and raise a warning if not.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        column (str):
            The column to check.
        datatype (str):
            The data type to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            Defaults to `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (type(None)):
            Nothing is returned. Either an `#!py InvalidPySparkDataTypeWarning` warning is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import warn_column_invalid_type
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: No warning"}
        >>> warn_column_invalid_type(df, "a", "long")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        None
        ```
        !!! success "Conclusion: Column is of type."
        </div>

        ```{.py .python linenums="1" title="Example 2: Warning raised"}
        >>> warn_column_invalid_type(df, "b", "integer")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        InvalidPySparkDataTypeWarning: Column 'b' is type 'string', which is not the required type: 'integer'.
        ```
        !!! failure "Conclusion: Column is not of type."
        </div>

    ??? tip "See Also"
        - [`column_is_type`][toolbox_pyspark.checks.column_is_type]
        - [`columns_are_type`][toolbox_pyspark.checks.columns_are_type]
        - [`assert_column_is_type`][toolbox_pyspark.checks.assert_column_is_type]
        - [`assert_columns_are_type`][toolbox_pyspark.checks.assert_columns_are_type]
        - [`warn_column_invalid_type`][toolbox_pyspark.checks.warn_column_invalid_type]
        - [`warn_columns_invalid_type`][toolbox_pyspark.checks.warn_columns_invalid_type]
    """
    result, invalid_types = _columns_are_type(dataframe, column, datatype, match_case)
    if not result:
        warn(
            f"Column '{column}' is type '{invalid_types[0][1]}', "
            f"which is not the required type: '{datatype}'.",
            InvalidPySparkDataTypeWarning,
        )


@typechecked
def warn_columns_invalid_type(
    dataframe: psDataFrame,
    columns: Union[str, str_collection],
    datatype: str,
    match_case: bool = False,
) -> None:
    """
    !!! note "Summary"
        Check whether the given `#!py columns` are of a given `#!py datatype` in `#!py dataframe`, and raise a warning if not.

    Params:
        dataframe (psDataFrame):
            The DataFrame to check.
        columns (Union[str, str_collection]):
            The columns to check.
        datatype (str):
            The data type to check.
        match_case (bool, optional):
            Whether or not to match the string case for the columns.<br>
            Defaults to `#!py False`.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (type(None)):
            Nothing is returned. Either an `#!py InvalidPySparkDataTypeWarning` warning is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.checks import warn_columns_invalid_type
        >>> spark = SparkSession.builder.getOrCreate()
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.1, 2.2, 3.3, 4.4],
        ...             "d": [1.5, 2.5, 3.5, 4.5],
        ...         }
        ...     )
        ... )
        ```

        ```{.py .python linenums="1" title="Example 1: No warning"}
        >>> warn_columns_invalid_type(df, ["c", "d"], "double")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        None
        ```
        !!! success "Conclusion: Columns are of type."
        </div>

        ```{.py .python linenums="1" title="Example 2: Warning raised"}
        >>> warn_columns_invalid_type(df, ["a", "b"], "double")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        InvalidPySparkDataTypeWarning: Columns ['a', 'b'] are types ['bigint', 'string'], which are not the required type: 'double'.
        ```
        !!! failure "Conclusion: Columns are not of type."
        </div>

    ??? tip "See Also"
        - [`column_is_type`][toolbox_pyspark.checks.column_is_type]
        - [`columns_are_type`][toolbox_pyspark.checks.columns_are_type]
        - [`assert_column_is_type`][toolbox_pyspark.checks.assert_column_is_type]
        - [`assert_columns_are_type`][toolbox_pyspark.checks.assert_columns_are_type]
        - [`warn_column_invalid_type`][toolbox_pyspark.checks.warn_column_invalid_type]
        - [`warn_columns_invalid_type`][toolbox_pyspark.checks.warn_columns_invalid_type]
    """
    result, invalid_types = _columns_are_type(dataframe, columns, datatype, match_case)
    if not result:
        warn(
            f"Columns {[col for col, _ in invalid_types]} are types {[typ for _, typ in invalid_types]}, "
            f"which are not the required type: '{datatype}'.",
            InvalidPySparkDataTypeWarning,
        )

1380 ) 

1381 

1382 

1383@typechecked 

1384def column_contains_value( 

1385 dataframe: psDataFrame, 

1386 column: str, 

1387 value: str, 

1388 match_case: bool = False, 

1389) -> bool: 

1390 """ 

1391 !!! note "Summary" 

1392 Check whether a given `#!py column` contains a specific `#!py value` in `#!py dataframe`. 

1393 

1394 Params: 

1395 dataframe (psDataFrame): 

1396 The DataFrame to check. 

1397 column (str): 

1398 The column to check. 

1399 value (str): 

1400 The value to check for. 

1401 match_case (bool, optional): 

1402 Whether or not to match the string case for the value.<br> 

1403 Defaults to `#!py False`. 

1404 

1405 Raises: 

1406 TypeError: 

1407 If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator. 

1408 ColumnDoesNotExistError: 

1409 If the `#!py column` does not exist within `#!py dataframe.columns`. 

1410 

1411 Returns: 

1412 (bool): 

1413 `#!py True` if the column contains the value, `#!py False` otherwise. 

1414 

1415 ???+ example "Examples" 

1416 

1417 ```{.py .python linenums="1" title="Set up"} 

1418 >>> import pandas as pd 

1419 >>> from pyspark.sql import SparkSession 

1420 >>> from toolbox_pyspark.checks import column_contains_value 

1421 >>> spark = SparkSession.builder.getOrCreate() 

1422 >>> df = spark.createDataFrame( 

1423 ... pd.DataFrame( 

1424 ... { 

1425 ... "a": [1, 2, 3, 4], 

1426 ... "b": ["a", "b", "c", "d"], 

1427 ... } 

1428 ... ) 

1429 ... ) 

1430 ``` 

1431 

1432 ```{.py .python linenums="1" title="Example 1: Value exists"} 

1433 >>> column_contains_value(df, "b", "a") 

1434 ``` 

1435 <div class="result" markdown> 

1436 ```{.sh .shell title="Terminal"} 

1437 True 

1438 ``` 

1439 !!! success "Conclusion: Value exists in column." 

1440 </div> 

1441 

1442 ```{.py .python linenums="1" title="Example 2: Value does not exist"} 

1443 >>> column_contains_value(df, "b", "z") 

1444 ``` 

1445 <div class="result" markdown> 

1446 ```{.sh .shell title="Terminal"} 

1447 False 

1448 ``` 

1449 !!! failure "Conclusion: Value does not exist in column." 

1450 </div> 

1451 

1452 ??? tip "See Also" 

1453 - [`assert_column_exists`][toolbox_pyspark.checks.assert_column_exists] 

1454 """ 

1455 assert_column_exists(dataframe, column, match_case) 

1456 

1457 if not match_case: 

1458 value = value.lower() 

1459 dataframe = dataframe.withColumn(column, F.lower(F.col(column))) 

1460 

1461 return dataframe.filter(f"{column} = '{value}'").count() > 0 


# ---------------------------------------------------------------------------- #
#     Table Existence                                                       ####
# ---------------------------------------------------------------------------- #


@typechecked
def table_exists(
    name: str,
    path: str,
    data_format: SPARK_FORMATS,
    spark_session: SparkSession,
) -> bool:
    """
    !!! note "Summary"
        Will try to read the table `#!py name` from `#!py path` using `#!py data_format`, and will return `#!py True` if successful, otherwise `#!py False`.

    Params:
        name (str):
            The name of the table to check exists.
        path (str):
            The directory where the table should be existing.
        data_format (SPARK_FORMATS):
            The format of the table to check.
        spark_session (SparkSession):
            The `#!py spark` session to use for the import.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (bool):
            Returns `#!py True` if the table exists, `False` otherwise.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.io import write_to_path
        >>> from toolbox_pyspark.checks import table_exists
        >>>
        >>> # Constants
        >>> write_name = "test_df"
        >>> write_path = "./test"
        >>> write_format = "parquet"
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Write data
        >>> write_to_path(df, f"{write_name}.{write_format}", write_path)
        ```

        ```{.py .python linenums="1" title="Example 1: Table exists"}
        >>> table_exists("test_df.parquet", "./test", "parquet", spark)
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        True
        ```
        !!! success "Conclusion: Table exists."
        </div>

        ```{.py .python linenums="1" title="Example 2: Table does not exist"}
        >>> table_exists("bad_table_name.parquet", "./test", "parquet", spark)
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        False
        ```
        !!! failure "Conclusion: Table does not exist."
        </div>

    ??? tip "See Also"
        - [`assert_table_exists`][toolbox_pyspark.checks.assert_table_exists]
    """
    try:
        _ = read_from_path(
            name=name,
            path=path,
            data_format=data_format,
            spark_session=spark_session,
        )
    except Exception:
        # Any read failure is treated as "table does not exist"
        return False
    return True


@typechecked
def assert_table_exists(
    name: str,
    path: str,
    data_format: SPARK_FORMATS,
    spark_session: SparkSession,
) -> None:
    """
    !!! note "Summary"
        Assert that a table exists at a given `path` using `data_format`.

    Params:
        name (str):
            The name of the table to check exists.
        path (str):
            The directory where the table should be existing.
        data_format (SPARK_FORMATS):
            The format of the table to check.
        spark_session (SparkSession):
            The `#!py spark` session to use for the import.

    Raises:
        TypeError:
            If any of the inputs passed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        TableDoesNotExistError:
            If the table does not exist at the specified location.

    Returns:
        (type(None)):
            Nothing is returned. Either a `#!py TableDoesNotExistError` exception is raised, or nothing.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.io import write_to_path
        >>> from toolbox_pyspark.checks import assert_table_exists
        >>>
        >>> # Constants
        >>> write_name = "test_df"
        >>> write_path = "./test"
        >>> write_format = "parquet"
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Write data
        >>> write_to_path(df, f"{write_name}.{write_format}", write_path)
        ```

        ```{.py .python linenums="1" title="Example 1: Table exists"}
        >>> assert_table_exists("test_df.parquet", "./test", "parquet", spark)
        ```
        <div class="result" markdown>
        ```{.sh .shell title="Terminal"}
        None
        ```
        !!! success "Conclusion: Table exists."
        </div>

        ```{.py .python linenums="1" title="Example 2: Table does not exist"}
        >>> assert_table_exists("bad_table_name.parquet", "./test", "parquet", spark)
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        TableDoesNotExistError: Table 'bad_table_name.parquet' does not exist at path './test'.
        ```
        !!! failure "Conclusion: Table does not exist."
        </div>

    ??? tip "See Also"
        - [`table_exists`][toolbox_pyspark.checks.table_exists]
    """
    if not table_exists(
        name=name, path=path, data_format=data_format, spark_session=spark_session
    ):
        raise TableDoesNotExistError(f"Table '{name}' does not exist at path '{path}'.")