Coverage for src/toolbox_pyspark/columns.py: 100%
94 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-25 23:08 +0000
1# ============================================================================ #
2# #
3# Title : Dataframe Cleaning #
4# Purpose : Fetch columns from a given DataFrame using convenient syntax. #
5# #
6# ============================================================================ #
9# ---------------------------------------------------------------------------- #
10# #
11# Overview ####
12# #
13# ---------------------------------------------------------------------------- #
16# ---------------------------------------------------------------------------- #
17# Description ####
18# ---------------------------------------------------------------------------- #
21"""
22!!! note "Summary"
23 The `columns` module is used to fetch columns from a given DataFrame using convenient syntax.
24"""
27# ---------------------------------------------------------------------------- #
28# #
29# Setup ####
30# #
31# ---------------------------------------------------------------------------- #
34# ---------------------------------------------------------------------------- #
35# Imports ####
36# ---------------------------------------------------------------------------- #
39# ## Python StdLib Imports ----
40from typing import Literal, Optional, Union
42# ## Python Third Party Imports ----
43from pyspark.sql import DataFrame as psDataFrame
44from toolbox_python.checkers import is_type
45from toolbox_python.collection_types import str_collection, str_list
46from typeguard import typechecked
48# ## Local First Party Imports ----
49from toolbox_pyspark.checks import (
50 assert_columns_exists,
51 warn_columns_missing,
52)
55# ---------------------------------------------------------------------------- #
56# Exports ####
57# ---------------------------------------------------------------------------- #
# Public API of this module: the only names re-exported by `from ... import *`.
# Keep this list in sync with the function definitions below.
__all__: str_list = [
    "get_columns",
    "get_columns_by_likeness",
    "rename_columns",
    "reorder_columns",
    "delete_columns",
]
69# ---------------------------------------------------------------------------- #
70# #
71# Functions ####
72# #
73# ---------------------------------------------------------------------------- #
76# ---------------------------------------------------------------------------- #
77# Selecting ####
78# ---------------------------------------------------------------------------- #
@typechecked
def get_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str, str_collection]] = None,
) -> str_list:
    """
    !!! note "Summary"
        Get a list of column names from a DataFrame based on optional filter criteria.

    Params:
        dataframe (psDataFrame):
            The DataFrame from which to retrieve column names.
        columns (Optional[Union[str, str_collection]], optional):
            Optional filter criteria for selecting columns.<br>
            If a string is provided, it can be one of the following options:

            | Value | Description |
            |-------|-------------|
            | `#!py "all"` | Return all columns in the DataFrame.
            | `#!py "all_str"` | Return columns of string type.
            | `#!py "all_int"` | Return columns of integer type.
            | `#!py "all_numeric"` | Return columns of numeric types (integers, floats, doubles, longs, decimals).
            | `#!py "all_float"`, `#!py "all_double"` or `#!py "all_decimal"` | Return columns of floating-point types.
            | `#!py "all_datetime"` or `#!py "all_timestamp"` | Return columns of datetime or timestamp type.
            | `#!py "all_date"` | Return columns of date type.
            | Any other string | Return a single-element list containing that exact column name.

            If a list or tuple of column names is provided, return only those columns.<br>
            Defaults to `#!py None` (which returns all columns).

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (str_list):
            The selected column names from the DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Examples (df has columns a..h of mixed types)"}
        >>> print(get_columns(df))                  # ["a", "b", "c", "d", "e", "f", "g", "h"]
        >>> print(get_columns(df, ["a", "b", "c"])) # ["a", "b", "c"]
        >>> print(get_columns(df, "a"))             # ["a"]
        >>> print(get_columns(df, "all"))           # ["a", "b", "c", "d", "e", "f", "g", "h"]
        >>> print(get_columns(df, "all_str"))       # ["b", "d"]
        >>> print(get_columns(df, "all_int"))       # ["c"]
        >>> print(get_columns(df, "all_decimal"))   # ["e", "f"]
        >>> print(get_columns(df, "all_numeric"))   # ["c", "e", "f"]
        >>> print(get_columns(df, "all_date"))      # ["g"]
        >>> print(get_columns(df, "all_datetime"))  # ["h"]
        ```
        !!! success "Conclusion: Success."

        Note: `get_columns` returns a plain `#!py list` of names, so print the
        return value directly — do not access a `.columns` attribute on it.
    """
    if columns is None:
        return dataframe.columns
    elif is_type(columns, str):
        if "all" in columns:
            # NOTE: "datetime" must be checked before "date", since "date" is a
            # substring of "datetime"; the branch order below is deliberate.
            if "str" in columns:
                return [col for col, typ in dataframe.dtypes if typ in ("str", "string")]
            elif "int" in columns:
                return [col for col, typ in dataframe.dtypes if typ in ("int", "integer")]
            elif "numeric" in columns:
                return [
                    col
                    for col, typ in dataframe.dtypes
                    if typ in ("int", "integer", "float", "double", "long") or "decimal" in typ
                ]
            elif "float" in columns or "double" in columns or "decimal" in columns:
                return [
                    col
                    for col, typ in dataframe.dtypes
                    if typ in ("float", "double", "long") or "decimal" in typ
                ]
            elif "datetime" in columns or "timestamp" in columns:
                return [
                    col for col, typ in dataframe.dtypes if typ in ("datetime", "timestamp")
                ]
            elif "date" in columns:
                return [col for col, typ in dataframe.dtypes if typ in ["date"]]
            else:
                return dataframe.columns
        else:
            return [columns]
    else:
        return list(columns)
@typechecked
def get_columns_by_likeness(
    dataframe: psDataFrame,
    starts_with: Optional[str] = None,
    contains: Optional[str] = None,
    ends_with: Optional[str] = None,
    match_case: bool = False,
    operator: Literal["and", "or", "and not", "or not"] = "and",
) -> str_list:
    """
    !!! note "Summary"
        Extract the column names from a given `dataframe` based on text that the column name contains.

    ???+ abstract "Details"
        You can use any combination of `starts_with`, `contains`, and `ends_with`. Under the hood, these will be implemented with a number of internal `#!py lambda` functions to determine matches.

        The `operator` parameter determines how the conditions (`starts_with`, `contains`, `ends_with`) are combined:

        | Value | Description |
        |-------|-------------|
        | `"and"` | All conditions must be true.
        | `"or"` | At least one condition must be true.
        | `"and not"` | The first condition must be true and the second condition must be false.
        | `"or not"` | The first condition must be true, or the second condition must be false.

    Params:
        dataframe (psDataFrame):
            The `dataframe` from which to extract the column names.
        starts_with (Optional[str], optional):
            Extract any columns that starts with this `#!py str`.<br>
            Determined by using the `#!py str.startswith()` method.<br>
            Defaults to `#!py None`.
        contains (Optional[str], optional):
            Extract any columns that contains this `#!py str` anywhere within it.<br>
            Determined by using the `#!py in` keyword.<br>
            Defaults to `#!py None`.
        ends_with (Optional[str], optional):
            Extract any columns that ends with this `#!py str`.<br>
            Determined by using the `#!py str.endswith()` method.<br>
            Defaults to `#!py None`.
        match_case (bool, optional):
            If `#!py True`, matching is case-sensitive; if `#!py False`, matching is
            case-insensitive. Either way, the *original* column names are returned.<br>
            Defaults to `#!py False`.
        operator (Literal["and", "or", "and not", "or not"], optional):
            The logical operator to place between the functions.<br>
            Only used when there are multiple values parsed to the parameters: `#!py starts_with`, `#!py contains`, `#!py ends_with`.<br>
            Defaults to `#!py "and"`.

    Returns:
        (str_list):
            The list of columns which match the criteria specified.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Examples (df has columns: aaa, aab, aac, afa, afb, afc, bac)"}
        >>> print(get_columns_by_likeness(df, starts_with="a"))
        ... # ["aaa", "aab", "aac", "afa", "afb", "afc"]
        >>> print(get_columns_by_likeness(df, contains="f"))
        ... # ["afa", "afb", "afc"]
        >>> print(get_columns_by_likeness(df, ends_with="c"))
        ... # ["aac", "afc", "bac"]
        >>> print(get_columns_by_likeness(df, starts_with="a", contains="c"))
        ... # ["aac", "afc"]
        >>> print(get_columns_by_likeness(df, starts_with="a", operator="or", contains="f"))
        ... # ["aaa", "aab", "aac", "afa", "afb", "afc"]
        >>> print(get_columns_by_likeness(df, starts_with="a", operator="and not", contains="f"))
        ... # ["aaa", "aab", "aac"]
        ```
        !!! success "Conclusion: Success."
    """

    # Columns.
    # Keep the original names separate from the text used for comparison, so that
    # case-insensitive matching (match_case=False) still returns the dataframe's
    # actual column names rather than uppercased copies of them.
    original_cols: str_list = dataframe.columns
    compare_cols: str_list = list(original_cols)
    if not match_case:
        compare_cols = [col.upper() for col in compare_cols]
        starts_with = starts_with.upper() if starts_with is not None else None
        contains = contains.upper() if contains is not None else None
        ends_with = ends_with.upper() if ends_with is not None else None

    # Parameters
    o_: Literal["and", "or", "and not", "or not"] = operator
    s_: bool = starts_with is not None
    c_: bool = contains is not None
    e_: bool = ends_with is not None

    # Functions
    _ops = {
        "and": lambda x, y: x and y,
        "or": lambda x, y: x or y,
        "and not": lambda x, y: x and not y,
        "or not": lambda x, y: x or not y,
    }
    _s = lambda col, s: col.startswith(s)
    _c = lambda col, c: c in col
    _e = lambda col, e: col.endswith(e)
    _sc = lambda col, s, c: _ops[o_](_s(col, s), _c(col, c))
    _se = lambda col, s, e: _ops[o_](_s(col, s), _e(col, e))
    _ce = lambda col, c, e: _ops[o_](_c(col, c), _e(col, e))
    _sce = lambda col, s, c, e: _ops[o_](_ops[o_](_s(col, s), _c(col, c)), _e(col, e))

    # Logic — filter on the comparison text, return the original name.
    pairs = list(zip(original_cols, compare_cols))
    if s_ and not c_ and not e_:
        return [orig for orig, comp in pairs if _s(comp, starts_with)]
    elif c_ and not s_ and not e_:
        return [orig for orig, comp in pairs if _c(comp, contains)]
    elif e_ and not s_ and not c_:
        return [orig for orig, comp in pairs if _e(comp, ends_with)]
    elif s_ and c_ and not e_:
        return [orig for orig, comp in pairs if _sc(comp, starts_with, contains)]
    elif s_ and e_ and not c_:
        return [orig for orig, comp in pairs if _se(comp, starts_with, ends_with)]
    elif c_ and e_ and not s_:
        return [orig for orig, comp in pairs if _ce(comp, contains, ends_with)]
    elif s_ and c_ and e_:
        return [orig for orig, comp in pairs if _sce(comp, starts_with, contains, ends_with)]
    else:
        return original_cols
581# ---------------------------------------------------------------------------- #
582# Renaming ####
583# ---------------------------------------------------------------------------- #
@typechecked
def rename_columns(
    dataframe: psDataFrame,
    columns: Optional[Union[str, str_collection]] = None,
    string_function: str = "upper",
) -> psDataFrame:
    """
    !!! note "Summary"
        Use one of the common Python string functions to be applied to one or multiple columns.

    ???+ abstract "Details"
        The `string_function` must be a valid string method. For more info on available functions, see: https://docs.python.org/3/library/stdtypes.html#string-methods

    Params:
        dataframe (psDataFrame):
            The DataFrame to be updated.
        columns (Optional[Union[str, str_collection]], optional):
            The columns to be updated.<br>
            Must be a valid column on `dataframe`.<br>
            If not provided, will be applied to all columns.<br>
            It is also possible to parse the values `"all"`, which will also apply this function to all columns in `dataframe`.<br>
            Defaults to `None`.
        string_function (str, optional):
            The string function to be applied. May be a bare method name (`"upper"`)
            or a full method-call expression (`"replace('b', 'test')"`).<br>
            Defaults to `"upper"`.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Examples (df has columns: a, b, c, d)"}
        >>> print(rename_columns(df, "a").columns)
        ... # ["A", "b", "c", "d"]
        >>> print(rename_columns(df, "a", "upper").columns)
        ... # ["A", "b", "c", "d"]
        >>> print(rename_columns(df, ["a", "b"]).columns)
        ... # ["A", "B", "c", "d"]
        >>> print(rename_columns(df).columns)
        ... # ["A", "B", "C", "D"]
        >>> print(rename_columns(df, ["a", "b"], "replace('b', 'test')").columns)
        ... # ["a", "test", "c", "d"]
        ```
        !!! success "Conclusion: Success."

        Note: the string function is applied only to the *selected* columns, so
        `#!py rename_columns(df, "a", "replace('b', 'test')")` leaves the columns
        unchanged (`"a".replace('b', 'test')` is still `"a"`).

    ??? tip "See Also"
        - [`assert_columns_exists()`][toolbox_pyspark.checks.assert_columns_exists]
        - [`assert_column_exists()`][toolbox_pyspark.checks.assert_column_exists]
    """
    columns = get_columns(dataframe, columns)
    assert_columns_exists(dataframe=dataframe, columns=columns, match_case=True)
    # NOTE(security): `string_function` is eval'd verbatim to support arbitrary
    # method-call expressions. Only ever pass trusted, developer-supplied strings
    # here — never user input.
    cols_exprs: dict[str, str] = {
        col: eval(
            f"'{col}'.{string_function}{'()' if not string_function.endswith(')') else ''}"
        )
        for col in columns
    }
    return dataframe.withColumnsRenamed(cols_exprs)
729# ---------------------------------------------------------------------------- #
730# Reordering ####
731# ---------------------------------------------------------------------------- #
@typechecked
def reorder_columns(
    dataframe: psDataFrame,
    new_order: Optional[str_collection] = None,
    missing_columns_last: bool = True,
    key_columns_position: Optional[Literal["first", "last"]] = "first",
) -> psDataFrame:
    """
    !!! note "Summary"
        Reorder the columns in a given DataFrame in to a custom order, or to put the `key_` columns at the end (that is, to the far right) of the dataframe.

    ???+ abstract "Details"
        The decision flow chart is as follows:

        ```mermaid
        graph TD
            a([begin])
            z([end])
            b{{new_order}}
            c{{missing_cols_last}}
            d{{key_cols_position}}
            g[cols = dataframe.columns]
            h[cols = new_order]
            i[cols += missing_cols]
            j[cols = non_key_cols + key_cols]
            k[cols = key_cols + non_key_cols]
            l["return dataframe.select(cols)"]
            a --> b
            b --is not None--> h --> c
            b --is None--> g --> d
            c --False--> l
            c --True--> i ----> l
            d --"first"--> k ---> l
            d --"last"---> j --> l
            d --None--> l
            l --> z
        ```

    Params:
        dataframe (psDataFrame):
            The DataFrame to update.
        new_order (Optional[str_collection], optional):
            The custom order for the columns on the order.<br>
            Defaults to `#!py None`.
        missing_columns_last (bool, optional):
            For any columns existing on `#!py dataframe.columns`, but missing from `#!py new_order`, if `#!py missing_columns_last=True`, then include those missing columns to the right of the dataframe, in the same order that they originally appear.<br>
            Defaults to `#!py True`.
        key_columns_position (Optional[Literal["first", "last"]], optional):
            Where should the `#!py "key_*"` columns be located?.<br>

            - If `#!py "first"`, then they will be relocated to the start of the dataframe, before all other columns.
            - If `#!py "last"`, then they will be relocated to the end of the dataframe, after all other columns.
            - If `#!py None`, they will remain in their original order.

            Regardless of their position, their original order will be maintained.
            Only used when `#!py new_order` is `#!py None`.
            Defaults to `#!py "first"`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (psDataFrame):
            The updated DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Examples (df has columns: a, b, key_a, c, d, key_c, key_e)"}
        >>> reorder_columns(df).columns
        ... # ["key_a", "key_c", "key_e", "a", "b", "c", "d"]
        >>> reorder_columns(df, new_order=["key_a", "key_c", "b", "key_e", "a", "c", "d"]).columns
        ... # ["key_a", "key_c", "b", "key_e", "a", "c", "d"]
        >>> reorder_columns(df, new_order=["key_a", "key_c", "a", "b"], missing_columns_last=True).columns
        ... # ["key_a", "key_c", "a", "b", "key_e", "c", "d"]
        >>> reorder_columns(df, new_order=["key_a", "key_c", "a", "b"], missing_columns_last=False).columns
        ... # ["key_a", "key_c", "a", "b"]
        >>> reorder_columns(df, key_columns_position="last").columns
        ... # ["a", "b", "c", "d", "key_a", "key_c", "key_e"]
        ```
        !!! success "Conclusion: Success."
    """
    df_cols: str_list = dataframe.columns
    if new_order is not None:
        cols: str_list = get_columns(dataframe, new_order)
        if missing_columns_last:
            # Check membership against the *resolved* `cols` list, not `new_order`:
            # when `new_order` is a single string, `col not in new_order` would be a
            # substring test and could wrongly drop columns whose names happen to
            # appear inside it (e.g. column "e" with new_order="key_e").
            cols += [col for col in df_cols if col not in cols]
    else:
        non_key_cols: str_list = [col for col in df_cols if not col.lower().startswith("key_")]
        key_cols: str_list = [col for col in df_cols if col.lower().startswith("key_")]
        if key_columns_position == "first":
            cols = key_cols + non_key_cols
        elif key_columns_position == "last":
            cols = non_key_cols + key_cols
        else:
            cols = df_cols
    return dataframe.select(cols)
983# ---------------------------------------------------------------------------- #
984# Deleting ####
985# ---------------------------------------------------------------------------- #
@typechecked
def delete_columns(
    dataframe: psDataFrame,
    columns: Union[str, str_collection],
    missing_column_handler: Literal["raise", "warn", "pass"] = "pass",
) -> psDataFrame:
    """
    !!! note "Summary"
        For a given `#!py dataframe`, delete the columns listed in `columns`.

    ???+ abstract "Details"
        You can use `#!py missing_column_handler` to specify how to handle missing columns. Columns in `#!py columns` which do exist on `#!py dataframe` are always deleted; the handler only controls what happens (an error, a warning, or nothing) when some of the requested columns are absent.

    Params:
        dataframe (psDataFrame):
            The dataframe from which to delete the columns.
        columns (Union[str, str_collection]):
            The list of columns to delete. A single string is treated as one column name.
        missing_column_handler (Literal["raise", "warn", "pass"], optional):
            How to handle any columns which are missing from `#!py dataframe.columns`.

            If _any_ columns in `columns` are missing from `#!py dataframe.columns`, then the following will happen for each option:

            | Option | Result |
            |--------|--------|
            | `#!py "raise"` | An `#!py ColumnDoesNotExistError` exception will be raised
            | `#!py "warn"` | An `#!py ColumnDoesNotExistWarning` warning will be raised
            | `#!py "pass"` | Nothing will be raised

            Defaults to `#!py "pass"`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If `#!py missing_column_handler="raise"` and any of the `#!py columns` do not exist within `#!py dataframe.columns`.

    Returns:
        (psDataFrame):
            The updated `#!py dataframe`, with the columns listed in `#!py columns` having been removed.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.columns import delete_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [0, 1, 2, 3],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": ["c", "c", "c", "c"],
        ...             "d": ["d", "d", "d", "d"],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---+---+
        | a | b | c | d |
        +---+---+---+---+
        | 0 | a | c | d |
        | 1 | b | c | d |
        | 2 | c | c | d |
        | 3 | d | c | d |
        +---+---+---+---+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Single column"}
        >>> df.transform(delete_columns, "a").show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---+
        | b | c | d |
        +---+---+---+
        | a | c | d |
        | b | c | d |
        | c | c | d |
        | d | c | d |
        +---+---+---+
        ```
        !!! success "Conclusion: Success."
        </div>

        ```{.py .python linenums="1" title="Example 2: Multiple columns"}
        >>> df.transform(delete_columns, ["a", "b"]).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+
        | c | d |
        +---+---+
        | c | d |
        | c | d |
        | c | d |
        | c | d |
        +---+---+
        ```
        !!! success "Conclusion: Success."
        </div>

        ```{.py .python linenums="1" title="Example 3: Missing column, raises error"}
        >>> (
        ...     df.transform(
        ...         delete_columns,
        ...         columns=["a", "b", "z"],
        ...         missing_column_handler="raise",
        ...     )
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Columns ["z"] do not exist in "dataframe".
        Try one of: ["a", "b", "c", "d"]
        ```
        !!! success "Conclusion: Success."
        </div>

        ```{.py .python linenums="1" title="Example 4: Missing column, raises warning"}
        >>> (
        ...     df.transform(
        ...         delete_columns,
        ...         columns=["a", "b", "z"],
        ...         missing_column_handler="warn",
        ...     )
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistWarning: Columns missing from "dataframe": ["z"].
        Will still proceed to delete columns that do exist.
        ```
        ```{.txt .text title="Terminal"}
        +---+---+
        | c | d |
        +---+---+
        | c | d |
        | c | d |
        | c | d |
        | c | d |
        +---+---+
        ```
        !!! success "Conclusion: Success."
        </div>

        ```{.py .python linenums="1" title="Example 5: Missing column, nothing raised"}
        >>> (
        ...     df.transform(
        ...         delete_columns,
        ...         columns=["a", "b", "z"],
        ...         missing_column_handler="pass",
        ...     )
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+
        | c | d |
        +---+---+
        | c | d |
        | c | d |
        | c | d |
        | c | d |
        +---+---+
        ```
        !!! success "Conclusion: Success."
        </div>
    """
    # Normalise `columns` to a list of column names (handles single-string input
    # and any convenience aliases supported by `get_columns`).
    columns = get_columns(dataframe, columns)
    if missing_column_handler == "raise":
        # Fail fast when any requested column is absent.
        assert_columns_exists(dataframe=dataframe, columns=columns)
    elif missing_column_handler == "warn":
        # Emit a warning for absent columns, but still delete the ones that exist.
        warn_columns_missing(dataframe=dataframe, columns=columns)
    # `"pass"` needs no branch: missing columns are silently ignored by the
    # select below, which keeps only the columns NOT listed for deletion.
    return dataframe.select([col for col in dataframe.columns if col not in columns])