Coverage for src/toolbox_pyspark/formatting.py: 100%

1# ============================================================================ #

2# #

3# Title: Title #

4# Purpose: This module provides functions for formatting and displaying #

5# intermediary Spark DataFrames. #

6# #

7# ============================================================================ #

10# ---------------------------------------------------------------------------- #

11# #

12# Overview ####

13# #

14# ---------------------------------------------------------------------------- #

17# ---------------------------------------------------------------------------- #

18# Description ####

19# ---------------------------------------------------------------------------- #

22"""

23!!! note "Summary"

24 The `formatting` module provides functions for formatting and displaying intermediary Spark DataFrames.

25"""

28# ---------------------------------------------------------------------------- #

29# #

30# Setup ####

31# #

32# ---------------------------------------------------------------------------- #

35## --------------------------------------------------------------------------- #

36## Imports ####

37## --------------------------------------------------------------------------- #

40# ## Python Third Party Imports ----

41from pyspark.sql import DataFrame as psDataFrame, functions as F

42from toolbox_python.collection_types import str_list

43from typeguard import typechecked

46## --------------------------------------------------------------------------- #

47## Exports ####

48## --------------------------------------------------------------------------- #

# Public API of this module: the names exported by a star-import.
__all__: str_list = [
    "format_numbers",
    "display_intermediary_table",
    "display_intermediary_schema",
    "display_intermediary_columns",
]

59# ---------------------------------------------------------------------------- #

60# #

61# Main Section ####

62# #

63# ---------------------------------------------------------------------------- #

@typechecked
def format_numbers(dataframe: psDataFrame) -> psDataFrame:
    """
    !!! note "Summary"
        Format the numeric columns of a Spark DataFrame as readable strings.

    ???+ abstract "Details"
        Every column whose Spark type is `int`, `tinyint`, `smallint` or `bigint` is rendered with thousands separators and no decimal places. Every `float` or `double` column is rendered with thousands separators and exactly two decimal places. Each formatted column is replaced, in place, by a **string** column of the same name. Columns of any other type (for example `string` or `decimal(...)`) are left untouched.

        Column names are back-tick quoted when they are referenced, so names containing dots (such as `"amount.usd"`) are treated as plain column names rather than as nested-field paths.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame to format.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (psDataFrame):
            The formatted Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.formatting import format_numbers
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...             "e": [1000, 10000, 100000, 1000000],
        ...             "f": [1111.11, 22222.22, 333333.33, 4444444.44],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+---------+------------+
        | a | b | c   | d   | e       | f          |
        +---+---+-----+-----+---------+------------+
        | 1 | a | 1.0 | 1.1 | 1000    | 1111.11    |
        | 2 | b | 2.0 | 2.2 | 10000   | 22222.22   |
        | 3 | c | 3.0 | 3.3 | 100000  | 333333.33  |
        | 4 | d | 4.0 | 4.4 | 1000000 | 4444444.44 |
        +---+---+-----+-----+---------+------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Format Numbers by function"}
        >>> format_numbers(df).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+------+------+-----------+--------------+
        | a | b | c    | d    | e         | f            |
        +---+---+------+------+-----------+--------------+
        | 1 | a | 1.00 | 1.10 | 1,000     | 1,111.11     |
        | 2 | b | 2.00 | 2.20 | 10,000    | 22,222.22    |
        | 3 | c | 3.00 | 3.30 | 100,000   | 333,333.33   |
        | 4 | d | 4.00 | 4.40 | 1,000,000 | 4,444,444.44 |
        +---+---+------+------+-----------+--------------+
        ```
        !!! success "Conclusion: Successfully formatted dataframe."
        </div>

        ```{.py .python linenums="1" title="Example 2: Format Numbers by method"}
        >>> df.transform(format_numbers).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+------+------+-----------+--------------+
        | a | b | c    | d    | e         | f            |
        +---+---+------+------+-----------+--------------+
        | 1 | a | 1.00 | 1.10 | 1,000     | 1,111.11     |
        | 2 | b | 2.00 | 2.20 | 10,000    | 22,222.22    |
        | 3 | c | 3.00 | 3.30 | 100,000   | 333,333.33   |
        | 4 | d | 4.00 | 4.40 | 1,000,000 | 4,444,444.44 |
        +---+---+------+------+-----------+--------------+
        ```
        !!! success "Conclusion: Successfully formatted dataframe."
        </div>
    """
    for name, typ in dataframe.dtypes:
        # `dtypes` reports each column's type as Spark's short type name.
        if typ in ("int", "tinyint", "smallint", "bigint"):
            decimals = 0
        elif typ in ("float", "double"):
            decimals = 2
        else:
            # Not one of the handled numeric types: leave the column as it is.
            continue
        # Quote the name so that dots are not parsed as nested-field access;
        # a literal back-tick inside a quoted name is escaped by doubling it.
        quoted = "`" + name.replace("`", "``") + "`"
        dataframe = dataframe.withColumn(name, F.format_number(F.col(quoted), decimals))
    return dataframe

168

169

@typechecked
def display_intermediary_table(
    dataframe: psDataFrame, reformat_numbers: bool = True, num_rows: int = 20
) -> psDataFrame:
    """
    !!! note "Summary"
        Print a preview of a Spark DataFrame part-way through a chain of transformations.

    ???+ abstract "Details"
        The DataFrame is printed with `.show()` (cell values are not truncated) and then the *original* DataFrame is returned, so this function can be dropped into a `.transform(...)` chain without affecting later steps. When `reformat_numbers` is `True`, only the printed preview is passed through `format_numbers()`; the returned DataFrame keeps its original column types.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame to display.
        reformat_numbers (bool):
            Whether to format numbers in the printed preview. Default is `True`.
        num_rows (int):
            The number of rows to display. Default is `20`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (psDataFrame):
            The original Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.formatting import display_intermediary_table
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Display intermediary table with no subsequent formatting"}
        >>> (
        ...     df
        ...     .transform(display_intermediary_table, reformat_numbers=False, num_rows=2)
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        +---+---+-----+-----+
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary table with no subsequent formatting."
        </div>

        ```{.py .python linenums="1" title="Example 2: Display intermediary table with subsequent formatting"}
        >>> (
        ...     df
        ...     .transform(display_intermediary_table, reformat_numbers=True)
        ...     .withColumn("c", F.expr("c * 2"))
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+------+------+
        | a | b | c    | d    |
        +---+---+------+------+
        | 1 | a | 1.00 | 1.10 |
        | 2 | b | 2.00 | 2.20 |
        | 3 | c | 3.00 | 3.30 |
        | 4 | d | 4.00 | 4.40 |
        +---+---+------+------+
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 2.0 | 1.1 |
        | 2 | b | 4.0 | 2.2 |
        | 3 | c | 6.0 | 3.3 |
        | 4 | d | 8.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary table with subsequent formatting."
        </div>
    """
    # Only the printed preview is (optionally) reformatted; the input itself
    # is handed back unchanged so the surrounding chain keeps working on it.
    preview = dataframe.transform(format_numbers) if reformat_numbers else dataframe
    preview.show(n=num_rows, truncate=False)
    return dataframe

302

303

def display_intermediary_schema(dataframe: psDataFrame) -> psDataFrame:
    """
    !!! note "Summary"
        Print the schema of a Spark DataFrame part-way through a chain of transformations.

    ???+ abstract "Details"
        The schema is printed with `.printSchema()` and the same DataFrame is then returned unchanged, so this function can be dropped into a `.transform(...)` chain without affecting later steps.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame whose schema is displayed.

    Returns:
        (psDataFrame):
            The original Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.formatting import display_intermediary_schema
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        >>> df.printSchema()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        ```{.txt .text title="Terminal"}
        root
         |-- a: long (nullable = true)
         |-- b: string (nullable = true)
         |-- c: double (nullable = true)
         |-- d: double (nullable = true)
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Display intermediary schema"}
        >>> df.transform(display_intermediary_schema).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        root
         |-- a: long (nullable = true)
         |-- b: string (nullable = true)
         |-- c: double (nullable = true)
         |-- d: double (nullable = true)
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary schema."
        </div>

        ```{.py .python linenums="1" title="Example 2: Display intermediary schema, then continue the chain"}
        >>> df.transform(display_intermediary_schema).withColumn("e", F.expr("c * 2")).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        root
         |-- a: long (nullable = true)
         |-- b: string (nullable = true)
         |-- c: double (nullable = true)
         |-- d: double (nullable = true)
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+-----+
        | a | b | c   | d   | e   |
        +---+---+-----+-----+-----+
        | 1 | a | 1.0 | 1.1 | 2.0 |
        | 2 | b | 2.0 | 2.2 | 4.0 |
        | 3 | c | 3.0 | 3.3 | 6.0 |
        | 4 | d | 4.0 | 4.4 | 8.0 |
        +---+---+-----+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary schema."
        </div>
    """
    # NOTE(review): this function is not decorated with `@typechecked`, so the
    # argument type is not validated here - confirm whether that is intended.
    dataframe.printSchema()  # side effect only: prints the schema to stdout
    return dataframe

417

418

def display_intermediary_columns(dataframe: psDataFrame) -> psDataFrame:
    """
    !!! note "Summary"
        Print the column names of a Spark DataFrame part-way through a chain of transformations.

    ???+ abstract "Details"
        The list of column names (`dataframe.columns`) is printed and the same DataFrame is then returned unchanged, so this function can be dropped into a `.transform(...)` chain without affecting later steps.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame whose columns are displayed.

    Returns:
        (psDataFrame):
            The original Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.formatting import display_intermediary_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Display intermediary columns"}
        >>> df.transform(display_intermediary_columns).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ['a', 'b', 'c', 'd']
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary columns."
        </div>

        ```{.py .python linenums="1" title="Example 2: Display intermediary columns, then continue the chain"}
        >>> df.transform(display_intermediary_columns).withColumn("e", F.expr("c * 2")).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ['a', 'b', 'c', 'd']
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+-----+
        | a | b | c   | d   | e   |
        +---+---+-----+-----+-----+
        | 1 | a | 1.0 | 1.1 | 2.0 |
        | 2 | b | 2.0 | 2.2 | 4.0 |
        | 3 | c | 3.0 | 3.3 | 6.0 |
        | 4 | d | 4.0 | 4.4 | 8.0 |
        +---+---+-----+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary columns."
        </div>
    """
    # NOTE(review): this function is not decorated with `@typechecked`, so the
    # argument type is not validated here - confirm whether that is intended.
    column_names = dataframe.columns
    print(column_names)  # side effect only: prints the list of names to stdout
    return dataframe