Coverage for src/toolbox_pyspark/formatting.py: 100%

24 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-25 23:08 +0000

1# ============================================================================ # 

2# # 

3# Title: Formatting #

4# Purpose: This module provides functions for formatting and displaying # 

5# intermediary Spark DataFrames. # 

6# # 

7# ============================================================================ # 

8 

9 

10# ---------------------------------------------------------------------------- # 

11# # 

12# Overview #### 

13# # 

14# ---------------------------------------------------------------------------- # 

15 

16 

17# ---------------------------------------------------------------------------- # 

18# Description #### 

19# ---------------------------------------------------------------------------- # 

20 

21 

22""" 

23!!! note "Summary" 

24 The `formatting` module provides functions for formatting and displaying intermediary Spark DataFrames.

25""" 

26 

27 

28# ---------------------------------------------------------------------------- # 

29# # 

30# Setup #### 

31# # 

32# ---------------------------------------------------------------------------- # 

33 

34 

35## --------------------------------------------------------------------------- # 

36## Imports #### 

37## --------------------------------------------------------------------------- # 

38 

39 

40# ## Python Third Party Imports ---- 

41from pyspark.sql import DataFrame as psDataFrame, functions as F 

42from toolbox_python.collection_types import str_list 

43from typeguard import typechecked 

44 

45 

46## --------------------------------------------------------------------------- # 

47## Exports #### 

48## --------------------------------------------------------------------------- # 

49 

50 

# Public API of this module: the names exposed by `from toolbox_pyspark.formatting import *`.
__all__: str_list = [
    "format_numbers",
    "display_intermediary_table",
    "display_intermediary_schema",
    "display_intermediary_columns",
]

57 

58 

59# ---------------------------------------------------------------------------- # 

60# # 

61# Main Section #### 

62# # 

63# ---------------------------------------------------------------------------- # 

64 

65 

@typechecked
def format_numbers(dataframe: psDataFrame) -> psDataFrame:
    """
    !!! note "Summary"
        Render the integer and floating-point columns of a Spark DataFrame as human-readable strings.

    ??? abstract "Details"
        Every column whose Spark dtype is `tinyint`, `smallint`, `int` or `bigint` is replaced by `format_number(column, 0)`, and every `float` or `double` column is replaced by `format_number(column, 2)`. The replaced columns therefore become `string` columns containing thousands separators (for example `1,000,000` or `4,444,444.44`).

        Columns of any other dtype are passed through untouched. Note that this includes `decimal(p,s)` columns, whose dtype string does not match any of the names above.

        All replacements are applied in a single projection where the installed PySpark provides `DataFrame.withColumns`; otherwise the function falls back to one `withColumn` call per column. Column order and column names are preserved in both cases.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame to format.

    Raises:
        TypeError:
            If `dataframe` is not of the annotated type. The check is performed by the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator, so the exact exception class depends on the installed `typeguard` version.

    Returns:
        (psDataFrame):
            A DataFrame with the same columns, in the same order, where the integer and float/double columns have been converted to formatted strings. If there is nothing to format, the input DataFrame is returned as-is.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.formatting import format_numbers
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...             "e": [1000, 10000, 100000, 1000000],
        ...             "f": [1111.11, 22222.22, 333333.33, 4444444.44],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+---------+------------+
        | a | b | c   | d   | e       | f          |
        +---+---+-----+-----+---------+------------+
        | 1 | a | 1.0 | 1.1 | 1000    | 1111.11    |
        | 2 | b | 2.0 | 2.2 | 10000   | 22222.22   |
        | 3 | c | 3.0 | 3.3 | 100000  | 333333.33  |
        | 4 | d | 4.0 | 4.4 | 1000000 | 4444444.44 |
        +---+---+-----+-----+---------+------------+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Format Numbers by function"}
        >>> format_numbers(df).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+------+------+-----------+--------------+
        | a | b | c    | d    | e         | f            |
        +---+---+------+------+-----------+--------------+
        | 1 | a | 1.00 | 1.10 | 1,000     | 1,111.11     |
        | 2 | b | 2.00 | 2.20 | 10,000    | 22,222.22    |
        | 3 | c | 3.00 | 3.30 | 100,000   | 333,333.33   |
        | 4 | d | 4.00 | 4.40 | 1,000,000 | 4,444,444.44 |
        +---+---+------+------+-----------+--------------+
        ```
        !!! success "Conclusion: Successfully formatted dataframe."
        </div>

        ```{.py .python linenums="1" title="Example 2: Format Numbers by method"}
        >>> df.transform(format_numbers).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+------+------+-----------+--------------+
        | a | b | c    | d    | e         | f            |
        +---+---+------+------+-----------+--------------+
        | 1 | a | 1.00 | 1.10 | 1,000     | 1,111.11     |
        | 2 | b | 2.00 | 2.20 | 10,000    | 22,222.22    |
        | 3 | c | 3.00 | 3.30 | 100,000   | 333,333.33   |
        | 4 | d | 4.00 | 4.40 | 1,000,000 | 4,444,444.44 |
        +---+---+------+------+-----------+--------------+
        ```
        !!! success "Conclusion: Successfully formatted dataframe."
        </div>
    """
    integer_types = ("int", "tinyint", "smallint", "bigint")
    fractional_types = ("float", "double")

    # Collect every replacement first so they can be applied in one projection.
    formatted = {}
    for name, dtype in dataframe.dtypes:
        if dtype in integer_types:
            decimals = 0
        elif dtype in fractional_types:
            decimals = 2
        else:
            continue
        # Backtick-quote the reference so that a name containing `.` (or a
        # backtick) is resolved as a column name, not parsed as a field path.
        quoted = "`" + name.replace("`", "``") + "`"
        formatted[name] = F.format_number(F.col(quoted), decimals)

    if not formatted:
        return dataframe

    # `withColumns` adds a single projection; chaining `withColumn` in a loop
    # adds one projection per column and can produce very large plans.
    if hasattr(dataframe, "withColumns"):
        return dataframe.withColumns(formatted)

    # Fallback for PySpark versions without `DataFrame.withColumns`.
    for name, column in formatted.items():
        dataframe = dataframe.withColumn(name, column)
    return dataframe

168 

169 

@typechecked
def display_intermediary_table(
    dataframe: psDataFrame, reformat_numbers: bool = True, num_rows: int = 20
) -> psDataFrame:
    """
    !!! note "Summary"
        Print an intermediary Spark DataFrame and hand the original back for further chaining.

    ???+ abstract "Details"
        This function is meant to be dropped into the middle of a chain of `.transform()` calls. It prints up to `num_rows` rows of `dataframe` without truncating cell values, and then returns `dataframe` itself.

        When `reformat_numbers` is `True`, the printed table is first passed through `format_numbers()`. That formatting only affects what is printed: the returned DataFrame is the unmodified input, so its numeric columns keep their numeric types.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame to display.
        reformat_numbers (bool):
            Whether to pass the displayed table through `format_numbers()` before printing. Default is `True`.
        num_rows (int):
            The number of rows to display. Default is `20`.

    Raises:
        TypeError:
            If any of the inputs are not of the annotated types. The check is performed by the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator, so the exact exception class depends on the installed `typeguard` version.

    Returns:
        (psDataFrame):
            The original, unformatted Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.formatting import display_intermediary_table
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Display the first rows without number formatting"}
        >>> (
        ...     df
        ...     .transform(display_intermediary_table, reformat_numbers=False, num_rows=2)
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        +---+---+-----+-----+
        only showing top 2 rows
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary table with no number formatting."
        </div>

        ```{.py .python linenums="1" title="Example 2: Display with number formatting, then keep transforming"}
        >>> (
        ...     df
        ...     .transform(display_intermediary_table, reformat_numbers=True)
        ...     .withColumn("c", F.expr("c * 2"))
        ...     .show()
        ... )
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+------+------+
        | a | b | c    | d    |
        +---+---+------+------+
        | 1 | a | 1.00 | 1.10 |
        | 2 | b | 2.00 | 2.20 |
        | 3 | c | 3.00 | 3.30 |
        | 4 | d | 4.00 | 4.40 |
        +---+---+------+------+
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 2.0 | 1.1 |
        | 2 | b | 4.0 | 2.2 |
        | 3 | c | 6.0 | 3.3 |
        | 4 | d | 8.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: The formatted table was displayed, and the returned DataFrame kept its numeric columns for the subsequent transformation."
        </div>
    """
    # Only the printed view is (optionally) formatted; the input is returned
    # untouched so that downstream steps still see the numeric columns.
    view = format_numbers(dataframe) if reformat_numbers else dataframe
    view.show(n=num_rows, truncate=False)
    return dataframe

302 

303 

def display_intermediary_schema(dataframe: psDataFrame) -> psDataFrame:
    """
    !!! note "Summary"
        Print the schema of an intermediary Spark DataFrame and hand the DataFrame back for further chaining.

    ??? abstract "Details"
        This function is meant to be dropped into the middle of a chain of `.transform()` calls. It calls `dataframe.printSchema()` and then returns `dataframe` itself, so the schema that is printed is the schema at that point of the chain.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame whose schema should be displayed.

    Returns:
        (psDataFrame):
            The original Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.formatting import display_intermediary_schema
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        >>> df.printSchema()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        ```{.txt .text title="Terminal"}
        root
         |-- a: long (nullable = true)
         |-- b: string (nullable = true)
         |-- c: double (nullable = true)
         |-- d: double (nullable = true)
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Display intermediary schema"}
        >>> df.transform(display_intermediary_schema).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        root
         |-- a: long (nullable = true)
         |-- b: string (nullable = true)
         |-- c: double (nullable = true)
         |-- d: double (nullable = true)
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary schema."
        </div>

        ```{.py .python linenums="1" title="Example 2: Display intermediary schema, then keep transforming"}
        >>> df.transform(display_intermediary_schema).withColumn("e", F.expr("c * 2")).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        root
         |-- a: long (nullable = true)
         |-- b: string (nullable = true)
         |-- c: double (nullable = true)
         |-- d: double (nullable = true)
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+-----+
        | a | b | c   | d   | e   |
        +---+---+-----+-----+-----+
        | 1 | a | 1.0 | 1.1 | 2.0 |
        | 2 | b | 2.0 | 2.2 | 4.0 |
        | 3 | c | 3.0 | 3.3 | 6.0 |
        | 4 | d | 4.0 | 4.4 | 8.0 |
        +---+---+-----+-----+-----+
        ```
        !!! success "Conclusion: The schema was displayed before column `e` was added, and the chain continued on the returned DataFrame."
        </div>
    """
    # Bind the printer first, then invoke it; the DataFrame itself is what
    # gets returned so the call can sit inside a `.transform()` chain.
    print_schema = dataframe.printSchema
    print_schema()
    return dataframe

417 

418 

def display_intermediary_columns(dataframe: psDataFrame) -> psDataFrame:
    """
    !!! note "Summary"
        Print the column names of an intermediary Spark DataFrame and hand the DataFrame back for further chaining.

    ??? abstract "Details"
        This function is meant to be dropped into the middle of a chain of `.transform()` calls. It prints `dataframe.columns` and then returns `dataframe` itself, so the names that are printed are the columns at that point of the chain.

    Params:
        dataframe (psDataFrame):
            The Spark DataFrame whose columns should be displayed.

    Returns:
        (psDataFrame):
            The original Spark DataFrame.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set Up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession, functions as F
        >>> from toolbox_pyspark.formatting import display_intermediary_columns
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1.0, 2.0, 3.0, 4.0],
        ...             "d": [1.1, 2.2, 3.3, 4.4],
        ...         }
        ...     )
        ... )
        >>>
        >>> # Check
        >>> df.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Display intermediary columns"}
        >>> df.transform(display_intermediary_columns).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ['a', 'b', 'c', 'd']
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+
        | a | b | c   | d   |
        +---+---+-----+-----+
        | 1 | a | 1.0 | 1.1 |
        | 2 | b | 2.0 | 2.2 |
        | 3 | c | 3.0 | 3.3 |
        | 4 | d | 4.0 | 4.4 |
        +---+---+-----+-----+
        ```
        !!! success "Conclusion: Successfully displayed intermediary columns."
        </div>

        ```{.py .python linenums="1" title="Example 2: Display intermediary columns, then keep transforming"}
        >>> df.transform(display_intermediary_columns).withColumn("e", F.expr("c * 2")).show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ['a', 'b', 'c', 'd']
        ```
        ```{.txt .text title="Terminal"}
        +---+---+-----+-----+-----+
        | a | b | c   | d   | e   |
        +---+---+-----+-----+-----+
        | 1 | a | 1.0 | 1.1 | 2.0 |
        | 2 | b | 2.0 | 2.2 | 4.0 |
        | 3 | c | 3.0 | 3.3 | 6.0 |
        | 4 | d | 4.0 | 4.4 | 8.0 |
        +---+---+-----+-----+-----+
        ```
        !!! success "Conclusion: The columns were displayed before column `e` was added, and the chain continued on the returned DataFrame."
        </div>
    """
    # Read the names once, print them, and return the input unchanged so the
    # call can sit inside a `.transform()` chain.
    column_names = dataframe.columns
    print(column_names)
    return dataframe