Coverage for src / ts_stat_tests / utils / data.py: 100%

77 statements  

« prev     ^ index     » next       coverage.py v7.13.2, created at 2026-02-01 09:48 +0000

1# ============================================================================ # 

2# # 

3# Title: Data Utilities # 

4# Purpose: Functions to load classic time series datasets. # 

5# # 

6# ============================================================================ # 

7 

8 

9# ---------------------------------------------------------------------------- # 

10# # 

11# Overview #### 

12# # 

13# ---------------------------------------------------------------------------- # 

14 

15 

16## --------------------------------------------------------------------------- # 

17## Description #### 

18## --------------------------------------------------------------------------- # 

19 

20 

21""" 

22!!! note "Summary" 

23 

24 This module contains utility functions to load classic time series datasets for testing and demonstration purposes. 

25 

26 It provides interfaces for both synthetic data generation (random numbers, sine waves, trends) and external data loading from common benchmarks. 

27""" 

28 

29 

30# ---------------------------------------------------------------------------- # 

31# # 

32# Setup #### 

33# # 

34# ---------------------------------------------------------------------------- # 

35 

36 

37## --------------------------------------------------------------------------- # 

38## Imports #### 

39## --------------------------------------------------------------------------- # 

40 

41 

42# ## Python StdLib Imports ---- 

43from functools import lru_cache 

44 

45# ## Python Third Party Imports ---- 

46import numpy as np 

47import pandas as pd 

48from numpy.random import Generator as RandomGenerator 

49from numpy.typing import NDArray 

50from stochastic.processes.noise import FractionalGaussianNoise 

51from typeguard import typechecked 

52 

53 

54## --------------------------------------------------------------------------- # 

55## Exports #### 

56## --------------------------------------------------------------------------- # 

57 

58 

# Public API of this module: the names exported by `from ts_stat_tests.utils.data import *`.
__all__: list[str] = [
    # Synthetic data generators
    "get_random_generator",
    "get_random_numbers",
    "get_random_numbers_2d",
    "get_sine_wave",
    "get_normal_curve",
    "get_straight_line",
    "get_trend_data",
    "get_uniform_data",
    "get_noise_data",
    # Loaders for external benchmark datasets
    "load_airline",
    "load_macrodata",
    # Module-level data objects built at import time
    "data_airline",
    "data_macrodata",
    "data_random",
    "data_random_2d",
    "data_sine",
    "data_normal",
    "data_line",
    "data_trend",
    "data_noise",
]

81 

82 

83## --------------------------------------------------------------------------- # 

84## Constants #### 

85## --------------------------------------------------------------------------- # 

86 

87 

# Seed passed to the seeded generator functions when the module-level data objects are built.
SEED: int = 42

89 

90 

91# ---------------------------------------------------------------------------- # 

92# # 

93# Data Generators #### 

94# # 

95# ---------------------------------------------------------------------------- # 

96 

97 

@lru_cache
@typechecked
def get_random_generator(seed: int) -> RandomGenerator:
    r"""
    !!! note "Summary"
        Builds a seeded NumPy random number generator so that downstream sampling is repeatable.

    ???+ abstract "Details"
        The generator is created with `numpy.random.default_rng`, which returns a `numpy.random.Generator` instance. This is the recommended way to generate random numbers in modern NumPy (v1.17+).

    ???+ warning "Cached instance"
        The function is wrapped in `functools.lru_cache`, so calling it again with the same `seed` hands back the *same* generator object rather than a freshly seeded one. Every draw advances that object's internal state, which means the numbers obtained from it depend on what has already been drawn through earlier calls.

    Params:
        seed (int):
            The seed value for the random number generator.

    Returns:
        (RandomGenerator):
            The NumPy random number generator associated with the given seed.

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_random_generator

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Creating a RandomGenerator"}
        >>> rng = get_random_generator(42)
        >>> print(rng is not None)
        True
        >>> print(type(rng))
        <class 'numpy.random._generator.Generator'>

        ```

    ??? question "References"
        1. [NumPy Random Generator](https://numpy.org/doc/stable/reference/random/generator.html)

    """
    generator: RandomGenerator = np.random.default_rng(seed)
    return generator

137 

138 

@lru_cache
@typechecked
def get_random_numbers(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Draws a reproducible 1D array of uniform random numbers.

    ???+ abstract "Details"
        Returns 1000 floating-point values sampled uniformly from the half-open interval $[0.0, 1.0)$ using `Generator.random`.

    ???+ warning "Shared generator"
        The generator is obtained from `get_random_generator`, which is cached per seed. The values drawn here therefore continue from wherever that generator's stream currently stands. The result of the first call for a given `seed` is itself cached and returned on later calls.

    Params:
        seed (int):
            The seed value for the random number generator.

    Returns:
        (NDArray[np.float64]):
            An array of random numbers with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_random_numbers
        >>> data = get_random_numbers(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Random Numbers"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0.77395605 0.43887844 0.85859792 0.69736803 0.09417735]

        ```

    ??? question "References"
        1. [NumPy Random Generator](https://numpy.org/doc/stable/reference/random/generator.html)

    """
    n_samples = 1000
    return get_random_generator(seed).random(size=n_samples)

183 

184 

@lru_cache
@typechecked
def get_random_numbers_2d(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Draws a reproducible 2D array of uniform random numbers.

    ???+ abstract "Details"
        Returns a matrix of shape $(4, 3000)$ whose entries are sampled uniformly with `Generator.random`.

    ???+ warning "Shared generator"
        The generator is obtained from `get_random_generator`, which is cached per seed. The values drawn here therefore continue from wherever that generator's stream currently stands. The result of the first call for a given `seed` is itself cached and returned on later calls.

    Params:
        seed (int):
            The seed value for the random number generator.

    Returns:
        (NDArray[np.float64]):
            A 2D array of random numbers with shape (4, 3000).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_random_numbers_2d
        >>> data = get_random_numbers_2d(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating 2D Random Numbers"}
        >>> print(data.shape)
        (4, 3000)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:, :5])
        [[0.06206311 0.45826204 0.12903006 0.15232671 0.63228281]
         [0.71609997 0.3571156  0.85186786 0.24097716 0.53839349]
         [0.74315144 0.90157433 0.59866347 0.52857443 0.89016256]
         [0.72072839 0.71123776 0.20269503 0.0366554  0.30379952]]

        ```

    ??? question "References"
        1. [NumPy Random Generator](https://numpy.org/doc/stable/reference/random/generator.html)

    """
    shape = (4, 3000)
    generator: RandomGenerator = get_random_generator(seed)
    return generator.random(size=shape)

232 

233 

@lru_cache
@typechecked
def get_sine_wave() -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Generates a sine wave dataset.

    ???+ abstract "Details"
        Produces a 1D array of 3000 samples of a sine wave with amplitude 1.0 and a period of 100 samples, i.e. 30 complete cycles:

        $$
        y_t = \sin\left(\frac{2 \pi t}{100}\right), \qquad t = 0, 1, \dots, 2999
        $$

        The result is cached by `functools.lru_cache`, so every call returns the same array object.

    Returns:
        (NDArray[np.float64]):
            An array representing a sine wave with shape (3000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_sine_wave
        >>> data = get_sine_wave()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating a Sine Wave"}
        >>> print(data.shape)
        (3000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0.         0.06279052 0.12533323 0.18738131 0.24868989]

        ```

    ??? question "References"
        1. [NumPy Trigonometric Functions](https://numpy.org/doc/stable/reference/routines.math.html#trigonometric-functions)

    """
    n_samples = 3000  # total number of points in the series
    period = 100  # samples per full cycle
    # Same arithmetic as before minus the no-op `* 1` factor, so the values are unchanged.
    return np.sin(2 * np.pi * np.arange(n_samples) / period)

273 

274 

@lru_cache
@typechecked
def get_normal_curve(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Draws a reproducible sample from the standard normal distribution.

    ???+ abstract "Details"
        Returns 1000 values sampled with `Generator.normal` using mean 0.0 and standard deviation 1.0.

    ???+ warning "Shared generator"
        The generator is obtained from `get_random_generator`, which is cached per seed. The values drawn here therefore continue from wherever that generator's stream currently stands. The result of the first call for a given `seed` is itself cached and returned on later calls.

    Params:
        seed (int):
            The seed value for the random number generator.

    Returns:
        (NDArray[np.float64]):
            An array of normally distributed values with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_normal_curve
        >>> data = get_normal_curve(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Normal Curve Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [ 0.12993113 -0.75691222 -0.33007356 -1.88579735 -0.37064992]

        ```

    ??? question "References"
        1. [NumPy Random Normal](https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.normal.html)

    """
    mean, std_dev, n_samples = 0.0, 1.0, 1000
    generator: RandomGenerator = get_random_generator(seed)
    return generator.normal(loc=mean, scale=std_dev, size=n_samples)

319 

320 

@lru_cache
@typechecked
def get_straight_line() -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Generates a straight line dataset.

    ???+ abstract "Details"
        Returns the values 0.0, 1.0, ..., 999.0 as a `float64` array, i.e. a line with slope 1 and intercept 0. The result is cached by `functools.lru_cache`, so every call returns the same array object.

    Returns:
        (NDArray[np.float64]):
            An array representing a straight line with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_straight_line
        >>> data = get_straight_line()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Straight Line Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0. 1. 2. 3. 4.]

        ```

    ??? question "References"
        1. [NumPy Arange](https://numpy.org/doc/stable/reference/generated/numpy.arange.html)

    """
    n_samples = 1000
    return np.arange(n_samples, dtype=np.float64)

360 

361 

@lru_cache
@typechecked
def get_trend_data() -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Generates trend data.

    ???+ abstract "Details"
        Adds a unit-slope ramp over the indices 0 to 999 to a copy of the same ramp scaled by 0.5. The sum is a linear trend that rises by 1.5 per step. The result is cached by `functools.lru_cache`, so every call returns the same array object.

    Returns:
        (NDArray[np.float64]):
            An array representing trend data with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_trend_data
        >>> data = get_trend_data()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Trend Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0.  1.5 3.  4.5 6. ]

        ```

    ??? question "References"
        1. [NumPy Arange](https://numpy.org/doc/stable/reference/generated/numpy.arange.html)

    """
    ramp = np.arange(1000)
    half_ramp = 0.5 * ramp
    return ramp + half_ramp

401 

402 

@lru_cache
@typechecked
def get_uniform_data(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Draws a reproducible sample from a uniform distribution.

    ???+ abstract "Details"
        Returns 1000 values sampled with `Generator.uniform` between a lower bound of 0.0 and an upper bound of 1.0.

    ???+ warning "Shared generator"
        The generator is obtained from `get_random_generator`, which is cached per seed. The values drawn here therefore continue from wherever that generator's stream currently stands. The result of the first call for a given `seed` is itself cached and returned on later calls.

    Params:
        seed (int):
            The seed value for the random number generator.

    Returns:
        (NDArray[np.float64]):
            An array of uniform random data with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_uniform_data
        >>> data = get_uniform_data(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Uniform Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0.80227457 0.81857128 0.87962986 0.11378193 0.29263938]

        ```

    ??? question "References"
        1. [NumPy Random Uniform](https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.uniform.html)

    """
    lower, upper, n_samples = 0.0, 1.0, 1000
    generator: RandomGenerator = get_random_generator(seed)
    return generator.uniform(low=lower, high=upper, size=n_samples)

447 

448 

@lru_cache
@typechecked
def get_noise_data(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Generates a reproducible sample of fractional Gaussian noise.

    ???+ abstract "Details"
        Builds a `FractionalGaussianNoise` process from the `stochastic` library with a Hurst exponent of 0.5 (effectively white noise) and requests 1000 samples from it.

    ???+ warning "Shared generator"
        The generator handed to the noise process comes from `get_random_generator`, which is cached per seed, so the same generator object is shared with every other function that uses that seed. The result of the first call for a given `seed` is itself cached and returned on later calls.

    Params:
        seed (int):
            The seed value for the random number generator.

    Returns:
        (NDArray[np.float64]):
            An array of fractional Gaussian noise data with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_noise_data
        >>> data = get_noise_data(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Noise Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [-0.05413957 -0.0007609  -0.00177524  0.00909899 -0.03044404]

        ```

    ??? question "References"
        1. [Stochastic Library](https://github.com/crflynn/stochastic)

    """
    noise_process = FractionalGaussianNoise(hurst=0.5, rng=get_random_generator(seed))
    return noise_process.sample(1000)

493 

494 

495# ---------------------------------------------------------------------------- # 

496# # 

497# Data Loaders #### 

498# # 

499# ---------------------------------------------------------------------------- # 

500 

501 

@lru_cache
@typechecked
def load_airline() -> pd.Series:
    r"""
    !!! note "Summary"
        Loads the classic Airline Passengers dataset as a pandas Series.

    ???+ abstract "Details"
        The Air Passengers dataset provides monthly totals of a US airline's international passengers from 1949 to 1960. It is a classic dataset for time series analysis, exhibiting both trend and seasonality.

        The CSV file is read over the network from the `sktime` GitHub repository. Its first column becomes the index, which is converted to a monthly `PeriodIndex` named `"Period"`, and the series is named `"Number of airline passengers"`. Because the function is wrapped in `functools.lru_cache`, a successful load is reused by later calls, which all receive the same Series object.

    Raises:
        (TypeError):
            If the downloaded data does not reduce to a `pandas.Series` after squeezing its columns.

    Returns:
        (pd.Series):
            The Airline Passengers dataset.

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import load_airline
        >>> data = load_airline()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Loading Airline Data"}
        >>> print(len(data))
        144
        >>> print(type(data))
        <class 'pandas.core.series.Series'>
        >>> print(data.head())
        Period
        1949-01    112.0
        1949-02    118.0
        1949-03    132.0
        1949-04    129.0
        1949-05    121.0
        Freq: M, Name: Number of airline passengers, dtype: float64

        ```

    ??? success "Credit"
        Inspiration from: [`sktime.datasets.load_airline()`](https://www.sktime.net/en/stable/api_reference/generated/sktime.datasets.load_airline.html)

    ??? question "References"
        1. Box, G. E. P., Jenkins, G. M., Reinsel, G. C., & Ljung, G. M. (2015). Time series analysis: forecasting and control. John Wiley & Sons.

    """
    url = "https://raw.githubusercontent.com/sktime/sktime/main/sktime/datasets/data/Airline/Airline.csv"
    frame = pd.read_csv(url, index_col=0, dtype={1: float})
    # Collapse the remaining value column so the result is a Series rather than a DataFrame.
    series = frame.squeeze("columns")
    if isinstance(series, pd.Series):
        series.index = pd.PeriodIndex(series.index, freq="M", name="Period")
        series.name = "Number of airline passengers"
        return series
    raise TypeError("Expected a pandas Series from the data source.")

555 

556 

@lru_cache
@typechecked
def load_macrodata() -> pd.DataFrame:
    r"""
    !!! note "Summary"
        Loads the classic Macrodata dataset as a pandas DataFrame.

    ???+ abstract "Details"
        This dataset contains various US macroeconomic time series from 1959Q1 to 2009Q3. It includes variables such as real GDP, consumption, investment, etc.

        The CSV file is read over the network from the `statsmodels` GitHub repository, with the `year` and `quarter` columns parsed as integers. Those two columns are joined into labels such as `1959Q1`, which form a quarterly `PeriodIndex` named `"Period"`. Because the function is wrapped in `functools.lru_cache`, a successful load is reused by later calls, which all receive the same DataFrame object.

    Returns:
        (pd.DataFrame):
            The Macrodata dataset.

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import load_macrodata
        >>> data = load_macrodata()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Loading Macrodata"}
        >>> print(data.shape)
        (203, 14)
        >>> print(type(data))
        <class 'pandas.core.frame.DataFrame'>
        >>> print(data[["year", "quarter", "realgdp"]].head())
                year  quarter   realgdp
        Period
        1959Q1  1959        1  2710.349
        1959Q2  1959        2  2778.801
        1959Q3  1959        3  2775.488
        1959Q4  1959        4  2785.204
        1960Q1  1960        1  2847.699

        ```

    ??? success "Credit"
        Inspiration from: [`statsmodels.datasets.macrodata.load_pandas()`](https://www.statsmodels.org/stable/datasets/generated/statsmodels.datasets.macrodata.macrodata.load_pandas.html)

    ??? question "References"
        1. R. F. Engle, D. F. Hendry, and J. F. Richard (1983). Exogeneity. Econometrica, 51(2):277–304.

    """
    url = "https://raw.githubusercontent.com/statsmodels/statsmodels/main/statsmodels/datasets/macrodata/macrodata.csv"
    frame: pd.DataFrame = pd.read_csv(url, index_col=None, dtype={"year": int, "quarter": int})
    # Build labels such as "1959Q1" from the integer year and quarter columns.
    period_labels = frame["year"].astype(str) + "Q" + frame["quarter"].astype(str)
    frame.index = pd.PeriodIndex(data=period_labels, freq="Q", name="Period")
    return frame

619 

620 

621# ---------------------------------------------------------------------------- # 

622# # 

623# Data Objects #### 

624# # 

625# ---------------------------------------------------------------------------- # 

626 

627 

# Ready-made data objects, built once when the module is imported.
# NOTE: the two loaders call `pd.read_csv` on remote URLs, so importing this module performs network I/O.
data_airline: pd.Series = load_airline()
data_macrodata: pd.DataFrame = load_macrodata()
# NOTE: the seeded helpers below (`get_random_numbers`, `get_random_numbers_2d`, `get_normal_curve`,
# `get_noise_data`) all obtain their generator from `get_random_generator(SEED)`, which is cached,
# so they share one generator object. The order of these statements therefore determines which
# part of the random stream each object receives -- do not reorder them without updating any
# expected values that depend on it.
data_random: NDArray[np.float64] = get_random_numbers(SEED)
data_random_2d: NDArray[np.float64] = get_random_numbers_2d(SEED)
data_sine: NDArray[np.float64] = get_sine_wave()
data_normal: NDArray[np.float64] = get_normal_curve(SEED)
data_line: NDArray[np.float64] = get_straight_line()
data_trend: NDArray[np.float64] = get_trend_data()
data_noise: NDArray[np.float64] = get_noise_data(SEED)