Coverage for src / ts_stat_tests / utils / data.py: 100%
77 statements
« prev ^ index » next coverage.py v7.13.2, created at 2026-02-01 09:48 +0000
« prev ^ index » next coverage.py v7.13.2, created at 2026-02-01 09:48 +0000
1# ============================================================================ #
2# #
3# Title: Data Utilities #
4# Purpose: Functions to load classic time series datasets. #
5# #
6# ============================================================================ #
9# ---------------------------------------------------------------------------- #
10# #
11# Overview ####
12# #
13# ---------------------------------------------------------------------------- #
16## --------------------------------------------------------------------------- #
17## Description ####
18## --------------------------------------------------------------------------- #
21"""
22!!! note "Summary"
24 This module contains utility functions to load classic time series datasets for testing and demonstration purposes.
26 It provides interfaces for both synthetic data generation (random numbers, sine waves, trends) and external data loading from common benchmarks.
27"""
30# ---------------------------------------------------------------------------- #
31# #
32# Setup ####
33# #
34# ---------------------------------------------------------------------------- #
37## --------------------------------------------------------------------------- #
38## Imports ####
39## --------------------------------------------------------------------------- #
42# ## Python StdLib Imports ----
43from functools import lru_cache
45# ## Python Third Party Imports ----
46import numpy as np
47import pandas as pd
48from numpy.random import Generator as RandomGenerator
49from numpy.typing import NDArray
50from stochastic.processes.noise import FractionalGaussianNoise
51from typeguard import typechecked
54## --------------------------------------------------------------------------- #
55## Exports ####
56## --------------------------------------------------------------------------- #
# Public API of this module, grouped by role. The order of the names is kept
# exactly as before: generator helpers, dataset loaders, then the ready-made
# module-level data objects.
__all__: list[str] = [
    # Synthetic data generators
    "get_random_generator",
    "get_random_numbers",
    "get_random_numbers_2d",
    "get_sine_wave",
    "get_normal_curve",
    "get_straight_line",
    "get_trend_data",
    "get_uniform_data",
    "get_noise_data",
    # External dataset loaders
    "load_airline",
    "load_macrodata",
    # Pre-built data objects
    "data_airline",
    "data_macrodata",
    "data_random",
    "data_random_2d",
    "data_sine",
    "data_normal",
    "data_line",
    "data_trend",
    "data_noise",
]
83## --------------------------------------------------------------------------- #
84## Constants ####
85## --------------------------------------------------------------------------- #
# Default seed used for every seeded module-level data object in this module.
SEED: int = 42
91# ---------------------------------------------------------------------------- #
92# #
93# Data Generators ####
94# #
95# ---------------------------------------------------------------------------- #
@lru_cache
@typechecked
def get_random_generator(seed: int) -> RandomGenerator:
    r"""
    !!! note "Summary"
        Build a seeded NumPy random `Generator` so that random draws can be reproduced.

    ???+ abstract "Details"
        The generator is created with `numpy.random.default_rng(seed)`.

        Because this function is wrapped in `lru_cache`, repeated calls with the same `seed` return the *same* generator object rather than a freshly seeded one. Its internal state is therefore shared: every draw made through it advances the stream seen by the next caller that asks for the same seed.

    Params:
        seed (int):
            The seed handed to `numpy.random.default_rng`.

    Returns:
        (RandomGenerator):
            The (cached) NumPy generator associated with `seed`.

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_random_generator

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Creating a RandomGenerator"}
        >>> rng = get_random_generator(42)
        >>> print(rng is not None)
        True
        >>> print(type(rng))
        <class 'numpy.random._generator.Generator'>

        ```

    ??? question "References"
        1. [NumPy Random Generator](https://numpy.org/doc/stable/reference/random/generator.html)

    """
    generator: RandomGenerator = np.random.default_rng(seed)
    return generator
@lru_cache
@typechecked
def get_random_numbers(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Return a 1D array of seeded uniform random numbers.

    ???+ abstract "Details"
        Draws 1000 floating-point values from the half-open interval $[0.0, 1.0)$ using the generator returned by `get_random_generator(seed)`.

        That generator is cached and shared per seed, so the values drawn on the first call depend on how much of its stream has already been consumed. The result is itself cached by `lru_cache`: later calls with the same `seed` return the same array object, so copy it before modifying it in place.

    Params:
        seed (int):
            The seed identifying which random generator to draw from.

    Returns:
        (NDArray[np.float64]):
            Random numbers with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_random_numbers
        >>> data = get_random_numbers(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Random Numbers"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0.77395605 0.43887844 0.85859792 0.69736803 0.09417735]

        ```

    ??? question "References"
        1. [NumPy Random Generator](https://numpy.org/doc/stable/reference/random/generator.html)

    """
    generator: RandomGenerator = get_random_generator(seed)
    samples = generator.random(size=1000)
    return samples
@lru_cache
@typechecked
def get_random_numbers_2d(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Return a 2D array of seeded uniform random numbers.

    ???+ abstract "Details"
        Draws a matrix of shape $(4, 3000)$ of uniform random values using the generator returned by `get_random_generator(seed)`.

        That generator is cached and shared per seed, so the values drawn on the first call depend on how much of its stream has already been consumed. The result is itself cached by `lru_cache`: later calls with the same `seed` return the same array object, so copy it before modifying it in place.

    Params:
        seed (int):
            The seed identifying which random generator to draw from.

    Returns:
        (NDArray[np.float64]):
            Random numbers with shape (4, 3000).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_random_numbers_2d
        >>> data = get_random_numbers_2d(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating 2D Random Numbers"}
        >>> print(data.shape)
        (4, 3000)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:, :5])
        [[0.06206311 0.45826204 0.12903006 0.15232671 0.63228281]
         [0.71609997 0.3571156  0.85186786 0.24097716 0.53839349]
         [0.74315144 0.90157433 0.59866347 0.52857443 0.89016256]
         [0.72072839 0.71123776 0.20269503 0.0366554  0.30379952]]

        ```

    ??? question "References"
        1. [NumPy Random Generator](https://numpy.org/doc/stable/reference/random/generator.html)

    """
    generator: RandomGenerator = get_random_generator(seed)
    matrix = generator.random(size=(4, 3000))
    return matrix
@lru_cache
@typechecked
def get_sine_wave() -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Generates a sine wave dataset.

    ???+ abstract "Details"
        Produces a 1D array of 3000 samples of a sine wave with amplitude 1.0 and a period of 100 samples, i.e. 30 complete cycles:

        $$
        y_t = \sin\left(\frac{2 \pi t}{100}\right), \quad t = 0, 1, \ldots, 2999
        $$

        The result is cached by `lru_cache`, so every call returns the same array object; copy it before modifying it in place.

    Returns:
        (NDArray[np.float64]):
            An array representing a sine wave with shape (3000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_sine_wave
        >>> data = get_sine_wave()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating a Sine Wave"}
        >>> print(data.shape)
        (3000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0.         0.06279052 0.12533323 0.18738131 0.24868989]

        ```

    ??? question "References"
        1. [NumPy Trigonometric Functions](https://numpy.org/doc/stable/reference/routines.math.html#trigonometric-functions)

    """
    n_samples = 3000  # total number of points in the series
    period = 100  # samples per full cycle
    cycles_per_period = 1  # frequency multiplier

    # The order of operations is kept as `2 * pi * f * t / period` so that the
    # floating-point results match the previous implementation bit for bit.
    return np.sin(2 * np.pi * cycles_per_period * np.arange(n_samples) / period)
@lru_cache
@typechecked
def get_normal_curve(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Return seeded samples from a standard normal distribution.

    ???+ abstract "Details"
        Draws 1000 samples from a Gaussian with mean 0.0 and standard deviation 1.0 using the generator returned by `get_random_generator(seed)`.

        That generator is cached and shared per seed, so the values drawn on the first call depend on how much of its stream has already been consumed. The result is itself cached by `lru_cache`: later calls with the same `seed` return the same array object, so copy it before modifying it in place.

    Params:
        seed (int):
            The seed identifying which random generator to draw from.

    Returns:
        (NDArray[np.float64]):
            Normally distributed samples with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_normal_curve
        >>> data = get_normal_curve(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Normal Curve Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [ 0.12993113 -0.75691222 -0.33007356 -1.88579735 -0.37064992]

        ```

    ??? question "References"
        1. [NumPy Random Normal](https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.normal.html)

    """
    generator: RandomGenerator = get_random_generator(seed)
    samples = generator.normal(loc=0.0, scale=1.0, size=1000)
    return samples
@lru_cache
@typechecked
def get_straight_line() -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Return a straight-line dataset.

    ???+ abstract "Details"
        Produces the values 0.0, 1.0, ..., 999.0 as a `float64` array.

        The result is cached by `lru_cache`, so every call returns the same array object; copy it before modifying it in place.

    Returns:
        (NDArray[np.float64]):
            An array representing a straight line with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_straight_line
        >>> data = get_straight_line()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Straight Line Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0. 1. 2. 3. 4.]

        ```

    ??? question "References"
        1. [NumPy Arange](https://numpy.org/doc/stable/reference/generated/numpy.arange.html)

    """
    line = np.arange(1000, dtype=np.float64)
    return line
@lru_cache
@typechecked
def get_trend_data() -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Return a dataset with a linear trend.

    ???+ abstract "Details"
        Adds a ramp `0, 1, ..., 999` to half of the same ramp, giving a straight line with slope 1.5 that starts at 0.0.

        The result is cached by `lru_cache`, so every call returns the same array object; copy it before modifying it in place.

    Returns:
        (NDArray[np.float64]):
            An array representing trend data with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_trend_data
        >>> data = get_trend_data()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Trend Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0.  1.5 3.  4.5 6. ]

        ```

    ??? question "References"
        1. [NumPy Arange](https://numpy.org/doc/stable/reference/generated/numpy.arange.html)

    """
    ramp = np.arange(1000)
    half_ramp = 0.5 * ramp
    return ramp + half_ramp
@lru_cache
@typechecked
def get_uniform_data(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Return seeded samples from a uniform distribution.

    ???+ abstract "Details"
        Draws 1000 samples uniformly between 0.0 and 1.0 using the generator returned by `get_random_generator(seed)`.

        That generator is cached and shared per seed, so the values drawn on the first call depend on how much of its stream has already been consumed. The result is itself cached by `lru_cache`: later calls with the same `seed` return the same array object, so copy it before modifying it in place.

    Params:
        seed (int):
            The seed identifying which random generator to draw from.

    Returns:
        (NDArray[np.float64]):
            Uniformly distributed samples with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_uniform_data
        >>> data = get_uniform_data(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Uniform Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [0.80227457 0.81857128 0.87962986 0.11378193 0.29263938]

        ```

    ??? question "References"
        1. [NumPy Random Uniform](https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.uniform.html)

    """
    generator: RandomGenerator = get_random_generator(seed)
    samples = generator.uniform(low=0.0, high=1.0, size=1000)
    return samples
@lru_cache
@typechecked
def get_noise_data(seed: int) -> NDArray[np.float64]:
    r"""
    !!! note "Summary"
        Return seeded fractional Gaussian noise.

    ???+ abstract "Details"
        Builds a `FractionalGaussianNoise` process from the `stochastic` library with a Hurst exponent of 0.5 (effectively white noise) and samples 1000 values from it. The process is driven by the generator returned by `get_random_generator(seed)`.

        That generator is cached and shared per seed, so the values drawn on the first call depend on how much of its stream has already been consumed. The result is itself cached by `lru_cache`: later calls with the same `seed` return the same array object, so copy it before modifying it in place.

    Params:
        seed (int):
            The seed identifying which random generator to draw from.

    Returns:
        (NDArray[np.float64]):
            Fractional Gaussian noise samples with shape (1000,).

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import get_noise_data
        >>> data = get_noise_data(42)

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Generating Noise Data"}
        >>> print(data.shape)
        (1000,)
        >>> print(type(data))
        <class 'numpy.ndarray'>
        >>> print(data.dtype)
        float64
        >>> print(data[:5])
        [-0.05413957 -0.0007609  -0.00177524  0.00909899 -0.03044404]

        ```

    ??? question "References"
        1. [Stochastic Library](https://github.com/crflynn/stochastic)

    """
    generator: RandomGenerator = get_random_generator(seed)
    noise_process = FractionalGaussianNoise(hurst=0.5, rng=generator)
    return noise_process.sample(1000)
495# ---------------------------------------------------------------------------- #
496# #
497# Data Loaders ####
498# #
499# ---------------------------------------------------------------------------- #
@lru_cache
@typechecked
def load_airline() -> pd.Series:
    r"""
    !!! note "Summary"
        Load the classic Airline Passengers dataset as a pandas Series.

    ???+ abstract "Details"
        The Air Passengers dataset provides monthly totals of a US airline's international passengers from 1949 to 1960. It is a classic dataset for time series analysis, exhibiting both trend and seasonality.

        The CSV file is read from the `sktime` GitHub repository, so network access is required on the first call. The first column becomes a monthly `PeriodIndex` named `Period`, and the series is named `Number of airline passengers`. The result is cached by `lru_cache`, so every call returns the same Series object.

    Raises:
        (TypeError):
            If the downloaded data does not reduce to a single pandas Series.

    Returns:
        (pd.Series):
            The Airline Passengers dataset.

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import load_airline
        >>> data = load_airline()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Loading Airline Data"}
        >>> print(len(data))
        144
        >>> print(type(data))
        <class 'pandas.core.series.Series'>
        >>> print(data.head())
        Period
        1949-01    112.0
        1949-02    118.0
        1949-03    132.0
        1949-04    129.0
        1949-05    121.0
        Freq: M, Name: Number of airline passengers, dtype: float64

        ```

    ??? success "Credit"
        Inspiration from: [`sktime.datasets.load_airline()`](https://www.sktime.net/en/stable/api_reference/generated/sktime.datasets.load_airline.html)

    ??? question "References"
        1. Box, G. E. P., Jenkins, G. M., Reinsel, G. C., & Ljung, G. M. (2015). Time series analysis: forecasting and control. John Wiley & Sons.

    """
    data_source = "https://raw.githubusercontent.com/sktime/sktime/main/sktime/datasets/data/Airline/Airline.csv"
    raw_frame = pd.read_csv(data_source, index_col=0, dtype={1: float})

    # Collapse the single remaining column into a Series.
    squeezed = raw_frame.squeeze("columns")
    if isinstance(squeezed, pd.Series):
        series: pd.Series = squeezed
        series.index = pd.PeriodIndex(series.index, freq="M", name="Period")
        series.name = "Number of airline passengers"
        return series

    raise TypeError("Expected a pandas Series from the data source.")
@lru_cache
@typechecked
def load_macrodata() -> pd.DataFrame:
    r"""
    !!! note "Summary"
        Load the classic Macrodata dataset as a pandas DataFrame.

    ???+ abstract "Details"
        This dataset contains various US macroeconomic time series from 1959Q1 to 2009Q3. It includes variables such as real GDP, consumption, investment, etc.

        The CSV file is read from the `statsmodels` GitHub repository, so network access is required on the first call. The `year` and `quarter` columns are read as integers and combined into labels such as `1959Q1`, which become a quarterly `PeriodIndex` named `Period`. The result is cached by `lru_cache`, so every call returns the same DataFrame object.

    Returns:
        (pd.DataFrame):
            The Macrodata dataset.

    ???+ example "Examples"

        ```pycon {.py .python linenums="1" title="Setup"}
        >>> from ts_stat_tests.utils.data import load_macrodata
        >>> data = load_macrodata()

        ```

        ```pycon {.py .python linenums="1" title="Example 1: Loading Macrodata"}
        >>> print(data.shape)
        (203, 14)
        >>> print(type(data))
        <class 'pandas.core.frame.DataFrame'>
        >>> print(data[["year", "quarter", "realgdp"]].head())
                year  quarter   realgdp
        Period
        1959Q1  1959        1  2710.349
        1959Q2  1959        2  2778.801
        1959Q3  1959        3  2775.488
        1959Q4  1959        4  2785.204
        1960Q1  1960        1  2847.699

        ```

    ??? success "Credit"
        Inspiration from: [`statsmodels.datasets.macrodata.load_pandas()`](https://www.statsmodels.org/stable/datasets/generated/statsmodels.datasets.macrodata.macrodata.load_pandas.html)

    ??? question "References"
        1. R. F. Engle, D. F. Hendry, and J. F. Richard (1983). Exogeneity. Econometrica, 51(2):277–304.

    """
    data_source = (
        "https://raw.githubusercontent.com/statsmodels/statsmodels/main/statsmodels/datasets/macrodata/macrodata.csv"
    )
    integer_columns = {
        "year": int,
        "quarter": int,
    }
    frame: pd.DataFrame = pd.read_csv(data_source, index_col=None, dtype=integer_columns)

    # Build labels like "1959Q1" from the year and quarter columns.
    period_labels = frame["year"].astype(str) + "Q" + frame["quarter"].astype(str)
    frame.index = pd.PeriodIndex(data=period_labels, freq="Q", name="Period")
    return frame
621# ---------------------------------------------------------------------------- #
622# #
623# Data Objects ####
624# #
625# ---------------------------------------------------------------------------- #
# Ready-made datasets, built once when this module is imported.
#
# NOTE: The order of these assignments matters. Every seeded helper draws from
# the single cached generator returned by `get_random_generator(SEED)`, so each
# call advances the random stream used by the next one. Reordering the seeded
# lines would change the resulting values.
#
# NOTE: The two loaders read their CSV files from GitHub, so importing this
# module performs network access.
data_airline: pd.Series = load_airline()
data_macrodata: pd.DataFrame = load_macrodata()
data_random: NDArray[np.float64] = get_random_numbers(SEED)
data_random_2d: NDArray[np.float64] = get_random_numbers_2d(SEED)
data_sine: NDArray[np.float64] = get_sine_wave()
data_normal: NDArray[np.float64] = get_normal_curve(SEED)
data_line: NDArray[np.float64] = get_straight_line()
data_trend: NDArray[np.float64] = get_trend_data()
data_noise: NDArray[np.float64] = get_noise_data(SEED)