Coverage for src/synthetic_data_generators/time

1# ============================================================================ #

2# #

3# Title: Synthetic Time Series Data #

4# Purpose: Generate synthetic time series data for testing and validation. #

5# Notes: This module provides functions to generate various types of #

6# synthetic time series data, including seasonal, trend, and noise. #

7# It also includes functions to create time series data with #

8# specific characteristics, such as missing values and outliers. #

9# #

10# ============================================================================ #

13# ---------------------------------------------------------------------------- #

14# #

15# Overview ####

16# #

17# ---------------------------------------------------------------------------- #

20## --------------------------------------------------------------------------- #

21## Description ####

22## --------------------------------------------------------------------------- #

25"""

26!!! note "Summary"

27 The [`time_series`][synthetic_data_generators.time_series] module provides a class for generating synthetic time series data. It includes methods for creating time series with various characteristics, such as seasonality, trends, and noise.

28"""

31# ---------------------------------------------------------------------------- #

32# #

33# Set Up ####

34# #

35# ---------------------------------------------------------------------------- #

38## --------------------------------------------------------------------------- #

39## Imports ####

40## --------------------------------------------------------------------------- #

43# ## Future Python Library Imports ----

44from __future__ import annotations

46# ## Python StdLib Imports ----

47from collections.abc import Callable, Sequence

48from datetime import datetime

49from functools import lru_cache

50from typing import Any, Literal, overload

52# ## Python Third Party Imports ----

53import numpy as np

54import pandas as pd

55from numpy.random import Generator as RandomGenerator

56from numpy.typing import NDArray

57from toolbox_python.checkers import assert_all_values_of_type

58from typeguard import typechecked

60# ## Local First Party Imports ----

61from synthetic_data_generators.utils.validators import Validators, number

64## --------------------------------------------------------------------------- #

65## Exports ####

66## --------------------------------------------------------------------------- #

69__all__: list[str] = ["TimeSeriesGenerator"]

72# ---------------------------------------------------------------------------- #

73# #

74# Classes ####

75# #

76# ---------------------------------------------------------------------------- #

79## --------------------------------------------------------------------------- #

80## TimeSeriesGenerator ####

81## --------------------------------------------------------------------------- #

84class TimeSeriesGenerator(Validators):

85 """

86 !!! note "Summary"

87 A class for generating synthetic time series data.

89 ???+ abstract "Details"

90 - This class provides methods to create synthetic time series data with various characteristics, including seasonality, trends, and noise.

91 - The generated data can be used for testing and validation purposes in time series analysis.

92 - The class includes methods to generate holiday indices, fixed error indices, semi-Markov indices, and sine indices.

93 - It also provides a method to generate polynomial trends and ARMA components.

94 - The generated time series data can be customized with different parameters, such as start date, number of periods, and noise scale.

96 Methods:

97 create_time_series(): Generate a synthetic time series with specified characteristics.

98 generate_holiday_index(): Generate a holiday index for the given dates.

99 generate_fixed_error_index(): Generate a fixed error seasonality index for the given dates.

100 generate_semi_markov_index(): Generate a semi-Markov seasonality index for the given dates.

101 generate_sin_index(): Generate a sine seasonality index for the given dates.

102 generate_sin_covar_index(): Generate a sine seasonality index with covariance for the given dates.

103 generate_season_index(): Generate a seasonality index based on the specified style for the given dates.

104 generate_polynom_trend(): Generate a polynomial trend based on interpolation nodes.

105 generate_ARMA(): Generate an ARMA component for the time series.

106

107 Attributes:

108 random_generator (RandomGenerator): An instance of `numpy.random.Generator` used for random number generation.

109 seed (int): The seed value used for random number generation.

110 """

111

def __init__(self, seed: int | None = None) -> None:
    """
    !!! note "Summary"
        Create a new `TimeSeriesGenerator` and seed its random number source.

    ???+ abstract "Details"
        - The constructor does exactly one thing: it forwards `seed` to `self._set_seed()`.
        - NOTE(review): `_set_seed()` is defined outside this block; presumably it builds the `random_generator` attribute that the `generate_*()` methods draw from — confirm against its definition.

    Params:
        seed (int | None):
            The seed value handed to `self._set_seed()`.
            Default is `None`.
    """
    self._set_seed(seed=seed)

129

def create_time_series(
    self,
    start_date: datetime = datetime(2019, 1, 1),
    n_periods: int = 1096,
    interpolation_nodes: Sequence[Sequence[int]] = ([0, 98], [300, 92], [700, 190], [1096, 213]),
    level_breaks: Sequence[Sequence[int]] | None = ([250, 100], [650, -50]),
    AR: Sequence[number] | None = None,
    MA: Sequence[number] | None = None,
    randomwalk_scale: number = 2,
    exogenous: Sequence[dict[Literal["coeff", "ts"], Sequence[number]]] | None = None,
    season_conf: dict[str, Any] | None = {"style": "holiday"},
    season_eff: number = 0.15,
    manual_outliers: Sequence[Sequence[int]] | None = None,
    noise_scale: number = 10,
    seed: int | None = None,
) -> pd.DataFrame:
    """
    !!! note "Summary"
        Generate a synthetic time series with specified characteristics.

    ???+ abstract "Details"
        - The series is assembled as `(trend + level breaks + ARMA component + noise) * season`.
        - The trend comes from `.generate_polynom_trend()`, the ARMA component from `.generate_ARMA()`, and the seasonal multiplier from `.generate_season_index()`.
        - The seasonal index `s` is converted to a multiplier with `s * season_eff + (1 - s)`, so an index of `0` leaves a value untouched and an index of `1` scales it by `season_eff`.
        - Manual outliers are written last, so they overwrite whatever value was generated at that position.
        - The result is a pandas DataFrame indexed by date, with the columns `"Date"` and `"Value"`.

    !!! warning "Important"
        This function is designed to generate synthetic time series data for testing and validation purposes.
        It is not intended to be used for production or real-world applications.

    Params:
        start_date (datetime):
            The starting date for the time series.
            Default is `datetime(2019, 1, 1)`.
        n_periods (int):
            The number of periods for the time series.
            Default is `1096`.
        interpolation_nodes (Sequence[Sequence[int]]):
            A collection of `[x, y]` interpolation nodes passed to `.generate_polynom_trend()`.
            Default is `([0, 98], [300, 92], [700, 190], [1096, 213])`.
        level_breaks (Sequence[Sequence[int]] | None):
            A collection of `[index, value]` pairs; `value` is added to every period from `index` onwards.
            Default is `([250, 100], [650, -50])`.
        AR (Sequence[number] | None):
            The autoregressive coefficients for the ARMA component. An empty or `None` value falls back to `[1]`.
            Default is `None`.
        MA (Sequence[number] | None):
            The moving average coefficients for the ARMA component. An empty or `None` value falls back to `[0]`.
            Default is `None`.
        randomwalk_scale (number):
            The scale forwarded to `.generate_ARMA()`.
            Default is `2`.
        exogenous (Sequence[dict[Literal["coeff", "ts"], Sequence[number]]] | None):
            Exogenous variables forwarded to `.generate_ARMA()`. `None` is replaced by an empty list.
            Default is `None`.
        season_conf (dict[str, Any] | None):
            Keyword arguments forwarded to `.generate_season_index()`. When `None`, no seasonality is applied.
            NOTE(review): the default carries only `"style"`; the `"holiday"` generator also needs `season_dates`, so callers presumably have to supply it — confirm.
            Default is `{"style": "holiday"}`.
        season_eff (number):
            The multiplier applied where the seasonal index is `1`.
            Default is `0.15`.
        manual_outliers (Sequence[Sequence[int]] | None):
            A collection of `[index, value]` pairs; the value at positional `index` is overwritten with `value`.
            Default is `None`.
        noise_scale (number):
            The standard deviation of the normally distributed noise component.
            Default is `10`.
        seed (int | None):
            The random seed for reproducibility. Any integer, including `0`, re-seeds the generator; `None` leaves it untouched.
            Default is `None`.

    Raises:
        (Exception):
            This method performs no validation of its own; any error raised by the helper methods it calls is propagated unchanged.

    Returns:
        (pd.DataFrame):
            A pandas DataFrame containing the generated time series data.
            The DataFrame is indexed by date and has two columns: "Date" and "Value".
    """

    # Fall back to neutral ARMA coefficients and empty collections
    AR = AR or [1]
    MA = MA or [0]
    exogenous = exogenous or []
    manual_outliers = manual_outliers or []

    # `0` is a legitimate seed, so test against `None` instead of relying on truthiness
    if seed is not None:
        self._set_seed(seed=seed)

    # Date index
    dates: list[datetime] = self._get_dates(start_date, n_periods=n_periods)

    # Polynomial trend component
    trend: NDArray[np.float64] = self.generate_polynom_trend(interpolation_nodes, n_periods)

    # Structural breaks: each one shifts the level from its index to the end of the series
    break_effect: NDArray[np.float64] = np.zeros(n_periods).astype(np.float64)
    for level_break in level_breaks or ():
        break_effect[level_break[0] :] += level_break[1]

    # ARMA(AR, MA) component
    randomwalk: NDArray[np.float64] = self.generate_ARMA(
        AR=AR,
        MA=MA,
        randomwalk_scale=randomwalk_scale,
        n_periods=n_periods,
        exogenous=exogenous,
        seed=seed,
    )

    # Seasonal multiplier (drawn before the noise so the random stream order is unchanged)
    if season_conf is not None:
        season: NDArray[np.float64] = self.generate_season_index(dates=dates, **season_conf)  # type: ignore
        season = season * season_eff + (1 - season)
    else:
        season = np.ones(n_periods)

    # Noise component on top
    noise: NDArray[np.float64] = self.random_generator.normal(
        loc=0.0,
        scale=noise_scale,
        size=n_periods,
    )

    # Assemble
    values = (trend + break_effect + randomwalk + noise) * season
    df: pd.DataFrame = pd.DataFrame(
        list(zip(dates, values)),
        index=dates,
        columns=["Date", "Value"],
    )

    # Manual outliers overwrite the generated value (column 1 is "Value")
    for manual_outlier in manual_outliers:
        df.iloc[manual_outlier[0], 1] = manual_outlier[1]

    return df

285

@typechecked
def generate_holiday_index(
    self,
    dates: Sequence[datetime],
    season_dates: Sequence[Sequence[datetime | int]],
) -> NDArray[np.int_]:
    """
    !!! note "Summary"
        Generate a holiday index for the given dates based on the provided holiday dates.

    ???+ abstract "Details"
        - Every `[start, periods]` entry in `season_dates` is expanded into a run of holiday dates by `self._get_holiday_period()`.
        - Each element of `dates` is then flagged `1` when it appears in the expanded holiday dates, and `0` otherwise.
        - The expanded dates are held in a set, so each lookup is a hash lookup rather than a scan of the whole list.

    !!! warning "Important"
        This function is designed to work with a `.generate_season_index()` when the `style="holiday"`.
        It is not intended to be called directly.

    Params:
        dates (Sequence[datetime]):
            List of datetime objects representing the dates to check.
        season_dates (Sequence[Sequence[datetime | int]]):
            Collection of collections containing holiday dates and their respective periods.
            Each element in the collection should contain exactly two elements: a datetime object and an integer representing the number of periods.
            Some example inputs include:\n
            - List of lists containing datetime and periods: `season_dates = [[datetime(2025, 4, 18), 4], [datetime(2024, 3, 29), 4]]`
            - List of tuples containing datetime and periods: `season_dates = [(datetime(2025, 4, 18), 4), (datetime(2024, 3, 29), 4)]`
            - Tuple of lists containing datetime and periods: `season_dates = ([datetime(2025, 4, 18), 4], [datetime(2024, 3, 29), 4])`
            - Tuple of tuples containing datetime and periods: `season_dates = ((datetime(2025, 4, 18), 4), (datetime(2024, 3, 29), 4))`

    Raises:
        (TypeCheckError):
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        (AssertionError):
            If any element of `season_dates` does not contain exactly two elements.
        (TypeError):
            If the first element of a `season_dates` entry is not a `datetime`, or the second element is not an `int`.
            NOTE(review): this is raised by `assert_all_values_of_type()`, which is defined outside this file — confirm the exception type.

    Returns:
        (NDArray[np.int_]):
            An array of the same length as `dates`, where each element is `1` if the corresponding date is a holiday, and `0` otherwise.
    """

    # Validations
    # Raised explicitly (rather than via `assert`) so the check survives `python -O`
    if any(len(elem) != 2 for elem in season_dates):
        raise AssertionError("Every element of `season_dates` must contain exactly two elements.")
    assert_all_values_of_type([season_date[0] for season_date in season_dates], datetime)
    assert_all_values_of_type([season_date[1] for season_date in season_dates], int)

    # Expand every `[start, periods]` pair into its individual holiday dates
    holiday_dates: set[datetime] = set()
    for season_date in season_dates:
        holiday_dates.update(
            self._get_holiday_period(
                start_date=season_date[0],  # type: ignore
                periods=season_date[1],  # type: ignore
            )
        )

    # Tag dates
    events: NDArray[np.int_] = np.where([_date in holiday_dates for _date in dates], 1, 0)

    # Return
    return events

350

@typechecked
def generate_fixed_error_index(
    self,
    dates: Sequence[datetime],
    period_length: int = 7,
    period_sd: number = 0.5,
    start_index: int = 4,
    seed: int | None = None,
) -> NDArray[np.float64]:
    """
    !!! note "Summary"
        Generate a fixed error seasonality index for the given dates.

    ???+ abstract "Details"
        - Events are placed on a regular grid: `start_index`, `start_index + period_length`, `start_index + 2 * period_length`, and so on.
        - Each grid position is then shifted by a normally distributed disturbance with standard deviation `period_sd`, truncated towards zero to a whole number of periods.
        - Shifted positions that fall outside `0 <= position < len(dates)` are discarded.
        - The return array has the same length as `dates` and holds `1.0` at every event position and `0.0` everywhere else.

    !!! warning "Important"
        This function is designed to work with a `.generate_season_index()` when the `style="fixed+error"`.
        It is not intended to be called directly.

    Params:
        dates (Sequence[datetime]):
            List of datetime objects representing the dates to check. Only its length is used.
        period_length (int):
            The length of the period for seasonality.
            For example, if the frequency is weekly, this would be `7`.
            Default is `7`.
        period_sd (number):
            The standard deviation of the disturbance.
            Default is `0.5`.
        start_index (int):
            The starting index for the seasonality.
            Default is `4`.
        seed (int | None):
            The random seed for reproducibility. Any integer, including `0`, re-seeds the generator; `None` leaves it untouched.
            Default is `None`.

    Raises:
        (TypeCheckError):
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        (ZeroDivisionError):
            If `period_length` is `0`.

    Returns:
        (NDArray[np.float64]):
            An array of the same length as `dates`, where each element is `1.0` if the corresponding position is an event, and `0.0` otherwise.
    """

    # `0` is a legitimate seed, so test against `None` instead of relying on truthiness
    if seed is not None:
        self._set_seed(seed=seed)

    # Regular grid of event positions
    n_periods: int = len(dates)
    events: NDArray[np.int_] = np.zeros(n_periods).astype(np.int_)
    event_inds: NDArray[Any] = np.arange(n_periods // period_length + 1) * period_length + start_index

    # Whole-period disturbance; `.astype(int)` truncates towards zero
    disturbance: NDArray[np.int_] = self.random_generator.normal(
        loc=0.0,
        scale=period_sd,
        size=len(event_inds),
    ).astype(int)
    event_inds = event_inds + disturbance

    # Drop positions outside the series; a negative index would otherwise wrap to the end
    event_inds = event_inds[(event_inds >= 0) & (event_inds < n_periods)]

    # For any indices defined above, assign `1` to the events array
    events[event_inds] = 1

    # Return
    return events.astype(np.float64)

421

def generate_semi_markov_index(
    self,
    dates: Sequence[datetime],
    period_length: int = 7,
    period_sd: float = 0.5,
    start_index: int = 4,
    seed: int | None = None,
) -> NDArray[np.int_]:
    """
    !!! note "Summary"
        Generate a semi-Markov seasonality index for the given dates.

    ???+ abstract "Details"
        - The first event sits at `start_index`.
        - Every following event is placed a random gap after the previous one; the gap is drawn from a normal distribution with mean `period_length` and standard deviation `period_sd`, rounded to a whole number of periods.
        - Events are added for as long as the next position is still before `len(dates)`.
        - Positions outside `0 <= position < len(dates)` are discarded, so a `start_index` beyond the end of the series yields an all-zero index.
        - The return array has the same length as `dates` and holds `1` at every event position and `0` everywhere else.

    Params:
        dates (Sequence[datetime]):
            List of datetime objects representing the dates to check. Only its length is used.
        period_length (int):
            The mean gap between two events.
            For example, if the frequency is weekly, this would be `7`.
            Default is `7`.
        period_sd (float):
            The standard deviation of the gap between two events.
            Default is `0.5`.
        start_index (int):
            The starting index for the seasonality.
            Default is `4`.
        seed (int | None):
            The random seed for reproducibility. Any integer, including `0`, re-seeds the generator; `None` leaves it untouched.
            Default is `None`.

    Raises:
        (ValueError):
            If `period_length` is less than `1`, because the event positions would then never be guaranteed to move forward.

    Returns:
        (NDArray[np.int_]):
            An array of the same length as `dates`, where each element is `1` if the corresponding position is an event, and `0` otherwise.
    """

    # A non-positive mean gap cannot be relied on to reach the end of the series
    if period_length < 1:
        raise ValueError(f"`period_length` must be at least 1, got {period_length}.")

    # `0` is a legitimate seed, so test against `None` instead of relying on truthiness
    if seed is not None:
        self._set_seed(seed=seed)

    def _draw_gap() -> Any:
        # One rounded gap between two consecutive events
        return self.random_generator.normal(loc=period_length, scale=period_sd, size=1).round()[0]

    # Walk forward from `start_index` until the next event would fall off the series
    n_periods: int = len(dates)
    events: NDArray[np.int_] = np.zeros(n_periods).astype(np.int_)
    event_inds: list[Any] = [start_index]
    gap = _draw_gap()
    while gap + event_inds[-1] < n_periods:
        event_inds.append(gap + event_inds[-1])
        gap = _draw_gap()
    event_indexes: NDArray[np.int_] = np.array(event_inds).astype(np.int_)

    # Drop positions outside the series; `start_index` itself may already be out of range
    event_indexes = event_indexes[(event_indexes >= 0) & (event_indexes < n_periods)]

    # For any indices defined above, assign `1` to the events array
    events[event_indexes] = 1

    # Return
    return events

488

def generate_sin_index(
    self,
    dates: Sequence[datetime],
    period_length: int = 7,
    start_index: int = 4,
    amplitude: number = 0.5,
) -> NDArray[np.float64]:
    """
    !!! note "Summary"
        Generate a sine seasonality index for the given dates.

    ???+ abstract "Details"
        - The index is a plain sine wave sampled once per element of `dates`.
        - `period_length` is the wavelength, `start_index` shifts the wave so that it crosses its centre line at that position, and `amplitude` sets the size of the swing.
        - The wave is computed as `amplitude * sin(2 * pi * (i - start_index) / period_length) + (1 - amplitude)`.
        - Its peak is therefore always `1`, and its trough is `1 - 2 * amplitude`.

    Params:
        dates (Sequence[datetime]):
            List of datetime objects representing the dates to check. Only its length is used.
        period_length (int):
            The length of the period for seasonality. This is the wavelength of the sine wave.
            For example, if the frequency is weekly, this would be `7`.
            Default is `7`.
        start_index (int):
            The position at which the wave starts its cycle.
            Default is `4`.
        amplitude (number):
            The amplitude of the sine wave, controlling the range of oscillation.
            With `amplitude=0.5` (default), the wave oscillates between `0` and `1`.
            With `amplitude=0.3`, the wave oscillates between `0.4` and `1`.
            With `amplitude=1.0`, the wave oscillates between `-1` and `1`.
            Default is `0.5`.

    Returns:
        (NDArray[np.float64]):
            An array of the same length as `dates`, where each element is the value of the sine wave at that position.
    """
    # Position of every period relative to the start of the cycle
    offsets = np.arange(len(dates)) - start_index

    # Convert positions into an angle: one full turn per `period_length`
    angle = offsets / period_length * 2 * np.pi

    # Lift the wave so that its peak sits at exactly `1`
    return amplitude * np.sin(angle) + (1 - amplitude)

536

def generate_sin_covar_index(
    self,
    dates: Sequence[datetime],
    period_length: int = 7,
    start_index: int = 4,
    amplitude: number = 1.0,
) -> NDArray[np.float64]:
    """
    !!! note "Summary"
        Generate a sine seasonality index with covariance for the given dates.

    ???+ abstract "Details"
        - The index is a sine wave whose frequency changes over time.
        - A slow modulation wave, `(sin(pi * (i - start_index) / period_length / 6) + 2) / 2`, is computed first; it stays between `0.5` and `1.5`.
        - Each modulation value is multiplied by a fixed step of `0.4` and the products are accumulated, which gives a phase that advances faster or slower depending on the modulation.
        - The result is `amplitude * sin(accumulated phase)`, so it oscillates between `-amplitude` and `amplitude`.

    Params:
        dates (Sequence[datetime]):
            List of datetime objects representing the dates to check. Only its length is used.
        period_length (int):
            Controls how slowly the modulation wave changes.
            Default is `7`.
        start_index (int):
            The position at which the modulation wave starts its cycle.
            Default is `4`.
        amplitude (number):
            The amplitude multiplier for the sine wave, controlling the range of oscillation.
            With `amplitude=1.0` (default), the wave oscillates between `-1` and `1`.
            With `amplitude=0.5`, the wave oscillates between `-0.5` and `0.5`.
            With `amplitude=2.0`, the wave oscillates between `-2` and `2`.
            Default is `1.0`.

    Returns:
        (NDArray[np.float64]):
            An array of the same length as `dates`, where each element is the value of the frequency-modulated sine wave at that position.
    """
    # Slow wave that speeds up and slows down the main oscillation
    offsets = np.arange(len(dates)) - start_index
    modulation = (np.sin(offsets / period_length / 6 * np.pi) + 2) / 2

    # Constant base step per period, scaled by the modulation and accumulated into a phase
    base_step: NDArray[np.float64] = np.full_like(modulation, 0.4)
    phase = np.cumsum(modulation * base_step)

    return amplitude * np.sin(phase)

585

@overload
def generate_season_index(
    self,
    dates: Sequence[datetime],
    style: Literal["fixed+error"],
    *,
    period_length: int,
    period_sd: number,
    start_index: int,
    seed: int | None = None,
) -> NDArray[np.float64]: ...
@overload
def generate_season_index(
    self,
    dates: Sequence[datetime],
    style: Literal["semi-markov"],
    *,
    period_length: int,
    period_sd: number,
    start_index: int,
    seed: int | None = None,
) -> NDArray[np.float64]: ...
@overload
def generate_season_index(
    self,
    dates: Sequence[datetime],
    style: Literal["holiday"],
    *,
    season_dates: Sequence[Sequence[datetime | int]],
    seed: int | None = None,
) -> NDArray[np.float64]: ...
@overload
def generate_season_index(
    self,
    dates: Sequence[datetime],
    style: Literal["sin"],
    *,
    period_length: int | None = None,
    start_index: int | None = None,
    amplitude: number | None = None,
    seed: int | None = None,
) -> NDArray[np.float64]: ...
@overload
def generate_season_index(
    self,
    dates: Sequence[datetime],
    style: Literal["sin_covar"],
    *,
    period_length: int,
    start_index: int,
    amplitude: number | None = None,
    seed: int | None = None,
) -> NDArray[np.float64]: ...
def generate_season_index(
    self,
    dates: Sequence[datetime],
    style: Literal[
        "fixed+error",
        "semi-markov",
        "holiday",
        "sin",
        "sin_covar",
    ],
    *,
    season_dates: Sequence[Sequence[datetime | int]] | None = None,
    period_length: int | None = None,
    period_sd: number | None = None,
    start_index: int | None = None,
    amplitude: number | None = None,
    seed: int | None = None,
) -> NDArray[np.float64]:
    """
    !!! note "Summary"
        Generate a seasonality index for the given dates based on the specified style.

    ???+ abstract "Details"
        - This method is a dispatcher: `style` selects one of the `generate_*_index()` methods, and the keyword arguments are forwarded to it.
        - Keyword arguments left as `None` are not forwarded, so the selected method falls back to its own defaults.
        - `seed` is only forwarded to the random styles (`"fixed+error"` and `"semi-markov"`). The other styles are deterministic and do not accept a seed, so it is ignored for them.
        - Any other keyword argument that the selected method does not accept is still forwarded, and results in a `TypeError`.
        - An unsupported `style` does not raise; it returns an array of zeros.
        - Different styles require different keyword arguments. See the overload signatures for specific parameter requirements.

    Params:
        dates (Sequence[datetime]):
            List of datetime objects representing the dates to check.
        style (Literal["fixed+error", "semi-markov", "holiday", "sin", "sin_covar"]):
            The style of the seasonality index to generate.
            Possible values are:
            - `"fixed+error"`: Fixed error seasonality index.
            - `"semi-markov"`: Semi-Markov seasonality index.
            - `"holiday"`: Holiday seasonality index.
            - `"sin"`: Sine seasonality index.
            - `"sin_covar"`: Sine seasonality index with covariance.
        season_dates (Sequence[Sequence[datetime | int]] | None):
            Holiday start dates and their lengths. Used by the `"holiday"` style.
            Default is `None`.
        period_length (int | None):
            The length of the seasonal period. Used by every style except `"holiday"`.
            Default is `None`.
        period_sd (number | None):
            The standard deviation of the disturbance. Used by the `"fixed+error"` and `"semi-markov"` styles.
            Default is `None`.
        start_index (int | None):
            The starting index for the seasonality. Used by every style except `"holiday"`.
            Default is `None`.
        amplitude (number | None):
            The amplitude of the wave. Used by the `"sin"` and `"sin_covar"` styles.
            Default is `None`.
        seed (int | None):
            The random seed for reproducibility. Used by the `"fixed+error"` and `"semi-markov"` styles.
            Default is `None`.

    Raises:
        (TypeError):
            If a keyword argument other than `seed` is given that the selected style does not accept, or if an argument required by the selected style is missing.

    Returns:
        (NDArray[np.float64]):
            An array of the same length as `dates` holding the seasonality index, converted to `float64`.
            When `style` is not supported, the array contains only zeros.
    """

    # Map styles to functions
    funcs: dict[str, Callable] = {
        "fixed+error": self.generate_fixed_error_index,
        "semi-markov": self.generate_semi_markov_index,
        "holiday": self.generate_holiday_index,
        "sin": self.generate_sin_index,
        "sin_covar": self.generate_sin_covar_index,
    }

    # Get function based on style
    func: Callable | None = funcs.get(style)

    # Guard clause for unsupported style
    if func is None:
        return np.zeros(len(dates)).astype(np.float64)

    # Prepare parameters
    _params: dict[str, Any] = {
        "dates": dates,
        "season_dates": season_dates,
        "period_length": period_length,
        "period_sd": period_sd,
        "start_index": start_index,
        "amplitude": amplitude,
        "seed": seed,
    }

    # Only the random styles take a seed; forwarding it to the others would raise `TypeError`
    if style not in ("fixed+error", "semi-markov"):
        del _params["seed"]

    # Filter out empty parameters
    params: dict[str, Any] = {key: value for key, value in _params.items() if value is not None}

    # Call function with parameters
    return func(**params).astype(np.float64)  # type:ignore

728

def generate_polynom_trend(
    self,
    interpolation_nodes: Sequence[Sequence[int]],
    n_periods: int,
) -> NDArray[np.float64]:
    """
    !!! note "Summary"
        Generate a polynomial trend that passes through the provided interpolation nodes.

    ???+ abstract "Details"
        - The number of nodes selects the polynomial order: one node gives a constant level, two nodes a line, three a quadratic and four a cubic.
        - For two to four nodes, the polynomial coefficients are obtained by solving the Vandermonde system built from the node x-coordinates with `np.linalg.solve`.
        - The polynomial is then evaluated at the period indices `0, 1, ..., n_periods - 1`.
        - An empty collection of nodes, or more than four nodes, yields an all-zero trend.

    !!! warning "Important"
        This function is implemented only up to order 3 (cubic interpolation = four nodes).
        It is not intended to be used for higher-order polynomial trends; passing more than four nodes silently returns zeros.

    Params:
        interpolation_nodes (Sequence[Sequence[int]]):
            A collection of interpolation nodes, where each node is an `(x, y)` pair.
            The x-coordinates must be distinct.
        n_periods (int):
            The number of periods for which to generate the polynomial trend.
            This determines the length of the output array.

    Raises:
        (ValueError):
            If two to four nodes are given and any of them does not contain exactly two values.
        (numpy.linalg.LinAlgError):
            If two nodes share the same x-coordinate, because the linear system is then singular.

    Returns:
        (NDArray[np.float64]):
            An array of length `n_periods`, where each element is the value of the polynomial trend at that period index.
    """

    n_nodes = len(interpolation_nodes)

    # No nodes, or more nodes than the supported cubic order: no trend component.
    if n_nodes == 0 or n_nodes > 4:
        return np.zeros(n_periods, dtype=np.float64)

    # A single node carries no slope information, so it defines a constant level.
    if n_nodes == 1:
        return np.full(n_periods, interpolation_nodes[0][1], dtype=np.float64)

    # Unpacking each node as a pair rejects malformed nodes with a `ValueError`.
    xs = np.asarray([x for x, _ in interpolation_nodes], dtype=np.float64)
    ys = np.asarray([y for _, y in interpolation_nodes], dtype=np.float64)

    # Columns of the Vandermonde matrix are x^(n-1), ..., x, 1, so the solution
    # holds the coefficients from the highest power down, as `np.polyval` expects.
    coefficients = np.linalg.solve(np.vander(xs, n_nodes), ys)

    periods = np.arange(n_periods, dtype=np.float64)
    return np.polyval(coefficients, periods)

833

def generate_ARMA(
    self,
    AR: Sequence[number],
    MA: Sequence[number],
    randomwalk_scale: number,
    n_periods: int,
    exogenous: Sequence[dict[Literal["coeff", "ts"], Sequence[number]]] | None = None,
    seed: int | None = None,
) -> NDArray[np.float64]:
    """
    !!! note "Summary"
        Generate an ARMA (AutoRegressive Moving Average) time series.

    ???+ abstract "Details"
        - The series is built period by period from three ingredients: weighted past values of the series (AR), weighted past random shocks (MA), and optional exogenous series.
        - The random shocks are drawn from a normal distribution with mean `0` and standard deviation `randomwalk_scale`, using the instance's random generator.
        - The MA contribution is *subtracted* from the current value: `ts[t] = sum(AR[k] * ts[t-1-k]) + u[t] - sum(MA[k] * u[t-1-k]) + exogenous terms`.
        - Lags that would reach before the first period are skipped for the AR, MA and exogenous terms alike.

    Params:
        AR (Sequence[number]):
            Autoregressive coefficients; the first entry applies to lag 1, the second to lag 2, and so on.
            The length of the sequence determines the order of the AR component.
            An empty sequence falls back to `[1]`.
            Values are checked against the range `0` to `1` by `_assert_all_values_are_between`.
        MA (Sequence[number]):
            Moving average coefficients; the first entry applies to the shock at lag 1, the second to lag 2, and so on.
            The length of the sequence determines the order of the MA component.
            An empty sequence falls back to `[0]`.
            Values are checked against the range `0` to `1` by `_assert_all_values_are_between`.
        randomwalk_scale (number):
            Standard deviation of the random shocks added to the time series.
        n_periods (int):
            The number of periods to generate. This determines the length of the output array.
        exogenous (Sequence[dict[Literal["coeff", "ts"], Sequence[number]]] | None):
            Optional exogenous variables, each a dictionary with the keys `"coeff"` and `"ts"`.
            `"coeff"[k]` is the weight applied to `"ts"[t - k]`, so the first coefficient applies to the contemporaneous value.
            Default is `None`, meaning no exogenous variables.
        seed (int | None):
            Random seed for reproducibility. When given (including `0`), it is stored on the instance via `_set_seed` before the shocks are drawn.
            Default is `None`, which leaves the instance's current seed untouched.

    Raises:
        (IndexError):
            If an exogenous `"ts"` sequence with at least one coefficient is shorter than `n_periods`.

    Returns:
        (NDArray[np.float64]):
            An array of length `n_periods`, where each element is the value of the ARMA time series at that period.

    ???+ info "Details about how the `AR` and `MA` Parameters work"

        **The `AR` (AutoRegressive) Parameter:**

        - Each coefficient is the weight given to a specific lag (previous time point) of the series itself.
        - For example, with `#!py AR=[0.6, 0.3]` the value at time `#!py t` includes `0.6` times the value at `#!py t-1` and `0.3` times the value at `#!py t-2`.
        - Higher AR values (closer to `#!py 1`) create stronger persistence and more correlation with past values.
        - With `#!py AR=[0]` the series does not depend on its past values; with `#!py AR=[1]` it behaves like a random walk.

        **The `MA` (Moving Average) Parameter:**

        - Each coefficient is the weight given to a past random shock.
        - For example, with `#!py MA=[0.2, 0.1]` the value at time `#!py t` is corrected by `0.2` times the shock at `#!py t-1` and `0.1` times the shock at `#!py t-2`.
        - Higher MA values (closer to `#!py 1`) create stronger corrections after random shocks.
        - With `#!py MA=[0]` past shocks have no influence and the series is purely autoregressive.

        **Examples and Effects:**

        | Value                           | Description |
        |---------------------------------|-------------|
        | `#!py AR=[0.9]`                 | Strong persistence - values strongly follow the previous value, resulting in smooth, trending data |
        | `#!py AR=[0.5,0.3]`             | Moderate persistence with some oscillation patterns |
        | `#!py MA=[0.8]`                 | Immediate corrections after random shocks |
        | `#!py MA=[0.5,0.3]`             | Moderate corrections with some oscillation patterns |
        | `#!py AR=[0.7]` `#!py MA=[0.4]` | Combines trend persistence with short-term corrections |
    """

    # Fall back to defaults for empty inputs, then validate the coefficients
    AR = AR or [1]
    MA = MA or [0]
    exogenous = exogenous or []
    self._assert_all_values_are_between(AR, min_value=0, max_value=1)
    self._assert_all_values_are_between(MA, min_value=0, max_value=1)

    # `0` is a valid seed, so test against `None` instead of relying on truthiness
    if seed is not None:
        self._set_seed(seed=seed)

    # Random shocks, one per period
    shocks: NDArray[np.float64] = self.random_generator.normal(
        loc=0.0,
        scale=randomwalk_scale,
        size=n_periods,
    )

    ts: NDArray[np.float64] = np.zeros(n_periods, dtype=np.float64)

    for i in range(n_periods):
        value = 0.0

        # `zip(range(i), ...)` limits the lags to periods that already exist
        for lag, coeff in zip(range(i), AR):
            value += coeff * ts[i - 1 - lag]

        value += shocks[i]

        for lag, coeff in zip(range(i), MA):
            value -= coeff * shocks[i - 1 - lag]

        # Lag 0 is the contemporaneous value, hence `i + 1` usable lags.
        # Without this bound, early periods would index `ts` negatively and
        # silently read values from the *end* of the exogenous series.
        for exvar in exogenous:
            for lag, coeff in zip(range(i + 1), exvar["coeff"]):
                value += coeff * exvar["ts"][i - lag]

        ts[i] = value

    return ts

959

960 ## --------------------------------------------------------------------------- #

961 ## Properties ####

962 ## --------------------------------------------------------------------------- #

963

@property
def seed(self) -> int | None:
    """
    !!! note "Summary"
        Read-only access to the seed currently stored on the instance.

    Returns:
        (int | None):
            The value held in the private `_seed` attribute.
    """
    current_seed: int | None = self._seed
    return current_seed

975

@property
def random_generator(self) -> RandomGenerator:
    """
    !!! note "Summary"
        Return the random number generator to draw from.

    ???+ abstract "Details"
        - If the private `_random_generator` attribute holds a truthy value, that object is returned.
        - Otherwise the generator is obtained from `_get_random_generator`, using the seed stored in `_seed`.

    Returns:
        (RandomGenerator):
            The random number generator instance.
    """
    generator = self._random_generator
    if not generator:
        generator = self._get_random_generator(seed=self._seed)
    return generator

987

988 ## --------------------------------------------------------------------------- #

989 ## Getters & Setters ####

990 ## --------------------------------------------------------------------------- #

991

def _set_seed(self, seed: int | None = None) -> None:
    """
    !!! note "Summary"
        Record a new seed on the instance and discard any generator held on it.

    Params:
        seed (int | None):
            The seed value to store in `_seed`.
            Default is `None`.
    """
    # Clear the held generator so the `random_generator` property goes back
    # through `_get_random_generator` with the new seed.
    self._random_generator: RandomGenerator | None = None
    self._seed: int | None = seed

1003

@lru_cache
def _get_random_generator(self, seed: int | None = None) -> RandomGenerator:
    """
    !!! note "Summary"
        Build a NumPy random number generator for the given seed.

    ???+ abstract "Details"
        - The generator is created with `np.random.default_rng`.
        - The method is wrapped in `lru_cache`, so repeated calls with the same arguments return the same generator object rather than a fresh one.

    Params:
        seed (int | None):
            The seed handed to `np.random.default_rng`.
            Default is `None`.

    Returns:
        (RandomGenerator):
            The random number generator instance.
    """
    # NOTE(review): because of the cache, asking again for a seed that was
    # already used returns the same (possibly already advanced) generator -
    # confirm this is the intended re-seeding behaviour.
    generator: RandomGenerator = np.random.default_rng(seed=seed)
    return generator

1015

@staticmethod
@overload
def _get_dates(start_date: datetime, *, end_date: datetime) -> list[datetime]: ...
@staticmethod
@overload
def _get_dates(start_date: datetime, *, n_periods: int) -> list[datetime]: ...
@staticmethod
@lru_cache
def _get_dates(
    start_date: datetime, *, end_date: datetime | None = None, n_periods: int | None = None
) -> list[datetime]:
    """
    !!! note "Summary"
        Generate a list of dates from a start date up to an end date, or for a specified number of periods.

    ???+ abstract "Details"
        - The dates are produced by `pd.date_range` and converted to plain `datetime` objects.
        - The function is wrapped in `lru_cache`, so repeated calls with the same arguments return the same list object.

    Params:
        start_date (datetime):
            The starting date for generating dates.
        end_date (datetime | None):
            The last date of the range, passed to `pd.date_range` as `end`.
            Default is `None`.
        n_periods (int | None):
            The number of dates to generate, passed to `pd.date_range` as `periods`.
            Default is `None`.

    Returns:
        (list[datetime]):
            A list of datetime objects representing the generated dates.
    """
    # NOTE(review): the returned list is the cached object itself; callers that
    # mutate it would affect later calls with the same arguments - confirm that
    # all callers treat it as read-only.
    date_index = pd.date_range(start=start_date, end=end_date, periods=n_periods)
    py_dates = date_index.to_pydatetime()
    return py_dates.tolist()  # type:ignore

1040

@staticmethod
@lru_cache
def _get_holiday_period(start_date: datetime, periods: int) -> list[datetime]:
    """
    !!! note "Summary"
        Generate the list of dates that make up a holiday period.

    ???+ abstract "Details"
        - This is a thin wrapper that delegates to `TimeSeriesGenerator._get_dates`, passing `periods` as `n_periods`.
        - The function is wrapped in `lru_cache`, so repeated calls with the same arguments return the same list object.

    Params:
        start_date (datetime):
            The first date of the holiday period.
        periods (int):
            The number of holiday dates to generate.

    Returns:
        (list[datetime]):
            A list of datetime objects representing the generated holiday dates.
    """
    holiday_dates: list[datetime] = TimeSeriesGenerator._get_dates(start_date, n_periods=periods)
    return holiday_dates

Coverage for src/synthetic_data_generators/time_series.py: 100%

176 statements