Coverage for src/toolbox_pyspark/constants.py: 100%
26 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-25 23:08 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-25 23:08 +0000
1# ============================================================================ #
2# #
3# Title : Constants #
4# Purpose : Hold the definitions of all constant values used across the #
5# package. #
6# #
7# ============================================================================ #
10# ---------------------------------------------------------------------------- #
11# #
12# Overview ####
13# #
14# ---------------------------------------------------------------------------- #
17# ---------------------------------------------------------------------------- #
18# Description ####
19# ---------------------------------------------------------------------------- #
22"""
23!!! note "Summary"
24 The `constants` module is used to hold the definitions of all constant values used across the package.
25"""
28# ---------------------------------------------------------------------------- #
29# #
30# Setup ####
31# #
32# ---------------------------------------------------------------------------- #
35# ---------------------------------------------------------------------------- #
36# Imports ####
37# ---------------------------------------------------------------------------- #
40# ## Python StdLib Imports ----
41from functools import partial
42from typing import Literal, Union, get_args
43from warnings import warn
45# ## Python Third Party Imports ----
46from pyspark.sql import types as T
47from pyspark.sql.types import _all_atomic_types as pyspark_atomic_types
48from toolbox_python.collection_types import str_list, str_set
50# ## Local First Party Imports ----
51from toolbox_pyspark.utils.whitespaces import WhitespaceCharacters, WhitespaceChatacter
54# ---------------------------------------------------------------------------- #
55# Exports ####
56# ---------------------------------------------------------------------------- #
# Public API of this module, grouped by the section that defines each name.
# Concatenation order below matches the order of the sections in the file.
_WHITESPACE_EXPORTS: str_list = [
    "ALL_WHITESPACE_CHARACTERS",
    "WHITESPACE_CHARACTERS",
]
_PYSPARK_TYPE_EXPORTS: str_list = [
    "VALID_PYSPARK_TYPES",
    "VALID_PYSPARK_TYPE_NAMES",
    "ALL_PYSPARK_TYPES",
    "VALID_PYAPARK_JOIN_TYPES",
    "ALL_PYSPARK_JOIN_TYPES",
]
_DATAFRAME_EXPORTS: str_list = [
    "LITERAL_PANDAS_DATAFRAME_NAMES",
    "LITERAL_PYSPARK_DATAFRAME_NAMES",
    "LITERAL_NUMPY_ARRAY_NAMES",
    "LITERAL_LIST_OBJECT_NAMES",
    "VALID_PANDAS_DATAFRAME_NAMES",
    "VALID_PYSPARK_DATAFRAME_NAMES",
    "VALID_NUMPY_ARRAY_NAMES",
    "VALID_LIST_OBJECT_NAMES",
    "VALID_DATAFRAME_NAMES",
]
_OTHER_EXPORTS: str_list = [
    "_DEFAULT_DEPRECATION_WARNING_CLASS",
    "_DEFAULT_DEPRECATION_WARNING",
]
__all__: str_list = (
    _WHITESPACE_EXPORTS + _PYSPARK_TYPE_EXPORTS + _DATAFRAME_EXPORTS + _OTHER_EXPORTS
)
81# ---------------------------------------------------------------------------- #
82# #
83# Constants ####
84# #
85# ---------------------------------------------------------------------------- #
88# ---------------------------------------------------------------------------- #
89# White Spaces ####
90# ---------------------------------------------------------------------------- #
93# For full list of characters: https://en.wikipedia.org/wiki/Whitespace_character
94# in the below tuples: ('name','unicode','ascii')
# Each entry is a ('name', 'unicode', 'ascii') triple. The 'U+XXXX' text is
# derived from the decimal code point so the two columns can never disagree.
ALL_WHITESPACE_CHARACTERS: list[tuple[str, str, int]] = [
    (character_name, f"U+{code_point:04X}", code_point)
    for character_name, code_point in (
        ("character tabulation", 9),
        ("line feed", 10),
        ("line tabulation", 11),
        ("form feed", 12),
        ("carriage return", 13),
        ("space", 32),
        ("next line", 133),
        ("no-break space", 160),
        ("ogham space mark", 5760),
        ("en quad", 8192),
        ("em quad", 8193),
        ("en space", 8194),
        ("em space", 8195),
        ("three-per-em space", 8196),
        ("four-per-em space", 8197),
        ("six-per-em space", 8198),
        ("figure space", 8199),
        ("punctuation space", 8200),
        ("thin space", 8201),
        ("hair space", 8202),
        ("line separator", 8232),
        ("paragraph separator", 8233),
        ("narrow no-break space", 8239),
        ("medium mathematical space", 8287),
        ("ideographic space", 12288),
        ("mongolian vowel separator", 6158),
        ("zero width space", 8203),
        ("zero width non-joiner", 8204),
        ("zero width joiner", 8205),
        ("word joiner", 8288),
        ("zero width non-breaking space", 65279),
    )
]
# Wrap each (name, unicode, ascii) triple in the package's container classes.
# NOTE(review): "Chatacter" is the class name exactly as spelled in
# toolbox_pyspark.utils.whitespaces; it cannot be corrected from here.
WHITESPACE_CHARACTERS = WhitespaceCharacters(
    [WhitespaceChatacter(*character_spec) for character_spec in ALL_WHITESPACE_CHARACTERS]
)
137# ---------------------------------------------------------------------------- #
138# PySpark Types ####
139# ---------------------------------------------------------------------------- #
142# For a full list of valid types, see: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
# Every atomic type class that PySpark itself declares.
VALID_PYSPARK_TYPES = [*pyspark_atomic_types.values()]
# The short names of those types plus a few common Python-side synonyms,
# sorted alphabetically for stable display in error messages.
VALID_PYSPARK_TYPE_NAMES: str_list = sorted(
    [*pyspark_atomic_types.keys(), "str", "int", "bool", "datetime"]
)
# Union of every PySpark data type (atomic and complex) that this package
# recognises, for use in type hints across the package.
# NOTE: member order is observable to anyone calling `typing.get_args` on
# this alias, so do not reorder.
ALL_PYSPARK_TYPES = Union[
    T.DataType,
    T.NullType,
    T.CharType,
    T.StringType,
    T.VarcharType,
    T.BinaryType,
    T.BooleanType,
    T.DateType,
    T.TimestampType,
    T.TimestampNTZType,
    T.DecimalType,
    T.DoubleType,
    T.FloatType,
    T.ByteType,
    T.IntegerType,
    T.LongType,
    T.DayTimeIntervalType,
    T.YearMonthIntervalType,
    T.ShortType,
    T.ArrayType,
    T.MapType,
    T.StructType,
]
# Every join strategy accepted by `pyspark.sql.DataFrame.join`; most
# strategies can be spelled with or without separators (e.g. "fullouter",
# "full_outer").
VALID_PYAPARK_JOIN_TYPES = Literal[
    "inner",
    "cross",
    "outer",
    "full",
    "fullouter",
    "full_outer",
    "left",
    "leftouter",
    "left_outer",
    "right",
    "rightouter",
    "right_outer",
    "semi",
    "leftsemi",
    "left_semi",
    "anti",
    "leftanti",
    "left_anti",
]

# Correctly-spelled alias for the name above. The original "PYAPARK"
# (sic) spelling is kept for backward compatibility with existing callers;
# prefer this alias in new code.
VALID_PYSPARK_JOIN_TYPES = VALID_PYAPARK_JOIN_TYPES

# The same join strategies as a runtime `set` of plain strings, for fast
# membership checks.
ALL_PYSPARK_JOIN_TYPES = set(get_args(VALID_PYAPARK_JOIN_TYPES))
195# ---------------------------------------------------------------------------- #
196# DataFrames ####
197# ---------------------------------------------------------------------------- #
# Accepted spellings for "a pandas DataFrame" wherever this package takes a
# dataframe-type name as a string argument.
# NOTE: member order of each Literal is observable via `typing.get_args`;
# do not reorder.
LITERAL_PANDAS_DATAFRAME_NAMES = Literal[
    "pandas.DataFrame",
    "pandas",
    "pd.DataFrame",
    "pd.df",
    "pddf",
    "pdDataFrame",
    "pdDF",
    "pd",
]

# Accepted spellings for "a PySpark DataFrame".
LITERAL_PYSPARK_DATAFRAME_NAMES = Literal[
    "spark.DataFrame",
    "pyspark.DataFrame",
    "pyspark",
    "spark",
    "ps.DataFrame",
    "ps.df",
    "psdf",
    "psDataFrame",
    "psDF",
    "ps",
]

# Accepted spellings for "a numpy array".
LITERAL_NUMPY_ARRAY_NAMES = Literal[
    "numpy.array",
    "np.array",
    "np",
    "numpy",
    "nparr",
    "npa",
    "np.arr",
    "np.a",
]

# Accepted spellings for "a plain (optionally flattened) list".
LITERAL_LIST_OBJECT_NAMES = Literal["list", "lst", "l", "flat_list", "flatten_list"]
# Runtime `set` versions of the Literal aliases above, for fast membership
# checks against user-supplied strings.
VALID_PANDAS_DATAFRAME_NAMES = {*get_args(LITERAL_PANDAS_DATAFRAME_NAMES)}
VALID_PYSPARK_DATAFRAME_NAMES = {*get_args(LITERAL_PYSPARK_DATAFRAME_NAMES)}
VALID_NUMPY_ARRAY_NAMES = {*get_args(LITERAL_NUMPY_ARRAY_NAMES)}
VALID_LIST_OBJECT_NAMES = {*get_args(LITERAL_LIST_OBJECT_NAMES)}

# Any name that refers to either flavour of dataframe (pandas or PySpark).
VALID_DATAFRAME_NAMES: str_set = (
    VALID_PANDAS_DATAFRAME_NAMES | VALID_PYSPARK_DATAFRAME_NAMES
)
247# ---------------------------------------------------------------------------- #
248# Other ####
249# ---------------------------------------------------------------------------- #
252_DEFAULT_DEPRECATION_WARNING_CLASS = DeprecationWarning
253_DEFAULT_DEPRECATION_WARNING = partial(warn, category=_DEFAULT_DEPRECATION_WARNING_CLASS)