Coverage for src/toolbox_pyspark/constants.py: 100%

26 statements  

coverage.py v7.6.10, created at 2025-01-25 23:08 +0000

# ============================================================================ #
#                                                                              #
#     Title : Constants                                                        #
#     Purpose : Hold the definitions of all constant values used across the   #
#               package.                                                       #
#                                                                              #
# ============================================================================ #


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Overview                                                              ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#     Description                                                           ####
# ---------------------------------------------------------------------------- #


22""" 

23!!! note "Summary" 

24 The `constants` module is used to hold the definitions of all constant values used across the package. 

25""" 


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Setup                                                                 ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#     Imports                                                               ####
# ---------------------------------------------------------------------------- #


# ## Python StdLib Imports ----
from functools import partial
from typing import Literal, Union, get_args
from warnings import warn

# ## Python Third Party Imports ----
from pyspark.sql import types as T
from pyspark.sql.types import _all_atomic_types as pyspark_atomic_types
from toolbox_python.collection_types import str_list, str_set

# ## Local First Party Imports ----
from toolbox_pyspark.utils.whitespaces import WhitespaceCharacters, WhitespaceChatacter


# ---------------------------------------------------------------------------- #
#     Exports                                                               ####
# ---------------------------------------------------------------------------- #


__all__: str_list = [
    "ALL_WHITESPACE_CHARACTERS",
    "WHITESPACE_CHARACTERS",
    "VALID_PYSPARK_TYPES",
    "VALID_PYSPARK_TYPE_NAMES",
    "ALL_PYSPARK_TYPES",
    "VALID_PYAPARK_JOIN_TYPES",
    "ALL_PYSPARK_JOIN_TYPES",
    "LITERAL_PANDAS_DATAFRAME_NAMES",
    "LITERAL_PYSPARK_DATAFRAME_NAMES",
    "LITERAL_NUMPY_ARRAY_NAMES",
    "LITERAL_LIST_OBJECT_NAMES",
    "VALID_PANDAS_DATAFRAME_NAMES",
    "VALID_PYSPARK_DATAFRAME_NAMES",
    "VALID_NUMPY_ARRAY_NAMES",
    "VALID_LIST_OBJECT_NAMES",
    "VALID_DATAFRAME_NAMES",
    "_DEFAULT_DEPRECATION_WARNING_CLASS",
    "_DEFAULT_DEPRECATION_WARNING",
]


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Constants                                                             ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#     White Spaces                                                          ####
# ---------------------------------------------------------------------------- #


# For the full list of whitespace characters, see: https://en.wikipedia.org/wiki/Whitespace_character
# In the tuples below: ('name', 'unicode', 'ascii'), where 'ascii' is the
# decimal value of the Unicode code point.
ALL_WHITESPACE_CHARACTERS: list[tuple[str, str, int]] = [
    ("character tabulation", "U+0009", 9),
    ("line feed", "U+000A", 10),
    ("line tabulation", "U+000B", 11),
    ("form feed", "U+000C", 12),
    ("carriage return", "U+000D", 13),
    ("space", "U+0020", 32),
    ("next line", "U+0085", 133),
    ("no-break space", "U+00A0", 160),
    ("ogham space mark", "U+1680", 5760),
    ("en quad", "U+2000", 8192),
    ("em quad", "U+2001", 8193),
    ("en space", "U+2002", 8194),
    ("em space", "U+2003", 8195),
    ("three-per-em space", "U+2004", 8196),
    ("four-per-em space", "U+2005", 8197),
    ("six-per-em space", "U+2006", 8198),
    ("figure space", "U+2007", 8199),
    ("punctuation space", "U+2008", 8200),
    ("thin space", "U+2009", 8201),
    ("hair space", "U+200A", 8202),
    ("line separator", "U+2028", 8232),
    ("paragraph separator", "U+2029", 8233),
    ("narrow no-break space", "U+202F", 8239),
    ("medium mathematical space", "U+205F", 8287),
    ("ideographic space", "U+3000", 12288),
    ("mongolian vowel separator", "U+180E", 6158),
    ("zero width space", "U+200B", 8203),
    ("zero width non-joiner", "U+200C", 8204),
    ("zero width joiner", "U+200D", 8205),
    ("word joiner", "U+2060", 8288),
    ("zero width non-breaking space", "U+FEFF", 65279),
]

WHITESPACE_CHARACTERS = WhitespaceCharacters(
    [
        WhitespaceChatacter(name, unicode, ascii)
        for name, unicode, ascii in ALL_WHITESPACE_CHARACTERS
    ]
)
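

# ## Illustrative sketch (assumption; not part of this module's API) ----
# One way a caller might use `ALL_WHITESPACE_CHARACTERS`: build a regex
# character class covering every listed code point, then strip the leading and
# trailing whitespace variants that PySpark's space-only `F.trim()` would miss.
# The helper name `_trim_all_whitespace` is hypothetical.
def _trim_all_whitespace(col):
    import re

    from pyspark.sql import functions as F

    # One regex character class matching any listed character, built from the
    # decimal code points in the third element of each tuple.
    ws_class = (
        "[" + "".join(re.escape(chr(a)) for _, _, a in ALL_WHITESPACE_CHARACTERS) + "]"
    )
    return F.regexp_replace(col, f"^{ws_class}+|{ws_class}+$", "")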


# ---------------------------------------------------------------------------- #
#     PySpark Types                                                         ####
# ---------------------------------------------------------------------------- #


# For a full list of valid types, see: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
VALID_PYSPARK_TYPES = list(pyspark_atomic_types.values())
VALID_PYSPARK_TYPE_NAMES: str_list = sorted(
    list(pyspark_atomic_types.keys()) + ["str", "int", "bool", "datetime"]
)
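

# ## Illustrative sketch (assumption; not part of this module's API) ----
# How a caller might validate a user-supplied type name against
# `VALID_PYSPARK_TYPE_NAMES` before casting a column. The helper name
# `_assert_valid_type_name` is hypothetical.
def _assert_valid_type_name(datatype: str) -> str:
    if datatype not in VALID_PYSPARK_TYPE_NAMES:
        raise ValueError(
            f"Invalid type name: '{datatype}'. "
            f"Must be one of: {VALID_PYSPARK_TYPE_NAMES}."
        )
    return datatype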

ALL_PYSPARK_TYPES = Union[
    T.DataType,
    T.NullType,
    T.CharType,
    T.StringType,
    T.VarcharType,
    T.BinaryType,
    T.BooleanType,
    T.DateType,
    T.TimestampType,
    T.TimestampNTZType,
    T.DecimalType,
    T.DoubleType,
    T.FloatType,
    T.ByteType,
    T.IntegerType,
    T.LongType,
    T.DayTimeIntervalType,
    T.YearMonthIntervalType,
    T.ShortType,
    T.ArrayType,
    T.MapType,
    T.StructType,
]
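

# ## Illustrative sketch (assumption; not part of this module's API) ----
# `ALL_PYSPARK_TYPES` is a `typing.Union`, so it works as a type annotation; for
# a runtime check, unpack it with `get_args()` (imported above) into the tuple
# of classes that `isinstance()` accepts. The helper name `_is_pyspark_type` is
# hypothetical.
def _is_pyspark_type(obj: object) -> bool:
    return isinstance(obj, get_args(ALL_PYSPARK_TYPES))


# e.g. `_is_pyspark_type(T.StringType())` is True; `_is_pyspark_type("string")` is False.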


VALID_PYAPARK_JOIN_TYPES = Literal[
    "inner",
    "cross",
    "outer",
    "full",
    "fullouter",
    "full_outer",
    "left",
    "leftouter",
    "left_outer",
    "right",
    "rightouter",
    "right_outer",
    "semi",
    "leftsemi",
    "left_semi",
    "anti",
    "leftanti",
    "left_anti",
]
ALL_PYSPARK_JOIN_TYPES = set(get_args(VALID_PYAPARK_JOIN_TYPES))
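

# ## Illustrative sketch (assumption; not part of this module's API) ----
# How a caller might guard `DataFrame.join()` with `ALL_PYSPARK_JOIN_TYPES`.
# The helper name `_assert_valid_join_type` is hypothetical.
def _assert_valid_join_type(how: str) -> str:
    if how not in ALL_PYSPARK_JOIN_TYPES:
        raise ValueError(
            f"Invalid join type: '{how}'. "
            f"Must be one of: {sorted(ALL_PYSPARK_JOIN_TYPES)}."
        )
    return how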


# ---------------------------------------------------------------------------- #
#     DataFrames                                                            ####
# ---------------------------------------------------------------------------- #


LITERAL_PANDAS_DATAFRAME_NAMES = Literal[
    "pandas.DataFrame",
    "pandas",
    "pd.DataFrame",
    "pd.df",
    "pddf",
    "pdDataFrame",
    "pdDF",
    "pd",
]

LITERAL_PYSPARK_DATAFRAME_NAMES = Literal[
    "spark.DataFrame",
    "pyspark.DataFrame",
    "pyspark",
    "spark",
    "ps.DataFrame",
    "ps.df",
    "psdf",
    "psDataFrame",
    "psDF",
    "ps",
]

LITERAL_NUMPY_ARRAY_NAMES = Literal[
    "numpy.array",
    "np.array",
    "np",
    "numpy",
    "nparr",
    "npa",
    "np.arr",
    "np.a",
]

LITERAL_LIST_OBJECT_NAMES = Literal["list", "lst", "l", "flat_list", "flatten_list"]

VALID_PANDAS_DATAFRAME_NAMES = set(get_args(LITERAL_PANDAS_DATAFRAME_NAMES))
VALID_PYSPARK_DATAFRAME_NAMES = set(get_args(LITERAL_PYSPARK_DATAFRAME_NAMES))
VALID_NUMPY_ARRAY_NAMES = set(get_args(LITERAL_NUMPY_ARRAY_NAMES))
VALID_LIST_OBJECT_NAMES = set(get_args(LITERAL_LIST_OBJECT_NAMES))

VALID_DATAFRAME_NAMES: str_set = VALID_PANDAS_DATAFRAME_NAMES.union(
    VALID_PYSPARK_DATAFRAME_NAMES
)
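

# ## Illustrative sketch (assumption; not part of this module's API) ----
# The `VALID_*_NAMES` sets might be used to normalise a user-supplied
# `return_type` alias, e.g. to decide whether a result should be converted
# with `.toPandas()`. The helper name `_wants_pandas` is hypothetical.
def _wants_pandas(return_type: str) -> bool:
    if return_type not in VALID_DATAFRAME_NAMES:
        raise ValueError(
            f"Invalid return type: '{return_type}'. "
            f"Must be one of: {sorted(VALID_DATAFRAME_NAMES)}."
        )
    return return_type in VALID_PANDAS_DATAFRAME_NAMES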


# ---------------------------------------------------------------------------- #
#     Other                                                                 ####
# ---------------------------------------------------------------------------- #


_DEFAULT_DEPRECATION_WARNING_CLASS = DeprecationWarning
_DEFAULT_DEPRECATION_WARNING = partial(warn, category=_DEFAULT_DEPRECATION_WARNING_CLASS)
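

# ## Illustrative sketch (assumption; not part of this module's API) ----
# `_DEFAULT_DEPRECATION_WARNING` is `warnings.warn` with its `category`
# pre-bound, so callers only supply the message (and, optionally, `stacklevel`):
#
#     _DEFAULT_DEPRECATION_WARNING(
#         "`old_name` is deprecated; use `new_name` instead.", stacklevel=2
#     )
#
# (`old_name` and `new_name` are placeholder identifiers.)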