Coverage for src/toolbox_pyspark/constants.py: 100%

26 statements  

coverage.py v7.6.10, created at 2025-01-25 23:08 +0000

# ============================================================================ #
#                                                                              #
#     Title : Constants                                                        #
#     Purpose : Hold the definitions of all constant values used across the   #
#               package.                                                       #
#                                                                              #
# ============================================================================ #


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Overview                                                              ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#     Description                                                           ####
# ---------------------------------------------------------------------------- #


22""" 

23!!! note "Summary" 

24 The `constants` module is used to hold the definitions of all constant values used across the package. 

25""" 


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Setup                                                                 ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#     Imports                                                               ####
# ---------------------------------------------------------------------------- #


# ## Python StdLib Imports ----
from functools import partial
from typing import Literal, Union, get_args
from warnings import warn

# ## Python Third Party Imports ----
from pyspark.sql import types as T
from pyspark.sql.types import _all_atomic_types as pyspark_atomic_types
from toolbox_python.collection_types import str_list, str_set

# ## Local First Party Imports ----
from toolbox_pyspark.utils.whitespaces import WhitespaceCharacters, WhitespaceChatacter


# ---------------------------------------------------------------------------- #
#     Exports                                                               ####
# ---------------------------------------------------------------------------- #


__all__: str_list = [
    "ALL_WHITESPACE_CHARACTERS",
    "WHITESPACE_CHARACTERS",
    "VALID_PYSPARK_TYPES",
    "VALID_PYSPARK_TYPE_NAMES",
    "ALL_PYSPARK_TYPES",
    "VALID_PYAPARK_JOIN_TYPES",
    "ALL_PYSPARK_JOIN_TYPES",
    "LITERAL_PANDAS_DATAFRAME_NAMES",
    "LITERAL_PYSPARK_DATAFRAME_NAMES",
    "LITERAL_NUMPY_ARRAY_NAMES",
    "LITERAL_LIST_OBJECT_NAMES",
    "VALID_PANDAS_DATAFRAME_NAMES",
    "VALID_PYSPARK_DATAFRAME_NAMES",
    "VALID_NUMPY_ARRAY_NAMES",
    "VALID_LIST_OBJECT_NAMES",
    "VALID_DATAFRAME_NAMES",
    "_DEFAULT_DEPRECATION_WARNING_CLASS",
    "_DEFAULT_DEPRECATION_WARNING",
]


# ---------------------------------------------------------------------------- #
#                                                                              #
#     Constants                                                             ####
#                                                                              #
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
#     White Spaces                                                          ####
# ---------------------------------------------------------------------------- #


# For the full list of whitespace characters, see: https://en.wikipedia.org/wiki/Whitespace_character
# In the tuples below: ('name', 'unicode', 'ascii'), where 'ascii' is the
# decimal value of the Unicode code point.
ALL_WHITESPACE_CHARACTERS: list[tuple[str, str, int]] = [
    ("character tabulation", "U+0009", 9),
    ("line feed", "U+000A", 10),
    ("line tabulation", "U+000B", 11),
    ("form feed", "U+000C", 12),
    ("carriage return", "U+000D", 13),
    ("space", "U+0020", 32),
    ("next line", "U+0085", 133),
    ("no-break space", "U+00A0", 160),
    ("ogham space mark", "U+1680", 5760),
    ("en quad", "U+2000", 8192),
    ("em quad", "U+2001", 8193),
    ("en space", "U+2002", 8194),
    ("em space", "U+2003", 8195),
    ("three-per-em space", "U+2004", 8196),
    ("four-per-em space", "U+2005", 8197),
    ("six-per-em space", "U+2006", 8198),
    ("figure space", "U+2007", 8199),
    ("punctuation space", "U+2008", 8200),
    ("thin space", "U+2009", 8201),
    ("hair space", "U+200A", 8202),
    ("line separator", "U+2028", 8232),
    ("paragraph separator", "U+2029", 8233),
    ("narrow no-break space", "U+202F", 8239),
    ("medium mathematical space", "U+205F", 8287),
    ("ideographic space", "U+3000", 12288),
    ("mongolian vowel separator", "U+180E", 6158),
    ("zero width space", "U+200B", 8203),
    ("zero width non-joiner", "U+200C", 8204),
    ("zero width joiner", "U+200D", 8205),
    ("word joiner", "U+2060", 8288),
    ("zero width non-breaking space", "U+FEFF", 65279),
]

WHITESPACE_CHARACTERS = WhitespaceCharacters(
    [
        WhitespaceChatacter(name, unicode, ascii)
        for name, unicode, ascii in ALL_WHITESPACE_CHARACTERS
    ]
)
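

# ## Illustrative sketch (assumption; not part of this module's API) ----
# One way a caller might use `ALL_WHITESPACE_CHARACTERS`: build a regex
# character class covering every listed code point, then strip the leading and
# trailing whitespace variants that PySpark's space-only `F.trim()` would miss.
# The helper name `_trim_all_whitespace` is hypothetical.
def _trim_all_whitespace(col):
    import re

    from pyspark.sql import functions as F

    # One regex character class matching any listed character, built from the
    # decimal code points in the third element of each tuple.
    ws_class = (
        "[" + "".join(re.escape(chr(a)) for _, _, a in ALL_WHITESPACE_CHARACTERS) + "]"
    )
    return F.regexp_replace(col, f"^{ws_class}+|{ws_class}+$", "")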


# ---------------------------------------------------------------------------- #
#     PySpark Types                                                         ####
# ---------------------------------------------------------------------------- #


# For a full list of valid types, see: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
VALID_PYSPARK_TYPES = list(pyspark_atomic_types.values())
VALID_PYSPARK_TYPE_NAMES: str_list = sorted(
    list(pyspark_atomic_types.keys()) + ["str", "int", "bool", "datetime"]
)
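

# ## Illustrative sketch (assumption; not part of this module's API) ----
# How a caller might validate a user-supplied type name against
# `VALID_PYSPARK_TYPE_NAMES` before casting a column. The helper name
# `_assert_valid_type_name` is hypothetical.
def _assert_valid_type_name(datatype: str) -> str:
    if datatype not in VALID_PYSPARK_TYPE_NAMES:
        raise ValueError(
            f"Invalid type name: '{datatype}'. "
            f"Must be one of: {VALID_PYSPARK_TYPE_NAMES}."
        )
    return datatype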

ALL_PYSPARK_TYPES = Union[
    T.DataType,
    T.NullType,
    T.CharType,
    T.StringType,
    T.VarcharType,
    T.BinaryType,
    T.BooleanType,
    T.DateType,
    T.TimestampType,
    T.TimestampNTZType,
    T.DecimalType,
    T.DoubleType,
    T.FloatType,
    T.ByteType,
    T.IntegerType,
    T.LongType,
    T.DayTimeIntervalType,
    T.YearMonthIntervalType,
    T.ShortType,
    T.ArrayType,
    T.MapType,
    T.StructType,
]
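

# ## Illustrative sketch (assumption; not part of this module's API) ----
# `ALL_PYSPARK_TYPES` is a `typing.Union`, so it works as a type annotation; for
# a runtime check, unpack it with `get_args()` (imported above) into the tuple
# of classes that `isinstance()` accepts. The helper name `_is_pyspark_type` is
# hypothetical.
def _is_pyspark_type(obj: object) -> bool:
    return isinstance(obj, get_args(ALL_PYSPARK_TYPES))


# e.g. `_is_pyspark_type(T.StringType())` is True; `_is_pyspark_type("string")` is False.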


VALID_PYAPARK_JOIN_TYPES = Literal[
    "inner",
    "cross",
    "outer",
    "full",
    "fullouter",
    "full_outer",
    "left",
    "leftouter",
    "left_outer",
    "right",
    "rightouter",
    "right_outer",
    "semi",
    "leftsemi",
    "left_semi",
    "anti",
    "leftanti",
    "left_anti",
]
ALL_PYSPARK_JOIN_TYPES = set(get_args(VALID_PYAPARK_JOIN_TYPES))
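

# ## Illustrative sketch (assumption; not part of this module's API) ----
# How a caller might guard `DataFrame.join()` with `ALL_PYSPARK_JOIN_TYPES`.
# The helper name `_assert_valid_join_type` is hypothetical.
def _assert_valid_join_type(how: str) -> str:
    if how not in ALL_PYSPARK_JOIN_TYPES:
        raise ValueError(
            f"Invalid join type: '{how}'. "
            f"Must be one of: {sorted(ALL_PYSPARK_JOIN_TYPES)}."
        )
    return how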


# ---------------------------------------------------------------------------- #
#     DataFrames                                                            ####
# ---------------------------------------------------------------------------- #


LITERAL_PANDAS_DATAFRAME_NAMES = Literal[
    "pandas.DataFrame",
    "pandas",
    "pd.DataFrame",
    "pd.df",
    "pddf",
    "pdDataFrame",
    "pdDF",
    "pd",
]

LITERAL_PYSPARK_DATAFRAME_NAMES = Literal[
    "spark.DataFrame",
    "pyspark.DataFrame",
    "pyspark",
    "spark",
    "ps.DataFrame",
    "ps.df",
    "psdf",
    "psDataFrame",
    "psDF",
    "ps",
]

LITERAL_NUMPY_ARRAY_NAMES = Literal[
    "numpy.array",
    "np.array",
    "np",
    "numpy",
    "nparr",
    "npa",
    "np.arr",
    "np.a",
]

LITERAL_LIST_OBJECT_NAMES = Literal["list", "lst", "l", "flat_list", "flatten_list"]

VALID_PANDAS_DATAFRAME_NAMES = set(get_args(LITERAL_PANDAS_DATAFRAME_NAMES))
VALID_PYSPARK_DATAFRAME_NAMES = set(get_args(LITERAL_PYSPARK_DATAFRAME_NAMES))
VALID_NUMPY_ARRAY_NAMES = set(get_args(LITERAL_NUMPY_ARRAY_NAMES))
VALID_LIST_OBJECT_NAMES = set(get_args(LITERAL_LIST_OBJECT_NAMES))

VALID_DATAFRAME_NAMES: str_set = VALID_PANDAS_DATAFRAME_NAMES.union(
    VALID_PYSPARK_DATAFRAME_NAMES
)
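

# ## Illustrative sketch (assumption; not part of this module's API) ----
# The `VALID_*_NAMES` sets might be used to normalise a user-supplied
# `return_type` alias, e.g. to decide whether a result should be converted
# with `.toPandas()`. The helper name `_wants_pandas` is hypothetical.
def _wants_pandas(return_type: str) -> bool:
    if return_type not in VALID_DATAFRAME_NAMES:
        raise ValueError(
            f"Invalid return type: '{return_type}'. "
            f"Must be one of: {sorted(VALID_DATAFRAME_NAMES)}."
        )
    return return_type in VALID_PANDAS_DATAFRAME_NAMES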


# ---------------------------------------------------------------------------- #
#     Other                                                                 ####
# ---------------------------------------------------------------------------- #


_DEFAULT_DEPRECATION_WARNING_CLASS = DeprecationWarning
_DEFAULT_DEPRECATION_WARNING = partial(warn, category=_DEFAULT_DEPRECATION_WARNING_CLASS)
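

# ## Illustrative sketch (assumption; not part of this module's API) ----
# `_DEFAULT_DEPRECATION_WARNING` is `warnings.warn` with its `category`
# pre-bound, so callers only supply the message (and, optionally, `stacklevel`):
#
#     _DEFAULT_DEPRECATION_WARNING(
#         "`old_name` is deprecated; use `new_name` instead.", stacklevel=2
#     )
#
# (`old_name` and `new_name` are placeholder identifiers.)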