1
1
import abc
2
+ import warnings
2
3
from collections import Counter
3
4
from itertools import zip_longest
4
5
from math import ceil , floor
@@ -35,16 +36,128 @@ def _subset_violation_counts(
35
36
return len (remainder ) == 0 , remainder
36
37
37
38
39
def util_output_postprocessing_sorter(
    collection: Collection, counts: Optional[Collection] = None
):
    """
    Sort a collection of tuples in descending order of their counts,
    and for ties, by the ascending order of the tuples themselves.

    If the first element is not an instance of tuple, each element is
    transparently packaged into a 1-tuple for processing; this packaging
    is not visible to the caller.

    Handles None values as described in `sort_tuple_none_aware`.

    Returns the sorted collection and the correspondingly reordered counts
    (or the unchanged `counts` argument when no reordering is needed).
    """
    collection = list(collection)

    # An empty collection is trivially sorted. This early return also guards
    # the collection[0] probe below, which previously raised IndexError.
    if not collection:
        return collection, counts

    if not isinstance(collection[0], tuple):
        # Package each element into a 1-tuple and pass into the method again.
        packaged_list = [(elem,) for elem in collection]
        res_main, res_counts = util_output_postprocessing_sorter(packaged_list, counts)
        return [elem[0] for elem in res_main], res_counts

    if counts is None:
        return sort_tuple_none_aware(collection), counts

    assert len(collection) == len(
        counts
    ), "collection and counts must have the same length"

    if len(collection) <= 1:
        return collection, counts  # empty or 1 element lists are always sorted

    # Prepend the negated count to each tuple: an ascending, None-aware sort
    # then yields descending counts with ascending tie-breaking on the values.
    lst = sort_tuple_none_aware(
        [(-count, *elem) for count, elem in zip(counts, collection)]
    )
    return [elem[1:] for elem in lst], [-elem[0] for elem in lst]
73
+
74
+
75
def util_filternull_default_deprecated(values: List[T]) -> List[T]:
    """Drop all entries that are exactly ``None`` (legacy default null filter).

    Tuples containing ``None`` entries are kept untouched.
    """
    return [value for value in values if value is not None]
77
+
78
+
79
def util_filternull_never(values: List[T]) -> List[T]:
    """Null filter that performs no filtering at all.

    Every value is kept, including ``None`` entries and tuples that
    contain ``None``.
    """
    return values
81
+
82
+
83
def util_filternull_element_or_tuple_all(values: List[T]) -> List[T]:
    """Drop ``None`` values as well as tuples whose entries are all ``None``.

    Tuples with at least one non-``None`` entry are kept.
    """
    kept = []
    for value in values:
        if value is None:
            continue
        if isinstance(value, tuple) and all(entry is None for entry in value):
            continue
        kept.append(value)
    return kept
91
+
92
+
93
def util_filternull_element_or_tuple_any(values: List[T]) -> List[T]:
    """Drop ``None`` values as well as tuples containing any ``None`` entry.

    Only tuples that are entirely ``None``-free are kept.
    """
    kept = []
    for value in values:
        if value is None:
            continue
        if isinstance(value, tuple) and any(entry is None for entry in value):
            continue
        kept.append(value)
    return kept
101
+
102
+
103
def sort_tuple_none_aware(collection: Collection[Tuple], ascending=True):
    """
    Sort a collection of equal-length tuples, treating ``None`` in any
    position as the default value of that position's type: ``int()`` yields
    ``0``, ``float()`` yields ``0.0``, ``str()`` yields ``''``, and so on.

    The per-position constructor is obtained by calling ``type()`` on the
    first non-``None`` entry found in that position; positions consisting
    solely of ``None`` fall back to ``int`` (a constant ``0``).

    Requires that all elements are tuples of identical length. The sort is
    stable.
    """
    items = list(collection)

    # Empty or single-element input is already sorted.
    if len(items) <= 1:
        return items

    assert all(
        isinstance(item, tuple) and len(item) == len(items[0]) for item in items
    ), "all elements must be tuples and have the same length"

    # Determine one surrogate constructor per tuple position.
    constructors: List[type] = []
    for position in range(len(items[0])):
        first_non_none = next(
            (item[position] for item in items if item[position] is not None), None
        )
        # A column that is entirely None gets a constant int() == 0.
        constructors.append(int if first_non_none is None else type(first_non_none))

    def surrogate_key(item):
        # Replace each None with the default value of its column's type.
        return tuple(
            ctor() if entry is None else entry
            for ctor, entry in zip(constructors, item)
        )

    return sorted(items, key=surrogate_key, reverse=not ascending)
140
+
141
+
38
142
class Uniques (Constraint , abc .ABC ):
39
143
"""Uniques is an abstract class for comparisons between unique values of a column and a reference.
40
144
41
145
The `Uniques` constraint asserts if the values contained in a column of a `DataSource`
42
146
are part of a reference set of expected values - either externally supplied
43
147
through parameter `uniques` or obtained from another `DataSource`.
44
148
45
- Null values in the column are ignored. To assert the non-existence of them use
149
+ Null values in the column are ignored by default . To assert the non-existence of them use
46
150
the `NullAbsence` constraint via the `add_null_absence_constraint` helper method for
47
151
`WithinRequirement`.
152
+ By default, the null filtering does not trigger if multiple columns are fetched at once.
153
+ It can be configured in more detail by supplying a custom `filter_func` function.
154
+ Some exemplary implementations are available in this module as `util_filternull_default_deprecated`,
155
+ `util_filternull_never`, `util_filternull_element_or_tuple_all`, `util_filternull_element_or_tuple_any`.
156
+ For new deployments, using one of the above filters or a custom one is recommended.
157
+ Passing None as the argument is equivalent to `util_filternull_default_deprecated`, but triggers a warning.
158
+ The deprecated default may change in future versions.
159
+ To silence the warning, set `filter_func` explicitly.
160
+
48
161
49
162
There are two ways to do some post processing of the data obtained from the
50
163
database by providing a function to be executed. In general, no postprocessing
@@ -63,6 +176,31 @@ class Uniques(Constraint, abc.ABC):
63
176
(eager or lazy) of the same type as the type of the values of the column (in their
64
177
Python equivalent).
65
178
179
+ Furthermore, the `max_relative_violations` parameter can be used to set a tolerance
180
+ threshold for the proportion of elements in the data that can violate the constraint
181
+ (default: 0).
182
+ Setting this argument is currently not supported for `UniquesEquality`.
183
+
184
+ For `UniquesSubset`, by default,
185
+ the number of occurrences affects the computed fraction of violations.
186
+ To disable this weighting, set `compare_distinct=True`.
187
+ This argument does not have an effect on the test results for other `Uniques` constraints,
188
+ or if `max_relative_violations` is 0.
189
+
190
+ By default, the assertion messages make use of sets,
191
+ thus, they may differ from run to run despite the exact same situation being present.
192
+ To enforce a reproducible output via (e.g.) sorting, set `output_postprocessing_sorter` to a callable
193
+ which takes in two collections, and returns modified (e.g. sorted) versions of them.
194
+ In most cases, the second argument is simply None,
195
+ but for `UniquesSubset` it is the counts of each of the elements.
196
+ The suggested function is `util_output_postprocessing_sorter` from this file,
197
+ - see its documentation for details.
198
+
199
+ By default, the number of subset or superset remainders (excess or missing values)
200
+ for `UniquesSubset` and `UniquesSuperset` is sliced by [:5] (i.e. the first 5) in the assertion message.
201
+ This can be configured using `output_remainder_slicer`.
202
+ This argument does not have an effect for `UniquesEquality`.
203
+
66
204
One use of this constraint is to test for consistency in columns with expected
67
205
categorical values.
68
206
"""
@@ -74,23 +212,51 @@ def __init__(
74
212
* ,
75
213
ref2 : DataReference = None ,
76
214
uniques : Collection = None ,
215
+ filter_func : Callable [[List [T ]], List [T ]] = None ,
77
216
map_func : Callable [[T ], T ] = None ,
78
217
reduce_func : Callable [[Collection ], Collection ] = None ,
79
218
max_relative_violations = 0 ,
219
+ compare_distinct = False ,
220
+ output_postprocessing_sorter : Callable [
221
+ [Collection , Optional [Collection ]], Collection
222
+ ] = None ,
223
+ output_remainder_slicer : slice = slice (5 ),
80
224
):
81
225
ref_value : Optional [Tuple [Collection , List ]]
82
226
ref_value = (uniques , []) if uniques else None
83
227
super ().__init__ (ref , ref2 = ref2 , ref_value = ref_value , name = name )
228
+
229
+ if filter_func is None :
230
+ warnings .warn (
231
+ "Using deprecated default null filter function. "
232
+ "Set filter_func explicitly to disable this warning."
233
+ )
234
+ filter_func = util_filternull_default_deprecated
235
+
236
+ self .filter_func = filter_func
84
237
self .local_func = map_func
85
238
self .global_func = reduce_func
86
239
self .max_relative_violations = max_relative_violations
240
+ self .compare_distinct = compare_distinct
241
+ self .output_postprocessing_sorter = output_postprocessing_sorter
242
+ self .output_remainder_slicer = output_remainder_slicer
243
+
244
def apply_output_formatting_no_counts(
    self, values: Collection[T], apply_remainder_limit=False
) -> Collection[T]:
    """
    Apply the configured output post-processing to `values`.

    When an `output_postprocessing_sorter` is configured, it is invoked with
    the values alone and its counts result is discarded. When
    `apply_remainder_limit` is true, the result is additionally sliced by
    `output_remainder_slicer`.
    """
    sorter = self.output_postprocessing_sorter
    if sorter is not None:
        values, _ = sorter(values)  # type: ignore[call-arg]
    if apply_remainder_limit:
        values = list(values)[self.output_remainder_slicer]
    return values
87
253
88
254
def retrieve (
89
255
self , engine : sa .engine .Engine , ref : DataReference
90
256
) -> Tuple [Tuple [List [T ], List [int ]], OptionalSelections ]:
91
257
uniques , selection = db_access .get_uniques (engine , ref )
92
258
values = list (uniques .keys ())
93
- values = list ( filter ( lambda value : value is not None , values ) )
259
+ values = self . filter_func ( values )
94
260
counts = [uniques [value ] for value in values ]
95
261
if self .local_func :
96
262
values = list (map (self .local_func , values ))
@@ -106,7 +272,11 @@ def retrieve(
106
272
class UniquesEquality (Uniques ):
107
273
def __init__(self, args, name: str = None, **kwargs):
    """Initialize, rejecting Uniques options unsupported by UniquesEquality."""
    rejected_options = (
        (
            "max_relative_violations",
            "max_relative_violations is not supported for UniquesEquality.",
        ),
        (
            "compare_distinct",
            "compare_distinct is not supported for UniquesEquality.",
        ),
    )
    for option, message in rejected_options:
        if kwargs.get(option):
            raise RuntimeError(message)
    super().__init__(args, name=name, **kwargs)
111
281
112
282
def compare (
@@ -123,22 +293,22 @@ def compare(
123
293
if not is_subset and not is_superset :
124
294
assertion_text = (
125
295
f"{ self .ref } doesn't have the element(s) "
126
- f"'{ lacking_values } ' and has the excess element(s) "
127
- f"'{ excess_values } ' when compared with the reference values. "
296
+ f"'{ self . apply_output_formatting_no_counts ( lacking_values ) } ' and has the excess element(s) "
297
+ f"'{ self . apply_output_formatting_no_counts ( excess_values ) } ' when compared with the reference values. "
128
298
f"{ self .condition_string } "
129
299
)
130
300
return False , assertion_text
131
301
if not is_subset :
132
302
assertion_text = (
133
303
f"{ self .ref } has the excess element(s) "
134
- f"'{ excess_values } ' when compared with the reference values. "
304
+ f"'{ self . apply_output_formatting_no_counts ( excess_values ) } ' when compared with the reference values. "
135
305
f"{ self .condition_string } "
136
306
)
137
307
return False , assertion_text
138
308
if not is_superset :
139
309
assertion_text = (
140
310
f"{ self .ref } doesn't have the element(s) "
141
- f"'{ lacking_values } ' when compared with the reference values. "
311
+ f"'{ self . apply_output_formatting_no_counts ( lacking_values ) } ' when compared with the reference values. "
142
312
f"{ self .condition_string } "
143
313
)
144
314
return False , assertion_text
@@ -153,28 +323,50 @@ def compare(
153
323
) -> Tuple [bool , Optional [str ]]:
154
324
factual_values , factual_counts = factual
155
325
target_values , _ = target
326
+
156
327
is_subset , remainder = _subset_violation_counts (
157
328
factual_values , factual_counts , target_values
158
329
)
159
- n_rows = sum (factual_counts )
160
- n_violations = sum (remainder .values ())
330
+ if not self .compare_distinct :
331
+ n_rows = sum (factual_counts )
332
+ n_violations = sum (remainder .values ())
333
+ else :
334
+ n_rows = len (factual_values )
335
+ n_violations = len (remainder )
336
+
161
337
if (
162
338
n_rows > 0
163
339
and (relative_violations := (n_violations / n_rows ))
164
340
> self .max_relative_violations
165
341
):
342
+ output_elemes , output_counts = list (remainder .keys ()), list (
343
+ remainder .values ()
344
+ )
345
+ if self .output_postprocessing_sorter is not None :
346
+ output_elemes , output_counts = self .output_postprocessing_sorter (
347
+ output_elemes , output_counts
348
+ )
349
+ output_elemes = output_elemes [self .output_remainder_slicer ]
350
+ output_counts = output_counts [self .output_remainder_slicer ]
351
+
166
352
assertion_text = (
167
353
f"{ self .ref } has a fraction of { relative_violations } > "
168
- f"{ self .max_relative_violations } values not being an element of "
169
- f"'{ set (target_values )} '. It has e.g. excess elements "
170
- f"'{ list (remainder .keys ())[:5 ]} '."
354
+ f"{ self .max_relative_violations } { 'DISTINCT ' if self .compare_distinct else '' } values ({ n_violations } / { n_rows } ) not being an element of "
355
+ f"'{ self .apply_output_formatting_no_counts (set (target_values ))} '. It has e.g. ({ self .output_remainder_slicer } ) excess elements "
356
+ f"'{ output_elemes } ' "
357
+ f"with counts { output_counts } ."
171
358
f"{ self .condition_string } "
172
359
)
173
360
return False , assertion_text
174
361
return True , None
175
362
176
363
177
364
class UniquesSuperset (Uniques ):
365
def __init__(self, args, name: str = None, **kwargs):
    """Initialize, rejecting the compare_distinct option, which has no
    effect for superset checks."""
    compare_distinct_requested = kwargs.get("compare_distinct")
    if compare_distinct_requested:
        raise RuntimeError("compare_distinct is not supported for UniquesSuperset.")
    super().__init__(args, name=name, **kwargs)
369
+
178
370
def compare (
179
371
self ,
180
372
factual : Tuple [List [T ], List [int ]],
@@ -185,14 +377,18 @@ def compare(
185
377
is_superset , remainder = _is_superset (factual_values , target_values )
186
378
if (
187
379
len (factual_values ) > 0
188
- and (relative_violations := (len (remainder ) / len (target_values )))
380
+ and (
381
+ relative_violations := (
382
+ (n_violations := (len (remainder ))) / (n_rows := len (target_values ))
383
+ )
384
+ )
189
385
> self .max_relative_violations
190
386
):
191
387
assertion_text = (
192
388
f"{ self .ref } has a fraction of "
193
- f"{ relative_violations } > { self .max_relative_violations } "
194
- f"lacking unique values of '{ set (target_values )} '. E.g. it "
195
- f"doesn't have the unique value(s) '{ list (remainder )[: 5 ] } '."
389
+ f"{ relative_violations } > { self .max_relative_violations } ( { n_violations } / { n_rows } ) "
390
+ f"lacking unique values of '{ self . apply_output_formatting_no_counts ( set (target_values )) } '. E.g. ( { self . output_remainder_slicer } ) it "
391
+ f"doesn't have the unique value(s) '{ self . apply_output_formatting_no_counts ( list (remainder ), apply_remainder_limit = True ) } '."
196
392
f"{ self .condition_string } "
197
393
)
198
394
return False , assertion_text
0 commit comments