Skip to content

Commit 9ea8866

Browse files
committed
adding configuration options to uniques functionality
1 parent efc54e3 commit 9ea8866

File tree

3 files changed

+1206
-23
lines changed

3 files changed

+1206
-23
lines changed

src/datajudge/constraints/uniques.py

+212-16
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import abc
2+
import warnings
23
from collections import Counter
34
from itertools import zip_longest
45
from math import ceil, floor
@@ -35,16 +36,128 @@ def _subset_violation_counts(
3536
return len(remainder) == 0, remainder
3637

3738

39+
def util_output_postprocessing_sorter(
    collection: Collection, counts: Optional[Collection] = None
):
    """
    Sort elements by descending count, breaking ties by ascending element order.

    If the first element is not a tuple, every element is wrapped into a 1-tuple
    for processing and unwrapped again before returning, so the caller gets the
    elements back in the shape they were passed in.

    If `counts` is None, the elements are only sorted in ascending order and
    None is returned in place of the counts.

    None values inside the elements are ordered by `sort_tuple_none_aware`.

    Returns a pair `(sorted_elements, sorted_counts)`.
    Raises an AssertionError if `counts` is given but its length differs from
    the length of `collection`.
    """
    collection = list(collection)

    if counts is not None:
        assert len(collection) == len(
            counts
        ), "collection and counts must have the same length"

    # Empty or 1-element inputs are always sorted. Returning here also avoids
    # indexing into an empty list when probing the element type below.
    if len(collection) <= 1:
        return collection, counts

    # Plain (non-tuple) elements are processed as 1-tuples and unwrapped at the end.
    is_wrapped = not isinstance(collection[0], tuple)
    rows = [(elem,) for elem in collection] if is_wrapped else collection

    if counts is None:
        ordered_rows = sort_tuple_none_aware(rows)
        ordered_counts = counts
    else:
        # Prefix each row with its negated count: an ascending sort then yields
        # descending counts, with ties resolved by the row's own ascending order.
        keyed = sort_tuple_none_aware(
            [(-count, *row) for count, row in zip(counts, rows)]
        )
        ordered_rows = [entry[1:] for entry in keyed]
        ordered_counts = [-entry[0] for entry in keyed]

    if is_wrapped:
        ordered_rows = [row[0] for row in ordered_rows]
    return ordered_rows, ordered_counts
73+
74+
75+
def util_filternull_default_deprecated(values: List[T]) -> List[T]:
    """
    Return a new list containing every entry of `values` that is not None.

    Only top-level None entries are removed; tuples are kept as they are,
    even if they contain None.
    """
    return [entry for entry in values if entry is not None]
77+
78+
79+
def util_filternull_never(values: List[T]) -> List[T]:
    """
    Return `values` unchanged, i.e. do not filter out any entries.

    The very same list object is handed back; no copy is made.
    """
    return values
81+
82+
83+
def util_filternull_element_or_tuple_all(values: List[T]) -> List[T]:
    """
    Return a new list without None entries and without tuples made up only of None.

    A tuple is kept as soon as at least one of its components is not None.
    An empty tuple has no such component and is therefore dropped as well.
    """

    def keep(entry) -> bool:
        if entry is None:
            return False
        if not isinstance(entry, tuple):
            return True
        return any(component is not None for component in entry)

    return [entry for entry in values if keep(entry)]
91+
92+
93+
def util_filternull_element_or_tuple_any(values: List[T]) -> List[T]:
    """
    Return a new list without None entries and without tuples containing any None.

    A tuple is kept only if every one of its components is not None.
    An empty tuple contains no None and is therefore kept.
    """

    def keep(entry) -> bool:
        if entry is None:
            return False
        if not isinstance(entry, tuple):
            return True
        return all(component is not None for component in entry)

    return [entry for entry in values if keep(entry)]
101+
102+
103+
def sort_tuple_none_aware(collection: Collection[Tuple], ascending=True):
    """
    Sort a collection of equally long tuples, tolerating `None` components.

    Within each tuple position ("column"), `None` is treated like the default
    value of that column's type: for ints/floats `int()`/`float()` yield `0`/`0.0`,
    for strings `str()` yields `''`.
    The type is taken from the first non-`None` value found in the column.
    If a column holds only `None`, the constant `0` is used.

    If the column's type cannot be constructed without arguments
    (e.g. `datetime.date`), there is no default value to substitute;
    in that case `None` is ordered before all other values of that column.

    Inputs with at most one element are returned as a list without any checks.
    Otherwise all elements must be tuples of the same length,
    which is enforced with an assertion.

    Set `ascending=False` to reverse the resulting order.
    """
    lst = list(collection)

    if len(lst) <= 1:
        return lst  # empty or 1 element lists are always sorted

    assert all(
        isinstance(elem, tuple) and len(elem) == len(lst[0]) for elem in lst
    ), "all elements must be tuples and have the same length"

    # Marker for columns whose type has no zero-argument constructor.
    no_default = object()

    # Determine the None replacement once per column instead of once per element.
    column_defaults = []
    for column in zip(*lst):
        sample = next((value for value in column if value is not None), None)
        if sample is None:
            # if all entries are None, just use a constant int() == 0
            column_defaults.append(0)
            continue
        try:
            column_defaults.append(type(sample)())
        except TypeError:
            column_defaults.append(no_default)

    def sort_key(elem):
        key = []
        for default, value in zip(column_defaults, elem):
            if default is no_default:
                # (False, None) sorts before (True, value); the flag decides the
                # comparison, so None is never compared against a real value.
                key.append((value is not None, value))
            else:
                key.append(default if value is None else value)
        return tuple(key)

    return sorted(lst, key=sort_key, reverse=not ascending)
140+
141+
38142
class Uniques(Constraint, abc.ABC):
39143
"""Uniques is an abstract class for comparisons between unique values of a column and a reference.
40144
41145
The `Uniques` constraint asserts if the values contained in a column of a `DataSource`
42146
are part of a reference set of expected values - either externally supplied
43147
through parameter `uniques` or obtained from another `DataSource`.
44148
45-
Null values in the column are ignored. To assert the non-existence of them use
149+
Null values in the column are ignored by default. To assert the non-existence of them use
46150
the `NullAbsence` constraint via the `add_null_absence_constraint` helper method for
47151
`WithinRequirement`.
152+
By default, the null filtering does not trigger if multiple columns are fetched at once.
153+
It can be configured in more detail by supplying a custom `filter_func` function.
154+
Some exemplary implementations are available in this module as `util_filternull_default_deprecated`,
155+
`util_filternull_never`, `util_filternull_element_or_tuple_all`, `util_filternull_element_or_tuple_any`.
156+
For new deployments, using one of the above filters or a custom one is recommended.
157+
Passing None as the argument is equivalent to `util_filternull_default_deprecated`, but triggers a warning.
158+
The deprecated default may change in future versions.
159+
To silence the warning, set `filter_func` explicitly.
160+
48161
49162
There are two ways to do some post processing of the data obtained from the
50163
database by providing a function to be executed. In general, no postprocessing
@@ -63,6 +176,31 @@ class Uniques(Constraint, abc.ABC):
63176
(eager or lazy) of the same type as the type of the values of the column (in their
64177
Python equivalent).
65178
179+
Furthermore, the `max_relative_violations` parameter can be used to set a tolerance
180+
threshold for the proportion of elements in the data that can violate the constraint
181+
(default: 0).
182+
Setting this argument is currently not supported for `UniquesEquality`.
183+
184+
For `UniquesSubset`, by default,
185+
the number of occurrences affects the computed fraction of violations.
186+
To disable this weighting, set `compare_distinct=True`.
187+
This argument does not have an effect on the test results for other `Uniques` constraints,
188+
or if `max_relative_violations` is 0.
189+
190+
By default, the assertion messages make use of sets,
191+
thus, they may differ from run to run despite the exact same situation being present.
192+
To enforce a reproducible output via (e.g.) sorting, set `output_postprocessing_sorter` to a callable
193+
which takes in two collections, and returns modified (e.g. sorted) versions of them.
194+
In most cases, the second argument is simply None,
195+
but for `UniquesSubset` it is the counts of each of the elements.
196+
The suggested function is `util_output_postprocessing_sorter` from this file,
197+
- see its documentation for details.
198+
199+
By default, the number of subset or superset remainders (excess or missing values)
200+
for `UniquesSubset` and `UniquesSuperset` is sliced by [:5] (i.e. the first 5) in the assertion message.
201+
This can be configured using `output_remainder_slicer`.
202+
This argument does not have an effect for `UniquesEquality`.
203+
66204
One use of this constraint is to test for consistency in columns with expected
67205
categorical values.
68206
"""
@@ -74,23 +212,51 @@ def __init__(
74212
*,
75213
ref2: DataReference = None,
76214
uniques: Collection = None,
215+
filter_func: Callable[[List[T]], List[T]] = None,
77216
map_func: Callable[[T], T] = None,
78217
reduce_func: Callable[[Collection], Collection] = None,
79218
max_relative_violations=0,
219+
compare_distinct=False,
220+
output_postprocessing_sorter: Callable[
221+
[Collection, Optional[Collection]], Collection
222+
] = None,
223+
output_remainder_slicer: slice = slice(5),
80224
):
81225
ref_value: Optional[Tuple[Collection, List]]
82226
ref_value = (uniques, []) if uniques else None
83227
super().__init__(ref, ref2=ref2, ref_value=ref_value, name=name)
228+
229+
if filter_func is None:
230+
warnings.warn(
231+
"Using deprecated default null filter function. "
232+
"Set filter_func explicitly to disable this warning."
233+
)
234+
filter_func = util_filternull_default_deprecated
235+
236+
self.filter_func = filter_func
84237
self.local_func = map_func
85238
self.global_func = reduce_func
86239
self.max_relative_violations = max_relative_violations
240+
self.compare_distinct = compare_distinct
241+
self.output_postprocessing_sorter = output_postprocessing_sorter
242+
self.output_remainder_slicer = output_remainder_slicer
243+
244+
def apply_output_formatting_no_counts(
    self, values: Collection[T], apply_remainder_limit=False
) -> Collection[T]:
    """
    Format `values` for use in an assertion message when no counts are available.

    If `self.output_postprocessing_sorter` is set, it is called with `values`
    and `None` in place of the counts; the first item of its result replaces
    `values`. If `apply_remainder_limit` is true, the values are converted to
    a list and sliced with `self.output_remainder_slicer`.

    Without a sorter and without the remainder limit, `values` is returned as is.
    """
    if self.output_postprocessing_sorter is not None:
        # The sorter is specified as a callable of two collections (values, counts).
        # Pass None for the counts explicitly so that user-supplied callables
        # with two required parameters work, not only ones with a default.
        values, _ = self.output_postprocessing_sorter(values, None)
    if apply_remainder_limit:
        values = list(values)[self.output_remainder_slicer]
    return values
87253

88254
def retrieve(
89255
self, engine: sa.engine.Engine, ref: DataReference
90256
) -> Tuple[Tuple[List[T], List[int]], OptionalSelections]:
91257
uniques, selection = db_access.get_uniques(engine, ref)
92258
values = list(uniques.keys())
93-
values = list(filter(lambda value: value is not None, values))
259+
values = self.filter_func(values)
94260
counts = [uniques[value] for value in values]
95261
if self.local_func:
96262
values = list(map(self.local_func, values))
@@ -106,7 +272,11 @@ def retrieve(
106272
class UniquesEquality(Uniques):
107273
def __init__(self, args, name: str = None, **kwargs):
    """
    Reject options that are unsupported here, then delegate to the parent initializer.

    Raises a RuntimeError if a truthy `max_relative_violations` or
    `compare_distinct` keyword argument is passed.
    """
    # Checked in this order so that the first offending option is reported.
    for option in ("max_relative_violations", "compare_distinct"):
        if kwargs.get(option):
            raise RuntimeError(f"{option} is not supported for UniquesEquality.")
    super().__init__(args, name=name, **kwargs)
111281

112282
def compare(
@@ -123,22 +293,22 @@ def compare(
123293
if not is_subset and not is_superset:
124294
assertion_text = (
125295
f"{self.ref} doesn't have the element(s) "
126-
f"'{lacking_values}' and has the excess element(s) "
127-
f"'{excess_values}' when compared with the reference values. "
296+
f"'{self.apply_output_formatting_no_counts(lacking_values)}' and has the excess element(s) "
297+
f"'{self.apply_output_formatting_no_counts(excess_values)}' when compared with the reference values. "
128298
f"{self.condition_string}"
129299
)
130300
return False, assertion_text
131301
if not is_subset:
132302
assertion_text = (
133303
f"{self.ref} has the excess element(s) "
134-
f"'{excess_values}' when compared with the reference values. "
304+
f"'{self.apply_output_formatting_no_counts(excess_values)}' when compared with the reference values. "
135305
f"{self.condition_string}"
136306
)
137307
return False, assertion_text
138308
if not is_superset:
139309
assertion_text = (
140310
f"{self.ref} doesn't have the element(s) "
141-
f"'{lacking_values}' when compared with the reference values. "
311+
f"'{self.apply_output_formatting_no_counts(lacking_values)}' when compared with the reference values. "
142312
f"{self.condition_string}"
143313
)
144314
return False, assertion_text
@@ -153,28 +323,50 @@ def compare(
153323
) -> Tuple[bool, Optional[str]]:
154324
factual_values, factual_counts = factual
155325
target_values, _ = target
326+
156327
is_subset, remainder = _subset_violation_counts(
157328
factual_values, factual_counts, target_values
158329
)
159-
n_rows = sum(factual_counts)
160-
n_violations = sum(remainder.values())
330+
if not self.compare_distinct:
331+
n_rows = sum(factual_counts)
332+
n_violations = sum(remainder.values())
333+
else:
334+
n_rows = len(factual_values)
335+
n_violations = len(remainder)
336+
161337
if (
162338
n_rows > 0
163339
and (relative_violations := (n_violations / n_rows))
164340
> self.max_relative_violations
165341
):
342+
output_elemes, output_counts = list(remainder.keys()), list(
343+
remainder.values()
344+
)
345+
if self.output_postprocessing_sorter is not None:
346+
output_elemes, output_counts = self.output_postprocessing_sorter(
347+
output_elemes, output_counts
348+
)
349+
output_elemes = output_elemes[self.output_remainder_slicer]
350+
output_counts = output_counts[self.output_remainder_slicer]
351+
166352
assertion_text = (
167353
f"{self.ref} has a fraction of {relative_violations} > "
168-
f"{self.max_relative_violations} values not being an element of "
169-
f"'{set(target_values)}'. It has e.g. excess elements "
170-
f"'{list(remainder.keys())[:5]}'."
354+
f"{self.max_relative_violations} {'DISTINCT ' if self.compare_distinct else ''}values ({n_violations} / {n_rows}) not being an element of "
355+
f"'{self.apply_output_formatting_no_counts(set(target_values))}'. It has e.g. ({self.output_remainder_slicer}) excess elements "
356+
f"'{output_elemes}' "
357+
f"with counts {output_counts}."
171358
f"{self.condition_string}"
172359
)
173360
return False, assertion_text
174361
return True, None
175362

176363

177364
class UniquesSuperset(Uniques):
365+
def __init__(self, args, name: str = None, **kwargs):
    """
    Reject `compare_distinct`, then delegate to the parent initializer.

    Raises a RuntimeError if a truthy `compare_distinct` keyword argument is passed.
    """
    wants_compare_distinct = bool(kwargs.get("compare_distinct"))
    if wants_compare_distinct:
        raise RuntimeError("compare_distinct is not supported for UniquesSuperset.")
    super().__init__(args, name=name, **kwargs)
369+
178370
def compare(
179371
self,
180372
factual: Tuple[List[T], List[int]],
@@ -185,14 +377,18 @@ def compare(
185377
is_superset, remainder = _is_superset(factual_values, target_values)
186378
if (
187379
len(factual_values) > 0
188-
and (relative_violations := (len(remainder) / len(target_values)))
380+
and (
381+
relative_violations := (
382+
(n_violations := (len(remainder))) / (n_rows := len(target_values))
383+
)
384+
)
189385
> self.max_relative_violations
190386
):
191387
assertion_text = (
192388
f"{self.ref} has a fraction of "
193-
f"{relative_violations} > {self.max_relative_violations} "
194-
f"lacking unique values of '{set(target_values)}'. E.g. it "
195-
f"doesn't have the unique value(s) '{list(remainder)[:5]}'."
389+
f"{relative_violations} > {self.max_relative_violations} ({n_violations} / {n_rows}) "
390+
f"lacking unique values of '{self.apply_output_formatting_no_counts(set(target_values))}'. E.g. ({self.output_remainder_slicer}) it "
391+
f"doesn't have the unique value(s) '{self.apply_output_formatting_no_counts(list(remainder), apply_remainder_limit=True)}'."
196392
f"{self.condition_string}"
197393
)
198394
return False, assertion_text

0 commit comments

Comments
 (0)