Skip to content

Commit 72ffd10

Browse files
Merge pull request #224 from Quantco/uniques_improvements
adding configuration options to uniques functionality
2 parents efc54e3 + 62f6877 commit 72ffd10

File tree

11 files changed

+1427
-53
lines changed

11 files changed

+1427
-53
lines changed

.github/workflows/ci.yaml

+6-2
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ jobs:
262262
uses: ./.github/actions/pytest
263263
with:
264264
backend: bigquery
265-
args: -n auto tests/integration
265+
args: -n 16 -v tests/integration
266266

267267
impala-column:
268268
if: ${{ contains(github.event.pull_request.labels.*.name, 'impala') || contains(github.event.pull_request.labels.*.name, 'ready') || github.ref == 'refs/heads/main' }}
@@ -275,7 +275,11 @@ jobs:
275275
matrix:
276276
PYTHON_VERSION: [ '3.8' ]
277277
SA_VERSION: ["<2.0"]
278-
PYTEST_ARG: ["tests/integration/test_column_capitalization.py", "tests/integration/test_data_source.py", "tests/integration/test_integration.py -k row", "tests/integration/test_integration.py -k uniques", "tests/integration/test_integration.py -k date", "tests/integration/test_integration.py -k varchar", "tests/integration/test_integration.py -k numeric"]
278+
# PYTEST_ARG: ["tests/integration/test_column_capitalization.py", "tests/integration/test_data_source.py", "tests/integration/test_integration.py -k row", "tests/integration/test_integration.py -k uniques", "tests/integration/test_integration.py -k date", "tests/integration/test_integration.py -k varchar", "tests/integration/test_integration.py -k numeric"]
279+
280+
# more comprehensive matching; note that tests which start with test_i and not test_integer are not matched and must be added here
281+
282+
PYTEST_ARG: ["tests/integration/test_integration.py -k 'test_a or test_b or test_c or test_d or test_e or test_f or test_g or test_h or test_integer or test_j or test_k or test_l or test_m'", "tests/integration/test_integration.py -k 'test_n or test_o or test_p or test_q or test_r or test_s or test_t or test_u or test_v or test_w or test_x or test_y or test_z'"]
279283

280284
steps:
281285
- name: Checkout branch

run_integration_tests_postgres.sh

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
2+
3+
docker stop $(docker ps -q --filter name=postgres_datajudge)
4+
5+
./start_postgres.sh &
6+
bash -c "while true; do printf '\nPress enter once postgres is ready: '; sleep 1; done" &
7+
8+
read -p "Press enter once postgres is ready: "
9+
kill %%
10+
11+
echo "STARTING PYTEST"
12+
pytest tests/integration -vv --backend=postgres "$@"
13+
14+
docker stop $(docker ps -q --filter name=postgres_datajudge)
15+

src/datajudge/constraints/base.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
import abc
22
from dataclasses import dataclass, field
33
from functools import lru_cache
4-
from typing import Any, Callable, List, Optional, Tuple, TypeVar
4+
from typing import Any, Callable, Collection, List, Optional, Tuple, TypeVar, Union
55

66
import sqlalchemy as sa
77

88
from ..db_access import DataReference
99
from ..formatter import Formatter
10+
from ..utils import OutputProcessor, output_processor_limit
1011

1112
DEFAULT_FORMATTER = Formatter()
1213

@@ -113,7 +114,15 @@ class Constraint(abc.ABC):
113114
"""
114115

115116
def __init__(
116-
self, ref: DataReference, *, ref2=None, ref_value: Any = None, name: str = None
117+
self,
118+
ref: DataReference,
119+
*,
120+
ref2=None,
121+
ref_value: Any = None,
122+
name: str = None,
123+
output_processors: Optional[
124+
Union[OutputProcessor, List[OutputProcessor]]
125+
] = output_processor_limit,
117126
):
118127
self._check_if_valid_between_or_within(ref2, ref_value)
119128
self.ref = ref
@@ -125,6 +134,12 @@ def __init__(
125134
self.factual_queries: Optional[List[str]] = None
126135
self.target_queries: Optional[List[str]] = None
127136

137+
if (output_processors is not None) and (
138+
not isinstance(output_processors, list)
139+
):
140+
output_processors = [output_processors]
141+
self.output_processors = output_processors
142+
128143
def _check_if_valid_between_or_within(
129144
self, ref2: Optional[DataReference], ref_value: Optional[Any]
130145
):
@@ -241,6 +256,12 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
241256
target_queries,
242257
)
243258

259+
def apply_output_formatting(self, values: Collection) -> Collection:
260+
if self.output_processors is not None:
261+
for output_processor in self.output_processors:
262+
values, _ = output_processor(values)
263+
return values
264+
244265

245266
def format_sample(sample, ref: DataReference) -> str:
246267
"""Build a string from a database row indicating its column values."""

src/datajudge/constraints/miscs.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,15 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
130130
return TestResult.success()
131131

132132
assertion_text = (
133-
f"{self.ref} has violations of functional dependence, e.g.:\n"
134-
+ "\n".join([f"{tuple(violation)}" for violation in violations][:5])
133+
f"{self.ref} has violations of functional dependence (in total {len(violations)} rows):\n"
134+
+ "\n".join(
135+
[
136+
f"{violation}"
137+
for violation in self.apply_output_formatting(
138+
[tuple(elem) for elem in violations]
139+
)
140+
]
141+
)
135142
)
136143
return TestResult.failure(assertion_text)
137144

src/datajudge/constraints/uniques.py

+103-19
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import abc
2+
import warnings
23
from collections import Counter
34
from itertools import zip_longest
45
from math import ceil, floor
@@ -8,6 +9,7 @@
89

910
from .. import db_access
1011
from ..db_access import DataReference
12+
from ..utils import OutputProcessor, filternull_element, output_processor_limit
1113
from .base import Constraint, OptionalSelections, T, TestResult, ToleranceGetter
1214

1315

@@ -42,9 +44,21 @@ class Uniques(Constraint, abc.ABC):
4244
are part of a reference set of expected values - either externally supplied
4345
through parameter `uniques` or obtained from another `DataSource`.
4446
45-
Null values in the column are ignored. To assert the non-existence of them use
46-
the `NullAbsence` constraint via the `add_null_absence_constraint` helper method for
47-
`WithinRequirement`.
47+
Null values in the columns ``columns`` are ignored. To assert the non-existence of them use
48+
the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint` helper method
49+
for ``WithinRequirement``.
50+
By default, the null filtering does not trigger if multiple columns are fetched at once.
51+
It can be configured in more detail by supplying a custom ``filter_func`` function.
52+
Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
53+
:func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
54+
:func:`~datajudge.utils.filternull_element_or_tuple_any`.
55+
Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element` but triggers a warning.
56+
The current default of :func:`~datajudge.utils.filternull_element`
57+
can cause (possibly often unintended) changes in behavior when the user adds a second column
58+
(filtering no longer can trigger at all).
59+
The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
60+
To silence the warning, set ``filter_func`` explicitly.
61+
4862
4963
There are two ways to do some post processing of the data obtained from the
5064
database by providing a function to be executed. In general, no postprocessing
@@ -63,6 +77,29 @@ class Uniques(Constraint, abc.ABC):
6377
(eager or lazy) of the same type as the type of the values of the column (in their
6478
Python equivalent).
6579
80+
Furthermore, the `max_relative_violations` parameter can be used to set a tolerance
81+
threshold for the proportion of elements in the data that can violate the constraint
82+
(default: 0).
83+
Setting this argument is currently not supported for `UniquesEquality`.
84+
85+
For `UniquesSubset`, by default,
86+
the number of occurrences affects the computed fraction of violations.
87+
To disable this weighting, set `compare_distinct=True`.
88+
This argument does not have an effect on the test results for other `Uniques` constraints,
89+
or if `max_relative_violations` is 0.
90+
91+
By default, the assertion messages make use of sets,
92+
thus, they may differ from run to run despite the exact same situation being present,
93+
and can have an arbitrary length.
94+
To enforce a reproducible, limited output via (e.g.) sorting and slicing,
95+
set `output_processors` to a callable or a list of callables. By default, only the first 100 elements are displayed (:func:`~datajudge.utils.output_processor_limit`).
96+
97+
Each callable takes in two collections, and returns modified (e.g. sorted) versions of them.
98+
In most cases, the second argument is simply None,
99+
but for `UniquesSubset` it is the counts of each of the elements.
100+
The suggested functions are :func:`~datajudge.utils.output_processor_sort` and :func:`~datajudge.utils.output_processor_limit`
101+
- see their respective docstrings for details.
102+
66103
One use is of this constraint is to test for consistency in columns with expected
67104
categorical values.
68105
"""
@@ -71,26 +108,44 @@ def __init__(
71108
self,
72109
ref: DataReference,
73110
name: str = None,
111+
output_processors: Optional[
112+
Union[OutputProcessor, List[OutputProcessor]]
113+
] = output_processor_limit,
74114
*,
75115
ref2: DataReference = None,
76116
uniques: Collection = None,
117+
filter_func: Callable[[List[T]], List[T]] = None,
77118
map_func: Callable[[T], T] = None,
78119
reduce_func: Callable[[Collection], Collection] = None,
79120
max_relative_violations=0,
121+
compare_distinct=False,
80122
):
81123
ref_value: Optional[Tuple[Collection, List]]
82124
ref_value = (uniques, []) if uniques else None
83-
super().__init__(ref, ref2=ref2, ref_value=ref_value, name=name)
125+
super().__init__(
126+
ref,
127+
ref2=ref2,
128+
ref_value=ref_value,
129+
name=name,
130+
output_processors=output_processors,
131+
)
132+
133+
if filter_func is None:
134+
warnings.warn("Using deprecated default null filter function.")
135+
filter_func = filternull_element
136+
137+
self.filter_func = filter_func
84138
self.local_func = map_func
85139
self.global_func = reduce_func
86140
self.max_relative_violations = max_relative_violations
141+
self.compare_distinct = compare_distinct
87142

88143
def retrieve(
89144
self, engine: sa.engine.Engine, ref: DataReference
90145
) -> Tuple[Tuple[List[T], List[int]], OptionalSelections]:
91146
uniques, selection = db_access.get_uniques(engine, ref)
92147
values = list(uniques.keys())
93-
values = list(filter(lambda value: value is not None, values))
148+
values = self.filter_func(values)
94149
counts = [uniques[value] for value in values]
95150
if self.local_func:
96151
values = list(map(self.local_func, values))
@@ -106,7 +161,11 @@ def retrieve(
106161
class UniquesEquality(Uniques):
107162
def __init__(self, args, name: str = None, **kwargs):
108163
if kwargs.get("max_relative_violations"):
109-
raise RuntimeError("Some useful message")
164+
raise RuntimeError(
165+
"max_relative_violations is not supported for UniquesEquality."
166+
)
167+
if kwargs.get("compare_distinct"):
168+
raise RuntimeError("compare_distinct is not supported for UniquesEquality.")
110169
super().__init__(args, name=name, **kwargs)
111170

112171
def compare(
@@ -123,22 +182,22 @@ def compare(
123182
if not is_subset and not is_superset:
124183
assertion_text = (
125184
f"{self.ref} doesn't have the element(s) "
126-
f"'{lacking_values}' and has the excess element(s) "
127-
f"'{excess_values}' when compared with the reference values. "
185+
f"'{self.apply_output_formatting(lacking_values)}' and has the excess element(s) "
186+
f"'{self.apply_output_formatting(excess_values)}' when compared with the reference values. "
128187
f"{self.condition_string}"
129188
)
130189
return False, assertion_text
131190
if not is_subset:
132191
assertion_text = (
133192
f"{self.ref} has the excess element(s) "
134-
f"'{excess_values}' when compared with the reference values. "
193+
f"'{self.apply_output_formatting(excess_values)}' when compared with the reference values. "
135194
f"{self.condition_string}"
136195
)
137196
return False, assertion_text
138197
if not is_superset:
139198
assertion_text = (
140199
f"{self.ref} doesn't have the element(s) "
141-
f"'{lacking_values}' when compared with the reference values. "
200+
f"'{self.apply_output_formatting(lacking_values)}' when compared with the reference values. "
142201
f"{self.condition_string}"
143202
)
144203
return False, assertion_text
@@ -153,28 +212,49 @@ def compare(
153212
) -> Tuple[bool, Optional[str]]:
154213
factual_values, factual_counts = factual
155214
target_values, _ = target
215+
156216
is_subset, remainder = _subset_violation_counts(
157217
factual_values, factual_counts, target_values
158218
)
159-
n_rows = sum(factual_counts)
160-
n_violations = sum(remainder.values())
219+
if not self.compare_distinct:
220+
n_rows = sum(factual_counts)
221+
n_violations = sum(remainder.values())
222+
else:
223+
n_rows = len(factual_values)
224+
n_violations = len(remainder)
225+
161226
if (
162227
n_rows > 0
163228
and (relative_violations := (n_violations / n_rows))
164229
> self.max_relative_violations
165230
):
231+
output_elemes, output_counts = list(remainder.keys()), list(
232+
remainder.values()
233+
)
234+
if self.output_processors is not None:
235+
for output_processor in self.output_processors:
236+
output_elemes, output_counts = output_processor(
237+
output_elemes, output_counts
238+
)
239+
166240
assertion_text = (
167241
f"{self.ref} has a fraction of {relative_violations} > "
168-
f"{self.max_relative_violations} values not being an element of "
169-
f"'{set(target_values)}'. It has e.g. excess elements "
170-
f"'{list(remainder.keys())[:5]}'."
242+
f"{self.max_relative_violations} {'DISTINCT ' if self.compare_distinct else ''}values ({n_violations} / {n_rows}) not being an element of "
243+
f"'{self.apply_output_formatting(set(target_values))}'. It has excess elements "
244+
f"'{output_elemes}' "
245+
f"with counts {output_counts}."
171246
f"{self.condition_string}"
172247
)
173248
return False, assertion_text
174249
return True, None
175250

176251

177252
class UniquesSuperset(Uniques):
253+
def __init__(self, args, name: str = None, **kwargs):
254+
if kwargs.get("compare_distinct"):
255+
raise RuntimeError("compare_distinct is not supported for UniquesSuperset.")
256+
super().__init__(args, name=name, **kwargs)
257+
178258
def compare(
179259
self,
180260
factual: Tuple[List[T], List[int]],
@@ -185,14 +265,18 @@ def compare(
185265
is_superset, remainder = _is_superset(factual_values, target_values)
186266
if (
187267
len(factual_values) > 0
188-
and (relative_violations := (len(remainder) / len(target_values)))
268+
and (
269+
relative_violations := (
270+
(n_violations := (len(remainder))) / (n_rows := len(target_values))
271+
)
272+
)
189273
> self.max_relative_violations
190274
):
191275
assertion_text = (
192276
f"{self.ref} has a fraction of "
193-
f"{relative_violations} > {self.max_relative_violations} "
194-
f"lacking unique values of '{set(target_values)}'. E.g. it "
195-
f"doesn't have the unique value(s) '{list(remainder)[:5]}'."
277+
f"{relative_violations} > {self.max_relative_violations} ({n_violations} / {n_rows}) "
278+
f"lacking unique values of '{self.apply_output_formatting(set(target_values))}'. It "
279+
f"doesn't have the unique value(s) '{self.apply_output_formatting(list(remainder))}'."
196280
f"{self.condition_string}"
197281
)
198282
return False, assertion_text

0 commit comments

Comments
 (0)