Skip to content

Commit 91ea51c

Browse files
committed
add output_processor_limit
1 parent c1fec1a commit 91ea51c

File tree

5 files changed

+115
-7
lines changed

5 files changed

+115
-7
lines changed

src/datajudge/constraints/uniques.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,17 @@ class Uniques(Constraint, abc.ABC):
8686
or if `max_relative_violations` is 0.
8787
8888
By default, the assertion messages make use of sets,
89-
thus, they may differ from run to run despite the exact same situation being present.
90-
To enforce a reproducible output via (e.g.) sorting, set `output_processor` to a callable
91-
which takes in two collections, and returns modified (e.g. sorted) versions of them.
89+
thus, they may differ from run to run despite the exact same situation being present,
90+
and can have an arbitrary length.
91+
To enforce a reproducible, limited output via (e.g.) sorting and slicing,
92+
set `output_processors` to a list of callables.
93+
94+
Each callable takes in two collections, and returns modified (e.g. sorted) versions of them.
9295
In most cases, the second argument is simply None,
9396
but for `UniquesSubset` it is the counts of each of the elements.
94-
The suggested function is ``datajudge.utils.output_processor_sort`` from this file,
95-
- see its documentation for details.
97+
The suggested functions are ``datajudge.utils.output_processor_sort``
98+
and ``datajudge.utils.output_processor_limit``
99+
- see their respective docstrings for details.
96100
97101
One use of this constraint is to test for consistency in columns with expected
98102
categorical values.

src/datajudge/requirements.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ def add_uniques_equality_constraint(
287287
To silence the warning, set ``filter_func`` explicitly.
288288
289289
See the ``Uniques`` class for further parameter details on ``map_func``,
290-
``reduce_func``, and ``output_processor``.
290+
``reduce_func``, and ``output_processors``.
291291
"""
292292

293293
ref = DataReference(self.data_source, columns, condition)
@@ -1497,7 +1497,7 @@ def add_uniques_equality_constraint(
14971497
To silence the warning, set ``filter_func`` explicitly.
14981498
14991499
See the ``Uniques`` class for further parameter details on ``map_func``,
1500-
``reduce_func``, and ``output_processor``.
1500+
``reduce_func``, and ``output_processors``.
15011501
"""
15021502

15031503
ref = DataReference(self.data_source, columns1, condition1)

src/datajudge/utils.py

+26
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,32 @@ def output_processor_sort(
8888
return [elem[1:] for elem in lst], [-elem[0] for elem in lst]
8989

9090

91+
def output_processor_limit(
    collection: Collection, counts: Optional[Collection] = None, limit: int = 100
) -> "Tuple[Collection, Optional[Collection]]":
    """
    Limit ``collection`` (and ``counts``, if given) to the first ``limit`` elements.

    If the collection was shortened, a ``limit+1``-th string element is appended
    to the returned collection (and to the returned counts, if counts were given),
    informing the user of the truncation.
    The default limit of ``100`` can be adjusted using ``functools.partial``.

    Returns a tuple ``(collection, counts)``. Both entries are new lists;
    ``counts`` is ``None`` if no counts were passed in.

    Raises ``ValueError`` if ``limit`` is negative.
    """
    # The return annotation is quoted so that it is never evaluated at runtime.
    # A negative limit would slice elements off the *end* of the collection and
    # produce a misleading "first -1 / N" notice, so reject it explicitly.
    if limit < 0:
        raise ValueError(f"limit must be non-negative, got {limit}")

    collection = list(collection)
    total = len(collection)

    ret_collection = collection[:limit]
    ret_counts = None if counts is None else list(counts)[:limit]
    # Whether output was shortened is decided by the collection alone;
    # the counts only receive a notice alongside it.
    if total > limit:
        ret_collection.append(
            f"<SHORTENED OUTPUT, displaying the first {limit} / {total} elements above>"
        )
        if ret_counts is not None:
            ret_counts.append(
                f"<SHORTENED OUTPUT, displaying the first {limit} / {total} counts above>"
            )

    return ret_collection, ret_counts
115+
116+
91117
def filternull_element(values: List) -> List:
    """Return a new list containing every entry of ``values`` that is not ``None``."""
    return list(filter(lambda value: value is not None, values))
93119

tests/integration/conftest.py

+12
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,18 @@ def unique_table2(engine, metadata):
716716
return TEST_DB_NAME, SCHEMA, table_name
717717

718718

719+
@pytest.fixture(scope="module")
def unique_table_extralong(engine, metadata):
    """
    Create the ``unique_table_extralong`` table and return its location.

    The table has 12345 rows; for row index ``i``, ``col_int`` is ``i // 2``
    and ``col_varchar`` is ``"hi"`` followed by ``i // 3``.
    Returns the tuple ``(TEST_DB_NAME, SCHEMA, table_name)``.
    """
    table_name = "unique_table_extralong"
    int_column = sa.Column("col_int", sa.Integer())
    varchar_column = sa.Column("col_varchar", _string_column(engine))
    columns = [int_column, varchar_column]
    data = [
        {"col_int": row // 2, "col_varchar": f"hi{row // 3}"}
        for row in range(12345)
    ]
    _handle_table(engine, metadata, table_name, columns, data)
    return TEST_DB_NAME, SCHEMA, table_name
729+
730+
719731
@pytest.fixture(scope="module")
720732
def nested_table(engine, metadata):
721733
table_name = "nested_table"

tests/integration/test_integration.py

+66
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
filternull_element_or_tuple_all,
2020
filternull_element_or_tuple_any,
2121
filternull_never,
22+
output_processor_limit,
2223
output_processor_sort,
2324
)
2425

@@ -974,6 +975,71 @@ def test_uniques_subset_within_complex_with_outputcheck(engine, unique_table1, d
974975
), test_result.failure_message
975976

976977

978+
@pytest.mark.parametrize(
979+
"data",
980+
[
981+
(
982+
negation,
983+
["col_int", "col_varchar"],
984+
[(0, "hi0"), (1, "hi0")],
985+
0,
986+
filternull_element_or_tuple_any,
987+
True,
988+
[output_processor_sort, output_processor_limit],
989+
None,
990+
None,
991+
"column(s) 'col_int', 'col_varchar' has a fraction of 0.9997569866342649 > 0 DISTINCT values (8228 / 8230) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (30, 'hi20'), (32, 'hi21'), (33, 'hi22'), (35, 'hi23'), (36, 'hi24'), (38, 'hi25'), (39, 'hi26'), (41, 'hi27'), (42, 'hi28'), (44, 'hi29'), (45, 'hi30'), (47, 'hi31'), (48, 'hi32'), (50, 'hi33'), (51, 'hi34'), (53, 'hi35'), (54, 'hi36'), (56, 'hi37'), (57, 'hi38'), (59, 'hi39'), (60, 'hi40'), (62, 'hi41'), (63, 'hi42'), (65, 'hi43'), (66, 'hi44'), (68, 'hi45'), (69, 'hi46'), (71, 'hi47'), (72, 'hi48'), (74, 'hi49'), (75, 'hi50'), (77, 'hi51'), (78, 'hi52'), (80, 'hi53'), (81, 'hi54'), (83, 'hi55'), (84, 'hi56'), (86, 'hi57'), (87, 'hi58'), (89, 'hi59'), (90, 'hi60'), (92, 'hi61'), (93, 'hi62'), (95, 'hi63'), (96, 'hi64'), (98, 'hi65'), (99, 'hi66'), (101, 'hi67'), (102, 'hi68'), (104, 'hi69'), (105, 'hi70'), (107, 'hi71'), (108, 'hi72'), (110, 'hi73'), (111, 'hi74'), (113, 'hi75'), (114, 'hi76'), (116, 'hi77'), (117, 'hi78'), (119, 'hi79'), (120, 'hi80'), (122, 'hi81'), (123, 'hi82'), (125, 'hi83'), (126, 'hi84'), (128, 'hi85'), (129, 'hi86'), (131, 'hi87'), (132, 'hi88'), (134, 'hi89'), (135, 'hi90'), (137, 'hi91'), (138, 'hi92'), (140, 'hi93'), (141, 'hi94'), (143, 'hi95'), (144, 'hi96'), (146, 'hi97'), (147, 'hi98'), (149, 'hi99'), (150, 'hi100'), '<SHORTENED OUTPUT, displaying the first 100 / 8228 elements above>']' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, '<SHORTENED 
OUTPUT, displaying the first 100 / 8228 counts above>'].",
992+
),
993+
(
994+
negation,
995+
["col_int", "col_varchar"],
996+
[(0, "hi0"), (1, "hi0")],
997+
0,
998+
filternull_element_or_tuple_any,
999+
True,
1000+
[output_processor_sort, functools.partial(output_processor_limit, limit=5)],
1001+
None,
1002+
None,
1003+
"column(s) 'col_int', 'col_varchar' has a fraction of 0.9997569866342649 > 0 DISTINCT values (8228 / 8230) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), '<SHORTENED OUTPUT, displaying the first 5 / 8228 elements above>']' with counts [2, 2, 2, 2, 2, '<SHORTENED OUTPUT, displaying the first 5 / 8228 counts above>'].",
1004+
),
1005+
],
1006+
)
1007+
def test_uniques_subset_within_complex_with_outputcheck_extralong(
    engine, unique_table_extralong, data
):
    """
    Check outcome and failure-message suffix of a uniques-subset constraint
    on the extra-long table when output processors are applied.

    ``data`` is a 10-tuple: the operation applied to the outcome, the
    constraint arguments, and the expected suffix of the failure message.
    """
    (
        operation,
        columns,
        uniques,
        max_relative_violations,
        filter_func,
        compare_distinct,
        output_processors,
        function,
        condition,
        failure_message_suffix,
    ) = data

    # Keyword arguments are collected first; positional ones are passed directly.
    constraint_kwargs = {
        "filter_func": filter_func,
        "compare_distinct": compare_distinct,
        "output_processors": output_processors,
        "condition": condition,
        "map_func": function,
    }
    requirement = requirements.WithinRequirement.from_table(*unique_table_extralong)
    requirement.add_uniques_subset_constraint(
        columns, uniques, max_relative_violations, **constraint_kwargs
    )

    result = requirement[0].test(engine)
    print(result)
    print(result.failure_message)
    assert operation(result.outcome), result.failure_message
    assert result.failure_message.endswith(
        failure_message_suffix
    ), result.failure_message
1041+
1042+
9771043
@pytest.mark.parametrize(
9781044
"data",
9791045
[

0 commit comments

Comments
 (0)