Skip to content

Commit

Permalink
Implement a Constraint limiting the fraction of NULL values in a …
Browse files Browse the repository at this point in the history
…column. (#113)

* Draft missingness constraint.

* Add error handling.

* Fix failure_message.

* Add docstrings.

* Rename.

* Remove NullAbsence constraint.

* Add changelog entry.

* Fix docstring.
  • Loading branch information
kklein authored Mar 14, 2023
1 parent 7ebd7c2 commit 1665e65
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 22 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
Changelog
=========

1.5.0 - 2022.03.XX

1.5.0 - 2023.03.14
------------------

**New features**

- Implement :meth:`datajudge.BetweenRequirement.add_max_null_fraction_constraint` and
:meth:`datajudge.WithinRequirement.add_max_null_fraction_constraint`.
- Implement :meth:`datajudge.BetweenRequirement.add_numeric_percentile_constraint` and
:meth:`datajudge.WithinRequirement.add_numeric_percentile_constraint`.

Expand Down
47 changes: 35 additions & 12 deletions src/datajudge/constraints/miscs.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,15 +119,38 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
return TestResult.failure(assertion_text)


class NullAbsence(Constraint):
    """Constraint that passes only if ``ref`` contains no ``NULL`` values."""

    def __init__(self, ref: DataReference, name: str = None):
        """Create the constraint for ``ref``, optionally labelled ``name``."""
        # This is arguably hacky. Passing this pointless string ensures that
        # None-checks fail.
        # NOTE(review): the None-checks presumably live in ``Constraint.__init__``,
        # which is not visible here -- confirm.
        super().__init__(ref, ref_value="NoNull", name=name)

    def test(self, engine: sa.engine.Engine) -> TestResult:
        """Query the database and succeed iff no ``NULL`` value was found.

        The selections used for the query are stored on
        ``self.factual_selections``.
        """
        assertion_message = f"{self.ref.get_string()} contains NULLS."
        query_result, selections = db_access.contains_null(engine, self.ref)
        self.factual_selections = selections
        # ``contains_null`` reports whether NULLs exist; the constraint holds
        # exactly when they do not.
        result = not query_result
        return TestResult(result, assertion_message)
class MaxNullFraction(Constraint):
    """Constraint limiting the fraction of ``NULL`` values of a column.

    The factual fraction of ``NULL`` values is compared against a target
    fraction scaled by ``1 + max_relative_deviation``. The target is passed
    on to ``Constraint`` either as the constant ``max_null_fraction`` or via
    a second data reference ``ref2``.
    """

    def __init__(
        self,
        ref: DataReference,
        *,
        ref2: Optional[DataReference] = None,
        max_null_fraction: Optional[float] = None,
        max_relative_deviation: float = 0,
        name: Optional[str] = None,
    ):
        """Validate the parameters and set up the constraint.

        Raises:
            ValueError: If ``max_null_fraction`` is given and lies outside of
                [0, 1], or if ``max_relative_deviation`` is negative.
        """
        super().__init__(ref, ref2=ref2, ref_value=max_null_fraction, name=name)
        if max_null_fraction is not None and not (0 <= max_null_fraction <= 1):
            raise ValueError(
                f"max_null_fraction was expected to lie within [0, 1] but is "
                f"{max_null_fraction}."
            )
        # Zero is a legitimate value (and the default), hence the requirement
        # is non-negativity rather than positivity.
        if max_relative_deviation < 0:
            raise ValueError(
                f"max_relative_deviation was expected to be non-negative but is "
                f"{max_relative_deviation}."
            )
        self.max_relative_deviation = max_relative_deviation

    def retrieve(self, engine: sa.engine.Engine, ref: DataReference):
        """Return the result of ``db_access.get_missing_fraction`` for ``ref``."""
        return db_access.get_missing_fraction(engine=engine, ref=ref)

    def compare(
        self, missing_fraction_factual: float, missing_fracion_target: float
    ) -> Tuple[bool, Optional[str]]:
        """Check that the factual fraction does not exceed the allowed threshold.

        The threshold is ``missing_fracion_target * (1 + max_relative_deviation)``.
        Returns the outcome together with a message describing a violation.
        """
        threshold = missing_fracion_target * (1 + self.max_relative_deviation)
        result = missing_fraction_factual <= threshold
        assertion_text = (
            f"{missing_fraction_factual} of {self.ref.get_string()} values are NULL "
            f"while only {self.target_prefix}{threshold} were allowed to be NULL."
        )
        return result, assertion_text
22 changes: 14 additions & 8 deletions src/datajudge/db_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,16 +889,22 @@ def get_unique_count_union(engine, ref, ref2):
return result, [selection]


def contains_null(engine, ref):
    """Tell whether the column referenced by ``ref`` holds any ``NULL`` value.

    Returns a tuple of the boolean answer and the list of selections that
    were executed to obtain it.
    """
    selection = ref.get_selection(engine)
    subquery = selection.distinct().alias()
    selection = (
        sa.select([sa.func.count()])
        .select_from(subquery)
        .where(subquery.c[ref.get_column(engine)].is_(None))
    )
    # Use a context manager so that the connection is released again.
    with engine.connect() as connection:
        n_rows = connection.execute(selection).scalar()
    return n_rows > 0, [selection]


def get_missing_fraction(engine, ref):
    """Compute the fraction of rows whose referenced column is ``NULL``.

    Returns a tuple of the fraction and the list of the two selections
    (total row count, ``NULL`` row count) that were executed. If the
    selection yields no rows at all, the fraction is reported as ``0.0``.
    """
    selection = ref.get_selection(engine).subquery()
    n_rows_total_selection = sa.select([sa.func.count()]).select_from(selection)
    n_rows_missing_selection = (
        sa.select([sa.func.count()])
        .select_from(selection)
        .where(selection.c[ref.get_column(engine)].is_(None))
    )
    selections = [n_rows_total_selection, n_rows_missing_selection]
    with engine.connect() as connection:
        n_rows_total = connection.execute(n_rows_total_selection).scalar()
        n_rows_missing = connection.execute(n_rows_missing_selection).scalar()

    # An empty selection cannot contain NULL values. Report a fraction of 0
    # rather than failing with a ZeroDivisionError.
    if not n_rows_total:
        return 0.0, selections
    return n_rows_missing / n_rows_total, selections


def get_column_names(engine, ref):
Expand Down
47 changes: 46 additions & 1 deletion src/datajudge/requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,27 @@ def add_null_absence_constraint(
self, column: str, condition: Condition = None, name: str = None
):
ref = DataReference(self.data_source, [column], condition)
self._constraints.append(miscs_constraints.NullAbsence(ref, name=name))
self._constraints.append(
miscs_constraints.MaxNullFraction(ref, max_null_fraction=0, name=name)
)

def add_max_null_fraction_constraint(
    self,
    column: str,
    max_null_fraction: float,
    condition: Condition = None,
    name: str = None,
):
    """Assert that at most a given fraction of ``column``'s values are ``NULL``.

    ``max_null_fraction`` is the largest admissible fraction and is expected
    to lie within [0, 1]. ``condition`` optionally restricts the rows taken
    into account and ``name`` optionally labels the constraint.
    """
    constraint = miscs_constraints.MaxNullFraction(
        DataReference(self.data_source, [column], condition),
        max_null_fraction=max_null_fraction,
        name=name,
    )
    self._constraints.append(constraint)

def add_n_rows_equality_constraint(
self, n_rows: int, condition: Condition = None, name: str = None
Expand Down Expand Up @@ -1105,6 +1125,31 @@ def add_n_uniques_max_loss_constraint(
)
)

def add_max_null_fraction_constraint(
    self,
    column1: str,
    column2: str,
    max_relative_deviation: float,
    condition1: Condition = None,
    condition2: Condition = None,
    name: str = None,
):
    """Assert that the fraction of ``NULL`` values of one is at most that of the other.

    Given that ``column2``'s underlying data has a fraction ``q`` of ``NULL`` values, the
    ``max_relative_deviation`` parameter allows ``column1``'s underlying data to have a
    fraction ``(1 + max_relative_deviation) * q`` of ``NULL`` values.

    ``condition1`` and ``condition2`` optionally restrict the rows taken into
    account for the respective column; ``name`` optionally labels the constraint.
    """
    ref = DataReference(self.data_source, [column1], condition1)
    ref2 = DataReference(self.data_source2, [column2], condition2)
    self._constraints.append(
        miscs_constraints.MaxNullFraction(
            ref,
            ref2=ref2,
            max_relative_deviation=max_relative_deviation,
            # Forward the user-supplied name; it was previously dropped.
            name=name,
        )
    )

def add_numeric_min_constraint(
self,
column1: str,
Expand Down
41 changes: 41 additions & 0 deletions tests/integration/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -1842,6 +1842,47 @@ def test_null_absence_within(engine, get_fixture, data):
assert operation(req[0].test(engine).outcome)


@pytest.mark.parametrize(
    "data",
    [
        (identity, 2 / 62),
        (negation, 2 / 63),
    ],
)
def test_max_null_fraction_within(engine, unique_table1, data):
    """Check the within-table NULL fraction constraint on ``col_int``.

    Each case pairs a bound with the operation applied to the test outcome
    before asserting it.
    NOTE(review): ``identity`` / ``negation`` presumably mark expected success /
    failure, i.e. a bound of 2/62 holds while 2/63 is violated -- confirm.
    """
    operation, max_null_fraction = data
    requirement = requirements.WithinRequirement.from_table(*unique_table1)
    requirement.add_max_null_fraction_constraint(
        column="col_int",
        max_null_fraction=max_null_fraction,
    )
    outcome = requirement[0].test(engine)
    assert operation(outcome.outcome), outcome.failure_message


@pytest.mark.parametrize(
    "data",
    [
        (identity, "col_int", "col_int", 0),
        (identity, "col_varchar", "col_int", 0),
        (identity, "col_int", "col_varchar", 1),
        (negation, "col_int", "col_varchar", 0.99),
    ],
)
def test_max_null_fraction_between(engine, unique_table1, data):
    """Check the between-table NULL fraction constraint.

    Two columns of the same table are compared against each other; each case
    lists the operation applied to the test outcome, both column names and the
    tolerated relative deviation.
    """
    operation, first_column, second_column, deviation = data
    requirement = requirements.BetweenRequirement.from_tables(
        *unique_table1,
        *unique_table1,
    )
    requirement.add_max_null_fraction_constraint(
        column1=first_column,
        column2=second_column,
        max_relative_deviation=deviation,
    )
    outcome = requirement[0].test(engine)
    assert operation(outcome.outcome), outcome.failure_message


@pytest.mark.parametrize(
"data",
[
Expand Down

0 comments on commit 1665e65

Please sign in to comment.