Merge branch 'main' into pixi-update

kklein · web-flow · commit 4501ace44146 · 2024-09-05T13:42:22.000+02:00
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,15 @@
 Changelog
 =========
 
+1.9.2 - 2024.09.05
+------------------
+
+**Bug fixes**
+
+- Fix a bug in :class:`datajudge.constraints.numeric.NumericPercentile` which
+  could lead to off-by-one errors in retrieving a percentile value.
+
+
 1.9.0 - 2024.06.25
 ------------------
 
diff --git a/src/datajudge/__init__.py b/src/datajudge/__init__.py
@@ -13,4 +13,4 @@
     "WithinRequirement",
 ]
 
-__version__ = "1.9.1"
+__version__ = "1.9.2"
diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py
@@ -887,31 +887,42 @@ def get_percentile(engine, ref, percentage):
     row_count = "dj_row_count"
     row_num = "dj_row_num"
     column_name = ref.get_column(engine)
-    column = ref.get_selection(engine).subquery().c[column_name]
-    subquery = (
-        sa.select(
-            column,
-            sa.func.row_number().over(order_by=column).label(row_num),
-            sa.func.count().over(partition_by=None).label(row_count),
-        )
-        .where(column.is_not(None))
-        .subquery()
+    base_selection = ref.get_selection(engine)
+    column = base_selection.subquery().c[column_name]
+
+    counting_selection = sa.select(
+        column,
+        sa.func.row_number().over(order_by=column).label(row_num),
+        sa.func.count().over(partition_by=None).label(row_count),
+    ).where(column.is_not(None))
+    counting_subquery = counting_selection.subquery()
+
+    inferior_selection = sa.select(*counting_subquery.columns).where(
+        counting_subquery.c[row_num] * 100.0 / counting_subquery.c[row_count]
+        < percentage
     )
-
-    constrained_selection = (
-        sa.select(*subquery.columns)
-        .where(subquery.c[row_num] * 100.0 / subquery.c[row_count] <= percentage)
-        .subquery()
+    inferior_subquery = inferior_selection.subquery()
+
+    argmin_selection = sa.select(
+        sa.case(
+            # Case 1: We pick the next value.
+            (
+                sa.func.count(inferior_subquery.c[row_num]) > 0,
+                sa.func.max(inferior_subquery.c[row_num]) + 1,
+            ),
+            # Case 2: We pick the first value since the inferior subquery
+            # is empty.
+            (sa.func.count(inferior_subquery.c[row_num]) == 0, 1),
+            # Case 3: We received a reference without numerical values.
+            else_=None,
+        )
     )
 
-    max_selection = sa.select(
-        sa.func.max(constrained_selection.c[row_num])
-    ).scalar_subquery()
-    selection = sa.select(constrained_selection.c[column_name]).where(
-        constrained_selection.c[row_num] == max_selection
+    percentile_selection = sa.select(counting_subquery.c[column_name]).where(
+        counting_subquery.c[row_num] == argmin_selection
     )
-    result = engine.connect().execute(selection).scalar()
-    return result, [selection]
+    result = engine.connect().execute(percentile_selection).scalar()
+    return result, [percentile_selection]
 
 
 def get_min_length(engine, ref):
diff --git a/src/datajudge/requirements.py b/src/datajudge/requirements.py
@@ -662,7 +662,7 @@ def add_numeric_percentile_constraint(
     ):
         """Assert that the ``percentage``-th percentile is approximately ``expected_percentile``.
 
-        The percentile is defined as the value present in ``column`` for which
+        The percentile is defined as the smallest value present in ``column`` for which
         ``percentage`` % of the values in ``column`` are less or equal. ``NULL`` values
         are ignored.
 
@@ -1864,7 +1864,7 @@ def add_numeric_percentile_constraint(
     ):
         """Assert that the ``percentage``-th percentile is approximately equal.
 
-        The percentile is defined as the value present in ``column1`` / ``column2``
+        The percentile is defined as the smallest value present in ``column1`` / ``column2``
         for which ``percentage`` % of the values in ``column1`` / ``column2`` are
         less or equal. ``NULL`` values are ignored.
 
diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py
@@ -1740,14 +1740,27 @@ def test_numeric_mean_between(engine, int_table1, int_table2, data):
 @pytest.mark.parametrize(
     "data",
     [
-        (identity, 20, 3, 0, 0, None),
-        (identity, 20, 2.8, 0.21, None, None),
-        (identity, 20, 2.8, None, 0.1, None),
-        (negation, 20, 2.8, 0, None, None),
-        (negation, 20, 2.8, None, 0, None),
-        (negation, 20, 2.8, 0, 0, None),
+        # The data at hand in int_table1 are [1, 2, ..., 19].
+        # According to the definition of percentile in our doc string,
+        # the 20th percentile should be the smallest value in our data
+        # for which 20% of the data is less than or equal to that value.
+        # For the value 3, we have that |{1,2,3}|/19 ~ .16 of the values
+        # are less or equal.
+        # For the value 4, we have that |{1,2,3,4}|/19 ~ .21 of the values
+        # are less or equal.
+        # Hence the expected 20th percentile should be 4.
+        (identity, 20, 4, 0, 0, None),
+        (identity, 20, 3.8, 0.21, None, None),
+        (identity, 20, 3.8, None, 0.1, None),
+        (negation, 20, 3.8, 0, None, None),
+        (negation, 20, 3.8, None, 0, None),
+        (negation, 20, 3.8, 0, 0, None),
         (negation, 20, 3.2, 0, 0, None),
-        (identity, 20, 2, 0, 0, Condition(raw_string="col_int <= 11")),
+        # The expected percentile changes when conditioning.
+        # |{1,2}|/11 ~ .18
+        # |{1,2,3}|/11 ~ .27
+        (identity, 20, 3, 0, 0, Condition(raw_string="col_int <= 11")),
+        (negation, 20, 2.8, 0, 0, Condition(raw_string="col_int <= 11")),
     ],
 )
 def test_numeric_percentile_within(engine, int_table1, data):
@@ -1775,7 +1788,17 @@ def test_numeric_percentile_within(engine, int_table1, data):
 @pytest.mark.parametrize(
     "data",
     [
-        # With the following condition, we expect the values [0, 0, 1, 1, None].
+        # With the following condition, we expect the following values
+        # to be present in unique_table1's column col_int:
+        # [0, 0, 1, 1, None]
+        (
+            identity,
+            24,
+            0,
+            0,
+            None,
+            Condition(raw_string="col_int <= 1 or col_int IS NULL"),
+        ),
         (
             identity,
             25,
@@ -1787,7 +1810,7 @@ def test_numeric_percentile_within(engine, int_table1, data):
         (
             identity,
             74,
-            0,
+            1,
             0,
             None,
             Condition(raw_string="col_int <= 1 or col_int IS NULL"),
@@ -1835,12 +1858,16 @@ def test_numeric_percentile_within_null(engine, unique_table1, data):
 @pytest.mark.parametrize(
     "data",
     [
+        # The 20th percentile of int_table1 is 4.
+        # The 20th percentile of int_table2 is 5.
+        # Hence, the absolute deviation is 1 and
+        # the relative deviation is 1/5 = .2.
         (identity, 20, 1, None, None, None),
-        (identity, 20, None, 0.25, None, None),
-        (identity, 20, 1, 0.25, None, None),
+        (identity, 20, None, 0.20, None, None),
+        (identity, 20, 1, 0.20, None, None),
         (negation, 20, 0, 0, None, None),
         (negation, 20, 0.9, None, None, None),
-        (negation, 20, None, 0.20, None, None),
+        (negation, 20, None, 0.19, None, None),
         (identity, 20, 0, 0, Condition(raw_string="col_int >=2"), None),
     ],
 )

Original file line number	Diff line number	Diff line change
`@@ -13,4 +13,4 @@`
`13`	`13`	`"WithinRequirement",`
`14`	`14`	`]`
`15`	`15`
`16`		`-__version__ = "1.9.1"`
	`16`	`+__version__ = "1.9.2"`