Skip to content

Commit 4501ace

Browse files
authored
Merge branch 'main' into pixi-update
2 parents fcc2d1b + a0ec606 commit 4501ace

File tree

5 files changed

+83
-36
lines changed

5 files changed

+83
-36
lines changed

CHANGELOG.rst

+9
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,15 @@
77
Changelog
88
=========
99

10+
1.9.2 - 2024.09.05
11+
------------------
12+
13+
**Bug fixes**
14+
15+
- Fix a bug in :class:`datajudge.constraints.numeric.NumericPercentile` which
16+
could lead to off-by-one errors in retrieving a percentile value.
17+
18+
1019
1.9.0 - 2024.06.25
1120
------------------
1221

src/datajudge/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@
1313
"WithinRequirement",
1414
]
1515

16-
__version__ = "1.9.1"
16+
__version__ = "1.9.2"

src/datajudge/db_access.py

+32-21
Original file line numberDiff line numberDiff line change
@@ -887,31 +887,42 @@ def get_percentile(engine, ref, percentage):
887887
row_count = "dj_row_count"
888888
row_num = "dj_row_num"
889889
column_name = ref.get_column(engine)
890-
column = ref.get_selection(engine).subquery().c[column_name]
891-
subquery = (
892-
sa.select(
893-
column,
894-
sa.func.row_number().over(order_by=column).label(row_num),
895-
sa.func.count().over(partition_by=None).label(row_count),
896-
)
897-
.where(column.is_not(None))
898-
.subquery()
890+
base_selection = ref.get_selection(engine)
891+
column = base_selection.subquery().c[column_name]
892+
893+
counting_selection = sa.select(
894+
column,
895+
sa.func.row_number().over(order_by=column).label(row_num),
896+
sa.func.count().over(partition_by=None).label(row_count),
897+
).where(column.is_not(None))
898+
counting_subquery = counting_selection.subquery()
899+
900+
inferior_selection = sa.select(*counting_subquery.columns).where(
901+
counting_subquery.c[row_num] * 100.0 / counting_subquery.c[row_count]
902+
< percentage
899903
)
900-
901-
constrained_selection = (
902-
sa.select(*subquery.columns)
903-
.where(subquery.c[row_num] * 100.0 / subquery.c[row_count] <= percentage)
904-
.subquery()
904+
inferior_subquery = inferior_selection.subquery()
905+
906+
argmin_selection = sa.select(
907+
sa.case(
908+
# Case 1: We we pick the next value.
909+
(
910+
sa.func.count(inferior_subquery.c[row_num]) > 0,
911+
sa.func.max(inferior_subquery.c[row_num]) + 1,
912+
),
913+
# Case 2: We pick the first value since the inferior subquery
914+
# is empty.
915+
(sa.func.count(inferior_subquery.c[row_num]) == 0, 1),
916+
# Case 3: We received a reference without numerical values.
917+
else_=None,
918+
)
905919
)
906920

907-
max_selection = sa.select(
908-
sa.func.max(constrained_selection.c[row_num])
909-
).scalar_subquery()
910-
selection = sa.select(constrained_selection.c[column_name]).where(
911-
constrained_selection.c[row_num] == max_selection
921+
percentile_selection = sa.select(counting_subquery.c[column_name]).where(
922+
counting_subquery.c[row_num] == argmin_selection
912923
)
913-
result = engine.connect().execute(selection).scalar()
914-
return result, [selection]
924+
result = engine.connect().execute(percentile_selection).scalar()
925+
return result, [percentile_selection]
915926

916927

917928
def get_min_length(engine, ref):

src/datajudge/requirements.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,7 @@ def add_numeric_percentile_constraint(
662662
):
663663
"""Assert that the ``percentage``-th percentile is approximately ``expected_percentile``.
664664
665-
The percentile is defined as the value present in ``column`` for which
665+
The percentile is defined as the smallest value present in ``column`` for which
666666
``percentage`` % of the values in ``column`` are less or equal. ``NULL`` values
667667
are ignored.
668668
@@ -1864,7 +1864,7 @@ def add_numeric_percentile_constraint(
18641864
):
18651865
"""Assert that the ``percentage``-th percentile is approximately equal.
18661866
1867-
The percentile is defined as the value present in ``column1`` / ``column2``
1867+
The percentile is defined as the smallest value present in ``column1`` / ``column2``
18681868
for which ``percentage`` % of the values in ``column1`` / ``column2`` are
18691869
less or equal. ``NULL`` values are ignored.
18701870

tests/integration/test_integration.py

+39-12
Original file line numberDiff line numberDiff line change
@@ -1740,14 +1740,27 @@ def test_numeric_mean_between(engine, int_table1, int_table2, data):
17401740
@pytest.mark.parametrize(
17411741
"data",
17421742
[
1743-
(identity, 20, 3, 0, 0, None),
1744-
(identity, 20, 2.8, 0.21, None, None),
1745-
(identity, 20, 2.8, None, 0.1, None),
1746-
(negation, 20, 2.8, 0, None, None),
1747-
(negation, 20, 2.8, None, 0, None),
1748-
(negation, 20, 2.8, 0, 0, None),
1743+
# The data at hand in int_table1 are [1, 2, ..., 19].
1744+
# According to the definition of percentile in our doc string,
1745+
# the 20th percentile should be the smallest value in our data
1746+
# for which 20% of the data is less or equal that value.
1747+
# For the value 3, we have that |{1,2,3}|/19 ~ .16 of the values
1748+
# are less or equal.
1749+
# For the value 4, we have that |{1,2,3,4}|/19 ~ .21 of the values
1750+
# are less or equal.
1751+
# Hence the expected 20th percentile should be 4.
1752+
(identity, 20, 4, 0, 0, None),
1753+
(identity, 20, 3.8, 0.21, None, None),
1754+
(identity, 20, 3.8, None, 0.1, None),
1755+
(negation, 20, 3.8, 0, None, None),
1756+
(negation, 20, 3.8, None, 0, None),
1757+
(negation, 20, 3.8, 0, 0, None),
17491758
(negation, 20, 3.2, 0, 0, None),
1750-
(identity, 20, 2, 0, 0, Condition(raw_string="col_int <= 11")),
1759+
# The expected percentile changes when conditioning.
1760+
# |{1,2}|/11 ~ .18
1761+
# |{1,2,3}|/11 ~ .27
1762+
(identity, 20, 3, 0, 0, Condition(raw_string="col_int <= 11")),
1763+
(negation, 20, 2.8, 0, 0, Condition(raw_string="col_int <= 11")),
17511764
],
17521765
)
17531766
def test_numeric_percentile_within(engine, int_table1, data):
@@ -1775,7 +1788,17 @@ def test_numeric_percentile_within(engine, int_table1, data):
17751788
@pytest.mark.parametrize(
17761789
"data",
17771790
[
1778-
# With the following condition, we expect the values [0, 0, 1, 1, None].
1791+
# With the following condition, we expect the following values
1792+
# to be present in unique_table1's column col_int:
1793+
# [0, 0, 1, 1, None]
1794+
(
1795+
identity,
1796+
24,
1797+
0,
1798+
0,
1799+
None,
1800+
Condition(raw_string="col_int <= 1 or col_int IS NULL"),
1801+
),
17791802
(
17801803
identity,
17811804
25,
@@ -1787,7 +1810,7 @@ def test_numeric_percentile_within(engine, int_table1, data):
17871810
(
17881811
identity,
17891812
74,
1790-
0,
1813+
1,
17911814
0,
17921815
None,
17931816
Condition(raw_string="col_int <= 1 or col_int IS NULL"),
@@ -1835,12 +1858,16 @@ def test_numeric_percentile_within_null(engine, unique_table1, data):
18351858
@pytest.mark.parametrize(
18361859
"data",
18371860
[
1861+
# The 20th percentile of int_table1 is 4.
1862+
# The 20th percentile of int_table2 is 5.
1863+
# Hence, the absolute deviation is 1 and
1864+
# the relative deviation is 1/5 = .2.
18381865
(identity, 20, 1, None, None, None),
1839-
(identity, 20, None, 0.25, None, None),
1840-
(identity, 20, 1, 0.25, None, None),
1866+
(identity, 20, None, 0.20, None, None),
1867+
(identity, 20, 1, 0.20, None, None),
18411868
(negation, 20, 0, 0, None, None),
18421869
(negation, 20, 0.9, None, None, None),
1843-
(negation, 20, None, 0.20, None, None),
1870+
(negation, 20, None, 0.19, None, None),
18441871
(identity, 20, 0, 0, Condition(raw_string="col_int >=2"), None),
18451872
],
18461873
)

0 commit comments

Comments
 (0)