@@ -1740,14 +1740,27 @@ def test_numeric_mean_between(engine, int_table1, int_table2, data):
1740
1740
@pytest .mark .parametrize (
1741
1741
"data" ,
1742
1742
[
1743
- (identity , 20 , 3 , 0 , 0 , None ),
1744
- (identity , 20 , 2.8 , 0.21 , None , None ),
1745
- (identity , 20 , 2.8 , None , 0.1 , None ),
1746
- (negation , 20 , 2.8 , 0 , None , None ),
1747
- (negation , 20 , 2.8 , None , 0 , None ),
1748
- (negation , 20 , 2.8 , 0 , 0 , None ),
1743
+ # The data at hand in int_table1 are [1, 2, ..., 19].
1744
+ # According to the definition of percentile in our docstring,
1745
+ # the 20th percentile should be the smallest value in our data
1746
+ # for which at least 20% of the data is less than or equal to that value.
1747
+ # For the value 3, we have that |{1,2,3}|/19 ~ .16 of the values
1748
+ # are less than or equal to 3.
1749
+ # For the value 4, we have that |{1,2,3,4}|/19 ~ .21 of the values
1750
+ # are less than or equal to 4.
1751
+ # Hence the expected 20th percentile should be 4.
1752
+ (identity , 20 , 4 , 0 , 0 , None ),
1753
+ (identity , 20 , 3.8 , 0.21 , None , None ),
1754
+ (identity , 20 , 3.8 , None , 0.1 , None ),
1755
+ (negation , 20 , 3.8 , 0 , None , None ),
1756
+ (negation , 20 , 3.8 , None , 0 , None ),
1757
+ (negation , 20 , 3.8 , 0 , 0 , None ),
1749
1758
(negation , 20 , 3.2 , 0 , 0 , None ),
1750
- (identity , 20 , 2 , 0 , 0 , Condition (raw_string = "col_int <= 11" )),
1759
+ # The expected percentile changes when conditioning.
1760
+ # |{1,2}|/11 ~ .18
1761
+ # |{1,2,3}|/11 ~ .27
1762
+ (identity , 20 , 3 , 0 , 0 , Condition (raw_string = "col_int <= 11" )),
1763
+ (negation , 20 , 2.8 , 0 , 0 , Condition (raw_string = "col_int <= 11" )),
1751
1764
],
1752
1765
)
1753
1766
def test_numeric_percentile_within (engine , int_table1 , data ):
@@ -1775,7 +1788,17 @@ def test_numeric_percentile_within(engine, int_table1, data):
1775
1788
@pytest .mark .parametrize (
1776
1789
"data" ,
1777
1790
[
1778
- # With the following condition, we expect the values [0, 0, 1, 1, None].
1791
+ # With the following condition, we expect the following values
1792
+ # to be present in unique_table1's column col_int:
1793
+ # [0, 0, 1, 1, None]
1794
+ (
1795
+ identity ,
1796
+ 24 ,
1797
+ 0 ,
1798
+ 0 ,
1799
+ None ,
1800
+ Condition (raw_string = "col_int <= 1 or col_int IS NULL" ),
1801
+ ),
1779
1802
(
1780
1803
identity ,
1781
1804
25 ,
@@ -1787,7 +1810,7 @@ def test_numeric_percentile_within(engine, int_table1, data):
1787
1810
(
1788
1811
identity ,
1789
1812
74 ,
1790
- 0 ,
1813
+ 1 ,
1791
1814
0 ,
1792
1815
None ,
1793
1816
Condition (raw_string = "col_int <= 1 or col_int IS NULL" ),
@@ -1835,12 +1858,16 @@ def test_numeric_percentile_within_null(engine, unique_table1, data):
1835
1858
@pytest .mark .parametrize (
1836
1859
"data" ,
1837
1860
[
1861
+ # The 20th percentile of int_table1 is 4.
1862
+ # The 20th percentile of int_table2 is 5.
1863
+ # Hence, the absolute deviation is 1 and
1864
+ # the relative deviation is 1/5 = .2.
1838
1865
(identity , 20 , 1 , None , None , None ),
1839
- (identity , 20 , None , 0.25 , None , None ),
1840
- (identity , 20 , 1 , 0.25 , None , None ),
1866
+ (identity , 20 , None , 0.20 , None , None ),
1867
+ (identity , 20 , 1 , 0.20 , None , None ),
1841
1868
(negation , 20 , 0 , 0 , None , None ),
1842
1869
(negation , 20 , 0.9 , None , None , None ),
1843
- (negation , 20 , None , 0.20 , None , None ),
1870
+ (negation , 20 , None , 0.19 , None , None ),
1844
1871
(identity , 20 , 0 , 0 , Condition (raw_string = "col_int >=2" ), None ),
1845
1872
],
1846
1873
)
0 commit comments