From d3b91ad81b788a3c8ce16e121e54f66f97de1c6b Mon Sep 17 00:00:00 2001 From: Armaan Date: Wed, 5 Feb 2025 11:42:00 -0500 Subject: [PATCH 1/3] test refactor for floating point failures + 1 mypy fix --- .pre-commit-config.yaml | 2 +- dataprofiler/data_readers/parquet_data.py | 2 +- .../test_categorical_column_profile.py | 40 ++++++++++++++++++- .../tests/profilers/test_profile_builder.py | 15 ++++++- 4 files changed, 54 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a2047e1b..092cc5a4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: # Flake8: complexity and style checking # https://flake8.pycqa.org/en/latest/user/using-hooks.html - repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 5.0.4 hooks: - id: flake8 additional_dependencies: [flake8-docstrings] diff --git a/dataprofiler/data_readers/parquet_data.py b/dataprofiler/data_readers/parquet_data.py index ee625316..0af1b563 100644 --- a/dataprofiler/data_readers/parquet_data.py +++ b/dataprofiler/data_readers/parquet_data.py @@ -68,7 +68,7 @@ def __init__( self._load_data(data) @property - def file_encoding(self) -> None: + def file_encoding(self) -> Optional[str]: """Set file encoding to None since not detected for avro.""" return None diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 55d2ea68..16a223f4 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -1,4 +1,5 @@ import json +import math import os import unittest from collections import defaultdict @@ -731,7 +732,44 @@ def test_categorical_diff(self): }, } actual_diff = profile.diff(profile2) - self.assertDictEqual(expected_diff, actual_diff) + + assert expected_diff["categorical"] == actual_diff["categorical"] + assert ( + expected_diff["statistics"]["unique_count"] + 
== actual_diff["statistics"]["unique_count"] + ) + assert math.isclose( + expected_diff["statistics"]["unique_ratio"], + actual_diff["statistics"]["unique_ratio"], + ) + assert ( + expected_diff["statistics"]["categories"] + == actual_diff["statistics"]["categories"] + ) + assert math.isclose( + expected_diff["statistics"]["gini_impurity"], + actual_diff["statistics"]["gini_impurity"], + ) + assert math.isclose( + expected_diff["statistics"]["unalikeability"], + actual_diff["statistics"]["unalikeability"], + ) + assert ( + expected_diff["statistics"]["categorical_count"] + == actual_diff["statistics"]["categorical_count"] + ) + assert math.isclose( + expected_diff["statistics"]["chi2-test"]["chi2-statistic"], + actual_diff["statistics"]["chi2-test"]["chi2-statistic"], + ) + assert ( + expected_diff["statistics"]["chi2-test"]["deg_of_free"] + == actual_diff["statistics"]["chi2-test"]["deg_of_free"] + ) + assert math.isclose( + expected_diff["statistics"]["chi2-test"]["p-value"], + actual_diff["statistics"]["chi2-test"]["p-value"], + ) # Test with one categorical column matching df_not_categorical = pd.Series( diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index f9bbf14a..02521194 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -1,5 +1,6 @@ import json import logging +import math import os import random import re @@ -2162,8 +2163,18 @@ def test_diff_categorical_chi2_test(self, *mocks): "deg_of_free": 2, "p-value": 0.3099238764710244, } - self.assertDictEqual( - expected_chi2_test_dict, diff["data_stats"][0]["statistics"]["chi2-test"] + actual_chi2_test_dict = diff["data_stats"][0]["statistics"]["chi2-test"] + + assert math.isclose( + expected_chi2_test_dict["chi2-statistic"], + actual_chi2_test_dict["chi2-statistic"], + ) + assert ( + expected_chi2_test_dict["deg_of_free"] + == actual_chi2_test_dict["deg_of_free"] + 
) + assert math.isclose( + expected_chi2_test_dict["p-value"], actual_chi2_test_dict["p-value"] ) @mock.patch( From 18b55e800cc3842fcfabd52d70e65bb96dbd2fd7 Mon Sep 17 00:00:00 2001 From: Armaan Date: Wed, 5 Feb 2025 16:19:47 -0500 Subject: [PATCH 2/3] adding inline ignores for mypy and TODOs for context --- dataprofiler/profilers/float_column_profile.py | 5 ++++- dataprofiler/profilers/int_column_profile.py | 5 ++++- dataprofiler/profilers/numerical_column_stats.py | 5 ++++- dataprofiler/profilers/text_column_profile.py | 5 ++++- setup.cfg | 1 - 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index 3d6ede32..741a7ddc 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -194,8 +194,11 @@ def load_from_dict(cls, data, config: dict | None = None): return profile + # TODO: refactor BaseColumnProfiler.profile to not be an @property + # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to + # NumericStatsMixin.profile() results in a breaking change - ignoring [override] @property - def profile(self) -> dict: + def profile(self) -> dict: #type: ignore[override] """ Return the profile of the column. 
diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index fe38eec6..f74713e0 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -92,8 +92,11 @@ def load_from_dict(cls, data, config: dict | None = None): profile._reformat_numeric_stats_types_on_serialized_profiles() return profile + # TODO: refactor BaseColumnProfiler.profile to not be an @property + # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to + # NumericStatsMixin.profile() results in a breaking change - ignoring [override] @property - def profile(self) -> dict: + def profile(self) -> dict: #type: ignore[override] """ Return the profile of the column. diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 202a704f..202695ee 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -365,7 +365,10 @@ def _add_helper( other1._median_abs_dev_is_enabled and other2._median_abs_dev_is_enabled ) - def profile(self) -> dict: + # TODO: refactor BaseColumnProfiler.profile to not be an @property + # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to + # NumericStatsMixin.profile() results in a breaking change - ignoring [override] + def profile(self) -> dict: # type: ignore[override] """ Return profile of the column. 
diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index eb79643f..09376d69 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -84,8 +84,11 @@ def report(self, remove_disabled_flag: bool = False) -> dict: return profile + # TODO: refactor BaseColumnProfiler.profile to not be an @property + # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to + # NumericStatsMixin.profile() results in a breaking change - ignoring [override] @property - def profile(self) -> dict: + def profile(self) -> dict: #type: ignore[override] """ Return the profile of the column. diff --git a/setup.cfg b/setup.cfg index 6ec3e8a6..6c2be03b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,6 @@ warn_unused_configs = True ignore_missing_imports = True no_implicit_optional = False exclude = ^dataprofiler/tests/|^resources/|^examples|venv*/ -disable_error_code = override [check-manifest] ignore-default-rules=True From 6d5c841c1554952e0318a11ad1048f8dac73561b Mon Sep 17 00:00:00 2001 From: Armaan Date: Wed, 5 Feb 2025 16:25:06 -0500 Subject: [PATCH 3/3] ran black for changes in previous commit --- dataprofiler/profilers/float_column_profile.py | 2 +- dataprofiler/profilers/int_column_profile.py | 2 +- dataprofiler/profilers/numerical_column_stats.py | 2 +- dataprofiler/profilers/text_column_profile.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index 741a7ddc..f16b62c9 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -198,7 +198,7 @@ def load_from_dict(cls, data, config: dict | None = None): # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to # NumericStatsMixin.profile() results in a breaking change - ignoring [override] @property - def 
profile(self) -> dict: #type: ignore[override] + def profile(self) -> dict: # type: ignore[override] """ Return the profile of the column. diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index f74713e0..3b2fe267 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -96,7 +96,7 @@ def load_from_dict(cls, data, config: dict | None = None): # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to # NumericStatsMixin.profile() results in a breaking change - ignoring [override] @property - def profile(self) -> dict: #type: ignore[override] + def profile(self) -> dict: # type: ignore[override] """ Return the profile of the column. diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 202695ee..c9c8ce7b 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -368,7 +368,7 @@ def _add_helper( # TODO: refactor BaseColumnProfiler.profile to not be an @property # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to # NumericStatsMixin.profile() results in a breaking change - ignoring [override] - def profile(self) -> dict: # type: ignore[override] + def profile(self) -> dict: # type: ignore[override] """ Return profile of the column. 
diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index 09376d69..c7d47757 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -88,7 +88,7 @@ def report(self, remove_disabled_flag: bool = False) -> dict: # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to # NumericStatsMixin.profile() results in a breaking change - ignoring [override] @property - def profile(self) -> dict: #type: ignore[override] + def profile(self) -> dict: # type: ignore[override] """ Return the profile of the column.