From f10eecf3bfc76a97d6c1423032012bdccd7dcec5 Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Mon, 1 Apr 2024 14:32:05 -0500 Subject: [PATCH] update data labeler --- .../profilers/data_labeler_column_profile.py | 16 ++++--- .../test_data_labeler_column_profile.py | 42 ++++++++++--------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py index d9bfe1ee9..cfe427536 100644 --- a/dataprofiler/profilers/data_labeler_column_profile.py +++ b/dataprofiler/profilers/data_labeler_column_profile.py @@ -5,7 +5,9 @@ from typing import Dict, cast import numpy as np -from pandas import DataFrame, Series +import pandas as pd +import polars as pl +from polars import Series from ..labelers.base_data_labeler import BaseDataLabeler from ..labelers.data_labelers import DataLabeler @@ -394,7 +396,7 @@ def diff(self, other_profile: DataLabelerColumn, options: dict = None) -> dict: @BaseColumnProfiler._timeit(name="data_labeler_predict") def _update_predictions( self, - df_series: DataFrame, + df_series: Series, prev_dependent_properties: dict = None, subset_properties: dict = None, ) -> None: @@ -411,8 +413,9 @@ def _update_predictions( :type df_series: pandas.DataFrame :return: None """ + df_series_pd = df_series.to_pandas() predictions = self.data_labeler.predict( - df_series, predict_options=dict(show_confidences=True) + df_series_pd, predict_options=dict(show_confidences=True) ) # remove PAD from output (reserved zero index) if self.data_labeler.model.requires_zero_mapping: @@ -441,7 +444,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None: Update the column profile properties. 
:param df_series_clean: df series with nulls removed - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.Series :param profile: float profile dictionary :type profile: dict :return: None @@ -453,10 +456,13 @@ def update(self, df_series: Series) -> DataLabelerColumn: Update the column profile. :param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.Series :return: updated DataLabelerColumn :rtype: DataLabelerColumn """ + # TODO remove once profiler builder is updated + if type(df_series) == pd.Series: + df_series = pl.from_pandas(df_series) # type: ignore if len(df_series) == 0: return self diff --git a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py index 35f448aea..866bc972a 100644 --- a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py +++ b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py @@ -4,7 +4,7 @@ from unittest import mock import numpy as np -import pandas as pd +import polars as pl from dataprofiler.labelers import BaseDataLabeler from dataprofiler.profilers import profiler_utils @@ -21,6 +21,8 @@ spec=BaseDataLabeler, ) class TestDataLabelerColumnProfiler(unittest.TestCase): + maxDiff = None + @staticmethod def _setup_data_labeler_mock(mock_instance): mock_DataLabeler = mock_instance.return_value @@ -46,7 +48,7 @@ def mock_predict(data, *args, **kwargs): def test_base_case(self, mock_instance): self._setup_data_labeler_mock(mock_instance) - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = DataLabelerColumn(data.name) time_array = [float(i) for i in range(4, 0, -1)] @@ -74,7 +76,7 @@ def test_base_case(self, mock_instance): def test_update(self, mock_instance): self._setup_data_labeler_mock(mock_instance) - data = pd.Series(["1", "2", "3"]) + data = pl.Series(["1", "2", "3"]) profiler = DataLabelerColumn(data.name) 
profiler.update(data) @@ -93,7 +95,7 @@ def test_update_reserve_label_mapping(self, mock_instance): mock_DataLabeler.reverse_label_mapping = {0: "PAD", 1: "a", 2: "b"} mock_DataLabeler.model.num_labels = 3 - data = pd.Series(["1", "2", "3"]) + data = pl.Series(["1", "2", "3"]) profiler = DataLabelerColumn(data.name) profiler.update(data) @@ -111,7 +113,7 @@ def mock_low_predict(data, *args, **kwargs): mock_instance.return_value.predict.side_effect = mock_low_predict - data = pd.Series(["1"]) + data = pl.Series(["1"]) profiler = DataLabelerColumn(data.name) profiler.update(data) self.assertEqual("could not determine", profiler.data_label) @@ -144,7 +146,7 @@ def mock_low_predict(data, *args, **kwargs): mock_instance.return_value.predict.side_effect = mock_low_predict - data = pd.Series(["1"] * 10) + data = pl.Series(["1"] * 10) profiler = DataLabelerColumn(data.name) profiler.update(data) self.assertEqual("a|c|b", profiler.data_label) @@ -152,7 +154,7 @@ def mock_low_predict(data, *args, **kwargs): def test_profile(self, mock_instance): self._setup_data_labeler_mock(mock_instance) - data = pd.Series(["1", "2", "3"]) + data = pl.Series(["1", "2", "3"]) profiler = DataLabelerColumn(data.name) expected_profile = { @@ -180,7 +182,7 @@ def test_profile(self, mock_instance): def test_report(self, mock_instance): self._setup_data_labeler_mock(mock_instance) - data = pd.Series(["1", "2", "3"]) + data = pl.Series(["1", "2", "3"]) profile = DataLabelerColumn(data.name) report1 = profile.profile @@ -206,7 +208,7 @@ def test_label_match(self, mock_instance): mock_DataLabeler.model.num_labels = 4 mock_DataLabeler.model.requires_zero_mapping = False - data = pd.Series(["1", "2", "3", "4", "5", "6"]) + data = pl.Series(["1", "2", "3", "4", "5", "6"]) profiler = DataLabelerColumn(data.name) profiler.sample_size = 1 @@ -217,8 +219,8 @@ def test_label_match(self, mock_instance): def test_profile_merge(self, mock_instance): self._setup_data_labeler_mock(mock_instance) - data = 
pd.Series(["1", "2", "3", "11"]) - data2 = pd.Series(["4", "5", "6", "7", "9", "10", "12"]) + data = pl.Series(["1", "2", "3", "11"]) + data2 = pl.Series(["4", "5", "6", "7", "9", "10", "12"]) expected_profile = { "data_label": "a|b", @@ -313,8 +315,8 @@ def test_profile_merge_with_different_options(self, mock_instance): self._setup_data_labeler_mock(mock_instance) # Different max_sample_size values - data = pd.Series(["1", "2", "3", "11"]) - data2 = pd.Series(["4", "5", "6", "7", "9", "10", "12"]) + data = pl.Series(["1", "2", "3", "11"]) + data2 = pl.Series(["4", "5", "6", "7", "9", "10", "12"]) options = DataLabelerOptions() options.max_sample_size = 20 profiler = DataLabelerColumn(data.name, options=options) @@ -392,8 +394,8 @@ def test_empty_data(self, *mocks): # Mock out the data_label, avg_predictions, and label_representation # properties - profiler1.update(pd.Series()) - profiler2.update(pd.Series()) + profiler1.update(pl.Series()) + profiler2.update(pl.Series()) merge_profile = profiler1 + profiler2 self.assertIsNone(merge_profile._rank_distribution) @@ -447,7 +449,7 @@ def test_json_encode(self, mock_instance): def test_json_encode_after_update(self, mock_instance): self._setup_data_labeler_mock(mock_instance) - data = pd.Series(["1", "2", "3", "4"], dtype=object) + data = pl.Series(["1", "2", "3", "4"], dtype=object) profiler = DataLabelerColumn(data.name) profiler.data_labeler._default_model_loc = "this is a test model loc" with test_utils.mock_timeit(): @@ -458,7 +460,7 @@ def test_json_encode_after_update(self, mock_instance): { "class": "DataLabelerColumn", "data": { - "name": None, + "name": "", "col_index": float("nan"), "sample_size": 4, "metadata": {}, @@ -492,7 +494,7 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_BaseDataLabeler): self._setup_data_labeler_mock(mock_BaseDataLabeler) mock_utils_DataLabeler.load_from_library.side_effect = mock_BaseDataLabeler - data = pd.Series(["1", "2", "3", "4"], dtype=object) + data = 
pl.Series(["1", "2", "3", "4"], dtype=object) expected = DataLabelerColumn(data.name) expected.data_labeler._default_model_loc = "structured_model" serialized = json.dumps(expected, cls=ProfileEncoder) @@ -537,7 +539,7 @@ def test_json_decode_after_update( self._setup_data_labeler_mock(mock_BaseDataLabeler) mock_utils_DataLabeler.load_from_library.side_effect = mock_BaseDataLabeler - data = pd.Series(["1", "2", "3", "4"], dtype=object) + data = pl.Series(["1", "2", "3", "4"], dtype=object) expected = DataLabelerColumn(data.name) expected.data_labeler._default_model_loc = "structured_model" with test_utils.mock_timeit(): @@ -547,7 +549,7 @@ def test_json_decode_after_update( deserialized = load_column_profile(json.loads(serialized)) test_utils.assert_profiles_equal(deserialized, expected) - update_data = pd.Series(["4", "5", "6", "7"], dtype=object) + update_data = pl.Series(["4", "5", "6", "7"], dtype=object) deserialized.update(update_data) assert deserialized.sample_size == 8