From e6e20cc3538a2723075d7abe329572a47e54f89f Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Mon, 5 Feb 2024 15:59:00 -0600 Subject: [PATCH 1/3] update profiler utils --- dataprofiler/profilers/profiler_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py index e38e1b04..5edc4898 100644 --- a/dataprofiler/profilers/profiler_utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -26,6 +26,7 @@ ) import numpy as np +import polars as pl import psutil import scipy from pandas import DataFrame, Series @@ -331,6 +332,7 @@ def biased_skew(df_series: Series) -> np.float64: :return: biased skewness :rtype: np.float64 """ + df_series = pl.from_pandas(df_series, nan_to_null=False) n = len(df_series) if n < 1: return np.float64(np.nan) @@ -369,6 +371,7 @@ def biased_kurt(df_series: Series) -> np.float64: :return: biased kurtosis :rtype: np.float64 """ + df_series = pl.from_pandas(df_series, nan_to_null=False) n = len(df_series) if n < 1: return np.float64(np.nan) From bcf6b26917b690e7b35befb38fca173fefe3e843 Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Mon, 5 Feb 2024 16:26:12 -0600 Subject: [PATCH 2/3] finish updates --- dataprofiler/profilers/numerical_column_stats.py | 8 ++++++++ dataprofiler/profilers/profiler_utils.py | 11 ++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 74c24e21..7fe05aee 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -1924,6 +1924,10 @@ def _get_skewness( ): return + if self._greater_than_64_bit and type(df_series) is pd.Series: + df_series = df_series.to_numpy(dtype=float) + else: + df_series = pl.from_pandas(df_series, nan_to_null=False) batch_biased_skewness = profiler_utils.biased_skew(df_series) subset_properties["biased_skewness"] = batch_biased_skewness batch_count = subset_properties["match_count"] @@ -1968,6 +1972,10 @@ def _get_kurtosis( ): return + if self._greater_than_64_bit and type(df_series) is pd.Series: + df_series = df_series.to_numpy(dtype=float) + else: + df_series = pl.from_pandas(df_series, nan_to_null=False) batch_biased_kurtosis = profiler_utils.biased_kurt(df_series) subset_properties["biased_kurtosis"] = batch_biased_kurtosis batch_count = subset_properties["match_count"] diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py index 5edc4898..a81dca7a 100644 --- a/dataprofiler/profilers/profiler_utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -29,7 +29,8 @@ import polars as pl import psutil import scipy -from pandas import DataFrame, Series +from pandas import DataFrame +from polars import Series from ..labelers.data_labelers import DataLabeler @@ -321,7 +322,7 @@ def add_nested_dictionaries(first_dict: dict, second_dict: dict) -> dict: return merged_dict -def biased_skew(df_series: Series) -> np.float64: +def biased_skew(df_series: Series | np.ndarray) -> np.float64: """ Calculate the biased estimator for skewness of the given data. @@ -332,7 +333,6 @@ def biased_skew(df_series: Series) -> np.float64: :return: biased skewness :rtype: np.float64 """ - df_series = pl.from_pandas(df_series, nan_to_null=False) n = len(df_series) if n < 1: return np.float64(np.nan) @@ -360,7 +360,7 @@ def biased_skew(df_series: Series) -> np.float64: return skew -def biased_kurt(df_series: Series) -> np.float64: +def biased_kurt(df_series: Series | np.ndarray) -> np.float64: """ Calculate the biased estimator for kurtosis of the given data. @@ -371,7 +371,6 @@ def biased_kurt(df_series: Series) -> np.float64: :return: biased kurtosis :rtype: np.float64 """ - df_series = pl.from_pandas(df_series, nan_to_null=False) n = len(df_series) if n < 1: return np.float64(np.nan) @@ -678,6 +677,8 @@ def get_memory_size(data: list | np.ndarray | DataFrame, unit: str = "M") -> flo :type unit: string :return: memory size of the input data """ + if type(data) is DataFrame: + data = pl.from_pandas(data) unit_map: dict = collections.defaultdict(B=0, K=1, M=2, G=3) if unit not in unit_map: raise ValueError( From 7f7fc573fdbf2e9b36c0588636797b2c534bfdc4 Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Fri, 16 Feb 2024 13:40:07 -0600 Subject: [PATCH 3/3] finish int updates --- dataprofiler/profilers/int_column_profile.py | 23 +++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index 014465c7..5e1ad6ee 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import polars as pl from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler from .numerical_column_stats import NumericStatsMixin @@ -113,7 +114,7 @@ def data_type_ratio(self) -> float | None: return None @classmethod - def _is_each_row_int(cls, df_series: pd.Series) -> list[bool]: + def _is_each_row_int(cls, df_series: pl.Series) -> list[bool]: """ Return true if given is numerical and int values. @@ -134,7 +135,7 @@ def _is_each_row_int(cls, df_series: pd.Series) -> list[bool]: return [NumericStatsMixin.is_int(x) for x in df_series] - def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: + def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: """ Update col profile properties with clean dataset and its known null params. @@ -144,6 +145,7 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: :type profile: dict :return: None """ + df_series_clean = pd.Series(df_series_clean.to_numpy()) if self._NumericStatsMixin__calculations: NumericStatsMixin._update_helper(self, df_series_clean, profile) self._update_column_base_properties(profile) @@ -157,23 +159,32 @@ def update(self, df_series: pd.Series) -> IntColumn: :return: updated IntColumn :rtype: IntColumn """ + self._greater_than_64_bit = ( + not df_series.empty + and df_series.apply(pd.to_numeric, errors="coerce").dtype == "O" + ) + if self._greater_than_64_bit: + df_series = pl.Series(df_series.to_list(), dtype=pl.Object) + else: + df_series = pl.from_pandas(df_series) if len(df_series) == 0: return self - df_series = df_series.reset_index(drop=True) is_each_row_int = self._is_each_row_int(df_series) sample_size = len(is_each_row_int) - match_int_count = np.sum(is_each_row_int) + match_int_count = np.sum([is_each_row_int]) profile = dict(match_count=match_int_count, sample_size=sample_size) BaseColumnProfiler._perform_property_calcs( self, self.__calculations, - df_series=df_series[is_each_row_int], + df_series=df_series.filter(is_each_row_int), prev_dependent_properties={}, subset_properties=profile, ) - self._update_helper(df_series_clean=df_series[is_each_row_int], profile=profile) + self._update_helper( + df_series_clean=df_series.filter(is_each_row_int), profile=profile + ) return self