Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Int col profile update #1095

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
import polars as pl

from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
from .numerical_column_stats import NumericStatsMixin
Expand Down Expand Up @@ -113,7 +114,7 @@ def data_type_ratio(self) -> float | None:
return None

@classmethod
def _is_each_row_int(cls, df_series: pd.Series) -> list[bool]:
def _is_each_row_int(cls, df_series: pl.Series) -> list[bool]:
"""
Return true if given is numerical and int values.

Expand All @@ -134,7 +135,7 @@ def _is_each_row_int(cls, df_series: pd.Series) -> list[bool]:

return [NumericStatsMixin.is_int(x) for x in df_series]

def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
"""
Update col profile properties with clean dataset and its known null params.

Expand All @@ -144,6 +145,7 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
:type profile: dict
:return: None
"""
df_series_clean = pd.Series(df_series_clean.to_numpy())
if self._NumericStatsMixin__calculations:
NumericStatsMixin._update_helper(self, df_series_clean, profile)
self._update_column_base_properties(profile)
Expand All @@ -157,23 +159,32 @@ def update(self, df_series: pd.Series) -> IntColumn:
:return: updated IntColumn
:rtype: IntColumn
"""
self._greater_than_64_bit = (
not df_series.empty
and df_series.apply(pd.to_numeric, errors="coerce").dtype == "O"
)
if self._greater_than_64_bit:
df_series = pl.Series(df_series.to_list(), dtype=pl.Object)
else:
df_series = pl.from_pandas(df_series)
if len(df_series) == 0:
return self

df_series = df_series.reset_index(drop=True)
is_each_row_int = self._is_each_row_int(df_series)
sample_size = len(is_each_row_int)
match_int_count = np.sum(is_each_row_int)
match_int_count = np.sum([is_each_row_int])
profile = dict(match_count=match_int_count, sample_size=sample_size)

BaseColumnProfiler._perform_property_calcs(
self,
self.__calculations,
df_series=df_series[is_each_row_int],
df_series=df_series.filter(is_each_row_int),
prev_dependent_properties={},
subset_properties=profile,
)

self._update_helper(df_series_clean=df_series[is_each_row_int], profile=profile)
self._update_helper(
df_series_clean=df_series.filter(is_each_row_int), profile=profile
)

return self
8 changes: 8 additions & 0 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -1924,6 +1924,10 @@ def _get_skewness(
):
return

if self._greater_than_64_bit and type(df_series) is pd.Series:
df_series = df_series.to_numpy(dtype=float)
else:
df_series = pl.from_pandas(df_series, nan_to_null=False)
batch_biased_skewness = profiler_utils.biased_skew(df_series)
subset_properties["biased_skewness"] = batch_biased_skewness
batch_count = subset_properties["match_count"]
Expand Down Expand Up @@ -1968,6 +1972,10 @@ def _get_kurtosis(
):
return

if self._greater_than_64_bit and type(df_series) is pd.Series:
df_series = df_series.to_numpy(dtype=float)
else:
df_series = pl.from_pandas(df_series, nan_to_null=False)
batch_biased_kurtosis = profiler_utils.biased_kurt(df_series)
subset_properties["biased_kurtosis"] = batch_biased_kurtosis
batch_count = subset_properties["match_count"]
Expand Down
10 changes: 7 additions & 3 deletions dataprofiler/profilers/profiler_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@
)

import numpy as np
import polars as pl
import psutil
import scipy
from pandas import DataFrame, Series
from pandas import DataFrame
from polars import Series

from ..labelers.data_labelers import DataLabeler

Expand Down Expand Up @@ -320,7 +322,7 @@ def add_nested_dictionaries(first_dict: dict, second_dict: dict) -> dict:
return merged_dict


def biased_skew(df_series: Series) -> np.float64:
def biased_skew(df_series: Series | np.ndarray) -> np.float64:
"""
Calculate the biased estimator for skewness of the given data.

Expand Down Expand Up @@ -358,7 +360,7 @@ def biased_skew(df_series: Series) -> np.float64:
return skew


def biased_kurt(df_series: Series) -> np.float64:
def biased_kurt(df_series: Series | np.ndarray) -> np.float64:
"""
Calculate the biased estimator for kurtosis of the given data.

Expand Down Expand Up @@ -675,6 +677,8 @@ def get_memory_size(data: list | np.ndarray | DataFrame, unit: str = "M") -> flo
:type unit: string
:return: memory size of the input data
"""
if type(data) is DataFrame:
data = pl.from_pandas(data)
unit_map: dict = collections.defaultdict(B=0, K=1, M=2, G=3)
if unit not in unit_map:
raise ValueError(
Expand Down
Loading