From 7461c0dc9cb81035914456a93f7369243053fd22 Mon Sep 17 00:00:00 2001 From: Elvin <1487819688@qq.com> Date: Fri, 22 Nov 2024 17:03:08 +0800 Subject: [PATCH 1/2] bugfix: datetime_formatter error --- sdgx/data_processors/formatters/datetime.py | 34 ++++------ ...rator_connector_with_datetime_formatter.py | 63 +++++++++++++++++++ 2 files changed, 75 insertions(+), 22 deletions(-) create mode 100644 tests/optmize/test_generator_connector_with_datetime_formatter.py diff --git a/sdgx/data_processors/formatters/datetime.py b/sdgx/data_processors/formatters/datetime.py index 5c4276fe..540b4fd8 100644 --- a/sdgx/data_processors/formatters/datetime.py +++ b/sdgx/data_processors/formatters/datetime.py @@ -124,25 +124,19 @@ def convert_datetime_columns(datetime_column_list, datetime_formats, processed_d Returns: - result_data (pd.DataFrame): Processed table data with datetime columns converted to timestamp """ - - def convert_single_column_datetime_to_timestamp( - column_data: pd.Series, datetime_format: str - ): + def datetime_formatter(each_value, datetime_format): """ - convert each single column datetime string to timestamp int value. + convert each single column datetime string to timestamp int value. """ - res = [] - for each_value in column_data: - try: - datetime_obj = datetime.strptime(str(each_value), datetime_format) - each_stamp = datetime.timestamp(datetime_obj) - except Exception as e: - logger.warning(f"An error occured when convert str to timestamp {e}.") - logger.warning(f"Input parameters: ({str(each_value)}, {datetime_format})") - logger.warning(f"Input type: ({type(each_value)}, {type(datetime_format)})") - each_stamp = 0 - res.append(each_stamp) - return pd.Series(res) + try: + datetime_obj = datetime.strptime(str(each_value), datetime_format) + each_stamp = datetime.timestamp(datetime_obj) + except Exception as e: + logger.warning(f"An error occured when convert str to timestamp {e}.") + logger.warning(f"Input parameters: ({str(each_value)}, {datetime_format})") + logger.warning(f"Input type: ({type(each_value)}, {type(datetime_format)})") + each_stamp = 0 + return each_stamp # Make a copy of processed_data to avoid modifying the original data result_data = processed_data.copy() @@ -150,11 +144,7 @@ def convert_single_column_datetime_to_timestamp( # Convert each datetime column in datetime_column_list to timestamp for column in datetime_column_list: # Convert datetime to timestamp (int) - timestamp_col = convert_single_column_datetime_to_timestamp( - processed_data[column], datetime_formats[column] - ) - result_data[column] = timestamp_col - + result_data[column] = result_data[column].apply(datetime_formatter, datetime_format=datetime_formats[column]) return result_data def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame: diff --git a/tests/optmize/test_generator_connector_with_datetime_formatter.py b/tests/optmize/test_generator_connector_with_datetime_formatter.py new file mode 100644 index 00000000..40e24433 --- /dev/null +++ b/tests/optmize/test_generator_connector_with_datetime_formatter.py @@ -0,0 +1,63 @@ +import faker +import pandas as pd +import pytest +from typing_extensions import Generator + +from sdgx.data_connectors.generator_connector import GeneratorConnector +from sdgx.data_loader import DataLoader +from sdgx.data_models.metadata import Metadata +from sdgx.data_processors.formatters.datetime import DatetimeFormatter + +CHUNK_SIZE = 100 + + +@pytest.fixture +def datetime_test_df(): + total_row = 150 + ff = faker.Faker() + df = pd.DataFrame([ff.date() for i in range(total_row)], columns=['date']) + return df + + +def test_datetime_formatter_test_df(datetime_test_df: pd.DataFrame): + def df_generator(): + yield datetime_test_df + + data_processors = [DatetimeFormatter()] + dataconnector = GeneratorConnector(df_generator) + dataloader = DataLoader(dataconnector, chunksize=CHUNK_SIZE) + + metadata = Metadata.from_dataloader(dataloader) + metadata.datetime_columns = ["date"] + metadata.discrete_columns = [] + metadata.datetime_format = { + "date": "%Y-%m-%d" + } + + for d in data_processors: + d.fit(metadata=metadata, tabular_data=dataloader) + + def chunk_generator() -> Generator[pd.DataFrame, None, None]: + for chunk in dataloader.iter(): + for d in data_processors: + chunk = d.convert(chunk) + + assert not chunk.isna().any().any() + assert not chunk.isnull().any().any() + yield chunk + + processed_dataloader = DataLoader( + GeneratorConnector(chunk_generator), + identity=dataloader.identity + ) + + df = processed_dataloader.load_all() + + assert not df.isna().any().any() + assert not df.isnull().any().any() + + reverse_converted_df = df + for d in data_processors: + reverse_converted_df = d.reverse_convert(df) + + assert reverse_converted_df.eq(datetime_test_df).all().all() From 31765534bebe52249a0044d6a5d21a714267d7e8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 22 Nov 2024 10:06:17 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sdgx/data_processors/formatters/datetime.py | 7 +++++-- .../test_generator_connector_with_datetime_formatter.py | 9 +++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sdgx/data_processors/formatters/datetime.py b/sdgx/data_processors/formatters/datetime.py index 540b4fd8..61e0d4a3 100644 --- a/sdgx/data_processors/formatters/datetime.py +++ b/sdgx/data_processors/formatters/datetime.py @@ -124,9 +124,10 @@ def convert_datetime_columns(datetime_column_list, datetime_formats, processed_d Returns: - result_data (pd.DataFrame): Processed table data with datetime columns converted to timestamp """ + def datetime_formatter(each_value, datetime_format): """ - convert each single column datetime string to timestamp int value. + convert each single column datetime string to timestamp int value. """ try: datetime_obj = datetime.strptime(str(each_value), datetime_format) @@ -144,7 +145,9 @@ def datetime_formatter(each_value, datetime_format): # Convert each datetime column in datetime_column_list to timestamp for column in datetime_column_list: # Convert datetime to timestamp (int) - result_data[column] = result_data[column].apply(datetime_formatter, datetime_format=datetime_formats[column]) + result_data[column] = result_data[column].apply( + datetime_formatter, datetime_format=datetime_formats[column] + ) return result_data def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame: diff --git a/tests/optmize/test_generator_connector_with_datetime_formatter.py b/tests/optmize/test_generator_connector_with_datetime_formatter.py index 40e24433..9896f5ab 100644 --- a/tests/optmize/test_generator_connector_with_datetime_formatter.py +++ b/tests/optmize/test_generator_connector_with_datetime_formatter.py @@ -15,7 +15,7 @@ def datetime_test_df(): total_row = 150 ff = faker.Faker() - df = pd.DataFrame([ff.date() for i in range(total_row)], columns=['date']) + df = pd.DataFrame([ff.date() for i in range(total_row)], columns=["date"]) return df @@ -30,9 +30,7 @@ def df_generator(): metadata = Metadata.from_dataloader(dataloader) metadata.datetime_columns = ["date"] metadata.discrete_columns = [] - metadata.datetime_format = { - "date": "%Y-%m-%d" - } + metadata.datetime_format = {"date": "%Y-%m-%d"} for d in data_processors: d.fit(metadata=metadata, tabular_data=dataloader) @@ -47,8 +45,7 @@ def chunk_generator() -> Generator[pd.DataFrame, None, None]: yield chunk processed_dataloader = DataLoader( - GeneratorConnector(chunk_generator), - identity=dataloader.identity + GeneratorConnector(chunk_generator), identity=dataloader.identity ) df = processed_dataloader.load_all()