diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cb5219b..9a0d5bd 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -39,7 +39,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: 3.7 + python-version: 3.11 - name: Give PyPI a chance to update the index run: sleep 240 - name: Install from PyPI diff --git a/tests/test_disaggregation.py b/tests/test_disaggregation.py index 7a795a9..628787d 100644 --- a/tests/test_disaggregation.py +++ b/tests/test_disaggregation.py @@ -12,7 +12,7 @@ from tsdisagg import disaggregate_series from tsdisagg.time_conversion import FREQ_CONVERSION_FACTORS, MONTHS, get_frequency_name -from tsdisagg.ts_disagg import build_conversion_matrix +from tsdisagg.ts_disagg import METHOD, build_conversion_matrix def generate_random_index_pair( @@ -93,6 +93,39 @@ def frequencies(draw: Callable[[SearchStrategy[int]], int]) -> tuple[str, str]: return low_freq, high_freq +@pytest.fixture() +def exports_m(): + exports_m = pd.read_csv("tests/data/exports_m.csv", index_col=0) + exports_m.index = pd.date_range(start="1972-01-01", freq="MS", periods=exports_m.shape[0]) + exports_m.columns = ["exports"] + return exports_m + + +@pytest.fixture() +def sales_a(): + sales_a = pd.read_csv("tests/data/sales_a.csv", index_col=0) + sales_a.index = pd.date_range(start="1975-01-01", freq="YS", periods=sales_a.shape[0]) + sales_a.columns = ["sales"] + return sales_a + + +@pytest.fixture() +def exports_q(): + exports_q = pd.read_csv("tests/data/exports_q.csv", index_col=0) + exports_q.index = pd.date_range(start="1972-01-01", freq="QS-OCT", periods=exports_q.shape[0]) + exports_q.columns = ["exports"] + return exports_q + + +@pytest.fixture() +def imports_q(): + imports_q = pd.read_csv("tests/data/imports_q.csv", index_col=0) + imports_q.index = pd.date_range(start="1972-01-01", freq="QS-OCT", periods=imports_q.shape[0]) + imports_q.columns = ["imports"] + + return imports_q + 
+ @given(frequencies()) @pytest.mark.parametrize("agg_func", ["sum", "mean", "first", "last"]) def test_build_C_matrix(agg_func, frequencies): @@ -388,18 +421,6 @@ def test_invalid_dataframe_warnings(): agg_func="sum", ) - with pytest.raises(ValueError, match="low_freq_df has missing values"): - disaggregate_series( - pd.DataFrame( - {"data": [1, np.nan, 3]}, index=pd.date_range("2020-01-01", periods=3, freq="D") - ), - pd.DataFrame( - {"data": [1, 2, 3]}, index=pd.date_range("2020-01-01", periods=3, freq="D") - ), - method="denton", - agg_func="sum", - ) - with pytest.raises(ValueError, match="high_freq_df has missing values"): disaggregate_series( pd.DataFrame( @@ -475,5 +496,38 @@ ) +@pytest.mark.parametrize("method", ["denton", "chow-lin", "litterman"]) +@pytest.mark.parametrize("missing_in_center", [True, False]) +def test_disagg_with_internal_low_freq_missing( + sales_a, exports_q, exports_m, method: METHOD, missing_in_center +): + sales_a = sales_a.copy() + + if missing_in_center: + sales_a.iloc[10] = np.nan + else: + sales_a.iloc[0] = np.nan + + result = disaggregate_series( + sales_a, + high_freq_df=exports_m.assign(Constant=1) if "denton" not in method else None, + method=method, + agg_func="sum", + target_freq="MS", + optimizer_kwargs={"method": "nelder-mead"}, + verbose=False, + ) + + assert result.isna().sum() == 0 + + if "denton" in method: + assert np.all( + result.index + == pd.date_range(start=sales_a.index[0], periods=12 * sales_a.shape[0], freq="MS") + ) + else: + assert np.all(result.index == exports_m.index) + + if __name__ == "__main__": unittest.main() diff --git a/tsdisagg/ts_disagg.py b/tsdisagg/ts_disagg.py index 28f2476..5c58f09 100644 --- a/tsdisagg/ts_disagg.py +++ b/tsdisagg/ts_disagg.py @@ -218,8 +218,6 @@ def prepare_input_dataframes(low_freq_df, high_freq_df, target_freq, method): "No datetime index found on the dataframe passed as argument to low_freq_df." 
) - if low_freq_df.isna().any().any(): - raise ValueError("low_freq_df has missing values.") if high_freq_df is not None: if not isinstance(high_freq_df.index, pd.core.indexes.datetimes.DatetimeIndex): @@ -395,7 +395,8 @@ def disaggregate_series( ) C = build_conversion_matrix(low_freq_df, high_freq_df, time_conversion_factor, agg_func) - drop_rows = np.all(C == 0, axis=1) + drop_rows = np.all(C == 0, axis=1) | low_freq_df.isna().values.ravel() + if any(drop_rows): dropped = low_freq_df.index.strftime("%Y-%m-%d")[drop_rows] warnings.warn( @@ -403,7 +404,7 @@ UserWarning, ) - y = df.iloc[:, target_idx].dropna().loc[~drop_rows] + y = low_freq_df.loc[~drop_rows].squeeze() C = C[~drop_rows, :] X = df.drop(columns=df.columns[target_idx]) @@ -463,7 +464,16 @@ ul = y - C @ p y_hat = p + D @ ul - output = pd.Series(y_hat, index=df.index, name=target_column) + if not isinstance(y_hat, pd.Series | pd.DataFrame): + output = pd.Series(y_hat, index=df.index, name=target_column) + elif isinstance(y_hat, pd.Series): + output = y_hat + output.name = target_column + else: + output = y_hat.iloc[:, 0] + output.name = target_column + + output.index = df.index output.index.freq = output.index.inferred_freq if return_optim_res and result is not None: