Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow missing data in low frequency data #17

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.7
python-version: 3.11
- name: Give PyPI a chance to update the index
run: sleep 240
- name: Install from PyPI
Expand Down
80 changes: 67 additions & 13 deletions tests/test_disaggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from tsdisagg import disaggregate_series
from tsdisagg.time_conversion import FREQ_CONVERSION_FACTORS, MONTHS, get_frequency_name
from tsdisagg.ts_disagg import build_conversion_matrix
from tsdisagg.ts_disagg import METHOD, build_conversion_matrix


def generate_random_index_pair(
Expand Down Expand Up @@ -93,6 +93,39 @@ def frequencies(draw: Callable[[SearchStrategy[int]], int]) -> tuple[str, str]:
return low_freq, high_freq


@pytest.fixture()
def exports_m():
    """Monthly export series loaded from the test data directory.

    The CSV's original index is replaced by a month-start (``MS``)
    DatetimeIndex beginning 1972-01-01, and the single data column is
    renamed ``exports``.
    """
    frame = pd.read_csv("tests/data/exports_m.csv", index_col=0)
    n_periods = frame.shape[0]
    frame.index = pd.date_range(start="1972-01-01", freq="MS", periods=n_periods)
    frame.columns = ["exports"]
    return frame


@pytest.fixture()
def sales_a():
    """Annual sales series loaded from the test data directory.

    The CSV's original index is replaced by a year-start (``YS``)
    DatetimeIndex beginning 1975-01-01, and the single data column is
    renamed ``sales``.
    """
    frame = pd.read_csv("tests/data/sales_a.csv", index_col=0)
    n_periods = frame.shape[0]
    frame.index = pd.date_range(start="1975-01-01", freq="YS", periods=n_periods)
    frame.columns = ["sales"]
    return frame


@pytest.fixture()
def exports_q():
    """Quarterly export series loaded from the test data directory.

    The CSV's original index is replaced by a quarter-start DatetimeIndex
    (``QS-OCT`` anchoring) beginning 1972-01-01, and the single data
    column is renamed ``exports``.
    """
    frame = pd.read_csv("tests/data/exports_q.csv", index_col=0)
    n_periods = frame.shape[0]
    frame.index = pd.date_range(start="1972-01-01", freq="QS-OCT", periods=n_periods)
    frame.columns = ["exports"]
    return frame


@pytest.fixture()
def imports_q():
    """Quarterly import series loaded from the test data directory.

    The CSV's original index is replaced by a quarter-start DatetimeIndex
    (``QS-OCT`` anchoring) beginning 1972-01-01, and the single data
    column is renamed ``imports``.
    """
    frame = pd.read_csv("tests/data/imports_q.csv", index_col=0)
    n_periods = frame.shape[0]
    frame.index = pd.date_range(start="1972-01-01", freq="QS-OCT", periods=n_periods)
    frame.columns = ["imports"]
    return frame


@given(frequencies())
@pytest.mark.parametrize("agg_func", ["sum", "mean", "first", "last"])
def test_build_C_matrix(agg_func, frequencies):
Expand Down Expand Up @@ -388,18 +421,6 @@ def test_invalid_dataframe_warnings():
agg_func="sum",
)

with pytest.raises(ValueError, match="low_freq_df has missing values"):
disaggregate_series(
pd.DataFrame(
{"data": [1, np.nan, 3]}, index=pd.date_range("2020-01-01", periods=3, freq="D")
),
pd.DataFrame(
{"data": [1, 2, 3]}, index=pd.date_range("2020-01-01", periods=3, freq="D")
),
method="denton",
agg_func="sum",
)

with pytest.raises(ValueError, match="high_freq_df has missing values"):
disaggregate_series(
pd.DataFrame(
Expand Down Expand Up @@ -475,5 +496,38 @@ def test_invalid_dataframe_warnings():
)


@pytest.mark.parametrize("method", ["denton", "chow-lin", "litterman"])
@pytest.mark.parametrize("missing_in_center", [True, False])
def test_disagg_with_internal_low_freq_missing(
    sales_a, exports_q, exports_m, method: METHOD, missing_in_center
):
    """Disaggregation tolerates NaNs in the low-frequency target series.

    A single annual observation is replaced with NaN — either in the
    interior of the series or at its first position — and the result is
    required to be NaN-free with the expected monthly index.
    """
    sales_a = sales_a.copy()

    # Knock out one annual value: interior vs. boundary position.
    nan_position = 10 if missing_in_center else 0
    sales_a.iloc[nan_position] = np.nan

    # Denton-family methods are run without an indicator dataframe;
    # the regression-based methods get the monthly indicator + constant.
    indicator_df = None if "denton" in method else exports_m.assign(Constant=1)

    result = disaggregate_series(
        sales_a,
        high_freq_df=indicator_df,
        method=method,
        agg_func="sum",
        target_freq="MS",
        optimizer_kwargs={"method": "nelder-mead"},
        verbose=False,
    )

    assert result.isna().sum() == 0

    if "denton" in method:
        # Without an indicator, the output spans 12 months per annual row.
        expected_index = pd.date_range(
            start=sales_a.index[0], periods=12 * sales_a.shape[0], freq="MS"
        )
        assert np.all(result.index == expected_index)
    else:
        # With an indicator, the output inherits the indicator's index.
        assert np.all(result.index == exports_m.index)


if __name__ == "__main__":
    # The tests in this module are pytest-style functions (fixtures,
    # parametrize, hypothesis @given), which unittest.main() does not
    # collect — running this file directly would report 0 tests and exit
    # successfully. Delegate to pytest, which collects both pytest-style
    # functions and any unittest TestCases, and propagate its exit code.
    raise SystemExit(pytest.main([__file__]))
20 changes: 15 additions & 5 deletions tsdisagg/ts_disagg.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,8 @@
"No datetime index found on the dataframe passed as argument to low_freq_df."
)

if low_freq_df.isna().any().any():
raise ValueError("low_freq_df has missing values.")
# if low_freq_df.isna().any().any():
# raise ValueError("low_freq_df has missing values.")

if high_freq_df is not None:
if not isinstance(high_freq_df.index, pd.core.indexes.datetimes.DatetimeIndex):
Expand Down Expand Up @@ -395,15 +395,16 @@
)

C = build_conversion_matrix(low_freq_df, high_freq_df, time_conversion_factor, agg_func)
drop_rows = np.all(C == 0, axis=1)
drop_rows = np.all(C == 0, axis=1) | low_freq_df.isna().values.ravel()

if any(drop_rows):
dropped = low_freq_df.index.strftime("%Y-%m-%d")[drop_rows]
warnings.warn(
f'Insufficent high-frequency data to decompose the following dates: {", ".join(dropped)}',
UserWarning,
)

y = df.iloc[:, target_idx].dropna().loc[~drop_rows]
y = low_freq_df.loc[~drop_rows].squeeze()
C = C[~drop_rows, :]
X = df.drop(columns=df.columns[target_idx])

Expand Down Expand Up @@ -463,7 +464,16 @@
ul = y - C @ p
y_hat = p + D @ ul

output = pd.Series(y_hat, index=df.index, name=target_column)
if not isinstance(y_hat, pd.Series | pd.DataFrame):
output = pd.Series(y_hat, index=df.index, name=target_column)
elif isinstance(y_hat, pd.Series):
output = y_hat
output.name = target_column

Check warning on line 471 in tsdisagg/ts_disagg.py

View check run for this annotation

Codecov / codecov/patch

tsdisagg/ts_disagg.py#L469-L471

Added lines #L469 - L471 were not covered by tests
else:
output = y_hat.iloc[:, 0]
output.name = target_column

Check warning on line 474 in tsdisagg/ts_disagg.py

View check run for this annotation

Codecov / codecov/patch

tsdisagg/ts_disagg.py#L473-L474

Added lines #L473 - L474 were not covered by tests

output.index = df.index
output.index.freq = output.index.inferred_freq

if return_optim_res and result is not None:
Expand Down
Loading