From 0a959cb24df366c3bec4a397dda1cc7f0c39d510 Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:25:26 +0100 Subject: [PATCH 01/21] first commit --- documentation/catalogs/feature_catalogue.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 documentation/catalogs/feature_catalogue.md diff --git a/documentation/catalogs/feature_catalogue.md b/documentation/catalogs/feature_catalogue.md new file mode 100644 index 00000000..e69de29b From 8743a4203045de226425c5d47906b13de3d302fe Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Wed, 30 Oct 2024 17:50:36 +0100 Subject: [PATCH 02/21] script to generate catalogue of features with very amateur questionable code --- .../catalogs/generate_features_catalog.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 documentation/catalogs/generate_features_catalog.py diff --git a/documentation/catalogs/generate_features_catalog.py b/documentation/catalogs/generate_features_catalog.py new file mode 100644 index 00000000..72a3ca50 --- /dev/null +++ b/documentation/catalogs/generate_features_catalog.py @@ -0,0 +1,87 @@ +import os +import re +import logging +import pandas as pd +import sys +from pathlib import Path +from tabulate import tabulate + +logging.basicConfig( + level=logging.ERROR, format="%(asctime)s %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Path to the root and querysets +PATH = Path(__file__).resolve() +indices = [i for i, x in enumerate(PATH.parts) if x == "views_pipeline"] +PATH_ROOT = Path(*PATH.parts[:indices[-1] + 1]) + +querysets_path = PATH_ROOT / 'common_querysets' +GITHUB_URL = 'https://github.com/prio-data/views_pipeline/blob/production/' + +def extract_columns_from_querysets(querysets_path): + """ + Parses each queryset file in the common_querysets folder to extract columns, querysets, and LOA. 
+ """ + columns_info = [] + + for file_path in querysets_path.glob("*.py"): + with open(file_path, 'r') as file: + content = file.read() + queryset_name = file_path.stem + + # Find all Column definitions + column_matches = re.findall(r'Column\((.*?)\)', content) + for match in column_matches: + column_name = re.search(r'"(.*?)"', match).group(1) + loa_match = re.search(r'from_loa="(.*?)"', match) + loa = loa_match.group(1) if loa_match else None + columns_info.append({ + 'column_name': column_name, + 'queryset': queryset_name, + 'loa': loa + }) + + # Convert to DataFrame for merging and remove duplicates + df = pd.DataFrame(columns_info).drop_duplicates() + + # Group by column_name and aggregate querysets as a comma-separated string + df = df.groupby(['column_name', 'loa'], as_index=False).agg({ + 'queryset': lambda x: ', '.join(sorted(set(x))) # Join unique querysets per feature + }) + + return df + +def generate_markdown_table(columns_info): + """ + Generates a nicely formatted markdown table for a feature catalog. 
+ """ + headers = ['Name in viewser', 'Human-readable name', 'Data source (with link)', + 'Last updated (minutes:hours:day:month:year)', 'Associated querysets/models', 'Notes'] + + table_data = [] + for _, row in columns_info.iterrows(): + table_data.append([ + row['column_name'], + '', # Placeholder for Human-readable name + '', # Placeholder for Data source + 'needs manual update (as of now)', # Placeholder for Last updated + row['queryset'], + '', #Placeholder for notes + ]) + + # Generate markdown with tabulate + markdown_table = tabulate(table_data, headers=headers, tablefmt="pipe", colalign=("center",)) + + return markdown_table + +if __name__ == "__main__": + # Extract feature information from querysets + columns_info = extract_columns_from_querysets(querysets_path) + + # Generate the markdown table for the feature catalog + feature_catalog = generate_markdown_table(columns_info) + + # Save the markdown table + with open('documentation/catalogs/feature_catalog.md', 'w') as f: + f.write(feature_catalog) From cc199d6c6e88f81b580d37bd9b3344a6e2c4f68c Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:50:18 +0100 Subject: [PATCH 03/21] first commit --- .../024_development_and_production_sync.md | 78 ++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/documentation/ADRs/024_development_and_production_sync.md b/documentation/ADRs/024_development_and_production_sync.md index 30404ce4..91cd92b1 100644 --- a/documentation/ADRs/024_development_and_production_sync.md +++ b/documentation/ADRs/024_development_and_production_sync.md @@ -1 +1,77 @@ -TODO \ No newline at end of file + +## Development and Production Sync +*Refinement of Model Configuration Files Structure* + +| ADR Info | Details | +|---------------------|-------------------| +| Subject | Production and Development Branch Synchronization | +| ADR Number | 024 | +| Status | Proposed | +| Author | Simon | +| Date | 
31.10.2024. | + +## Context + +We aim to establish a new benchmark in MLOps for early warning systems (EWS), specifically for conflict forecasting, which demands high standards of reliability, transparency, and seamless update processes. Given the high stakes of forecasting in EWS, the branching strategy must support robust, transparent, and consistent updates, with a focus on ensuring production stability while accommodating active, iterative development. + +To support continuous quality assurance, real-time monitoring, and rapid model updates, the synchronization between development and production branches must be structured to maintain reliability and performance while addressing the following critical needs: +- Irregular Deployment Frequency: The project requires deployments ranging from weekly to monthly, demanding a workflow that can handle periodic updates without disrupting production stability. +- Critical Model Monitoring: Ensuring real-time model monitoring is essential to maintain the accuracy and reliability of predictions, with a strong focus on data drift detection, model performance assessment, and feature validation across deployment cycles. +- Coupled ML and Non-ML Components: Some non-ML components are tightly integrated with ML workflows, requiring synchronized updates to avoid dependency issues in production. +- Versioning and Traceability: Maintaining version control and artifact management is crucial for reproducibility, rollback, and historical comparison, particularly in a pipeline that supports high-stakes decision-making and early action. + +This ADR defines the branching and synchronization structure necessary to support these requirements while adhering to MLOps best practices, ensuring the production branch remains stable and reliable for operational forecasting while allowing iterative improvements in development. 
+ +## Decision + +To achieve the requirements described in the Context section, we will implement the following strategy for branching and synchronization strategy, optimized for the EWS pipeline. + +### Overview + +**Branch Structure and Sync Strategy** + +1. **Primary Branches** +- **Production:** Serves as the stable branch for all production-ready code and models. Only validated updates are merged here, ensuring production stability for high-stakes decision-making. +- **Development:** Acts as the main integration branch for feature development, model updates, and experiment integration. All new features are developed in dedicated feature branches based on this branch and merged via Pull Requests (PRs) to ensure controlled updates and testing. + +2. **Feature Branch Workflow** +- Feature branches are created off development for isolated testing of new features, models, or configurations. +- Each feature branch undergoes rigorous PR reviews and automated testing to ensure compatibility, stability, and performance before merging into development. This approach maintains the stability of development, reducing errors upon merging to production. + +3. **Syncing Development to Production** +- **Periodic Pull Requests:** At regular intervals (between weekly and monthly), development will be merged into production via a Pull Request once a full validation cycle is completed. +- **Staging Environment Validation:** A staging environment replicates production settings to validate the integrity of development before merging into production. This includes running inference tests, drift detection, performance checks, and monitoring to detect issues pre-deployment, ensuring production stability. + +4. **Hotfix Branches** +- For urgent issues in production, hotfix branches are created directly from production, fixed, tested, and merged back into production. These hotfixes are then backported to development to maintain consistency between branches. + +5. 
**Versioning** +- **Semantic Versioning:** Each production release is tagged with semantic versioning (e.g., v1.0.0, v1.1.0) to facilitate traceability and rollback. + +## Consequences + +**Positive Effects:** +- **Production Stability:** Clear separation between development and production minimizes the risk of untested code or model updates affecting production stability. +- **Enhanced Monitoring and Quality Assurance:** The use of a staging environment and comprehensive validation checks before each merge ensures consistent quality and reliability in production. +- **Rapid Issue Resolution:** Hotfix branches allow urgent fixes to be deployed directly to production, reducing downtime and maintaining model performance for critical decision-making. + +**Negative Effects:** +- **Increased Complexity in Workflow:** Multiple branches and regular sync requirements add to the complexity of the branching strategy, necessitating disciplined version control and coordination across teams. +- **Resource Overhead for Staging and Testing:** Maintaining a staging environment and conducting extensive validation tests for each update demands additional resources but is justified by the critical need for model reliability in production. + + +## Rationale +This branching and sync structure balances flexibility in development with reliability in production. By keeping development and production branches separate and introducing a staging validation step, we ensure that production remains stable and capable of handling high-stakes forecasts while enabling iterative development in development. The addition of hotfix branches further reduces the risk of downtime due to critical issues in production. + +### Considerations +- **Sync Delays:** Frequent updates in development may slow down synchronization with production if not carefully managed. Scheduled periodic merges and staging validation cycles mitigate this risk. 
+- **Resource Allocation:** The staging environment and enhanced testing for each PR demand additional computational resources and time but align with the need for stability and reliability in conflict forecasting. + +## Additional Notes + + +## Feedback and Suggestions +Feedback is welcome on any additional sync requirements, monitoring tools, or branching conventions. Input on optimizing the staging environment and hotfix management process is also appreciated to ensure alignment with best practices. + +--- + From 088a7f92d1707883ab9a0cffe6cf7fcb3cbee8db Mon Sep 17 00:00:00 2001 From: Polichinl Date: Fri, 1 Nov 2024 01:50:46 +0100 Subject: [PATCH 04/21] naming convention --- documentation/catalogs/feature_catalog.md | 90 +++++++++++++++++++++ documentation/catalogs/feature_catalogue.md | 0 2 files changed, 90 insertions(+) create mode 100644 documentation/catalogs/feature_catalog.md delete mode 100644 documentation/catalogs/feature_catalogue.md diff --git a/documentation/catalogs/feature_catalog.md b/documentation/catalogs/feature_catalog.md new file mode 100644 index 00000000..0b69805c --- /dev/null +++ b/documentation/catalogs/feature_catalog.md @@ -0,0 +1,90 @@ +| Name in viewser | Human-readable name | Data source (with link) | Last updated (minutes:hours:day:month:year) | Associated querysets/models | Notes | +|:-----------------------:|:----------------------|:--------------------------|:----------------------------------------------|:-----------------------------------------------------------------------------------------|:--------| +| agri_ih | | | needs manual update (as of now) | queryset_blank_space | | +| barren_ih | | | needs manual update (as of now) | queryset_blank_space | | +| c_id | | | needs manual update (as of now) | queryset_meow_meow | | +| col | | | needs manual update (as of now) | queryset_meow_meow | | +| cropprop | | | needs manual update (as of now) | queryset_old_money | | +| decay_ged_ns_1 | | | needs manual update (as of now) | 
queryset_electric_relaxation | | +| decay_ged_ns_100 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_ns_25 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_ns_5 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_ns_5 | | | needs manual update (as of now) | queryset_lavender_haze | | +| decay_ged_os_1 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_os_1 | | | needs manual update (as of now) | queryset_blank_space, queryset_orange_pasta | | +| decay_ged_os_100 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_os_25 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_os_5 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_os_5 | | | needs manual update (as of now) | queryset_lavender_haze | | +| decay_ged_sb_1 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_sb_1 | | | needs manual update (as of now) | queryset_blank_space, queryset_orange_pasta | | +| decay_ged_sb_100 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_sb_25 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_sb_25 | | | needs manual update (as of now) | queryset_blank_space, queryset_orange_pasta | | +| decay_ged_sb_5 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| decay_ged_sb_5 | | | needs manual update (as of now) | queryset_lavender_haze | | +| decay_ged_sb_500 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| dist_diamsec | | | needs manual update (as of now) | queryset_blank_space, queryset_lavender_haze | | +| dist_petroleum | | | needs manual update (as of now) | queryset_blank_space | | +| forest_ih | | | needs manual update (as of now) | queryset_blank_space | | +| ged_gte_1 | | | 
needs manual update (as of now) | queryset_wildest_dream, queryset_yellow_pikachu | | +| ged_ns | | | needs manual update (as of now) | queryset_lavender_haze | | +| ged_ns_dummy_t0 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_ns_dummy_t1 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_ns_dummy_t2 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_ns_dummy_t3 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_ns_dummy_t4 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_ns_dummy_t5 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_ns_dummy_t6 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_os | | | needs manual update (as of now) | queryset_lavender_haze | | +| ged_os_dummy_t0 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_os_dummy_t1 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_os_dummy_t2 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_os_dummy_t3 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_os_dummy_t4 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_os_dummy_t5 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_os_dummy_t6 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_sb | | | needs manual update (as of now) | queryset_lavender_haze | | +| ged_sb_dep | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_sb_dep | | | needs manual update (as of now) | queryset_orange_pasta, queryset_yellow_pikachu | | +| ged_sb_dummy_t0 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_sb_dummy_t1 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_sb_dummy_t2 
| | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_sb_dummy_t3 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_sb_dummy_t4 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_sb_dummy_t5 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| ged_sb_dummy_t6 | | | needs manual update (as of now) | queryset_electric_relaxation | | +| greq_1_excluded | | | needs manual update (as of now) | queryset_blank_space, queryset_old_money | | +| imr_mean | | | needs manual update (as of now) | queryset_blank_space, queryset_lavender_haze | | +| ln_bdist3 | | | needs manual update (as of now) | queryset_blank_space, queryset_lavender_haze | | +| ln_capdist | | | needs manual update (as of now) | queryset_blank_space, queryset_lavender_haze | | +| ln_gcp_mer | | | needs manual update (as of now) | queryset_blank_space | | +| ln_ged_sb | | | needs manual update (as of now) | queryset_blank_space, queryset_old_money, queryset_orange_pasta | | +| ln_ged_sb_dep | | | needs manual update (as of now) | queryset_blank_space, queryset_lavender_haze, queryset_old_money, queryset_wildest_dream | | +| ln_ns_best | | | needs manual update (as of now) | queryset_meow_meow | | +| ln_os_best | | | needs manual update (as of now) | queryset_meow_meow | | +| ln_pgd_ttime_mean | | | needs manual update (as of now) | queryset_old_money | | +| ln_pop_gpw_sum | | | needs manual update (as of now) | queryset_blank_space, queryset_lavender_haze, queryset_old_money, queryset_orange_pasta | | +| ln_sb_best | | | needs manual update (as of now) | queryset_meow_meow | | +| ln_ttime_mean | | | needs manual update (as of now) | queryset_blank_space, queryset_lavender_haze | | +| month | | | needs manual update (as of now) | queryset_meow_meow | | +| mountains_mean | | | needs manual update (as of now) | queryset_blank_space | | +| pasture_ih | | | needs manual update (as of now) | 
queryset_blank_space | | +| pgd_imr_mean | | | needs manual update (as of now) | queryset_old_money | | +| pgd_urban_ih | | | needs manual update (as of now) | queryset_old_money | | +| row | | | needs manual update (as of now) | queryset_meow_meow | | +| savanna_ih | | | needs manual update (as of now) | queryset_blank_space | | +| shrub_ih | | | needs manual update (as of now) | queryset_blank_space | | +| splag_1_1_sb_1 | | | needs manual update (as of now) | queryset_blank_space, queryset_lavender_haze, queryset_orange_pasta | | +| sptime_dist_k1_1_ged_sb | | | needs manual update (as of now) | queryset_lavender_haze | | +| sptime_dist_k1_2_ged_sb | | | needs manual update (as of now) | queryset_lavender_haze | | +| sptime_dist_k1_3_ged_sb | | | needs manual update (as of now) | queryset_lavender_haze | | +| treelag_1_ns | | | needs manual update (as of now) | queryset_yellow_pikachu | | +| treelag_1_os | | | needs manual update (as of now) | queryset_yellow_pikachu | | +| treelag_1_sb | | | needs manual update (as of now) | queryset_lavender_haze, queryset_yellow_pikachu | | +| treelag_2_ns | | | needs manual update (as of now) | queryset_yellow_pikachu | | +| treelag_2_os | | | needs manual update (as of now) | queryset_yellow_pikachu | | +| treelag_2_sb | | | needs manual update (as of now) | queryset_lavender_haze, queryset_yellow_pikachu | | +| urban_ih | | | needs manual update (as of now) | queryset_blank_space | | +| wdi_nv_agr_totl_kd | | | needs manual update (as of now) | queryset_old_money | | +| year_id | | | needs manual update (as of now) | queryset_meow_meow | | \ No newline at end of file diff --git a/documentation/catalogs/feature_catalogue.md b/documentation/catalogs/feature_catalogue.md deleted file mode 100644 index e69de29b..00000000 From 72b2300617b588c721f8c2969a17925a92142852 Mon Sep 17 00:00:00 2001 From: Polichinl Date: Fri, 1 Nov 2024 01:50:53 +0100 Subject: [PATCH 05/21] path stuff --- .../catalogs/generate_features_catalog.py | 51 
+++++++++++++++++-- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/documentation/catalogs/generate_features_catalog.py b/documentation/catalogs/generate_features_catalog.py index 72a3ca50..5c91d6c8 100644 --- a/documentation/catalogs/generate_features_catalog.py +++ b/documentation/catalogs/generate_features_catalog.py @@ -11,21 +11,57 @@ ) logger = logging.getLogger(__name__) + + # Path to the root and querysets PATH = Path(__file__).resolve() indices = [i for i, x in enumerate(PATH.parts) if x == "views_pipeline"] PATH_ROOT = Path(*PATH.parts[:indices[-1] + 1]) querysets_path = PATH_ROOT / 'common_querysets' -GITHUB_URL = 'https://github.com/prio-data/views_pipeline/blob/production/' -def extract_columns_from_querysets(querysets_path): + + +def get_path_common_querysets(): + """ + Retrieves the path to the 'common_querysets' directory within the 'views_pipeline' directory. + + This function identifies the 'views_pipeline' directory within the path of the current file, + constructs a new path up to and including this directory, and then appends the relative path + to the 'common_querysets' directory. If the 'views_pipeline' directory or the 'common_querysets' + directory is not found, it raises a ValueError. + + Returns: + Path: The path to the 'common_querysets' directory. + + Raises: + ValueError: If the 'views_pipeline' directory or the 'common_querysets' directory is not found in the provided path. 
+ """ + + PATH = Path(__file__) + + # Locate 'views_pipeline' in the current file's path parts + if 'views_pipeline' in PATH.parts: + PATH_ROOT = Path(*PATH.parts[:PATH.parts.index('views_pipeline') + 1]) + PATH_COMMON_QUERYSETS = PATH_ROOT / 'common_querysets' + + # Check if 'common_querysets' directory exists + if not PATH_COMMON_QUERYSETS.exists(): + raise ValueError("The 'common_querysets' directory was not found in the provided path.") + + else: + raise ValueError("The 'views_pipeline' directory was not found in the provided path.") + + return PATH_COMMON_QUERYSETS + + +def extract_columns_from_querysets(PATH_COMMON_QUERYSETS): """ Parses each queryset file in the common_querysets folder to extract columns, querysets, and LOA. """ columns_info = [] - for file_path in querysets_path.glob("*.py"): + for file_path in PATH_COMMON_QUERYSETS.glob("*.py"): with open(file_path, 'r') as file: content = file.read() queryset_name = file_path.stem @@ -76,12 +112,17 @@ def generate_markdown_table(columns_info): return markdown_table if __name__ == "__main__": + + GITHUB_URL = 'https://github.com/prio-data/views_pipeline/blob/production/' + + PATH_COMMON_QUERYSETS = get_path_common_querysets() + # Extract feature information from querysets - columns_info = extract_columns_from_querysets(querysets_path) + columns_info = extract_columns_from_querysets(PATH_COMMON_QUERYSETS) # Generate the markdown table for the feature catalog feature_catalog = generate_markdown_table(columns_info) # Save the markdown table - with open('documentation/catalogs/feature_catalog.md', 'w') as f: + with open('feature_catalog.md', 'w') as f: # saved locally right next to this script f.write(feature_catalog) From 317ffb005ee93401533211d250cad617a5c0f943 Mon Sep 17 00:00:00 2001 From: Polichinl Date: Fri, 1 Nov 2024 01:55:34 +0100 Subject: [PATCH 06/21] removed old code --- documentation/catalogs/generate_features_catalog.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git 
a/documentation/catalogs/generate_features_catalog.py b/documentation/catalogs/generate_features_catalog.py index 5c91d6c8..f013b50f 100644 --- a/documentation/catalogs/generate_features_catalog.py +++ b/documentation/catalogs/generate_features_catalog.py @@ -11,17 +11,6 @@ ) logger = logging.getLogger(__name__) - - -# Path to the root and querysets -PATH = Path(__file__).resolve() -indices = [i for i, x in enumerate(PATH.parts) if x == "views_pipeline"] -PATH_ROOT = Path(*PATH.parts[:indices[-1] + 1]) - -querysets_path = PATH_ROOT / 'common_querysets' - - - def get_path_common_querysets(): """ Retrieves the path to the 'common_querysets' directory within the 'views_pipeline' directory. From bab63b06b6f78d8f5e15d85f58fcd3f858d32f6a Mon Sep 17 00:00:00 2001 From: jimdale Date: Fri, 1 Nov 2024 13:48:33 +0100 Subject: [PATCH 07/21] proposed adr for ensemble reconciliation --- .../ADRs/027_ensmeble_reconciliation.md | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 documentation/ADRs/027_ensmeble_reconciliation.md diff --git a/documentation/ADRs/027_ensmeble_reconciliation.md b/documentation/ADRs/027_ensmeble_reconciliation.md new file mode 100644 index 00000000..d4874c52 --- /dev/null +++ b/documentation/ADRs/027_ensmeble_reconciliation.md @@ -0,0 +1,47 @@ +Ensemble reconciliation + +| ADR Info | Details | +|---------------------|-------------------------| +| Subject | Ensemble reconciliation | +| ADR Number | 027 | +| Status | proposed | +| Author | Jim | +| Date | 01/11/2024 | + +## Context +The notebook-based views3/fatalities002 pipeline generates a cm and a pgm ensemble. It was found that the pgm ensemble suffered from what might be termed normalisation issues, in that the peak and total numbers of fatalities forecast at pgm level are clearly too low. 
In particular, summing forecast fatalities over the pg cells belonging to a given country, and comparing to the fatalities forecast for the same country at cm level almost always gives the result that the summed pgm values are significantly - often an order of magnitude - lower. +As a quick fix, therefore, a reconciliation function was created which accepts a pgm forecast dataframe and a cm forecast dataframe, fetches via viewser a pgm->cm mapping, computes for every country for every month the sum over its constituent pg cells, and renormalises the pgm forecasts for those cells so that the sum matches the cm-level forecast. A check is performed which ensures that the set of months in the two input dfs is the same. +The reconciliation will be applied to every pgm-level constituent model from which the pgm ensemble is built. + +## Decision +This reconciliation is to be implemented in the pipeline as a temporary fix in lieu of improvements to the pgm models. The reconciliation function itself needs to be globally available, so should live in common utils. +For each ensemble, a new item of metadata will be created, 'reconcile_with', whose value will either be None, or the name of another ensemble. No checks need be done on whether a valid choice has been made, since the function already checks to see that the two ensembles it is presented with have correctly-formatted indexes, and the identical month-sets. This change needs to be present in the ensemble-creation meta-tool, with the default value of None. +In an ensemble's generate_forecast.py, a code fragment needs to be added where if reconcile_with is not None, the ensemble named by reconcile_with is fetched from storage and presented to the reconciliation function along with each pgm constituent model in turn. +Warnings are to be issued and logged if negative-valued forecasts are encountered (before setting them to zero) and if large normalisations are necessary. 
+ + +### Overview +This is to be viewed as a temporary fix intended to align forecasts from the new pipeline with those of the old. Warnings are issued to inform the user if large normalisations are being performed, which indicates poorly-performing pgm-level models. +This feature is very simple to disable via the ensemble metadata dict. + +## Consequences + +**Positive Effects:** +- Allows replication of a necessary but frowned-upon feature of the old pipeline +- Keeps the user informed about the relative performance of the pgm and cm models. Serious inconsistency between the two sets of models is a useful indicator of poor (probably pgm-level) model performance. + +**Negative Effects:** +- This solution is little more than a hack which we arguably do not want in the codebase +- This does require changes to the ensemble template and all extant ensembles to ensure that their metadata contains the new key. + +## Rationale +This is the least intrusive means of implementing this feature, and allowing it to be easily turned on and off + +### Considerations +None + +## Additional Notes +None + +## Feedback and Suggestions +Feedback welcomed From 1d4a1a51f01ccf5273f20641734d942bff1a9f8f Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:57:25 +0100 Subject: [PATCH 08/21] added table placeholder --- documentation/catalogs/generate_features_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/catalogs/generate_features_catalog.py b/documentation/catalogs/generate_features_catalog.py index f013b50f..6609ac7f 100644 --- a/documentation/catalogs/generate_features_catalog.py +++ b/documentation/catalogs/generate_features_catalog.py @@ -88,8 +88,8 @@ def generate_markdown_table(columns_info): for _, row in columns_info.iterrows(): table_data.append([ row['column_name'], - '', # Placeholder for Human-readable name - '', # Placeholder for Data source + 'needs manual 
update', # Placeholder for Human-readable name + 'needs manual update', # Placeholder for Data source 'needs manual update (as of now)', # Placeholder for Last updated row['queryset'], '', #Placeholder for notes From f88373494b2a69c9f3ad0361bdf5a8de75864742 Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:10:19 +0100 Subject: [PATCH 09/21] spell fixes --- documentation/ADRs/024_development_and_production_sync.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/ADRs/024_development_and_production_sync.md b/documentation/ADRs/024_development_and_production_sync.md index 91cd92b1..4211f86f 100644 --- a/documentation/ADRs/024_development_and_production_sync.md +++ b/documentation/ADRs/024_development_and_production_sync.md @@ -1,6 +1,6 @@ ## Development and Production Sync -*Refinement of Model Configuration Files Structure* + | ADR Info | Details | |---------------------|-------------------| @@ -24,7 +24,7 @@ This ADR defines the branching and synchronization structure necessary to suppor ## Decision -To achieve the requirements described in the Context section, we will implement the following strategy for branching and synchronization strategy, optimized for the EWS pipeline. 
+To achieve the requirements described in the Context section, we will implement the following branching and synchronization strategy, optimized for the EWS pipeline: ### Overview From 385679c8b9c23385c29ba7719f4f118389f69162 Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:37:19 +0100 Subject: [PATCH 10/21] accepted ADR014 --- documentation/ADRs/014_input_drift_detection.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/ADRs/014_input_drift_detection.md b/documentation/ADRs/014_input_drift_detection.md index 3ff84d7e..e62dc09c 100644 --- a/documentation/ADRs/014_input_drift_detection.md +++ b/documentation/ADRs/014_input_drift_detection.md @@ -4,7 +4,7 @@ |-------------------|-----------------------| | Subject | Input drift detection | | ADR Number | 014 | -| Status | proposed | +| Status | Accepted | | Author | Jim Dale | | Date | 02/10/2024 | From d67c355b7ced02750c04916582d52a1f9975bb6e Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:38:29 +0100 Subject: [PATCH 11/21] number edit ADR008 --- documentation/ADRs/008_no_jupyter_notebooks_in_production.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/ADRs/008_no_jupyter_notebooks_in_production.md b/documentation/ADRs/008_no_jupyter_notebooks_in_production.md index b4bb7459..05a2eaa5 100644 --- a/documentation/ADRs/008_no_jupyter_notebooks_in_production.md +++ b/documentation/ADRs/008_no_jupyter_notebooks_in_production.md @@ -3,7 +3,7 @@ | ADR Info | Details | |---------------------|-----------------------------------------------| | Subject | No Use of Jupyter Notebooks in Production | -| ADR Number | 001 | +| ADR Number | 008 | | Status | Accepted | | Author | Jim, Mihai, Xiaolong, Simon, Sara | | Date | 30.07.2024 | From 32362ed1b6e20be1e0f86e4995dda1ff26ad432a Mon Sep 17 00:00:00 2001 From: 
marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:42:23 +0100 Subject: [PATCH 12/21] accepted ADR023 --- documentation/ADRs/023_production_development.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/ADRs/023_production_development.md b/documentation/ADRs/023_production_development.md index 6e0897b8..5a749ebf 100644 --- a/documentation/ADRs/023_production_development.md +++ b/documentation/ADRs/023_production_development.md @@ -5,7 +5,7 @@ |---------------------|-------------------| | Subject | Production and Development Branches | | ADR Number | 023 | -| Status | proposed | +| Status | Accepted | | Author | Borbála | | Date | 29.10.2024. | From a430a4dd45d5ccf3f3105d83e5462b7e59235eb5 Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:49:54 +0100 Subject: [PATCH 13/21] accepted ADR016 --- .../ADRs/016_input_drift_detection_logging.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/documentation/ADRs/016_input_drift_detection_logging.md b/documentation/ADRs/016_input_drift_detection_logging.md index b8256be7..b5cbf94d 100644 --- a/documentation/ADRs/016_input_drift_detection_logging.md +++ b/documentation/ADRs/016_input_drift_detection_logging.md @@ -5,7 +5,7 @@ |---------------------|-------------------------------| | Subject | Input Drift Detection Logging | | ADR Number | 016 | -| Status | proposed | +| Status | Accepted | | Author | Jim Dale | | Date | 28/10/2014 | @@ -14,19 +14,19 @@ An input drift detection system has been implemented as part of the viewser data For related ADRs on the generation of different log files and other general logging standards/routines, please see the ADRs below: [NOTE: new relevant ADRs links should be added] -[009_log_file_for_generated_data](/documentation/ADRs/009_log_file_for_generated_data.md) +- 
[009_log_file_for_generated_data](/documentation/ADRs/009_log_file_for_generated_data.md) -[017_log_files_for_offline_evaluation](/documentation/ADRs/017_log_files_for_offline_evaluation.md) +- [017_log_files_for_offline_evaluation](/documentation/ADRs/017_log_files_for_offline_evaluation.md) -[018_log_files_for_online_evaluation](/documentation/ADRs/018_log_files_for_online_evaluation.md) +- [018_log_files_for_online_evaluation](/documentation/ADRs/018_log_files_for_online_evaluation.md) -[019_log_files_for_model_training](/documentation/ADRs/019_log_files_for_model_training.md) +- [019_log_files_for_model_training](/documentation/ADRs/019_log_files_for_model_training.md) -[020_log_files_and_realtime_alerts](/documentation/ADRs/020_log_files_and_realtime_alerts.md) +- [020_log_files_and_realtime_alerts](/documentation/ADRs/020_log_files_and_realtime_alerts.md) -[025_log_level_standards](/documentation/ADRs/025_log_level_standards.md) +- [025_log_level_standards](/documentation/ADRs/025_log_level_standards.md) -[026_log_files_for_input_data](/documentation/ADRs/026_log_files_for_input_data.md) +- [026_log_files_for_input_data](/documentation/ADRs/026_log_files_for_input_data.md) ## Decision From 3fc6920c550082cc4c708b94efcb41d7f159fb2d Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:55:44 +0100 Subject: [PATCH 14/21] title edit ADR011 --- documentation/ADRs/011_Common_Querysets_for_Model_Pipelines.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/ADRs/011_Common_Querysets_for_Model_Pipelines.md b/documentation/ADRs/011_Common_Querysets_for_Model_Pipelines.md index 85c679b9..406ec70f 100644 --- a/documentation/ADRs/011_Common_Querysets_for_Model_Pipelines.md +++ b/documentation/ADRs/011_Common_Querysets_for_Model_Pipelines.md @@ -1,4 +1,4 @@ -# ADR 011 - Common Querysets for Model Pipelines +# Common Querysets for Model Pipelines | ADR Info | Details | From 
f552d11600fbe730c64dcb0290540273d8e3e0ab Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:59:47 +0100 Subject: [PATCH 15/21] uniforming titles --- documentation/ADRs/022_model_catalogs.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/documentation/ADRs/022_model_catalogs.md b/documentation/ADRs/022_model_catalogs.md index 376fe715..552c6e44 100644 --- a/documentation/ADRs/022_model_catalogs.md +++ b/documentation/ADRs/022_model_catalogs.md @@ -1,6 +1,5 @@ - -## Create Model Catalogs +# Create Model Catalogs | ADR Info | Details | From e468cf874ac22df36c57abba79d9397ffe1a5da2 Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:00:49 +0100 Subject: [PATCH 16/21] more title edits --- documentation/ADRs/023_production_development.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/documentation/ADRs/023_production_development.md b/documentation/ADRs/023_production_development.md index 5a749ebf..7c50f2d1 100644 --- a/documentation/ADRs/023_production_development.md +++ b/documentation/ADRs/023_production_development.md @@ -1,5 +1,4 @@ -## Production and Development Branches -*Using production and development branches instead of main* +# Production and Development Branches | ADR Info | Details | |---------------------|-------------------| From 02ad36487dfcb8452b37519c518b1e61d62cb034 Mon Sep 17 00:00:00 2001 From: marinamatic <147725905+marinamatic@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:05:01 +0100 Subject: [PATCH 17/21] accepted ADR025 --- documentation/ADRs/025_log _level_standards.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/documentation/ADRs/025_log _level_standards.md b/documentation/ADRs/025_log _level_standards.md index 922f2f2b..529a48e6 100644 --- a/documentation/ADRs/025_log _level_standards.md +++ b/documentation/ADRs/025_log 
_level_standards.md @@ -5,7 +5,7 @@ |---------------------|-------------------| | Subject | Logging Levels Configuration | | ADR Number | 025 | -| Status | Proposed | +| Status | Accepted | | Author | Simon | | Date | 30.10.2024 | @@ -17,19 +17,19 @@ The following log levels—DEBUG, INFO, WARNING, ERROR, and CRITICAL—are confi For related ADRs on the generation of different log files and other general logging standards/routines, please see the ADRs below: [NOTE: new relevant ADRs links should be added] -[009_log_file_for_generated_data](/documentation/ADRs/009_log_file_for_generated_data.md) +- [009_log_file_for_generated_data](/documentation/ADRs/009_log_file_for_generated_data.md) -[016_input_drift_detection_logging](/documentation/ADRs/016_input_drift_detection_logging.md) +- [016_input_drift_detection_logging](/documentation/ADRs/016_input_drift_detection_logging.md) -[017_log_files_for_offline_evaluation](/documentation/ADRs/017_log_files_for_offline_evaluation.md) +- [017_log_files_for_offline_evaluation](/documentation/ADRs/017_log_files_for_offline_evaluation.md) -[018_log_files_for_online_evaluation](/documentation/ADRs/018_log_files_for_online_evaluation.md) +- [018_log_files_for_online_evaluation](/documentation/ADRs/018_log_files_for_online_evaluation.md) -[019_log_files_for_model_training](/documentation/ADRs/019_log_files_for_model_training.md) +- [019_log_files_for_model_training](/documentation/ADRs/019_log_files_for_model_training.md) -[020_log_files_and_realtime_alerts](/documentation/ADRs/020_log_files_and_realtime_alerts.md) +- [020_log_files_and_realtime_alerts](/documentation/ADRs/020_log_files_and_realtime_alerts.md) -[026_log_files_for_input_data](/documentation/ADRs/026_log_files_for_input_data.md) +- [026_log_files_for_input_data](/documentation/ADRs/026_log_files_for_input_data.md) ## Decision From c824fd6acdf74808759700c9e6743e5120d10b06 Mon Sep 17 00:00:00 2001 From: jimdale Date: Fri, 1 Nov 2024 15:20:21 +0100 Subject: [PATCH 18/21] 
implemented comments from Simon --- documentation/ADRs/027_ensmeble_reconciliation.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/documentation/ADRs/027_ensmeble_reconciliation.md b/documentation/ADRs/027_ensmeble_reconciliation.md index d4874c52..6c277c20 100644 --- a/documentation/ADRs/027_ensmeble_reconciliation.md +++ b/documentation/ADRs/027_ensmeble_reconciliation.md @@ -12,6 +12,7 @@ Ensemble reconcilation The notebook-based views3/fatalities002 pipeline generates a cm and a pgm ensemble. It was found that the pgm ensemble suffered from what might be termed normalisation issues, in that the peak and total numbers of fatalities forecast at pgm level are clearly too low. In particular, summing forecast fatalities over the pg cells belonging to a given country, a dcomapring to the fatalities forecast for the same country at cm level almost always gives the result that the summed pgm values are significantly - often an order of magnitude - lower. As a quick fix, therefore, a reconciliation function was created which accepts a pgm forecast dataframe and a cm forecast dataframe, fetches via viewser a pgm->cm mapping, computes for every country for every month the sum over its constituent pg cells, and renormalises the pgm forecasts for those cells so that the sum matches the cm-level forecast. A check is performed which ensures that the set of months in the two input dfs is the same. The reconciliation will be applied to every pgm-level constituent model from which the pgm ensemble is built. +this is a known issue with legacy models that were adapted in various forms from the old pipeline. Although some hyper-parameters might help mitigate these issues, the challenges are inherent to the models' architecture and loss functions. ## Decision This reconciliation is to be implemented in the pipeline as a temporary fix in lieu of improvements to the pgm models. 
The reconciliation function itself needs to be globally available, so should live in common utils. @@ -21,8 +22,9 @@ Warnings are to be issued and logged if negative-valued forecasts are encountere ### Overview -This is to be viewed as a temporary fix intended to align forecasts from the new pipeline with those of the old. Warnings are issued to inform the user if large normalisations are being performed, which indicates poorly-performing pgm-level models. +Reconciliation is being deployed partly to allow the aligning of forecasts from the new pipeline with those of the old. Warnings are issued to inform the user if large normalisations are being performed, which indicates poorly-performing pgm-level models. This feature is very simple to disable via the ensemble metadata dict. +The reconciliation machinery will be maintained as a stable approach to maintain strict consistency between CM-level and aggregated PGM data. In future, it should NOT be viewed as a tool to systematically up-bias PGM models that underestimate conflict fatalities. This underestimation is fundamentally a modeling issue, not a reconciliation problem. Going forward, the explicit goal for all model development efforts is to design architectures, loss functions, optimization routines, sampling strategies, and other methods that address these issues. 
## Consequences From 09dd3ccd750216433ba8bb96b56334143882eb70 Mon Sep 17 00:00:00 2001 From: jimdale Date: Fri, 1 Nov 2024 15:27:49 +0100 Subject: [PATCH 19/21] implemented first and last comments from Simon --- documentation/ADRs/027_ensmeble_reconciliation.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/documentation/ADRs/027_ensmeble_reconciliation.md b/documentation/ADRs/027_ensmeble_reconciliation.md index 6c277c20..4fcd8c72 100644 --- a/documentation/ADRs/027_ensmeble_reconciliation.md +++ b/documentation/ADRs/027_ensmeble_reconciliation.md @@ -9,10 +9,12 @@ Ensemble reconcilation | Date | 01/11/2024 | ## Context -The notebook-based views3/fatalities002 pipeline generates a cm and a pgm ensemble. It was found that the pgm ensemble suffered from what might be termed normalisation issues, in that the peak and total numbers of fatalities forecast at pgm level are clearly too low. In particular, summing forecast fatalities over the pg cells belonging to a given country, a dcomapring to the fatalities forecast for the same country at cm level almost always gives the result that the summed pgm values are significantly - often an order of magnitude - lower. +The notebook-based views3/fatalities002 pipeline generates a cm and a pgm ensemble. It was found that the pgm ensemble suffered from what might be termed normalisation issues, in that the peak and total numbers of fatalities forecast at pgm level are clearly too low. 
In particular, summing forecast fatalities over the pg cells belonging to a given country, and comparing to the fatalities forecast for the same country at cm level almost always gives the result that the summed pgm values are significantly - often an order of magnitude - lower. As a quick fix, therefore, a reconciliation function was created which accepts a pgm forecast dataframe and a cm forecast dataframe, fetches via viewser a pgm->cm mapping, computes for every country for every month the sum over its constituent pg cells, and renormalises the pgm forecasts for those cells so that the sum matches the cm-level forecast. A check is performed which ensures that the set of months in the two input dfs is the same. +This is then equivalent to an up-biasing of all the pgm models, which plainly is not a satisfying solution. The reconciliation will be applied to every pgm-level constituent model from which the pgm ensemble is built. this is a known issue with legacy models that were adapted in various forms from the old pipeline. Although some hyper-parameters might help mitigate these issues, the challenges are inherent to the models' architecture and loss functions. +Going forward, the explicit goal for all model development efforts is to design architectures, loss functions, optimization routines, sampling strategies, and other methods that address these issues. ## Decision This reconciliation is to be implemented in the pipeline as a temporary fix in lieu of improvements to the pgm models. The reconciliation function itself needs to be globally available, so should live in common utils. @@ -24,7 +26,7 @@ Warnings are to be issued and logged if negative-valued forecasts are encountere ### Overview Reconciliation is being deployed partly to allow the aligning of forecasts from the new pipeline with those of the old. Warnings are issued to inform the user if large normalisations are being performed, which indicates poorly-performing pgm-level models. 
This feature is very simple to disable via the ensemble metadata dict. -The reconciliation machinery will be maintained as a stable approach to maintain strict consistency between CM-level and aggregated PGM data. In future, it should NOT be viewed as a tool to systematically up-bias PGM models that underestimate conflict fatalities. This underestimation is fundamentally a modeling issue, not a reconciliation problem. Going forward, the explicit goal for all model development efforts is to design architectures, loss functions, optimization routines, sampling strategies, and other methods that address these issues. +The reconciliation machinery will be maintained as a stable approach to maintain strict consistency between CM-level and aggregated PGM data. In future, it should NOT be viewed as a tool to systematically up-bias PGM models that underestimate conflict fatalities. This underestimation is fundamentally a modeling issue, not a reconciliation problem. Future work will be directed at finding genuine solutions to these issues, as opposed to sticking-plasters. ## Consequences From fe7466a5380cff99d26c26b0241e464e8c5398f1 Mon Sep 17 00:00:00 2001 From: Polichinl Date: Fri, 1 Nov 2024 15:30:00 +0100 Subject: [PATCH 20/21] Accepted --- documentation/ADRs/024_development_and_production_sync.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/ADRs/024_development_and_production_sync.md b/documentation/ADRs/024_development_and_production_sync.md index 4211f86f..ba487ad0 100644 --- a/documentation/ADRs/024_development_and_production_sync.md +++ b/documentation/ADRs/024_development_and_production_sync.md @@ -6,7 +6,7 @@ |---------------------|-------------------| | Subject | Production and Development Branch Synchronization | | ADR Number | 024 | -| Status | Proposed | +| Status | Accepted | | Author | Simon | | Date | 31.10.2024. 
| From e6dedbd0d7ff6c5ca642bd1f20666aba92522c84 Mon Sep 17 00:00:00 2001 From: Polichinl Date: Fri, 1 Nov 2024 15:37:06 +0100 Subject: [PATCH 21/21] Accepted (forgot to do this in the right branch) --- documentation/ADRs/027_ensmeble_reconciliation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/ADRs/027_ensmeble_reconciliation.md b/documentation/ADRs/027_ensmeble_reconciliation.md index 4fcd8c72..336526d8 100644 --- a/documentation/ADRs/027_ensmeble_reconciliation.md +++ b/documentation/ADRs/027_ensmeble_reconciliation.md @@ -4,7 +4,7 @@ Ensemble reconcilation |---------------------|-------------------------| | Subject | Ensemble reconciliation | | ADR Number | 027 | -| Status | proposed | +| Status | Accepted | | Author | Jim | | Date | 01/11/2024 |