Merge pull request #114 from prio-data/create_pgm_catalog_01

Create pgm catalog 01
prio-data · Oct 29, 2024 · b9fb7e4 · b9fb7e4
2 parents 1eb221b + 42efa0d
commit b9fb7e4
Show file tree

Hide file tree

Showing 9 changed files with 275 additions and 47 deletions.
diff --git a/.github/workflows/check_if_new_model_added.yml b/.github/workflows/check_if_new_model_added.yml
@@ -0,0 +1,44 @@
+name: Check for new model directories in views_pipeline/models
+
+# Regenerates the cm/pgm model catalogs and pushes them back to the branch
+# whenever a model's meta/deployment config or a shared queryset changes.
+on:
+  push:
+    branches:
+      - create_pgm_catalog_01 # for testing on this branch
+      - production
+      - development
+    paths:
+      - models/*/configs/config_deployment.py
+      - models/*/configs/config_meta.py
+      # "**" so that files anywhere under the directory match the filter.
+      - common_querysets/**
+  workflow_dispatch: # for triggering manually
+
+jobs:
+  check-new-folder:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Generate catalog if models directory has changed
+        run: |
+          # Test the command directly: the default shell runs with `-e`, so a
+          # separate `$?` check placed after a failing command is never reached.
+          if ! python documentation/catalogs/generate_model_catalog.py; then
+            echo "Generating catalogs failed."
+            exit 1
+          fi
+          echo "Model catalog is updated."
+          git status
+
+      - name: Configure Git
+        run: |
+          git config --global user.name "GitHub Actions"
+          git config --global user.email "actions@github.com"
+
+      - name: Commit and Push Changes
+        run: |
+          git add documentation/catalogs/cm_model_catalog.md documentation/catalogs/pgm_model_catalog.md
+          # A run that produces identical catalogs must not fail the job.
+          git commit -m "Automated changes by GitHub Actions" || echo "Nothing to commit"
+          git push https://${{ secrets.VIEWS_PIPELINE_ACCESS_TOKEN }}:x-oauth-basic@github.com/prio-data/views_pipeline.git
+      
diff --git a/.github/workflows/prevent_merge_when_branch_behind.yml b/.github/workflows/prevent_merge_when_branch_behind.yml
@@ -1,10 +1,11 @@
-name: Require Branch to Be Up-to-Date with Main
+name: Require Branch to Be Up-to-Date with Production
 
 # Trigger this workflow on pull request events targeting a specific branch.
 on:
   pull_request:
     branches:
-      - main
+      - production
+      - development
       - test-protect-main-merge # for testing
   workflow_dispatch: # enables manual triggering
 
@@ -17,18 +18,18 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.ref }}
 
-      - name: Fetch main branch
+      - name: Fetch production branch
         run: |
           git fetch --unshallow
-          git fetch origin main
+          git fetch origin production
 
-      - name: Compare branch with main
+      - name: Compare branch with production
         run: |
-          if git merge-base --is-ancestor origin/main HEAD; then
-            echo "::notice ::Branch is up-to-date with main."
+          if git merge-base --is-ancestor origin/production HEAD; then
+            echo "::notice ::Branch is up-to-date with production."
           else
-            echo "::error ::Merge Blocked: Your branch is behind the latest commits on main. Please update your branch with the latest changes from main before attempting to merge."
-            echo "Merge base: $(git merge-base HEAD origin/main)"
+            echo "::error ::Merge Blocked: Your branch is behind the latest commits on production. Please update your branch with the latest changes from production before attempting to merge."
+            echo "Merge base: $(git merge-base HEAD origin/production)"
             exit 1
           fi
 

diff --git a/common_utils/set_path.py b/common_utils/set_path.py
@@ -5,7 +5,9 @@
 from ensemble_path import EnsemblePath
 
 # Configure logging - don't know if this is necessary here
-# logging.basicConfig(level=logging.WARNING)
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 

diff --git a/documentation/catalogs/cm_model_catalog.md b/documentation/catalogs/cm_model_catalog.md
@@ -1,36 +1,3 @@
 | Model Name | Algorithm | Target | Input Features | Non-default Hyperparameters | Forecasting Type | Implementation Status | Implementation Date | Author |
 | ---------- | --------- | ------ | -------------- | --------------------------- | ---------------- | --------------------- | ------------------- | ------ |
-| fatalities002_baseline_rf | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_baseline](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L24) | n_estimators=300, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_conflicthistory_rf | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_conflict_history](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L3087) | n_estimators=250, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_conflicthistory_gbm | GradientBoostingRegressor | ln_ged_sb_dep | - [fatalities002_conflict_history](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L3087) | n_estimators=200 | Direct multi-step | no | NA | NA |
-| fatalities002_conflicthistory_hurdle_lgb | HurdleRegression | ln_ged_sb_dep | - [fatalities002_conflict_history](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L3087) | clf_name="LGBMClassifier", reg_name="LGBMRegressor" | Direct multi-step | no | NA | NA |
-| fatalities002_conflicthistory_long_xgb | XGBRegressor | ln_ged_sb_dep | - [fatalities002_conflict_history_long](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L3101) | n_estimators=100, learning_rate=0.05, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_vdem_hurdle_xgb | HurdleRegression | ln_ged_sb_dep | - [fatalities002_vdem_short](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L1213) | clf_name="XGBClassifier", reg_name="XGBRegressor" | Direct multi-step | no | NA | NA |
-| fatalities002_wdi_rf | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_wdi_short](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L1635) | n_estimators=300, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_topics_rf | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_topics](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L82) | n_estimators=250, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_topics_xgb | XGBRegressor | ln_ged_sb_dep | - [fatalities002_topics](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L82) | n_estimators=80, learning_rate=0.05, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_topics_hurdle_lgb | HurdleRegression | ln_ged_sb_dep | - [fatalities002_topics](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L82) | clf_name="LGBMClassifier", reg_name="LGBMRegressor" | Direct multi-step | no | NA | NA |
-| fatalities002_joint_broad_rf | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_joint_broad](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L2098) | n_estimators=250, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_joint_broad_hurdle_rf | HurdleRegression | ln_ged_sb_dep | - [fatalities002_joint_broad](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L2098) | clf_name="RFClassifier", reg_name="RFRegressor" | Direct multi-step | no | NA | NA |
-| fatalities002_joint_narrow_xgb | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_joint_narrow](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L1861) | n_estimators=250, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_joint_narrow_hurdle_xgb | HurdleRegression | ln_ged_sb_dep | - [fatalities002_joint_narrow](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L1861) | clf_name="XGBClassifier", reg_name="XGBRegressor" | Direct multi-step | no | NA | NA |
-| fatalities002_joint_narrow_hurdle_lgb | HurdleRegression | ln_ged_sb_dep | - [fatalities002_joint_narrow](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L1861) | clf_name="LGBMClassifier", reg_name="LGBMRegressor" | Direct multi-step | no | NA | NA |
-| fatalities002_all_pca3_xgb | XGBRegressor | ln_ged_sb_dep | - [fatalities002_all_features](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L3199) | n_estimators=100, learning_rate=0.05, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_aquastat_rf | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_aquastat](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L647) | n_estimators=300, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_faostat_rf | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_faostat](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L2705) | n_estimators=300, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_faoprices_rf | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_faoprices](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L2955) | n_estimators=300, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_imfweo_rf | XGBRFRegressor | ln_ged_sb_dep | - [fatalities002_imfweo](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L3021) | n_estimators=300, n_jobs=nj | Direct multi-step | no | NA | NA |
-| fatalities002_Markov_glm | rf | ln_ged_sb_dep | - [fatalities002_joint_narrow](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L1861) | None | Direct multi-step | no | NA | NA |
-| fatalities002_Markov_rf | glm | ln_ged_sb_dep | - [fatalities002_joint_narrow](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/cm_querysets.py#L1861) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_baseline_lgbm | lgbm_regressor | ln_ged_sb_dep | - [fatalities002_pgm_baseline](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L34) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_conflictlong_lgbm | lgbm_regressor | ln_ged_sb_dep | - [fatalities002_pgm_conflictlong](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L110) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_conflictlong_hurdle_lgbm | hur_lgbm_regressor | ln_ged_sb_dep | - [fatalities002_pgm_conflictlong](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L110) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_escwa_drought_hurdle_lgbm | hur_lgbm_regressor | ln_ged_sb_dep | - [fatalities002_pgm_escwa_drought](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L283) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_escwa_drought_lgbm | lgbm_regressor | ln_ged_sb_dep | - [fatalities002_pgm_escwa_drought](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L283) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_natsoc_hurdle_lgbm | hur_lgbm_regressor | ln_ged_sb_dep | - [fatalities002_pgm_natsoc](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L451) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_natsoc_lgbm | lgbm_regressor | ln_ged_sb_dep | - [fatalities002_pgm_natsoc](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L451) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_broad_hurdle_lgbm | hur_lgbm_regressor | ln_ged_sb_dep | - [fatalities002_pgm_broad](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L614) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_broad_lgbm | lgbm_regressor | ln_ged_sb_dep | - [fatalities002_pgm_broad](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L614) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_conflict_history_xgb | xgb_regressor | ln_ged_sb_dep | - [fatalities002_pgm_conflict_history](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L770) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_conflict_treelag_hurdle | hur_regressor | ln_ged_sb_dep | - [fatalities002_pgm_conflict_treelag](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L1018) | None | Direct multi-step | no | NA | NA |
-| fatalities002_pgm_conflict_sptime_dist_hurdle | hur_regressor | ln_ged_sb_dep | - [fatalities002_pgm_conflict_sptime_dist](https://github.com/prio-data/viewsforecasting/blob/github_workflows/Tools/pgm_querysets.py#L1061) | None | Direct multi-step | no | NA | NA |
+| electric_relaxation | RandomForestClassifier | ged_sb_dep | - [escwa001_cflong](https://github.com/prio-data/views_pipeline/blob/main/common_querysets/queryset_electric_relaxation.py) | - [hyperparameters electric_relaxation](https://github.com/prio-data/views_pipeline/blob/main/models/electric_relaxation/configs/config_hyperparameters.py) | None | shadow | NA | Sara |
diff --git a/documentation/catalogs/generate_links_to_querysets.py b/documentation/catalogs/generate_links_to_querysets.py
@@ -10,7 +10,7 @@
 
 # The GitHub repo link 
 # TODO: github_workflows should be changed to main when merged
-GITHUB_URL = 'https://github.com/prio-data/viewsforecasting/blob/github_workflows/' 
+GITHUB_URL = 'https://github.com/prio-data/viewsforecasting/blob/main/' 
 
 
 
@@ -140,7 +140,7 @@ def generate_markdown_table(models):
 models_dict = extract_models(model_def_path)
 markdown_table = generate_markdown_table(models_dict)
 
-with open('documentation/catalogs/cm_model_catalog.md', 'w') as f:
+with open('documentation/catalogs/model_catalog_old_pipeline.md', 'w') as f:
     f.write(markdown_table)
 
 

diff --git a/documentation/catalogs/generate_model_catalog.py b/documentation/catalogs/generate_model_catalog.py
@@ -0,0 +1,169 @@
+import os
+import logging
+# The root logger is configured at ERROR, so the logging.info(...) calls made
+# while extracting models are not printed unless this level is lowered.
+logging.basicConfig(
+    level=logging.ERROR, format="%(asctime)s %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+import sys
+from pathlib import Path
+
+# Locate the repository root: everything up to and including the LAST path
+# component named "views_pipeline" in this file's resolved absolute path.
+# Raises IndexError if no component of the path has that name.
+PATH = Path(__file__).resolve()
+indices = [i for i, x in enumerate(PATH.parts) if x == "views_pipeline"]
+PATH_ROOT = Path(*PATH.parts[:indices[-1] + 1])
+
+# Put the repository root and its common_utils directory first on sys.path so
+# the project import below can be resolved (model_path presumably lives in
+# common_utils - confirm).
+sys.path.insert(0, str(PATH_ROOT))
+sys.path.insert(0, str(PATH_ROOT/"common_utils"))
+
+from model_path import ModelPath
+
+# Base URL that create_link() prefixes to repository-relative file paths.
+GITHUB_URL = 'https://github.com/prio-data/views_pipeline/blob/main/' 
+
+
+
+
+
+def extract_models(model_class):
+    """
+    Build one catalog record for a model by merging its config files.
+
+    The dictionaries returned by get_meta_config() (config_meta.py) and
+    get_deployment_config() (config_deployment.py) are merged, in that order,
+    and links to the queryset and to config_hyperparameters.py are added.
+    A config file that does not exist is skipped.
+
+    Parameters:
+    model_class: object describing the model (a ModelPath instance in this
+        script). The attributes `configs`, `queryset_path`, `model_name` and
+        the method `get_scripts()` are read.
+
+    Returns:
+    model_dict: a dictionary holding every key the config functions return.
+        Keys used later when the catalog table is rendered:
+        -name, algorithm, depvar, creator: taken as-is from the meta config
+        -level: taken as-is from the configs; the script's entry point
+         compares it against 'pgm' and 'cm'
+        -deployment_status: from config_deployment.py
+        -queryset: markdown link to the queryset file, or the string 'None'
+         when the meta config has no 'queryset' key (only set when
+         config_meta.py exists)
+        -hyperparameters: markdown link labelled 'hyperparameters <model_name>'
+         (only set when config_hyperparameters.py exists)
+
+    Raises:
+    KeyError: if a config file exists but does not define its expected
+        get_*_config function.
+    """
+
+    def load_config(path, getter_name):
+        # NOTE(review): this executes the config file as Python code, so it
+        # must only ever be pointed at trusted files from the repository.
+        with open(path, 'r') as file:
+            code = file.read()
+        # One dict acts as both globals and locals. With separate dicts, the
+        # functions defined in the file could not see the file's own
+        # top-level imports or constants and would fail with NameError.
+        namespace = {}
+        exec(code, namespace)
+        return namespace[getter_name]()
+
+    model_dict = {}
+    config_meta = os.path.join(model_class.configs, 'config_meta.py')
+    config_deployment = os.path.join(model_class.configs, 'config_deployment.py')
+    config_hyperparameters = os.path.join(model_class.configs, 'config_hyperparameters.py')
+
+    if os.path.exists(config_meta):
+        logging.info(f"Found meta config: {config_meta}")
+        model_dict.update(load_config(config_meta, 'get_meta_config'))
+        # Replace the raw queryset name by a clickable link to its file.
+        if 'queryset' in model_dict:
+            model_dict['queryset'] = create_link(model_dict['queryset'], model_class.queryset_path)
+        else:
+            model_dict['queryset'] = 'None'
+
+    if os.path.exists(config_deployment):
+        logging.info(f"Found deployment config: {config_deployment}")
+        model_dict.update(load_config(config_deployment, 'get_deployment_config'))
+
+    if os.path.exists(config_hyperparameters):
+        logging.info(f"Found hyperparameters config: {config_hyperparameters}")
+        # The hyperparameters are not inlined in the table; only a link to the
+        # model-specific config file is stored.
+        model_dict['hyperparameters'] = create_link(
+            f"hyperparameters {model_class.model_name}",
+            Path(model_class.get_scripts()['config_hyperparameters.py']),
+        )
+
+    return model_dict
+
+
+
+def create_link(marker, filepath: Path):
+    """
+    Generate a markdown list item linking to a file of the repository.
+
+    The URL is GITHUB_URL followed by `filepath` expressed relative to
+    ModelPath.get_root() (assumed to be the repository root - confirm).
+
+    Parameters:
+    marker: text displayed as the clickable part of the markdown link
+    filepath: absolute path of the file, located under ModelPath.get_root()
+
+    Returns:
+    str: A markdown link in the format `- [marker](GITHUB_URL + relative_filepath)`
+
+    Raises:
+    ValueError: if `filepath` is not located under ModelPath.get_root().
+    """
+    relative_path = filepath.relative_to(ModelPath.get_root())
+    # as_posix() keeps forward slashes in the URL even when the script runs on
+    # Windows, where str(Path) would produce backslashes.
+    return f'- [{marker}]({GITHUB_URL}{relative_path.as_posix()})'
+
+
+
+def generate_markdown_table(models_list):
+    """
+    Render the model dictionaries as a markdown table.
+
+    Parameters:
+    models_list: list of model dictionaries containing all the necessary
+        information. Missing keys produce empty cells.
+
+    Returns:
+    markdown_table: a header row, a separator row and one row per model, every
+        row terminated by a newline. The 'Forecasting Type' column is always
+        'None' and the 'Implementation Date' column is always 'NA'.
+    """
+
+    headers = ['Model Name', 'Algorithm', 'Target', 'Input Features', 'Non-default Hyperparameters', 'Forecasting Type', 'Implementation Status', 'Implementation Date', 'Author']
+
+    def as_cell(value):
+        # Config values are not guaranteed to be strings. Normalise them so
+        # that joining the row can never raise TypeError.
+        if value is None:
+            return ''
+        if isinstance(value, str):
+            return value
+        if isinstance(value, (list, tuple)):
+            return ', '.join(str(item) for item in value)
+        return str(value)
+
+    lines = [
+        '| ' + ' '.join(f"{header} |" for header in headers),
+        '| ' + ' '.join('-' * len(header) + ' |' for header in headers),
+    ]
+
+    for model in models_list:
+        # 'depvar' wins when present and non-empty; 'target(S)' is the
+        # fallback. Either may be a single name or a list of names; a plain
+        # string is kept whole instead of being split into characters.
+        target = model.get('depvar') or model.get('target(S)', '')
+
+        row = [
+            as_cell(model.get('name', '')),
+            # Keep only the class name of e.g. "Estimator(param=1)".
+            str(model.get('algorithm', '')).split('(')[0],
+            as_cell(target),
+            as_cell(model.get('queryset', '')),
+            as_cell(model.get('hyperparameters', '')),
+            'None',
+            as_cell(model.get('deployment_status', '')),
+            'NA',
+            as_cell(model.get('creator', '')),
+        ]
+        lines.append('| ' + ' | '.join(row) + ' |')
+
+    return '\n'.join(lines) + '\n'
+
+
+
+
+if __name__ == "__main__":
+    # Walk every model directory, build its catalog record and write one
+    # markdown catalog per level of analysis (pgm and cm).
+    models_list_cm = []
+    models_list_pgm = []
+
+    models_dir = PATH_ROOT / 'models'
+
+    # os.listdir() returns entries in arbitrary order; sorting keeps the rows
+    # of the generated catalogs stable between runs, so an unchanged set of
+    # models does not produce a spurious diff.
+    for model_name in sorted(os.listdir(models_dir)):
+        # Skip stray files sitting next to the model directories.
+        if not os.path.isdir(models_dir / model_name):
+            continue
+
+        model = extract_models(ModelPath(model_name, validate=True))
+
+        # Models without a recognised 'level' are left out of both catalogs.
+        level = model.get('level')
+        if level == 'pgm':
+            models_list_pgm.append(model)
+        elif level == 'cm':
+            models_list_cm.append(model)
+
+    # Anchor the output at the repository root so the script writes to the
+    # same files regardless of the directory it is started from.
+    catalogs_dir = PATH_ROOT / 'documentation' / 'catalogs'
+
+    with open(catalogs_dir / 'pgm_model_catalog.md', 'w') as f:
+        f.write(generate_markdown_table(models_list_pgm))
+
+    with open(catalogs_dir / 'cm_model_catalog.md', 'w') as f:
+        f.write(generate_markdown_table(models_list_cm))
+
+