From 0a959cb24df366c3bec4a397dda1cc7f0c39d510 Mon Sep 17 00:00:00 2001
From: marinamatic <147725905+marinamatic@users.noreply.github.com>
Date: Tue, 29 Oct 2024 10:25:26 +0100
Subject: [PATCH 1/6] first commit

---
 documentation/catalogs/feature_catalogue.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 documentation/catalogs/feature_catalogue.md

diff --git a/documentation/catalogs/feature_catalogue.md b/documentation/catalogs/feature_catalogue.md
new file mode 100644
index 00000000..e69de29b

From 8743a4203045de226425c5d47906b13de3d302fe Mon Sep 17 00:00:00 2001
From: marinamatic <147725905+marinamatic@users.noreply.github.com>
Date: Wed, 30 Oct 2024 17:50:36 +0100
Subject: [PATCH 2/6] script to generate catalogue of features with very
 amateur questionable code

---
 .../catalogs/generate_features_catalog.py     | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 documentation/catalogs/generate_features_catalog.py

diff --git a/documentation/catalogs/generate_features_catalog.py b/documentation/catalogs/generate_features_catalog.py
new file mode 100644
index 00000000..72a3ca50
--- /dev/null
+++ b/documentation/catalogs/generate_features_catalog.py
@@ -0,0 +1,87 @@
+import os
+import re
+import logging
+import pandas as pd
+import sys
+from pathlib import Path
+from tabulate import tabulate
+
+logging.basicConfig(
+    level=logging.ERROR, format="%(asctime)s %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# Path to the root and querysets
+PATH = Path(__file__).resolve()
+indices = [i for i, x in enumerate(PATH.parts) if x == "views_pipeline"]
+PATH_ROOT = Path(*PATH.parts[:indices[-1] + 1])
+
+querysets_path = PATH_ROOT / 'common_querysets'
+GITHUB_URL = 'https://github.com/prio-data/views_pipeline/blob/production/' 
+
+def extract_columns_from_querysets(querysets_path):
+    """
+    Parses each queryset file in the common_querysets folder to extract columns, querysets, and LOA.
+    """
+    columns_info = []
+    
+    for file_path in querysets_path.glob("*.py"):
+        with open(file_path, 'r') as file:
+            content = file.read()
+            queryset_name = file_path.stem
+
+            # Each match is the raw argument text of one single-line Column(...) call
+            for match in re.findall(r'Column\((.*?)\)', content):
+                name_match = re.search(r'"(.*?)"', match)
+                if name_match is None:
+                    continue  # no double-quoted name to catalog, e.g. Column(name_var)
+                loa_match = re.search(r'from_loa="(.*?)"', match)
+                columns_info.append({
+                    'column_name': name_match.group(1),
+                    'queryset': queryset_name,
+                    'loa': loa_match.group(1) if loa_match else None,
+                })
+
+    # Explicit columns keep the groupby keys present even when nothing was parsed
+    df = pd.DataFrame(columns_info, columns=['column_name', 'queryset', 'loa']).drop_duplicates()
+
+    # One row per (column_name, loa); dropna=False keeps columns that have no from_loa
+    df = df.groupby(['column_name', 'loa'], as_index=False, dropna=False).agg({
+        'queryset': lambda x: ', '.join(sorted(set(x)))  # unique querysets, sorted
+    })
+
+    return df
+
+def generate_markdown_table(columns_info):
+    """
+    Render columns_info (rows with 'column_name' and 'queryset') as a pipe-format markdown table; other cells are placeholders.
+    """
+    headers = ['Name in viewser', 'Human-readable name', 'Data source (with link)', 
+               'Last updated (minutes:hours:day:month:year)', 'Associated querysets/models', 'Notes']
+
+    table_data = []
+    for _, row in columns_info.iterrows():
+        table_data.append([
+            row['column_name'],
+            '',  # Placeholder for Human-readable name
+            '',  # Placeholder for Data source
+            'needs manual update (as of now)',  # Placeholder for Last updated
+            row['queryset'],
+            '',  #Placeholder for notes
+        ])
+    
+    # colalign has a single entry, so only the first column is centered
+    markdown_table = tabulate(table_data, headers=headers, tablefmt="pipe", colalign=("center",))
+
+    return markdown_table
+
+if __name__ == "__main__":
+    # Extract feature information from querysets
+    columns_info = extract_columns_from_querysets(querysets_path)
+    
+    # Generate the markdown table for the feature catalog
+    feature_catalog = generate_markdown_table(columns_info)
+    
+    # Save the markdown table
+    with open('documentation/catalogs/feature_catalog.md', 'w') as f:
+        f.write(feature_catalog)

From 088a7f92d1707883ab9a0cffe6cf7fcb3cbee8db Mon Sep 17 00:00:00 2001
From: Polichinl <simmaa@prio.org>
Date: Fri, 1 Nov 2024 01:50:46 +0100
Subject: [PATCH 3/6] naming convention

---
 documentation/catalogs/feature_catalog.md   | 90 +++++++++++++++++++++
 documentation/catalogs/feature_catalogue.md |  0
 2 files changed, 90 insertions(+)
 create mode 100644 documentation/catalogs/feature_catalog.md
 delete mode 100644 documentation/catalogs/feature_catalogue.md

diff --git a/documentation/catalogs/feature_catalog.md b/documentation/catalogs/feature_catalog.md
new file mode 100644
index 00000000..0b69805c
--- /dev/null
+++ b/documentation/catalogs/feature_catalog.md
@@ -0,0 +1,90 @@
+|     Name in viewser     | Human-readable name   | Data source (with link)   | Last updated (minutes:hours:day:month:year)   | Associated querysets/models                                                              | Notes   |
+|:-----------------------:|:----------------------|:--------------------------|:----------------------------------------------|:-----------------------------------------------------------------------------------------|:--------|
+|         agri_ih         |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|        barren_ih        |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|          c_id           |                       |                           | needs manual update (as of now)               | queryset_meow_meow                                                                       |         |
+|           col           |                       |                           | needs manual update (as of now)               | queryset_meow_meow                                                                       |         |
+|        cropprop         |                       |                           | needs manual update (as of now)               | queryset_old_money                                                                       |         |
+|     decay_ged_ns_1      |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|    decay_ged_ns_100     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_ns_25     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_ns_5      |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_ns_5      |                       |                           | needs manual update (as of now)               | queryset_lavender_haze                                                                   |         |
+|     decay_ged_os_1      |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_os_1      |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_orange_pasta                                              |         |
+|    decay_ged_os_100     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_os_25     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_os_5      |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_os_5      |                       |                           | needs manual update (as of now)               | queryset_lavender_haze                                                                   |         |
+|     decay_ged_sb_1      |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_sb_1      |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_orange_pasta                                              |         |
+|    decay_ged_sb_100     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_sb_25     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_sb_25     |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_orange_pasta                                              |         |
+|     decay_ged_sb_5      |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     decay_ged_sb_5      |                       |                           | needs manual update (as of now)               | queryset_lavender_haze                                                                   |         |
+|    decay_ged_sb_500     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|      dist_diamsec       |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_lavender_haze                                             |         |
+|     dist_petroleum      |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|        forest_ih        |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|        ged_gte_1        |                       |                           | needs manual update (as of now)               | queryset_wildest_dream, queryset_yellow_pikachu                                          |         |
+|         ged_ns          |                       |                           | needs manual update (as of now)               | queryset_lavender_haze                                                                   |         |
+|     ged_ns_dummy_t0     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_ns_dummy_t1     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_ns_dummy_t2     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_ns_dummy_t3     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_ns_dummy_t4     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_ns_dummy_t5     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_ns_dummy_t6     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|         ged_os          |                       |                           | needs manual update (as of now)               | queryset_lavender_haze                                                                   |         |
+|     ged_os_dummy_t0     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_os_dummy_t1     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_os_dummy_t2     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_os_dummy_t3     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_os_dummy_t4     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_os_dummy_t5     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_os_dummy_t6     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|         ged_sb          |                       |                           | needs manual update (as of now)               | queryset_lavender_haze                                                                   |         |
+|       ged_sb_dep        |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|       ged_sb_dep        |                       |                           | needs manual update (as of now)               | queryset_orange_pasta, queryset_yellow_pikachu                                           |         |
+|     ged_sb_dummy_t0     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_sb_dummy_t1     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_sb_dummy_t2     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_sb_dummy_t3     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_sb_dummy_t4     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_sb_dummy_t5     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     ged_sb_dummy_t6     |                       |                           | needs manual update (as of now)               | queryset_electric_relaxation                                                             |         |
+|     greq_1_excluded     |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_old_money                                                 |         |
+|        imr_mean         |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_lavender_haze                                             |         |
+|        ln_bdist3        |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_lavender_haze                                             |         |
+|       ln_capdist        |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_lavender_haze                                             |         |
+|       ln_gcp_mer        |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|        ln_ged_sb        |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_old_money, queryset_orange_pasta                          |         |
+|      ln_ged_sb_dep      |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_lavender_haze, queryset_old_money, queryset_wildest_dream |         |
+|       ln_ns_best        |                       |                           | needs manual update (as of now)               | queryset_meow_meow                                                                       |         |
+|       ln_os_best        |                       |                           | needs manual update (as of now)               | queryset_meow_meow                                                                       |         |
+|    ln_pgd_ttime_mean    |                       |                           | needs manual update (as of now)               | queryset_old_money                                                                       |         |
+|     ln_pop_gpw_sum      |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_lavender_haze, queryset_old_money, queryset_orange_pasta  |         |
+|       ln_sb_best        |                       |                           | needs manual update (as of now)               | queryset_meow_meow                                                                       |         |
+|      ln_ttime_mean      |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_lavender_haze                                             |         |
+|          month          |                       |                           | needs manual update (as of now)               | queryset_meow_meow                                                                       |         |
+|     mountains_mean      |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|       pasture_ih        |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|      pgd_imr_mean       |                       |                           | needs manual update (as of now)               | queryset_old_money                                                                       |         |
+|      pgd_urban_ih       |                       |                           | needs manual update (as of now)               | queryset_old_money                                                                       |         |
+|           row           |                       |                           | needs manual update (as of now)               | queryset_meow_meow                                                                       |         |
+|       savanna_ih        |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|        shrub_ih         |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|     splag_1_1_sb_1      |                       |                           | needs manual update (as of now)               | queryset_blank_space, queryset_lavender_haze, queryset_orange_pasta                      |         |
+| sptime_dist_k1_1_ged_sb |                       |                           | needs manual update (as of now)               | queryset_lavender_haze                                                                   |         |
+| sptime_dist_k1_2_ged_sb |                       |                           | needs manual update (as of now)               | queryset_lavender_haze                                                                   |         |
+| sptime_dist_k1_3_ged_sb |                       |                           | needs manual update (as of now)               | queryset_lavender_haze                                                                   |         |
+|      treelag_1_ns       |                       |                           | needs manual update (as of now)               | queryset_yellow_pikachu                                                                  |         |
+|      treelag_1_os       |                       |                           | needs manual update (as of now)               | queryset_yellow_pikachu                                                                  |         |
+|      treelag_1_sb       |                       |                           | needs manual update (as of now)               | queryset_lavender_haze, queryset_yellow_pikachu                                          |         |
+|      treelag_2_ns       |                       |                           | needs manual update (as of now)               | queryset_yellow_pikachu                                                                  |         |
+|      treelag_2_os       |                       |                           | needs manual update (as of now)               | queryset_yellow_pikachu                                                                  |         |
+|      treelag_2_sb       |                       |                           | needs manual update (as of now)               | queryset_lavender_haze, queryset_yellow_pikachu                                          |         |
+|        urban_ih         |                       |                           | needs manual update (as of now)               | queryset_blank_space                                                                     |         |
+|   wdi_nv_agr_totl_kd    |                       |                           | needs manual update (as of now)               | queryset_old_money                                                                       |         |
+|         year_id         |                       |                           | needs manual update (as of now)               | queryset_meow_meow                                                                       |         |
\ No newline at end of file
diff --git a/documentation/catalogs/feature_catalogue.md b/documentation/catalogs/feature_catalogue.md
deleted file mode 100644
index e69de29b..00000000

From 72b2300617b588c721f8c2969a17925a92142852 Mon Sep 17 00:00:00 2001
From: Polichinl <simmaa@prio.org>
Date: Fri, 1 Nov 2024 01:50:53 +0100
Subject: [PATCH 4/6] path stuff

---
 .../catalogs/generate_features_catalog.py     | 51 +++++++++++++++++--
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/documentation/catalogs/generate_features_catalog.py b/documentation/catalogs/generate_features_catalog.py
index 72a3ca50..5c91d6c8 100644
--- a/documentation/catalogs/generate_features_catalog.py
+++ b/documentation/catalogs/generate_features_catalog.py
@@ -11,21 +11,57 @@
 )
 logger = logging.getLogger(__name__)
 
+
+
 # Path to the root and querysets
 PATH = Path(__file__).resolve()
 indices = [i for i, x in enumerate(PATH.parts) if x == "views_pipeline"]
 PATH_ROOT = Path(*PATH.parts[:indices[-1] + 1])
 
 querysets_path = PATH_ROOT / 'common_querysets'
-GITHUB_URL = 'https://github.com/prio-data/views_pipeline/blob/production/' 
 
-def extract_columns_from_querysets(querysets_path):
+
+
+def get_path_common_querysets():
+    """
+    Retrieves the path to the 'common_querysets' directory within the 'views_pipeline' directory.
+
+    The repository root is taken to be the innermost 'views_pipeline' component of this
+    file's resolved, absolute path, so a relative ``__file__`` or a parent directory that
+    is also named 'views_pipeline' does not break the lookup.
+
+    Returns:
+        Path: The path to the 'common_querysets' directory.
+
+    Raises:
+        ValueError: If this file's path has no 'views_pipeline' component, or if that
+            directory contains no 'common_querysets' entry.
+    """
+
+    PATH = Path(__file__).resolve()
+
+    # Positions of every 'views_pipeline' component; the last one is the innermost directory
+    indices = [i for i, part in enumerate(PATH.parts) if part == 'views_pipeline']
+    if not indices:
+        raise ValueError("The 'views_pipeline' directory was not found in the provided path.")
+
+    PATH_ROOT = Path(*PATH.parts[:indices[-1] + 1])
+    PATH_COMMON_QUERYSETS = PATH_ROOT / 'common_querysets'
+
+    # Check if 'common_querysets' directory exists
+    if not PATH_COMMON_QUERYSETS.exists():
+        raise ValueError("The 'common_querysets' directory was not found in the provided path.")
+
+    return PATH_COMMON_QUERYSETS
+
+
+def extract_columns_from_querysets(PATH_COMMON_QUERYSETS):
     """
     Parses each queryset file in the common_querysets folder to extract columns, querysets, and LOA.
     """
     columns_info = []
     
-    for file_path in querysets_path.glob("*.py"):
+    for file_path in PATH_COMMON_QUERYSETS.glob("*.py"):
         with open(file_path, 'r') as file:
             content = file.read()
             queryset_name = file_path.stem
@@ -76,12 +112,17 @@ def generate_markdown_table(columns_info):
     return markdown_table
 
 if __name__ == "__main__":
+
+    GITHUB_URL = 'https://github.com/prio-data/views_pipeline/blob/production/' 
+
+    PATH_COMMON_QUERYSETS = get_path_common_querysets()
+
     # Extract feature information from querysets
-    columns_info = extract_columns_from_querysets(querysets_path)
+    columns_info = extract_columns_from_querysets(PATH_COMMON_QUERYSETS)
     
     # Generate the markdown table for the feature catalog
     feature_catalog = generate_markdown_table(columns_info)
     
     # Save the markdown table
-    with open('documentation/catalogs/feature_catalog.md', 'w') as f:
+    with open(Path(__file__).resolve().parent / 'feature_catalog.md', 'w') as f:  # saved right next to this script, whatever the CWD
         f.write(feature_catalog)

From 317ffb005ee93401533211d250cad617a5c0f943 Mon Sep 17 00:00:00 2001
From: Polichinl <simmaa@prio.org>
Date: Fri, 1 Nov 2024 01:55:34 +0100
Subject: [PATCH 5/6] removed old code

---
 documentation/catalogs/generate_features_catalog.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/documentation/catalogs/generate_features_catalog.py b/documentation/catalogs/generate_features_catalog.py
index 5c91d6c8..f013b50f 100644
--- a/documentation/catalogs/generate_features_catalog.py
+++ b/documentation/catalogs/generate_features_catalog.py
@@ -11,17 +11,6 @@
 )
 logger = logging.getLogger(__name__)
 
-
-
-# Path to the root and querysets
-PATH = Path(__file__).resolve()
-indices = [i for i, x in enumerate(PATH.parts) if x == "views_pipeline"]
-PATH_ROOT = Path(*PATH.parts[:indices[-1] + 1])
-
-querysets_path = PATH_ROOT / 'common_querysets'
-
-
-
 def get_path_common_querysets():
     """
     Retrieves the path to the 'common_querysets' directory within the 'views_pipeline' directory.

From 1d4a1a51f01ccf5273f20641734d942bff1a9f8f Mon Sep 17 00:00:00 2001
From: marinamatic <147725905+marinamatic@users.noreply.github.com>
Date: Fri, 1 Nov 2024 13:57:25 +0100
Subject: [PATCH 6/6] added table placeholder

---
 documentation/catalogs/generate_features_catalog.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/documentation/catalogs/generate_features_catalog.py b/documentation/catalogs/generate_features_catalog.py
index f013b50f..6609ac7f 100644
--- a/documentation/catalogs/generate_features_catalog.py
+++ b/documentation/catalogs/generate_features_catalog.py
@@ -88,8 +88,8 @@ def generate_markdown_table(columns_info):
     for _, row in columns_info.iterrows():
         table_data.append([
             row['column_name'],
-            '',  # Placeholder for Human-readable name
-            '',  # Placeholder for Data source
+            'needs manual update',  # Placeholder for Human-readable name
+            'needs manual update',  # Placeholder for Data source
             'needs manual update (as of now)',  # Placeholder for Last updated
             row['queryset'],
             '',  #Placeholder for notes