"""Generate the feature catalog (a Markdown table) from the common querysets.

The script scans every ``*.py`` file in ``<views_pipeline>/common_querysets``
for ``Column(...)`` definitions, groups them by feature name and level of
analysis (LOA), and writes a pipe-formatted Markdown table to
``feature_catalog.md`` in the same directory as this script.
"""
import os
import re
import logging
import pandas as pd
import sys
from pathlib import Path

logging.basicConfig(
    level=logging.ERROR, format="%(asctime)s %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# NOTE(review): not referenced anywhere in this script yet; presumably meant
# for the "Data source (with link)" column -- confirm before removing.
GITHUB_URL = 'https://github.com/prio-data/views_pipeline/blob/production/'

# Argument text of a ``Column(...)`` call, up to the first closing parenthesis
# on the same line.
_COLUMN_CALL_RE = re.compile(r'Column\((.*?)\)')
# First quoted string literal (single or double quotes); group 2 is its content.
_QUOTED_RE = re.compile(r'''(["'])(.*?)\1''')
# ``from_loa="..."`` / ``from_loa='...'`` keyword; group 2 is the LOA name.
_FROM_LOA_RE = re.compile(r'''from_loa=(["'])(.*?)\1''')


def get_path_common_querysets():
    """
    Return the path to the 'common_querysets' directory of the repository.

    The absolute path of this file is searched for a 'views_pipeline'
    component; everything up to and including its first occurrence is taken as
    the repository root, and 'common_querysets' is appended to it.

    Returns:
        Path: The path to the 'common_querysets' directory.

    Raises:
        ValueError: If 'views_pipeline' is not part of this file's path, or if
            the 'common_querysets' directory does not exist under it.
    """
    # Resolve first: ``__file__`` can be relative (depending on how the script
    # is launched), in which case 'views_pipeline' would never show up in the
    # path parts.
    parts = Path(__file__).resolve().parts

    if 'views_pipeline' not in parts:
        raise ValueError("The 'views_pipeline' directory was not found in the provided path.")

    path_root = Path(*parts[:parts.index('views_pipeline') + 1])
    path_common_querysets = path_root / 'common_querysets'

    if not path_common_querysets.exists():
        raise ValueError("The 'common_querysets' directory was not found in the provided path.")

    return path_common_querysets


def extract_columns_from_querysets(PATH_COMMON_QUERYSETS):
    """
    Collect the columns defined in every queryset file of a directory.

    Each ``*.py`` file directly inside ``PATH_COMMON_QUERYSETS`` is scanned as
    plain text (it is not imported) for ``Column(...)`` calls. The first quoted
    string in a call is taken as the column name and the ``from_loa`` keyword,
    when present, as its level of analysis.

    Args:
        PATH_COMMON_QUERYSETS (Path): Directory containing the queryset files.

    Returns:
        pandas.DataFrame: One row per distinct (column_name, loa) pair with the
        columns ``column_name``, ``loa`` and ``queryset``. ``queryset`` is a
        comma-separated, alphabetically sorted list of the file stems that
        define the pair. ``loa`` is missing (NaN) when no ``from_loa`` was
        found. The frame is empty when nothing was found.
    """
    columns_info = []

    for file_path in PATH_COMMON_QUERYSETS.glob("*.py"):
        content = file_path.read_text(encoding="utf-8")
        queryset_name = file_path.stem

        for column_args in _COLUMN_CALL_RE.findall(content):
            name_match = _QUOTED_RE.search(column_args)
            if name_match is None:
                # e.g. ``Column(some_variable, ...)``: nothing to catalog, and
                # one odd call should not abort the whole run.
                logger.warning(
                    "Skipping Column(%s) in %s: no quoted column name found",
                    column_args, file_path.name,
                )
                continue

            loa_match = _FROM_LOA_RE.search(column_args)
            columns_info.append({
                'column_name': name_match.group(2),
                'queryset': queryset_name,
                'loa': loa_match.group(2) if loa_match else None,
            })

    if not columns_info:
        # A frame built from an empty list has no columns, so the groupby
        # below would raise KeyError; return the expected (empty) shape.
        return pd.DataFrame(columns=['column_name', 'loa', 'queryset'])

    df = pd.DataFrame(columns_info).drop_duplicates()

    # dropna=False keeps the columns that have no LOA; with the default
    # (dropna=True) pandas silently drops every group whose key is missing.
    df = df.groupby(['column_name', 'loa'], as_index=False, dropna=False).agg({
        'queryset': lambda names: ', '.join(sorted(set(names)))  # unique querysets per feature
    })

    return df


def generate_markdown_table(columns_info):
    """
    Render the feature catalog as a pipe-formatted Markdown table.

    Args:
        columns_info (pandas.DataFrame): Frame with at least the columns
            ``column_name`` and ``queryset``, as returned by
            ``extract_columns_from_querysets``.

    Returns:
        str: The Markdown table. Only the feature name and its querysets are
        filled in; the remaining cells are placeholders to be edited by hand.
    """
    # Imported here so the extraction helpers above can be imported and used
    # without the rendering dependency installed.
    from tabulate import tabulate

    headers = ['Name in viewser', 'Human-readable name', 'Data source (with link)',
               'Last updated (minutes:hours:day:month:year)', 'Associated querysets/models', 'Notes']

    # NOTE(review): the input has one row per (column_name, loa), but LOA is not
    # rendered, so the same feature name can appear on several look-alike rows.
    # Consider adding a LOA column.
    table_data = [
        [
            column_name,
            'needs manual update',              # Placeholder for Human-readable name
            'needs manual update',              # Placeholder for Data source
            'needs manual update (as of now)',  # Placeholder for Last updated
            querysets,
            '',                                 # Placeholder for notes
        ]
        for column_name, querysets in zip(columns_info['column_name'], columns_info['queryset'])
    ]

    # colalign has a single entry, so only the first column is centered.
    return tabulate(table_data, headers=headers, tablefmt="pipe", colalign=("center",))


def main():
    """Build the catalog from the common querysets and write it next to this script."""
    path_common_querysets = get_path_common_querysets()

    # Extract feature information from querysets
    columns_info = extract_columns_from_querysets(path_common_querysets)

    # Generate the markdown table for the feature catalog
    feature_catalog = generate_markdown_table(columns_info)

    # Anchor the output to the script's directory so the result does not
    # depend on the directory the script is launched from.
    output_path = Path(__file__).resolve().parent / 'feature_catalog.md'
    output_path.write_text(feature_catalog, encoding='utf-8')


if __name__ == "__main__":
    main()