stuff

prio-data · Oct 28, 2024 · 81af531 · 81af531
1 parent 9724c9a
commit 81af531
Show file tree

Hide file tree

Showing 16 changed files with 868 additions and 10 deletions.
diff --git a/common_querysets/queryset_meow_meow.py b/common_querysets/queryset_meow_meow.py
@@ -0,0 +1,41 @@
+from viewser import Queryset, Column
+
+def generate():
+    """
+    Build the viewser queryset that supplies the input data for the "meow_meow" model.
+
+    The queryset is declared at the "priogrid_month" level of analysis. This configuration is
+    "behavioral": editing it changes what the model is fed, so the model settings and algorithm
+    may have to be adapted together with it.
+
+    Returns:
+    - queryset_base (Queryset): The queryset holding the base data for model training.
+    """
+
+    # VIEWSER 6, Example configuration. Modify as needed.
+
+    # (output column, source column) pairs read from 'priogrid_month'. Each one gets the
+    # `ops.ln` transform followed by `missing.replace_na`, in that order.
+    logged_counts = (
+        ("ln_sb_best", "ged_sb_best_count_nokgi"),
+        ("ln_ns_best", "ged_ns_best_count_nokgi"),
+        ("ln_os_best", "ged_os_best_count_nokgi"),
+    )
+
+    # (output column, level of analysis, source column) triples added without any transform:
+    # month and year_id first, then country id ('c_id'), col and row.
+    plain_columns = (
+        ("month", "month", "month"),
+        ("year_id", "country_year", "year_id"),
+        ("c_id", "country_year", "country_id"),
+        ("col", "priogrid", "col"),
+        ("row", "priogrid", "row"),
+    )
+
+    queryset_base = Queryset("meow_meow", "priogrid_month")
+
+    for name, source in logged_counts:
+        column = Column(name, from_loa="priogrid_month", from_column=source)
+        column = column.transform.ops.ln().transform.missing.replace_na()
+        queryset_base = queryset_base.with_column(column)
+
+    for name, loa, source in plain_columns:
+        queryset_base = queryset_base.with_column(
+            Column(name, from_loa=loa, from_column=source)
+        )
+
+    return queryset_base
diff --git a/common_utils/ensemble_path.py b/common_utils/ensemble_path.py
@@ -2,7 +2,7 @@
 import logging
 from pathlib import Path
 from typing import Union
-
+import sys
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
@@ -36,11 +36,115 @@ def __init__(
         """
         super().__init__(ensemble_name_or_path, validate)
         # Additional ensemble-specific initialization...
+        print(self._validate)
+
+    def _initialize_directories(self) -> None:
+        """
+        Initializes the necessary directories for the ensemble.
+
+        Runs the parent class initialisation, removes inherited attributes that point to a
+        directory inside the ensemble directory but outside the ensemble layout, and then
+        (re)builds the ensemble directory attributes (artifacts, configs, data, reports, src, ...).
+        """
+        # Call the parent class's _initialize_directories method
+        super()._initialize_directories()
+
+        # Directories to keep, relative to the ensemble directory (POSIX-style strings)
+        keep_dirs = {
+            "artifacts",
+            "configs",
+            "data",
+            "data/generated",
+            "data/processed",
+            "notebooks",
+            "reports",
+            "reports/figures",
+            "reports/papers",
+            "reports/plots",
+            "reports/slides",
+            "reports/timelapse",
+            "src",
+            "src/dataloaders",
+            "src/forecasting",
+            "src/management",
+            "src/offline_evaluation",
+            "src/training",
+            "src/utils",
+            "src/visualization",
+        }
+
+        # Read the base directory once so the loop never depends on an attribute it may delete.
+        model_dir = Path(self.model_dir)
+
+        # Remove directories that are not in the keep_dirs list
+        for attr, value in list(self.__dict__.items()):
+            # Flags, None placeholders, lists etc. are not directories; Path() would reject them.
+            if not isinstance(value, (str, Path)):
+                continue
+            try:
+                relative = Path(value).relative_to(model_dir)
+            except ValueError:
+                # Not located under the ensemble directory (e.g. repository-level paths or
+                # plain strings such as a name): leave the attribute untouched.
+                continue
+            # Compare as strings: a Path never equals a str, so testing the Path object itself
+            # against keep_dirs would classify every directory as "not kept".
+            relative_key = relative.as_posix()
+            # "." is the ensemble directory itself; it is reassigned below, not removed.
+            if relative_key != "." and relative_key not in keep_dirs:
+                delattr(self, attr)
+
+        # Initialize directories as per the new structure
+        self.model_dir = self._get_model_dir()
+        self.artifacts = self._build_absolute_directory(Path("artifacts"))
+        self.configs = self._build_absolute_directory(Path("configs"))
+        self.data = self._build_absolute_directory(Path("data"))
+        self.data_generated = self._build_absolute_directory(Path("data/generated"))
+        self.data_processed = self._build_absolute_directory(Path("data/processed"))
+        self.notebooks = self._build_absolute_directory(Path("notebooks"))
+        self.reports = self._build_absolute_directory(Path("reports"))
+        self.reports_figures = self._build_absolute_directory(Path("reports/figures"))
+        self.reports_papers = self._build_absolute_directory(Path("reports/papers"))
+        self.reports_plots = self._build_absolute_directory(Path("reports/plots"))
+        self.reports_slides = self._build_absolute_directory(Path("reports/slides"))
+        self.reports_timelapse = self._build_absolute_directory(
+            Path("reports/timelapse")
+        )
+        self.src = self._build_absolute_directory(Path("src"))
+        self.dataloaders = self._build_absolute_directory(Path("src/dataloaders"))
+        self.forecasting = self._build_absolute_directory(Path("src/forecasting"))
+        self.management = self._build_absolute_directory(Path("src/management"))
+        self.offline_evaluation = self._build_absolute_directory(
+            Path("src/offline_evaluation")
+        )
+        self.training = self._build_absolute_directory(Path("src/training"))
+        self.utils = self._build_absolute_directory(Path("src/utils"))
+        self.visualization = self._build_absolute_directory(Path("src/visualization"))
+        self._templates = self.meta_tools / "templates"
+        self._sys_paths = None
+        # Queryset wiring is currently disabled for ensembles:
+        # if self.common_querysets not in sys.path:
+        #     sys.path.insert(0, str(self.common_querysets))
+        # self.queryset_path = self.common_querysets / f"queryset_{self.model_name}.py"
+        # self._queryset = None
+
+    def _initialize_scripts(self) -> None:
+        """
+        Populate ``self.scripts`` with the files expected inside an ensemble.
+
+        Every relative path listed below (configuration scripts, main script, README,
+        requirements and the scripts under ``src``) is passed through
+        ``_build_absolute_directory``; the results are stored in the listed order.
+        """
+        relative_scripts = (
+            "configs/config_deployment.py",
+            "configs/config_hyperparameters.py",
+            "configs/config_meta.py",
+            "main.py",
+            "README.md",
+            "requirements.txt",
+            "artifacts/model_metadata_dict.py",
+            "src/dataloaders/get_data.py",
+            "src/forecasting/generate_forecast.py",
+            "src/management/execute_model_runs.py",
+            "src/management/execute_model_tasks.py",
+            "src/offline_evaluation/evaluate_ensemble.py",
+            "src/training/train_ensemble.py",
+            "src/utils/utils_outputs.py",
+            "src/utils/utils_run.py",
+            "src/visualization/visual.py",
+        )
+        # Not part of the list (kept commented out):
+        # self.common_querysets / f"queryset_{self.model_name}.py",
+        self.scripts = [
+            self._build_absolute_directory(Path(relative))
+            for relative in relative_scripts
+        ]
 
 
-# if __name__ == "__main__":
-#     ensemble_path = EnsemblePath("white_mustang", validate=True)
-#     ensemble_path.view_directories()
-#     ensemble_path.view_scripts()
-#     print(ensemble_path.get_queryset())
-#     del ensemble_path
+if __name__ == "__main__":
+    # Manual check: build the paths for the "white_mustang" ensemble with validation
+    # enabled and print the directories it reports.
+    demo_path = EnsemblePath("white_mustang", validate=True)
+    print(demo_path.get_directories())
+    del demo_path
diff --git a/meta_tools/ensemble_scaffold_builder.py b/meta_tools/ensemble_scaffold_builder.py
@@ -0,0 +1,226 @@
+from pathlib import Path
+from utils.utils_model_naming import validate_model_name
+import datetime
+import logging
+import sys
+
+# Locate the repository root from this file's own path so the project packages imported
+# below can be resolved.
+PATH = Path(__file__)
+if 'views_pipeline' in PATH.parts:
+    # Keep every path component up to and including the first 'views_pipeline' entry.
+    PATH_ROOT = Path(*PATH.parts[:PATH.parts.index('views_pipeline') + 1])
+    PATH_COMMON_UTILS = PATH_ROOT / 'common_utils'
+    if not PATH_COMMON_UTILS.exists():
+        raise ValueError("The 'common_utils' directory was not found in the provided path.")
+    # Both entries are inserted at position 0, so PATH_ROOT ends up ahead of PATH_COMMON_UTILS.
+    sys.path.insert(0, str(PATH_COMMON_UTILS))
+    sys.path.insert(0, str(PATH_ROOT))
+else:
+    # This file is not located below a 'views_pipeline' directory: fail at import time.
+    raise ValueError("The 'views_pipeline' directory was not found in the provided path.")
+
+# print(str(Path(__file__).parent.parent))
+# These imports come after the sys.path changes above.
+from common_utils import model_path, ensemble_path
+
+from templates.ensemble import (
+    template_config_deployment,
+    template_config_hyperparameters,
+    template_config_meta,
+    template_main,
+)
+
+# Configure root logging at INFO level and create this module's logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class EnsembleScaffoldBuilder:
+    """
+    A class to create and manage the directory structure and scripts for a machine learning ensemble.
+
+    Attributes:
+        _model (EnsemblePath): Path manager for the ensemble, created with validation disabled.
+        _subdirs: The values of ``_model.get_directories()`` (directories to create).
+        _scripts: The values of ``_model.get_scripts()`` (scripts expected to exist).
+
+    Methods:
+        build_model_directory() -> Path:
+            Creates the ensemble directory and its subdirectories and writes README.md and
+            requirements.txt. An existing ensemble directory is reused.
+
+        build_model_scripts() -> None:
+            Generates the configuration scripts and main.py from the ensemble templates.
+            Raises FileNotFoundError if the ensemble directory does not exist.
+
+        assess_model_directory() -> dict:
+            Returns {'model_dir': ..., 'structure_errors': set of directory paths}.
+            Raises FileNotFoundError if the ensemble directory does not exist.
+
+        assess_model_scripts() -> dict:
+            Returns {'model_dir': ..., 'missing_scripts': set of missing script paths}.
+            Raises FileNotFoundError if the ensemble directory does not exist.
+    """
+
+    def __init__(self, model_name) -> None:
+        """
+        Initialize an EnsembleScaffoldBuilder object with the given ensemble name and set up paths.
+
+        Args:
+            model_name (str): The name of the ensemble for which directories and files are to be created.
+
+        Returns:
+            None
+        """
+        # validate=False: the ensemble is about to be created, so it may not exist on disk yet.
+        self._model = ensemble_path.EnsemblePath(model_name, validate=False)
+        self._subdirs = self._model.get_directories().values()
+        self._scripts = self._model.get_scripts().values()
+
+    def build_model_directory(self) -> Path:
+        """
+        Create the ensemble directory and its subdirectories, and write README.md and requirements.txt.
+
+        An already existing ensemble directory is reused (this is logged, not treated as an
+        error) and existing subdirectories are skipped. Failures while creating a subdirectory
+        are logged and do not abort the build. README.md and requirements.txt are opened in
+        write mode, so existing copies are overwritten.
+
+        Returns:
+            Path: The path to the ensemble directory.
+        """
+        model_dir = self._model.model_dir
+        if model_dir.exists():
+            logger.info(
+                f"Ensemble directory already exists: {model_dir}. Proceeding with existing directory."
+            )
+        else:
+            model_dir.mkdir(parents=True, exist_ok=False)
+            logger.info(f"Created new ensemble directory: {model_dir}")
+
+        for subdir in map(Path, self._subdirs):
+            if subdir.exists():
+                logger.info(f"Subdirectory already exists: {subdir}. Skipping.")
+                continue
+            try:
+                subdir.mkdir(parents=True, exist_ok=True)
+            except OSError as e:
+                # Best effort: report the failure and carry on with the remaining directories.
+                logger.error(f"Error creating subdirectory: {subdir}. {e}")
+                continue
+            if subdir.exists():
+                logger.info(f"Created subdirectory: {subdir}")
+            else:
+                logger.error(f"Did not create subdirectory: {subdir}")
+
+        # Create README.md and requirements.txt
+        readme_path = model_dir / "README.md"
+        with open(readme_path, "w", encoding="utf-8") as readme_file:
+            readme_file.write(
+                f"# Ensemble README\n## Ensemble name: {self._model.model_name}\n## Created on: {str(datetime.datetime.now())}"
+            )
+        if readme_path.exists():
+            logger.info(f"Created README.md: {readme_path}")
+        else:
+            logger.error(f"Did not create README.md: {readme_path}")
+
+        requirements_path = model_dir / "requirements.txt"
+        with open(requirements_path, "w", encoding="utf-8") as requirements_file:
+            requirements_file.write("# Requirements\n")
+        if requirements_path.exists():
+            logger.info(f"Created requirements.txt: {requirements_path}")
+        else:
+            logger.error(f"Did not create requirements.txt: {requirements_path}")
+        return model_dir
+
+    def build_model_scripts(self) -> None:
+        """
+        Generate the configuration scripts and main.py for the ensemble from the templates.
+
+        Writes configs/config_deployment.py, configs/config_hyperparameters.py,
+        configs/config_meta.py and main.py below the ensemble directory by calling the
+        corresponding template's ``generate`` function.
+
+        Raises:
+            FileNotFoundError: If the ensemble directory does not exist.
+        """
+        model_dir = self._model.model_dir
+        if not model_dir.exists():
+            raise FileNotFoundError(
+                f"Ensemble directory {model_dir} does not exist. Please call build_model_directory() first. Aborting script generation."
+            )
+        template_config_deployment.generate(
+            script_dir=model_dir / "configs/config_deployment.py"
+        )
+        template_config_hyperparameters.generate(
+            script_dir=model_dir / "configs/config_hyperparameters.py",
+        )
+        template_config_meta.generate(
+            script_dir=model_dir / "configs/config_meta.py",
+            model_name=self._model.model_name,
+        )
+        template_main.generate(script_dir=model_dir / "main.py")
+
+    def assess_model_directory(self) -> dict:
+        """
+        Assess the ensemble directory by comparing it against a freshly validated EnsemblePath.
+
+        Returns:
+            dict: A dictionary containing assessment results with two keys:
+                - 'model_dir': The path to the ensemble directory.
+                - 'structure_errors': The set of directories reported by a new, validated
+                  EnsemblePath that are not among the directories this builder was created with.
+
+        Raises:
+            FileNotFoundError: If the ensemble directory does not exist.
+        """
+        model_dir = self._model.model_dir
+        if not model_dir.exists():
+            raise FileNotFoundError(
+                f"Ensemble directory {model_dir} does not exist. Please call build_model_directory() first."
+            )
+        updated_model_path = ensemble_path.EnsemblePath(self._model.model_name, validate=True)
+        # NOTE(review): this is a set difference between two directory listings, not an
+        # on-disk existence check - confirm it reports what callers expect.
+        structure_errors = set(updated_model_path.get_directories().values()) - set(
+            self._subdirs
+        )
+        del updated_model_path
+        return {"model_dir": model_dir, "structure_errors": structure_errors}
+
+    def assess_model_scripts(self) -> dict:
+        """
+        Assess the ensemble directory by checking for the presence of expected scripts.
+
+        Returns:
+            dict: A dictionary containing assessment results with two keys:
+                - 'model_dir': The path to the ensemble directory.
+                - 'missing_scripts': The set of expected script paths that do not exist.
+
+        Raises:
+            FileNotFoundError: If the ensemble directory does not exist.
+        """
+        model_dir = self._model.model_dir
+        if not model_dir.exists():
+            raise FileNotFoundError(
+                f"Ensemble directory {model_dir} does not exist. Please call build_model_directory() first."
+            )
+        missing_scripts = {
+            script_path
+            for script_path in map(Path, self._scripts)
+            if not script_path.exists()
+        }
+        return {"model_dir": model_dir, "missing_scripts": missing_scripts}
+
+
+if __name__ == "__main__":
+    # Interactive entry point: ask for an ensemble name until it is well-formed and unused,
+    # then build the directory tree and scripts and report what is missing.
+    prompt = "Enter the name of the ensemble: "
+    model_name = input(prompt)
+    while (
+        not validate_model_name(model_name)
+        or model_path.ModelPath.check_if_model_dir_exists(model_name)
+        or ensemble_path.EnsemblePath.check_if_model_dir_exists(model_name)
+    ):
+        error = "Invalid input. Please use the format 'adjective_noun' in lowercase, e.g., 'happy_kitten' that does not already exist as a model or ensemble."
+        logger.error(error)
+        # Re-prompt with the same wording as the first prompt (it previously asked for "the model").
+        model_name = input(prompt)
+    model_directory_builder = EnsembleScaffoldBuilder(model_name)
+    model_directory_builder.build_model_directory()
+    assessment = model_directory_builder.assess_model_directory()
+    if not assessment["structure_errors"]:
+        logger.info("Ensemble directory structure is complete.")
+    else:
+        logger.warning(f"Structure errors: {assessment['structure_errors']}")
+    model_directory_builder.build_model_scripts()
+    assessment = model_directory_builder.assess_model_scripts()
+    if not assessment["missing_scripts"]:
+        logger.info("All scripts have been successfully generated.")
+    else:
+        logger.warning(f"Missing scripts: {assessment['missing_scripts']}")