Skip to content

Commit

Permalink
stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
smellycloud committed Oct 28, 2024
1 parent 9724c9a commit 81af531
Show file tree
Hide file tree
Showing 16 changed files with 868 additions and 10 deletions.
41 changes: 41 additions & 0 deletions common_querysets/queryset_meow_meow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from viewser import Queryset, Column

def generate():
    """
    Build the viewser queryset that supplies the input data for this model.

    The queryset is named "meow_meow" and is defined on the "priogrid_month"
    level of analysis. It contains three log-transformed count columns
    (ln_sb_best, ln_ns_best, ln_os_best) followed by the plain columns
    month, year_id, c_id, col and row.

    This configuration is "behavioral": changing it affects the model's runtime
    behavior and its integration into the deployment system. There is no
    guarantee that the model keeps working if the input data is changed here
    without adapting the model settings and algorithm accordingly.

    Returns:
    - queryset_base (Queryset): A queryset containing the base data for the model training.
    """

    # VIEWSER 6, Example configuration. Modify as needed.

    # (output column, source column) pairs; all are read from 'priogrid_month',
    # passed through `ops.ln()` and then through `missing.replace_na()`.
    log_count_columns = (
        ("ln_sb_best", "ged_sb_best_count_nokgi"),
        ("ln_ns_best", "ged_ns_best_count_nokgi"),
        ("ln_os_best", "ged_os_best_count_nokgi"),
    )

    # (output column, level of analysis, source column) triples copied as-is.
    plain_columns = (
        ("month", "month", "month"),
        ("year_id", "country_year", "year_id"),
        ("c_id", "country_year", "country_id"),
        ("col", "priogrid", "col"),
        ("row", "priogrid", "row"),
    )

    queryset_base = Queryset("meow_meow", "priogrid_month")

    # Columns are attached in declaration order, exactly as a fluent chain would.
    for name, source in log_count_columns:
        raw = Column(name, from_loa="priogrid_month", from_column=source)
        queryset_base = queryset_base.with_column(
            raw.transform.ops.ln().transform.missing.replace_na()
        )

    for name, loa, source in plain_columns:
        queryset_base = queryset_base.with_column(
            Column(name, from_loa=loa, from_column=source)
        )

    return queryset_base
118 changes: 111 additions & 7 deletions common_utils/ensemble_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
from pathlib import Path
from typing import Union

import sys
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
Expand Down Expand Up @@ -36,11 +36,115 @@ def __init__(
"""
super().__init__(ensemble_name_or_path, validate)
# Additional ensemble-specific initialization...
print(self._validate)

def _initialize_directories(self) -> None:
    """
    Initializes the necessary directories for the ensemble.

    Runs the parent class's directory initialization, removes every inherited
    attribute that points inside ``self.model_dir`` but is not part of the
    ensemble layout, and then (re)builds the ensemble layout through
    ``self._build_absolute_directory``.

    Attributes that are not paths, that point outside ``self.model_dir``, or
    that are ``self.model_dir`` itself are never removed.
    """
    # Call the parent class's _initialize_directories method
    super()._initialize_directories()

    # Single source of truth for the ensemble layout:
    # (attribute name, directory relative to the ensemble directory).
    ensemble_layout = (
        ("artifacts", "artifacts"),
        ("configs", "configs"),
        ("data", "data"),
        ("data_generated", "data/generated"),
        ("data_processed", "data/processed"),
        ("notebooks", "notebooks"),
        ("reports", "reports"),
        ("reports_figures", "reports/figures"),
        ("reports_papers", "reports/papers"),
        ("reports_plots", "reports/plots"),
        ("reports_slides", "reports/slides"),
        ("reports_timelapse", "reports/timelapse"),
        ("src", "src"),
        ("dataloaders", "src/dataloaders"),
        ("forecasting", "src/forecasting"),
        ("management", "src/management"),
        ("offline_evaluation", "src/offline_evaluation"),
        ("training", "src/training"),
        ("utils", "src/utils"),
        ("visualization", "src/visualization"),
    )
    keep_dirs = {relative for _, relative in ensemble_layout}

    # Remove directories that are not in the keep_dirs set
    model_dir = Path(self.model_dir)
    for attr, value in list(vars(self).items()):
        # Flags, names, lists etc. are not directories; leave them alone.
        if not isinstance(value, (str, Path)):
            continue
        try:
            relative = Path(value).relative_to(model_dir)
        except ValueError:
            # Not located under the ensemble directory (e.g. repository-level
            # paths such as the one `meta_tools` is read from below).
            continue
        if relative == Path("."):
            # The ensemble directory itself is needed by the rest of the loop.
            continue
        # Compare on the POSIX string form: a Path object never equals a str,
        # so testing the Path itself against keep_dirs would always fail.
        if relative.as_posix() not in keep_dirs:
            delattr(self, attr)

    # Initialize directories as per the new structure
    self.model_dir = self._get_model_dir()
    for attr, relative in ensemble_layout:
        setattr(self, attr, self._build_absolute_directory(Path(relative)))
    self._templates = self.meta_tools / "templates"
    self._sys_paths = None
    # NOTE: queryset setup is currently disabled for ensembles:
    # if self.common_querysets not in sys.path:
    #     sys.path.insert(0, str(self.common_querysets))
    # self.queryset_path = self.common_querysets / f"queryset_{self.model_name}.py"
    # self._queryset = None

def _initialize_scripts(self) -> None:
"""
Initializes the necessary scripts for the ensemble.
Creates and sets up various scripts required for the ensemble, such as configuration scripts, main script, and other utility scripts.
"""
self.scripts = [
self._build_absolute_directory(Path("configs/config_deployment.py")),
self._build_absolute_directory(Path("configs/config_hyperparameters.py")),
self._build_absolute_directory(Path("configs/config_meta.py")),
self._build_absolute_directory(Path("main.py")),
self._build_absolute_directory(Path("README.md")),
self._build_absolute_directory(Path("requirements.txt")),
self._build_absolute_directory(Path("artifacts/model_metadata_dict.py")),
self._build_absolute_directory(Path("src/dataloaders/get_data.py")),
self._build_absolute_directory(
Path("src/forecasting/generate_forecast.py")
),
self._build_absolute_directory(
Path("src/management/execute_model_runs.py")
),
self._build_absolute_directory(
Path("src/management/execute_model_tasks.py")
),
self._build_absolute_directory(
Path("src/offline_evaluation/evaluate_ensemble.py")
),
self._build_absolute_directory(Path("src/training/train_ensemble.py")),
self._build_absolute_directory(Path("src/utils/utils_outputs.py")),
self._build_absolute_directory(Path("src/utils/utils_run.py")),
self._build_absolute_directory(Path("src/visualization/visual.py")),
# self.common_querysets / f"queryset_{self.model_name}.py",
]


# if __name__ == "__main__":
# ensemble_path = EnsemblePath("white_mustang", validate=True)
# ensemble_path.view_directories()
# ensemble_path.view_scripts()
# print(ensemble_path.get_queryset())
# del ensemble_path
if __name__ == "__main__":
    # Manual smoke run: resolve the "white_mustang" ensemble with validation
    # enabled and print the directories it exposes.
    ensemble_path = EnsemblePath("white_mustang", validate=True)
    directories = ensemble_path.get_directories()
    print(directories)
    del directories, ensemble_path
226 changes: 226 additions & 0 deletions meta_tools/ensemble_scaffold_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
from pathlib import Path
from utils.utils_model_naming import validate_model_name
import datetime
import logging
import sys

# Locate the repository root from this file's own path: everything up to and
# including the first 'views_pipeline' component is treated as the root.
PATH = Path(__file__)
if 'views_pipeline' not in PATH.parts:
    raise ValueError("The 'views_pipeline' directory was not found in the provided path.")

PATH_ROOT = Path(*PATH.parts[: PATH.parts.index('views_pipeline') + 1])
PATH_COMMON_UTILS = PATH_ROOT.joinpath('common_utils')
if not PATH_COMMON_UTILS.exists():
    raise ValueError("The 'common_utils' directory was not found in the provided path.")

# Put the root first and common_utils second at the front of the import path.
sys.path[:0] = [str(PATH_ROOT), str(PATH_COMMON_UTILS)]

# print(str(Path(__file__).parent.parent))
from common_utils import model_path, ensemble_path

from templates.ensemble import (
template_config_deployment,
template_config_hyperparameters,
template_config_meta,
template_main,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class EnsembleScaffoldBuilder:
    """
    A class to create and assess the directory structure and scripts for a machine learning ensemble.

    Attributes:
        _model (EnsemblePath): Path manager for the ensemble, created with ``validate=False``.
        _subdirs (list): Directories expected for the ensemble, taken from ``_model.get_directories()``.
        _scripts (list): Scripts expected for the ensemble, taken from ``_model.get_scripts()``.

    Methods:
        build_model_directory() -> Path:
            Creates the ensemble directory, its subdirectories, README.md and requirements.txt.
            Anything that already exists is left untouched.
        build_model_scripts() -> None:
            Generates the configuration scripts and main.py from the ensemble templates.
            Raises FileNotFoundError if the ensemble directory does not exist.
        assess_model_directory() -> dict:
            Reports directory problems under the key 'structure_errors' (a set).
            Raises FileNotFoundError if the ensemble directory does not exist.
        assess_model_scripts() -> dict:
            Reports expected scripts that are absent under the key 'missing_scripts' (a set).
            Raises FileNotFoundError if the ensemble directory does not exist.
    """

    def __init__(self, model_name: str) -> None:
        """
        Initialize an EnsembleScaffoldBuilder object with the given ensemble name and set up paths.

        Args:
            model_name (str): The name of the ensemble for which directories and files are to be created.
        """
        self._model = ensemble_path.EnsemblePath(model_name, validate=False)
        # Materialize the dict views so the expectations are plain, reusable lists.
        self._subdirs = list(self._model.get_directories().values())
        self._scripts = list(self._model.get_scripts().values())

    @staticmethod
    def _write_file_if_missing(path: Path, content: str) -> None:
        """Write ``content`` to ``path`` unless the file already exists, and log the outcome."""
        if path.exists():
            # Never clobber a file in an ensemble directory that is being reused.
            logger.info(f"{path.name} already exists: {path}. Skipping.")
            return
        with open(path, "w", encoding="utf-8") as handle:
            handle.write(content)
        logger.info(f"Created {path.name}: {path}")

    def build_model_directory(self) -> Path:
        """
        Create the ensemble directory and its subdirectories, and initialize README.md and requirements.txt.

        An existing ensemble directory is reused; existing subdirectories and
        existing README.md / requirements.txt files are left untouched.
        Failures to create a subdirectory are logged, not raised; use
        assess_model_directory() afterwards to detect them.

        Returns:
            Path: The path to the ensemble directory.
        """
        model_dir = self._model.model_dir
        if model_dir.exists():
            logger.info(
                f"Ensemble directory already exists: {model_dir}. Proceeding with existing directory."
            )
        else:
            model_dir.mkdir(parents=True, exist_ok=False)
            logger.info(f"Created new ensemble directory: {model_dir}")

        for subdir in map(Path, self._subdirs):
            if subdir.exists():
                logger.info(f"Subdirectory already exists: {subdir}. Skipping.")
                continue
            try:
                subdir.mkdir(parents=True, exist_ok=True)
            except OSError as e:
                logger.error(f"Error creating subdirectory: {subdir}. {e}")
            else:
                logger.info(f"Created subdirectory: {subdir}")

        # Create README.md and requirements.txt
        self._write_file_if_missing(
            model_dir / "README.md",
            f"# Ensemble README\n## Ensemble name: {self._model.model_name}\n## Created on: {str(datetime.datetime.now())}",
        )
        self._write_file_if_missing(model_dir / "requirements.txt", "# Requirements\n")
        return model_dir

    def build_model_scripts(self) -> None:
        """
        Generate the configuration scripts and main.py for the ensemble from the ensemble templates.

        Raises:
            FileNotFoundError: If the ensemble directory does not exist.
        """
        model_dir = self._model.model_dir
        if not model_dir.exists():
            raise FileNotFoundError(
                f"Ensemble directory {self._model.model_dir} does not exist. Please call build_model_directory() first. Aborting script generation."
            )
        template_config_deployment.generate(
            script_dir=model_dir / "configs/config_deployment.py"
        )
        template_config_hyperparameters.generate(
            script_dir=model_dir / "configs/config_hyperparameters.py",
        )
        template_config_meta.generate(
            script_dir=model_dir / "configs/config_meta.py",
            model_name=self._model.model_name,
        )
        template_main.generate(script_dir=model_dir / "main.py")

    def assess_model_directory(self) -> dict:
        """
        Assess the ensemble directory by checking for the presence of expected directories.

        Returns:
            dict: A dictionary containing assessment results with two keys:
                - 'model_dir': The path to the ensemble directory.
                - 'structure_errors': A set of directories that are either expected
                  but missing on disk, or reported by a freshly validated
                  EnsemblePath but absent from the expected directories.

        Raises:
            FileNotFoundError: If the ensemble directory does not exist.
        """
        model_dir = self._model.model_dir
        if not model_dir.exists():
            raise FileNotFoundError(
                f"Ensemble directory {self._model.model_dir} does not exist. Please call build_model_directory() first."
            )
        updated_model_path = ensemble_path.EnsemblePath(self._model.model_name, validate=True)
        unexpected = set(updated_model_path.get_directories().values()) - set(self._subdirs)
        # Directory creation errors are only logged by build_model_directory(),
        # so the presence of every expected directory is verified here.
        missing = {subdir for subdir in self._subdirs if not Path(subdir).exists()}
        return {"model_dir": model_dir, "structure_errors": unexpected | missing}

    def assess_model_scripts(self) -> dict:
        """
        Assess the ensemble directory by checking for the presence of expected scripts.

        Returns:
            dict: A dictionary containing assessment results with two keys:
                - 'model_dir': The path to the ensemble directory.
                - 'missing_scripts': A set of expected script paths that do not exist.

        Raises:
            FileNotFoundError: If the ensemble directory does not exist.
        """
        model_dir = self._model.model_dir
        if not model_dir.exists():
            raise FileNotFoundError(
                f"Ensemble directory {self._model.model_dir} does not exist. Please call build_model_directory() first."
            )
        missing_scripts = {
            script for script in map(Path, self._scripts) if not script.exists()
        }
        return {"model_dir": model_dir, "missing_scripts": missing_scripts}


if __name__ == "__main__":
    # Interactive entry point: ask for an ensemble name, scaffold its
    # directories and scripts, and report anything that is missing afterwards.
    model_name = input("Enter the name of the ensemble: ")
    # Re-prompt until the name is well-formed and not already taken by a model
    # or an ensemble.
    while (
        not validate_model_name(model_name)
        or model_path.ModelPath.check_if_model_dir_exists(model_name)
        or ensemble_path.EnsemblePath.check_if_model_dir_exists(model_name)
    ):
        error = "Invalid input. Please use the format 'adjective_noun' in lowercase, e.g., 'happy_kitten' that does not already exist as a model or ensemble."
        logger.error(error)
        # Same wording as the first prompt: this tool creates ensembles.
        model_name = input("Enter the name of the ensemble: ")

    model_directory_builder = EnsembleScaffoldBuilder(model_name)

    model_directory_builder.build_model_directory()
    assessment = model_directory_builder.assess_model_directory()
    if not assessment["structure_errors"]:
        logger.info("Ensemble directory structure is complete.")
    else:
        logger.warning(f"Structure errors: {assessment['structure_errors']}")

    model_directory_builder.build_model_scripts()
    assessment = model_directory_builder.assess_model_scripts()
    if not assessment["missing_scripts"]:
        logger.info("All scripts have been successfully generated.")
    else:
        logger.warning(f"Missing scripts: {assessment['missing_scripts']}")
Loading

0 comments on commit 81af531

Please sign in to comment.