Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

General logging script #94

Merged
merged 14 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 19 additions & 35 deletions common_utils/set_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,10 @@ def setup_model_paths(PATH):
PATH_model: The path (pathlib path object) including the "models" directory and its immediate subdirectory.
"""

if "models" in PATH.parts:
PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models") + 2]])
return PATH_MODEL
else:
# error_message = "The 'models' directory was not found in the provided path."
# logger.warning(error_message)
# raise ValueError(error_message)
return None
PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models") + 2]])

return PATH_MODEL



def setup_ensemble_paths(PATH):
Expand All @@ -66,15 +62,11 @@ def setup_ensemble_paths(PATH):
Returns:
PATH_ENSEMBLE: The path (pathlib path object) including the "ensembles" directory and its immediate subdirectory.
"""
if "ensembles" in PATH.parts:
PATH_ENSEMBLE = Path(*[i for i in PATH.parts[:PATH.parts.index("ensembles") + 2]])
return PATH_ENSEMBLE

else:
# error_message = "The 'ensembles' directory was not found in the provided path."
# logger.warning(error_message)
# raise ValueError(error_message)
return None

PATH_ENSEMBLE = Path(*[i for i in PATH.parts[:PATH.parts.index("ensembles") + 2]])

return PATH_ENSEMBLE


def setup_project_paths(PATH) -> None:
"""
Expand Down Expand Up @@ -108,19 +100,15 @@ def setup_project_paths(PATH) -> None:
# PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path

PATH_ROOT = setup_root_paths(PATH)


try:
if "models" in PATH.parts:
PATH_MODEL = setup_model_paths(PATH)
except ValueError as e:
PATH_ENSEMBLE = None
elif "ensembles" in PATH.parts:
PATH_MODEL = None
logger.warning(e)

try:
PATH_ENSEMBLE = setup_ensemble_paths(PATH)
except ValueError as e:
PATH_ENSEMBLE = None
logger.warning(e)
else:
logger.error("The provided path does not contain a model or ensemble directory.")

# print(f"Root path: {PATH_ROOT}") # debug
# print(f"Model path: {PATH_MODEL}") # debug
Expand Down Expand Up @@ -199,18 +187,14 @@ def setup_data_paths(PATH) -> Path:

"""

# PATH_MODEL = Path(*[i for i in PATH.parts[:PATH.parts.index("models")+2]]) # The +2 is to include the "models" and the individual model name in the path
try:
if "models" in PATH.parts:
PATH_MODEL = setup_model_paths(PATH)
except ValueError as e:
PATH_ENSEMBLE = None
elif "ensembles" in PATH.parts:
PATH_MODEL = None
logger.warning(e)

try:
PATH_ENSEMBLE = setup_ensemble_paths(PATH)
except ValueError as e:
PATH_ENSEMBLE = None
logger.warning(e)
else:
logger.error("The provided path does not contain a model or ensemble directory.")

PATH_DATA = PATH_MODEL / "data" if PATH_MODEL else PATH_ENSEMBLE / "data"
PATH_RAW = PATH_DATA / "raw"
Expand Down
7 changes: 5 additions & 2 deletions common_utils/utils_artifacts.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


def get_artifact_files(PATH, run_type):
"""
Retrieve artifact files from a directory that match the given run type and common extensions.
Expand Down Expand Up @@ -51,7 +54,7 @@ def get_latest_model_artifact(PATH, run_type):

#print statements for debugging
# print(f"artifacts availible: {model_files}")
print(f"artifact used: {model_files[0]}")
logger.info(f"artifact used: {model_files[0]}")

# Return the latest model file
#PATH_MODEL_ARTIFACT = os.path.join(path, model_files[0])
Expand Down
27 changes: 15 additions & 12 deletions common_utils/utils_dataloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import numpy as np
import pandas as pd
import logging

# from config_partitioner import get_partitioner_dict
from set_partition import get_partitioner_dict
Expand All @@ -10,6 +11,8 @@
from utils_df_to_vol_conversion import df_to_vol
from viewser import Queryset, Column

logger = logging.getLogger(__name__)


def fetch_data_from_viewser(month_first, month_last, drift_config_dict, self_test):
"""
Expand All @@ -20,7 +23,7 @@ def fetch_data_from_viewser(month_first, month_last, drift_config_dict, self_tes
Returns:
pd.DataFrame: The prepared DataFrame with initial processing done.
"""
print(f'Beginning file download through viewser with month range {month_first},{month_last}')
logger.info(f'Beginning file download through viewser with month range {month_first},{month_last}')
queryset_base = get_input_data_config() # just used here..
df, alerts = queryset_base.publish().fetch_with_drift_detection(start_date=month_first,
end_date=month_last - 1,
Expand Down Expand Up @@ -167,7 +170,7 @@ def get_views_df(partition, override_month=None, self_test=False):

if partition == 'forecasting' and override_month is not None:
month_last = override_month
print(f'\n ***Warning: overriding end month in forecasting partition to {month_last} ***\n')
logger.warning(f'Overriding end month in forecasting partition to {month_last} ***\n')

df, alerts = fetch_data_from_viewser(month_first, month_last, drift_config_dict, self_test)

Expand Down Expand Up @@ -203,15 +206,15 @@ def fetch_or_load_views_df(partition, PATH_RAW, self_test=False, use_saved=False
# Check if the VIEWSER data file exists
try:
df = pd.read_pickle(path_viewser_df)
print(f'Reading saved data from {path_viewser_df}')
logger.info(f'Reading saved data from {path_viewser_df}')

except:
raise RuntimeError(f'Use of saved data was specified but {path_viewser_df} not found')

else:
print(f'Fetching file...')
logger.info(f'Fetching file...')
df, alerts = get_views_df(partition, override_month, self_test) # which is then used here
print(f'Saving file to {path_viewser_df}')
logger.info(f'Saving file to {path_viewser_df}')
df.to_pickle(path_viewser_df)

if validate_df_partition(df, partition, override_month):
Expand Down Expand Up @@ -249,17 +252,17 @@ def create_or_load_views_vol(partition, PATH_PROCESSED, PATH_RAW):

# Check if the volume exists
if os.path.isfile(path_vol):
print('Volume already created')
logger.info('Volume already created')
vol = np.load(path_vol)
else:
print('Creating volume...')
logger.info('Creating volume...')
path_raw = os.path.join(str(PATH_RAW), f'{partition}_viewser_df.pkl')
vol = df_to_vol(pd.read_pickle(path_raw))
print(f'shape of volume: {vol.shape}')
print(f'Saving volume to {path_vol}')
logger.info(f'shape of volume: {vol.shape}')
logger.info(f'Saving volume to {path_vol}')
np.save(path_vol, vol)

print('Done')
logger.info('Done')

return vol

Expand Down Expand Up @@ -337,8 +340,8 @@ def ensure_float64(df):
df.select_dtypes(include=['number']).dtypes != np.float64]

if len(non_float64_cols) > 0:
print(
f"Warning: DataFrame contains non-np.float64 numeric columns. Converting the following columns: {', '.join(non_float64_cols)}")
logger.warning(
f"DataFrame contains non-np.float64 numeric columns. Converting the following columns: {', '.join(non_float64_cols)}")

for col in non_float64_cols:
df[col] = df[col].astype(np.float64)
Expand Down
33 changes: 33 additions & 0 deletions common_utils/utils_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import logging


def setup_logging(log_file: str, log_level: int = logging.INFO) -> logging.Logger:
    """
    Sets up logging to both a specified file and the terminal (console).

    Args:
        log_file (str): The file where logs should be written.
        log_level (int): The logging level. Default is logging.INFO.

    Returns:
        logging.Logger: The configured root logger, with exactly one file
        handler and one console handler attached.
    """

    basic_logger = logging.getLogger()
    basic_logger.setLevel(log_level)

    # Close and remove any previous handlers BEFORE creating new ones.
    # The original code cleared handlers without closing them, which leaks
    # the open log-file descriptor of any stale FileHandler each time this
    # function is called (e.g. once per pipeline entry point).
    if basic_logger.hasHandlers():
        for old_handler in list(basic_logger.handlers):
            old_handler.close()
            basic_logger.removeHandler(old_handler)

    file_handler = logging.FileHandler(log_file)
    console_handler = logging.StreamHandler()

    file_handler.setLevel(log_level)
    console_handler.setLevel(log_level)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    basic_logger.addHandler(file_handler)
    basic_logger.addHandler(console_handler)

    return basic_logger
28 changes: 10 additions & 18 deletions ensembles/cruel_summer/main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
import sys
import time
import wandb

import logging
logging.basicConfig(filename='run.log', encoding='utf-8', level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
import sys
import warnings

from pathlib import Path
PATH = Path(__file__)
Expand All @@ -14,22 +9,19 @@
from set_path import setup_project_paths
setup_project_paths(PATH)

from execute_model_runs import execute_single_run
from utils_cli_parser import parse_args, validate_arguments
from utils_logger import setup_logging
from execute_model_runs import execute_single_run

warnings.filterwarnings("ignore")

logger = setup_logging('run.log')

if __name__ == "__main__":
args = parse_args()
validate_arguments(args)

# wandb login
if __name__ == "__main__":
wandb.login()

start_t = time.time()
args = parse_args()
validate_arguments(args)

execute_single_run(args)

end_t = time.time()
minutes = (end_t - start_t) / 60
logger.info(f'Done. Runtime: {minutes:.3f} minutes.\n')

6 changes: 2 additions & 4 deletions ensembles/cruel_summer/src/forecasting/generate_forecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,7 @@
from datetime import datetime
import pandas as pd
import pickle

import logging
logging.basicConfig(filename='../../run.log', encoding='utf-8', level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

from pathlib import Path
PATH = Path(__file__)
Expand All @@ -21,6 +17,8 @@
from utils_run import get_standardized_df, get_aggregated_df
from utils_artifacts import get_latest_model_artifact

logger = logging.getLogger(__name__)


def forecast_ensemble(config):
run_type = config['run_type']
Expand Down
13 changes: 9 additions & 4 deletions ensembles/cruel_summer/src/management/execute_model_tasks.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import sys
import wandb

import logging
logging.basicConfig(filename='../../run.log', encoding='utf-8', level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
import time

from pathlib import Path
PATH = Path(__file__)
Expand All @@ -17,6 +14,8 @@
from generate_forecast import forecast_ensemble
from utils_wandb import add_wandb_monthly_metrics

logger = logging.getLogger(__name__)


def execute_model_tasks(config=None, project=None, eval=None, forecast=None):
"""
Expand All @@ -34,6 +33,8 @@ def execute_model_tasks(config=None, project=None, eval=None, forecast=None):
artifact_name (optional): Specific names of the model artifact to load for evaluation or forecasting.
"""

start_t = time.time()

# Initialize WandB
with wandb.init(project=project, entity="views_pipeline",
config=config): # project and config ignored when running a sweep
Expand All @@ -52,3 +53,7 @@ def execute_model_tasks(config=None, project=None, eval=None, forecast=None):
if forecast:
logger.info(f"Forecasting ensemble model {config['name']}...")
forecast_ensemble(config)

end_t = time.time()
minutes = (end_t - start_t) / 60
logger.info(f'Done. Runtime: {minutes:.3f} minutes.\n')
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
import sys
import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(filename='../../run.log', encoding='utf-8', level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

from pathlib import Path
PATH = Path(__file__)
Expand All @@ -23,6 +17,8 @@
from utils_wandb import log_wandb_log_dict
from views_forecasts.extensions import *

logger = logging.getLogger(__name__)


def evaluate_ensemble(config):
run_type = config['run_type']
Expand Down
7 changes: 3 additions & 4 deletions ensembles/cruel_summer/src/utils/utils_checks.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import sys
from datetime import datetime

import logging
logging.basicConfig(filename='../../run.log', encoding='utf-8', level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

from pathlib import Path
PATH = Path(__file__)
Expand All @@ -15,6 +11,9 @@

from utils_log_files import read_log_file

logger = logging.getLogger(__name__)


def check_model_conditions(PATH_GENERATED, config):
"""
Checks if the model meets the required conditions based on the log file.
Expand Down
4 changes: 2 additions & 2 deletions ensembles/cruel_summer/src/utils/utils_log_files.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from pathlib import Path
import logging
logging.basicConfig(filename='../../run.log', encoding='utf-8', level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')

logger = logging.getLogger(__name__)


def create_log_file(PATH_GENERATED,
config,
model_timestamp,
Expand Down
Loading
Loading