diff --git a/src/faivor/metrics/classification/__init__.py b/src/faivor/metrics/classification/__init__.py index e69de29..5295df3 100644 --- a/src/faivor/metrics/classification/__init__.py +++ b/src/faivor/metrics/classification/__init__.py @@ -0,0 +1,14 @@ +from .classification_metrics import ( + PERFORMANCE_METRICS as performance, + FAIRNESS_METRICS as fairness, + EXPLAINABILITY_METRICS as explainability +) + +class ClassificationMetrics: + def __init__(self): + self.performance = performance + self.fairness = fairness + self.explainability = explainability + +# Create an instance for easy access +metrics = ClassificationMetrics() \ No newline at end of file diff --git a/src/faivor/metrics/classification/classification_metrics.py b/src/faivor/metrics/classification/classification_metrics.py new file mode 100644 index 0000000..52946d4 --- /dev/null +++ b/src/faivor/metrics/classification/classification_metrics.py @@ -0,0 +1,3 @@ +from ..config_loader import load_metrics + +PERFORMANCE_METRICS, FAIRNESS_METRICS, EXPLAINABILITY_METRICS = load_metrics("classification/classification_metrics.yaml") \ No newline at end of file diff --git a/src/faivor/metrics/classification/classification_metrics.yaml b/src/faivor/metrics/classification/classification_metrics.yaml new file mode 100644 index 0000000..ed53fd8 --- /dev/null +++ b/src/faivor/metrics/classification/classification_metrics.yaml @@ -0,0 +1,100 @@ +performance: + - function_name: accuracy_score + regular_name: Accuracy Score + description: Accuracy classification score. + func: sklearn.metrics.accuracy_score + is_torch: false + - function_name: balanced_accuracy_score + regular_name: Balanced Accuracy Score + description: Balanced accuracy classification score. + func: sklearn.metrics.balanced_accuracy_score + is_torch: false + - function_name: average_precision_score + regular_name: Average Precision Score + description: Compute average precision (AP) from prediction scores. + func: sklearn.metrics.average_precision_score + is_torch: false + - function_name: f1_score + regular_name: F1 Score + description: F1 score, harmonic mean of precision and recall. + func: sklearn.metrics.f1_score + is_torch: false + - function_name: precision_score + regular_name: Precision Score + description: Precision classification score. + func: sklearn.metrics.precision_score + is_torch: false + - function_name: recall_score + regular_name: Recall Score + description: Recall classification score. + func: sklearn.metrics.recall_score + is_torch: false + - function_name: roc_auc_score + regular_name: ROC AUC Score + description: Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. + func: sklearn.metrics.roc_auc_score + is_torch: false + - function_name: jaccard_score + regular_name: Jaccard Score + description: Jaccard similarity coefficient. + func: sklearn.metrics.jaccard_score + is_torch: false + - function_name: log_loss + regular_name: Log Loss + description: Log loss, aka logistic regression loss or cross-entropy loss. + func: sklearn.metrics.log_loss + is_torch: false + - function_name: matthews_corrcoef + regular_name: Matthews Correlation Coefficient + description: Compute the Matthews correlation coefficient (MCC). + func: sklearn.metrics.matthews_corrcoef + is_torch: false + - function_name: brier_score_loss + regular_name: Brier Score Loss + description: Compute the Brier score loss. 
+ func: sklearn.metrics.brier_score_loss + is_torch: false + - function_name: top_k_accuracy_score + regular_name: Top K Accuracy Score + description: Top-k accuracy classification score. + func: sklearn.metrics.top_k_accuracy_score + is_torch: false + - function_name: roc_curve + regular_name: ROC Curve + description: Compute Receiver operating characteristic (ROC) curve. + func: sklearn.metrics.roc_curve + is_torch: false + - function_name: precision_recall_curve + regular_name: Precision Recall Curve + description: Compute precision-recall pairs for different probability thresholds. + func: sklearn.metrics.precision_recall_curve + is_torch: false + - function_name: hamming_loss + regular_name: Hamming Loss + description: Compute the average Hamming loss or Hamming distance between two sets of samples. + func: sklearn.metrics.hamming_loss + is_torch: false + - function_name: zero_one_loss + regular_name: Zero One Loss + description: Zero-one classification loss. + func: sklearn.metrics.zero_one_loss + is_torch: false + - function_name: confusion_matrix + regular_name: Confusion Matrix + description: Compute confusion matrix to evaluate the accuracy of a classification. + func: sklearn.metrics.confusion_matrix + is_torch: false + +fairness: + - function_name: disparate_impact + regular_name: Disparate Impact + description: Calculates the disparate impact for classification by comparing the rate of favorable outcomes for different groups. + func: faivor.metrics.classification.fairness.disparate_impact + is_torch: false + +explainability: + - function_name: prediction_entropy + regular_name: Prediction Entropy + description: Calculates the entropy of predictions to measure model uncertainty. + func: faivor.metrics.classification.explainability.prediction_entropy + is_torch: false \ No newline at end of file diff --git a/src/faivor/metrics/classification/explainability.py b/src/faivor/metrics/classification/explainability.py index e5bb2c4..fd99047 100644 --- a/src/faivor/metrics/classification/explainability.py +++ b/src/faivor/metrics/classification/explainability.py @@ -1,36 +1,41 @@ -from typing import List -from sklearn import metrics as skm - -__all__ = ["ClassificationExplainabilityMetrics"] - - -class ClassificationExplainabilityMetricsMeta(type): - """Metaclass for dynamically creating classification explainability metric classes.""" - - _WHITELISTED_METRICS: List[str] = [] # sklearn doesn't provide direct explainability metrics - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from skm metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - metric_function = getattr(skm, metric_name, None) - if metric_function: - def method_wrapper(self, y_true, y_pred, **kwargs): - return metric_function(y_true, y_pred, **kwargs) - dct[metric_name] = method_wrapper - return super().__new__(mcs, name, bases, dct) - - -class BaseClassificationExplainabilityMetrics: - """Base class for classification explainability metrics.""" - pass - - -class ClassificationExplainabilityMetrics(BaseClassificationExplainabilityMetrics, metaclass=ClassificationExplainabilityMetricsMeta): - """Class for classification explainability metrics.""" - - def custom_prediction_entropy(self, probas): - """Calculate the average entropy of prediction probabilities.""" - import numpy as np - probas = np.asarray(probas) - log_probs = np.log2(probas) - return -np.mean(np.sum(probas * log_probs, axis=1)) +import numpy as np +from scipy.stats import entropy + +def prediction_entropy(y_prob) -> float: + 
""" + Calculates the entropy of predictions for classification. + + Entropy is a measure of uncertainty. Higher entropy in predictions indicates + higher model uncertainty. This function computes the average entropy across all predictions. + + Parameters + ---------- + y_prob : array-like of shape (n_samples, n_classes) or (n_samples,) + The predicted probabilities for each class. Can be either: + - A 2D array of shape (n_samples, n_classes) where each row represents + the probability distribution over classes for a single sample. + - A 1D array of shape (n_samples,) for binary classification, representing + the probability of the positive class (class 1). + + Returns + ------- + float + The average prediction entropy. Returns np.nan if input is empty or invalid. + """ + y_prob = np.asarray(y_prob) + if y_prob.size == 0: + return np.nan + + if y_prob.ndim == 1: # assume binary classification and probabilities are for positive class + y_prob = np.vstack([1 - y_prob, y_prob]).T # create 2D prob array: [[p(class0), p(class1)], ...] + + if np.any(y_prob < 0) or np.any(y_prob > 1): + return np.nan # probabilities should be between 0 and 1 + + # Normalize probabilities to ensure they sum to 1 (handle potential rounding errors) + y_prob_normalized = y_prob / np.sum(y_prob, axis=1, keepdims=True) + + # Calculate entropy for each prediction + entropies = entropy(y_prob_normalized, axis=1) + + return np.mean(entropies) \ No newline at end of file diff --git a/src/faivor/metrics/classification/fairness.py b/src/faivor/metrics/classification/fairness.py index 93f4f19..fcea681 100644 --- a/src/faivor/metrics/classification/fairness.py +++ b/src/faivor/metrics/classification/fairness.py @@ -1,74 +1,55 @@ -from typing import List -from sklearn import metrics as skm -from torchmetrics import Accuracy, F1Score, Precision, Recall -import torch -__all__ = ["ClassificationFairnessMetrics"] - - -class ClassificationFairnessMetricsMeta(type): - """Metaclass for dynamically creating classification fairness metric classes.""" - - _WHITELISTED_METRICS: List[str] = [ - "accuracy_score", # useful for group fairness comparisons - ] - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from skm metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - metric_function = getattr(skm, metric_name, None) - if metric_function: - def method_wrapper(self, y_true, y_pred, **kwargs): - return metric_function(y_true, y_pred, **kwargs) - dct[metric_name] = method_wrapper - - for metric_name in ["accuracy", "f1_score", "precision", "recall"]: - if metric_name == "accuracy": - metric_class = Accuracy - elif metric_name == "f1_score": - metric_class = F1Score - elif metric_name == "precision": - metric_class = Precision - elif metric_name == "recall": - metric_class = Recall - - def torchmetrics_method_wrapper(self, y_true, y_pred, **kwargs): - metric = metric_class(task = "binary", **kwargs) - return metric( - torch.tensor(y_pred, dtype = torch.float32), - torch.tensor(y_true, dtype= torch.int), - ).detach().cpu().item() - dct[metric_name] = torchmetrics_method_wrapper - return super().__new__(mcs, name, bases, dct) - - -class BaseClassificationFairnessMetrics: - """Base class for classification fairness metrics.""" - pass - - -class ClassificationFairnessMetrics(BaseClassificationFairnessMetrics, metaclass=ClassificationFairnessMetricsMeta): - """Class for classification fairness metrics.""" - - def custom_disparate_impact(self, y_true, y_pred, sensitive_attribute): - """Calculates Disparate Impact for 
classification.""" - import numpy as np - y_true, y_pred, sensitive_attribute = np.asarray(y_true), np.asarray(y_pred), np.asarray(sensitive_attribute) - - unique_sensitive_values = np.unique(sensitive_attribute) - if len(unique_sensitive_values) < 2: - return np.nan - - group_positive_rates = [] - for value in unique_sensitive_values: - group_mask = sensitive_attribute == value - if group_mask.sum() == 0: - group_positive_rates.append(np.nan) - else: - group_positive_rates.append(np.mean(y_pred[group_mask] == np.max(y_pred))) # Assuming 1 is the positive class - - group_positive_rates = np.asarray(group_positive_rates) - if np.isnan(group_positive_rates).any(): - return np.nan - - return np.min(group_positive_rates) / np.max(group_positive_rates) - +import numpy as np + +def disparate_impact(y_true, y_pred, sensitive_attribute, favorable_outcome=1) -> float: + """ + Calculates Disparate Impact for classification. + + Disparate Impact (DI) is the ratio of the rate of favorable outcomes for the + disadvantaged group compared to the advantaged group. A common threshold for + concern is DI < 0.8, indicating potential adverse impact. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + The true target values (binary: 0 or 1). + y_pred : array-like of shape (n_samples,) + The predicted target values (binary: 0 or 1). + sensitive_attribute : array-like of shape (n_samples,) + The sensitive attribute values (categorical). + favorable_outcome : int or float, default=1 + The value representing the favorable outcome in y_true and y_pred. + + Returns + ------- + float + The disparate impact ratio. Returns np.nan if there's only one group or + if the advantaged group has no favorable outcomes. + """ + y_true, y_pred, sensitive_attribute = ( + np.asarray(y_true), + np.asarray(y_pred), + np.asarray(sensitive_attribute), + ) + + unique_sensitive_values = np.unique(sensitive_attribute) + if len(unique_sensitive_values) < 2: + return np.nan # Not applicable for less than 2 groups + + favorable_rates = {} + for value in unique_sensitive_values: + group_mask = sensitive_attribute == value + group_size = group_mask.sum() + if group_size == 0: + favorable_rates[value] = 0 # Handle empty groups to avoid division by zero later, assume 0 favorable rate + else: + favorable_outcomes_count = np.sum(y_pred[group_mask] == favorable_outcome) + favorable_rates[value] = favorable_outcomes_count / group_size + + rates = np.array(list(favorable_rates.values())) + min_rate = np.min(rates) + max_rate = np.max(rates) + + if max_rate == 0: # avoid division by zero if advantaged group has no favorable outcomes + return np.nan + + return min_rate / max_rate \ No newline at end of file diff --git a/src/faivor/metrics/classification/performance.py b/src/faivor/metrics/classification/performance.py index a3e45e7..84514d2 100644 --- a/src/faivor/metrics/classification/performance.py +++ b/src/faivor/metrics/classification/performance.py @@ -1,66 +1,8 @@ -import inspect -from typing import List from sklearn import metrics as skm +import numpy as np - -__all__ = ["ClassificationPerformanceMetrics"] - - -class ClassificationPerformanceMetricsMeta(type): - """Metaclass for dynamically creating classification performance metric classes.""" - - _WHITELISTED_METRICS: List[str] = [ - "accuracy_score", - "balanced_accuracy_score", - "average_precision_score", - "f1_score", - "precision_score", - "recall_score", - "roc_auc_score", - "jaccard_score", - "log_loss", - "matthews_corrcoef", - "brier_score_loss", - 
"top_k_accuracy_score", - "roc_curve", - "precision_recall_curve", - "hamming_loss", - "zero_one_loss", - "confusion_matrix" - ] - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from sklearn.metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - if hasattr(skm, metric_name): # Ensure the metric exists - dct[metric_name] = create_metric_wrapper(metric_name) - return super().__new__(mcs, name, bases, dct) - -def create_metric_wrapper(metric_name): - """Factory function to create a metric wrapper for the given metric name.""" - metric_function = getattr(skm, metric_name, None) - if metric_function is None: - raise ValueError(f"Metric '{metric_name}' not found in sklearn.metrics.") - - def method_wrapper(self, y_true, y_pred, **kwargs): - """Wrapper function for the metric.""" - return metric_function(y_true, y_pred, **kwargs) - - method_wrapper.__name__ = metric_name # Set the method name for clarity - method_wrapper.__doc__ = metric_function.__doc__ # Use the original docstring - return method_wrapper - -class BaseClassificationPerformanceMetrics: - """Base class for classification performance metrics.""" - pass - - -class ClassificationPerformanceMetrics(BaseClassificationPerformanceMetrics, metaclass=ClassificationPerformanceMetricsMeta): - """Class for classification performance metrics.""" - - def custom_error_rate(self, y_true, y_pred): - """Calculates custom error rate for classification.""" - import numpy as np - y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) - return 1 - skm.accuracy_score(y_true, y_pred) +def error_rate(y_true, y_pred): + """Calculates custom error rate for classification.""" + y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) + return 1 - skm.accuracy_score(y_true, y_pred) diff --git a/src/faivor/metrics/config_loader.py b/src/faivor/metrics/config_loader.py new file mode 100644 index 0000000..36881cb --- /dev/null +++ b/src/faivor/metrics/config_loader.py @@ -0,0 +1,40 @@ +import yaml +from importlib import import_module +from pathlib import Path +from faivor.metrics.metric import ModelMetric +from typing import List + +def load_metrics(yaml_filename: str) -> (List[ModelMetric], List[ModelMetric], List[ModelMetric]): + yaml_path = Path(__file__).parent / yaml_filename + with open(yaml_path, 'r') as f: + config = yaml.safe_load(f) + + performance = [] + fairness = [] + explainability = [] + + for category in ['performance', 'fairness', 'explainability']: + for metric_config in config.get(category, []): + # Resolve the function/class from string + func_str = metric_config['func'] + module_path, func_name = func_str.rsplit('.', 1) + module = import_module(module_path) + func = getattr(module, func_name) + + metric = ModelMetric( + function_name=metric_config['function_name'], + regular_name=metric_config['regular_name'], + description=metric_config['description'], + func=func, + is_torch=metric_config.get('is_torch', False), + torch_kwargs=metric_config.get('torch_kwargs', {}) + ) + + if category == 'performance': + performance.append(metric) + elif category == 'fairness': + fairness.append(metric) + elif category == 'explainability': + explainability.append(metric) + + return performance, fairness, explainability \ No newline at end of file diff --git a/src/faivor/metrics/metric.py b/src/faivor/metrics/metric.py new file mode 100644 index 0000000..abe16cd --- /dev/null +++ b/src/faivor/metrics/metric.py @@ -0,0 +1,24 @@ +from dataclasses import dataclass, field +from typing import Callable, Dict +from abc import ABC 
+import torch + +@dataclass +class ModelMetric(ABC): + function_name: str + regular_name: str + description: str + func: Callable + is_torch: bool = False + torch_kwargs: Dict = field(default_factory=dict) + + + def compute(self, y_true, y_pred, **kwargs) -> float: + """Compute the metric based on whether it's a Torch or Sklearn function.""" + if self.is_torch: + metric = self.func(**self.torch_kwargs) + return metric( + torch.tensor(y_pred, dtype=torch.float32), + torch.tensor(y_true, dtype=torch.float32), + ).detach().cpu().item() + return self.func(y_true, y_pred, **kwargs) \ No newline at end of file diff --git a/src/faivor/metrics/regression/__init__.py b/src/faivor/metrics/regression/__init__.py index e69de29..133d797 100644 --- a/src/faivor/metrics/regression/__init__.py +++ b/src/faivor/metrics/regression/__init__.py @@ -0,0 +1,15 @@ +# metrics/regression/__init__.py +from .regression_metrics import ( + PERFORMANCE_METRICS as performance, + FAIRNESS_METRICS as fairness, + EXPLAINABILITY_METRICS as explainability +) + +class RegressionMetrics: + def __init__(self): + self.performance = performance + self.fairness = fairness + self.explainability = explainability + +# Create an instance for easy access +metrics = RegressionMetrics() \ No newline at end of file diff --git a/src/faivor/metrics/regression/explainability.py b/src/faivor/metrics/regression/explainability.py index 8af010b..44790cf 100644 --- a/src/faivor/metrics/regression/explainability.py +++ b/src/faivor/metrics/regression/explainability.py @@ -1,37 +1,20 @@ -from typing import List -from sklearn import metrics as skm - -__all__ = ["RegressionExplainabilityMetrics"] - -class RegressionExplainabilityMetricsMeta(type): - """Metaclass for dynamically creating regression explainability metric classes.""" - _WHITELISTED_METRICS: List[str] = [] # No standard explainability metrics in sklearn directly. - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from skm metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - metric_function = getattr(skm, metric_name, None) - if metric_function: - def method_wrapper(self, y_true, y_pred, **kwargs): - return metric_function(y_true, y_pred, **kwargs) - dct[metric_name] = method_wrapper - return super().__new__(mcs, name, bases, dct) - - -class BaseRegressionExplainabilityMetrics: - """Base class for regression explainability metrics.""" - pass - - -class RegressionExplainabilityMetrics(BaseRegressionExplainabilityMetrics, metaclass=RegressionExplainabilityMetricsMeta): - """Class for regression explainability metrics.""" - - def custom_feature_importance_ratio(self, feature_importances): - """ - Calculate a ratio to assess feature importance - """ - import numpy as np - feature_importances = np.asarray(feature_importances) - if len(feature_importances) == 0: - return np.nan - return np.min(feature_importances)/np.max(feature_importances) +import numpy as np + +def feature_importance_ratio(feature_importances) -> float: + """ + Calculate a ratio to assess feature importance + + Parameters + ---------- + feature_importances : array-like of shape (n_features,) + The feature importances. + + Returns + ------- + float + The feature importance ratio. 
+ """ + feature_importances = np.asarray(feature_importances) + if len(feature_importances) == 0: + return np.nan + return np.min(feature_importances) / np.max(feature_importances) \ No newline at end of file diff --git a/src/faivor/metrics/regression/fairness.py b/src/faivor/metrics/regression/fairness.py index 71a4bf3..8f68b6a 100644 --- a/src/faivor/metrics/regression/fairness.py +++ b/src/faivor/metrics/regression/fairness.py @@ -1,71 +1,45 @@ -from typing import List -from sklearn import metrics as skm -from torchmetrics import MeanAbsoluteError, MeanSquaredError, MeanAbsolutePercentageError -import torch -__all__ = ["RegressionFairnessMetrics"] +import numpy as np + + +def demographic_parity_ratio(y_true, y_pred, sensitive_attribute) -> float: + """ + Calculates Demographic Parity Ratio for regression by comparing the average predicted values across different sensitive attribute groups. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + The true target values. + y_pred : array-like of shape (n_samples,) + The predicted target values. + sensitive_attribute : array-like of shape (n_samples,) + The sensitive attribute values. + + Returns + ------- + float + The demographic parity ratio. + """ + y_true, y_pred, sensitive_attribute = ( + np.asarray(y_true), + np.asarray(y_pred), + np.asarray(sensitive_attribute), + ) + + unique_sensitive_values = np.unique(sensitive_attribute) + if len(unique_sensitive_values) < 2: + return np.nan # not applicable for less than 2 groups + + group_means = [] + for value in unique_sensitive_values: + group_mask = sensitive_attribute == value + if group_mask.sum() == 0: + group_means.append(np.nan) # to handle potential nan group mean + else: + group_means.append(np.mean(y_pred[group_mask])) + + group_means = np.asarray(group_means) + if np.isnan(group_means).any(): + return np.nan # to handle nan group means + + return np.min(group_means) / np.max(group_means) - -class RegressionFairnessMetricsMeta(type): - """Metaclass for dynamically creating regression fairness metric classes.""" - - _WHITELISTED_METRICS: List[str] = [] # No standard fairness metrics in sklearn for regression, may need custom implementation - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from skm metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - metric_function = getattr(skm, metric_name, None) - if metric_function: - def method_wrapper(self, y_true, y_pred, **kwargs): - return metric_function(y_true, y_pred, **kwargs) - dct[metric_name] = method_wrapper - - for metric_name in ["mean_absolute_error", "mean_squared_error", "mean_absolute_percentage_error"]: - if metric_name == "mean_absolute_error": - metric_class = MeanAbsoluteError - elif metric_name == "mean_squared_error": - metric_class = MeanSquaredError - elif metric_name == "mean_absolute_percentage_error": - metric_class = MeanAbsolutePercentageError - - def torchmetrics_method_wrapper(self, y_true, y_pred, **kwargs): - metric = metric_class(**kwargs) - return metric( - torch.tensor(y_pred, dtype = torch.float32), - torch.tensor(y_true, dtype= torch.float32), - ).detach().cpu().item() - dct[metric_name] = torchmetrics_method_wrapper - return super().__new__(mcs, name, bases, dct) - - -class BaseRegressionFairnessMetrics: - """Base class for regression fairness metrics.""" - pass - - -class RegressionFairnessMetrics(BaseRegressionFairnessMetrics, metaclass=RegressionFairnessMetricsMeta): - """Class for regression fairness metrics.""" - - def 
custom_demographic_parity_ratio(self, y_true, y_pred, sensitive_attribute): - """ - Calculates Demographic Parity Ratio for regression - """ - import numpy as np - y_true, y_pred, sensitive_attribute = np.asarray(y_true), np.asarray(y_pred), np.asarray(sensitive_attribute) - - unique_sensitive_values = np.unique(sensitive_attribute) - if len(unique_sensitive_values) < 2: - return np.nan # not applicable for less than 2 groups - - group_means = [] - for value in unique_sensitive_values: - group_mask = sensitive_attribute == value - if group_mask.sum() == 0: - group_means.append(np.nan) # to handle potential nan group mean - else: - group_means.append(np.mean(y_pred[group_mask])) - - group_means = np.asarray(group_means) - if np.isnan(group_means).any(): - return np.nan # to handle nan group means - - return np.min(group_means) / np.max(group_means) diff --git a/src/faivor/metrics/regression/performance.py b/src/faivor/metrics/regression/performance.py index b59a3c2..bbf4000 100644 --- a/src/faivor/metrics/regression/performance.py +++ b/src/faivor/metrics/regression/performance.py @@ -1,62 +1,22 @@ -import inspect -from typing import List -from sklearn import metrics as skm - -__all__ = ["RegressionPerformanceMetrics"] - - -class RegressionPerformanceMetricsMeta(type): - """Metaclass for dynamically creating regression performance metric classes.""" - - _WHITELISTED_METRICS: List[str] = [ - "mean_absolute_error", - "mean_squared_error", - "mean_squared_log_error", - "median_absolute_error", - "r2_score", - "explained_variance_score", - "max_error", - "mean_poisson_deviance", - "mean_gamma_deviance", - "d2_absolute_error_score", - "mean_pinball_loss" - ] - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from sklearn.metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - if hasattr(skm, metric_name): # Ensure the metric exists - dct[metric_name] = create_metric_wrapper(metric_name) - return super().__new__(mcs, name, bases, dct) - -def create_metric_wrapper(metric_name): - """Factory function to create a metric wrapper for the given metric name.""" - metric_function = getattr(skm, metric_name, None) - if metric_function is None: - raise ValueError(f"Metric '{metric_name}' not found in sklearn.metrics.") - - def method_wrapper(self, y_true, y_pred, **kwargs): - """Wrapper function for the metric.""" - return metric_function(y_true, y_pred, **kwargs) - - method_wrapper.__name__ = metric_name # Set the method name for clarity - method_wrapper.__doc__ = metric_function.__doc__ # Use the original docstring - return method_wrapper - -class BaseRegressionPerformanceMetrics: - """Base class for regression performance metrics.""" - pass - - -class RegressionPerformanceMetrics(BaseRegressionPerformanceMetrics, metaclass=RegressionPerformanceMetricsMeta): - """Class for regression performance metrics.""" - - def custom_mean_percentage_error(self, y_true, y_pred): - """Calculates Mean Percentage Error for regression.""" - import numpy as np - y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) - non_zero_mask = y_true != 0 - if non_zero_mask.sum() == 0: - return np.nan # to avoid division by 0 - return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100 - +import numpy as np + +def mean_percentage_error(y_true, y_pred) -> float: + """ + Calculates Mean Percentage Error for regression. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + The true target values. 
+    y_pred : array-like of shape (n_samples,)
+        The predicted target values.
+
+    Returns
+    -------
+    float
+        The mean absolute percentage error (in percent), computed over samples whose true value is non-zero.
+    """
+    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
+    non_zero_mask = y_true != 0
+    if non_zero_mask.sum() == 0:
+        return np.nan # Avoid division by zero
+    return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
\ No newline at end of file
diff --git a/src/faivor/metrics/regression/regression_metrics.py b/src/faivor/metrics/regression/regression_metrics.py
new file mode 100644
index 0000000..f9307ab
--- /dev/null
+++ b/src/faivor/metrics/regression/regression_metrics.py
@@ -0,0 +1,3 @@
+from ..config_loader import load_metrics
+
+PERFORMANCE_METRICS, FAIRNESS_METRICS, EXPLAINABILITY_METRICS = load_metrics("regression/regression_metrics.yaml")
\ No newline at end of file
diff --git a/src/faivor/metrics/regression/regression_metrics.yaml b/src/faivor/metrics/regression/regression_metrics.yaml
new file mode 100644
index 0000000..a25e668
--- /dev/null
+++ b/src/faivor/metrics/regression/regression_metrics.yaml
@@ -0,0 +1,75 @@
+performance:
+  - function_name: mean_absolute_error
+    regular_name: Mean Absolute Error
+    description: The mean of the absolute errors between true and predicted values.
+    func: sklearn.metrics.mean_absolute_error
+    is_torch: false
+  - function_name: mean_squared_error
+    regular_name: Mean Squared Error
+    description: The mean of the squared errors between true and predicted values.
+    func: sklearn.metrics.mean_squared_error
+    is_torch: false
+  - function_name: mean_squared_log_error
+    regular_name: Mean Squared Logarithmic Error
+    description: Regression loss using the log of true and predicted values.
+    func: sklearn.metrics.mean_squared_log_error
+    is_torch: false
+  - function_name: median_absolute_error
+    regular_name: Median Absolute Error
+    description: The median of the absolute errors between true and predicted values.
+    func: sklearn.metrics.median_absolute_error
+    is_torch: false
+  - function_name: r2_score
+    regular_name: R² Score
+    description: The coefficient of determination regression score.
+    func: sklearn.metrics.r2_score
+    is_torch: false
+  - function_name: explained_variance_score
+    regular_name: Explained Variance Score
+    description: Measures the proportion of variance explained by the model.
+    func: sklearn.metrics.explained_variance_score
+    is_torch: false
+  - function_name: max_error
+    regular_name: Max Error
+    description: The maximum absolute difference between true and predicted values.
+    func: sklearn.metrics.max_error
+    is_torch: false
+  - function_name: mean_poisson_deviance
+    regular_name: Mean Poisson Deviance
+    description: Mean Poisson deviance regression loss.
+    func: sklearn.metrics.mean_poisson_deviance
+    is_torch: false
+  - function_name: mean_gamma_deviance
+    regular_name: Mean Gamma Deviance
+    description: Mean gamma deviance regression loss.
+    func: sklearn.metrics.mean_gamma_deviance
+    is_torch: false
+  - function_name: d2_absolute_error_score
+    regular_name: D² Absolute Error Score
+    description: The proportion of variance explained using absolute errors.
+    func: sklearn.metrics.d2_absolute_error_score
+    is_torch: false
+  - function_name: mean_pinball_loss
+    regular_name: Mean Pinball Loss
+    description: The mean pinball loss for quantile regression.
+ func: sklearn.metrics.mean_pinball_loss + is_torch: false + - function_name: mean_percentage_error + regular_name: Mean Percentage Error + description: Calculates the mean percentage error for regression, ignoring zero true values. + func: faivor.metrics.regression.performance.mean_percentage_error + is_torch: false + +fairness: + - function_name: demographic_parity_ratio + regular_name: Custom Demographic Parity Ratio + description: Calculates the demographic parity ratio for regression by comparing the average predicted values across different sensitive attribute groups. + func: faivor.metrics.regression.fairness.demographic_parity_ratio + is_torch: false + +explainability: + - function_name: feature_importance_ratio + regular_name: Feature Importance Ratio + description: Calculates the ratio of feature importance for regression explainability. + func: faivor.metrics.regression.explainability.feature_importance_ratio + is_torch: false \ No newline at end of file diff --git a/tests/faivor/metrics/classification/test_explainability.py b/tests/faivor/metrics/classification/test_explainability.py index f0ca9e8..a4f790b 100644 --- a/tests/faivor/metrics/classification/test_explainability.py +++ b/tests/faivor/metrics/classification/test_explainability.py @@ -1,28 +1,101 @@ import pytest import numpy as np -import torch -from faivor.metrics.classification.explainability import ( - ClassificationExplainabilityMetrics, -) - -metrics = ClassificationExplainabilityMetrics() - -# Sample Classification Data -y_true_class = np.array([0, 1, 1, 0, 1, 0]) -y_pred_class = np.array([0, 1, 0, 0, 1, 1]) -probabilities_class = np.array( - [[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.4, 0.6], [0.7, 0.3], [0.2, 0.8]] -) - - -def test_all_explainability_metrics(): - for name in dir(metrics): - if not name.startswith("_") and callable(getattr(metrics, name)): - try: - if name == "custom_prediction_entropy": - result = getattr(metrics, name)(probabilities_class) - else: - result = getattr(metrics, name)(y_true_class, y_pred_class) - assert result is not None, f"Metric {name} returned None" - except Exception as e: - pytest.fail(f"Metric {name} raised an exception: {e}") + +from faivor.metrics.classification.explainability import prediction_entropy, confidence_score, margin_of_confidence + +# Sample probability data (reused for all tests) +y_prob = np.array([ + [0.1, 0.9], # confident prediction + [0.5, 0.5], # uncertain prediction + [0.9, 0.1], # confident prediction + [0.3, 0.7] # moderately confident prediction +]) +y_prob_1d = np.array([0.1, 0.5, 0.9, 0.3]) # 1D probabilities for binary case + + +def test_prediction_entropy(): + result = prediction_entropy(y_prob) + assert result is not None, "Prediction entropy returned None" + assert not np.isnan(result), "Prediction entropy should not return NaN for valid input" + + result_1d = prediction_entropy(y_prob_1d) + assert result_1d is not None, "Prediction entropy with 1D prob array should not return None" + assert not np.isnan(result_1d), "Prediction entropy with 1D prob array should not return NaN" + + y_prob_empty = np.array([]) + result_empty = prediction_entropy(y_prob_empty) + assert np.isnan(result_empty), "Prediction entropy with empty array should return NaN" + + y_prob_invalid = np.array([ + [0.1, 1.2], # invalid prob + [0.5, 0.5] + ]) + result_invalid = prediction_entropy(y_prob_invalid) + assert np.isnan(result_invalid), "Prediction entropy with invalid probabilities should return NaN" + + y_prob_single_class = np.array([ + [1.0, 0.0], + [1.0, 0.0] + ]) 
+ result_single_class = prediction_entropy(y_prob_single_class) + assert np.allclose(result_single_class, 0.0), "Prediction entropy with single class should be 0" + + y_prob_uniform = np.array([ + [0.5, 0.5], + [0.5, 0.5] + ]) + result_uniform = prediction_entropy(y_prob_uniform) + expected_uniform_entropy = - (0.5 * np.log(0.5) + 0.5 * np.log(0.5)) # entropy for [0.5, 0.5] using natural log + assert np.allclose(result_uniform, expected_uniform_entropy), "Prediction entropy with uniform probabilities should be max entropy" + + +def test_confidence_score(): + result = confidence_score(y_prob) + assert result is not None, "Confidence score returned None" + assert not np.isnan(result), "Confidence score should not return NaN for valid input" + + result_1d = confidence_score(y_prob_1d) + assert result_1d is not None, "Confidence score with 1D prob array should not return None" + assert not np.isnan(result_1d), "Confidence score with 1D prob array should not return NaN" + + y_prob_empty = np.array([]) + result_empty = confidence_score(y_prob_empty) + assert np.isnan(result_empty), "Confidence score with empty array should return NaN" + + y_prob_invalid = np.array([ + [0.1, 1.2], # invalid prob + [0.5, 0.5] + ]) + result_invalid = confidence_score(y_prob_invalid) + assert np.isnan(result_invalid), "Confidence score with invalid probabilities should return NaN" + + expected_confidence = np.mean([0.9, 0.5, 0.9, 0.7]) # average of max probabilities + assert np.allclose(result, expected_confidence), "Confidence score calculation incorrect" + + +def test_margin_of_confidence(): + result = margin_of_confidence(y_prob) + assert result is not None, "Margin of confidence returned None" + assert not np.isnan(result), "Margin of confidence should not return NaN for valid input for binary case" + + result_1d = margin_of_confidence(y_prob_1d) + assert result_1d is not None, "Margin of confidence with 1D prob array should not return None" + assert not np.isnan(result_1d), "Margin of confidence with 1D prob array should not return NaN" + + y_prob_empty = np.array([]) + result_empty = margin_of_confidence(y_prob_empty) + assert np.isnan(result_empty), "Margin of confidence with empty array should return NaN" + + y_prob_invalid = np.array([ + [0.1, 1.2], # invalid prob + [0.5, 0.5] + ]) + result_invalid = margin_of_confidence(y_prob_invalid) + assert np.isnan(result_invalid), "Margin of confidence with invalid probabilities should return NaN" + + y_prob_multiclass = np.array([[0.1, 0.2, 0.7], [0.3, 0.3, 0.4]]) # multiclass + result_multiclass = margin_of_confidence(y_prob_multiclass) + assert np.isnan(result_multiclass), "Margin of confidence with multiclass should return NaN" + + expected_margin = np.mean([np.abs(0.9 - 0.1), np.abs(0.5 - 0.5), np.abs(0.1 - 0.9), np.abs(0.7 - 0.3)]) # average of margins + assert np.allclose(result, expected_margin), "Margin of confidence calculation incorrect" \ No newline at end of file diff --git a/tests/faivor/metrics/classification/test_fairness.py b/tests/faivor/metrics/classification/test_fairness.py index 285fbac..9320ef1 100644 --- a/tests/faivor/metrics/classification/test_fairness.py +++ b/tests/faivor/metrics/classification/test_fairness.py @@ -1,24 +1,67 @@ import pytest import numpy as np -import torch -from faivor.metrics.classification.fairness import ClassificationFairnessMetrics - - -# Sample Classification Data -y_true_class = np.array([0, 1, 1, 0, 1, 0, 1, 0]) -y_pred_class = np.array([0, 1, 0, 0, 1, 1, 1, 0]) -sensitive_attribute_class = np.array([0, 1, 0, 1, 
0, 1, 0, 1])
-
-metrics = ClassificationFairnessMetrics()
-
-def test_all_fairness_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_disparate_impact":
-                    result = getattr(metrics, name)(y_true_class, y_pred_class, sensitive_attribute_class)
-                else:
-                    result = getattr(metrics, name)(y_true_class, y_pred_class)
-                assert result is not None, f"Metric {name} returned None"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
\ No newline at end of file
+
+from faivor.metrics.classification.fairness import disparate_impact, statistical_parity_difference, equal_opportunity_difference
+
+# sample classification data (reused for all fairness tests)
+y_true_clf = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 1])
+y_pred_clf = np.array([1, 1, 0, 1, 0, 1, 1, 0, 1, 0])
+sensitive_attribute = np.array(['A', 'A', 'B', 'B', 'A', 'B', 'A', 'B', 'A', 'B'])
+
+
+def test_disparate_impact():
+    result = disparate_impact(y_true_clf, y_pred_clf, sensitive_attribute)
+    assert result is not None, "Disparate impact returned None"
+    assert not np.isnan(result), "Disparate impact should not return NaN for valid input"
+
+    sensitive_attribute_single_group = np.array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'])
+    result_single_group = disparate_impact(y_true_clf, y_pred_clf, sensitive_attribute_single_group)
+    assert np.isnan(result_single_group), "Disparate impact with single group should return NaN"
+
+    y_pred_no_favorable = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) # no favorable outcomes predicted
+    result_no_favorable = disparate_impact(y_true_clf, y_pred_no_favorable, sensitive_attribute)
+    assert np.isnan(result_no_favorable), "Disparate impact should return NaN when no favorable outcomes are predicted for any group"
+
+    sensitive_attribute_empty_group = np.array(['A', 'A', 'B', 'B', 'A', 'B', 'A', 'B', 'A', 'C']) # group C has only one sample
+    result_empty_group = disparate_impact(y_true_clf, y_pred_clf, sensitive_attribute_empty_group)
+    assert not np.isnan(result_empty_group), "Disparate impact with an underrepresented group should not return NaN if there are other groups"
+
+    y_pred_all_favorable = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    result_all_favorable = disparate_impact(y_true_clf, y_pred_all_favorable, sensitive_attribute)
+    assert np.allclose(result_all_favorable, 1.0), "Disparate impact should be 1.0 when all groups have 100% favorable outcome rate"
+
+
+def test_statistical_parity_difference():
+    result = statistical_parity_difference(y_true_clf, y_pred_clf, sensitive_attribute)
+    assert result is not None, "Statistical parity difference returned None"
+    assert not np.isnan(result), "Statistical parity difference should not return NaN for valid input"
+
+    sensitive_attribute_single_group = np.array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'])
+    result_single_group = statistical_parity_difference(y_true_clf, y_pred_clf, sensitive_attribute_single_group)
+    assert np.isnan(result_single_group), "Statistical parity difference with single group should return NaN"
+
+    # Predictions constructed so both groups have the same favorable-outcome rate:
+    # group A (indices 0, 1, 4, 6, 8) and group B (indices 2, 3, 5, 7, 9) each get 2/5 favorable predictions.
+    y_pred_same_rate = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
+    result_same_rate = statistical_parity_difference(y_true_clf, y_pred_same_rate, sensitive_attribute)
+    assert np.allclose(result_same_rate, 0.0), "Statistical parity difference should be 0 when favorable rates are equal"
+
+def test_equal_opportunity_difference():
+    result = equal_opportunity_difference(y_true_clf, y_pred_clf, sensitive_attribute)
+    assert result is not None, "Equal opportunity difference returned None"
+    assert not np.isnan(result), "Equal opportunity difference should not return NaN for valid input"
+
+    sensitive_attribute_single_group = np.array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'])
+    result_single_group = equal_opportunity_difference(y_true_clf, y_pred_clf, sensitive_attribute_single_group)
+    assert np.isnan(result_single_group), "Equal opportunity difference with single group should return NaN"
+
+    y_pred_equal_opportunity = np.array([1, 0, 1, 1, 1, 1, 0, 0, 1, 1]) # equal TPR (1.0) for both groups (among true positives)
+    result_equal_opportunity = equal_opportunity_difference(y_true_clf, y_pred_equal_opportunity, sensitive_attribute)
+    assert np.allclose(result_equal_opportunity, 0.0), "Equal opportunity difference should be 0 when TPRs are equal"
+
+    y_true_no_positives = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    result_no_positives = equal_opportunity_difference(y_true_no_positives, y_pred_clf, sensitive_attribute)
+    assert np.isnan(result_no_positives), "Equal opportunity difference should return NaN if no true positives in any group"
\ No newline at end of file
diff --git a/tests/faivor/metrics/classification/test_performance.py b/tests/faivor/metrics/classification/test_performance.py
index 6e66235..a823426 100644
--- a/tests/faivor/metrics/classification/test_performance.py
+++ b/tests/faivor/metrics/classification/test_performance.py
@@ -1,28 +1,55 @@
 import pytest
 import numpy as np
-import torch
 from sklearn import metrics as skm
-from faivor.metrics.classification.performance import ClassificationPerformanceMetrics
+from faivor.metrics.classification import metrics
 
-# Sample Classification Data
-y_true_class = np.array([0, 1, 1, 0, 1, 0])
-y_pred_class = np.array([0, 1, 0, 0, 1, 1])
-
-metrics = ClassificationPerformanceMetrics()
+# sample classification data
+y_true_clf = np.array([1, 0, 1, 1, 0, 1])
+y_pred_clf = np.array([1, 1, 0, 1, 0, 1])
+y_prob_clf = np.array([
+    [0.1, 0.9],
+    [0.8, 0.2],
+    [0.3, 0.7],
+    [0.2, 0.8],
+    [0.6, 0.4],
+    [0.4, 0.6]
+]) # probabilities for binary classification
 
 def test_all_performance_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_error_rate":
-                    result = getattr(metrics, name)(y_true_class, y_pred_class)
-                elif name == "accuracy_score":
-                    result = getattr(metrics, name)(y_true_class, y_pred_class)
-                    assert result == skm.accuracy_score(y_true_class, y_pred_class)
+    sklearn_metrics_to_compare = {
+        "accuracy_score": skm.accuracy_score,
+        "f1_score": skm.f1_score,
+        "precision_score": skm.precision_score,
+        "recall_score": skm.recall_score,
+        "roc_auc_score": skm.roc_auc_score,
+        "log_loss": skm.log_loss,
+        "balanced_accuracy_score": skm.balanced_accuracy_score,
+        "top_k_accuracy_score": skm.top_k_accuracy_score
+    } # metrics with a direct sklearn counterpart to compare against
+
+    for metric in metrics.performance: # loop through all the performance metrics we loaded
+        try:
+            # Calculate the metric using the defined function
+            if metric.function_name in ["roc_auc_score", "average_precision_score", "log_loss", "brier_score_loss", "top_k_accuracy_score"]:
+                result = metric.compute(y_true_clf, y_prob_clf[:, 1]) # these need probability scores; use the probability of the positive class for binary
+            elif metric.function_name in ["roc_curve", "precision_recall_curve", "confusion_matrix"]:
+                result = metric.compute(y_true_clf, y_pred_clf) # these return arrays or matrices, not single values; just check they run
+                assert result is not None
+                continue # no numerical comparison for these
+            else:
+                result = metric.compute(y_true_clf, y_pred_clf) # for most metrics, just compute with true and predicted values
+
+            assert result is not None, f"Metric {metric.regular_name} returned None"
+
+            # Compare with sklearn metric if applicable
+            if metric.function_name in sklearn_metrics_to_compare:
+                sklearn_func = sklearn_metrics_to_compare[metric.function_name]
+                if metric.function_name in ["roc_auc_score", "log_loss", "average_precision_score", "top_k_accuracy_score"]:
+                    sklearn_result = sklearn_func(y_true_clf, y_prob_clf[:, 1]) # use the same 1D probabilities for sklearn
                 else:
-                    result = getattr(metrics, name)(y_true_class, y_pred_class)
-                    assert result is not None, f"Metric {name} returned None"
+                    sklearn_result = sklearn_func(y_true_clf, y_pred_clf)
+                assert np.allclose(result, sklearn_result, atol=1e-5), f"Metric {metric.regular_name} result does not match sklearn"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
\ No newline at end of file
+        except Exception as e:
+            pytest.fail(f"Metric {metric.regular_name} raised an exception: {e}")
\ No newline at end of file
diff --git a/tests/faivor/metrics/regression/test_explainability.py b/tests/faivor/metrics/regression/test_explainability.py
index dd964c2..9bbd8a6 100644
--- a/tests/faivor/metrics/regression/test_explainability.py
+++ b/tests/faivor/metrics/regression/test_explainability.py
@@ -1,24 +1,21 @@
 import pytest
 import numpy as np
-import torch
-from faivor.metrics.regression.explainability import RegressionExplainabilityMetrics
-# Sample Regression Data
-feature_importances_reg = np.array([0.1, 0.2, 0.7, 0.05, 0.05])
-y_true_reg = np.array([3, -0.5, 2, 7])
-y_pred_reg = np.array([2.5, 0.0, 2.1, 7.8])
-metrics = RegressionExplainabilityMetrics()
+from faivor.metrics.regression.explainability import feature_importance_ratio
+def test_feature_importance_ratio():
+    feature_importances = np.array([0.1, 0.2, 0.5, 0.2])
+    result = feature_importance_ratio(feature_importances)
+    assert result is not None, "Feature importance ratio returned None"
-def test_all_explainability_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_feature_importance_ratio":
-                    result = getattr(metrics, name)(feature_importances_reg)
-                else:
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg)
-                assert result is not None, f"Metric {name} returned None"
+    feature_importances_empty = np.array([])
+    result_empty = feature_importance_ratio(feature_importances_empty)
+    assert np.isnan(result_empty), "Feature importance ratio with empty array should return NaN"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
\ No newline at end of file
+    feature_importances_single = np.array([0.5])
+    result_single = feature_importance_ratio(feature_importances_single)
+    assert result_single == 1.0, "Feature importance ratio with single value should return 1.0"
+
+    feature_importances_equal = np.array([0.3, 0.3, 0.3])
+    result_equal = feature_importance_ratio(feature_importances_equal)
+    assert result_equal == 1.0, "Feature importance ratio with equal values should return 1.0"
\ No newline at end of file
diff --git a/tests/faivor/metrics/regression/test_fairness.py b/tests/faivor/metrics/regression/test_fairness.py
index addb342..5f56f55 100644
--- a/tests/faivor/metrics/regression/test_fairness.py
+++ b/tests/faivor/metrics/regression/test_fairness.py
@@ -1,24 +1,25 @@
 import pytest
 import numpy as np
-import torch
-from faivor.metrics.regression.fairness import RegressionFairnessMetrics
+from faivor.metrics.regression.fairness import demographic_parity_ratio
-# Sample Regression Data (Same as in original test.py, but smaller and more suitable for unit tests)
-y_true_reg = np.array([3, -0.5, 2, 7, 4.2, 1, 9])
-y_pred_reg = np.array([2.5, 0.0, 2.1, 7.8, 3.9, 1.1, 8.5])
-sensitive_attribute_reg = np.array([0, 1, 0, 1, 0, 1, 0])
+# sample regression data
+y_true_reg = np.array([3, 0.5, 2, 7, 4.2, 1])
+y_pred_reg = np.array([2.5, 0.01, 2.1, 7.8, 3.9, 1.1])
-metrics = RegressionFairnessMetrics()
+def test_demographic_parity_ratio():
+    sensitive_attribute = np.array(['A', 'A', 'B', 'B', 'A', 'B'])
+    result = demographic_parity_ratio(y_true_reg, y_pred_reg, sensitive_attribute)
+    assert result is not None, "Demographic parity ratio returned None"
-def test_all_fairness_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_demographic_parity_ratio":
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg, sensitive_attribute_reg)
-                else:
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg)
-                assert result is not None, f"Metric {name} returned None"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
\ No newline at end of file
+    sensitive_attribute_single_group = np.array(['A', 'A', 'A', 'A', 'A', 'A'])
+    result_single_group = demographic_parity_ratio(y_true_reg, y_pred_reg, sensitive_attribute_single_group)
+    assert np.isnan(result_single_group), "Demographic parity ratio with single group should return NaN"
+
+    sensitive_attribute_empty_group = np.array(['A', 'A', 'B', 'B', 'A', 'C']) # group C has only one sample
+    result_empty_group = demographic_parity_ratio(y_true_reg, y_pred_reg, sensitive_attribute_empty_group)
+    assert not np.isnan(result_empty_group), "Demographic parity ratio with an underrepresented group should not return NaN if there are other groups"
+
+    y_pred_all_nan = np.array([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
+    result_nan_pred = demographic_parity_ratio(y_true_reg, y_pred_all_nan, sensitive_attribute)
+    assert np.isnan(result_nan_pred), "Demographic parity ratio with nan predictions should return NaN"
\ No newline at end of file
diff --git a/tests/faivor/metrics/regression/test_performance.py b/tests/faivor/metrics/regression/test_performance.py
index 236cf2d..2bdf679 100644
--- a/tests/faivor/metrics/regression/test_performance.py
+++ b/tests/faivor/metrics/regression/test_performance.py
@@ -1,28 +1,33 @@
 import pytest
 import numpy as np
-import torch
 from sklearn import metrics as skm
-from faivor.metrics.regression.performance import RegressionPerformanceMetrics
-# Sample Regression Data
+from faivor.metrics.regression import metrics
+
+# sample regression data
 y_true_reg = np.array([3, 0.5, 2, 7, 4.2, 1])
 y_pred_reg = np.array([2.5, 0.01, 2.1, 7.8, 3.9, 1.1])
-metrics = RegressionPerformanceMetrics()
-
 def test_all_performance_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_mean_percentage_error":
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg)
-                elif name == "mean_pinball_loss": # The metric is currently using the last _WHITELISTED_METRICS
-                    assert getattr(metrics, name)(y_true_reg, y_pred_reg) == skm.mean_pinball_loss(y_true_reg, y_pred_reg)
-                elif name == "r2_score":
-                    assert getattr(metrics, name)(y_true_reg, y_pred_reg) == skm.r2_score(y_true_reg, y_pred_reg)
-                else:
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg)
-                assert result is not None, f"Metric {name} returned None"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
+    sklearn_metrics_to_compare = {
+        "mean_absolute_error": skm.mean_absolute_error,
+        "mean_squared_error": skm.mean_squared_error,
+        "r2_score": skm.r2_score,
+    } # metrics with a direct sklearn counterpart to compare against
+
+    for metric in metrics.performance: # loop through all the performance metrics we loaded
+        try:
+            # Calculate the metric using the defined function
+            result = metric.compute(y_true_reg, y_pred_reg) # every regression performance metric here takes (y_true, y_pred)
+
+            assert result is not None, f"Metric {metric.regular_name} returned None"
+
+            # Compare with sklearn metric if applicable
+            if metric.function_name in sklearn_metrics_to_compare:
+                sklearn_func = sklearn_metrics_to_compare[metric.function_name]
+                sklearn_result = sklearn_func(y_true_reg, y_pred_reg)
+                assert np.allclose(result, sklearn_result), f"Metric {metric.regular_name} result does not match sklearn"
+
+        except Exception as e:
+            pytest.fail(f"Metric {metric.regular_name} raised an exception: {e}")
\ No newline at end of file
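
Note on the classification explainability tests: tests/faivor/metrics/classification/test_explainability.py imports confidence_score and margin_of_confidence from faivor.metrics.classification.explainability, but the explainability.py hunk in this diff only adds prediction_entropy. A minimal sketch of what those two helpers could look like, inferred from the behaviour the tests assert; the names, signatures, and NaN conventions mirror prediction_entropy and are assumptions, not part of this diff:

import numpy as np

def confidence_score(y_prob) -> float:
    # Hypothetical helper: average of the highest predicted probability per sample.
    y_prob = np.asarray(y_prob)
    if y_prob.size == 0:
        return np.nan
    if y_prob.ndim == 1:  # binary case: probabilities of the positive class
        y_prob = np.vstack([1 - y_prob, y_prob]).T
    if np.any(y_prob < 0) or np.any(y_prob > 1):
        return np.nan
    return float(np.mean(np.max(y_prob, axis=1)))

def margin_of_confidence(y_prob) -> float:
    # Hypothetical helper: average absolute margin between the two class probabilities (binary only).
    y_prob = np.asarray(y_prob)
    if y_prob.size == 0:
        return np.nan
    if y_prob.ndim == 1:
        y_prob = np.vstack([1 - y_prob, y_prob]).T
    if y_prob.shape[1] != 2:  # the tests expect NaN for multiclass input
        return np.nan
    if np.any(y_prob < 0) or np.any(y_prob > 1):
        return np.nan
    return float(np.mean(np.abs(y_prob[:, 1] - y_prob[:, 0])))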
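Likewise, tests/faivor/metrics/classification/test_fairness.py imports statistical_parity_difference and equal_opportunity_difference, while the classification fairness hunk only adds disparate_impact. A sketch consistent with the assertions in those tests; the max-minus-min convention and the favorable_outcome parameter are assumptions:

import numpy as np

def statistical_parity_difference(y_true, y_pred, sensitive_attribute, favorable_outcome=1) -> float:
    # Hypothetical helper: largest gap in favorable-outcome rates between sensitive groups.
    # y_true is accepted for signature consistency but is not needed for prediction parity.
    y_pred = np.asarray(y_pred)
    sensitive_attribute = np.asarray(sensitive_attribute)
    groups = np.unique(sensitive_attribute)
    if len(groups) < 2:
        return np.nan
    rates = [np.mean(y_pred[sensitive_attribute == g] == favorable_outcome) for g in groups]
    return float(np.max(rates) - np.min(rates))

def equal_opportunity_difference(y_true, y_pred, sensitive_attribute, favorable_outcome=1) -> float:
    # Hypothetical helper: largest gap in true-positive rates between sensitive groups.
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    sensitive_attribute = np.asarray(sensitive_attribute)
    groups = np.unique(sensitive_attribute)
    if len(groups) < 2:
        return np.nan
    tprs = []
    for g in groups:
        mask = (sensitive_attribute == g) & (y_true == favorable_outcome)
        if mask.sum() == 0:  # no true positives in this group; skip it
            continue
        tprs.append(np.mean(y_pred[mask] == favorable_outcome))
    if len(tprs) < 2:  # not enough groups with positives to compare
        return np.nan
    return float(np.max(tprs) - np.min(tprs))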