diff --git a/src/faivor/metrics/classification/__init__.py b/src/faivor/metrics/classification/__init__.py index e69de29..5295df3 100644 --- a/src/faivor/metrics/classification/__init__.py +++ b/src/faivor/metrics/classification/__init__.py @@ -0,0 +1,14 @@ +from .classification_metrics import ( + PERFORMANCE_METRICS as performance, + FAIRNESS_METRICS as fairness, + EXPLAINABILITY_METRICS as explainability +) + +class ClassificationMetrics: + def __init__(self): + self.performance = performance + self.fairness = fairness + self.explainability = explainability + +# Create an instance for easy access +metrics = ClassificationMetrics() \ No newline at end of file diff --git a/src/faivor/metrics/classification/classification_metrics.py b/src/faivor/metrics/classification/classification_metrics.py new file mode 100644 index 0000000..52946d4 --- /dev/null +++ b/src/faivor/metrics/classification/classification_metrics.py @@ -0,0 +1,3 @@ +from ..config_loader import load_metrics + +PERFORMANCE_METRICS, FAIRNESS_METRICS, EXPLAINABILITY_METRICS = load_metrics("classification/classification_metrics.yaml") \ No newline at end of file diff --git a/src/faivor/metrics/classification/classification_metrics.yaml b/src/faivor/metrics/classification/classification_metrics.yaml new file mode 100644 index 0000000..ed53fd8 --- /dev/null +++ b/src/faivor/metrics/classification/classification_metrics.yaml @@ -0,0 +1,100 @@ +performance: + - function_name: accuracy_score + regular_name: Accuracy Score + description: Accuracy classification score. + func: sklearn.metrics.accuracy_score + is_torch: false + - function_name: balanced_accuracy_score + regular_name: Balanced Accuracy Score + description: Balanced accuracy classification score. + func: sklearn.metrics.balanced_accuracy_score + is_torch: false + - function_name: average_precision_score + regular_name: Average Precision Score + description: Compute average precision (AP) from prediction scores. + func: sklearn.metrics.average_precision_score + is_torch: false + - function_name: f1_score + regular_name: F1 Score + description: F1 score, harmonic mean of precision and recall. + func: sklearn.metrics.f1_score + is_torch: false + - function_name: precision_score + regular_name: Precision Score + description: Precision classification score. + func: sklearn.metrics.precision_score + is_torch: false + - function_name: recall_score + regular_name: Recall Score + description: Recall classification score. + func: sklearn.metrics.recall_score + is_torch: false + - function_name: roc_auc_score + regular_name: ROC AUC Score + description: Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. + func: sklearn.metrics.roc_auc_score + is_torch: false + - function_name: jaccard_score + regular_name: Jaccard Score + description: Jaccard similarity coefficient. + func: sklearn.metrics.jaccard_score + is_torch: false + - function_name: log_loss + regular_name: Log Loss + description: Log loss, aka logistic regression loss or cross-entropy loss. + func: sklearn.metrics.log_loss + is_torch: false + - function_name: matthews_corrcoef + regular_name: Matthews Correlation Coefficient + description: Compute the Matthews correlation coefficient (MCC). + func: sklearn.metrics.matthews_corrcoef + is_torch: false + - function_name: brier_score_loss + regular_name: Brier Score Loss + description: Compute the Brier score loss. 
+ func: sklearn.metrics.brier_score_loss + is_torch: false + - function_name: top_k_accuracy_score + regular_name: Top K Accuracy Score + description: Top-k accuracy classification score. + func: sklearn.metrics.top_k_accuracy_score + is_torch: false + - function_name: roc_curve + regular_name: ROC Curve + description: Compute Receiver operating characteristic (ROC) curve. + func: sklearn.metrics.roc_curve + is_torch: false + - function_name: precision_recall_curve + regular_name: Precision Recall Curve + description: Compute precision-recall pairs for different probability thresholds. + func: sklearn.metrics.precision_recall_curve + is_torch: false + - function_name: hamming_loss + regular_name: Hamming Loss + description: Compute the average Hamming loss or Hamming distance between two sets of samples. + func: sklearn.metrics.hamming_loss + is_torch: false + - function_name: zero_one_loss + regular_name: Zero One Loss + description: Zero-one classification loss. + func: sklearn.metrics.zero_one_loss + is_torch: false + - function_name: confusion_matrix + regular_name: Confusion Matrix + description: Compute confusion matrix to evaluate the accuracy of a classification. + func: sklearn.metrics.confusion_matrix + is_torch: false + +fairness: + - function_name: disparate_impact + regular_name: Disparate Impact + description: Calculates the disparate impact for classification by comparing the rate of favorable outcomes for different groups. + func: faivor.metrics.classification.fairness.disparate_impact + is_torch: false + +explainability: + - function_name: prediction_entropy + regular_name: Prediction Entropy + description: Calculates the entropy of predictions to measure model uncertainty. + func: faivor.metrics.classification.explainability.prediction_entropy + is_torch: false \ No newline at end of file diff --git a/src/faivor/metrics/classification/explainability.py b/src/faivor/metrics/classification/explainability.py index e5bb2c4..fd99047 100644 --- a/src/faivor/metrics/classification/explainability.py +++ b/src/faivor/metrics/classification/explainability.py @@ -1,36 +1,41 @@ -from typing import List -from sklearn import metrics as skm - -__all__ = ["ClassificationExplainabilityMetrics"] - - -class ClassificationExplainabilityMetricsMeta(type): - """Metaclass for dynamically creating classification explainability metric classes.""" - - _WHITELISTED_METRICS: List[str] = [] # sklearn doesn't provide direct explainability metrics - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from skm metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - metric_function = getattr(skm, metric_name, None) - if metric_function: - def method_wrapper(self, y_true, y_pred, **kwargs): - return metric_function(y_true, y_pred, **kwargs) - dct[metric_name] = method_wrapper - return super().__new__(mcs, name, bases, dct) - - -class BaseClassificationExplainabilityMetrics: - """Base class for classification explainability metrics.""" - pass - - -class ClassificationExplainabilityMetrics(BaseClassificationExplainabilityMetrics, metaclass=ClassificationExplainabilityMetricsMeta): - """Class for classification explainability metrics.""" - - def custom_prediction_entropy(self, probas): - """Calculate the average entropy of prediction probabilities.""" - import numpy as np - probas = np.asarray(probas) - log_probs = np.log2(probas) - return -np.mean(np.sum(probas * log_probs, axis=1)) +import numpy as np +from scipy.stats import entropy + +def prediction_entropy(y_prob) -> float: + 
""" + Calculates the entropy of predictions for classification. + + Entropy is a measure of uncertainty. Higher entropy in predictions indicates + higher model uncertainty. This function computes the average entropy across all predictions. + + Parameters + ---------- + y_prob : array-like of shape (n_samples, n_classes) or (n_samples,) + The predicted probabilities for each class. Can be either: + - A 2D array of shape (n_samples, n_classes) where each row represents + the probability distribution over classes for a single sample. + - A 1D array of shape (n_samples,) for binary classification, representing + the probability of the positive class (class 1). + + Returns + ------- + float + The average prediction entropy. Returns np.nan if input is empty or invalid. + """ + y_prob = np.asarray(y_prob) + if y_prob.size == 0: + return np.nan + + if y_prob.ndim == 1: # assume binary classification and probabilities are for positive class + y_prob = np.vstack([1 - y_prob, y_prob]).T # create 2D prob array: [[p(class0), p(class1)], ...] + + if np.any(y_prob < 0) or np.any(y_prob > 1): + return np.nan # probabilities should be between 0 and 1 + + # Normalize probabilities to ensure they sum to 1 (handle potential rounding errors) + y_prob_normalized = y_prob / np.sum(y_prob, axis=1, keepdims=True) + + # Calculate entropy for each prediction + entropies = entropy(y_prob_normalized, axis=1) + + return np.mean(entropies) \ No newline at end of file diff --git a/src/faivor/metrics/classification/fairness.py b/src/faivor/metrics/classification/fairness.py index 93f4f19..fcea681 100644 --- a/src/faivor/metrics/classification/fairness.py +++ b/src/faivor/metrics/classification/fairness.py @@ -1,74 +1,55 @@ -from typing import List -from sklearn import metrics as skm -from torchmetrics import Accuracy, F1Score, Precision, Recall -import torch -__all__ = ["ClassificationFairnessMetrics"] - - -class ClassificationFairnessMetricsMeta(type): - """Metaclass for dynamically creating classification fairness metric classes.""" - - _WHITELISTED_METRICS: List[str] = [ - "accuracy_score", # useful for group fairness comparisons - ] - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from skm metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - metric_function = getattr(skm, metric_name, None) - if metric_function: - def method_wrapper(self, y_true, y_pred, **kwargs): - return metric_function(y_true, y_pred, **kwargs) - dct[metric_name] = method_wrapper - - for metric_name in ["accuracy", "f1_score", "precision", "recall"]: - if metric_name == "accuracy": - metric_class = Accuracy - elif metric_name == "f1_score": - metric_class = F1Score - elif metric_name == "precision": - metric_class = Precision - elif metric_name == "recall": - metric_class = Recall - - def torchmetrics_method_wrapper(self, y_true, y_pred, **kwargs): - metric = metric_class(task = "binary", **kwargs) - return metric( - torch.tensor(y_pred, dtype = torch.float32), - torch.tensor(y_true, dtype= torch.int), - ).detach().cpu().item() - dct[metric_name] = torchmetrics_method_wrapper - return super().__new__(mcs, name, bases, dct) - - -class BaseClassificationFairnessMetrics: - """Base class for classification fairness metrics.""" - pass - - -class ClassificationFairnessMetrics(BaseClassificationFairnessMetrics, metaclass=ClassificationFairnessMetricsMeta): - """Class for classification fairness metrics.""" - - def custom_disparate_impact(self, y_true, y_pred, sensitive_attribute): - """Calculates Disparate Impact for 
classification.""" - import numpy as np - y_true, y_pred, sensitive_attribute = np.asarray(y_true), np.asarray(y_pred), np.asarray(sensitive_attribute) - - unique_sensitive_values = np.unique(sensitive_attribute) - if len(unique_sensitive_values) < 2: - return np.nan - - group_positive_rates = [] - for value in unique_sensitive_values: - group_mask = sensitive_attribute == value - if group_mask.sum() == 0: - group_positive_rates.append(np.nan) - else: - group_positive_rates.append(np.mean(y_pred[group_mask] == np.max(y_pred))) # Assuming 1 is the positive class - - group_positive_rates = np.asarray(group_positive_rates) - if np.isnan(group_positive_rates).any(): - return np.nan - - return np.min(group_positive_rates) / np.max(group_positive_rates) - +import numpy as np + +def disparate_impact(y_true, y_pred, sensitive_attribute, favorable_outcome=1) -> float: + """ + Calculates Disparate Impact for classification. + + Disparate Impact (DI) is the ratio of the rate of favorable outcomes for the + disadvantaged group compared to the advantaged group. A common threshold for + concern is DI < 0.8, indicating potential adverse impact. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + The true target values (binary: 0 or 1). + y_pred : array-like of shape (n_samples,) + The predicted target values (binary: 0 or 1). + sensitive_attribute : array-like of shape (n_samples,) + The sensitive attribute values (categorical). + favorable_outcome : int or float, default=1 + The value representing the favorable outcome in y_true and y_pred. + + Returns + ------- + float + The disparate impact ratio. Returns np.nan if there's only one group or + if the advantaged group has no favorable outcomes. + """ + y_true, y_pred, sensitive_attribute = ( + np.asarray(y_true), + np.asarray(y_pred), + np.asarray(sensitive_attribute), + ) + + unique_sensitive_values = np.unique(sensitive_attribute) + if len(unique_sensitive_values) < 2: + return np.nan # Not applicable for less than 2 groups + + favorable_rates = {} + for value in unique_sensitive_values: + group_mask = sensitive_attribute == value + group_size = group_mask.sum() + if group_size == 0: + favorable_rates[value] = 0 # Handle empty groups to avoid division by zero later, assume 0 favorable rate + else: + favorable_outcomes_count = np.sum(y_pred[group_mask] == favorable_outcome) + favorable_rates[value] = favorable_outcomes_count / group_size + + rates = np.array(list(favorable_rates.values())) + min_rate = np.min(rates) + max_rate = np.max(rates) + + if max_rate == 0: # avoid division by zero if advantaged group has no favorable outcomes + return np.nan + + return min_rate / max_rate \ No newline at end of file diff --git a/src/faivor/metrics/classification/performance.py b/src/faivor/metrics/classification/performance.py index a3e45e7..84514d2 100644 --- a/src/faivor/metrics/classification/performance.py +++ b/src/faivor/metrics/classification/performance.py @@ -1,66 +1,8 @@ -import inspect -from typing import List from sklearn import metrics as skm +import numpy as np - -__all__ = ["ClassificationPerformanceMetrics"] - - -class ClassificationPerformanceMetricsMeta(type): - """Metaclass for dynamically creating classification performance metric classes.""" - - _WHITELISTED_METRICS: List[str] = [ - "accuracy_score", - "balanced_accuracy_score", - "average_precision_score", - "f1_score", - "precision_score", - "recall_score", - "roc_auc_score", - "jaccard_score", - "log_loss", - "matthews_corrcoef", - "brier_score_loss", - 
"top_k_accuracy_score", - "roc_curve", - "precision_recall_curve", - "hamming_loss", - "zero_one_loss", - "confusion_matrix" - ] - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from sklearn.metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - if hasattr(skm, metric_name): # Ensure the metric exists - dct[metric_name] = create_metric_wrapper(metric_name) - return super().__new__(mcs, name, bases, dct) - -def create_metric_wrapper(metric_name): - """Factory function to create a metric wrapper for the given metric name.""" - metric_function = getattr(skm, metric_name, None) - if metric_function is None: - raise ValueError(f"Metric '{metric_name}' not found in sklearn.metrics.") - - def method_wrapper(self, y_true, y_pred, **kwargs): - """Wrapper function for the metric.""" - return metric_function(y_true, y_pred, **kwargs) - - method_wrapper.__name__ = metric_name # Set the method name for clarity - method_wrapper.__doc__ = metric_function.__doc__ # Use the original docstring - return method_wrapper - -class BaseClassificationPerformanceMetrics: - """Base class for classification performance metrics.""" - pass - - -class ClassificationPerformanceMetrics(BaseClassificationPerformanceMetrics, metaclass=ClassificationPerformanceMetricsMeta): - """Class for classification performance metrics.""" - - def custom_error_rate(self, y_true, y_pred): - """Calculates custom error rate for classification.""" - import numpy as np - y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) - return 1 - skm.accuracy_score(y_true, y_pred) +def error_rate(y_true, y_pred): + """Calculates custom error rate for classification.""" + y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) + return 1 - skm.accuracy_score(y_true, y_pred) diff --git a/src/faivor/metrics/config_loader.py b/src/faivor/metrics/config_loader.py new file mode 100644 index 0000000..36881cb --- /dev/null +++ b/src/faivor/metrics/config_loader.py @@ -0,0 +1,40 @@ +import yaml +from importlib import import_module +from pathlib import Path +from faivor.metrics.metric import ModelMetric +from typing import List + +def load_metrics(yaml_filename: str) -> (List[ModelMetric], List[ModelMetric], List[ModelMetric]): + yaml_path = Path(__file__).parent / yaml_filename + with open(yaml_path, 'r') as f: + config = yaml.safe_load(f) + + performance = [] + fairness = [] + explainability = [] + + for category in ['performance', 'fairness', 'explainability']: + for metric_config in config.get(category, []): + # Resolve the function/class from string + func_str = metric_config['func'] + module_path, func_name = func_str.rsplit('.', 1) + module = import_module(module_path) + func = getattr(module, func_name) + + metric = ModelMetric( + function_name=metric_config['function_name'], + regular_name=metric_config['regular_name'], + description=metric_config['description'], + func=func, + is_torch=metric_config.get('is_torch', False), + torch_kwargs=metric_config.get('torch_kwargs', {}) + ) + + if category == 'performance': + performance.append(metric) + elif category == 'fairness': + fairness.append(metric) + elif category == 'explainability': + explainability.append(metric) + + return performance, fairness, explainability \ No newline at end of file diff --git a/src/faivor/metrics/metric.py b/src/faivor/metrics/metric.py new file mode 100644 index 0000000..abe16cd --- /dev/null +++ b/src/faivor/metrics/metric.py @@ -0,0 +1,24 @@ +from dataclasses import dataclass, field +from typing import Callable, Dict +from abc import ABC 
+import torch + +@dataclass +class ModelMetric(ABC): + function_name: str + regular_name: str + description: str + func: Callable + is_torch: bool = False + torch_kwargs: Dict = field(default_factory=dict) + + + def compute(self, y_true, y_pred, **kwargs) -> float: + """Compute the metric based on whether it's a Torch or Sklearn function.""" + if self.is_torch: + metric = self.func(**self.torch_kwargs) + return metric( + torch.tensor(y_pred, dtype=torch.float32), + torch.tensor(y_true, dtype=torch.float32), + ).detach().cpu().item() + return self.func(y_true, y_pred, **kwargs) \ No newline at end of file diff --git a/src/faivor/metrics/regression/__init__.py b/src/faivor/metrics/regression/__init__.py index e69de29..133d797 100644 --- a/src/faivor/metrics/regression/__init__.py +++ b/src/faivor/metrics/regression/__init__.py @@ -0,0 +1,15 @@ +# metrics/regression/__init__.py +from .regression_metrics import ( + PERFORMANCE_METRICS as performance, + FAIRNESS_METRICS as fairness, + EXPLAINABILITY_METRICS as explainability +) + +class RegressionMetrics: + def __init__(self): + self.performance = performance + self.fairness = fairness + self.explainability = explainability + +# Create an instance for easy access +metrics = RegressionMetrics() \ No newline at end of file diff --git a/src/faivor/metrics/regression/explainability.py b/src/faivor/metrics/regression/explainability.py index 8af010b..44790cf 100644 --- a/src/faivor/metrics/regression/explainability.py +++ b/src/faivor/metrics/regression/explainability.py @@ -1,37 +1,20 @@ -from typing import List -from sklearn import metrics as skm - -__all__ = ["RegressionExplainabilityMetrics"] - -class RegressionExplainabilityMetricsMeta(type): - """Metaclass for dynamically creating regression explainability metric classes.""" - _WHITELISTED_METRICS: List[str] = [] # No standard explainability metrics in sklearn directly. - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from skm metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - metric_function = getattr(skm, metric_name, None) - if metric_function: - def method_wrapper(self, y_true, y_pred, **kwargs): - return metric_function(y_true, y_pred, **kwargs) - dct[metric_name] = method_wrapper - return super().__new__(mcs, name, bases, dct) - - -class BaseRegressionExplainabilityMetrics: - """Base class for regression explainability metrics.""" - pass - - -class RegressionExplainabilityMetrics(BaseRegressionExplainabilityMetrics, metaclass=RegressionExplainabilityMetricsMeta): - """Class for regression explainability metrics.""" - - def custom_feature_importance_ratio(self, feature_importances): - """ - Calculate a ratio to assess feature importance - """ - import numpy as np - feature_importances = np.asarray(feature_importances) - if len(feature_importances) == 0: - return np.nan - return np.min(feature_importances)/np.max(feature_importances) +import numpy as np + +def feature_importance_ratio(feature_importances) -> float: + """ + Calculate a ratio to assess feature importance + + Parameters + ---------- + feature_importances : array-like of shape (n_features,) + The feature importances. + + Returns + ------- + float + The feature importance ratio. 
+ """ + feature_importances = np.asarray(feature_importances) + if len(feature_importances) == 0: + return np.nan + return np.min(feature_importances) / np.max(feature_importances) \ No newline at end of file diff --git a/src/faivor/metrics/regression/fairness.py b/src/faivor/metrics/regression/fairness.py index 71a4bf3..8f68b6a 100644 --- a/src/faivor/metrics/regression/fairness.py +++ b/src/faivor/metrics/regression/fairness.py @@ -1,71 +1,45 @@ -from typing import List -from sklearn import metrics as skm -from torchmetrics import MeanAbsoluteError, MeanSquaredError, MeanAbsolutePercentageError -import torch -__all__ = ["RegressionFairnessMetrics"] +import numpy as np + + +def demographic_parity_ratio(y_true, y_pred, sensitive_attribute) -> float: + """ + Calculates Demographic Parity Ratio for regression by comparing the average predicted values across different sensitive attribute groups. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + The true target values. + y_pred : array-like of shape (n_samples,) + The predicted target values. + sensitive_attribute : array-like of shape (n_samples,) + The sensitive attribute values. + + Returns + ------- + float + The demographic parity ratio. + """ + y_true, y_pred, sensitive_attribute = ( + np.asarray(y_true), + np.asarray(y_pred), + np.asarray(sensitive_attribute), + ) + + unique_sensitive_values = np.unique(sensitive_attribute) + if len(unique_sensitive_values) < 2: + return np.nan # not applicable for less than 2 groups + + group_means = [] + for value in unique_sensitive_values: + group_mask = sensitive_attribute == value + if group_mask.sum() == 0: + group_means.append(np.nan) # to handle potential nan group mean + else: + group_means.append(np.mean(y_pred[group_mask])) + + group_means = np.asarray(group_means) + if np.isnan(group_means).any(): + return np.nan # to handle nan group means + + return np.min(group_means) / np.max(group_means) - -class RegressionFairnessMetricsMeta(type): - """Metaclass for dynamically creating regression fairness metric classes.""" - - _WHITELISTED_METRICS: List[str] = [] # No standard fairness metrics in sklearn for regression, may need custom implementation - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from skm metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - metric_function = getattr(skm, metric_name, None) - if metric_function: - def method_wrapper(self, y_true, y_pred, **kwargs): - return metric_function(y_true, y_pred, **kwargs) - dct[metric_name] = method_wrapper - - for metric_name in ["mean_absolute_error", "mean_squared_error", "mean_absolute_percentage_error"]: - if metric_name == "mean_absolute_error": - metric_class = MeanAbsoluteError - elif metric_name == "mean_squared_error": - metric_class = MeanSquaredError - elif metric_name == "mean_absolute_percentage_error": - metric_class = MeanAbsolutePercentageError - - def torchmetrics_method_wrapper(self, y_true, y_pred, **kwargs): - metric = metric_class(**kwargs) - return metric( - torch.tensor(y_pred, dtype = torch.float32), - torch.tensor(y_true, dtype= torch.float32), - ).detach().cpu().item() - dct[metric_name] = torchmetrics_method_wrapper - return super().__new__(mcs, name, bases, dct) - - -class BaseRegressionFairnessMetrics: - """Base class for regression fairness metrics.""" - pass - - -class RegressionFairnessMetrics(BaseRegressionFairnessMetrics, metaclass=RegressionFairnessMetricsMeta): - """Class for regression fairness metrics.""" - - def 
custom_demographic_parity_ratio(self, y_true, y_pred, sensitive_attribute): - """ - Calculates Demographic Parity Ratio for regression - """ - import numpy as np - y_true, y_pred, sensitive_attribute = np.asarray(y_true), np.asarray(y_pred), np.asarray(sensitive_attribute) - - unique_sensitive_values = np.unique(sensitive_attribute) - if len(unique_sensitive_values) < 2: - return np.nan # not applicable for less than 2 groups - - group_means = [] - for value in unique_sensitive_values: - group_mask = sensitive_attribute == value - if group_mask.sum() == 0: - group_means.append(np.nan) # to handle potential nan group mean - else: - group_means.append(np.mean(y_pred[group_mask])) - - group_means = np.asarray(group_means) - if np.isnan(group_means).any(): - return np.nan # to handle nan group means - - return np.min(group_means) / np.max(group_means) diff --git a/src/faivor/metrics/regression/performance.py b/src/faivor/metrics/regression/performance.py index b59a3c2..bbf4000 100644 --- a/src/faivor/metrics/regression/performance.py +++ b/src/faivor/metrics/regression/performance.py @@ -1,62 +1,22 @@ -import inspect -from typing import List -from sklearn import metrics as skm - -__all__ = ["RegressionPerformanceMetrics"] - - -class RegressionPerformanceMetricsMeta(type): - """Metaclass for dynamically creating regression performance metric classes.""" - - _WHITELISTED_METRICS: List[str] = [ - "mean_absolute_error", - "mean_squared_error", - "mean_squared_log_error", - "median_absolute_error", - "r2_score", - "explained_variance_score", - "max_error", - "mean_poisson_deviance", - "mean_gamma_deviance", - "d2_absolute_error_score", - "mean_pinball_loss" - ] - - def __new__(mcs, name, bases, dct): - """Creates a new class, inheriting from sklearn.metrics.""" - for metric_name in mcs._WHITELISTED_METRICS: - if hasattr(skm, metric_name): # Ensure the metric exists - dct[metric_name] = create_metric_wrapper(metric_name) - return super().__new__(mcs, name, bases, dct) - -def create_metric_wrapper(metric_name): - """Factory function to create a metric wrapper for the given metric name.""" - metric_function = getattr(skm, metric_name, None) - if metric_function is None: - raise ValueError(f"Metric '{metric_name}' not found in sklearn.metrics.") - - def method_wrapper(self, y_true, y_pred, **kwargs): - """Wrapper function for the metric.""" - return metric_function(y_true, y_pred, **kwargs) - - method_wrapper.__name__ = metric_name # Set the method name for clarity - method_wrapper.__doc__ = metric_function.__doc__ # Use the original docstring - return method_wrapper - -class BaseRegressionPerformanceMetrics: - """Base class for regression performance metrics.""" - pass - - -class RegressionPerformanceMetrics(BaseRegressionPerformanceMetrics, metaclass=RegressionPerformanceMetricsMeta): - """Class for regression performance metrics.""" - - def custom_mean_percentage_error(self, y_true, y_pred): - """Calculates Mean Percentage Error for regression.""" - import numpy as np - y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) - non_zero_mask = y_true != 0 - if non_zero_mask.sum() == 0: - return np.nan # to avoid division by 0 - return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100 - +import numpy as np + +def mean_percentage_error(y_true, y_pred) -> float: + """ + Calculates Mean Percentage Error for regression. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + The true target values. 
+    y_pred : array-like of shape (n_samples,)
+        The predicted target values.
+
+    Returns
+    -------
+    float
+        The mean absolute percentage error (in percent), computed over samples whose true value is non-zero.
+    """
+    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
+    non_zero_mask = y_true != 0
+    if non_zero_mask.sum() == 0:
+        return np.nan # Avoid division by zero
+    return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
\ No newline at end of file
diff --git a/src/faivor/metrics/regression/regression_metrics.py b/src/faivor/metrics/regression/regression_metrics.py
new file mode 100644
index 0000000..f9307ab
--- /dev/null
+++ b/src/faivor/metrics/regression/regression_metrics.py
@@ -0,0 +1,3 @@
+from ..config_loader import load_metrics
+
+PERFORMANCE_METRICS, FAIRNESS_METRICS, EXPLAINABILITY_METRICS = load_metrics("regression/regression_metrics.yaml")
\ No newline at end of file
diff --git a/src/faivor/metrics/regression/regression_metrics.yaml b/src/faivor/metrics/regression/regression_metrics.yaml
new file mode 100644
index 0000000..a25e668
--- /dev/null
+++ b/src/faivor/metrics/regression/regression_metrics.yaml
@@ -0,0 +1,75 @@
+performance:
+  - function_name: mean_absolute_error
+    regular_name: Mean Absolute Error
+    description: The mean of the absolute errors between true and predicted values.
+    func: sklearn.metrics.mean_absolute_error
+    is_torch: false
+  - function_name: mean_squared_error
+    regular_name: Mean Squared Error
+    description: The mean of the squared errors between true and predicted values.
+    func: sklearn.metrics.mean_squared_error
+    is_torch: false
+  - function_name: mean_squared_log_error
+    regular_name: Mean Squared Logarithmic Error
+    description: Regression loss using the log of true and predicted values.
+    func: sklearn.metrics.mean_squared_log_error
+    is_torch: false
+  - function_name: median_absolute_error
+    regular_name: Median Absolute Error
+    description: The median of the absolute errors between true and predicted values.
+    func: sklearn.metrics.median_absolute_error
+    is_torch: false
+  - function_name: r2_score
+    regular_name: R² Score
+    description: The coefficient of determination regression score.
+    func: sklearn.metrics.r2_score
+    is_torch: false
+  - function_name: explained_variance_score
+    regular_name: Explained Variance Score
+    description: Measures the proportion of variance explained by the model.
+    func: sklearn.metrics.explained_variance_score
+    is_torch: false
+  - function_name: max_error
+    regular_name: Max Error
+    description: The maximum absolute difference between true and predicted values.
+    func: sklearn.metrics.max_error
+    is_torch: false
+  - function_name: mean_poisson_deviance
+    regular_name: Mean Poisson Deviance
+    description: Mean Poisson deviance regression loss.
+    func: sklearn.metrics.mean_poisson_deviance
+    is_torch: false
+  - function_name: mean_gamma_deviance
+    regular_name: Mean Gamma Deviance
+    description: Mean gamma deviance regression loss.
+    func: sklearn.metrics.mean_gamma_deviance
+    is_torch: false
+  - function_name: d2_absolute_error_score
+    regular_name: D² Absolute Error Score
+    description: The proportion of variance explained using absolute errors.
+    func: sklearn.metrics.d2_absolute_error_score
+    is_torch: false
+  - function_name: mean_pinball_loss
+    regular_name: Mean Pinball Loss
+    description: The mean pinball loss for quantile regression.
+ func: sklearn.metrics.mean_pinball_loss + is_torch: false + - function_name: mean_percentage_error + regular_name: Mean Percentage Error + description: Calculates the mean percentage error for regression, ignoring zero true values. + func: faivor.metrics.regression.performance.mean_percentage_error + is_torch: false + +fairness: + - function_name: demographic_parity_ratio + regular_name: Custom Demographic Parity Ratio + description: Calculates the demographic parity ratio for regression by comparing the average predicted values across different sensitive attribute groups. + func: faivor.metrics.regression.fairness.demographic_parity_ratio + is_torch: false + +explainability: + - function_name: feature_importance_ratio + regular_name: Feature Importance Ratio + description: Calculates the ratio of feature importance for regression explainability. + func: faivor.metrics.regression.explainability.feature_importance_ratio + is_torch: false \ No newline at end of file diff --git a/tests/faivor/metrics/classification/test_explainability.py b/tests/faivor/metrics/classification/test_explainability.py index f0ca9e8..a4f790b 100644 --- a/tests/faivor/metrics/classification/test_explainability.py +++ b/tests/faivor/metrics/classification/test_explainability.py @@ -1,28 +1,101 @@ import pytest import numpy as np -import torch -from faivor.metrics.classification.explainability import ( - ClassificationExplainabilityMetrics, -) - -metrics = ClassificationExplainabilityMetrics() - -# Sample Classification Data -y_true_class = np.array([0, 1, 1, 0, 1, 0]) -y_pred_class = np.array([0, 1, 0, 0, 1, 1]) -probabilities_class = np.array( - [[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.4, 0.6], [0.7, 0.3], [0.2, 0.8]] -) - - -def test_all_explainability_metrics(): - for name in dir(metrics): - if not name.startswith("_") and callable(getattr(metrics, name)): - try: - if name == "custom_prediction_entropy": - result = getattr(metrics, name)(probabilities_class) - else: - result = getattr(metrics, name)(y_true_class, y_pred_class) - assert result is not None, f"Metric {name} returned None" - except Exception as e: - pytest.fail(f"Metric {name} raised an exception: {e}") + +from faivor.metrics.classification.explainability import prediction_entropy, confidence_score, margin_of_confidence + +# Sample probability data (reused for all tests) +y_prob = np.array([ + [0.1, 0.9], # confident prediction + [0.5, 0.5], # uncertain prediction + [0.9, 0.1], # confident prediction + [0.3, 0.7] # moderately confident prediction +]) +y_prob_1d = np.array([0.1, 0.5, 0.9, 0.3]) # 1D probabilities for binary case + + +def test_prediction_entropy(): + result = prediction_entropy(y_prob) + assert result is not None, "Prediction entropy returned None" + assert not np.isnan(result), "Prediction entropy should not return NaN for valid input" + + result_1d = prediction_entropy(y_prob_1d) + assert result_1d is not None, "Prediction entropy with 1D prob array should not return None" + assert not np.isnan(result_1d), "Prediction entropy with 1D prob array should not return NaN" + + y_prob_empty = np.array([]) + result_empty = prediction_entropy(y_prob_empty) + assert np.isnan(result_empty), "Prediction entropy with empty array should return NaN" + + y_prob_invalid = np.array([ + [0.1, 1.2], # invalid prob + [0.5, 0.5] + ]) + result_invalid = prediction_entropy(y_prob_invalid) + assert np.isnan(result_invalid), "Prediction entropy with invalid probabilities should return NaN" + + y_prob_single_class = np.array([ + [1.0, 0.0], + [1.0, 0.0] + ]) 
+ result_single_class = prediction_entropy(y_prob_single_class) + assert np.allclose(result_single_class, 0.0), "Prediction entropy with single class should be 0" + + y_prob_uniform = np.array([ + [0.5, 0.5], + [0.5, 0.5] + ]) + result_uniform = prediction_entropy(y_prob_uniform) + expected_uniform_entropy = - (0.5 * np.log(0.5) + 0.5 * np.log(0.5)) # entropy for [0.5, 0.5] using natural log + assert np.allclose(result_uniform, expected_uniform_entropy), "Prediction entropy with uniform probabilities should be max entropy" + + +def test_confidence_score(): + result = confidence_score(y_prob) + assert result is not None, "Confidence score returned None" + assert not np.isnan(result), "Confidence score should not return NaN for valid input" + + result_1d = confidence_score(y_prob_1d) + assert result_1d is not None, "Confidence score with 1D prob array should not return None" + assert not np.isnan(result_1d), "Confidence score with 1D prob array should not return NaN" + + y_prob_empty = np.array([]) + result_empty = confidence_score(y_prob_empty) + assert np.isnan(result_empty), "Confidence score with empty array should return NaN" + + y_prob_invalid = np.array([ + [0.1, 1.2], # invalid prob + [0.5, 0.5] + ]) + result_invalid = confidence_score(y_prob_invalid) + assert np.isnan(result_invalid), "Confidence score with invalid probabilities should return NaN" + + expected_confidence = np.mean([0.9, 0.5, 0.9, 0.7]) # average of max probabilities + assert np.allclose(result, expected_confidence), "Confidence score calculation incorrect" + + +def test_margin_of_confidence(): + result = margin_of_confidence(y_prob) + assert result is not None, "Margin of confidence returned None" + assert not np.isnan(result), "Margin of confidence should not return NaN for valid input for binary case" + + result_1d = margin_of_confidence(y_prob_1d) + assert result_1d is not None, "Margin of confidence with 1D prob array should not return None" + assert not np.isnan(result_1d), "Margin of confidence with 1D prob array should not return NaN" + + y_prob_empty = np.array([]) + result_empty = margin_of_confidence(y_prob_empty) + assert np.isnan(result_empty), "Margin of confidence with empty array should return NaN" + + y_prob_invalid = np.array([ + [0.1, 1.2], # invalid prob + [0.5, 0.5] + ]) + result_invalid = margin_of_confidence(y_prob_invalid) + assert np.isnan(result_invalid), "Margin of confidence with invalid probabilities should return NaN" + + y_prob_multiclass = np.array([[0.1, 0.2, 0.7], [0.3, 0.3, 0.4]]) # multiclass + result_multiclass = margin_of_confidence(y_prob_multiclass) + assert np.isnan(result_multiclass), "Margin of confidence with multiclass should return NaN" + + expected_margin = np.mean([np.abs(0.9 - 0.1), np.abs(0.5 - 0.5), np.abs(0.1 - 0.9), np.abs(0.7 - 0.3)]) # average of margins + assert np.allclose(result, expected_margin), "Margin of confidence calculation incorrect" \ No newline at end of file diff --git a/tests/faivor/metrics/classification/test_fairness.py b/tests/faivor/metrics/classification/test_fairness.py index 285fbac..9320ef1 100644 --- a/tests/faivor/metrics/classification/test_fairness.py +++ b/tests/faivor/metrics/classification/test_fairness.py @@ -1,24 +1,67 @@ import pytest import numpy as np -import torch -from faivor.metrics.classification.fairness import ClassificationFairnessMetrics - - -# Sample Classification Data -y_true_class = np.array([0, 1, 1, 0, 1, 0, 1, 0]) -y_pred_class = np.array([0, 1, 0, 0, 1, 1, 1, 0]) -sensitive_attribute_class = np.array([0, 1, 0, 1, 
0, 1, 0, 1])
-
-metrics = ClassificationFairnessMetrics()
-
-def test_all_fairness_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_disparate_impact":
-                    result = getattr(metrics, name)(y_true_class, y_pred_class, sensitive_attribute_class)
-                else:
-                    result = getattr(metrics, name)(y_true_class, y_pred_class)
-                assert result is not None, f"Metric {name} returned None"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
\ No newline at end of file
+
+from faivor.metrics.classification.fairness import disparate_impact, statistical_parity_difference, equal_opportunity_difference
+
+# sample classification data (reused for all fairness tests)
+y_true_clf = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 1])
+y_pred_clf = np.array([1, 1, 0, 1, 0, 1, 1, 0, 1, 0])
+sensitive_attribute = np.array(['A', 'A', 'B', 'B', 'A', 'B', 'A', 'B', 'A', 'B'])
+
+
+def test_disparate_impact():
+    result = disparate_impact(y_true_clf, y_pred_clf, sensitive_attribute)
+    assert result is not None, "Disparate impact returned None"
+    assert not np.isnan(result), "Disparate impact should not return NaN for valid input"
+
+    sensitive_attribute_single_group = np.array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'])
+    result_single_group = disparate_impact(y_true_clf, y_pred_clf, sensitive_attribute_single_group)
+    assert np.isnan(result_single_group), "Disparate impact with single group should return NaN"
+
+    y_pred_no_favorable = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) # no favorable outcomes predicted
+    result_no_favorable = disparate_impact(y_true_clf, y_pred_no_favorable, sensitive_attribute)
+    assert np.isnan(result_no_favorable), "Disparate impact should return NaN when no favorable outcomes are predicted for any group"
+
+    sensitive_attribute_empty_group = np.array(['A', 'A', 'B', 'B', 'A', 'B', 'A', 'B', 'A', 'C']) # group C has only one sample
+    result_empty_group = disparate_impact(y_true_clf, y_pred_clf, sensitive_attribute_empty_group)
+    assert not np.isnan(result_empty_group), "Disparate impact with an underrepresented group should not return NaN if there are other groups"
+
+    y_pred_all_favorable = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    result_all_favorable = disparate_impact(y_true_clf, y_pred_all_favorable, sensitive_attribute)
+    assert np.allclose(result_all_favorable, 1.0), "Disparate impact should be 1.0 when all groups have 100% favorable outcome rate"
+
+
+def test_statistical_parity_difference():
+    result = statistical_parity_difference(y_true_clf, y_pred_clf, sensitive_attribute)
+    assert result is not None, "Statistical parity difference returned None"
+    assert not np.isnan(result), "Statistical parity difference should not return NaN for valid input"
+
+    sensitive_attribute_single_group = np.array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'])
+    result_single_group = statistical_parity_difference(y_true_clf, y_pred_clf, sensitive_attribute_single_group)
+    assert np.isnan(result_single_group), "Statistical parity difference with single group should return NaN"
+
+    # Predictions constructed so both groups have the same favorable-outcome rate:
+    # group A (indices 0, 1, 4, 6, 8) and group B (indices 2, 3, 5, 7, 9) each get 2/5 favorable predictions.
+    y_pred_same_rate = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
+    result_same_rate = statistical_parity_difference(y_true_clf, y_pred_same_rate, sensitive_attribute)
+    assert np.allclose(result_same_rate, 0.0), "Statistical parity difference should be 0 when favorable rates are equal"
+
+def test_equal_opportunity_difference():
+    result = equal_opportunity_difference(y_true_clf, y_pred_clf, sensitive_attribute)
+    assert result is not None, "Equal opportunity difference returned None"
+    assert not np.isnan(result), "Equal opportunity difference should not return NaN for valid input"
+
+    sensitive_attribute_single_group = np.array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'])
+    result_single_group = equal_opportunity_difference(y_true_clf, y_pred_clf, sensitive_attribute_single_group)
+    assert np.isnan(result_single_group), "Equal opportunity difference with single group should return NaN"
+
+    y_pred_equal_opportunity = np.array([1, 0, 1, 1, 1, 1, 0, 0, 1, 1]) # equal TPR (1.0) for both groups (among true positives)
+    result_equal_opportunity = equal_opportunity_difference(y_true_clf, y_pred_equal_opportunity, sensitive_attribute)
+    assert np.allclose(result_equal_opportunity, 0.0), "Equal opportunity difference should be 0 when TPRs are equal"
+
+    y_true_no_positives = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    result_no_positives = equal_opportunity_difference(y_true_no_positives, y_pred_clf, sensitive_attribute)
+    assert np.isnan(result_no_positives), "Equal opportunity difference should return NaN if no true positives in any group"
\ No newline at end of file
diff --git a/tests/faivor/metrics/classification/test_performance.py b/tests/faivor/metrics/classification/test_performance.py
index 6e66235..a823426 100644
--- a/tests/faivor/metrics/classification/test_performance.py
+++ b/tests/faivor/metrics/classification/test_performance.py
@@ -1,28 +1,55 @@
 import pytest
 import numpy as np
-import torch
 from sklearn import metrics as skm
-from faivor.metrics.classification.performance import ClassificationPerformanceMetrics
+from faivor.metrics.classification import metrics
 
-# Sample Classification Data
-y_true_class = np.array([0, 1, 1, 0, 1, 0])
-y_pred_class = np.array([0, 1, 0, 0, 1, 1])
-
-metrics = ClassificationPerformanceMetrics()
+# sample classification data
+y_true_clf = np.array([1, 0, 1, 1, 0, 1])
+y_pred_clf = np.array([1, 1, 0, 1, 0, 1])
+y_prob_clf = np.array([
+    [0.1, 0.9],
+    [0.8, 0.2],
+    [0.3, 0.7],
+    [0.2, 0.8],
+    [0.6, 0.4],
+    [0.4, 0.6]
+]) # probabilities for binary classification
 
 def test_all_performance_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_error_rate":
-                    result = getattr(metrics, name)(y_true_class, y_pred_class)
-                elif name == "accuracy_score":
-                    result = getattr(metrics, name)(y_true_class, y_pred_class)
-                    assert result == skm.accuracy_score(y_true_class, y_pred_class)
+    sklearn_metrics_to_compare = {
+        "accuracy_score": skm.accuracy_score,
+        "f1_score": skm.f1_score,
+        "precision_score": skm.precision_score,
+        "recall_score": skm.recall_score,
+        "roc_auc_score": skm.roc_auc_score,
+        "log_loss": skm.log_loss,
+        "balanced_accuracy_score": skm.balanced_accuracy_score,
+        "top_k_accuracy_score": skm.top_k_accuracy_score
+    } # metrics with a direct sklearn counterpart to compare against
+
+    for metric in metrics.performance: # loop through all the performance metrics we loaded
+        try:
+            # Calculate the metric using the defined function
+            if metric.function_name in ["roc_auc_score", "average_precision_score", "log_loss", "brier_score_loss", "top_k_accuracy_score"]:
+                result = metric.compute(y_true_clf, y_prob_clf[:, 1]) # these need probability scores; use the probability of the positive class for binary
+            elif metric.function_name in ["roc_curve", "precision_recall_curve", "confusion_matrix"]:
+                result = metric.compute(y_true_clf, y_pred_clf) # these return arrays or matrices, not single values; just check they run
+                assert result is not None
+                continue # no numerical comparison for these
+            else:
+                result = metric.compute(y_true_clf, y_pred_clf) # for most metrics, just compute with true and predicted values
+
+            assert result is not None, f"Metric {metric.regular_name} returned None"
+
+            # Compare with sklearn metric if applicable
+            if metric.function_name in sklearn_metrics_to_compare:
+                sklearn_func = sklearn_metrics_to_compare[metric.function_name]
+                if metric.function_name in ["roc_auc_score", "log_loss", "average_precision_score", "top_k_accuracy_score"]:
+                    sklearn_result = sklearn_func(y_true_clf, y_prob_clf[:, 1]) # use the same 1D probabilities for sklearn
                 else:
-                    result = getattr(metrics, name)(y_true_class, y_pred_class)
-                    assert result is not None, f"Metric {name} returned None"
+                    sklearn_result = sklearn_func(y_true_clf, y_pred_clf)
+                assert np.allclose(result, sklearn_result, atol=1e-5), f"Metric {metric.regular_name} result does not match sklearn"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
\ No newline at end of file
+        except Exception as e:
+            pytest.fail(f"Metric {metric.regular_name} raised an exception: {e}")
\ No newline at end of file
diff --git a/tests/faivor/metrics/regression/test_explainability.py b/tests/faivor/metrics/regression/test_explainability.py
index dd964c2..9bbd8a6 100644
--- a/tests/faivor/metrics/regression/test_explainability.py
+++ b/tests/faivor/metrics/regression/test_explainability.py
@@ -1,24 +1,21 @@
 import pytest
 import numpy as np
-import torch
-from faivor.metrics.regression.explainability import RegressionExplainabilityMetrics
-# Sample Regression Data
-feature_importances_reg = np.array([0.1, 0.2, 0.7, 0.05, 0.05])
-y_true_reg = np.array([3, -0.5, 2, 7])
-y_pred_reg = np.array([2.5, 0.0, 2.1, 7.8])
-metrics = RegressionExplainabilityMetrics()
+from faivor.metrics.regression.explainability import feature_importance_ratio
+def test_feature_importance_ratio():
+    feature_importances = np.array([0.1, 0.2, 0.5, 0.2])
+    result = feature_importance_ratio(feature_importances)
+    assert result is not None, "Feature importance ratio returned None"
-def test_all_explainability_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_feature_importance_ratio":
-                    result = getattr(metrics, name)(feature_importances_reg)
-                else:
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg)
-                assert result is not None, f"Metric {name} returned None"
+    feature_importances_empty = np.array([])
+    result_empty = feature_importance_ratio(feature_importances_empty)
+    assert np.isnan(result_empty), "Feature importance ratio with empty array should return NaN"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
\ No newline at end of file
+    feature_importances_single = np.array([0.5])
+    result_single = feature_importance_ratio(feature_importances_single)
+    assert result_single == 1.0, "Feature importance ratio with single value should return 1.0"
+
+    feature_importances_equal = np.array([0.3, 0.3, 0.3])
+    result_equal = feature_importance_ratio(feature_importances_equal)
+    assert result_equal == 1.0, "Feature importance ratio with equal values should return 1.0"
\ No newline at end of file
diff --git a/tests/faivor/metrics/regression/test_fairness.py b/tests/faivor/metrics/regression/test_fairness.py
index addb342..5f56f55 100644
--- a/tests/faivor/metrics/regression/test_fairness.py
+++ b/tests/faivor/metrics/regression/test_fairness.py
@@ -1,24 +1,25 @@
 import pytest
 import numpy as np
-import torch
-from faivor.metrics.regression.fairness import RegressionFairnessMetrics
+from faivor.metrics.regression.fairness import demographic_parity_ratio
-# Sample Regression Data (Same as in original test.py, but smaller and more suitable for unit tests)
-y_true_reg = np.array([3, -0.5, 2, 7, 4.2, 1, 9])
-y_pred_reg = np.array([2.5, 0.0, 2.1, 7.8, 3.9, 1.1, 8.5])
-sensitive_attribute_reg = np.array([0, 1, 0, 1, 0, 1, 0])
+# sample regression data
+y_true_reg = np.array([3, 0.5, 2, 7, 4.2, 1])
+y_pred_reg = np.array([2.5, 0.01, 2.1, 7.8, 3.9, 1.1])
-metrics = RegressionFairnessMetrics()
+def test_demographic_parity_ratio():
+    sensitive_attribute = np.array(['A', 'A', 'B', 'B', 'A', 'B'])
+    result = demographic_parity_ratio(y_true_reg, y_pred_reg, sensitive_attribute)
+    assert result is not None, "Demographic parity ratio returned None"
-def test_all_fairness_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_demographic_parity_ratio":
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg, sensitive_attribute_reg)
-                else:
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg)
-                assert result is not None, f"Metric {name} returned None"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
\ No newline at end of file
+    sensitive_attribute_single_group = np.array(['A', 'A', 'A', 'A', 'A', 'A'])
+    result_single_group = demographic_parity_ratio(y_true_reg, y_pred_reg, sensitive_attribute_single_group)
+    assert np.isnan(result_single_group), "Demographic parity ratio with single group should return NaN"
+
+    sensitive_attribute_empty_group = np.array(['A', 'A', 'B', 'B', 'A', 'C']) # group C has only one sample
+    result_empty_group = demographic_parity_ratio(y_true_reg, y_pred_reg, sensitive_attribute_empty_group)
+    assert not np.isnan(result_empty_group), "Demographic parity ratio with an underrepresented group should not return NaN if there are other groups"
+
+    y_pred_all_nan = np.array([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
+    result_nan_pred = demographic_parity_ratio(y_true_reg, y_pred_all_nan, sensitive_attribute)
+    assert np.isnan(result_nan_pred), "Demographic parity ratio with nan predictions should return NaN"
\ No newline at end of file
diff --git a/tests/faivor/metrics/regression/test_performance.py b/tests/faivor/metrics/regression/test_performance.py
index 236cf2d..2bdf679 100644
--- a/tests/faivor/metrics/regression/test_performance.py
+++ b/tests/faivor/metrics/regression/test_performance.py
@@ -1,28 +1,33 @@
 import pytest
 import numpy as np
-import torch
 from sklearn import metrics as skm
-from faivor.metrics.regression.performance import RegressionPerformanceMetrics
-# Sample Regression Data
+from faivor.metrics.regression import metrics
+
+# sample regression data
 y_true_reg = np.array([3, 0.5, 2, 7, 4.2, 1])
 y_pred_reg = np.array([2.5, 0.01, 2.1, 7.8, 3.9, 1.1])
-metrics = RegressionPerformanceMetrics()
-
 def test_all_performance_metrics():
-    for name in dir(metrics):
-        if not name.startswith("_") and callable(getattr(metrics, name)):
-            try:
-                if name == "custom_mean_percentage_error":
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg)
-                elif name == "mean_pinball_loss": # The metric is currently using the last _WHITELISTED_METRICS
-                    assert getattr(metrics, name)(y_true_reg, y_pred_reg) == skm.mean_pinball_loss(y_true_reg, y_pred_reg)
-                elif name == "r2_score":
-                    assert getattr(metrics, name)(y_true_reg, y_pred_reg) == skm.r2_score(y_true_reg, y_pred_reg)
-                else:
-                    result = getattr(metrics, name)(y_true_reg, y_pred_reg)
-                assert result is not None, f"Metric {name} returned None"
-            except Exception as e:
-                pytest.fail(f"Metric {name} raised an exception: {e}")
+    sklearn_metrics_to_compare = {
+        "mean_absolute_error": skm.mean_absolute_error,
+        "mean_squared_error": skm.mean_squared_error,
+        "r2_score": skm.r2_score,
+    } # metrics with a direct sklearn counterpart to compare against
+
+    for metric in metrics.performance: # loop through all the performance metrics we loaded
+        try:
+            # Calculate the metric using the defined function
+            result = metric.compute(y_true_reg, y_pred_reg) # every regression performance metric here takes (y_true, y_pred)
+
+            assert result is not None, f"Metric {metric.regular_name} returned None"
+
+            # Compare with sklearn metric if applicable
+            if metric.function_name in sklearn_metrics_to_compare:
+                sklearn_func = sklearn_metrics_to_compare[metric.function_name]
+                sklearn_result = sklearn_func(y_true_reg, y_pred_reg)
+                assert np.allclose(result, sklearn_result), f"Metric {metric.regular_name} result does not match sklearn"
+
+        except Exception as e:
+            pytest.fail(f"Metric {metric.regular_name} raised an exception: {e}")
\ No newline at end of file
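
Note on the classification explainability tests: tests/faivor/metrics/classification/test_explainability.py imports confidence_score and margin_of_confidence from faivor.metrics.classification.explainability, but the explainability.py hunk in this diff only adds prediction_entropy. A minimal sketch of what those two helpers could look like, inferred from the behaviour the tests assert; the names, signatures, and NaN conventions mirror prediction_entropy and are assumptions, not part of this diff:

import numpy as np

def confidence_score(y_prob) -> float:
    # Hypothetical helper: average of the highest predicted probability per sample.
    y_prob = np.asarray(y_prob)
    if y_prob.size == 0:
        return np.nan
    if y_prob.ndim == 1:  # binary case: probabilities of the positive class
        y_prob = np.vstack([1 - y_prob, y_prob]).T
    if np.any(y_prob < 0) or np.any(y_prob > 1):
        return np.nan
    return float(np.mean(np.max(y_prob, axis=1)))

def margin_of_confidence(y_prob) -> float:
    # Hypothetical helper: average absolute margin between the two class probabilities (binary only).
    y_prob = np.asarray(y_prob)
    if y_prob.size == 0:
        return np.nan
    if y_prob.ndim == 1:
        y_prob = np.vstack([1 - y_prob, y_prob]).T
    if y_prob.shape[1] != 2:  # the tests expect NaN for multiclass input
        return np.nan
    if np.any(y_prob < 0) or np.any(y_prob > 1):
        return np.nan
    return float(np.mean(np.abs(y_prob[:, 1] - y_prob[:, 0])))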
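Likewise, tests/faivor/metrics/classification/test_fairness.py imports statistical_parity_difference and equal_opportunity_difference, while the classification fairness hunk only adds disparate_impact. A sketch consistent with the assertions in those tests; the max-minus-min convention and the favorable_outcome parameter are assumptions:

import numpy as np

def statistical_parity_difference(y_true, y_pred, sensitive_attribute, favorable_outcome=1) -> float:
    # Hypothetical helper: largest gap in favorable-outcome rates between sensitive groups.
    # y_true is accepted for signature consistency but is not needed for prediction parity.
    y_pred = np.asarray(y_pred)
    sensitive_attribute = np.asarray(sensitive_attribute)
    groups = np.unique(sensitive_attribute)
    if len(groups) < 2:
        return np.nan
    rates = [np.mean(y_pred[sensitive_attribute == g] == favorable_outcome) for g in groups]
    return float(np.max(rates) - np.min(rates))

def equal_opportunity_difference(y_true, y_pred, sensitive_attribute, favorable_outcome=1) -> float:
    # Hypothetical helper: largest gap in true-positive rates between sensitive groups.
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    sensitive_attribute = np.asarray(sensitive_attribute)
    groups = np.unique(sensitive_attribute)
    if len(groups) < 2:
        return np.nan
    tprs = []
    for g in groups:
        mask = (sensitive_attribute == g) & (y_true == favorable_outcome)
        if mask.sum() == 0:  # no true positives in this group; skip it
            continue
        tprs.append(np.mean(y_pred[mask] == favorable_outcome))
    if len(tprs) < 2:  # not enough groups with positives to compare
        return np.nan
    return float(np.max(tprs) - np.min(tprs))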