Merge branch 'refs/heads/pep_benchmarking'

# Conflicts: # poetry.lock # pyproject.toml # src/biopsykit/io/tfm.py # src/biopsykit/utils/array_handling.py # src/biopsykit/utils/exceptions.py
mad-lab-fau · Jan 3, 2025 · f996f32 · f996f32
2 parents 2117b2c + 17d33fe
commit f996f32
Show file tree

Hide file tree

Showing 64 changed files with 24,388 additions and 27 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,8 @@ mne = {version = "^1.2.1", optional = true}
 IPython = {version = "^7", optional = true}
 ipywidgets = {version = "^8", optional = true}
 ipympl = {version = "^0.9", optional = true}
+tpcp = ">=1"
+ts2vg = "^1.2.4"
 statannotations = "^0.7.1"
 
 [tool.poetry.extras]

diff --git a/src/biopsykit/io/tfm.py b/src/biopsykit/io/tfm.py
@@ -2,9 +2,8 @@
 from typing import ClassVar, Optional
 
 import pandas as pd
-from scipy.io import loadmat
-
 from biopsykit.utils._types import path_t
+from scipy.io import loadmat
 
 
 class TFMDataset:
@@ -43,20 +42,15 @@ def from_mat_file(
         # channel_mapping: Optional[Dict[str, str]] = None,
         tz: Optional[str] = "Europe/Berlin",
     ):
-        """Load TFM data from a .mat file.
+        """Load a TFM dataset from a .mat file.
 
         Parameters
         ----------
-        path : str
-            Path to the .mat file containing the TFM data.
+        path : str or :class:`~pathlib.Path`
+            Path to the .mat file.
         tz : str, optional
             Timezone of the data. Default: "Europe/Berlin"
 
-        Returns
-        -------
-        :class:`~biopsykit.io.tfm.TFMDataset`
-            TFM dataset object.
-
         """
         data = loadmat(path, struct_as_record=False, squeeze_me=True)
         data_raw = data["RAW_SIGNALS"]
@@ -65,13 +59,13 @@ def from_mat_file(
         data_dict = {key: getattr(data_raw, value) for key, value in cls.CHANNEL_MAPPING.items()}
         return cls(data_dict=data_dict, tz=tz, sampling_rate_dict={})
 
-    def data_as_df(self):
-        """Return data as a single pandas DataFrame.
+    def data_as_df(self) -> dict[str, pd.DataFrame]:
+        """Return the TFM data as a dictionary of pandas DataFrames.
 
         Returns
         -------
-        :class:`~pandas.DataFrame`
-            Data as a single pandas DataFrame.
+        dict
+            Dictionary containing the TFM data as pandas DataFrames. Keys are channel names, values are the dataframes.
 
         """
         return self._data
diff --git a/src/biopsykit/signals/__init__.py b/src/biopsykit/signals/__init__.py
@@ -1,4 +1,4 @@
 """Module for processing different types of biosignals."""
-from biopsykit.signals import ecg, eeg, imu, rsp
+from biopsykit.signals import ecg, eeg, icg, imu, rsp
 
-__all__ = ["ecg", "eeg", "imu", "rsp"]
+__all__ = ["ecg", "eeg", "imu", "rsp", "icg"]
diff --git a/src/biopsykit/signals/_base_extraction.py b/src/biopsykit/signals/_base_extraction.py
@@ -0,0 +1,41 @@
+from typing import Literal, get_args
+
+import pandas as pd
+from tpcp import Algorithm
+
+HANDLE_MISSING_EVENTS = Literal["raise", "warn", "ignore"]
+
+
+class CanHandleMissingEventsMixin(Algorithm):
+    """Mixin that equips an algorithm with a configurable policy for missing events.
+
+    The mixin itself only stores the policy and offers a helper to validate it; reacting to missing events is left
+    to the algorithm that uses the mixin.
+    """
+
+    def __init__(self, handle_missing_events: HANDLE_MISSING_EVENTS = "warn"):
+        """Store the policy for handling missing events.
+
+        Parameters
+        ----------
+        handle_missing_events : one of {"warn", "raise", "ignore"}, optional
+            How missing data in the input dataframes is meant to be handled: "warn" to issue a warning, "raise" to
+            raise an exception, "ignore" to silently continue.
+            Default: "warn"
+
+        """
+        self.handle_missing_events = handle_missing_events
+
+    def _check_valid_missing_handling(self):
+        """Raise a :exc:`ValueError` if ``handle_missing_events`` is not one of the allowed values."""
+        allowed_values = get_args(HANDLE_MISSING_EVENTS)
+        policy = self.handle_missing_events
+        if policy in allowed_values:
+            return
+        raise ValueError(
+            f"Invalid value '{policy}' for 'handle_missing_events'. " f"Must be one of {allowed_values}."
+        )
+
+
+class BaseExtraction(Algorithm):
+    """Common interface of all fiducial point extraction algorithms.
+
+    Attributes
+    ----------
+    points_ : :class:`~pandas.DataFrame`
+        Positions of the extracted points.
+
+    """
+
+    # result attribute
+    points_: pd.DataFrame
+
+    # name of the method that runs the algorithm
+    _action_methods = "extract"
diff --git a/src/biopsykit/signals/_dtypes.py b/src/biopsykit/signals/_dtypes.py
@@ -0,0 +1,13 @@
+import pandas as pd
+from biopsykit.utils.exceptions import ValidationError
+
+__all__ = ["assert_sample_columns_int"]
+
+
+def assert_sample_columns_int(data: pd.DataFrame) -> None:
+    """Check that every column whose name contains "_sample" has an integer dtype.
+
+    Parameters
+    ----------
+    data : :class:`~pandas.DataFrame`
+        Dataframe to check.
+
+    Raises
+    ------
+    :exc:`~biopsykit.utils.exceptions.ValidationError`
+        If ``data`` has no column with "_sample" in its name, or if one of these columns is not of integer dtype.
+
+    """
+    has_sample_column = any(data.columns.str.contains("_sample"))
+    if not has_sample_column:
+        raise ValidationError("DataFrame does not contain any columns with '_sample' in their name!")
+
+    # walk lazily over the sample columns so that the first offending column (in column order) is reported
+    sample_columns = (name for name in data.columns if "_sample" in name)
+    for name in sample_columns:
+        if pd.api.types.is_integer_dtype(data[name]):
+            continue
+        raise ValidationError(f"Column '{name}' is not of type 'int'!")
diff --git a/src/biopsykit/signals/ecg/__init__.py b/src/biopsykit/signals/ecg/__init__.py
@@ -1,5 +1,5 @@
 """Module for ECG data analysis and visualization."""
-from biopsykit.signals.ecg import plotting
+from biopsykit.signals.ecg import event_extraction, plotting, preprocessing, segmentation
 from biopsykit.signals.ecg.ecg import EcgProcessor
 
-__all__ = ["EcgProcessor", "plotting"]
+__all__ = ["EcgProcessor", "plotting", "preprocessing", "segmentation", "event_extraction"]
diff --git a/src/biopsykit/signals/ecg/event_extraction/__init__.py b/src/biopsykit/signals/ecg/event_extraction/__init__.py
@@ -0,0 +1,16 @@
+"""Module for ECG event extraction."""
+from biopsykit.signals.ecg.event_extraction._base_ecg_extraction import BaseEcgExtraction
+from biopsykit.signals.ecg.event_extraction._q_peak_forounzafar2018 import QPeakExtractionForouzanfar2018
+from biopsykit.signals.ecg.event_extraction._q_peak_martinez2004_neurokit import QPeakExtractionMartinez2004Neurokit
+from biopsykit.signals.ecg.event_extraction._q_peak_scipy_findpeaks_neurokit import (
+    QPeakExtractionSciPyFindPeaksNeurokit,
+)
+from biopsykit.signals.ecg.event_extraction._q_peak_vanlien2013 import QPeakExtractionVanLien2013
+
+__all__ = [
+    "BaseEcgExtraction",
+    "QPeakExtractionVanLien2013",
+    "QPeakExtractionMartinez2004Neurokit",
+    "QPeakExtractionSciPyFindPeaksNeurokit",
+    "QPeakExtractionForouzanfar2018",
+]
diff --git a/src/biopsykit/signals/ecg/event_extraction/_base_ecg_extraction.py b/src/biopsykit/signals/ecg/event_extraction/_base_ecg_extraction.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+__all__ = ["BaseEcgExtraction"]
+
+from biopsykit.signals._base_extraction import BaseExtraction
+
+
+class BaseEcgExtraction(BaseExtraction):
+    """Interface for algorithms that extract events from an ECG signal."""
+
+    def extract(
+        self,
+        *,
+        ecg: pd.Series,
+        heartbeats: pd.DataFrame,
+        sampling_rate_hz: float,
+    ):
+        """Extract events from the ECG signal; must be overridden by every subclass.
+
+        Parameters
+        ----------
+        ecg : :class:`~pandas.Series`
+            ECG signal
+        heartbeats : :class:`~pandas.DataFrame`
+            Segmented heartbeats
+        sampling_rate_hz : float
+            Sampling rate of the ECG signal in Hz
+
+        Raises
+        ------
+        NotImplementedError
+            Always, because this base class does not provide an implementation.
+
+        """
+        message = "This is an abstract method and needs to be implemented in a subclass."
+        raise NotImplementedError(message)
diff --git a/src/biopsykit/signals/ecg/event_extraction/_q_peak_forounzafar2018.py b/src/biopsykit/signals/ecg/event_extraction/_q_peak_forounzafar2018.py
@@ -0,0 +1,94 @@
+import warnings
+
+import numpy as np
+import pandas as pd
+from biopsykit.signals._base_extraction import HANDLE_MISSING_EVENTS, CanHandleMissingEventsMixin
+from biopsykit.signals._dtypes import assert_sample_columns_int
+from biopsykit.signals.ecg.event_extraction._base_ecg_extraction import BaseEcgExtraction
+from biopsykit.utils._datatype_validation_helper import _assert_has_columns, _assert_is_dtype
+from biopsykit.utils.array_handling import sanitize_input_series
+from biopsykit.utils.exceptions import EventExtractionError
+from tpcp import Parameter
+
+
+class QPeakExtractionForouzanfar2018(BaseEcgExtraction, CanHandleMissingEventsMixin):
+    """Algorithm by Forouzanfar et al. (2018) for Q-peak extraction.
+
+    For each heartbeat, the Q-peak is the last sample between the heartbeat start and the R-peak whose amplitude is
+    below ``-1.2 * <R-peak amplitude> / scaling_factor``.
+    """
+
+    scaling_factor: Parameter[float]
+
+    def __init__(self, scaling_factor: float = 2000, handle_missing_events: HANDLE_MISSING_EVENTS = "warn"):
+        """Initialize new QPeakExtractionForouzanfar2018 algorithm instance.
+
+        Parameters
+        ----------
+        scaling_factor : float, optional
+            Scaling factor for the threshold used to detect the Q-peak. Default: 2000
+        handle_missing_events : one of {"warn", "raise", "ignore"}, optional
+            How to handle heartbeats for which no Q-peak could be detected. Default: "warn"
+        """
+        super().__init__(handle_missing_events=handle_missing_events)
+        self.scaling_factor = scaling_factor
+
+    # @make_action_safe
+    def extract(
+        self,
+        *,
+        ecg: pd.DataFrame,
+        heartbeats: pd.DataFrame,
+        sampling_rate_hz: int,  # noqa: ARG002
+    ):
+        """Extract Q-peaks from given ECG cleaned signal.
+
+        The results are saved in the ``points_`` attribute of the super class: one row per heartbeat with the columns
+        ``q_peak_sample`` (missing if no Q-peak was found) and ``nan_reason``.
+
+        Parameters
+        ----------
+        ecg: :class:`~pandas.DataFrame`
+            ECG signal
+        heartbeats: :class:`~pandas.DataFrame`
+            DataFrame containing one row per segmented heartbeat, each row contains start, end, and R-peak
+            location (in samples from beginning of signal) of that heartbeat, index functions as id of heartbeat
+        sampling_rate_hz: int
+            Sampling rate of ECG signal in hz. Not used by this algorithm.
+
+        Returns
+        -------
+            self
+
+        Raises
+        ------
+        :exc:`~biopsykit.utils.exceptions.EventExtractionError`
+            If missing data is found and ``handle_missing_events`` is set to "raise"
+
+        """
+        self._check_valid_missing_handling()
+        ecg = sanitize_input_series(ecg, name="ecg")
+        ecg = ecg.squeeze()
+
+        q_peak_samples = []
+        nan_reasons = []
+        # ids of heartbeats for which no Q-peak could be detected
+        heartbeats_no_q = []
+
+        # search Q-peak for each heartbeat of the given signal.
+        # Only the two required columns are iterated: ``iterrows()`` would upcast each row to a common dtype, which
+        # turns the sample positions into floats as soon as ``heartbeats`` contains any float column.
+        for idx, start, r_peak in zip(heartbeats.index, heartbeats["start_sample"], heartbeats["r_peak_sample"]):
+            heartbeat_start = int(start)
+            r_peak_sample = int(r_peak)
+
+            # set an individual threshold for detecting the Q-peaks based on the R-peak
+            threshold = (-1.2 * ecg.iloc[r_peak_sample]) / self.scaling_factor
+
+            # search for the Q-peak as the last sample before the R-peak that is below the threshold
+            # (positional slicing, consistent with the positional R-peak lookup above)
+            ecg_before_r_peak = ecg.iloc[heartbeat_start:r_peak_sample].to_numpy()
+            ecg_below = np.where(ecg_before_r_peak < threshold)[0]
+
+            if len(ecg_below) == 0:
+                q_peak_samples.append(np.nan)
+                nan_reasons.append("no_value_below_threshold")
+                heartbeats_no_q.append(idx)
+                continue
+
+            q_peak_samples.append(heartbeat_start + int(ecg_below[-1]))
+            nan_reasons.append(np.nan)
+
+        # inform user about missing Q-values
+        if len(heartbeats_no_q) > 0:
+            missing_str = (
+                f"No Q-peak detected in {len(heartbeats_no_q)} heartbeats:\n"
+                f"- for heartbeats {heartbeats_no_q} no ECG value before the R-peak was below the threshold"
+            )
+            if self.handle_missing_events == "warn":
+                warnings.warn(missing_str, stacklevel=2)
+            elif self.handle_missing_events == "raise":
+                raise EventExtractionError(missing_str)
+
+        q_peaks = pd.DataFrame({"q_peak_sample": q_peak_samples, "nan_reason": nan_reasons}, index=heartbeats.index)
+
+        _assert_is_dtype(q_peaks, pd.DataFrame)
+        _assert_has_columns(q_peaks, [["q_peak_sample", "nan_reason"]])
+        q_peaks = q_peaks.astype({"q_peak_sample": "Int64", "nan_reason": "object"})
+        assert_sample_columns_int(q_peaks)
+
+        self.points_ = q_peaks
+        return self
diff --git a/src/biopsykit/signals/ecg/event_extraction/_q_peak_martinez2004_neurokit.py b/src/biopsykit/signals/ecg/event_extraction/_q_peak_martinez2004_neurokit.py
@@ -0,0 +1,134 @@
+import warnings
+
+import neurokit2 as nk
+import numpy as np
+import pandas as pd
+from biopsykit.signals._base_extraction import HANDLE_MISSING_EVENTS, CanHandleMissingEventsMixin
+from biopsykit.signals._dtypes import assert_sample_columns_int
+from biopsykit.signals.ecg.event_extraction._base_ecg_extraction import BaseEcgExtraction
+from biopsykit.utils._datatype_validation_helper import _assert_has_columns, _assert_is_dtype
+from biopsykit.utils.array_handling import sanitize_input_series
+from biopsykit.utils.exceptions import EventExtractionError
+
+
+class QPeakExtractionMartinez2004Neurokit(BaseEcgExtraction, CanHandleMissingEventsMixin):
+    """Algorithm by Martinez et al. (2004) for Q-peak extraction using the DWT method implemented in NeuroKit2."""
+
+    def __init__(self, handle_missing_events: HANDLE_MISSING_EVENTS = "warn"):
+        """Initialize new QPeakExtractionMartinez2004Neurokit algorithm instance.
+
+        Parameters
+        ----------
+        handle_missing_events : one of {"warn", "raise", "ignore"}, optional
+            How to handle heartbeats for which no valid Q-peak could be detected. Default: "warn"
+
+        """
+        super().__init__(handle_missing_events=handle_missing_events)
+
+    # @make_action_safe
+    def extract(
+        self,
+        *,
+        ecg: pd.DataFrame,
+        heartbeats: pd.DataFrame,
+        sampling_rate_hz: int,
+    ):
+        """Extract Q-peaks from given ECG cleaned signal.
+
+        The results are saved in the ``points_`` attribute of the super class: one row per heartbeat with the columns
+        ``q_peak_sample`` (missing if no valid Q-peak was found) and ``nan_reason``.
+
+        Parameters
+        ----------
+        ecg: :class:`~pandas.DataFrame`
+            ECG signal
+        heartbeats: :class:`~pandas.DataFrame`
+            DataFrame containing one row per segmented heartbeat, each row contains start, end, and R-peak
+            location (in samples from beginning of signal) of that heartbeat, index functions as id of heartbeat
+        sampling_rate_hz: int
+            Sampling rate of ECG signal in hz
+
+        Returns
+        -------
+            self
+
+        Raises
+        ------
+        :exc:`~biopsykit.utils.exceptions.EventExtractionError`
+            If missing data is found and ``handle_missing_events`` is set to "raise"
+
+        """
+        self._check_valid_missing_handling()
+
+        ecg = sanitize_input_series(ecg, name="ecg")
+        ecg = ecg.squeeze()
+
+        # result df
+        q_peaks = pd.DataFrame(index=heartbeats.index, columns=["q_peak_sample", "nan_reason"])
+
+        # used subsequently to store ids of heartbeats for which no valid Q-peak could be detected
+        heartbeats_no_q = []
+        heartbeats_q_after_r = []
+
+        # some neurokit functions (for example ecg_delineate()) don't work with r-peaks input as Series, so list instead
+        r_peaks = list(heartbeats["r_peak_sample"])
+
+        _, waves = nk.ecg_delineate(ecg, rpeaks=r_peaks, sampling_rate=sampling_rate_hz, method="dwt", show=False)
+
+        extracted_q_peaks = waves["ECG_Q_Peaks"]
+
+        # find heartbeat to which Q-peak belongs and save Q-peak position in corresponding row
+        for pos, q in enumerate(extracted_q_peaks):
+            # for some heartbeats, no Q can be detected, will be NaN in resulting df
+            if np.isnan(q):
+                # ``pos`` is only the position within the R-peak list; store the heartbeat id (index label) instead
+                heartbeats_no_q.append(heartbeats.index[pos])
+                continue
+
+            matching_heartbeats = heartbeats.loc[(heartbeats["start_sample"] < q) & (q < heartbeats["end_sample"])].index
+            if len(matching_heartbeats) == 0:
+                # Q-peak is not located within any heartbeat; affected heartbeats are reported below
+                continue
+            heartbeat_idx = matching_heartbeats[0]
+
+            # Q occurs after R, which is not valid
+            if heartbeats.loc[heartbeat_idx, "r_peak_sample"] < q:
+                heartbeats_q_after_r.append(heartbeat_idx)
+                q_peaks.loc[heartbeat_idx, "q_peak_sample"] = np.nan
+            # valid Q-peak found
+            else:
+                q_peaks.loc[heartbeat_idx, "q_peak_sample"] = q
+
+        # inform user about missing Q-values
+        is_missing = q_peaks["q_peak_sample"].isna()
+        num_missing = int(is_missing.sum())
+        if num_missing > 0:
+            # only heartbeats that actually ended up without a Q-peak get a reason assigned
+            missing_idx = q_peaks.index[is_missing.to_numpy()]
+            no_q_mask = missing_idx.isin(heartbeats_no_q)
+            q_after_r_mask = missing_idx.isin(heartbeats_q_after_r)
+            no_q_idx = missing_idx[no_q_mask]
+            q_after_r_idx = missing_idx[q_after_r_mask]
+            unexplained_idx = missing_idx[~no_q_mask & ~q_after_r_mask]
+
+            missing_str = f"No Q-peak detected in {num_missing} heartbeats:\n"
+            if len(no_q_idx) > 0:
+                q_peaks.loc[no_q_idx, "nan_reason"] = "no_q_peak"
+                missing_str += (
+                    f"- for heartbeats {no_q_idx.tolist()} the neurokit algorithm was not able to detect a Q-peak\n"
+                )
+            if len(q_after_r_idx) > 0:
+                q_peaks.loc[q_after_r_idx, "nan_reason"] = "q_after_r_peak"
+                missing_str += (
+                    f"- for heartbeats {q_after_r_idx.tolist()} the detected Q is invalid "
+                    f"because it occurs after the R-peak\n"
+                )
+            if len(unexplained_idx) > 0:
+                q_peaks.loc[unexplained_idx, "nan_reason"] = "no_q_peak_within_heartbeats"
+                missing_str += (
+                    f"- for {unexplained_idx.to_numpy()} apparently none of the found Q-peaks "
+                    f"were within these heartbeats"
+                )
+
+            if self.handle_missing_events == "warn":
+                warnings.warn(missing_str, stacklevel=2)
+            elif self.handle_missing_events == "raise":
+                raise EventExtractionError(missing_str)
+
+        _assert_is_dtype(q_peaks, pd.DataFrame)
+        _assert_has_columns(q_peaks, [["q_peak_sample", "nan_reason"]])
+        q_peaks = q_peaks.astype({"q_peak_sample": "Int64", "nan_reason": "object"})
+        assert_sample_columns_int(q_peaks)
+
+        self.points_ = q_peaks
+        return self