diff --git a/doc/bulk_statistics/index.rst b/doc/bulk_statistics/index.rst index bb06de32..2560e3bb 100644 --- a/doc/bulk_statistics/index.rst +++ b/doc/bulk_statistics/index.rst @@ -2,9 +2,14 @@ Compute bulk statistics ########################## +Bulk statistics allow for a wide range of properties of detected objects to be calculated during feature detection and segmentation or as a postprocessing step. +The :py:meth:`tobac.utils.bulk_statistics.get_statistics_from_mask` function applies one or more functions over one or more data fields for each detected object. +For example, one could calculate the convective mass flux for each detected feature by providing fields of vertical velocity, cloud water content and area. +Numpy-like broadcasting is supported, allowing 2D and 3D data to be combined. + .. toctree:: - :maxdepth: 2 + :maxdepth: 1 - notebooks/compute_statistics_during_feature_detection_example - notebooks/compute_statistics_during_segmentation_example + notebooks/compute_statistics_during_feature_detection + notebooks/compute_statistics_during_segmentation notebooks/compute_statistics_postprocessing_example diff --git a/doc/index.rst b/doc/index.rst index 7cfc5888..d008e946 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -67,7 +67,7 @@ The project is currently being extended by several contributors to include addit .. toctree:: - :caption: Compute bulk statistics online or in postprocessing + :caption: Compute bulk statistics :maxdepth: 2 bulk_statistics/index diff --git a/doc/tobac.rst b/doc/tobac.rst index a3b011d4..87cd45ab 100644 --- a/doc/tobac.rst +++ b/doc/tobac.rst @@ -71,7 +71,7 @@ tobac.tracking module tobac.utils modules ------------------ -tobac.utils.general modules +tobac.utils.general module ------------------ .. automodule:: tobac.utils.general @@ -79,7 +79,15 @@ tobac.utils.general modules :undoc-members: :show-inheritance: -tobac.utils.mask modules +tobac.utils.bulk_statistics module +------------------ + +.. 
automodule:: tobac.utils.bulk_statistics + :members: + :undoc-members: + :show-inheritance: + +tobac.utils.mask module ------------------ .. automodule:: tobac.utils.mask diff --git a/tobac/utils/bulk_statistics.py b/tobac/utils/bulk_statistics.py index f27f9fd5..adc782cc 100644 --- a/tobac/utils/bulk_statistics.py +++ b/tobac/utils/bulk_statistics.py @@ -25,38 +25,54 @@ def get_statistics( default: Union[None, float] = None, id_column: str = "feature", ) -> pd.DataFrame: - """ - Get bulk statistics for objects (e.g. features or segmented features) given a labelled mask of the objects - and any input field with the same dimensions. + """Get bulk statistics for objects (e.g. features or segmented features) + given a labelled mask of the objects and any input field with the same + dimensions or that can be broadcast with labels according to numpy-like + broadcasting rules. - The statistics are added as a new column to the existing feature dataframe. Users can specify which statistics are computed by - providing a dictionary with the column name of the metric and the respective function. + The statistics are added as a new column to the existing feature dataframe. + Users can specify which statistics are computed by providing a dictionary + with the column name of the metric and the respective function. Parameters ---------- + features: pd.DataFrame + Dataframe with features or segmented features (output from feature + detection or segmentation), which can be for the specific timestep or + for the whole dataset + labels : np.ndarray[int] - Mask with labels of each regions to apply function to (e.g. output of segmentation for a specific timestep) + Mask with labels of each region to apply function to (e.g. output of + segmentation for a specific timestep) + *fields : tuple[np.ndarray] - Fields to give as arguments to each function call. Must have the same shape as labels. 
- features: pd.DataFrame - Dataframe with features or segmented features (output from feature detection or segmentation) - can be for the specific timestep or for the whole dataset + Fields to give as arguments to each function call. If the shape does not + match that of labels, numpy-style broadcasting will be applied. + statistic: dict[str, Callable], optional (default: {'ncells':np.count_nonzero}) - Dictionary with function(s) to apply over each region as values and the name of the respective statistics as keys - default is to just count the number of cells associated with each feature and write it to the feature dataframe + Dictionary with function(s) to apply over each region as values and the + name of the respective statistics as keys. Default is to just count the + number of cells associated with each feature and write it to the feature + dataframe. + index: None | list[int], optional (default: None) list of indices of regions in labels to apply function to. If None, will - default to all integer feature labels in labels + default to all integer feature labels in labels. + default: None | float, optional (default: None) - default value to return in a region that has no values + default value to return in a region that has no values. + id_column: str, optional (default: "feature") - Name of the column in feature dataframe that contains IDs that match with the labels in mask. The default is the column "feature". + Name of the column in feature dataframe that contains IDs that match with + the labels in mask. The default is the column "feature". - Returns: - ------- - features: pd.DataFrame - Updated feature dataframe with bulk statistics for each feature saved in a new column + Returns + ------- + features: pd.DataFrame + Updated feature dataframe with bulk statistics for each feature saved + in a new column. 
""" + # if mask and input data dimensions do not match we can broadcast using numpy broadcasting rules for field in fields: if labels.shape != field.shape: @@ -157,36 +173,48 @@ def get_statistics_from_mask( default: Union[None, float] = None, id_column: str = "feature", ) -> pd.DataFrame: - """ - Derives bulk statistics for each object in the segmentation mask. + """Derives bulk statistics for each object in the segmentation mask, and + returns a features Dataframe with these properties for each feature. + Parameters + ---------- + features: pd.DataFrame + Dataframe with segmented features (output from feature detection or + segmentation). Timesteps need not be exactly the same as in segmentation + mask, but all labels in the mask need to be present in the feature + dataframe. - Parameters: - ----------- segmentation_mask : xr.DataArray Segmentation mask output + *fields : xr.DataArray[np.ndarray] - Field(s) with input data. Needs to have the same dimensions as the segmentation mask. - features: pd.DataFrame - Dataframe with segmented features (output from feature detection or segmentation). - Timesteps must not be exactly the same as in segmentation mask but all labels in the mask need to be present in the feature dataframe. + Field(s) with input data. If field does not have a time dimension it + will be considered time invariant, and the entire field will be passed + for each time step in segmentation_mask. If the shape does not match + that of segmentation_mask, numpy-style broadcasting will be applied. + statistic: dict[str, Callable], optional (default: {'ncells':np.count_nonzero}) - Dictionary with function(s) to apply over each region as values and the name of the respective statistics as keys - default is to just count the number of cells associated with each feature and write it to the feature dataframe + Dictionary with function(s) to apply over each region as values and the + name of the respective statistics as keys. 
Default is to calculate the + mean value of the field over each feature. + index: None | list[int], optional (default: None) list of indexes of regions in labels to apply function to. If None, will - default to all integers between 1 and the maximum value in labels + default to all integers between 1 and the maximum value in labels + default: None | float, optional (default: None) default value to return in a region that has no values - id_column: str, optional (default: "feature") - Name of the column in feature dataframe that contains IDs that match with the labels in mask. The default is the column "feature". + id_column: str, optional (default: "feature") + Name of the column in feature dataframe that contains IDs that match + with the labels in mask. The default is the column "feature". - Returns: - ------- - features: pd.DataFrame - Updated feature dataframe with bulk statistics for each feature saved in a new column + Returns + ------- + features: pd.DataFrame + Updated feature dataframe with bulk statistics for each feature saved in a new column """ + # check that mask and input data have the same dimensions for field in fields: if segmentation_mask.shape != field.shape: