Merge branch 'master' into dp_sf_pap

broadinstitute · Jan 12, 2024 · b15ef4d · b15ef4d
2 parents 618d857 + 982b56b
commit b15ef4d
Show file tree

Hide file tree

Showing 33 changed files with 612 additions and 65 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -15,3 +15,10 @@ model_zoo/ECG2AF/km.jpg filter=lfs diff=lfs merge=lfs -text
 model_zoo/ECG2AF/study_design.jpg filter=lfs diff=lfs merge=lfs -text
 model_zoo/ECG2AF/architecture.png filter=lfs diff=lfs merge=lfs -text
 model_zoo/ECG2AF/salience.jpg filter=lfs diff=lfs merge=lfs -text
+model_zoo/cardiac_mri_derived_left_ventricular_mass/Lseg.png filter=lfs diff=lfs merge=lfs -text
+model_zoo/cardiac_mri_derived_left_ventricular_mass/Lreg.png filter=lfs diff=lfs merge=lfs -text
+model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/TrainingAndTestSets.jpg filter=lfs diff=lfs merge=lfs -text
+model_zoo/liver_fat_from_mri_ukb/liver_fat_from_echo_teacher_model.png filter=lfs diff=lfs merge=lfs -text
+model_zoo/liver_fat_from_mri_ukb/liver_fat_from_ideal_student_model.png filter=lfs diff=lfs merge=lfs -text
+model_zoo/ECG_PheWAS/ukb_phewas.png filter=lfs diff=lfs merge=lfs -text
+model_zoo/dropfuse/overview.png filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
@@ -2,11 +2,11 @@
 `ML4H` is a toolkit for machine learning on clinical data of all kinds including genetics, labs, imaging, clinical notes, and more. The diverse data modalities of biomedicine offer different perspectives on the underlying challenge of understanding human health. For this reason, `ML4H` is built on a foundation of multimodal multitask modeling, hoping to leverage all available data to help power research and inform clinical care. Our tools help apply clinical research standards to ML models by carefully considering bias and longitudinal outcomes. Our project grew out of efforts at the Broad Institute to make it easy to work with the UK Biobank on the Google Cloud Platform and has since expanded to include proprietary data from academic medical centers. To put cutting-edge AI and ML to use making the world healthier, we're fostering interdisciplinary collaborations across industry and academia.  We'd love to work with you too!    
 
 `ML4H` is best described with Five Verbs: Ingest, Tensorize, TensorMap, Model, Evaluate
-* Ingest: collect files onto one system
-* Tensorize: write raw files (XML, DICOM, NIFTI, PNG) into HD5 files
-* TensorMap: tag data (typically from an HD5) with an interpretation and a method for generation
-* ModelFactory: connect TensorMaps with a trainable architectures
-* Evaluate: generate plots that enable domain-driven inspection of models and results
+* **Ingest**: collect files onto one system
+* **Tensorize**: write raw files (XML, DICOM, NIFTI, PNG) into HD5 files
+* **TensorMap**: tag data (typically from an HD5) with an interpretation and a method for generation
+* **ModelFactory**: connect TensorMaps with a trainable neural network architecture, loss function, and optimization strategy
+* **Evaluate**: generate plots that enable domain-driven inspection of models and results
 
 # Getting Started
 * [Setting up your local environment](#setting-up-your-local-environment)

diff --git a/docker/vm_boot_images/config/tensorflow-requirements.txt b/docker/vm_boot_images/config/tensorflow-requirements.txt
@@ -43,5 +43,4 @@ google-cloud-storage
 umap-learn[plot]
 neurite
 voxelmorph
-pystrum
-
+pystrum
diff --git a/ml4h/TensorMap.py b/ml4h/TensorMap.py
@@ -204,8 +204,6 @@ def __init__(
         elif self.activation is None and (self.is_survival_curve() or self.is_time_to_event()):
             self.activation = 'sigmoid'
 
-
-
         if self.channel_map is None and self.is_time_to_event():
             self.channel_map = DEFAULT_TIME_TO_EVENT_CHANNELS
 

diff --git a/ml4h/arguments.py b/ml4h/arguments.py
@@ -385,7 +385,7 @@ def parse_args():
     )
 
     # Arguments for explorations/infer_stats_from_segmented_regions
-    parser.add_argument('--analyze_ground_truth', default=False, action='store_true', help='Filter by images with ground truth segmentations, for comparison')
+    parser.add_argument('--analyze_ground_truth', default=False, action='store_true', help='Whether or not to filter by images with ground truth segmentations, for comparison')
     parser.add_argument('--structures_to_analyze', nargs='*', default=[], help='Structure names to include in the .tsv files and scatter plots')
     parser.add_argument('--erosion_radius', default=1, type=int, help='Radius of the unit disk structuring element for erosion preprocessing')
     parser.add_argument('--intensity_thresh', type=float, help='Threshold value for preprocessing')

diff --git a/ml4h/data_descriptions.py b/ml4h/data_descriptions.py
@@ -5,6 +5,7 @@
 from typing import Callable, List, Union, Optional, Tuple, Dict, Any
 
 import h5py
+import datetime
 import numcodecs
 import numpy as np
 import pandas as pd
@@ -331,10 +332,9 @@ def __init__(
     ):
         """
         Gets data from a column of the provided DataFrame.
-        :param df: Must be multi-indexed with sample_id, loading_option
-        # TODO: allow multiple loading options
         :param col: The column name to get data from
         :param process_col: Function to turn the column value into Tensor
+        :param name: Optional override for the df column name
         """
         self.process_col = process_col or self._default_process_call
         self.df = df

diff --git a/ml4h/models/legacy_models.py b/ml4h/models/legacy_models.py
@@ -350,7 +350,7 @@ def make_hidden_layer_model(parent_model: Model, tensor_maps_in: List[TensorMap]
     dummy_input = {tm.input_name(): np.zeros((1,) + parent_model.get_layer(tm.input_name()).input_shape[0][1:]) for tm in tensor_maps_in}
     intermediate_layer_model = Model(inputs=parent_inputs, outputs=target_layer.output)
     # If we do not predict here then the graph is disconnected, I do not know why?!
-    intermediate_layer_model.predict(dummy_input)
+    intermediate_layer_model.predict(dummy_input, verbose=0)
     return intermediate_layer_model
 
 
@@ -1344,7 +1344,7 @@ def make_paired_autoencoder_model(
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 def embed_model_predict(model, tensor_maps_in, embed_layer, test_data, batch_size):
     embed_model = make_hidden_layer_model(model, tensor_maps_in, embed_layer)
-    return embed_model.predict(test_data, batch_size=batch_size)
+    return embed_model.predict(test_data, batch_size=batch_size, verbose=0)
 
 
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/ml4h/plots.py b/ml4h/plots.py
@@ -770,7 +770,7 @@ def plot_scatter(
 
     ax1.set_xlabel("Predictions")
     ax1.set_ylabel("Actual")
-    ax1.set_title(title)
+    ax1.set_title(f'{title} N = {len(prediction)}' )
     ax1.legend(loc="lower right")
 
     sns.distplot(prediction, label="Predicted", color="r", ax=ax2)
@@ -2253,7 +2253,7 @@ def plot_ecg_rest(
     tensor_paths: List[str],
     rows: List[int],
     out_folder: str,
-    is_blind: bool,
+    is_blind: bool
 ) -> None:
     """Plots resting ECGs including annotations and LVH criteria
-Original file line number
+Diff line change
@@ Expand Up / @@ -43,5 +43,4 @@ google-cloud-storage @@
     umap-learn[plot]
     neurite
     voxelmorph
-    pystrum
+    pystrum