PRSQC WDL First Iteration (#177)

PRSQC WDL v0.1
broadinstitute · Jun 11, 2024 · 1e32078 · 1e32078
1 parent f211198
commit 1e32078
Show file tree

Hide file tree

Showing 2 changed files with 264 additions and 1 deletion.
diff --git a/.dockstore.yml b/.dockstore.yml
@@ -137,4 +137,7 @@ workflows:
     primaryDescriptorPath: /Utilities/WDLs/DetectPCANovelties.wdl
   - name: FindSamplesAndSimpleBenchmark
     subclass: WDL
-    primaryDescriptorPath: /BenchmarkVCFs/FindSamplesAndSimpleBenchmark.wdl
+    primaryDescriptorPath: /BenchmarkVCFs/FindSamplesAndSimpleBenchmark.wdl
+  - name: PRSQC
+    subclass: WDL
+    primaryDescriptorPath: /Utilities/WDLs/PRSQC.wdl
diff --git a/Utilities/WDLs/PRSQC.wdl b/Utilities/WDLs/PRSQC.wdl
@@ -0,0 +1,260 @@
+version 1.0
+
+# Simple PRS QC wdl for the PROGRESS VA project
+workflow PRSQC {
+    input {
+        String sample_name
+        File prs_control
+        File prs_sample
+        File control_acceptable_range
+        File sample_acceptable_range
+        File control_expected_pcs
+        Float margin_control_pcs
+        # This alphashape is essentially a bounding polygon containing the set of points in the form (PC1, PC2)
+        # from a training set, such as 1kG. It was generated using the library: https://github.com/bellockk/alphashape
+        # We have a script that allows generating alphashapes with an arbitrary training set (of valid format),
+        # which can be found here: https://github.com/broadinstitute/palantir-workflows/blob/main/Utilities/Dockers/Alphashape/generate_alphashape.py
+        # The main (and pretty much only one besides training data) parameter of interest is "alpha", which determines how tight
+        # the bounding polygon is. Our default is set at 8.0, which was determined to be a good value experimentally.
+        # Users who intend to use a training set different than the default (1kG) should be aware of this.
+        File alphashape
+        Boolean manually_passed_control = false
+    }
+
+    Int cpu = 1
+    Int mem_gb = 4
+    Int disk_size_gb = 32
+    String docker = "us.gcr.io/broad-dsde-methods/python-data-slim:1.0"
+
+    if(!manually_passed_control) {
+        call CheckScores as CheckControlScoreAgainstExpectedValue {
+            input:
+                scores = prs_control,
+                acceptable_range = control_acceptable_range,
+                cpu = cpu,
+                mem_gb = mem_gb,
+                disk_size_gb = disk_size_gb,
+                docker = docker
+        }
+
+        call CheckControlPCsAgainstExpectedValues {
+            input:
+                scores = prs_control,
+                expected_values = control_expected_pcs,
+                margin = margin_control_pcs,
+                cpu = cpu,
+                mem_gb = mem_gb,
+                disk_size_gb = disk_size_gb,
+                docker = docker
+        }
+    }
+
+    call CheckScores as CheckScoreWithinReportableRange {
+        input:
+            scores = prs_sample,
+            acceptable_range = sample_acceptable_range,
+            cpu = cpu,
+            mem_gb = mem_gb,
+            disk_size_gb = disk_size_gb,
+            docker = docker
+    }
+
+    call DetectPCANovelties as DetectPCANoveltiesSample {
+        input:
+            test_sample = prs_sample,
+            alphashape = alphashape,
+            cpu = cpu,
+            mem_gb = mem_gb,
+            disk_size_gb = disk_size_gb
+    }
+
+    output {
+        Boolean qc_passed = CheckScoreWithinReportableRange.qc_passed && DetectPCANoveltiesSample.pca_qc_passed && select_first([CheckControlScoreAgainstExpectedValue.qc_passed, true]) && select_first([CheckControlPCsAgainstExpectedValues.pca_qc_passed, true])
+        File? qc_failures_control = CheckControlScoreAgainstExpectedValue.qc_failures
+        File qc_failures_sample = CheckScoreWithinReportableRange.qc_failures
+    }
+}
+
+task CheckScores {
+    input {
+        File scores
+        File acceptable_range
+
+        Int cpu
+        Int mem_gb
+        Int disk_size_gb
+        String docker
+    }
+
+    String output_basename = basename(scores, ".tsv")
+
+    command <<<
+        set -euo pipefail
+
+        cat <<'EOF' > check_scores_against_expected_values.py
+        import pandas as pd
+
+        scores = pd.read_csv('~{scores}', sep = '\t', header = 0)
+        acceptable_range = pd.read_csv('~{acceptable_range}', sep = '\t', header = 0)
+
+        # Check whether PRS_SCORE and FULL_MODEL_SCORE are within the acceptable range
+        prs_score_passed = scores.loc[0, "PRS_SCORE"] >= acceptable_range.loc[0, "MIN"] and scores.loc[0, "PRS_SCORE"] <= acceptable_range.loc[0, "MAX"]
+        full_model_score_passed = scores.loc[0, "FULL_MODEL_SCORE"] >= acceptable_range.loc[1, "MIN"] and scores.loc[0, "FULL_MODEL_SCORE"] <= acceptable_range.loc[1, "MAX"]
+
+        with open('~{output_basename}.qc_passed.txt', 'w') as qc_passed:
+            if prs_score_passed and full_model_score_passed:
+                qc_passed.write("true\n")
+            else:
+                qc_passed.write("false\n")
+
+        with open('~{output_basename}.qc_failures.txt', 'w') as qc_failures:
+            if not prs_score_passed:
+                qc_failures.write("PRS_SCORE " + str(scores.loc[0, "PRS_SCORE"]) + " outside acceptable range with MIN: " + str(acceptable_range.loc[0, "MIN"]) + " , MAX: " + str(acceptable_range.loc[0, "MAX"]) + "\n")
+            if not full_model_score_passed:
+                qc_failures.write("FULL_MODEL_SCORE " + str(scores.loc[0, "FULL_MODEL_SCORE"]) + " outside acceptable range with MIN: " + str(acceptable_range.loc[1, "MIN"]) + " , MAX: " + str(acceptable_range.loc[1, "MAX"]) + "\n")
+
+        EOF
+        python3 check_scores_against_expected_values.py
+    >>>
+
+    runtime {
+        docker: docker
+        disks: "local-disk " + disk_size_gb + " HDD"
+        memory: mem_gb + " GiB"
+        cpu: cpu
+    }
+
+    output {
+        Boolean qc_passed = read_boolean("~{output_basename}.qc_passed.txt")
+        File qc_failures = "~{output_basename}.qc_failures.txt"
+    }
+}
+
+task CheckControlPCsAgainstExpectedValues {
+    input {
+        File scores
+        File expected_values
+        Float margin
+
+        Int cpu
+        Int mem_gb
+        Int disk_size_gb
+        String docker
+    }
+
+    String output_basename = basename(scores, ".tsv")
+
+    command <<<
+        set -euo pipefail
+
+        cat <<'EOF' > check_control_pcs_against_expected_values.py
+        import pandas as pd
+
+        scores = pd.read_csv('~{scores}', sep = '\t', header = 0)
+        expected_values = pd.read_csv('~{expected_values}', sep = '\t', header = 0)
+
+        # Check whether PC1 and PC2 are within the acceptable range
+        pc1_within_range = abs(scores.loc[0, "PC1"] - expected_values.loc[0, "PC1"]) <= ~{margin}
+        pc2_within_range = abs(scores.loc[0, "PC2"] - expected_values.loc[0, "PC2"]) <= ~{margin}
+
+        with open('~{output_basename}.pca_qc_passed.txt', 'w') as pca_qc_passed:
+            if pc1_within_range and pc2_within_range:
+                pca_qc_passed.write("true\n")
+            else:
+                pca_qc_passed.write("false\n")
+
+        EOF
+        python3 check_control_pcs_against_expected_values.py
+    >>>
+
+    runtime {
+        docker: docker
+        disks: "local-disk " + disk_size_gb + " HDD"
+        memory: mem_gb + " GiB"
+        cpu: cpu
+    }
+
+    output {
+        Boolean pca_qc_passed = read_boolean("~{output_basename}.pca_qc_passed.txt")
+    }
+}
+
+task DetectPCANovelties{
+    input {
+        File test_sample
+        File alphashape
+        Float distance_threshold = 0.01
+        Int cpu
+        Int mem_gb
+        Int disk_size_gb
+        String docker = "us.gcr.io/broad-dsde-methods/kockan/alphashape@sha256:96d5a34da2ff6da6e2cd0f85cca2fa2400c2d73e02e1957def111fbf04c2fdda"
+    }
+
+    String output_basename = basename(test_sample, ".tsv")
+
+    command <<<
+        set -euo pipefail
+
+        cat <<'EOF' > detect_pca_novelties.py
+        # Automatically flag novelties in 2D PCA plots using Concave Hulls generated via alphashapes
+        import sys
+        import pickle
+        import numpy as np
+        import pandas as pd
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as mpatches
+        from descartes import PolygonPatch
+        from shapely.geometry import Point
+        import alphashape
+
+        # Load alphashape
+        with open("~{alphashape}", "rb") as infile:
+            alpha_shape = pickle.load(infile)
+
+        # Set the distance threshold
+        dist_thresh = ~{distance_threshold}
+
+        # Read input data
+        df_test = pd.read_csv("~{test_sample}", sep = '\t', header = 0)
+        test_point = Point(df_test.loc[0, "PC1"], df_test.loc[0, "PC2"])
+
+        # Prepare output plot for nice visualization
+        fig, ax = plt.subplots()
+        ax.add_patch(PolygonPatch(alpha_shape, alpha = 0.2))
+
+        with open("~{output_basename}.pca_qc.txt", 'w') as outfile:
+            # Test and label the sample as a novelty or a regular observation
+            dist = test_point.distance(alpha_shape)
+            if alpha_shape.contains(test_point) or dist < dist_thresh:
+                outfile.write("true" + "\n")
+                plt.scatter(test_point.x, test_point.y, c = 'green', alpha = 1.0, s = 10)
+            else:
+                outfile.write("false" + "\n")
+                plt.scatter(test_point.x, test_point.y, c = 'red', alpha = 1.0, s = 10)
+
+        # Plotting for nice visualization
+        plt.title("Alphashape Automated Novelty Flagging: ~{output_basename}")
+        plt.xlabel("PC1")
+        plt.ylabel("PC2")
+        colors = ['green', 'red']
+        labels = ['Pass', 'Fail']
+        patches = [(plt.Line2D([], [], color = colors[i], label = "{:s}".format(labels[i]), marker = "o", linewidth = 0)) for i in range(len(labels))]
+        plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5), handles = patches)
+        plt.savefig("~{output_basename}.pca_qc.png", dpi = 300, bbox_inches = 'tight')
+
+        EOF
+        python3 detect_pca_novelties.py
+    >>>
+
+    output {
+        Boolean pca_qc_passed = read_boolean("~{output_basename}.pca_qc.txt")
+        File pca_qc_plot = "~{output_basename}.pca_qc.png"
+    }
+
+    runtime {
+        cpu: cpu
+        memory: "~{mem_gb} GiB"
+        disks: "local-disk ~{disk_size_gb} HDD"
+        docker: docker
+    }
+}