Skip to content

Commit

Permalink
PRSQC WDL First Iteration (#177)
Browse files Browse the repository at this point in the history
PRSQC WDL v0.1
  • Loading branch information
kockan authored Jun 11, 2024
1 parent f211198 commit 1e32078
Show file tree
Hide file tree
Showing 2 changed files with 264 additions and 1 deletion.
5 changes: 4 additions & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -137,4 +137,7 @@ workflows:
primaryDescriptorPath: /Utilities/WDLs/DetectPCANovelties.wdl
- name: FindSamplesAndSimpleBenchmark
subclass: WDL
primaryDescriptorPath: /BenchmarkVCFs/FindSamplesAndSimpleBenchmark.wdl
primaryDescriptorPath: /BenchmarkVCFs/FindSamplesAndSimpleBenchmark.wdl
- name: PRSQC
subclass: WDL
primaryDescriptorPath: /Utilities/WDLs/PRSQC.wdl
260 changes: 260 additions & 0 deletions Utilities/WDLs/PRSQC.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
version 1.0

# Simple PRS QC wdl for the PROGRESS VA project
workflow PRSQC {
input {
String sample_name
File prs_control
File prs_sample
File control_acceptable_range
File sample_acceptable_range
File control_expected_pcs
Float margin_control_pcs
# This alphashape is essentially a bounding polygon containing the set of points in the form (PC1, PC2)
# from a training set, such as 1kG. It was generated using the library: https://github.com/bellockk/alphashape
# We have a script that allows generating alphashapes with an arbitrary training set (of valid format),
# which can be found here: https://github.com/broadinstitute/palantir-workflows/blob/main/Utilities/Dockers/Alphashape/generate_alphashape.py
# The main (and pretty much only one besides training data) parameter of interest is "alpha", which determines how tight
# the bounding polygon is. Our default is set at 8.0, which was determined to be a good value experimentally.
# Users who intend to use a training set different than the default (1kG) should be aware of this.
File alphashape
Boolean manually_passed_control = false
}

Int cpu = 1
Int mem_gb = 4
Int disk_size_gb = 32
String docker = "us.gcr.io/broad-dsde-methods/python-data-slim:1.0"

if(!manually_passed_control) {
call CheckScores as CheckControlScoreAgainstExpectedValue {
input:
scores = prs_control,
acceptable_range = control_acceptable_range,
cpu = cpu,
mem_gb = mem_gb,
disk_size_gb = disk_size_gb,
docker = docker
}

call CheckControlPCsAgainstExpectedValues {
input:
scores = prs_control,
expected_values = control_expected_pcs,
margin = margin_control_pcs,
cpu = cpu,
mem_gb = mem_gb,
disk_size_gb = disk_size_gb,
docker = docker
}
}

call CheckScores as CheckScoreWithinReportableRange {
input:
scores = prs_sample,
acceptable_range = sample_acceptable_range,
cpu = cpu,
mem_gb = mem_gb,
disk_size_gb = disk_size_gb,
docker = docker
}

call DetectPCANovelties as DetectPCANoveltiesSample {
input:
test_sample = prs_sample,
alphashape = alphashape,
cpu = cpu,
mem_gb = mem_gb,
disk_size_gb = disk_size_gb
}

output {
Boolean qc_passed = CheckScoreWithinReportableRange.qc_passed && DetectPCANoveltiesSample.pca_qc_passed && select_first([CheckControlScoreAgainstExpectedValue.qc_passed, true]) && select_first([CheckControlPCsAgainstExpectedValues.pca_qc_passed, true])
File? qc_failures_control = CheckControlScoreAgainstExpectedValue.qc_failures
File qc_failures_sample = CheckScoreWithinReportableRange.qc_failures
}
}

task CheckScores {
input {
File scores
File acceptable_range

Int cpu
Int mem_gb
Int disk_size_gb
String docker
}

String output_basename = basename(scores, ".tsv")

command <<<
set -euo pipefail

cat <<'EOF' > check_scores_against_expected_values.py
import pandas as pd
scores = pd.read_csv('~{scores}', sep = '\t', header = 0)
acceptable_range = pd.read_csv('~{acceptable_range}', sep = '\t', header = 0)
# Check whether PRS_SCORE and FULL_MODEL_SCORE are within the acceptable range
prs_score_passed = scores.loc[0, "PRS_SCORE"] >= acceptable_range.loc[0, "MIN"] and scores.loc[0, "PRS_SCORE"] <= acceptable_range.loc[0, "MAX"]
full_model_score_passed = scores.loc[0, "FULL_MODEL_SCORE"] >= acceptable_range.loc[1, "MIN"] and scores.loc[0, "FULL_MODEL_SCORE"] <= acceptable_range.loc[1, "MAX"]
with open('~{output_basename}.qc_passed.txt', 'w') as qc_passed:
if prs_score_passed and full_model_score_passed:
qc_passed.write("true\n")
else:
qc_passed.write("false\n")
with open('~{output_basename}.qc_failures.txt', 'w') as qc_failures:
if not prs_score_passed:
qc_failures.write("PRS_SCORE " + str(scores.loc[0, "PRS_SCORE"]) + " outside acceptable range with MIN: " + str(acceptable_range.loc[0, "MIN"]) + " , MAX: " + str(acceptable_range.loc[0, "MAX"]) + "\n")
if not full_model_score_passed:
qc_failures.write("FULL_MODEL_SCORE " + str(scores.loc[0, "FULL_MODEL_SCORE"]) + " outside acceptable range with MIN: " + str(acceptable_range.loc[1, "MIN"]) + " , MAX: " + str(acceptable_range.loc[1, "MAX"]) + "\n")
EOF
python3 check_scores_against_expected_values.py
>>>
runtime {
docker: docker
disks: "local-disk " + disk_size_gb + " HDD"
memory: mem_gb + " GiB"
cpu: cpu
}
output {
Boolean qc_passed = read_boolean("~{output_basename}.qc_passed.txt")
File qc_failures = "~{output_basename}.qc_failures.txt"
}
}
task CheckControlPCsAgainstExpectedValues {
input {
File scores
File expected_values
Float margin
Int cpu
Int mem_gb
Int disk_size_gb
String docker
}
String output_basename = basename(scores, ".tsv")
command <<<
set -euo pipefail
cat <<'EOF' > check_control_pcs_against_expected_values.py
import pandas as pd
scores = pd.read_csv('~{scores}', sep = '\t', header = 0)
expected_values = pd.read_csv('~{expected_values}', sep = '\t', header = 0)
# Check whether PC1 and PC2 are within the acceptable range
pc1_within_range = abs(scores.loc[0, "PC1"] - expected_values.loc[0, "PC1"]) <= ~{margin}
pc2_within_range = abs(scores.loc[0, "PC2"] - expected_values.loc[0, "PC2"]) <= ~{margin}
with open('~{output_basename}.pca_qc_passed.txt', 'w') as pca_qc_passed:
if pc1_within_range and pc2_within_range:
pca_qc_passed.write("true\n")
else:
pca_qc_passed.write("false\n")
EOF
python3 check_control_pcs_against_expected_values.py
>>>
runtime {
docker: docker
disks: "local-disk " + disk_size_gb + " HDD"
memory: mem_gb + " GiB"
cpu: cpu
}
output {
Boolean pca_qc_passed = read_boolean("~{output_basename}.pca_qc_passed.txt")
}
}
task DetectPCANovelties{
input {
File test_sample
File alphashape
Float distance_threshold = 0.01
Int cpu
Int mem_gb
Int disk_size_gb
String docker = "us.gcr.io/broad-dsde-methods/kockan/alphashape@sha256:96d5a34da2ff6da6e2cd0f85cca2fa2400c2d73e02e1957def111fbf04c2fdda"
}
String output_basename = basename(test_sample, ".tsv")
command <<<
set -euo pipefail
cat <<'EOF' > detect_pca_novelties.py
# Automatically flag novelties in 2D PCA plots using Concave Hulls generated via alphashapes
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from descartes import PolygonPatch
from shapely.geometry import Point
import alphashape
# Load alphashape
with open("~{alphashape}", "rb") as infile:
alpha_shape = pickle.load(infile)
# Set the distance threshold
dist_thresh = ~{distance_threshold}
# Read input data
df_test = pd.read_csv("~{test_sample}", sep = '\t', header = 0)
test_point = Point(df_test.loc[0, "PC1"], df_test.loc[0, "PC2"])
# Prepare output plot for nice visualization
fig, ax = plt.subplots()
ax.add_patch(PolygonPatch(alpha_shape, alpha = 0.2))
with open("~{output_basename}.pca_qc.txt", 'w') as outfile:
# Test and label the sample as a novelty or a regular observation
dist = test_point.distance(alpha_shape)
if alpha_shape.contains(test_point) or dist < dist_thresh:
outfile.write("true" + "\n")
plt.scatter(test_point.x, test_point.y, c = 'green', alpha = 1.0, s = 10)
else:
outfile.write("false" + "\n")
plt.scatter(test_point.x, test_point.y, c = 'red', alpha = 1.0, s = 10)
# Plotting for nice visualization
plt.title("Alphashape Automated Novelty Flagging: ~{output_basename}")
plt.xlabel("PC1")
plt.ylabel("PC2")
colors = ['green', 'red']
labels = ['Pass', 'Fail']
patches = [(plt.Line2D([], [], color = colors[i], label = "{:s}".format(labels[i]), marker = "o", linewidth = 0)) for i in range(len(labels))]
plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5), handles = patches)
plt.savefig("~{output_basename}.pca_qc.png", dpi = 300, bbox_inches = 'tight')
EOF
python3 detect_pca_novelties.py
>>>
output {
Boolean pca_qc_passed = read_boolean("~{output_basename}.pca_qc.txt")
File pca_qc_plot = "~{output_basename}.pca_qc.png"
}
runtime {
cpu: cpu
memory: "~{mem_gb} GiB"
disks: "local-disk ~{disk_size_gb} HDD"
docker: docker
}
}

0 comments on commit 1e32078

Please sign in to comment.