GLIMPSE2 improvements (#203)

* Glimpse improvements - Updated to most recent GLIMPSE2 docker, which includes some bug fixes especially for Glimpse2SplitReference - Added memory scaling factor for phasing - Amended documentation - Improved docker building bash script to automatically determine the docker tag - Added default value for interval_list for merging - Removed monitoring_script inputs and outputs
broadinstitute · Feb 18, 2025 · b283c77 · b283c77
1 parent ebbf5b8
commit b283c77
Show file tree

Hide file tree

Showing 8 changed files with 49 additions and 61 deletions.
diff --git a/GlimpseImputationPipeline/Glimpse2Imputation.wdl b/GlimpseImputationPipeline/Glimpse2Imputation.wdl
@@ -23,13 +23,13 @@ workflow Glimpse2Imputation {
         Int? effective_population_size
 
         Int preemptible = 9
-        String docker = "us.gcr.io/broad-dsde-methods/glimpse:kachulis_ck_bam_reader_retry_cf5822c"
+        String docker = "us.gcr.io/broad-dsde-methods/glimpse:odelaneau_bd93ade"
         String docker_extract_num_sites_from_reference_chunk = "us.gcr.io/broad-dsde-methods/glimpse_extract_num_sites_from_reference_chunks:michaelgatzen_edc7f3a"
         Int cpu_ligate = 4
         Int mem_gb_ligate = 4
         Int? cpu_phase
         Int? mem_gb_phase
-        File? monitoring_script
+        Float mem_scaling_factor_phase = 1.0
     }
 
     scatter (reference_chunk in read_lines(reference_chunks)) {
@@ -59,7 +59,7 @@ workflow Glimpse2Imputation {
                     n_samples = n_samples
             }
 
-            if (SelectResourceParameters.memory_gb > 256 || SelectResourceParameters.request_n_cpus > 32) {
+            if (ceil(SelectResourceParameters.memory_gb * mem_scaling_factor_phase) > 256 || SelectResourceParameters.request_n_cpus > 32) {
                 # force failure if we're accidently going to request too much resources and spend too much money
                 Int safety_check_memory_gb = -1
                 Int safety_check_n_cpu = -1
@@ -84,8 +84,7 @@ workflow Glimpse2Imputation {
                 preemptible = preemptible,
                 docker = docker,
                 cpu = select_first([cpu_phase, safety_check_n_cpu, SelectResourceParameters.request_n_cpus]),
-                mem_gb = select_first([mem_gb_phase, safety_check_memory_gb, SelectResourceParameters.memory_gb]),
-                monitoring_script = monitoring_script
+                mem_gb = select_first([mem_gb_phase, safety_check_memory_gb, ceil(select_first([SelectResourceParameters.memory_gb, -1]) * mem_scaling_factor_phase)])
         }
     }
 
@@ -99,7 +98,6 @@ workflow Glimpse2Imputation {
             docker = docker,
             cpu = cpu_ligate,
             mem_gb = mem_gb_ligate,
-            monitoring_script = monitoring_script
     }
 
     call CombineCoverageMetrics {
@@ -112,7 +110,6 @@ workflow Glimpse2Imputation {
         input:
             imputed_vcf = GlimpseLigate.imputed_vcf,
             output_basename = output_basename,
-            monitoring_script = monitoring_script
     }
 
 
@@ -123,9 +120,6 @@ workflow Glimpse2Imputation {
 
         File qc_metrics = CollectQCMetrics.qc_metrics
         File coverage_metrics = CombineCoverageMetrics.coverage_metrics
-
-        Array[File?] glimpse_phase_monitoring = GlimpsePhase.monitoring
-        File? glimpse_ligate_monitoring = GlimpseLigate.monitoring
     }
 }
 
@@ -152,7 +146,6 @@ task GlimpsePhase {
         Int preemptible = 9
         Int max_retries = 3
         String docker
-        File? monitoring_script
     }
 
     parameter_meta {
@@ -169,7 +162,6 @@ task GlimpsePhase {
         set -euo pipefail
 
         export GCS_OAUTH_TOKEN=$(/root/google-cloud-sdk/bin/gcloud auth application-default print-access-token)
-        ~{"bash " + monitoring_script + " > monitoring.log &"}
 
         cram_paths=( ~{sep=" " crams} )
         cram_index_paths=( ~{sep=" " cram_indices} )
@@ -236,7 +228,6 @@ task GlimpsePhase {
     output {
         File imputed_vcf = "phase_output.bcf"
         File imputed_vcf_index = "phase_output.bcf.csi"
-        File? monitoring = "monitoring.log"
         File coverage_metrics = "phase_output_stats_coverage.txt.gz"
     }
 }
@@ -254,14 +245,11 @@ task GlimpseLigate {
         Int preemptible = 1
         Int max_retries = 3
         String docker
-        File? monitoring_script
     }
 
     command <<<
         set -xeuo pipefail
 
-        ~{"bash " + monitoring_script + " > monitoring.log &"}
-
         NPROC=$(nproc)
         echo "nproc reported ${NPROC} CPUs, using that number as the threads argument for GLIMPSE."
 
@@ -289,7 +277,6 @@ task GlimpseLigate {
         File imputed_vcf = "~{output_basename}.imputed.vcf.gz"
         File imputed_vcf_index = "~{output_basename}.imputed.vcf.gz.tbi"
         File imputed_vcf_md5sum = "~{output_basename}.imputed.vcf.gz.md5sum"
-        File? monitoring = "monitoring.log"
     }
 }
 
@@ -302,7 +289,6 @@ task CollectQCMetrics {
         String docker = "hailgenetics/hail:0.2.126-py3.11"
         Int cpu = 4
         Int mem_gb = 16
-        File? monitoring_script
     }
 
     parameter_meta {
@@ -316,8 +302,6 @@ task CollectQCMetrics {
     command <<<
         set -euo pipefail
 
-        ~{"bash " + monitoring_script + " > monitoring.log &"}
-
         cat <<'EOF' > script.py
 import hail as hl
 import pandas as pd
@@ -345,7 +329,6 @@ EOF
 
     output {
         File qc_metrics = "~{output_basename}.qc_metrics.tsv"
-        File? monitoring = "monitoring.log"
     }
 }
 

diff --git a/GlimpseImputationPipeline/Glimpse2ImputationAndCheckQC.wdl b/GlimpseImputationPipeline/Glimpse2ImputationAndCheckQC.wdl
@@ -28,9 +28,9 @@ workflow Glimpse2ImputationAndCheckQC {
         Int? preemptible
         String? docker
         String? docker_extract_num_sites_from_reference_chunk
+        Float? mem_scaling_factor_phase
         Int? cpu_ligate
         Int? mem_gb_ligate
-        File? monitoring_script
 
         # For documentation, please refer to CheckQC.wdl
         File qc_metrics_thresholds
@@ -60,9 +60,9 @@ workflow Glimpse2ImputationAndCheckQC {
             preemptible = preemptible,
             docker = docker,
             docker_extract_num_sites_from_reference_chunk = docker_extract_num_sites_from_reference_chunk,
+            mem_scaling_factor_phase = mem_scaling_factor_phase,
             cpu_ligate = cpu_ligate,
-            mem_gb_ligate = mem_gb_ligate,
-            monitoring_script = monitoring_script
+            mem_gb_ligate = mem_gb_ligate
     }
 
     call Glimpse2CheckQC.Glimpse2CheckQC {
@@ -84,9 +84,6 @@ workflow Glimpse2ImputationAndCheckQC {
         File qc_metrics = Glimpse2Imputation.qc_metrics
         File coverage_metrics = Glimpse2Imputation.coverage_metrics
 
-        Array[File?] glimpse_phase_monitoring = Glimpse2Imputation.glimpse_phase_monitoring
-        File? glimpse_ligate_monitoring = Glimpse2Imputation.glimpse_ligate_monitoring
-
         Boolean qc_passed = Glimpse2CheckQC.qc_passed
         File qc_failures = Glimpse2CheckQC.qc_failures
     }

diff --git a/GlimpseImputationPipeline/Glimpse2ImputationInBatches.wdl b/GlimpseImputationPipeline/Glimpse2ImputationInBatches.wdl
@@ -28,16 +28,16 @@ workflow Glimpse2ImputationInBatches {
         Int? preemptible
         String? docker
         String? docker_extract_num_sites_from_reference_chunk
+        Float? mem_scaling_factor_phase
         Int? cpu_ligate
         Int? mem_gb_ligate
         Int? mem_gb_merge
-        File? monitoring_script
 
         String? docker_gatk
         String? docker_count_samples
         String? docker_merge
 
-        File interval_list_for_merge
+        File? interval_list_for_merge
         Int? merge_scatter_count
     }
 
@@ -68,9 +68,9 @@ workflow Glimpse2ImputationInBatches {
                 preemptible = preemptible,
                 docker = docker,
                 docker_extract_num_sites_from_reference_chunk = docker_extract_num_sites_from_reference_chunk,
+                mem_scaling_factor_phase = mem_scaling_factor_phase,
                 cpu_ligate = cpu_ligate,
-                mem_gb_ligate = mem_gb_ligate,
-                monitoring_script = monitoring_script
+                mem_gb_ligate = mem_gb_ligate
         }
     }
 

diff --git a/GlimpseImputationPipeline/Glimpse2MergeBatches.wdl b/GlimpseImputationPipeline/Glimpse2MergeBatches.wdl
@@ -13,7 +13,7 @@ workflow Glimpse2MergeBatches {
         String docker_count_samples = "us.gcr.io/broad-dsde-methods/bcftools:v1.3"
         String docker_merge = "us.gcr.io/broad-dsde-methods/samtools-suite:v1.1"
 
-        File interval_list
+        File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_metrics_intervals.interval_list"
         Int scatter_count = 100
 
         Int mem_gb_merge = 16

diff --git a/GlimpseImputationPipeline/Glimpse2SplitReference.wdl b/GlimpseImputationPipeline/Glimpse2SplitReference.wdl
@@ -30,8 +30,7 @@ workflow Glimpse2SplitReference {
         Boolean keep_monomorphic_ref_sites = true
 
         Int preemptible = 1
-        String docker = "us.gcr.io/broad-dsde-methods/glimpse:odelaneau_e0b9b56"
-        File? monitoring_script
+        String docker = "us.gcr.io/broad-dsde-methods/glimpse:odelaneau_bd93ade"
     }
 
     scatter (i_contig in range(length(contig_regions))) {
@@ -54,15 +53,13 @@ workflow Glimpse2SplitReference {
                 uniform_number_variants = uniform_number_variants,
                 keep_monomorphic_ref_sites = keep_monomorphic_ref_sites,
                 preemptible = preemptible,
-                docker = docker,
-                monitoring_script = monitoring_script
+                docker = docker
         }
     }
 
     output {
         Array[File] chunks = GlimpseSplitReferenceTask.chunks
         Array[File] reference_chunks = flatten(GlimpseSplitReferenceTask.split_reference_chunks)
-        Array[File?] split_reference_monitoring = GlimpseSplitReferenceTask.monitoring
     }
 }
 
@@ -84,7 +81,6 @@ task GlimpseSplitReferenceTask {
         Int disk_size_gb = ceil(2.2 * size(reference_panel, "GiB") + size(genetic_map, "GiB") + 100)
         Int preemptible = 1
         String docker
-        File? monitoring_script
     }
 
     String reference_output_dir = "reference_output_dir"
@@ -94,8 +90,6 @@ task GlimpseSplitReferenceTask {
     command <<<
         set -xeuo pipefail
 
-        ~{"bash " + monitoring_script + " > monitoring.log &"}
-
         NPROC=$(nproc)
         echo "nproc reported ${NPROC} CPUs, using that number as the threads argument for GLIMPSE."
 
@@ -145,7 +139,5 @@ task GlimpseSplitReferenceTask {
         # have a built-in way to do that, we have to rely on the command section to do that. However, we don't have access to that bash
         # variable in the output section, so we have to use glob here and return the first (and only) result.
         File chunks = glob("chunks_contigindex_*.txt")[0]
-
-        File? monitoring = "monitoring.log"
     }
 }
diff --git a/GlimpseImputationPipeline/README.md b/GlimpseImputationPipeline/README.md
@@ -47,14 +47,12 @@ contig_name_in_reference_panel = "chr1"
 - **Int? min_window_cm**: Optional minimum window size in [Centimorgan](https://en.wikipedia.org/wiki/Centimorgan). See note on chunk sizes above.
 - **Boolean uniform_number_variants = false**: When set to true, each chunk will have approximately the same number of sites while. Each chunk will cover a different (Centimorgan) genetic linkage region with the smallest chunk still being larger than `min_window_cm`.
 - **Int preemtible = 1**: Number of preemptible attempts.
-- **File? monitoring_script**: Optional path to monitoring script. If ommitted, no monitoring will occur and the `split_reference_monitoring** output will not be available.
 
 ### Output
 - **Array[File] chunks**: One `chunks_contigindex_{CONTIGINDEX}.txt` file per region as defined in the `contig_regions` input that each include the definitions of the chunks in that region.
 - **Array[File] split_reference_chunks**: All binary representations of the reference panel for each chunk in each contig region. Each file is named `reference_panel_contigindex_${CONTIGINDEX}_chunkindex_${CHUNKINDEX}` with `CONTIGINDEX` and `CHUNKINDEX` being 4-digit zero-based indices with leading zeros in order to simplify the correct ordering of intervals.
 - **Array[String] num_sites**: The number of sites in each chunk.
 - **Array[String] num_sites_uniform**: The number of sites in each chunk when using `uniform_number_variants`, otherwise empty.
-- **File? monitoring**: A monitoring log if the `monitoring_script` input is set, otherwise null.
 
 ## Glimpse2Imputation
 
@@ -64,7 +62,9 @@ The input to this workflow can bei either single-sample or multi-sample VCFs wit
 
 This implementation uses Cromwell's [checkpoint feature](https://cromwell.readthedocs.io/en/stable/optimizations/CheckpointFiles/) to drastically reduce cost by being able to run entirely on preemptible machines and preserving the computation progress made leading up to a preemption event. It is therefore recommended to allow for a relatively high number of preemptible attempts (the default value for the `preemptible` input is 9).
 
-**Note:** _The checkpointing feature, as well as a separate binary required to extract the number of sites in a given reference chunk for resource selection, is not available in the official GLIMPSE repo yet. Therefore, the default value for the `docker` input is set to `us.gcr.io/broad-dsde-methods/ckachulis/glimpse_for_wdl_pipeline:checkpointing_and_extract_num_sites`._
+The memory and CPU resource requirements for the computationally intensive phasing task are automatically determined, based on the number of sites in the reference panel chunks and the number of samples, unless these resource requirements are explicitly defined by setting `cpu_phase` and `mem_gb_phase`. The extraction of the number of sites in the reference panel chunks requires functionality not implemented directly into GLIMPSE2 and is provided in the `docker_extract_num_sites_from_reference_chunk` argument. See the [Docker section](#docker-image-for-extracting-number-of-sites-from-reference-panel) of this README for more information. The memory resource requirement can be scaled linearly using the `mem_scaling_factor_phase` argument.
+
+**Note:** _The automatic resource requirement determination provides an estimate of the required resources, it is expected that in some cases the amount of required memory is underestimated. **It is therefore advised to run this workflow with Cromwell's [retry with more memory](https://cromwell.readthedocs.io/en/stable/cromwell_features/RetryWithMoreMemory/) feature (also available in [Terra](https://support.terra.bio/hc/en-us/articles/4403215299355-Out-of-Memory-Retry)).**_
 
 **Note**: _GLIMPSE2 does not support the input of multiple CRAM files with the same basename when streaming (e.g. `gs://a/file.cram`, `gs://b/file.cram`), due to the way that htslib is implemented. This workflow will check for a potential filename collision and will fail with an error message if such a collision occurs._
 
@@ -87,16 +87,15 @@ This implementation uses Cromwell's [checkpoint feature](https://cromwell.readth
 - **Int? effective_population_size**: See [GLIMPSE2 documentation](https://odelaneau.github.io/GLIMPSE/docs/documentation/phase/#model-parameters).
 - **String docker**: Docker image to run imputation with. This docker image requires a few features that the original GLIMPSE2 does not include, see the [Docker section](#Docker-Images) of this README for more information.
 - **String docker_extract_num_sites_from_reference_chunk**: Docker image to extract the number of common and rare sites from each reference chunk. See the [Docker section](#docker-image-for-extracting-number-of-sites-from-reference-panel) of this README for more information.
+- **String mem_scaling_factor_phase**: Linear scaling of the automatically determined memory requirement. For running single samples with reference panel chunk sizes of 48 cM, a value of 1.5 is recommended.
 - **Int preemtible = 9**: Number of preemptible attempts.
-- **File? monitoring_script**: Optional path to monitoring script. If ommitted, no monitoring will occur and the `split_reference_monitoring** output will not be available.
 
 ### Output
 
 - **File imputed_vcf**: Single imputed VCF that covers all regions defined in the `contig_regions` input in GlimpseSplitReference. The name of the file is the basename of `input_vcf` with `.imputed.vcf.gz` added.
 - **File imputed_vcf_index**: Index to `imputed_vcf`.
 - **File? qc_metrics**: Output of Hail's [`sample_qc`](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.sample_qc) method as a TSV table if `collect_qc_metrics` is set, otherwise null.
-- **Array[File?] glimpse_phase_monitoring**: A monitoring log for each parallelized chunk if the `monitoring_script` input is set, otherwise null.
-- **File? glimpse_ligate_monitoring**: A monitoring log for the ligate task if the `monitoring_script` input is set, otherwise null.
+- **File? coverage_metrics**: A TSV file containing metrics about the input CRAMs as a way for checking that sufficient coverage has been provided. This output is null if VCF input is provided instead of CRAM input.
 
 ## Glimpse2MergeBatches
 
@@ -105,6 +104,8 @@ This workflow merges multiple batches of imputed multi-sample VCFs into one and
 ### Input
 - **Array[File] imputed_vcf**: GLIMPSE2 output imputed VCFs.
 - **Array[File] imputed_vcf_indices**: Indices to `imputed_vcf`.
+- **Int? scatter_count**: Merging large batches of samples (approx. more than 100) requires too much resources to be done in one job. This parameter defines the number of chunks that the genome (as defined by `interval_list`) should be divided into. The default value of 100 is appropriate for merging multiple batches of 1,000 samples each.
+- **File? interval_list**: This file should contain all chromosomes that imputation has been performed on. The default value contains the hg38 contigs chr1-chr22, chrX, chrY, and chrM from start until end. Adding unused contigs only affects the efficiency for the merging process.
 - **Array[File?]? qc_metrics**: Optional array of qc_metrics of the individual batches to be merged into one `merged_qc_metrics` TSV file. Can otherwise be _null_ or can only contain _null_ values.
 - **String output_basename**: Basename for merged output VCF.