Skip to content

Commit

Permalink
GLIMPSE2 improvements (#203)
Browse files Browse the repository at this point in the history
* Glimpse improvements

- Updated to most recent GLIMPSE2 docker, which includes some bug fixes especially for Glimpse2SplitReference
- Added memory scaling factor for phasing
- Amended documentation
- Improved docker building bash script to automatically determine the docker tag
- Added default value for interval_list for merging
- Removed monitoring_script inputs and outputs
  • Loading branch information
michaelgatzen authored Feb 18, 2025
1 parent ebbf5b8 commit b283c77
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 61 deletions.
25 changes: 4 additions & 21 deletions GlimpseImputationPipeline/Glimpse2Imputation.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ workflow Glimpse2Imputation {
Int? effective_population_size

Int preemptible = 9
String docker = "us.gcr.io/broad-dsde-methods/glimpse:kachulis_ck_bam_reader_retry_cf5822c"
String docker = "us.gcr.io/broad-dsde-methods/glimpse:odelaneau_bd93ade"
String docker_extract_num_sites_from_reference_chunk = "us.gcr.io/broad-dsde-methods/glimpse_extract_num_sites_from_reference_chunks:michaelgatzen_edc7f3a"
Int cpu_ligate = 4
Int mem_gb_ligate = 4
Int? cpu_phase
Int? mem_gb_phase
File? monitoring_script
Float mem_scaling_factor_phase = 1.0
}

scatter (reference_chunk in read_lines(reference_chunks)) {
Expand Down Expand Up @@ -59,7 +59,7 @@ workflow Glimpse2Imputation {
n_samples = n_samples
}

if (SelectResourceParameters.memory_gb > 256 || SelectResourceParameters.request_n_cpus > 32) {
if (ceil(SelectResourceParameters.memory_gb * mem_scaling_factor_phase) > 256 || SelectResourceParameters.request_n_cpus > 32) {
# force failure if we're accidently going to request too much resources and spend too much money
Int safety_check_memory_gb = -1
Int safety_check_n_cpu = -1
Expand All @@ -84,8 +84,7 @@ workflow Glimpse2Imputation {
preemptible = preemptible,
docker = docker,
cpu = select_first([cpu_phase, safety_check_n_cpu, SelectResourceParameters.request_n_cpus]),
mem_gb = select_first([mem_gb_phase, safety_check_memory_gb, SelectResourceParameters.memory_gb]),
monitoring_script = monitoring_script
mem_gb = select_first([mem_gb_phase, safety_check_memory_gb, ceil(select_first([SelectResourceParameters.memory_gb, -1]) * mem_scaling_factor_phase)])
}
}

Expand All @@ -99,7 +98,6 @@ workflow Glimpse2Imputation {
docker = docker,
cpu = cpu_ligate,
mem_gb = mem_gb_ligate,
monitoring_script = monitoring_script
}

call CombineCoverageMetrics {
Expand All @@ -112,7 +110,6 @@ workflow Glimpse2Imputation {
input:
imputed_vcf = GlimpseLigate.imputed_vcf,
output_basename = output_basename,
monitoring_script = monitoring_script
}


Expand All @@ -123,9 +120,6 @@ workflow Glimpse2Imputation {

File qc_metrics = CollectQCMetrics.qc_metrics
File coverage_metrics = CombineCoverageMetrics.coverage_metrics

Array[File?] glimpse_phase_monitoring = GlimpsePhase.monitoring
File? glimpse_ligate_monitoring = GlimpseLigate.monitoring
}
}

Expand All @@ -152,7 +146,6 @@ task GlimpsePhase {
Int preemptible = 9
Int max_retries = 3
String docker
File? monitoring_script
}

parameter_meta {
Expand All @@ -169,7 +162,6 @@ task GlimpsePhase {
set -euo pipefail

export GCS_OAUTH_TOKEN=$(/root/google-cloud-sdk/bin/gcloud auth application-default print-access-token)
~{"bash " + monitoring_script + " > monitoring.log &"}

cram_paths=( ~{sep=" " crams} )
cram_index_paths=( ~{sep=" " cram_indices} )
Expand Down Expand Up @@ -236,7 +228,6 @@ task GlimpsePhase {
output {
File imputed_vcf = "phase_output.bcf"
File imputed_vcf_index = "phase_output.bcf.csi"
File? monitoring = "monitoring.log"
File coverage_metrics = "phase_output_stats_coverage.txt.gz"
}
}
Expand All @@ -254,14 +245,11 @@ task GlimpseLigate {
Int preemptible = 1
Int max_retries = 3
String docker
File? monitoring_script
}

command <<<
set -xeuo pipefail

~{"bash " + monitoring_script + " > monitoring.log &"}

NPROC=$(nproc)
echo "nproc reported ${NPROC} CPUs, using that number as the threads argument for GLIMPSE."

Expand Down Expand Up @@ -289,7 +277,6 @@ task GlimpseLigate {
File imputed_vcf = "~{output_basename}.imputed.vcf.gz"
File imputed_vcf_index = "~{output_basename}.imputed.vcf.gz.tbi"
File imputed_vcf_md5sum = "~{output_basename}.imputed.vcf.gz.md5sum"
File? monitoring = "monitoring.log"
}
}

Expand All @@ -302,7 +289,6 @@ task CollectQCMetrics {
String docker = "hailgenetics/hail:0.2.126-py3.11"
Int cpu = 4
Int mem_gb = 16
File? monitoring_script
}

parameter_meta {
Expand All @@ -316,8 +302,6 @@ task CollectQCMetrics {
command <<<
set -euo pipefail

~{"bash " + monitoring_script + " > monitoring.log &"}

cat <<'EOF' > script.py
import hail as hl
import pandas as pd
Expand Down Expand Up @@ -345,7 +329,6 @@ EOF

output {
File qc_metrics = "~{output_basename}.qc_metrics.tsv"
File? monitoring = "monitoring.log"
}
}

Expand Down
9 changes: 3 additions & 6 deletions GlimpseImputationPipeline/Glimpse2ImputationAndCheckQC.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ workflow Glimpse2ImputationAndCheckQC {
Int? preemptible
String? docker
String? docker_extract_num_sites_from_reference_chunk
Float? mem_scaling_factor_phase
Int? cpu_ligate
Int? mem_gb_ligate
File? monitoring_script

# For documentation, please refer to CheckQC.wdl
File qc_metrics_thresholds
Expand Down Expand Up @@ -60,9 +60,9 @@ workflow Glimpse2ImputationAndCheckQC {
preemptible = preemptible,
docker = docker,
docker_extract_num_sites_from_reference_chunk = docker_extract_num_sites_from_reference_chunk,
mem_scaling_factor_phase = mem_scaling_factor_phase,
cpu_ligate = cpu_ligate,
mem_gb_ligate = mem_gb_ligate,
monitoring_script = monitoring_script
mem_gb_ligate = mem_gb_ligate
}

call Glimpse2CheckQC.Glimpse2CheckQC {
Expand All @@ -84,9 +84,6 @@ workflow Glimpse2ImputationAndCheckQC {
File qc_metrics = Glimpse2Imputation.qc_metrics
File coverage_metrics = Glimpse2Imputation.coverage_metrics

Array[File?] glimpse_phase_monitoring = Glimpse2Imputation.glimpse_phase_monitoring
File? glimpse_ligate_monitoring = Glimpse2Imputation.glimpse_ligate_monitoring

Boolean qc_passed = Glimpse2CheckQC.qc_passed
File qc_failures = Glimpse2CheckQC.qc_failures
}
Expand Down
8 changes: 4 additions & 4 deletions GlimpseImputationPipeline/Glimpse2ImputationInBatches.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,16 @@ workflow Glimpse2ImputationInBatches {
Int? preemptible
String? docker
String? docker_extract_num_sites_from_reference_chunk
Float? mem_scaling_factor_phase
Int? cpu_ligate
Int? mem_gb_ligate
Int? mem_gb_merge
File? monitoring_script

String? docker_gatk
String? docker_count_samples
String? docker_merge

File interval_list_for_merge
File? interval_list_for_merge
Int? merge_scatter_count
}

Expand Down Expand Up @@ -68,9 +68,9 @@ workflow Glimpse2ImputationInBatches {
preemptible = preemptible,
docker = docker,
docker_extract_num_sites_from_reference_chunk = docker_extract_num_sites_from_reference_chunk,
mem_scaling_factor_phase = mem_scaling_factor_phase,
cpu_ligate = cpu_ligate,
mem_gb_ligate = mem_gb_ligate,
monitoring_script = monitoring_script
mem_gb_ligate = mem_gb_ligate
}
}

Expand Down
2 changes: 1 addition & 1 deletion GlimpseImputationPipeline/Glimpse2MergeBatches.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ workflow Glimpse2MergeBatches {
String docker_count_samples = "us.gcr.io/broad-dsde-methods/bcftools:v1.3"
String docker_merge = "us.gcr.io/broad-dsde-methods/samtools-suite:v1.1"

File interval_list
File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_metrics_intervals.interval_list"
Int scatter_count = 100

Int mem_gb_merge = 16
Expand Down
12 changes: 2 additions & 10 deletions GlimpseImputationPipeline/Glimpse2SplitReference.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ workflow Glimpse2SplitReference {
Boolean keep_monomorphic_ref_sites = true

Int preemptible = 1
String docker = "us.gcr.io/broad-dsde-methods/glimpse:odelaneau_e0b9b56"
File? monitoring_script
String docker = "us.gcr.io/broad-dsde-methods/glimpse:odelaneau_bd93ade"
}

scatter (i_contig in range(length(contig_regions))) {
Expand All @@ -54,15 +53,13 @@ workflow Glimpse2SplitReference {
uniform_number_variants = uniform_number_variants,
keep_monomorphic_ref_sites = keep_monomorphic_ref_sites,
preemptible = preemptible,
docker = docker,
monitoring_script = monitoring_script
docker = docker
}
}

output {
Array[File] chunks = GlimpseSplitReferenceTask.chunks
Array[File] reference_chunks = flatten(GlimpseSplitReferenceTask.split_reference_chunks)
Array[File?] split_reference_monitoring = GlimpseSplitReferenceTask.monitoring
}
}

Expand All @@ -84,7 +81,6 @@ task GlimpseSplitReferenceTask {
Int disk_size_gb = ceil(2.2 * size(reference_panel, "GiB") + size(genetic_map, "GiB") + 100)
Int preemptible = 1
String docker
File? monitoring_script
}

String reference_output_dir = "reference_output_dir"
Expand All @@ -94,8 +90,6 @@ task GlimpseSplitReferenceTask {
command <<<
set -xeuo pipefail

~{"bash " + monitoring_script + " > monitoring.log &"}

NPROC=$(nproc)
echo "nproc reported ${NPROC} CPUs, using that number as the threads argument for GLIMPSE."

Expand Down Expand Up @@ -145,7 +139,5 @@ task GlimpseSplitReferenceTask {
# have a built-in way to do that, we have to rely on the command section to do that. However, we don't have access to that bash
# variable in the output section, so we have to use glob here and return the first (and only) result.
File chunks = glob("chunks_contigindex_*.txt")[0]

File? monitoring = "monitoring.log"
}
}
13 changes: 7 additions & 6 deletions GlimpseImputationPipeline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,12 @@ contig_name_in_reference_panel = "chr1"
- **Int? min_window_cm**: Optional minimum window size in [Centimorgan](https://en.wikipedia.org/wiki/Centimorgan). See note on chunk sizes above.
- **Boolean uniform_number_variants = false**: When set to true, each chunk will have approximately the same number of sites while. Each chunk will cover a different (Centimorgan) genetic linkage region with the smallest chunk still being larger than `min_window_cm`.
- **Int preemtible = 1**: Number of preemptible attempts.
- **File? monitoring_script**: Optional path to monitoring script. If ommitted, no monitoring will occur and the `split_reference_monitoring** output will not be available.

### Output
- **Array[File] chunks**: One `chunks_contigindex_{CONTIGINDEX}.txt` file per region as defined in the `contig_regions` input that each include the definitions of the chunks in that region.
- **Array[File] split_reference_chunks**: All binary representations of the reference panel for each chunk in each contig region. Each file is named `reference_panel_contigindex_${CONTIGINDEX}_chunkindex_${CHUNKINDEX}` with `CONTIGINDEX` and `CHUNKINDEX` being 4-digit zero-based indices with leading zeros in order to simplify the correct ordering of intervals.
- **Array[String] num_sites**: The number of sites in each chunk.
- **Array[String] num_sites_uniform**: The number of sites in each chunk when using `uniform_number_variants`, otherwise empty.
- **File? monitoring**: A monitoring log if the `monitoring_script` input is set, otherwise null.

## Glimpse2Imputation

Expand All @@ -64,7 +62,9 @@ The input to this workflow can bei either single-sample or multi-sample VCFs wit

This implementation uses Cromwell's [checkpoint feature](https://cromwell.readthedocs.io/en/stable/optimizations/CheckpointFiles/) to drastically reduce cost by being able to run entirely on preemptible machines and preserving the computation progress made leading up to a preemption event. It is therefore recommended to allow for a relatively high number of preemptible attempts (the default value for the `preemptible` input is 9).

**Note:** _The checkpointing feature, as well as a separate binary required to extract the number of sites in a given reference chunk for resource selection, is not available in the official GLIMPSE repo yet. Therefore, the default value for the `docker` input is set to `us.gcr.io/broad-dsde-methods/ckachulis/glimpse_for_wdl_pipeline:checkpointing_and_extract_num_sites`._
The memory and CPU resource requirements for the computationally intensive phasing task are automatically determined, based on the number of sites in the reference panel chunks and the number of samples, unless these resource requirements are explicitly defined by setting `cpu_phase` and `mem_gb_phase`. The extraction of the number of sites in the reference panel chunks requires functionality not implemented directly into GLIMPSE2 and is provided in the `docker_extract_num_sites_from_reference_chunk` argument. See the [Docker section](#docker-image-for-extracting-number-of-sites-from-reference-panel) of this README for more information. The memory resource requirement can be scaled linearly using the `mem_scaling_factor_phase` argument.

**Note:** _The automatic resource requirement determination provides an estimate of the required resources, it is expected that in some cases the amount of required memory is underestimated. **It is therefore advised to run this workflow with Cromwell's [retry with more memory](https://cromwell.readthedocs.io/en/stable/cromwell_features/RetryWithMoreMemory/) feature (also available in [Terra](https://support.terra.bio/hc/en-us/articles/4403215299355-Out-of-Memory-Retry)).**_

**Note**: _GLIMPSE2 does not support the input of multiple CRAM files with the same basename when streaming (e.g. `gs://a/file.cram`, `gs://b/file.cram`), due to the way that htslib is implemented. This workflow will check for a potential filename collision and will fail with an error message if such a collision occurs._

Expand All @@ -87,16 +87,15 @@ This implementation uses Cromwell's [checkpoint feature](https://cromwell.readth
- **Int? effective_population_size**: See [GLIMPSE2 documentation](https://odelaneau.github.io/GLIMPSE/docs/documentation/phase/#model-parameters).
- **String docker**: Docker image to run imputation with. This docker image requires a few features that the original GLIMPSE2 does not include, see the [Docker section](#Docker-Images) of this README for more information.
- **String docker_extract_num_sites_from_reference_chunk**: Docker image to extract the number of common and rare sites from each reference chunk. See the [Docker section](#docker-image-for-extracting-number-of-sites-from-reference-panel) of this README for more information.
- **String mem_scaling_factor_phase**: Linear scaling of the automatically determined memory requirement. For running single samples with reference panel chunk sizes of 48 cM, a value of 1.5 is recommended.
- **Int preemtible = 9**: Number of preemptible attempts.
- **File? monitoring_script**: Optional path to monitoring script. If ommitted, no monitoring will occur and the `split_reference_monitoring** output will not be available.

### Output

- **File imputed_vcf**: Single imputed VCF that covers all regions defined in the `contig_regions` input in GlimpseSplitReference. The name of the file is the basename of `input_vcf` with `.imputed.vcf.gz` added.
- **File imputed_vcf_index**: Index to `imputed_vcf`.
- **File? qc_metrics**: Output of Hail's [`sample_qc`](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.sample_qc) method as a TSV table if `collect_qc_metrics` is set, otherwise null.
- **Array[File?] glimpse_phase_monitoring**: A monitoring log for each parallelized chunk if the `monitoring_script` input is set, otherwise null.
- **File? glimpse_ligate_monitoring**: A monitoring log for the ligate task if the `monitoring_script` input is set, otherwise null.
- **File? coverage_metrics**: A TSV file containing metrics about the input CRAMs as a way for checking that sufficient coverage has been provided. This output is null if VCF input is provided instead of CRAM input.

## Glimpse2MergeBatches

Expand All @@ -105,6 +104,8 @@ This workflow merges multiple batches of imputed multi-sample VCFs into one and
### Input
- **Array[File] imputed_vcf**: GLIMPSE2 output imputed VCFs.
- **Array[File] imputed_vcf_indices**: Indices to `imputed_vcf`.
- **Int? scatter_count**: Merging large batches of samples (approx. more than 100) requires too much resources to be done in one job. This parameter defines the number of chunks that the genome (as defined by `interval_list`) should be divided into. The default value of 100 is appropriate for merging multiple batches of 1,000 samples each.
- **File? interval_list**: This file should contain all chromosomes that imputation has been performed on. The default value contains the hg38 contigs chr1-chr22, chrX, chrY, and chrM from start until end. Adding unused contigs only affects the efficiency for the merging process.
- **Array[File?]? qc_metrics**: Optional array of qc_metrics of the individual batches to be merged into one `merged_qc_metrics` TSV file. Can otherwise be _null_ or can only contain _null_ values.
- **String output_basename**: Basename for merged output VCF.

Expand Down
Loading

0 comments on commit b283c77

Please sign in to comment.