From 84e09bae1a4e101206566ee81a3c3628178564f3 Mon Sep 17 00:00:00 2001
From: Leah Kemp
Date: Thu, 7 Nov 2024 14:01:39 +1300
Subject: [PATCH] Release v0.1.2 (#13)

---
 README.md                       | 116 ++++----
 config/nextflow_pipeface.config |  39 ++-
 config/parameters_pipeface.json |   5 +-
 docs/database_versions.txt      |   9 +
 docs/run_on_nci.md              |  39 ++-
 docs/software_versions.txt      |  11 +
 pipeface.nf                     | 474 ++++++++++++++++----------------
 7 files changed, 392 insertions(+), 301 deletions(-)
 create mode 100644 docs/database_versions.txt
 create mode 100644 docs/software_versions.txt

diff --git a/README.md b/README.md
index f24c855..be58d7c 100644
--- a/README.md
+++ b/README.md
@@ -4,31 +4,33 @@
 
 Pipefaceee.
 
-Nextflow pipeline to align, variant call (SNP's, indels's, SV's) and phase long read [ONT](https://nanoporetech.com/) and/or [pacbio](https://www.pacb.com/) HiFi data.
+Nextflow pipeline to align, variant call (SNPs, indels, SVs), phase and optionally annotate (SNPs, indels) long read [ONT](https://nanoporetech.com/) and/or [pacbio](https://www.pacb.com/) HiFi data.
+
+There currently exist tools and workflows that undertake comparable analyses, but pipeface serves as a central workflow for processing long read data (both ONT and pacbio HiFi data). Pipeface's future holds STR, CNV and tandem repeat calling, as well as the analysis of cohorts.

 ## Workflow
 
-### Overview
-
 ```mermaid
 %%{init: {'theme':'dark'}}%%
 flowchart LR
 
-input_data("Input data: \n\n ONT fastq.gz \n and/or \n ONT fastq \n and/or \n ONT uBAM \n and/or \n pacbio HiFi uBAM")
+input_data("Input data: <br><br> ONT fastq.gz <br> and/or <br> ONT fastq <br> and/or <br> ONT uBAM <br> and/or <br> pacbio HiFi uBAM")
 merging{{"Merge runs (if needed)"}}
 alignment{{"bam to fastq conversion (if needed), alignment, sorting"}}
 depth{{"Calculate alignment depth"}}
 snp_indel_calling{{"SNP/indel variant calling"}}
 snp_indel_phasing{{"SNP/indel phasing"}}
+snp_indel_annotation{{"SNP/indel annotation (optional - hg38 only)"}}
 haplotagging{{"Haplotagging bams"}}
 sv_calling{{"Structural variant calling"}}
 
 input_data-.->merging-.->alignment-.->snp_indel_calling-.->snp_indel_phasing-.->haplotagging-.->sv_calling
 alignment-.->depth
 alignment-.->haplotagging
+snp_indel_phasing-.->snp_indel_annotation
 
 ```
 
@@ -38,45 +40,50 @@ alignment-.->haplotagging
 %%{init: {'theme':'dark'}}%%
 flowchart LR
 
-ont_data_f1("Sample 1 \n\n Input data: \n\n ONT fastq.gz")
-ont_data_f2("Sample 1 \n\n Input data: \n\n ONT fastq.gz")
-pacbio_data_f3("Sample 2 \n\n Input data: \n\n Pacbio HiFi uBAM")
-pacbio_data_f4("Sample 2 \n\n Input data: \n\n Pacbio HiFi uBAM")
-ont_data_f5("Sample 3 \n\n Input data: \n\n ONT fastq")
-ont_data_f6("Sample 4 \n\n Input data: \n\n ONT uBAM")
-
-merging_m1{{"Description: merge runs \n\n Main tools: GNU coreutils \n\n Commands: cat"}}
-merging_m2{{"Description: merge runs \n\n Main tools: Samtools \n\n Commands: samtools merge"}}
-
-alignment_s1{{"Description: alignment, sorting \n\n Main tools: Minimap2 and Samtools \n\n Commands: minimap2 and samtools sort"}}
-alignment_s2{{"Description: alignment, sorting \n\n Main tools: Minimap2 and Samtools \n\n Commands: minimap2 and samtools sort"}}
-alignment_s3{{"Description: bam to fastq conversion, alignment, sorting \n\n Main tools: Minimap2 and Samtools \n\n Commands: minimap2 and samtools sort"}}
-alignment_s4{{"Description: bam to fastq conversion, alignment, sorting \n\n Main tools: Minimap2 and Samtools \n\n Commands: minimap2 and samtools sort"}}
-
-depth_s1{{"Description: calculate alignment depth \n\n Main tools: Samtools \n\n Commands: samtools depth"}}
-depth_s2{{"Description: calculate alignment depth \n\n Main tools: Samtools \n\n Commands: samtools depth"}}
-depth_s3{{"Description: calculate alignment depth \n\n Main tools: Samtools \n\n Commands: samtools depth"}}
-depth_s4{{"Description: calculate alignment depth \n\n Main tools: Samtools \n\n Commands: samtools depth"}}
-
-snp_indel_calling_s1{{"Description: SNP/indel variant calling \n\n Main tools: Clair3 or DeepVariant (NVIDIA Parabricks) \n\n Commands: run_clair3.sh or pbrun deepvariant"}}
-snp_indel_calling_s2{{"Description: SNP/indel variant calling \n\n Main tools: Clair3 or DeepVariant (NVIDIA Parabricks) \n\n Commands: run_clair3.sh or pbrun deepvariant"}}
-snp_indel_calling_s3{{"Description: SNP/indel variant calling \n\n Main tools: Clair3 or DeepVariant (NVIDIA Parabricks) \n\n Commands: run_clair3.sh or pbrun deepvariant"}}
-snp_indel_calling_s4{{"Description: SNP/indel variant calling \n\n Main tools: Clair3 or DeepVariant (NVIDIA Parabricks) \n\n Commands: run_clair3.sh or pbrun deepvariant"}}
-
-snp_indel_phasing_s1{{"Description: SNP/indel phasing \n\n Main tools: WhatsHap \n\n Commands: whatshap phase"}}
-snp_indel_phasing_s2{{"Description: SNP/indel phasing \n\n Main tools: WhatsHap \n\n Commands: whatshap phase"}}
-snp_indel_phasing_s3{{"Description: SNP/indel phasing \n\n Main tools: WhatsHap \n\n Commands: whatshap phase"}}
-snp_indel_phasing_s4{{"Description: SNP/indel phasing \n\n Main tools: WhatsHap \n\n Commands: whatshap phase"}}
-
-haplotagging_s1{{"Description: haplotagging bams \n\n Main tools: WhatsHap \n\n Commands: whatshap haplotag"}}
-haplotagging_s2{{"Description: haplotagging bams \n\n Main tools: WhatsHap \n\n Commands: whatshap haplotag"}}
-haplotagging_s3{{"Description: haplotagging bams \n\n Main tools: WhatsHap \n\n Commands: whatshap haplotag"}}
-haplotagging_s4{{"Description: haplotagging bams \n\n Main tools: WhatsHap \n\n Commands: whatshap haplotag"}}
-
-sv_calling_s1{{"Description: structural variant calling \n\n Main tools: Sniffles2 and/or cuteSV \n\n Commands: sniffles and/or cuteSV"}}
-sv_calling_s2{{"Description: structural variant calling \n\n Main tools: Sniffles2 and/or cuteSV \n\n Commands: sniffles and/or cuteSV"}}
-sv_calling_s3{{"Description: structural variant calling \n\n Main tools: Sniffles2 and/or cuteSV \n\n Commands: sniffles and/or cuteSV"}}
-sv_calling_s4{{"Description: structural variant calling \n\n Main tools: Sniffles2 and/or cuteSV \n\n Commands: sniffles and/or cuteSV"}}
+ont_data_f1("Sample 1 <br><br> Input data: <br><br> ONT fastq.gz")
+ont_data_f2("Sample 1 <br><br> Input data: <br><br> ONT fastq.gz")
+pacbio_data_f3("Sample 2 <br><br> Input data: <br><br> Pacbio HiFi uBAM")
+pacbio_data_f4("Sample 2 <br><br> Input data: <br><br> Pacbio HiFi uBAM")
+ont_data_f5("Sample 3 <br><br> Input data: <br><br> ONT fastq")
+ont_data_f6("Sample 4 <br><br> Input data: <br><br> ONT uBAM")
+
+merging_m1{{"Description: merge runs <br><br> Main tools: GNU coreutils <br><br> Commands: cat"}}
+merging_m2{{"Description: merge runs <br><br> Main tools: Samtools <br><br> Commands: samtools merge"}}
+
+alignment_s1{{"Description: alignment, sorting <br><br> Main tools: Minimap2 and Samtools <br><br> Commands: minimap2 and samtools sort"}}
+alignment_s2{{"Description: alignment, sorting <br><br> Main tools: Minimap2 and Samtools <br><br> Commands: minimap2 and samtools sort"}}
+alignment_s3{{"Description: bam to fastq conversion, alignment, sorting <br><br> Main tools: Minimap2 and Samtools <br><br> Commands: minimap2 and samtools sort"}}
+alignment_s4{{"Description: bam to fastq conversion, alignment, sorting <br><br> Main tools: Minimap2 and Samtools <br><br> Commands: minimap2 and samtools sort"}}
+
+depth_s1{{"Description: calculate alignment depth <br><br> Main tools: Samtools <br><br> Commands: samtools depth"}}
+depth_s2{{"Description: calculate alignment depth <br><br> Main tools: Samtools <br><br> Commands: samtools depth"}}
+depth_s3{{"Description: calculate alignment depth <br><br> Main tools: Samtools <br><br> Commands: samtools depth"}}
+depth_s4{{"Description: calculate alignment depth <br><br> Main tools: Samtools <br><br> Commands: samtools depth"}}
+
+snp_indel_calling_s1{{"Description: SNP/indel variant calling <br><br> Main tools: Clair3 or DeepVariant <br><br> Commands: run_clair3.sh or run_deepvariant"}}
+snp_indel_calling_s2{{"Description: SNP/indel variant calling <br><br> Main tools: Clair3 or DeepVariant <br><br> Commands: run_clair3.sh or run_deepvariant"}}
+snp_indel_calling_s3{{"Description: SNP/indel variant calling <br><br> Main tools: Clair3 or DeepVariant <br><br> Commands: run_clair3.sh or run_deepvariant"}}
+snp_indel_calling_s4{{"Description: SNP/indel variant calling <br><br> Main tools: Clair3 or DeepVariant <br><br> Commands: run_clair3.sh or run_deepvariant"}}
+
+snp_indel_phasing_s1{{"Description: SNP/indel phasing <br><br> Main tools: WhatsHap <br><br> Commands: whatshap phase"}}
+snp_indel_phasing_s2{{"Description: SNP/indel phasing <br><br> Main tools: WhatsHap <br><br> Commands: whatshap phase"}}
+snp_indel_phasing_s3{{"Description: SNP/indel phasing <br><br> Main tools: WhatsHap <br><br> Commands: whatshap phase"}}
+snp_indel_phasing_s4{{"Description: SNP/indel phasing <br><br> Main tools: WhatsHap <br><br> Commands: whatshap phase"}}
+
+snp_indel_annotation_s1{{"Description: SNP/indel annotation (optional - hg38 only) <br><br> Main tools: ensembl-vep <br><br> Commands: vep"}}
+snp_indel_annotation_s2{{"Description: SNP/indel annotation (optional - hg38 only) <br><br> Main tools: ensembl-vep <br><br> Commands: vep"}}
+snp_indel_annotation_s3{{"Description: SNP/indel annotation (optional - hg38 only) <br><br> Main tools: ensembl-vep <br><br> Commands: vep"}}
+snp_indel_annotation_s4{{"Description: SNP/indel annotation (optional - hg38 only) <br><br> Main tools: ensembl-vep <br><br> Commands: vep"}}
+
+haplotagging_s1{{"Description: haplotagging bams <br><br> Main tools: WhatsHap <br><br> Commands: whatshap haplotag"}}
+haplotagging_s2{{"Description: haplotagging bams <br><br> Main tools: WhatsHap <br><br> Commands: whatshap haplotag"}}
+haplotagging_s3{{"Description: haplotagging bams <br><br> Main tools: WhatsHap <br><br> Commands: whatshap haplotag"}}
+haplotagging_s4{{"Description: haplotagging bams <br><br> Main tools: WhatsHap <br><br> Commands: whatshap haplotag"}}
+
+sv_calling_s1{{"Description: structural variant calling <br><br> Main tools: Sniffles2 and/or cuteSV <br><br> Commands: sniffles and/or cuteSV"}}
+sv_calling_s2{{"Description: structural variant calling <br><br> Main tools: Sniffles2 and/or cuteSV <br><br> Commands: sniffles and/or cuteSV"}}
+sv_calling_s3{{"Description: structural variant calling <br><br> Main tools: Sniffles2 and/or cuteSV <br><br> Commands: sniffles and/or cuteSV"}}
+sv_calling_s4{{"Description: structural variant calling <br><br> Main tools: Sniffles2 and/or cuteSV <br><br> Commands: sniffles and/or cuteSV"}}
 
 ont_data_f1-.->merging_m1-.->alignment_s1-.->snp_indel_calling_s1-.->snp_indel_phasing_s1-.->haplotagging_s1-.->sv_calling_s1
 ont_data_f2-.->merging_m1
@@ -96,6 +103,11 @@ alignment_s2-.->haplotagging_s2
 alignment_s3-.->haplotagging_s3
 alignment_s4-.->haplotagging_s4
 
+snp_indel_phasing_s1-.->snp_indel_annotation_s1
+snp_indel_phasing_s2-.->snp_indel_annotation_s2
+snp_indel_phasing_s3-.->snp_indel_annotation_s3
+snp_indel_phasing_s4-.->snp_indel_annotation_s4
+
 ```
 
 ## Main analyses
@@ -107,10 +119,11 @@ alignment_s4-.->haplotagging_s4
 ## Main tools
 
 - [Minimap2](https://github.com/lh3/minimap2)
-- [Clair3](https://github.com/HKU-BAL/Clair3) OR [DeepVariant](https://github.com/google/deepvariant) (wrapped in [NVIDIA Parabricks](https://docs.nvidia.com/clara/parabricks/latest/))
+- [Clair3](https://github.com/HKU-BAL/Clair3) or [DeepVariant](https://github.com/google/deepvariant)
 - [WhatsHap](https://github.com/whatshap/whatshap)
-- [Sniffles2](https://github.com/fritzsedlazeck/Sniffles) AND/OR [cuteSV](https://github.com/tjiangHIT/cuteSV)
+- [Sniffles2](https://github.com/fritzsedlazeck/Sniffles) and/or [cuteSV](https://github.com/tjiangHIT/cuteSV)
 - [Samtools](https://github.com/samtools/samtools)
+- [ensembl-vep](https://github.com/Ensembl/ensembl-vep)
 
 ## Main input files
 
@@ -119,6 +132,7 @@ alignment_s4-.->haplotagging_s4
 - ONT/pacbio HiFi FASTQ (gzipped or uncompressed) or unaligned BAM
 - Indexed reference genome
 - Clair3 models (if running Clair3)
+- [DeepVariant GPU 1.6.1 docker container](https://hub.docker.com/layers/google/deepvariant/1.6.1-gpu/images/sha256-7929c55106d3739daa18d52802913c43af4ca2879db29656056f59005d1d46cb?context=explore) pulled via singularity (if running DeepVariant)
 
 ### Optional
 
@@ -129,18 +143,26 @@ alignment_s4-.->haplotagging_s4
 
 - Aligned, sorted and haplotagged bam
 - Clair3 or DeepVariant phased SNP/indel VCF file
-- Clair3 or DeepVariant SNP/indel gVCF file
+- Clair3 or DeepVariant phased and annotated SNP/indel VCF file (optional - hg38 only)
 - Phased Sniffles2 and/or un-phased cuteSV SV VCF file
 
 ## Assumptions
 
 - Running pipeline on Australia's [National Computational Infrastructure (NCI)](https://nci.org.au/)
 - Access to if89 project on [National Computational Infrastructure (NCI)](https://nci.org.au/)
+- Access to xy86 project on [National Computational Infrastructure (NCI)](https://nci.org.au/) (if running variant annotation)
 - Access to pipeline dependencies:
   - [Nextflow and it's java dependency](https://nf-co.re/docs/usage/installation). Validated to run on:
     - Nextflow 24.04.1
-    - Java version 17.0.2
+    - Java 17.0.2
+
+*[See the list of software and their versions used by this version of pipeface](./docs/software_versions.txt) as well as the [list of variant databases and their versions](./docs/database_versions.txt) if variant annotation is carried out (assuming the default [nextflow_pipeface.config](./config/nextflow_pipeface.config) file is used).*
 
 ## Run it!
 
 See a walkthrough for how to [run pipeface on NCI](./docs/run_on_nci.md).
+
+## Credit
+
+This is a highly collaborative project, with many contributions from the [Genomic Technologies Lab](https://www.garvan.org.au/research/labs-groups/genomic-technologies-lab). Notably, Dr Andre Reis and Dr Ira Deveson are closely involved in the development of this pipeline.
The installation and hosting of software used in this pipeline has and continues to be supported by the [Australian BioCommons Tools and Workflows project (if89)](https://australianbiocommons.github.io/ables/if89/). + diff --git a/config/nextflow_pipeface.config b/config/nextflow_pipeface.config index 146dcdf..6668007 100644 --- a/config/nextflow_pipeface.config +++ b/config/nextflow_pipeface.config @@ -1,11 +1,21 @@ +params.vep_db = '/g/data/if89/datalib/vep/112/grch38/' +params.revel_db = '/g/data/xy86/revel/1.3/grch38/new_tabbed_revel_grch38.tsv.gz' +params.gnomad_db = '/g/data/xy86/gnomad/genomes/v4.1.0/gnomad.joint.v4.1.sites.chrall.vcf.gz' +params.clinvar_db = '/g/data/xy86/clinvar/2024-08-25/grch38/clinvar_20240825.vcf.gz' +params.cadd_snv_db = '/g/data/xy86/cadd/1.7/grch38/whole_genome_SNVs.tsv.gz' +params.cadd_indel_db = '/g/data/xy86/cadd/1.7/grch38/gnomad.genomes.r4.0.indel.tsv.gz' +params.spliceai_snv_db = '/g/data/xy86/spliceai/v1.3/grch38/spliceai_scores.raw.snv.hg38.vcf.gz' +params.spliceai_indel_db = '/g/data/xy86/spliceai/v1.3/grch38/spliceai_scores.raw.indel.hg38.vcf.gz' +params.alphamissense_db = '/g/data/xy86/alphamissense/grch38/AlphaMissense_hg38.tsv.gz' + process { executor = 'pbspro' project = 'kr68' - storage = 'gdata/if89+scratch/kr68+gdata/kr68+gdata/ox63' + storage = 'gdata/if89+gdata/xy86+scratch/kr68+gdata/kr68+gdata/ox63' // provide proper access to if89 environmental modules - beforeScript = 'module use -a /g/data/if89/apps/modulefiles' + beforeScript = 'module use -a /g/data/if89/apps/modulefiles && module use -a /g/data/if89/shpcroot/modules' withName: scrape_settings { queue = 'normal' @@ -33,7 +43,7 @@ process { withName: minimap2 { queue = 'normal' cpus = '16' - time = '10h' + time = '14h' memory = '64GB' module = 'minimap2/2.28:samtools/1.19' } @@ -49,21 +59,30 @@ process { withName: clair3 { queue = 'normal' cpus = '32' - time = '6h' + time = '9h' memory = '128GB' - module = 'clair3/v1.0.9:htslib/1.16' + module = 'clair3/v1.0.9' } withName: deepvariant { queue = 'gpuvolta' cpus = '24' gpus = '2' - time = '6h' - memory = '180GB' - module = 'parabricks/4.2.1:htslib/1.16' + time = '8h' + memory = '192GB' + disk = '80GB' + module = 'singularity' + } + + withName: vep_snv { + queue = 'normal' + cpus = '32' + time = '10h' + memory = '128GB' + module = 'singularity:htslib/1.16:ensemblorg/ensembl-vep/release_112.0' } - withName: 'whatshap_phase_clair3|whatshap_phase_dv|whatshap_haplotag' { + withName: 'whatshap_phase|whatshap_haplotag' { queue = 'normal' cpus = '4' time = '10h' @@ -87,7 +106,7 @@ process { module = 'cuteSV/1.0.13:htslib/1.16' } - withName: 'publish_settings|publish_bam_header|publish_minimap2|publish_clair3|publish_deepvariant|publish_whatshap_phase_clair3|publish_whatshap_phase_dv|publish_whatshap_haplotag|publish_sniffles|publish_cutesv' { + withName: 'publish_settings|publish_bam_header|publish_depth|publish_whatshap_phase|publish_whatshap_haplotag|publish_sniffles|publish_cutesv' { queue = 'normal' cpus = '1' time = '20m' diff --git a/config/parameters_pipeface.json b/config/parameters_pipeface.json index 1481fc7..f7c765c 100644 --- a/config/parameters_pipeface.json +++ b/config/parameters_pipeface.json @@ -7,7 +7,8 @@ "tandem_repeat": "", "snp_indel_caller": "", "sv_caller": "", - "outdir": "" + "annotate": "", + "outdir": "", + "deepvariant_container": "" } - diff --git a/docs/database_versions.txt b/docs/database_versions.txt new file mode 100644 index 0000000..c9bc458 --- /dev/null +++ b/docs/database_versions.txt @@ -0,0 +1,9 @@ 
+vep-cache: homo_sapiens/112/GRCh38 +REVEL database: 1.3 +gnomAD database: 4.1.0 +ClinVar database: 2024-08-25 +CADD SNV database: 1.7/GRCh38 +CADD indel database: 1.7/GRCh38 +SpliceAI SNV database: 1.3 +SpliceAI indel database: 1.3 +AlphaMissense database: GRCh38 diff --git a/docs/run_on_nci.md b/docs/run_on_nci.md index a311504..69823fa 100644 --- a/docs/run_on_nci.md +++ b/docs/run_on_nci.md @@ -6,10 +6,11 @@ - [Reference genome](#reference-genome) - [hg38](#hg38) - [hs1](#hs1) - - [Clair3 models](#clair3-models) + - [Clair3 models (if running clair3)](#clair3-models-if-running-clair3) - [ONT](#ont) - [Pacbio HiFi revio](#pacbio-hifi-revio) - - [3. Modify in_data.csv](#3-modify-in_datacsv) + - [DeepVariant container (if running DeepVariant)](#deepvariant-container-if-running-deepvariant) + - [3. Modify in\_data.csv](#3-modify-in_datacsv) - [4. Modify nextflow\_pipeface.config](#4-modify-nextflow_pipefaceconfig) - [5. Modify parameters\_pipeface.json](#5-modify-parameters_pipefacejson) - [6. Get pipeline dependencies](#6-get-pipeline-dependencies) @@ -86,7 +87,7 @@ module load samtools/1.19 samtools faidx hs1.fa ``` -### Clair3 models +### Clair3 models (if running clair3) #### ONT @@ -116,6 +117,15 @@ Untar tar -xvf hifi_revio.tar.gz ``` +### DeepVariant container (if running DeepVariant) + +Get a local copy of the DeepVariant GPU container v1.6.1 (singularity image file) + +```bash +module load singularity +singularity pull deepvariant_1.6.1-gpu.sif docker://google/deepvariant:deeptrio-1.6.1-gpu +``` + ## 3. Modify in_data.csv Specify the sample ID, file path to the data, data type, file path to regions of interest bed file (optional) and file path to clair3 model (if running Clair3) for each data to be analysed. Eg: @@ -150,10 +160,11 @@ Modify the NCI project to which to charge the analysis. Eg: Modify access to project specific directories. Eg: ```txt - storage = 'gdata/if89+scratch/kr68+gdata/kr68' + storage = 'gdata/if89+gdata/xy86+scratch/kr68+gdata/kr68' ``` > **_Note:_** Don't remove access to if89 gdata (`gdata/if89`). This is required to access environmental modules used in the pipeline +> **_Note:_** Similarly, don't remove access to xy86 gdata (`gdata/xy86`) if running variant annotation. This is required to access variant annotation databases used in the pipeline ## 5. Modify parameters_pipeface.json @@ -219,12 +230,30 @@ Specify the SV caller to use ('sniffles', 'cutesv' or 'both'). Eg: "sv_caller": "both", ``` +Specify whether variant annotation should be carried out ('yes' or 'no'). Eg: + +```json + "annotate": "no", +``` + +*OR* + +```json + "annotate": "yes", +``` + Specify the directory in which to write the pipeline outputs (please provide a full path). Eg: ```json "outdir": "/g/data/ox63/results" ``` +Specify the path to DeepVariant GPU container v1.6.1 (singularity image file). Eg: + +```json + "deepvariant_container": "./deepvariant_1.6.1-gpu.sif" +``` + ## 6. Get pipeline dependencies You may use the centrally installed nextflow environmental module available on NCI to access the nextflow and java dependencies @@ -251,5 +280,5 @@ The resources requested and the queue each process is submitted to may be modifi Similarly, with some coding skills, the environmental modules used by each process in the pipeline may be modified. This means you're able to substitute in different versions of software used by the pipeline. However, keep in mind that the pipeline doesn't account for differences in parameterisation between software versions. 
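As a minimal illustration of that kind of substitution (editor's sketch, not part of this patch): before changing a `module =` line in `./config/nextflow_pipeface.config`, one might first check which versions are actually installed under the if89 module tree on NCI. The module path below is the one the pipeline's `beforeScript` already uses; the tool names and version queried are just examples.

```bash
# make the if89 environmental modules visible to the module command
module use -a /g/data/if89/apps/modulefiles

# list the versions installed for one of the pipeline's tools (example tool name)
module avail minimap2

# inspect what loading a given module would do before wiring it into nextflow_pipeface.config
module show minimap2/2.28
```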
-This also means this pipeline is adaptable to other HPC's if appropriate environmental modules are included in `./config/nextflow_pipeface.config` (or if you get around to creating a nextflow configuration file pointing to appropriate containerised software before I do) and modify the job scheduler specific configuration if needed. +This also means this pipeline is adaptable to other HPC's if appropriate environmental modules are included in `./config/nextflow_pipeface.config` (or if you get around to creating a nextflow configuration file pointing to appropriate containerised software before I do) and modify the job scheduler specific configuration if needed. If you wish to use the variant annotation component of the pipeline, you'll additionally need to create local copies of the variant annotation databases used by the pipeline. diff --git a/docs/software_versions.txt b/docs/software_versions.txt new file mode 100644 index 0000000..67aea44 --- /dev/null +++ b/docs/software_versions.txt @@ -0,0 +1,11 @@ +Samtools: 1.19 +Minimap2: 2.28-r1209 +Clair3: 1.0.9 +DeepVariant 1.6.1 +WhatsHap: 2.3 +Sniffles2: 2.3.3 +cuteSV: 1.0.13 +GNU coreutils: 8.30 +HTSlib: 1.16 +GNU Awk: 4.2.1 +ensembl-vep: 112 diff --git a/pipeface.nf b/pipeface.nf index 12f9a58..0a3d131 100644 --- a/pipeface.nf +++ b/pipeface.nf @@ -1,9 +1,11 @@ + nextflow.enable.dsl=2 // set defaults for optional params (set default optional input files to dummy NONE file) // secret sauce second outdir params.outdir2 = "" params.tandem_repeat = "NONE" +params.annotate_override = "" process scrape_settings { @@ -15,6 +17,7 @@ process scrape_settings { val tandem_repeat val snp_indel_caller val sv_caller + val annotate val outdir output: @@ -43,6 +46,7 @@ process scrape_settings { echo "Tandem repeat file: $tandem_repeat" >> pipeface_settings.txt echo "SNP/indel caller: $snp_indel_caller" >> pipeface_settings.txt echo "SV caller: $reported_sv_caller" >> pipeface_settings.txt + echo "Annotate: $annotate" >> pipeface_settings.txt echo "Outdir: $outdir" >> pipeface_settings.txt """ @@ -177,7 +181,7 @@ process merge_runs { else if( extension == 'bam' ) """ ${ - if (files instanceof List && files.size() > 1) { + if (files instanceof List && files.size() > 1) { "samtools merge -@ ${task.cpus} ${files} -o merged" } else { "ln -s ${files} merged" @@ -200,8 +204,8 @@ process minimap2 { val ref_index output: - tuple val(sample_id), val(data_type), val(regions_of_interest), val(clair3_model), path('minimap2.sorted.bam'), path('minimap2.sorted.bam.bai') - tuple val(sample_id), path('minimap2.sorted.bam'), path('minimap2.version.txt') + tuple val(sample_id), val(data_type), val(regions_of_interest), val(clair3_model), path('sorted.bam'), path('sorted.bam.bai') + tuple val(sample_id), path('sorted.bam') script: // conditionally define preset @@ -224,14 +228,11 @@ process minimap2 { -a \ -x $preset \ -t ${task.cpus} \ - -R '@RG\\t$sample_id:S1\\tSM:$sample_id' \ - $ref - | samtools sort -@ ${task.cpus} -o minimap2.sorted.bam - + $ref - | samtools sort -@ ${task.cpus} -o sorted.bam - # index bam samtools index \ -@ ${task.cpus} \ - minimap2.sorted.bam - # grab version - minimap2 --version > minimap2.version.txt + sorted.bam """ else if( extension == 'gz' | extension == 'fastq' ) """ @@ -243,21 +244,17 @@ process minimap2 { -x $preset \ -t ${task.cpus} \ $ref \ - -R '@RG\\tID:$sample_id\\tSM:$sample_id' \ - $merged | samtools sort -@ ${task.cpus} -o minimap2.sorted.bam - + $merged | samtools sort -@ ${task.cpus} -o sorted.bam - # index bam samtools index \ 
-@ ${task.cpus} \ - minimap2.sorted.bam - # grab version - minimap2 --version > minimap2.version.txt + sorted.bam """ stub: """ - touch minimap2.sorted.bam - touch minimap2.sorted.bam.bai - touch minimap2.version.txt + touch sorted.bam + touch sorted.bam.bai """ } @@ -265,10 +262,10 @@ process minimap2 { process depth { input: - tuple val(sample_id), path(bam), path(minimap2_version) + tuple val(sample_id), path(bam) output: - tuple val(sample_id), path('minimap2.version.txt'), path('depth.tsv') + tuple val(sample_id), path('depth.tsv') script: """ @@ -292,18 +289,17 @@ process depth { } -process publish_minimap2 { +process publish_depth { publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$filename" } input: - tuple val(sample_id), path(minimap_version), path(depth) + tuple val(sample_id), path(depth) val outdir val outdir2 val ref_name output: - path 'minimap2.version.txt' path 'depth.tsv' script: @@ -313,7 +309,6 @@ process publish_minimap2 { stub: """ - touch minimap2.version.txt touch depth.tsv """ @@ -327,8 +322,7 @@ process clair3 { val ref_index output: - tuple val(sample_id), val(data_type), path(bam), path(bam_index), path('clair3.snp_indel.vcf.gz'), path('clair3.snp_indel.vcf.gz.tbi') - tuple val(sample_id), path('clair3.snp_indel.g.vcf.gz'), path('clair3.snp_indel.g.vcf.gz.tbi'), path('clair3.version.txt') + tuple val(sample_id), val(data_type), path(bam), path(bam_index), path('snp_indel.vcf.gz'), path('snp_indel.vcf.gz.tbi') script: // define a string to optionally pass regions of interest bed file @@ -354,56 +348,61 @@ process clair3 { --include_all_ctgs \ $regions_of_interest_optional # rename files - ln -s merge_output.vcf.gz clair3.snp_indel.vcf.gz - ln -s merge_output.vcf.gz.tbi clair3.snp_indel.vcf.gz.tbi - ln -s merge_output.gvcf.gz clair3.snp_indel.g.vcf.gz - ln -s merge_output.gvcf.gz.tbi clair3.snp_indel.g.vcf.gz.tbi - # grab version - run_clair3.sh --version > clair3.version.txt + ln -s merge_output.vcf.gz snp_indel.vcf.gz + ln -s merge_output.vcf.gz.tbi snp_indel.vcf.gz.tbi """ stub: """ - touch clair3.snp_indel.vcf.gz - touch clair3.snp_indel.vcf.gz.tbi - touch clair3.snp_indel.g.vcf.gz - touch clair3.snp_indel.g.vcf.gz.tbi - touch clair3.version.txt + touch snp_indel.vcf.gz + touch snp_indel.vcf.gz.tbi """ } -process publish_clair3 { - - publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$filename" } +process deepvariant { input: - tuple val(sample_id), path(snp_indel_gvcf), path(snp_indel_gvcf_index), path(version) - val outdir - val outdir2 - val ref_name + tuple val(sample_id), val(data_type), val(regions_of_interest), val(clair3_model), path(bam), path(bam_index) + val ref + val ref_index + val deepvariant_container output: - val sample_id - path 'clair3.snp_indel.g.vcf.gz' - path 'clair3.snp_indel.g.vcf.gz.tbi' - path 'clair3.version.txt' + tuple val(sample_id), val(data_type), path(bam), path(bam_index), path('snp_indel.vcf.gz'), path('snp_indel.vcf.gz.tbi') script: + // conditionally define model type + if( data_type == 'ont' ) { + model = 'ONT_R104' + } + else if ( data_type == 'pacbio' ) { + model = 'PACBIO' + } + // define an optional string to pass regions of interest bed file + def regions_of_interest_optional = file(regions_of_interest).name != 'NONE' ? 
"--regions $regions_of_interest" : '' """ - echo "Publishing files" + # run deepvariant + singularity run $deepvariant_container run_deepvariant \ + --reads=$bam \ + --ref=$ref \ + --sample_name=$sample_id \ + --output_vcf=snp_indel.vcf.gz \ + --model_type=$model \ + $regions_of_interest_optional \ + --num_shards=${task.cpus} \ + --postprocess_cpus=${task.cpus} """ stub: """ - touch clair3.snp_indel.g.vcf.gz - touch clair3.snp_indel.g.vcf.gz.tbi - touch clair3.version.txt + touch snp_indel.vcf.gz + touch snp_indel.vcf.gz.tbi """ } -process whatshap_phase_clair3 { +process whatshap_phase { input: tuple val(sample_id), val(data_type), path(bam), path(bam_index), path(snp_indel_vcf), path(snp_indel_vcf_index) @@ -411,122 +410,48 @@ process whatshap_phase_clair3 { val ref_index output: - tuple val(sample_id), val(data_type), path(bam), path(bam_index), path('clair3.snp_indel.phased.vcf.gz'), path('clair3.snp_indel.phased.vcf.gz.tbi') - tuple val(sample_id), path('clair3.snp_indel.phased.vcf.gz'), path('clair3.snp_indel.phased.vcf.gz.tbi'), path('clair3.snp_indel.phased.read_list.txt') + tuple val(sample_id), val(data_type), path(bam), path(bam_index), path('snp_indel.phased.vcf.gz'), path('snp_indel.phased.vcf.gz.tbi') + tuple val(sample_id), path('snp_indel.phased.vcf.gz'), path('snp_indel.phased.vcf.gz.tbi'), path('snp_indel.phased.read_list.txt') + tuple val(sample_id), path('snp_indel.phased.vcf.gz'), path('snp_indel.phased.vcf.gz.tbi') script: """ # run whatshap phase whatshap phase \ --reference $ref \ - --output clair3.snp_indel.phased.vcf.gz \ - --output-read-list clair3.snp_indel.phased.read_list.txt \ + --output snp_indel.phased.vcf.gz \ + --output-read-list snp_indel.phased.read_list.txt \ --sample $sample_id \ --ignore-read-groups $snp_indel_vcf $bam # index vcf - tabix clair3.snp_indel.phased.vcf.gz + tabix snp_indel.phased.vcf.gz """ stub: """ - touch clair3.snp_indel.phased.vcf.gz - touch clair3.snp_indel.phased.vcf.gz.tbi - touch clair3.snp_indel.phased.read_list.txt + touch snp_indel.phased.vcf.gz + touch snp_indel.phased.vcf.gz.tbi + touch snp_indel.phased.read_list.txt """ } -process publish_whatshap_phase_clair3 { +process publish_whatshap_phase { - publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$filename" } + publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$snp_indel_caller.$filename" } input: tuple val(sample_id), path(snp_indel_phased_vcf), path(snp_indel_phased_vcf_index), path(snp_indel_phased_read_list) val outdir val outdir2 val ref_name + val snp_indel_caller output: val sample_id - path 'clair3.snp_indel.phased.vcf.gz' - path 'clair3.snp_indel.phased.vcf.gz.tbi' - path 'clair3.snp_indel.phased.read_list.txt' - - script: - """ - echo "Publishing files" - """ - - stub: - """ - touch clair3.snp_indel.phased.vcf.gz - touch clair3.snp_indel.phased.vcf.gz.tbi - touch clair3.snp_indel.phased.read_list.txt - """ - -} - -process deepvariant { - - input: - tuple val(sample_id), val(data_type), val(regions_of_interest), val(clair3_model), path(bam), path(bam_index) - val ref - val ref_index - - output: - tuple val(sample_id), val(data_type), path(bam), path(bam_index), path('deepvariant.snp_indel.vcf.gz'), path('deepvariant.snp_indel.vcf.gz.tbi') - tuple val(sample_id), path('deepvariant.snp_indel.g.vcf.gz'), path('deepvariant.snp_indel.g.vcf.gz.tbi'), path('deepvariant.version.txt') - - script: - // define an optional string to pass 
regions of interest bed file - def regions_of_interest_optional = file(regions_of_interest).name != 'NONE' ? "-L $regions_of_interest" : '' - """ - # run deepvariant - pbrun deepvariant \ - --mode $data_type \ - --ref $ref \ - --in-bam $bam \ - --gvcf \ - --out-variants deepvariant.snp_indel.g.vcf.gz \ - --num-gpus ${task.gpus} \ - --num-cpu-threads-per-stream ${task.cpus} \ - --num-streams-per-gpu 1 \ - $regions_of_interest_optional - # compress and index vcf - bgzip \ - -@ ${task.cpus} \ - deepvariant.snp_indel.vcf - tabix deepvariant.snp_indel.vcf.gz - # grab version - pbrun deepvariant --version >> deepvariant.version.txt - """ - - stub: - """ - touch deepvariant.snp_indel.vcf.gz - touch deepvariant.snp_indel.vcf.gz.tbi - touch deepvariant.snp_indel.g.vcf.gz - touch deepvariant.snp_indel.g.vcf.gz.tbi - touch deepvariant.version.txt - """ - -} - -process publish_deepvariant { - - publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$filename" } - - input: - tuple val(sample_id), path(snp_indel_gvcf), path(snp_indel_gvcf_index), path(deepvariant_version) - val outdir - val outdir2 - val ref_name - - output: - val sample_id - path 'deepvariant.snp_indel.g.vcf.gz' - path 'deepvariant.snp_indel.g.vcf.gz.tbi' - path 'deepvariant.version.txt' + path 'snp_indel.phased.vcf.gz' + path 'snp_indel.phased.vcf.gz.tbi' + path 'snp_indel.phased.read_list.txt' script: """ @@ -535,61 +460,91 @@ process publish_deepvariant { stub: """ - touch deepvariant.snp_indel.g.vcf.gz - touch deepvariant.snp_indel.g.vcf.gz.tbi - touch deepvariant.version.txt + touch snp_indel.phased.vcf.gz + touch snp_indel.phased.vcf.gz.tbi + touch snp_indel.phased.read_list.txt """ } -process whatshap_phase_dv { +process vep_snv { input: - tuple val(sample_id), val(data_type), path(bam), path(bam_index), path(snp_indel_vcf), path(snp_indel_vcf_index) + tuple val(sample_id), path(snp_indel_phased_vcf), path(snp_indel_phased_vcf_index) val ref val ref_index + val vep_db + val revel_db + val gnomad_db + val clinvar_db + val cadd_snv_db + val cadd_indel_db + val spliceai_snv_db + val spliceai_indel_db + val alphamissense_db output: - tuple val(sample_id), val(data_type), path(bam), path(bam_index), path('deepvariant.snp_indel.phased.vcf.gz'), path('deepvariant.snp_indel.phased.vcf.gz.tbi') - tuple val(sample_id), path('deepvariant.snp_indel.phased.vcf.gz'), path('deepvariant.snp_indel.phased.vcf.gz.tbi'), path('deepvariant.snp_indel.phased.read_list.txt') + tuple val(sample_id), path('snp_indel.phased.annotated.vcf.gz'), path('snp_indel.phased.annotated.vcf.gz.tbi') script: """ - # run whatshap phase - whatshap phase \ - --reference $ref \ - --output deepvariant.snp_indel.phased.vcf.gz \ - --output-read-list deepvariant.snp_indel.phased.read_list.txt \ - --sample $sample_id \ - --ignore-read-groups $snp_indel_vcf $bam + # run vep + vep -i $snp_indel_phased_vcf \ + -o snp_indel.phased.annotated.vcf.gz \ + --format vcf \ + --vcf \ + --fasta $ref \ + --dir $vep_db \ + --assembly GRCh38 \ + --species homo_sapiens \ + --cache \ + --offline \ + --merged \ + --sift b \ + --polyphen b \ + --symbol \ + --hgvs \ + --hgvsg \ + --plugin REVEL,file=$revel_db \ + --custom file=$gnomad_db,short_name=gnomAD,format=vcf,type=exact,fields=AF_joint%AF_exomes%AF_genomes%nhomalt_joint%nhomalt_exomes%nhomalt_genomes \ + --custom file=$clinvar_db,short_name=ClinVar,format=vcf,type=exact,coords=0,fields=CLNSIG \ + --plugin CADD,snv=$cadd_snv_db,indels=$cadd_indel_db \ + --plugin 
SpliceAI,snv=$spliceai_snv_db,indel=$spliceai_indel_db \ + --plugin AlphaMissense,file=$alphamissense_db \ + --uploaded_allele \ + --check_existing \ + --filter_common \ + --no_intergenic \ + --pick \ + --fork ${task.cpus} \ + --no_stats \ + --compress_output bgzip # index vcf - tabix deepvariant.snp_indel.phased.vcf.gz + tabix snp_indel.phased.annotated.vcf.gz """ stub: """ - touch deepvariant.snp_indel.phased.vcf.gz - touch deepvariant.snp_indel.phased.vcf.gz.tbi - touch deepvariant.snp_indel.phased.read_list.txt + touch snp_indel.phased.annotated.vcf.gz + touch snp_indel.phased.annotated.vcf.gz.tbi """ } -process publish_whatshap_phase_dv { +process publish_vep_snv { - publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$filename" } + publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$snp_indel_caller.$filename" } input: - tuple val(sample_id), path(snp_indel_phased_vcf), path(snp_indel_phased_vcf_index), path(snp_indel_phased_read_list) + tuple val(sample_id), path(snp_indel_phased_annotated_vcf), path(snp_indel_phased_annotated_vcf_index) val outdir val outdir2 val ref_name + val snp_indel_caller output: - val sample_id - path 'deepvariant.snp_indel.phased.vcf.gz' - path 'deepvariant.snp_indel.phased.vcf.gz.tbi' - path 'deepvariant.snp_indel.phased.read_list.txt' + path 'snp_indel.phased.annotated.vcf.gz' + path 'snp_indel.phased.annotated.vcf.gz.tbi' script: """ @@ -598,9 +553,8 @@ process publish_whatshap_phase_dv { stub: """ - touch deepvariant.snp_indel.phased.vcf.gz - touch deepvariant.snp_indel.phased.vcf.gz.tbi - touch deepvariant.snp_indel.phased.read_list.txt + touch snp_indel.phased.annotated.vcf.gz + touch snp_indel.phased.annotated.vcf.gz.tbi """ } @@ -613,54 +567,52 @@ process whatshap_haplotag { val ref_index output: - tuple val(sample_id), val(data_type), path('minimap2.whatshap.sorted.haplotagged.bam'), path('minimap2.whatshap.sorted.haplotagged.bam.bai') - tuple val(sample_id), path('minimap2.whatshap.sorted.haplotagged.bam'), path('minimap2.whatshap.sorted.haplotagged.bam.bai'), path('minimap2.whatshap.sorted.haplotagged.tsv'), path('whatshap.version.txt') + tuple val(sample_id), val(data_type), path('sorted.haplotagged.bam'), path('sorted.haplotagged.bam.bai') + tuple val(sample_id), path('sorted.haplotagged.bam'), path('sorted.haplotagged.bam.bai'), path('sorted.haplotagged.tsv') script: """ # run whatshap haplotag whatshap haplotag \ --reference $ref \ - --output minimap2.whatshap.sorted.haplotagged.bam \ + --output sorted.haplotagged.bam \ --sample $sample_id \ --tag-supplementary \ --ignore-read-groups \ --output-threads ${task.cpus} \ - --output-haplotag-list minimap2.whatshap.sorted.haplotagged.tsv \ + --output-haplotag-list sorted.haplotagged.tsv \ $snp_indel_vcf $bam # index bam samtools index \ -@ ${task.cpus} \ - minimap2.whatshap.sorted.haplotagged.bam - # grab version - whatshap --version > whatshap.version.txt + sorted.haplotagged.bam """ stub: """ - touch minimap2.whatshap.sorted.haplotagged.bam - touch minimap2.whatshap.sorted.haplotagged.bam.bai - touch minimap2.whatshap.sorted.haplotagged.tsv - touch whatshap.version.txt + touch sorted.haplotagged.bam + touch sorted.haplotagged.bam.bai + touch sorted.haplotagged.tsv """ } process publish_whatshap_haplotag { - publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$filename" } + def mapper_phaser = 
"minimap2.whatshap" + + publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$mapper_phaser.$filename" } input: - tuple val(sample_id), path(haplotagged_bam), path(haplotagged_bam_index), path(haplotagged_bam_tsv), path(whatshap_version) + tuple val(sample_id), path(haplotagged_bam), path(haplotagged_bam_index), path(haplotagged_bam_tsv) val outdir val outdir2 val ref_name output: - path 'minimap2.whatshap.sorted.haplotagged.bam' - path 'minimap2.whatshap.sorted.haplotagged.bam.bai' - path 'minimap2.whatshap.sorted.haplotagged.tsv' - path 'whatshap.version.txt' + path 'sorted.haplotagged.bam' + path 'sorted.haplotagged.bam.bai' + path 'sorted.haplotagged.tsv' script: """ @@ -669,10 +621,9 @@ process publish_whatshap_haplotag { stub: """ - touch minimap2.whatshap.sorted.haplotagged.bam - touch minimap2.whatshap.sorted.haplotagged.bam.bai - touch minimap2.whatshap.sorted.haplotagged.tsv - touch whatshap.version.txt + touch sorted.haplotagged.bam + touch sorted.haplotagged.bam.bai + touch sorted.haplotagged.tsv """ } @@ -686,7 +637,7 @@ process sniffles { val tandem_repeat output: - tuple val(sample_id), path('sniffles.sv.phased.vcf.gz'), path('sniffles.sv.phased.vcf.gz.tbi'), path('sniffles.sv.phased.snf'), path('sniffles.version.txt') + tuple val(sample_id), path('sv.phased.vcf.gz'), path('sv.phased.vcf.gz.tbi') script: // define a string to optionally pass tandem repeat bed file @@ -698,40 +649,35 @@ process sniffles { --input $haplotagged_bam \ --threads ${task.cpus} \ --sample-id $sample_id \ - --vcf sniffles.sv.phased.vcf.gz \ - --snf sniffles.sv.phased.snf \ + --vcf sv.phased.vcf.gz \ --output-rnames \ --minsvlen 20 \ --phase $tandem_repeat_optional - # grab version - sniffles --version > sniffles.version.txt """ stub: """ - touch sniffles.sv.phased.vcf.gz - touch sniffles.sv.phased.vcf.gz.tbi - touch sniffles.sv.phased.snf - touch sniffles.version.txt + touch sv.phased.vcf.gz + touch sv.phased.vcf.gz.tbi """ } process publish_sniffles { - publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$filename" } + def sv_caller = "sniffles" + + publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$sv_caller.$filename" } input: - tuple val(sample_id), path(sv_vcf), path(sv_vcf_index), path(sv_snf), path(sniffles_version) + tuple val(sample_id), path(sv_vcf), path(sv_vcf_index) val outdir val outdir2 val ref_name output: - path 'sniffles.sv.phased.vcf.gz' - path 'sniffles.sv.phased.vcf.gz.tbi' - path 'sniffles.sv.phased.snf' - path 'sniffles.version.txt' + path 'sv.phased.vcf.gz' + path 'sv.phased.vcf.gz.tbi' script: """ @@ -740,10 +686,8 @@ process publish_sniffles { stub: """ - touch sniffles.sv.phased.vcf.gz - touch sniffles.sv.phased.vcf.gz.tbi - touch sniffles.sv.phased.snf - touch sniffles.version.txt + touch sv.phased.vcf.gz + touch sv.phased.vcf.gz.tbi """ } @@ -757,7 +701,7 @@ process cutesv { val tandem_repeat output: - tuple val(sample_id), path('cutesv.sv.vcf.gz'), path('cutesv.sv.vcf.gz.tbi'), path('cutesv.version.txt') + tuple val(sample_id), path('sv.vcf.gz'), path('sv.vcf.gz.tbi') script: if( data_type == 'ont' ) { @@ -771,7 +715,7 @@ process cutesv { cuteSV \ $haplotagged_bam \ $ref \ - cutesv.sv.vcf \ + sv.vcf \ ./ \ --sample ${sample_id} \ -t ${task.cpus} \ @@ -781,35 +725,33 @@ process cutesv { # compress and index vcf bgzip \ -@ ${task.cpus} \ - cutesv.sv.vcf - tabix cutesv.sv.vcf.gz - 
# grab version - cuteSV --version > cutesv.version.txt + sv.vcf + tabix sv.vcf.gz """ stub: """ - touch cutesv.sv.vcf.gz - touch cutesv.sv.vcf.gz.tbi - touch cutesv.version.txt + touch sv.vcf.gz + touch sv.vcf.gz.tbi """ } process publish_cutesv { - publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$filename" } + def sv_caller = "cutesv" + + publishDir "$outdir/$sample_id/$outdir2", mode: 'copy', overwrite: true, saveAs: { filename -> "$sample_id.$ref_name.$sv_caller.$filename" } input: - tuple val(sample_id), path(sv_vcf), path(sv_vcf_index), path(cutesv_version) + tuple val(sample_id), path(sv_vcf), path(sv_vcf_index) val outdir val outdir2 val ref_name output: - path 'cutesv.sv.vcf.gz' - path 'cutesv.sv.vcf.gz.tbi' - path 'cutesv.version.txt' + path 'sv.vcf.gz' + path 'sv.vcf.gz.tbi' script: """ @@ -818,9 +760,8 @@ process publish_cutesv { stub: """ - touch cutesv.sv.vcf.gz - touch cutesv.sv.vcf.gz.tbi - touch cutesv.version.txt + touch sv.vcf.gz + touch sv.vcf.gz.tbi """ } @@ -834,8 +775,20 @@ workflow { tandem_repeat = "$params.tandem_repeat" snp_indel_caller = "$params.snp_indel_caller" sv_caller = "$params.sv_caller" + annotate = "$params.annotate" + annotate_override = "$params.annotate_override" outdir = "$params.outdir" outdir2 = "$params.outdir2" + deepvariant_container = "$params.deepvariant_container" + vep_db = "$params.vep_db" + revel_db = "$params.revel_db" + gnomad_db = "$params.gnomad_db" + clinvar_db = "$params.clinvar_db" + cadd_snv_db = "$params.cadd_snv_db" + cadd_indel_db = "$params.cadd_indel_db" + spliceai_snv_db = "$params.spliceai_snv_db" + spliceai_indel_db = "$params.spliceai_indel_db" + alphamissense_db = "$params.alphamissense_db" // check user provided parameters if ( !in_data ) { @@ -856,15 +809,60 @@ workflow { if ( snp_indel_caller != 'clair3' && snp_indel_caller != 'deepvariant' ) { exit 1, "Error: SNP/indel calling software should be either 'clair3' or 'deepvariant', '${snp_indel_caller}' selected." } + if ( snp_indel_caller == 'deepvariant' && deepvariant_container == 'NONE' ) { + exit 1, "Error: When DeepVariant is selected as the SNP/indel calling software, provide a path to an appropriate DeepVariant container in the parameter file or pass to --deepvariant_container on the command line rather than setting it to 'NONE'." + } if ( !sv_caller ) { exit 1, "Error: No SV calling software selected. Either include in parameter file or pass to --sv_caller on the command line. Should be 'sniffles', 'cutesv', or 'both'." } if ( sv_caller != 'sniffles' && sv_caller != 'cutesv' && sv_caller != 'both' ) { exit 1, "Error: SV calling software should be 'sniffles', 'cutesv', or 'both', '${sv_caller}' selected." } + if ( !annotate ) { + exit 1, "Error: Choice to annotate not made. Either include in parameter file or pass to --annotate on the command line. Should be either 'yes' or 'no'." + } + if ( annotate != 'yes' && annotate != 'no' ) { + exit 1, "Error: Choice to annotate should be either 'yes', or 'no', '${annotate}' selected." + } + if ( annotate == 'yes' && !ref.toLowerCase().contains('hg38') && !ref.toLowerCase().contains('grch38') && annotate_override != 'yes' ) { + exit 1, "Warning: Annotation of only hg38/GRCh38 is supported. You've chosen to annotate, but it looks like you may not be passing a hg38/GRCh38 reference genome based on the filename of the reference genome. '${ref}' passed. Pass '--annotate_override yes' on the command line to override this error." 
+ } + if ( annotate == 'yes' && !file(vep_db).exists() ) { + exit 1, "Error VEP cache directory does not exist, '${vep_db}' provided." + } + if ( annotate == 'yes' && !file(revel_db).exists() ) { + exit 1, "Error REVEL database file path does not exist, '${revel_db}' provided." + } + if ( annotate == 'yes' && !file(gnomad_db).exists() ) { + exit 1, "Error gnomAD database file path does not exist, '${gnomad_db}' provided." + } + if ( annotate == 'yes' && !file(clinvar_db).exists() ) { + exit 1, "Error ClinVar database file path does not exist, '${clinvar_db}' provided." + } + if ( annotate == 'yes' && !file(cadd_snv_db).exists() ) { + exit 1, "Error CADD SNV database file path does not exist, '${cadd_snv_db}' provided." + } + if ( annotate == 'yes' && !file(cadd_indel_db).exists() ) { + exit 1, "Error CADD indel database file path does not exist, '${cadd_indel_db}' provided." + } + if ( annotate == 'yes' && !file(spliceai_snv_db).exists() ) { + exit 1, "Error SpliceAI SNV database file path does not exist, '${spliceai_snv_db}' provided." + } + if ( annotate == 'yes' && !file(spliceai_indel_db).exists() ) { + exit 1, "Error SpliceAI indel database file path does not exist, '${spliceai_indel_db}' provided." + } + if ( annotate == 'yes' && !file(alphamissense_db).exists() ) { + exit 1, "Error AlphaMissense database file path does not exist, '${alphamissense_db}' provided." + } if ( !outdir ) { exit 1, "Error: No output directory provided. Either include in parameter file or pass to --outdir on the command line." } + if ( !deepvariant_container ) { + exit 1, "Error: No DeepVariant container provided. Either include in parameter file or pass to --deepvariant_container on the command line. Set to 'NONE' if not running DeepVariant." + } + if ( deepvariant_container != 'NONE' && snp_indel_caller == 'clair3') { + exit 1, "Error: Pass 'NONE' to 'deepvariant_container' when DeepVariant is NOT selected as the SNP/indel calling software, '${snp_indel_caller}' and '${deepvariant_container}' provided'." + } if ( !file(in_data).exists() ) { exit 1, "Error: In data csv file path does not exist, '${in_data}' provided." } @@ -877,6 +875,9 @@ workflow { if ( !file(tandem_repeat).exists() ) { exit 1, "Error: Tandem repeat bed file path does not exist, '${tandem_repeat}' provided." } + if ( !file(deepvariant_container).exists() ) { + exit 1, "Error: In DeepVariant container file path does not exist, '${deepvariant_container}' provided." 
+ } // build variable ref_name = file(ref).getSimpleName() @@ -948,27 +949,27 @@ workflow { } // workflow - scrape_settings_to_publish = scrape_settings(in_data_tuple, in_data, ref, ref_index, tandem_repeat, snp_indel_caller, sv_caller, outdir) + scrape_settings_to_publish = scrape_settings(in_data_tuple, in_data, ref, ref_index, tandem_repeat, snp_indel_caller, sv_caller, annotate, outdir) publish_settings(scrape_settings_to_publish, outdir, outdir2, ref_name) bam_header = scrape_bam_header(in_data_list) publish_bam_header(bam_header, outdir, outdir2) merged = merge_runs(in_data_tuple) (bam, minimap_to_publish1) = minimap2(merged, ref, ref_index) minimap_to_publish2 = depth(minimap_to_publish1) - publish_minimap2(minimap_to_publish2, outdir, outdir2, ref_name) + publish_depth(minimap_to_publish2, outdir, outdir2, ref_name) if ( snp_indel_caller == 'clair3' ) { - (snp_indel_vcf, clair3_to_publish) = clair3(bam, ref, ref_index) - publish_clair3(clair3_to_publish, outdir, outdir2, ref_name) - (snp_indel_phased_vcf, whatshap_phase_to_publish) = whatshap_phase_clair3(snp_indel_vcf, ref, ref_index) - publish_whatshap_phase_clair3(whatshap_phase_to_publish, outdir, outdir2, ref_name) + (snp_indel_vcf_bam, snp_indel_vcf) = clair3(bam, ref, ref_index) } else if ( snp_indel_caller == 'deepvariant' ) { - (snp_indel_vcf, deepvariant_to_publish) = deepvariant(bam, ref, ref_index) - publish_deepvariant(deepvariant_to_publish, outdir, outdir2, ref_name) - (snp_indel_phased_vcf, whatshap_phase_to_publish) = whatshap_phase_dv(snp_indel_vcf, ref, ref_index) - publish_whatshap_phase_dv(whatshap_phase_to_publish, outdir, outdir2, ref_name) + (snp_indel_vcf_bam, snp_indel_vcf) = deepvariant(bam, ref, ref_index, deepvariant_container) + } + (snp_indel_phased_vcf_bam, whatshap_phase_to_publish, snp_indel_phased_vcf) = whatshap_phase(snp_indel_vcf_bam, ref, ref_index) + publish_whatshap_phase(whatshap_phase_to_publish, outdir, outdir2, ref_name, snp_indel_caller) + if ( annotate == 'yes' ) { + vep_snv_to_publish = vep_snv(snp_indel_phased_vcf, ref, ref_index, vep_db, revel_db, gnomad_db, clinvar_db, cadd_snv_db, cadd_indel_db, spliceai_snv_db, spliceai_indel_db, alphamissense_db) + publish_vep_snv(vep_snv_to_publish, outdir, outdir2, ref_name, snp_indel_caller) } - (haplotagged_bam, whatshap_haplotag_to_publish) = whatshap_haplotag(snp_indel_phased_vcf, ref, ref_index) + (haplotagged_bam, whatshap_haplotag_to_publish) = whatshap_haplotag(snp_indel_phased_vcf_bam, ref, ref_index) publish_whatshap_haplotag(whatshap_haplotag_to_publish, outdir, outdir2, ref_name) if ( sv_caller == 'sniffles' ) { sniffles_to_publish = sniffles(haplotagged_bam, ref, ref_index, tandem_repeat) @@ -986,4 +987,3 @@ workflow { } } -
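For orientation only (the launch command itself is not part of this diff): given the files touched above, a run would plausibly be started along these lines. The config and parameter file paths are assumptions based on the repository layout shown in this patch, not commands taken from it.

```bash
# load nextflow (and its bundled java) from the central NCI module tree
module load nextflow

# launch pipeface with the PBS/module configuration and the filled-in parameter file
nextflow run pipeface.nf \
    -c config/nextflow_pipeface.config \
    -params-file config/parameters_pipeface.json

# the processes above all define stub blocks, so nextflow's -stub-run flag can exercise
# the workflow wiring without submitting real alignment or variant calling jobs
nextflow run pipeface.nf \
    -c config/nextflow_pipeface.config \
    -params-file config/parameters_pipeface.json \
    -stub-run
```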