Skip to content

Commit

Permalink
dealing better with contigs
Browse files Browse the repository at this point in the history
  • Loading branch information
kubranarci committed May 7, 2024
1 parent 20b94a0 commit 09654a1
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 54 deletions.
3 changes: 2 additions & 1 deletion assets/samplesheet_hg38_WGS.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
sample,tumor,tumor_index,control,control_index
SEQC2_LL1,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/tumor01_SEQC2_LL1_merged.mdup.bam,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/tumor01_SEQC2_LL1_merged.mdup.bam.bai,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/control01_SEQC2_LL1_merged.mdup.bam,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/control01_SEQC2_LL1_merged.mdup.bam.bai
SEQC2_LL1,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/tumor01_SEQC2_LL1_merged.mdup.bam,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/tumor01_SEQC2_LL1_merged.mdup.bam.bai,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/control01_SEQC2_LL1_merged.mdup.bam,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/control01_SEQC2_LL1_merged.mdup.bam.bai
SEQC2_LL2,/omics/odcf/project/public_data/seqc2/sequencing/whole_genome_sequencing/view-by-pid/SEQC2_IL2/tumor01/paired/merged-alignment/tumor01_SEQC2_IL2_merged.mdup.bam,/omics/odcf/project/public_data/seqc2/sequencing/whole_genome_sequencing/view-by-pid/SEQC2_IL2/tumor01/paired/merged-alignment/tumor01_SEQC2_IL2_merged.mdup.bam.bai,/omics/odcf/project/public_data/seqc2/sequencing/whole_genome_sequencing/view-by-pid/SEQC2_IL2/control01/paired/merged-alignment/control01_SEQC2_IL2_merged.mdup.bam,/omics/odcf/project/public_data/seqc2/sequencing/whole_genome_sequencing/view-by-pid/SEQC2_IL2/control01/paired/merged-alignment/control01_SEQC2_IL2_merged.mdup.bam.bai
4 changes: 2 additions & 2 deletions conf/dkfz_cluster_hg38.config
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ params {
runSNVVCFFilter = true
skip_multiqc = false
runCompareGermline = true
runcontigs = "NONE" // "ALT_HLA | ALL | NONE"
runcontigs = "ALT_HLA" // "ALT_HLA | ALL | NONE"


// Annovar
Expand All @@ -54,7 +54,7 @@ params {
fasta_fai = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38/sequence/GRCh38_decoy_ebv_alt_hla_phiX.fa.fai'
chrom_sizes = '/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.chrLenOnlyACGT_realChromosomes.tsv'
chr_prefix = "chr"
//contig_file = "assets/GRCh38_decoy_ebv_phiX_alt_hla_chr.contig.bed"
//contig_file = "assets/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.contig.bed"

// Annotation files
k_genome ="${params.data_path}/databases/1000Genome/hg38/ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz"
Expand Down
24 changes: 16 additions & 8 deletions modules/local/get_contigs.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,16 @@ process GET_CONTIGS {
if (params.runcontigs == 'ALT_HLA')
{
"""
touch ALT_contigs.bed
touch HLA_contigs.bed
samtools view -H $meta.tumor_bam | grep -P "_alt\\tLN" | sed -e 's/@SQ\\tSN://' -e 's/\\tLN:/\\t0\\t/' -e 's/\\tAH.*//' | sort -V -k1,1 | cut -f 1 > ALT_contigs.bed
samtools view -H $meta.tumor_bam | grep -P "SN:HLA" | sed -e 's/@SQ\\tSN://' -e 's/\\tLN:/\\t0\\t/' -e 's/\\tAH.*//' | sort -V -k1,1 | cut -f 1 > HLA_contigs.bed
if [ -n "\$(samtools view -H $tumor | grep -P "_alt\\tLN")" ]; then
samtools view -H $tumor | grep -P "_alt\\tLN" | sed -e 's/@SQ\\tSN://' -e 's/\\tLN:/\\t0\\t/' -e 's/\\tAH.*//' | sort -V -k1,1 | cut -f 1 > ALT_contigs.bed
else
touch ALT_contigs.bed
fi
if [-n "\$(samtools view -H $tumor | grep -P "SN:HLA")" ]; then
samtools view -H $tumor | grep -P "SN:HLA" | sed -e 's/@SQ\\tSN://' -e 's/\\tLN:/\\t0\\t/' -e 's/\\tAH.*//' | sort -V -k1,1 | cut -f 1 > HLA_contigs.bed
else
touch HLA_contigs.bed
fi
cat ALT_contigs.bed HLA_contigs.bed > contigs.bed
cat <<-END_VERSIONS > versions.yml
Expand All @@ -37,8 +41,12 @@ process GET_CONTIGS {
}
else {
"""
samtools view -H $meta.tumor_bam | grep -P "SN:" | sed -e 's/@SQ\\tSN://' -e 's/\tLN:/\\t0\\t/' -e 's/\\tAH.*//' | sort -V -k1,1 | cut -f 1 | tail -n +25 > contigs.bed
touch contigs.bed
if [-n "\$(samtools view -H $tumor | grep -P "SN:") "]; then
samtools view -H $tumor | grep -P "SN:" | sed -e 's/@SQ\\tSN://' -e 's/\tLN:/\\t0\\t/' -e 's/\\tAH.*//' | sort -V -k1,1 | cut -f 1 | tail -n +25 > contigs.bed
else
touch contigs.bed
fi
cat <<-END_VERSIONS > versions.yml
"${task.process}":
samtools: \$(echo \$(samtools 2>&1) | sed -e 's/.*Version: //; s/ Usage.*//')
Expand Down
7 changes: 3 additions & 4 deletions modules/nf-core/modules/bcftools/mpileup/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ process BCFTOOLS_MPILEUP {
output:
tuple val(meta), path("*.vcf") , emit: vcf
tuple val(meta), path("*.bcftools_stats.txt"), emit: stats
tuple val(meta), val(interval_name) , emit: intervals
tuple val(meta), val(intervals) , emit: intervals
path "versions.yml" , emit: versions

when:
Expand All @@ -27,17 +27,16 @@ process BCFTOOLS_MPILEUP {
def prefix = task.ext.prefix ?: "${meta.id}"
def args_c = interval_file ? "$args2 -R ${interval_file}" : "$args -r ${intervals}"
def ref_spec = params.fasta.contains("38") ? "$args3 --ploidy GRCh38": "$args3"
interval_name = interval_file ? "contig" : "${intervals}"

"""
bcftools \\
mpileup \\
--fasta-ref $fasta \\
$args_c \\
$tumor \\
| bcftools call --output-type v $ref_spec > ${prefix}.${interval_name}.vcf
| bcftools call --output-type v $ref_spec > ${prefix}.${intervals}.vcf
bcftools stats ${prefix}.${interval_name}.vcf > ${prefix}.${interval_name}.bcftools_stats.txt
bcftools stats ${prefix}.${intervals}.vcf > ${prefix}.${intervals}.bcftools_stats.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
54 changes: 16 additions & 38 deletions subworkflows/local/mpileup_snv_call.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@

params.options = [:]

include { BCFTOOLS_MPILEUP as BCFTOOLS_MPILEUP_1 } from '../../modules/nf-core/modules/bcftools/mpileup/main' addParams( options: params.options )
include { BCFTOOLS_MPILEUP as BCFTOOLS_MPILEUP_2 } from '../../modules/nf-core/modules/bcftools/mpileup/main' addParams( options: params.options )
include { BCFTOOLS_MPILEUP } from '../../modules/nf-core/modules/bcftools/mpileup/main' addParams( options: params.options )
include { MPILEUP_COMPARE } from '../../modules/local/mpileup_compare.nf' addParams( options: params.options )
include { SEQ_CONTEXT_ANNOTATOR } from '../../modules/local/seq_context_annotator.nf' addParams( options: params.options )
include { FILE_CONCATENATOR } from '../../modules/local/file_concatenator.nf' addParams( options: params.options )
Expand All @@ -16,33 +15,39 @@ workflow MPILEUP_SNV_CALL {
sample_ch // channel: [val(meta), tumor,tumor_bai, control, control_bai, tumorname, controlname]
ref // channel: [path(fasta), path(fai)]
intervals // channel: [[chr, region], [chr, region], ...]
contigs // channel: [path(hla)]
contigs // channel: [path(contigs.bed)]

main:
versions = Channel.empty()

// Combine intervals with samples to create 'interval x sample' number of parallel run
// Combine intervals with samples to create 'interval x sample' number of parallel run and with contigs if exist

contigs.map { it -> ["contigs", it] }
.set{contig_ch}

intervals.take(24)
.map{it -> [it[0],[]]}
.set{ch_intervals}
ch_intervals.mix(contig_ch)
.set{intervals_ch}

sample_ch
.combine(intervals_ch)
.set { combined_inputs }

//
// MODULE:BCFTOOLS_MPILEUP
//
// RUN bcftools mpileup and bcftools call to call variants. This process is scattered by chr intervals
combined_inputs = combined_inputs.map {it -> tuple( it[0], it[1], it[2], it[3], it[4], it[5], it[6], it[7], [])}
BCFTOOLS_MPILEUP_1 (
BCFTOOLS_MPILEUP (
combined_inputs,
ref
)
versions = versions.mix(BCFTOOLS_MPILEUP_1.out.versions)
versions = versions.mix(BCFTOOLS_MPILEUP.out.versions)

// filter VCFs if there is no variant
BCFTOOLS_MPILEUP_1.out.vcf
.join(BCFTOOLS_MPILEUP_1.out.intervals)
.join(BCFTOOLS_MPILEUP_1.out.stats)
BCFTOOLS_MPILEUP.out.vcf
.join(BCFTOOLS_MPILEUP.out.intervals)
.join(BCFTOOLS_MPILEUP.out.stats)
.filter{meta, vcf, intervals, stats -> WorkflowCommons.getNumVariantsFromBCFToolsStats(stats) > 0 }
.set{ch_vcf_stats}
ch_vcf_stats
Expand All @@ -57,33 +62,6 @@ workflow MPILEUP_SNV_CALL {
ch_vcf = ch_vcf.mix(ch_vcf_1)
ch_intervals = ch_intervals.mix(ch_intervals_1)

// If reference is hg38, run HLA and ALT contigs as seperate runs
if ((params.fasta.contains("38")) && (params.runcontigs =! "NONE")){
println "Running HLA and ALT contigs as seperate runs"
// Generate input set for HLA and ALT runs
hla_alt_inputs = sample_ch.combine(contigs)
hla_alt_inputs = hla_alt_inputs.map {it -> tuple( it[0], it[1], it[2], it[3], it[4], it[5], it[6], [], it[7])}
BCFTOOLS_MPILEUP_2 (
hla_alt_inputs,
ref
)
BCFTOOLS_MPILEUP_2.out.vcf
.join(BCFTOOLS_MPILEUP_2.out.intervals)
.join(BCFTOOLS_MPILEUP_2.out.stats)
.filter{meta, vcf, intervals, stats -> WorkflowCommons.getNumVariantsFromBCFToolsStats(stats) > 0 }
.set{ch_vcf_stats_hla_alt}

ch_vcf_stats_hla_alt
.map { meta, vcf, intervals, stats -> [meta, vcf]}
.set {ch_vcf_2}
ch_vcf = ch_vcf.mix(ch_vcf_2)

ch_vcf_stats_hla_alt
.map { meta, vcf, intervals, stats -> [meta, intervals]}
.set {ch_intervals_2}
ch_intervals = ch_intervals.mix(ch_intervals_2)
}

//
// MODULE:SEQ_CONTEXT_ANNOTATOR
//
Expand Down
3 changes: 2 additions & 1 deletion workflows/snvcalling.nf
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,8 @@ workflow SNVCALLING {
sample_ch
)
ch_versions = ch_versions.mix(GET_CONTIGS.out.versions)
contigs = GET_CONTIGS.out.contigs
GET_CONTIGS.out.contigs.filter{contig -> WorkflowCommons.getNumLinesInFile(contig) > 0}
.set{contigs}
}

//
Expand Down

0 comments on commit 09654a1

Please sign in to comment.