diff --git a/README.md b/README.md index f4da609..2b89a7f 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ AliNe is a pipeline written in Nextflow that aims to efficiently align reads aga * [Usage and test](#usage) * [Parameters](#parameters) * [Output](#output) + * [Structure](#structure) + * [Statistics](#statistics) * [Contributing](#contributing) ## Foreword @@ -353,6 +355,9 @@ On success you should get a message looking like this: ## Output + +### Structure + Here the description of typical ouput you will get from AliNe: ``` @@ -404,7 +409,87 @@ Here the description of typical ouput you will get from AliNe: └── MultiQC # MultiQC folder that aggregate results across many samples into a single report ├── multiqc_report.html # Report with interactive plots for statistics across many samples. └── multiqc_report_data # Plot and data used by the multiqc_report.html +``` + +### Statistics + +In order to compare several aligner output you should activate the `--fastqc` parameter. AliNe will run the FastQC program for each output file, thereafter all FastQC file will be gathered in an html file using MultiQC. The resulting file called `multiqc_report.html` can be found in /MultiQC ( by default is called `alignment_results` and can be setup using `--outdir` AliNe parameter). + +FastQC collect the following information: + * Sequence Counts + * Sequence Quality + * Per Sequence Quality Scores + * Per Base Sequence Content + * Per Sequence GC Content + * Per Base N Content + * Sequence Length Distribution + * Sequence Duplication Levels + * Overrepresented sequences + * Adapter Content + +#### General Statistics + +Sequence Duplication Levels, Per Sequence GC Content and Sequence Counts are reported at the top of the `multiqc_report.html` file in a table called `General Statistics` as % Dups, %GC a,d M Seqs accordingly (see below). + + + +In order to facilitate the reading of this `General Statistics` you can export the table in tsv using the `Export as CSV...` button and execute the following piece of R code on the downloaded `general_stats_table.tsv` file : + +```R +# install packages +install.packages("dplyr") +install.packages("stringr") +install.packages("tidyr") +# Load necessary libraries +library(dplyr) +library(stringr) +library(tidyr) + +# Read the TSV file +file_path <- "general_stats_table.tsv" +df <- read.delim(file_path, check.names = FALSE) + +# sample name as row name +rownames(df) <- df$Sample + +# remove Sample column and clean up the column names +tmp <- cbind(ID = rownames(df), stack(df[-1])) |> + transform(ind = as.character(ind) |> stringr::str_remove_all("\\.\\d+")) + +# remove na values +tmp <- tmp[!is.na(tmp$values),] +# remove . values +tmp$values <- tmp$values |> stringr::str_remove_all("^\\.$") + +# pivot data +tmp <- tmp |> pivot_wider(id_cols = ID , names_from = ind, values_from = values, + values_fn = \(x) paste(unique(x), collapse = "")) + +# print only 4 first columns +tmp[1:4] +``` + +You will get a table similar to this one: + +``` + ID Dups GC Seqs + + 1 yeast_R1 73.01 43 0.01 + 2 yeast_R1_seqkit_trim 69.21661409043114 44 0.007607999999999999 + 3 yeast_R1_seqkit_trim_STAR_sorted 69.54090258811289 44 0.007689 + 4 yeast_R1_seqkit_trim_bbmap_sorted 69.21661409043114 44 0.007607999999999999 + 5 yeast_R1_seqkit_trim_bowtie2_sorted 69.21661409043114 44 0.007607999999999999 + 6 yeast_R1_seqkit_trim_bwaaln_sorted 69.21661409043114 44 0.007607999999999999 + 7 yeast_R1_seqkit_trim_bwamem_sorted 69.18933123111286 44 0.007611 + 8 yeast_R1_seqkit_trim_bwasw_sorted 69.23279033105622 44 0.007612 + 9 yeast_R1_seqkit_trim_graphmap2_sorted 69.21661409043114 44 0.007607999999999999 +10 yeast_R1_seqkit_trim_hisat2_sorted 69.21661409043114 44 0.007607999999999999 +11 yeast_R1_seqkit_trim_kallisto_sorted 69.21661409043114 44 0.007607999999999999 +12 yeast_R1_seqkit_trim_minimap2_sorted 69.63058976020739 44 0.007715 +13 yeast_R1_seqkit_trim_nucmer.fixed_sorted 64.70588235294117 42 0.00016999999999999999 +14 yeast_R1_seqkit_trim_sublong 53.23343848580442 44 0.007607999999999999 +15 yeast_R1_seqkit_trim_subread 69.21661409043114 44 0.007607999999999999 ``` diff --git a/aline.nf b/aline.nf index 3e56b16..743b36c 100644 --- a/aline.nf +++ b/aline.nf @@ -513,7 +513,7 @@ workflow align { // ------------------- QC ----------------- if(params.fastqc){ fastqc_raw(raw_reads, "fastqc/raw", "raw") - logs.mix(fastqc_raw.out) + logs.concat(fastqc_raw.out).set{logs} // save log } // ------------------- Standardize Score to Be Phred+33 ---------------- @@ -1092,4 +1092,5 @@ In aline.nf Think to convert sam to bam if necessary Think to sort bam output if necessary Add info in README.md +Add tool in multiqc config file (at least for fastqc) */ \ No newline at end of file diff --git a/config/multiqc_conf.yml b/config/multiqc_conf.yml index 5371906..ca837fe 100644 --- a/config/multiqc_conf.yml +++ b/config/multiqc_conf.yml @@ -1,4 +1,4 @@ -title: "Nextflow Alignment pipeline" +title: "AliNe - Nextflow Alignment pipeline (https://github.com/Juke34/AliNe)" run_modules: - fastqc @@ -45,6 +45,10 @@ module_order: name: "FastQC (graphmap2)" path_filters: - "*graphmap2_logs*" + - fastqc: + name: "FastQC (hisat)" + path_filters: + - "*hisat_logs*" - hisat2 - fastqc: name: "FastQC (hisat2)" @@ -73,6 +77,12 @@ module_order: - "*sublong_logs*" - star - fastqc: - name: "FastQC (star)" + name: "FastQC (STAR)" + path_filters: + - "*star_logs*" + - "*STAR_logs*" + - fastqc: + name: "FastQC (STARlong)" path_filters: - - "*star_logs*" \ No newline at end of file + - "*starlong_logs*" + - "*STARlong_logs*" \ No newline at end of file diff --git a/config/softwares.config b/config/softwares.config index d6921e7..47f69aa 100644 --- a/config/softwares.config +++ b/config/softwares.config @@ -33,7 +33,8 @@ process { container = 'quay.io/biocontainers/minimap2:2.26--he4a0461_1' } withName: 'multiqc' { - container = 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' + //container = 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' + container = 'quay.io/biocontainers/multiqc:1.27--pyhdfd78af_0' } withLabel: 'mummer4' { container = 'quay.io/biocontainers/mummer4:4.0.0rc1--pl5321hdbdd923_6' diff --git a/img/multiqc.png b/img/multiqc.png new file mode 100644 index 0000000..b42cc36 Binary files /dev/null and b/img/multiqc.png differ diff --git a/modules/bbmap.nf b/modules/bbmap.nf index 82734f1..ae8f678 100644 --- a/modules/bbmap.nf +++ b/modules/bbmap.nf @@ -83,7 +83,7 @@ process bbmap { } // set fileName - def fileName = fastq[0].baseName.replace('.fastq','') + def fileName = fastq[0].baseName.replace('.fastq','') + "_bbmap" """ ${tool} \\ ref=${genome_index} \\ diff --git a/modules/bowtie2.nf b/modules/bowtie2.nf index 7da986d..22a516d 100644 --- a/modules/bowtie2.nf +++ b/modules/bowtie2.nf @@ -30,7 +30,7 @@ process bowtie2 { output: tuple val(sample), path ("*.sam"), emit: tuple_sample_sam - path "*bowtie2.log", emit: bowtie2_summary + path "*.log", emit: bowtie2_summary script: @@ -48,22 +48,25 @@ process bowtie2 { read_orientation = "--ff" } } + + // catch filename + filename = reads[0].baseName.replace('.fastq','') + "_bowtie2" if (params.read_type == "short_paired"){ """ bowtie2 ${params.bowtie2_options} ${read_orientation}\\ -p ${task.cpus} \\ -x ${genome.baseName} \\ - -S ${reads[0].baseName.replace('.fastq','')}_bowtie2.sam \\ - -1 ${reads[0]} -2 ${reads[1]} 2> ${reads[0].baseName.replace('.fastq','')}_bowtie2.log + -S ${filename}.sam \\ + -1 ${reads[0]} -2 ${reads[1]} 2> ${filename}_sorted.log """ } else { """ bowtie2 ${params.bowtie2_options} ${read_orientation}\\ -p ${task.cpus} \\ -x ${genome.baseName} \\ - -S ${reads.baseName.replace('.fastq','')}_bowtie2.sam \\ - -U ${reads} 2> ${reads.baseName}_bowtie2.log + -S ${filename}.sam \\ + -U ${reads} 2> ${filename}_sorted.log """ } diff --git a/modules/hisat2.nf b/modules/hisat2.nf index b109abd..c007734 100644 --- a/modules/hisat2.nf +++ b/modules/hisat2.nf @@ -34,8 +34,8 @@ process hisat2 { output: tuple val(sample), path ("*.sam"), emit: tuple_sample_sam - path "${sample}_splicesite.txt" - path "*hisat2-summary.txt", emit: hisat2_summary + path "*_splicesite.txt" + path "*_summary.txt", emit: hisat2_summary script: @@ -43,7 +43,7 @@ process hisat2 { index_basename = hisat2_index_files[0].toString() - ~/.\d.ht2l?/ // catch filename - filename = reads[0].baseName.replace('.fastq','') + filename = reads[0].baseName.replace('.fastq','') + "_hisat2" // deal with library type - default is unstranded. def lib_strand="" @@ -82,14 +82,14 @@ process hisat2 { if (params.read_type == "short_paired"){ """ - hisat2 ${lib_strand} ${read_orientation} ${params.hisat2_options} --novel-splicesite-outfile ${sample}_splicesite.txt \\ - --new-summary --summary-file ${sample}.hisat2-summary.txt \\ + hisat2 ${lib_strand} ${read_orientation} ${params.hisat2_options} --novel-splicesite-outfile ${filename}_splicesite.txt \\ + --new-summary --summary-file ${filename}_sorted_summary.txt \\ -p ${task.cpus} -x $index_basename -1 ${reads[0]} -2 ${reads[1]} > ${filename}.sam """ } else { """ - hisat2 ${lib_strand} ${read_orientation} ${params.hisat2_options} --novel-splicesite-outfile ${sample}_splicesite.txt \\ - --new-summary --summary-file ${sample}.hisat2-summary.txt \\ + hisat2 ${lib_strand} ${read_orientation} ${params.hisat2_options} --novel-splicesite-outfile ${filename}_splicesite.txt \\ + --new-summary --summary-file ${filename}_sorted_summary.txt \\ -p ${task.cpus} -x $index_basename -U $reads > ${filename}.sam """ } diff --git a/modules/kallisto.nf b/modules/kallisto.nf index 16f6450..7c763e2 100644 --- a/modules/kallisto.nf +++ b/modules/kallisto.nf @@ -26,7 +26,7 @@ process kallisto_index { process kallisto { label 'kallisto' tag "$sample" - publishDir "${params.outdir}/${outpath}", pattern: "*.bam", mode: 'copy' + publishDir "${params.outdir}/${outpath}", pattern: "${filename}/*.bam", mode: 'copy' input: tuple val(sample), path(reads), val(library), val(read_length) @@ -34,13 +34,13 @@ process kallisto { val outpath output: - tuple val(sample), path ("*.bam"), emit: tuple_sample_bam - path "*kallisto.log", emit: kallisto_summary + tuple val(sample), path ("${filename}/*.bam"), emit: tuple_sample_bam + path "*.log", emit: kallisto_summary script: // catch filename - filename = reads[0].baseName.replace('.fastq','') + filename = reads[0].baseName.replace('.fastq','') + "_kallisto_sorted" // deal with library type def read_orientation="" @@ -61,7 +61,11 @@ process kallisto { -t ${task.cpus} \ --pseudobam \ -i ${kallisto_index} \ - ${reads[0]} ${reads[1]} -o ${filename}.bam 2> ${filename}_kallisto.log + ${reads[0]} ${reads[1]} -o ${filename} 2> ${filename}.log + + mv ${filename}/pseudoalignments.bam ${filename}/${filename}.bam + # in order that the log file contains the name of the output fastq files (MultiQC) + sed -i 's/${reads[0]}/${filename}.fastq.gz/' ${filename}.log """ } else { @@ -83,8 +87,11 @@ process kallisto { --pseudobam \ -i ${kallisto_index} \ --single \ - ${reads} -o ${filename}.bam 2> ${filename}_kallisto.log + ${reads} -o ${filename} 2> ${filename}.log + + mv ${filename}/pseudoalignments.bam ${filename}/${filename}.bam + # in order that the log file contains the name of the output fastq files (MultiQC) + sed -i 's/${reads[0]}/${filename}.fastq.gz/' ${filename}.log """ } - } \ No newline at end of file diff --git a/modules/star.nf b/modules/star.nf index 7190d9c..d4d7254 100644 --- a/modules/star.nf +++ b/modules/star.nf @@ -107,7 +107,7 @@ process star { --readFilesIn pipedRead1 pipedRead2 \\ --runThreadN ${task.cpus} \\ --runMode alignReads \\ - --outFileNamePrefix ${reads[0].baseName.replace('.fastq','')} \\ + --outFileNamePrefix ${reads[0].baseName.replace('.fastq','')}_${star_tool}_sorted \\ --outSAMunmapped Within \\ --outSAMtype BAM SortedByCoordinate """ @@ -121,7 +121,7 @@ process star { --readFilesIn pipedRead \\ --runThreadN ${task.cpus} \\ --runMode alignReads \\ - --outFileNamePrefix ${reads.baseName.replace('.fastq','')} \\ + --outFileNamePrefix ${reads.baseName.replace('.fastq','')}_${star_tool}_sorted \\ --outSAMunmapped Within \\ --outSAMtype BAM SortedByCoordinate """ @@ -172,7 +172,7 @@ process star2pass{ --readFilesIn pipedRead1 pipedRead2 \\ --runThreadN ${task.cpus} \\ --runMode alignReads \\ - --outFileNamePrefix ${reads.baseName.replace('.fastq','')}_2pass \\ + --outFileNamePrefix ${reads.baseName.replace('.fastq','')}_${star_tool}_2pass_sorted \\ --outSAMunmapped Within \\ --outSAMtype BAM SortedByCoordinate \\ --sjdbFileChrStartEnd *SJ.out.tab @@ -195,7 +195,7 @@ process star2pass{ --readFilesIn pipedRead \\ --runThreadN ${task.cpus} \\ --runMode alignReads \\ - --outFileNamePrefix ${reads[0].baseName.replace('.fastq','')}_2pass \\ + --outFileNamePrefix ${reads[0].baseName.replace('.fastq','')}_${star_tool}_2pass_sorted \\ --outSAMunmapped Within \\ --outSAMtype BAM SortedByCoordinate \\ --sjdbFileChrStartEnd *SJ.out.tab diff --git a/modules/subread.nf b/modules/subread.nf index c9b3682..8412b0e 100644 --- a/modules/subread.nf +++ b/modules/subread.nf @@ -53,7 +53,7 @@ process subread { } // remove fastq.gz - def fileName = fastq[0].baseName.replace('.fastq','') + def fileName = fastq[0].baseName.replace('.fastq','') + "_subread" // prepare index name def index_prefix = genome.baseName + "_index" @@ -120,7 +120,7 @@ process sublong { script: // remove fastq.gz - def fileName = fastq[0].baseName.replace('.fastq','') + def fileName = fastq[0].baseName.replace('.fastq','') + "_sublong" // prepare index name def index_prefix = genome.baseName + "_index"