Merge pull request #83 from bacterial-genomics/dev-fastp

Dev fastp
bacterial-genomics · Mar 20, 2024 · 52c7a93 · 52c7a93
2 parents b3bcbdb + d1fbf36
commit 52c7a93
Show file tree

Hide file tree

Showing 18 changed files with 305 additions and 81 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 - [#82](https://github.com/bacterial-genomics/wf-paired-end-illumina-assembly/pull/82) Added CAT and CheckM2 to assess the assembly file formed (@chrisgulvik and @gregorysprenger).
+- [#83](https://github.com/bacterial-genomics/wf-paired-end-illumina-assembly/pull/83) Added fastp as an alternative to Trimmomatic (@chrisgulvik and @gregorysprenger).
 
 ### `Fixed`
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -308,6 +308,27 @@ process {
         ]
     }
 
+    withName: TRIM_READS_FASTP {
+        publishDir = [
+            [
+                path:    { "${params.outdir}/CleanedReads/fastp" },
+                mode:    params.publish_dir_mode,
+                pattern: "*.fastp.*"
+            ],
+            [
+                path:    params.qc_filecheck_log_dir,
+                mode:    params.publish_dir_mode,
+                pattern: "*_File.tsv"
+            ],
+            [
+                path:    params.process_log_dir,
+                mode:    params.publish_dir_mode,
+                pattern: ".command.{out,err}",
+                saveAs:  { filename -> "${meta.id}.${task.process}${filename}" }
+            ]
+        ]
+    }
+
     withName: TRIM_READS_TRIMMOMATIC {
         publishDir = [
             [

diff --git a/conf/params.config b/conf/params.config
@@ -30,25 +30,24 @@ params {
     // Host Removal
     host_remove                  = "both"
     update_sra_human_scrubber_db = false
-    sra_scrubber_db              = "https://ftp.ncbi.nlm.nih.gov/sra/dbs/human_filter/human_filter.db.20231218v2"
 
     // Downsampling
     depth                        = 100
     genome_size                  = ""
 
+    // Read trimming
+    trim_reads_tool              = "trimmomatic"
+
     // Reference files
     phix_reference               = "${projectDir}/bin/PhiX_NC_001422.1.fasta"
     adapter_reference            = "${projectDir}/bin/adapters_Nextera_NEB_TruSeq_NuGEN_ThruPLEX.fas"
 
-    // BLAST
+    // Databases
     blast_db                     = "https://ftp.ncbi.nlm.nih.gov/blast/db/16S_ribosomal_RNA.tar.gz"
-
-    // Kraken
     kraken1_db                   = "https://ccb.jhu.edu/software/kraken/dl/minikraken_20171019_8GB.tgz"
     kraken2_db                   = "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20231009.tar.gz"
-
-    // CheckM2
     checkm2_db                   = "https://zenodo.org/records/5571251/files/checkm2_database.tar.gz"
+    sra_scrubber_db              = "https://ftp.ncbi.nlm.nih.gov/sra/dbs/human_filter/human_filter.db.20231218v2"
 
     // CAT
     cat_db                       = null
@@ -126,5 +125,5 @@ params {
     */
     // Ignore "Found unexpected parameters" warning
     profile_cache_dir            = "${projectDir}/assets/.cache"
-    schema_ignore_params         = "filter_blast_bitscore,filter_blast_column,min_filesize_filtered_blastn,min_filesize_blastn_output,min_filesize_blastn_db,min_filesize_extracted_ssu_file,min_filesize_renamed_ssu_file,genbank_search_type,genbank_query_qualifier,genbank_query_feature,genbank_query,min_filesize_annotated_genbank,min_filesize_binary_se_alignment,min_filesize_final_assembly,min_filesize_polished_assembly,min_filesize_binary_pe_alignment,min_filesize_filtered_assembly,filter_contigs_no_sort,filter_contigs_deflines,filter_contigs_keep_low_complexity,filter_contigs_length,filter_contigs_gcskew,filter_contigs_discard_file,filter_contigs_coverage,min_filesize_raw_assembly,min_filesize_non_overlapping_fastq,min_filesize_fastq_adapters_removed,min_filesize_adapters,min_filesize_fastq_phix_removed,min_filesize_phix_genome,min_filesize_fastq_input,workflows,available_workflows,max_retry,bigdata,logpath,qc_filecheck_log_dir,process_log_dir,kraken1_db,kraken2_db,blast_db,polish_corrections,skesa_allow_snps,skesa_min_contig_length,skesa_max_snp_length,skesa_fraction,skesa_steps,skesa_vector_percent,skesa_kmer_length,excel_sheet_name,merge_lanes,sge_high_memory,sge_options,sge_queue_size,sge_queue,sge_penv,singularity_cache,sge_process_time,gtdbtk_pplacer_scratch,gtdbtk_min_perc_aa,gtdbtk_pplacer_cpus,gtdbtk_min_af,depth,genome_size,busco_config,adapter_reference,phix_reference,spades_mode,spades_kmer_sizes,validationSchemaIgnoreParams,validationShowHiddenParams,validation-schema-ignore-params,validation-show-hidden-params,mash_db,min_filesize_sra_human_scrubber_db_file,trimmomatic_keep_both_reads,trimmomatic_palindrome_clip_threshold,trimmomatic_simple_clip_threshold,trimmomatic_required_quality,trimmomatic_trailing_quality,trimmomatic_leading_quality,trimmomatic_min_length,trimmomatic_min_adapter_length,trimmomatic_seed_mismatches,trimmomatic_window_size,trimmomatic_phred,create_excel_outputs,rdp_phylomarker,rdp_output_format,min_filesize_rdp_output,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SPADES,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SKESA,min_filesize_checkm2_report,cat_db,min_filesize_cat_output,download_cat_db"
+    schema_ignore_params         = "filter_blast_bitscore,filter_blast_column,min_filesize_filtered_blastn,min_filesize_blastn_output,min_filesize_blastn_db,min_filesize_extracted_ssu_file,min_filesize_renamed_ssu_file,genbank_search_type,genbank_query_qualifier,genbank_query_feature,genbank_query,min_filesize_annotated_genbank,min_filesize_binary_se_alignment,min_filesize_final_assembly,min_filesize_polished_assembly,min_filesize_binary_pe_alignment,min_filesize_filtered_assembly,filter_contigs_no_sort,filter_contigs_deflines,filter_contigs_keep_low_complexity,filter_contigs_length,filter_contigs_gcskew,filter_contigs_discard_file,filter_contigs_coverage,min_filesize_raw_assembly,min_filesize_non_overlapping_fastq,min_filesize_fastq_adapters_removed,min_filesize_adapters,min_filesize_fastq_phix_removed,min_filesize_phix_genome,min_filesize_fastq_input,workflows,available_workflows,max_retry,bigdata,logpath,qc_filecheck_log_dir,process_log_dir,kraken1_db,kraken2_db,blast_db,polish_corrections,skesa_allow_snps,skesa_min_contig_length,skesa_max_snp_length,skesa_fraction,skesa_steps,skesa_vector_percent,skesa_kmer_length,excel_sheet_name,merge_lanes,sge_high_memory,sge_options,sge_queue_size,sge_queue,sge_penv,singularity_cache,sge_process_time,gtdbtk_pplacer_scratch,gtdbtk_min_perc_aa,gtdbtk_pplacer_cpus,gtdbtk_min_af,depth,genome_size,busco_config,adapter_reference,phix_reference,spades_mode,spades_kmer_sizes,validationSchemaIgnoreParams,validationShowHiddenParams,validation-schema-ignore-params,validation-show-hidden-params,mash_db,min_filesize_sra_human_scrubber_db_file,trimmomatic_keep_both_reads,trimmomatic_palindrome_clip_threshold,trimmomatic_simple_clip_threshold,trimmomatic_required_quality,trimmomatic_trailing_quality,trimmomatic_leading_quality,trimmomatic_min_length,trimmomatic_min_adapter_length,trimmomatic_seed_mismatches,trimmomatic_window_size,trimmomatic_phred,create_excel_outputs,rdp_phylomarker,rdp_output_format,min_filesize_rdp_output,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SPADES,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SKESA,min_filesize_checkm2_report,cat_db,min_filesize_cat_output,download_cat_db,trim_reads_tool"
 }
diff --git a/conf/workflows.config b/conf/workflows.config
@@ -17,7 +17,7 @@ params {
             description: 'Trim, assemble, and annotate paired end illumina reads using SPAdes.'
             includes    = ['spades']
             is_workflow = true
-            modules     = ["convert_samplesheet_python", "infile_handling_unix", "remove_phix_bbduk",
+            modules     = ["convert_samplesheet_python", "infile_handling_unix", "remove_phix_bbduk", "trim_reads_fastp",
                             "trim_reads_trimmomatic", "overlap_paired_reads_flash", "assemble_contigs_spades",
                             "filter_contigs_biopython", "polish_assembly_bwa_pilon", "annotate_prokka",
                             "extract_16S_biopython", "extract_16S_barrnap", "align_16S_blast", "classify_16S_rdp",
@@ -30,7 +30,7 @@ params {
             description: 'Trim, assemble, and annotate paired end illumina reads using SKESA.'
             includes    = ['skesa']
             is_workflow = true
-            modules     = ["convert_samplesheet_python", "infile_handling_unix", "remove_phix_bbduk",
+            modules     = ["convert_samplesheet_python", "infile_handling_unix", "remove_phix_bbduk", "trim_reads_fastp",
                             "trim_reads_trimmomatic", "overlap_paired_reads_flash", "assemble_contigs_skesa",
                             "filter_contigs_biopython", "map_contigs_bwa", "annotate_prokka",
                             "extract_16S_biopython", "extract_16S_barrnap", "align_16S_blast", "classify_16S_rdp",
@@ -64,6 +64,9 @@ params {
         'remove_phix_bbduk' {
             path        = "modules/local/remove_phix_bbduk"
         }
+        'trim_reads_fastp' {
+            path        = "modules/local/trim_reads_fastp"
+        }
         'trim_reads_trimmomatic' {
             path        = "modules/local/trim_reads_trimmomatic"
         }

diff --git a/docs/output.md b/docs/output.md
@@ -107,15 +107,19 @@ PhiX reads are commonly used as a positive control for Illumina sequencing. Duri
 
 ### Adapter clipping and quality trimming
 
-Illumina instruments can detect and remove adapter sequences, but sometimes adapters can end up in the FastQ output due to sequencing errors. A default [adapters reference file](../bin/adapters_Nextera_NEB_TruSeq_NuGEN_ThruPLEX.fas) is used within this pipeline. Trimmomatic also performs quality trimming, where broken sister reads are retained for downstream processes.
+Illumina instruments can detect and remove adapter sequences, but sometimes adapters can end up in the FastQ output due to sequencing errors.
+
+#### Trimmomatic
+
+A default [adapters reference file](../bin/adapters_Nextera_NEB_TruSeq_NuGEN_ThruPLEX.fas) is used within this pipeline for Trimmomatic. Trimmomatic also performs quality trimming, where broken sister reads are retained for downstream processes.
 Please see the [adapter clipping and quality trimming using Trimmomatic documentation](../modules/local/trim_reads_trimmomatic/README.md) for more information.
 
 <details markdown="1">
 <summary><strong>QC Steps</strong></summary>
 
 - The adapters reference file that is used with Trimmomatic to remove sequence reads must be accessible and meet a minimum file size `[Default: 10k]`.
 
-- FastQ files after removing adapter sequences are checked to ensure that they meet a minimum file size before continuing to downstream processes `[Default: 25M]`. This is to halt analysis of a sample that is primarily contaminated with artificial Illumina sequences.
+- FastQ files after removing adapters and performing quality trimming are checked to ensure that they meet a minimum file size before continuing to downstream processes `[Default: 25M]`. This is to halt analysis of a sample that is primarily contaminated with artificial Illumina sequences.
 
 </details>
 
@@ -129,6 +133,29 @@ Please see the [adapter clipping and quality trimming using Trimmomatic document
 
 </details>
 
+#### fastp
+
+fastp is able to clip adapters, perform quality trimming, and retain broken sister reads for downstream processes. fastp is able to automatically detect adapter sequences, but the use of a custom list of adapter sequences can be used.
+
+<details markdown="1">
+<summary><strong>QC Steps</strong></summary>
+
+- If used, the adapters reference file that is used to remove sequence reads must be accessible and meet a minimum file size `[Default: 10k]`.
+
+- FastQ files after removing adapters and performing quality trimming are checked to ensure that they meet a minimum file size before continuing to downstream processes `[Default: 25M]`. This is to halt analysis of a sample that is primarily contaminated with artificial Illumina sequences.
+
+</details>
+
+<br />
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `CleanedReads/fastp/`
+  - `[sample].fastp.[tsv,xlsx]`: Summary of the number of reads discarded and retained from Trimmomatic.
+
+</details>
+
 ### Merge overlapping sister reads
 
 Overlapping content between sister reads that are at least 80% similar are collapsed into a singleton read. Please see the [overlapping of paired-end reads documentation](../modules/local/overlap_paired_reads_flash/README.md) for more information.
@@ -389,13 +416,16 @@ Concatenation of output metrics for all samples.
 
 - `Summaries/`
   - `Summary.16S.[tsv,xlsx]`: Summary of the best BLAST alignment for each sample.
+  - `Summary.RDP.[tsv,xlsx]`: Summary of RDP Classifier on predicted 16S ribosomal RNA genes.
   - `Summary.MLST.[tsv,xlsx]`: Summary of the MLST results for all samples.
-  - `Summary.PhiX.[tsv,xlsx]`: Number of reads discarded and retained for each sample.
   - `Summary.Assemblies.[tsv,xlsx]`: Assembly metrics such as N50, cumulative length, longest contig length, and GC composition for each sample.
-  - `Summary.GenomeCoverage.[tsv,xlsx]`: Summary of the overall genome coverage for each sample.
+  - `Summary.PhiX_Removal.[tsv,xlsx]`: Number of reads discarded and retained for each sample.
   - `Summary.QC_File_Checks.[tsv,xlsx]`: Summary of all QC file checks detailing if a sample passes or fails each process.
+  - `Summary.GenomeCoverage.[tsv,xlsx]`: Summary of the overall genome coverage for each sample.
   - `Summary.CleanedReads-Bases.[tsv,xlsx]`: Summary of the number of cleaned bases for each sample.
+  - `Summary.Clean_and_Overlapping_Reads.[tsv,xlsx]`: Summary of the merging of overlapping sister reads.
   - `Summary.CleanedReads-AlignmentStats.[tsv,xlsx]`: Summary of the genome size and coverages of the paired-end and single-end reads for each sample.
+  - `Summary-[fastp,trimmomatic].Adapter_and_QC_Trimming.[tsv,xlsx]`:  Summary of adapter clipping and quality trimming for each sample.
   - `Summary-Report_yyyy-MM-dd_HH-mm-ss.xlsx`: Excel workbook where each file in the Summaries directory is added to a separate worksheet within the workbook.
 
 </details>

diff --git a/docs/usage.md b/docs/usage.md
@@ -185,7 +185,7 @@ Command executed:
       --threads 12 \
       --fastq-input \
       --gzip-compressed \
-      test_R1.paired.fq.gz test_R2.paired.fq.gz test.single.fq.gz \
+      test_R1.paired.fq.gz test_R2.paired.fq.gz test_single.fq.gz \
       > test_kraken.output
 
 Command exit status:
@@ -195,7 +195,7 @@ Command output:
     (empty)
 
 Command error:
-    .command.sh: line 9:  30 Killed    kraken --db /kraken_database/ --threads 12 --fastq-input --gzip-compressed test_R1.paired.fq.gz test_R2.paired.fq.gz test.single.fq.gz  > test_kraken.output
+    .command.sh: line 9:  30 Killed    kraken --db /kraken_database/ --threads 12 --fastq-input --gzip-compressed test_R1.paired.fq.gz test_R2.paired.fq.gz test_single.fq.gz  > test_kraken.output
 
 Work dir:
     /home/wf-paired-end-illumina-assembly/work/9d/172ca5881234073e8d76f2a19c88fb

diff --git a/modules/local/assemble_contigs_skesa/main.nf b/modules/local/assemble_contigs_skesa/main.nf
@@ -23,8 +23,8 @@ process ASSEMBLE_CONTIGS_SKESA {
 
     if [[ ! -f "!{meta.id}-SKESA_contigs.fasta" ]]; then
       skesa \
-        --reads !{cleaned_fastq_files[0]},!{cleaned_fastq_files[1]} \
-        --reads !{cleaned_fastq_files[2]} \
+        --reads "!{meta.id}_R1.paired.fq.gz","!{meta.id}_R2.paired.fq.gz" \
+        --reads "!{meta.id}_single.fq.gz" \
         !{allow_snps} \
         --cores !{task.cpus} \
         --memory !{memory} \

diff --git a/modules/local/assemble_contigs_spades/main.nf b/modules/local/assemble_contigs_spades/main.nf
@@ -25,9 +25,9 @@ process ASSEMBLE_CONTIGS_SPADES {
     RAMSIZE=$(echo !{task.memory} | cut -d ' ' -f 1)
 
     spades.py \
-      -1 !{cleaned_fastq_files[0]} \
-      -2 !{cleaned_fastq_files[1]} \
-      -s !{cleaned_fastq_files[2]} \
+      -1 "!{meta.id}_R1.paired.fq.gz" \
+      -2 "!{meta.id}_R2.paired.fq.gz" \
+      -s "!{meta.id}_single.fq.gz" \
       -o SPAdes \
       -k !{params.spades_kmer_sizes} \
       !{mode} \

diff --git a/modules/local/map_contigs_bwa/main.nf b/modules/local/map_contigs_bwa/main.nf
@@ -35,7 +35,7 @@ process MAP_CONTIGS_BWA {
       -x intractg \
       -t !{task.cpus} \
       !{uncorrected_contigs} \
-      !{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} \
+      "!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" \
       | \
       samtools sort \
       -@ !{task.cpus} \
@@ -66,15 +66,15 @@ process MAP_CONTIGS_BWA {
     fi
 
     # Single read mapping if available for downstream depth of coverage calculations
-    if [[ !{cleaned_fastq_files[2]} ]]; then
+    if [[ !{meta.id}_single.fq.gz ]]; then
       msg "INFO: Single read mapping"
       bwa index "!{meta.id}-!{meta.assembler}.fna"
 
       bwa mem \
         -v 2 \
         -x intractg \
         "!{meta.id}-!{meta.assembler}.fna" \
-        !{cleaned_fastq_files[2]} \
+        "!{meta.id}_single.fq.gz" \
         -t !{task.cpus} \
         | \
         samtools sort \

diff --git a/modules/local/polish_assembly_bwa_pilon/main.nf b/modules/local/polish_assembly_bwa_pilon/main.nf
@@ -48,7 +48,7 @@ process POLISH_ASSEMBLY_BWA_PILON {
         -x intractg \
         -t !{task.cpus} \
         !{uncorrected_contigs} \
-        !{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} \
+        "!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" \
         | \
         samtools sort \
         -l 9 \
@@ -112,7 +112,7 @@ process POLISH_ASSEMBLY_BWA_PILON {
 
     # Single read mapping if available for downstream depth of coverage
     #  calculations, not for assembly polishing.
-    if [[ !{cleaned_fastq_files[2]} ]]; then
+    if [[ !{meta.id}_single.fq.gz ]]; then
       msg "INFO: Single read mapping"
       bwa index "!{meta.id}-!{meta.assembler}.fna"
 
@@ -121,7 +121,7 @@ process POLISH_ASSEMBLY_BWA_PILON {
         -x intractg \
         "!{meta.id}-!{meta.assembler}.fna" \
         -t !{task.cpus} \
-        !{cleaned_fastq_files[2]} \
+        "!{meta.id}_single.fq.gz" \
         | \
         samtools sort \
         -l 9 \

diff --git a/modules/local/qa_assembly_quast/main.nf b/modules/local/qa_assembly_quast/main.nf
@@ -46,9 +46,9 @@ process QA_ASSEMBLY_QUAST {
     # Count nucleotides per read set
     echo -n '' > "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv"
     for (( i=0; i<3; i+=3 )); do
-      R1=$(basename "!{cleaned_fastq_files[0]}" _R1.paired.fq.gz)
-      R2=$(basename "!{cleaned_fastq_files[1]}" _R2.paired.fq.gz)
-      single=$(basename "!{cleaned_fastq_files[2]}" _single.fq.gz)
+      R1=$(basename "!{meta.id}_R1.paired.fq.gz" _R1.paired.fq.gz)
+      R2=$(basename "!{meta.id}_R2.paired.fq.gz" _R2.paired.fq.gz)
+      single=$(basename "!{meta.id}_single.fq.gz" _single.fq.gz)
 
       # Verify each set of reads groups properly
       nr_uniq_str=$(echo -e "${R1}\\n${R2}\\n${single}" | sort -u | wc -l)
@@ -57,7 +57,7 @@ process QA_ASSEMBLY_QUAST {
         exit 1
       fi
       echo -ne "${R1}\t" >> "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv"
-      zcat "!{cleaned_fastq_files[0]}" "!{cleaned_fastq_files[1]}" "!{cleaned_fastq_files[2]}" | \
+      zcat "!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" "!{meta.id}_single.fq.gz" | \
         awk 'BEGIN{SUM=0} {if(NR%4==2){SUM+=length($0)}} END{OFMT="%f"; print SUM}' \
           >> "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv"
 

diff --git a/modules/local/trim_reads_fastp/README.md b/modules/local/trim_reads_fastp/README.md
@@ -0,0 +1,3 @@
+# fastp
+
+This process uses [fastp](https://github.com/OpenGene/fastp) to perform some of the read cleaning steps to remove and trim FastQ sequences.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# fastp

		This process uses [fastp](https://github.com/OpenGene/fastp) to perform some of the read cleaning steps to remove and trim FastQ sequences.