Skip to content

Commit

Permalink
Merge pull request #83 from bacterial-genomics/dev-fastp
Browse files Browse the repository at this point in the history
Dev fastp
  • Loading branch information
gregorysprenger authored Mar 20, 2024
2 parents b3bcbdb + d1fbf36 commit 52c7a93
Show file tree
Hide file tree
Showing 18 changed files with 305 additions and 81 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`

- [#82](https://github.com/bacterial-genomics/wf-paired-end-illumina-assembly/pull/82) Added CAT and CheckM2 to assess the assembly file formed (@chrisgulvik and @gregorysprenger).
- [#83](https://github.com/bacterial-genomics/wf-paired-end-illumina-assembly/pull/83) Added fastp as an alternative to Trimmomatic (@chrisgulvik and @gregorysprenger).

### `Fixed`

Expand Down
21 changes: 21 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,27 @@ process {
]
}

withName: TRIM_READS_FASTP {
publishDir = [
[
path: { "${params.outdir}/CleanedReads/fastp" },
mode: params.publish_dir_mode,
pattern: "*.fastp.*"
],
[
path: params.qc_filecheck_log_dir,
mode: params.publish_dir_mode,
pattern: "*_File.tsv"
],
[
path: params.process_log_dir,
mode: params.publish_dir_mode,
pattern: ".command.{out,err}",
saveAs: { filename -> "${meta.id}.${task.process}${filename}" }
]
]
}

withName: TRIM_READS_TRIMMOMATIC {
publishDir = [
[
Expand Down
13 changes: 6 additions & 7 deletions conf/params.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,25 +30,24 @@ params {
// Host Removal
host_remove = "both"
update_sra_human_scrubber_db = false
sra_scrubber_db = "https://ftp.ncbi.nlm.nih.gov/sra/dbs/human_filter/human_filter.db.20231218v2"

// Downsampling
depth = 100
genome_size = ""

// Read trimming
trim_reads_tool = "trimmomatic"

// Reference files
phix_reference = "${projectDir}/bin/PhiX_NC_001422.1.fasta"
adapter_reference = "${projectDir}/bin/adapters_Nextera_NEB_TruSeq_NuGEN_ThruPLEX.fas"

// BLAST
// Databases
blast_db = "https://ftp.ncbi.nlm.nih.gov/blast/db/16S_ribosomal_RNA.tar.gz"

// Kraken
kraken1_db = "https://ccb.jhu.edu/software/kraken/dl/minikraken_20171019_8GB.tgz"
kraken2_db = "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20231009.tar.gz"

// CheckM2
checkm2_db = "https://zenodo.org/records/5571251/files/checkm2_database.tar.gz"
sra_scrubber_db = "https://ftp.ncbi.nlm.nih.gov/sra/dbs/human_filter/human_filter.db.20231218v2"

// CAT
cat_db = null
Expand Down Expand Up @@ -126,5 +125,5 @@ params {
*/
// Ignore "Found unexpected parameters" warning
profile_cache_dir = "${projectDir}/assets/.cache"
schema_ignore_params = "filter_blast_bitscore,filter_blast_column,min_filesize_filtered_blastn,min_filesize_blastn_output,min_filesize_blastn_db,min_filesize_extracted_ssu_file,min_filesize_renamed_ssu_file,genbank_search_type,genbank_query_qualifier,genbank_query_feature,genbank_query,min_filesize_annotated_genbank,min_filesize_binary_se_alignment,min_filesize_final_assembly,min_filesize_polished_assembly,min_filesize_binary_pe_alignment,min_filesize_filtered_assembly,filter_contigs_no_sort,filter_contigs_deflines,filter_contigs_keep_low_complexity,filter_contigs_length,filter_contigs_gcskew,filter_contigs_discard_file,filter_contigs_coverage,min_filesize_raw_assembly,min_filesize_non_overlapping_fastq,min_filesize_fastq_adapters_removed,min_filesize_adapters,min_filesize_fastq_phix_removed,min_filesize_phix_genome,min_filesize_fastq_input,workflows,available_workflows,max_retry,bigdata,logpath,qc_filecheck_log_dir,process_log_dir,kraken1_db,kraken2_db,blast_db,polish_corrections,skesa_allow_snps,skesa_min_contig_length,skesa_max_snp_length,skesa_fraction,skesa_steps,skesa_vector_percent,skesa_kmer_length,excel_sheet_name,merge_lanes,sge_high_memory,sge_options,sge_queue_size,sge_queue,sge_penv,singularity_cache,sge_process_time,gtdbtk_pplacer_scratch,gtdbtk_min_perc_aa,gtdbtk_pplacer_cpus,gtdbtk_min_af,depth,genome_size,busco_config,adapter_reference,phix_reference,spades_mode,spades_kmer_sizes,validationSchemaIgnoreParams,validationShowHiddenParams,validation-schema-ignore-params,validation-show-hidden-params,mash_db,min_filesize_sra_human_scrubber_db_file,trimmomatic_keep_both_reads,trimmomatic_palindrome_clip_threshold,trimmomatic_simple_clip_threshold,trimmomatic_required_quality,trimmomatic_trailing_quality,trimmomatic_leading_quality,trimmomatic_min_length,trimmomatic_min_adapter_length,trimmomatic_seed_mismatches,trimmomatic_window_size,trimmomatic_phred,create_excel_outputs,rdp_phylomarker,rdp_output_format,min_filesize_rdp_output,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SPADES,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SKESA,min_filesize_checkm2_report,cat_db,min_filesize_cat_output,download_cat_db"
schema_ignore_params = "filter_blast_bitscore,filter_blast_column,min_filesize_filtered_blastn,min_filesize_blastn_output,min_filesize_blastn_db,min_filesize_extracted_ssu_file,min_filesize_renamed_ssu_file,genbank_search_type,genbank_query_qualifier,genbank_query_feature,genbank_query,min_filesize_annotated_genbank,min_filesize_binary_se_alignment,min_filesize_final_assembly,min_filesize_polished_assembly,min_filesize_binary_pe_alignment,min_filesize_filtered_assembly,filter_contigs_no_sort,filter_contigs_deflines,filter_contigs_keep_low_complexity,filter_contigs_length,filter_contigs_gcskew,filter_contigs_discard_file,filter_contigs_coverage,min_filesize_raw_assembly,min_filesize_non_overlapping_fastq,min_filesize_fastq_adapters_removed,min_filesize_adapters,min_filesize_fastq_phix_removed,min_filesize_phix_genome,min_filesize_fastq_input,workflows,available_workflows,max_retry,bigdata,logpath,qc_filecheck_log_dir,process_log_dir,kraken1_db,kraken2_db,blast_db,polish_corrections,skesa_allow_snps,skesa_min_contig_length,skesa_max_snp_length,skesa_fraction,skesa_steps,skesa_vector_percent,skesa_kmer_length,excel_sheet_name,merge_lanes,sge_high_memory,sge_options,sge_queue_size,sge_queue,sge_penv,singularity_cache,sge_process_time,gtdbtk_pplacer_scratch,gtdbtk_min_perc_aa,gtdbtk_pplacer_cpus,gtdbtk_min_af,depth,genome_size,busco_config,adapter_reference,phix_reference,spades_mode,spades_kmer_sizes,validationSchemaIgnoreParams,validationShowHiddenParams,validation-schema-ignore-params,validation-show-hidden-params,mash_db,min_filesize_sra_human_scrubber_db_file,trimmomatic_keep_both_reads,trimmomatic_palindrome_clip_threshold,trimmomatic_simple_clip_threshold,trimmomatic_required_quality,trimmomatic_trailing_quality,trimmomatic_leading_quality,trimmomatic_min_length,trimmomatic_min_adapter_length,trimmomatic_seed_mismatches,trimmomatic_window_size,trimmomatic_phred,create_excel_outputs,rdp_phylomarker,rdp_output_format,min_filesize_rdp_output,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SPADES,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SKESA,min_filesize_checkm2_report,cat_db,min_filesize_cat_output,download_cat_db,trim_reads_tool"
}
7 changes: 5 additions & 2 deletions conf/workflows.config
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ params {
description: 'Trim, assemble, and annotate paired end illumina reads using SPAdes.'
includes = ['spades']
is_workflow = true
modules = ["convert_samplesheet_python", "infile_handling_unix", "remove_phix_bbduk",
modules = ["convert_samplesheet_python", "infile_handling_unix", "remove_phix_bbduk", "trim_reads_fastp",
"trim_reads_trimmomatic", "overlap_paired_reads_flash", "assemble_contigs_spades",
"filter_contigs_biopython", "polish_assembly_bwa_pilon", "annotate_prokka",
"extract_16S_biopython", "extract_16S_barrnap", "align_16S_blast", "classify_16S_rdp",
Expand All @@ -30,7 +30,7 @@ params {
description: 'Trim, assemble, and annotate paired end illumina reads using SKESA.'
includes = ['skesa']
is_workflow = true
modules = ["convert_samplesheet_python", "infile_handling_unix", "remove_phix_bbduk",
modules = ["convert_samplesheet_python", "infile_handling_unix", "remove_phix_bbduk", "trim_reads_fastp",
"trim_reads_trimmomatic", "overlap_paired_reads_flash", "assemble_contigs_skesa",
"filter_contigs_biopython", "map_contigs_bwa", "annotate_prokka",
"extract_16S_biopython", "extract_16S_barrnap", "align_16S_blast", "classify_16S_rdp",
Expand Down Expand Up @@ -64,6 +64,9 @@ params {
'remove_phix_bbduk' {
path = "modules/local/remove_phix_bbduk"
}
'trim_reads_fastp' {
path = "modules/local/trim_reads_fastp"
}
'trim_reads_trimmomatic' {
path = "modules/local/trim_reads_trimmomatic"
}
Expand Down
38 changes: 34 additions & 4 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,19 @@ PhiX reads are commonly used as a positive control for Illumina sequencing. Duri

### Adapter clipping and quality trimming

Illumina instruments can detect and remove adapter sequences, but sometimes adapters can end up in the FastQ output due to sequencing errors. A default [adapters reference file](../bin/adapters_Nextera_NEB_TruSeq_NuGEN_ThruPLEX.fas) is used within this pipeline. Trimmomatic also performs quality trimming, where broken sister reads are retained for downstream processes.
Illumina instruments can detect and remove adapter sequences, but sometimes adapters can end up in the FastQ output due to sequencing errors.

#### Trimmomatic

A default [adapters reference file](../bin/adapters_Nextera_NEB_TruSeq_NuGEN_ThruPLEX.fas) is used within this pipeline for Trimmomatic. Trimmomatic also performs quality trimming, where broken sister reads are retained for downstream processes.
Please see the [adapter clipping and quality trimming using Trimmomatic documentation](../modules/local/trim_reads_trimmomatic/README.md) for more information.

<details markdown="1">
<summary><strong>QC Steps</strong></summary>

- The adapters reference file that is used with Trimmomatic to remove sequence reads must be accessible and meet a minimum file size `[Default: 10k]`.

- FastQ files after removing adapter sequences are checked to ensure that they meet a minimum file size before continuing to downstream processes `[Default: 25M]`. This is to halt analysis of a sample that is primarily contaminated with artificial Illumina sequences.
- FastQ files after removing adapters and performing quality trimming are checked to ensure that they meet a minimum file size before continuing to downstream processes `[Default: 25M]`. This is to halt analysis of a sample that is primarily contaminated with artificial Illumina sequences.

</details>

Expand All @@ -129,6 +133,29 @@ Please see the [adapter clipping and quality trimming using Trimmomatic document

</details>

#### fastp

fastp is able to clip adapters, perform quality trimming, and retain broken sister reads for downstream processes. fastp is able to automatically detect adapter sequences, but the use of a custom list of adapter sequences can be used.

<details markdown="1">
<summary><strong>QC Steps</strong></summary>

- If used, the adapters reference file that is used to remove sequence reads must be accessible and meet a minimum file size `[Default: 10k]`.

- FastQ files after removing adapters and performing quality trimming are checked to ensure that they meet a minimum file size before continuing to downstream processes `[Default: 25M]`. This is to halt analysis of a sample that is primarily contaminated with artificial Illumina sequences.

</details>

<br />

<details markdown="1">
<summary>Output files</summary>

- `CleanedReads/fastp/`
- `[sample].fastp.[tsv,xlsx]`: Summary of the number of reads discarded and retained from Trimmomatic.

</details>

### Merge overlapping sister reads

Overlapping content between sister reads that are at least 80% similar are collapsed into a singleton read. Please see the [overlapping of paired-end reads documentation](../modules/local/overlap_paired_reads_flash/README.md) for more information.
Expand Down Expand Up @@ -389,13 +416,16 @@ Concatenation of output metrics for all samples.

- `Summaries/`
- `Summary.16S.[tsv,xlsx]`: Summary of the best BLAST alignment for each sample.
- `Summary.RDP.[tsv,xlsx]`: Summary of RDP Classifier on predicted 16S ribosomal RNA genes.
- `Summary.MLST.[tsv,xlsx]`: Summary of the MLST results for all samples.
- `Summary.PhiX.[tsv,xlsx]`: Number of reads discarded and retained for each sample.
- `Summary.Assemblies.[tsv,xlsx]`: Assembly metrics such as N50, cumulative length, longest contig length, and GC composition for each sample.
- `Summary.GenomeCoverage.[tsv,xlsx]`: Summary of the overall genome coverage for each sample.
- `Summary.PhiX_Removal.[tsv,xlsx]`: Number of reads discarded and retained for each sample.
- `Summary.QC_File_Checks.[tsv,xlsx]`: Summary of all QC file checks detailing if a sample passes or fails each process.
- `Summary.GenomeCoverage.[tsv,xlsx]`: Summary of the overall genome coverage for each sample.
- `Summary.CleanedReads-Bases.[tsv,xlsx]`: Summary of the number of cleaned bases for each sample.
- `Summary.Clean_and_Overlapping_Reads.[tsv,xlsx]`: Summary of the merging of overlapping sister reads.
- `Summary.CleanedReads-AlignmentStats.[tsv,xlsx]`: Summary of the genome size and coverages of the paired-end and single-end reads for each sample.
- `Summary-[fastp,trimmomatic].Adapter_and_QC_Trimming.[tsv,xlsx]`: Summary of adapter clipping and quality trimming for each sample.
- `Summary-Report_yyyy-MM-dd_HH-mm-ss.xlsx`: Excel workbook where each file in the Summaries directory is added to a separate worksheet within the workbook.

</details>
Expand Down
4 changes: 2 additions & 2 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ Command executed:
--threads 12 \
--fastq-input \
--gzip-compressed \
test_R1.paired.fq.gz test_R2.paired.fq.gz test.single.fq.gz \
test_R1.paired.fq.gz test_R2.paired.fq.gz test_single.fq.gz \
> test_kraken.output

Command exit status:
Expand All @@ -195,7 +195,7 @@ Command output:
(empty)

Command error:
.command.sh: line 9: 30 Killed kraken --db /kraken_database/ --threads 12 --fastq-input --gzip-compressed test_R1.paired.fq.gz test_R2.paired.fq.gz test.single.fq.gz > test_kraken.output
.command.sh: line 9: 30 Killed kraken --db /kraken_database/ --threads 12 --fastq-input --gzip-compressed test_R1.paired.fq.gz test_R2.paired.fq.gz test_single.fq.gz > test_kraken.output

Work dir:
/home/wf-paired-end-illumina-assembly/work/9d/172ca5881234073e8d76f2a19c88fb
Expand Down
4 changes: 2 additions & 2 deletions modules/local/assemble_contigs_skesa/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ process ASSEMBLE_CONTIGS_SKESA {
if [[ ! -f "!{meta.id}-SKESA_contigs.fasta" ]]; then
skesa \
--reads !{cleaned_fastq_files[0]},!{cleaned_fastq_files[1]} \
--reads !{cleaned_fastq_files[2]} \
--reads "!{meta.id}_R1.paired.fq.gz","!{meta.id}_R2.paired.fq.gz" \
--reads "!{meta.id}_single.fq.gz" \
!{allow_snps} \
--cores !{task.cpus} \
--memory !{memory} \
Expand Down
6 changes: 3 additions & 3 deletions modules/local/assemble_contigs_spades/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ process ASSEMBLE_CONTIGS_SPADES {
RAMSIZE=$(echo !{task.memory} | cut -d ' ' -f 1)
spades.py \
-1 !{cleaned_fastq_files[0]} \
-2 !{cleaned_fastq_files[1]} \
-s !{cleaned_fastq_files[2]} \
-1 "!{meta.id}_R1.paired.fq.gz" \
-2 "!{meta.id}_R2.paired.fq.gz" \
-s "!{meta.id}_single.fq.gz" \
-o SPAdes \
-k !{params.spades_kmer_sizes} \
!{mode} \
Expand Down
6 changes: 3 additions & 3 deletions modules/local/map_contigs_bwa/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ process MAP_CONTIGS_BWA {
-x intractg \
-t !{task.cpus} \
!{uncorrected_contigs} \
!{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} \
"!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" \
| \
samtools sort \
-@ !{task.cpus} \
Expand Down Expand Up @@ -66,15 +66,15 @@ process MAP_CONTIGS_BWA {
fi
# Single read mapping if available for downstream depth of coverage calculations
if [[ !{cleaned_fastq_files[2]} ]]; then
if [[ !{meta.id}_single.fq.gz ]]; then
msg "INFO: Single read mapping"
bwa index "!{meta.id}-!{meta.assembler}.fna"
bwa mem \
-v 2 \
-x intractg \
"!{meta.id}-!{meta.assembler}.fna" \
!{cleaned_fastq_files[2]} \
"!{meta.id}_single.fq.gz" \
-t !{task.cpus} \
| \
samtools sort \
Expand Down
6 changes: 3 additions & 3 deletions modules/local/polish_assembly_bwa_pilon/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ process POLISH_ASSEMBLY_BWA_PILON {
-x intractg \
-t !{task.cpus} \
!{uncorrected_contigs} \
!{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} \
"!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" \
| \
samtools sort \
-l 9 \
Expand Down Expand Up @@ -112,7 +112,7 @@ process POLISH_ASSEMBLY_BWA_PILON {
# Single read mapping if available for downstream depth of coverage
# calculations, not for assembly polishing.
if [[ !{cleaned_fastq_files[2]} ]]; then
if [[ !{meta.id}_single.fq.gz ]]; then
msg "INFO: Single read mapping"
bwa index "!{meta.id}-!{meta.assembler}.fna"
Expand All @@ -121,7 +121,7 @@ process POLISH_ASSEMBLY_BWA_PILON {
-x intractg \
"!{meta.id}-!{meta.assembler}.fna" \
-t !{task.cpus} \
!{cleaned_fastq_files[2]} \
"!{meta.id}_single.fq.gz" \
| \
samtools sort \
-l 9 \
Expand Down
8 changes: 4 additions & 4 deletions modules/local/qa_assembly_quast/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ process QA_ASSEMBLY_QUAST {
# Count nucleotides per read set
echo -n '' > "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv"
for (( i=0; i<3; i+=3 )); do
R1=$(basename "!{cleaned_fastq_files[0]}" _R1.paired.fq.gz)
R2=$(basename "!{cleaned_fastq_files[1]}" _R2.paired.fq.gz)
single=$(basename "!{cleaned_fastq_files[2]}" _single.fq.gz)
R1=$(basename "!{meta.id}_R1.paired.fq.gz" _R1.paired.fq.gz)
R2=$(basename "!{meta.id}_R2.paired.fq.gz" _R2.paired.fq.gz)
single=$(basename "!{meta.id}_single.fq.gz" _single.fq.gz)
# Verify each set of reads groups properly
nr_uniq_str=$(echo -e "${R1}\\n${R2}\\n${single}" | sort -u | wc -l)
Expand All @@ -57,7 +57,7 @@ process QA_ASSEMBLY_QUAST {
exit 1
fi
echo -ne "${R1}\t" >> "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv"
zcat "!{cleaned_fastq_files[0]}" "!{cleaned_fastq_files[1]}" "!{cleaned_fastq_files[2]}" | \
zcat "!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" "!{meta.id}_single.fq.gz" | \
awk 'BEGIN{SUM=0} {if(NR%4==2){SUM+=length($0)}} END{OFMT="%f"; print SUM}' \
>> "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv"
Expand Down
3 changes: 3 additions & 0 deletions modules/local/trim_reads_fastp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# fastp

This process uses [fastp](https://github.com/OpenGene/fastp) to perform some of the read cleaning steps to remove and trim FastQ sequences.
Loading

0 comments on commit 52c7a93

Please sign in to comment.