From 43a48d074cd531260fbcfe93affe35a2c959d04a Mon Sep 17 00:00:00 2001
From: Chris Gulvik
Date: Mon, 7 Oct 2024 12:21:22 -0400
Subject: [PATCH] changed date format for output files; added additional bash variable quoting

---
 _run_assembly.uge-nextflow | 109 +++++++++++++++++++------------------
 1 file changed, 56 insertions(+), 53 deletions(-)

diff --git a/_run_assembly.uge-nextflow b/_run_assembly.uge-nextflow
index 671782e6..eacc0935 100755
--- a/_run_assembly.uge-nextflow
+++ b/_run_assembly.uge-nextflow
@@ -4,49 +4,52 @@ SCRIPT_NAME="$(basename ${0#_} .uge-nextflow)"
 
 # Set profile
 # Get node number - <=230 = biolinux, >=231 = rosalind
-NODE_NUM=$(echo ${HOSTNAME%%.*} | sed 's/node//1')
+NODE_NUM="$(echo "${HOSTNAME%%.*}" | sed 's/node//1')"
 if [[ ${NODE_NUM} -ge 231 ]]; then
 HPC='rosalind_hpc'
 else
 HPC='aspen_hpc'
 fi
 
+time_stamp="$(date '+%Y-%b-%d_%a_%H-%M-%S')"
+
 module load nextflow
 
 nextflow \
- -log ${OUT}/pipeline_info/nextflow_log.${SCRIPT_NAME}.txt \
+ -log "${OUT}/pipeline_info/nextflow_log.${SCRIPT_NAME}.txt" \
 run \
- ${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/main.nf \
- -profile ${HPC} \
- --input ${IN} \
- --outdir ${OUT} \
+ "${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/main.nf" \
+ -profile "${HPC}" \
+ --input "${IN}" \
+ --outdir "${OUT}" \
 -ansi-log false \
- -N ${USER}@cdc.gov \
- -w ${OUT}/.work \
- --blast_db ${LAB_HOME}/.databases/ncbi \
- --checkm2_db ${LAB_HOME}/.databases/checkm2 \
- --kraken1_db ${LAB_HOME}/.databases/kraken1-db-v1.0.0 \
- --kraken2_db ${LAB_HOME}/.databases/kraken2 \
+ -N "${USER}@cdc.gov" \
+ -w "${OUT}/.work" \
+ -with-dag "${OUT}/pipeline_info/dag.${time_stamp}.html" \
+ --blast_db "${LAB_HOME}/.databases/ncbi" \
+ --checkm2_db "${LAB_HOME}/.databases/checkm2" \
+ --kraken1_db "${LAB_HOME}/.databases/kraken1-db-v1.0.0" \
+ --kraken2_db "${LAB_HOME}/.databases/kraken2" \
+ --sra_scrubber_db "${LAB_HOME}/.databases/sra-human-scrubber/data/human_filter.db" \
+ --subsample_tool seqkit \
 --create_excel_outputs \
 -resume
 
 # Check for errors and add to errors.tsv
 # Get nextflow run name
-run_name=$(grep "Launching" ${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME} | cut -d '[' -f 2 | cut -d ']' -f 1)
-
-time_stamp=$(date '+%Y-%b-%d %a %H:%M:%S')
+run_name=$(grep "Launching" "${OUT}/pipeline_info/"ASM_*.o"${SCRIPT_NAME}" | cut -d '[' -f 2 | cut -d ']' -f 1)
 
 # Read each line from nextflow log, find info, and add to errors.tsv
 while read -r line; do
 # If process is already running, clean up error
 if [[ "${line}" =~ ^Unable[[:space:]]to[[:space:]]acquire[[:space:]]lock.* ]]; then
 error="You are trying to resume the execution of an already running pipeline."
- ASM_OUT=$(realpath ${OUT}/pipeline_info/ASM_*.o*)
- echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv
+ ASM_OUT="$(realpath "${OUT}"/pipeline_info/ASM_*.o*)"
+ echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> "${OUT}/pipeline_info/errors.tsv"
 else
 # Workflow ran some processes
- sample_name=$(grep "nf-" ${line}/.command.run | cut -d '(' -f 2 | cut -d ')' -f 1)
- process=$(grep "NEXTFLOW TASK" ${line}/.command.run | awk -F ':' '{print $NF}' | cut -d ' ' -f 1)
- error=$(tail -n 1 ${line}/.command.err | sed -e 's/\[[^][]*\] //g')
+ sample_name=$(grep "nf-" "${line}/.command.run" | cut -d '(' -f 2 | cut -d ')' -f 1)
+ process=$(grep 'NEXTFLOW TASK' "${line}/.command.run" | awk -F ':' '{print $NF}' | cut -d ' ' -f 1)
+ error=$(tail -n 1 "${line}/.command.err" | sed -e 's/\[[^][]*\] //g')
 
 # Kraken 2 places "Loading database information... done." in error log
 if [[ ${process} =~ .*READ_CLASSIFY_KRAKEN_TWO$ ]] \
@@ -57,13 +60,13 @@ while read -r line; do
 # BBDuk java errors
 if [[ ${process} =~ .*REMOVE_PHIX_BBDUK$ ]] \
 && [[ "${error}" =~ .*at.* ]]; then
- error=$(grep -A1 "java.lang" ${line}/.command.err | head -n 2 | tail -n 1)
- if [[ ! ${error} ]]; then
+ error=$(grep -A1 'java.lang' "${line}/.command.err" | head -n 2 | tail -n 1)
+ if [[ ! "${error}" ]]; then
 continue
 elif [[ ${error} =~ ^Mismatch.* ]]; then
- error=${error}
+ error="${error}"
 else
- error=$(grep "java.lang" ${line}/.command.err | awk -F ': ' 'END {print $2}')
+ error=$(grep 'java.lang' "${line}/.command.err" | awk -F ': ' 'END {print $2}')
 fi
 elif [[ ${process} =~ .*REMOVE_PHIX_BBDUK$ ]] \
 && [[ "${error}" =~ "Input is being processed as unpaired" ]]; then
@@ -77,74 +80,74 @@ while read -r line; do
 # Check if error is from file checks
 if [[ ${error} =~ .+Check[[:space:]]failed$ ]]; then
- get_previous_process_workdir=$(dirname $(grep "ln -s" ${line}/.command.run | grep "work" | awk 'END {print $(NF-1)}' ))
- process=$(grep "nf-" ${get_previous_process_workdir}/.command.run | awk -F 'nf-' '{print $2}' | sed -e 's/_(.*//')
+ get_previous_process_workdir=$(dirname "$(grep "ln -s" "${line}/.command.run" | grep "work" | awk 'END {print $(NF-1)}')")
+ process=$(grep "nf-" "${get_previous_process_workdir}/.command.run" | awk -F 'nf-' '{print $2}' | sed -e 's/_(.*//')
 line="${get_previous_process_workdir}"
 fi
 
 # If process for sample retried and succeeded, ignore
 if [[ $(find "${OUT}/pipeline_info/process_logs/" -type f -name "${sample_name}.*${process}*.command.out") ]] \
- && [[ $(cat ${line}/.exitcode) = @(0|71|104|134|137|139|140|143|245|250|255) ]]; then
+ && [[ $(cat "${line}/.exitcode") = @(0|71|104|134|137|139|140|143|245|250|255) ]]; then
 continue
 else
- echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv
+ echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> "${OUT}/pipeline_info/errors.tsv"
 fi
 fi
-done < <(nextflow log ${run_name} -filter 'status == "FAILED"')
+done < <(nextflow log "${run_name}" -filter 'status == "FAILED"')
 
 # Look for errors from process EXTRACT_16S_BIOPYTHON
-biopython_rna_errors=( $(find ${OUT}/.work -type f -name ".command.err" -exec grep -l "ERROR: \['16S ribosomal RNA'\]" '{}' \;) )
-if [[ $biopython_rna_errors ]]; then
- for line in ${biopython_rna_errors[@]}; do
- work_dir=$(dirname ${line})
- error=$(tail -n 1 ${line} | sed -e 's/[][]//g' | awk -F '/' '{print $1 $NF}')
- sample_name=$(grep "nf-" ${work_dir}/.command.run | cut -d '(' -f 2 | cut -d ')' -f 1)
- process=$(grep "NEXTFLOW TASK" ${work_dir}/.command.run | awk -F ':' '{print $NF}' | cut -d ' ' -f 1)
+biopython_rna_errors=( $(find "${OUT}/.work" -type f -name ".command.err" -exec grep -l "ERROR: \['16S ribosomal RNA'\]" '{}' \;) )
+if [[ ${#biopython_rna_errors[@]} -gt 0 ]]; then
+ for line in "${biopython_rna_errors[@]}"; do
+ work_dir=$(dirname "${line}")
+ error=$(tail -n 1 "${line}" | sed -e 's/[][]//g' | awk -F '/' '{print $1 $NF}')
+ sample_name=$(grep "nf-" "${work_dir}/.command.run" | cut -d '(' -f 2 | cut -d ')' -f 1)
+ process=$(grep "NEXTFLOW TASK" "${work_dir}/.command.run" | awk -F ':' '{print $NF}' | cut -d ' ' -f 1)
 
 # Append to errors.tsv
- echo -e "${sample_name}\t${process}\t${error}\t${work_dir}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv
+ echo -e "${sample_name}\t${process}\t${error}\t${work_dir}\t${time_stamp}\t${run_name}" >> "${OUT}/pipeline_info/errors.tsv"
 done
 fi
 
 # Parse HPC stdout file for QC check failures
 QC_FAILURES=()
 while read -r line; do
- QC_FAILURES+=("$line")
-done < <(awk '/QC check failed/ {print $(NF-3), "("$NF")"}' ${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME})
+ QC_FAILURES+=("${line}")
+done < <(awk '/QC check failed/ {print $(NF-3), "("$NF")"}' "${OUT}/pipeline_info/"ASM_*.o"${SCRIPT_NAME}")
 
-if [[ $QC_FAILURES ]]; then
+if [[ ${#QC_FAILURES[@]} -gt 0 ]]; then
 # Loop over each QC failure
 for f in "${QC_FAILURES[@]}"; do
 # Get work directory
- short_wd=$(grep "$f" ${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME} | awk -F '[][]' '{print $2}')
- wd_path=$(realpath ${OUT}/.work/${short_wd}*)
+ short_wd=$(grep "${f}" "${OUT}/pipeline_info/"ASM_*.o"${SCRIPT_NAME}" | awk -F '[][]' '{print $2}')
+ wd_path=$(realpath "${OUT}/.work/${short_wd}"*)
 
 # Get first error
- error=$(grep "ERROR" ${wd_path}/.command.err | head -n 1 | sed -e 's/\[[^][]*\] //g')
+ error=$(grep "ERROR" "${wd_path}/.command.err" | head -n 1 | sed -e 's/\[[^][]*\] //g')
 
- process=$(grep "NEXTFLOW TASK" ${wd_path}/.command.run | awk -F ':' '{print $NF}' | cut -d ' ' -f 1)
- sample_name=$(echo "$f" | awk -F "[()]" '{print $2}')
+ process=$(grep "NEXTFLOW TASK" "${wd_path}/.command.run" | awk -F ':' '{print $NF}' | cut -d ' ' -f 1)
+ sample_name=$(echo "${f}" | awk -F "[()]" '{print $2}')
 
 # Append to errors.tsv
- echo -e "${sample_name}\t${process}\t${error}\t${wd_path}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv
+ echo -e "${sample_name}\t${process}\t${error}\t${wd_path}\t${time_stamp}\t${run_name}" >> "${OUT}/pipeline_info/errors.tsv"
 done
 fi
 
-# If errors.tsv found..
+# If errors.tsv found ...
 if [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then
 # Add column headers
- sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' ${OUT}/pipeline_info/errors.tsv
+ sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' "${OUT}/pipeline_info/errors.tsv"
 
 # Remove duplicate lines and lines that have an empty first column
- awk -F $'\t' '!_[$1,$2,$3,$6]++' ${OUT}/pipeline_info/errors.tsv \
+ awk -F $'\t' '!_[$1,$2,$3,$6]++' "${OUT}/pipeline_info/errors.tsv" \
 | awk -F $'\t' '$1{print $0}' \
- > ${OUT}/pipeline_info/errors_new.tsv
+ > "${OUT}/pipeline_info/errors_new.tsv"
 
 # Delete original errors.tsv and rename errors_new.tsv
- rm ${OUT}/pipeline_info/errors.tsv
+ rm "${OUT}/pipeline_info/errors.tsv"
 
- mv ${OUT}/pipeline_info/errors_new.tsv \
- ${OUT}/pipeline_info/errors.tsv
+ mv "${OUT}/pipeline_info/errors_new.tsv" \
+ "${OUT}/pipeline_info/errors.tsv"
 fi
 
 # Count lines in Summary.GenomeCoverage.tsv
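
Note (illustration only, not part of the patch): two quoting rules this change leans on
are easy to get backwards, so here is a minimal bash sketch using hypothetical paths.
A glob must sit outside double quotes to expand, while the variables around it stay
quoted; and the new timestamp format is filename-safe because it replaces spaces and
colons with underscores and hyphens.

    # Hypothetical layout mimicking ${OUT}/pipeline_info from the script above
    OUT="$(mktemp -d)" SCRIPT_NAME="assembly"
    mkdir -p "${OUT}/pipeline_info"
    touch "${OUT}/pipeline_info/ASM_01.o${SCRIPT_NAME}"

    # A fully quoted glob is matched literally and finds nothing:
    ls "${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME}"    # error: no such file
    # Quote the variables but leave the glob bare, as in the corrected hunks:
    ls "${OUT}/pipeline_info/"ASM_*.o"${SCRIPT_NAME}"  # lists ASM_01.oassembly

    # The patched timestamp can be embedded in filenames, e.g. 2024-Oct-07_Mon_12-21-22:
    time_stamp="$(date '+%Y-%b-%d_%a_%H-%M-%S')"
    echo "dag.${time_stamp}.html"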