From f72c4ecd2300482c0371be5d1b919c5ad8b6829d Mon Sep 17 00:00:00 2001 From: Zhuoxuan Zhang Date: Tue, 12 Nov 2024 14:06:19 -0500 Subject: [PATCH] Refactor bio scripts and input setup - Renamed `bio/inputs.sh` to `bio/deps.sh` and removed unnecessary code. - Updated `bio/run.sh` to use the correct input file name. - Removed unused code from `bio/input.sh`. - Cleaned up `bio/verify.sh` and added comments. Closes #27 --- .github/workflows/tests.yml | 2 +- bio/{inputs.sh => deps.sh} | 34 +------------------------------ bio/hashes/CHS_HG00614_1.hash | 1 + bio/hashes/CHS_HG00614_10.hash | 1 + bio/hashes/CHS_HG00614_11.hash | 1 + bio/hashes/CHS_HG00614_17.hash | 1 + bio/hashes/CHS_HG00614_19.hash | 1 + bio/hashes/CHS_HG00614_2.hash | 1 + bio/hashes/CHS_HG00614_4.hash | 1 + bio/hashes/CHS_HG00614_7.hash | 1 + bio/hashes/CHS_HG00614_X.hash | 1 + bio/hashes/CHS_HG00614_Y.hash | 1 + bio/hashes/HG00614_corrected.hash | 1 + bio/hashes/HG01942_corrected.hash | 1 + bio/hashes/PEL_HG01942_1.hash | 1 + bio/hashes/PEL_HG01942_10.hash | 1 + bio/hashes/PEL_HG01942_11.hash | 1 + bio/hashes/PEL_HG01942_17.hash | 1 + bio/hashes/PEL_HG01942_19.hash | 1 + bio/hashes/PEL_HG01942_2.hash | 1 + bio/hashes/PEL_HG01942_4.hash | 1 + bio/hashes/PEL_HG01942_7.hash | 1 + bio/hashes/PEL_HG01942_X.hash | 1 + bio/hashes/PEL_HG01942_Y.hash | 1 + bio/input.sh | 31 ++++++++++++++++++++++++++++ bio/input.txt | 1 - bio/{bio4.sh => run.sh} | 3 +-- bio/verify.sh | 13 +++++++----- 28 files changed, 64 insertions(+), 42 deletions(-) rename bio/{inputs.sh => deps.sh} (71%) create mode 100644 bio/hashes/CHS_HG00614_1.hash create mode 100644 bio/hashes/CHS_HG00614_10.hash create mode 100644 bio/hashes/CHS_HG00614_11.hash create mode 100644 bio/hashes/CHS_HG00614_17.hash create mode 100644 bio/hashes/CHS_HG00614_19.hash create mode 100644 bio/hashes/CHS_HG00614_2.hash create mode 100644 bio/hashes/CHS_HG00614_4.hash create mode 100644 bio/hashes/CHS_HG00614_7.hash create mode 100644 bio/hashes/CHS_HG00614_X.hash create mode 100644 bio/hashes/CHS_HG00614_Y.hash create mode 100644 bio/hashes/HG00614_corrected.hash create mode 100644 bio/hashes/HG01942_corrected.hash create mode 100644 bio/hashes/PEL_HG01942_1.hash create mode 100644 bio/hashes/PEL_HG01942_10.hash create mode 100644 bio/hashes/PEL_HG01942_11.hash create mode 100644 bio/hashes/PEL_HG01942_17.hash create mode 100644 bio/hashes/PEL_HG01942_19.hash create mode 100644 bio/hashes/PEL_HG01942_2.hash create mode 100644 bio/hashes/PEL_HG01942_4.hash create mode 100644 bio/hashes/PEL_HG01942_7.hash create mode 100644 bio/hashes/PEL_HG01942_X.hash create mode 100644 bio/hashes/PEL_HG01942_Y.hash create mode 100755 bio/input.sh rename bio/{bio4.sh => run.sh} (96%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 424077fbe..4dbf4f51e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, media-conv, sklearn, covid-mts, riker, oneliners, web-index] + benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, media-conv, sklearn, covid-mts, riker, oneliners, web-index, bio] steps: - name: Checkout code diff --git a/bio/inputs.sh b/bio/deps.sh similarity index 71% rename from bio/inputs.sh rename to bio/deps.sh index 8f6db0a3e..a5e29dd6a 100755 --- a/bio/inputs.sh +++ b/bio/deps.sh @@ -1,22 +1,3 @@ -# red color -RED='\033[0;31m' -# reset the color -NC='\033[0m' - -IN=${BIO4:-$PASH_TOP/benchmarks/bio} -IN_NAME=${IN_N:-input_all.txt} -if [[ $1 == "-c" ]]; then - rm -rf *.bam - rm -rf *.sam - rm -rf ../output - exit -fi - -PW=${PASH_TOP}/benchmarks/bio/input -echo $PW -mkdir -p $PW -mkdir -p ${PASH_TOP}/benchmarks/bio/output - # install dependencies required_version="1.7" @@ -62,17 +43,4 @@ else echo "Failed to install the correct version of Samtools." exit 1 fi -fi - -cat ${IN}/${IN_NAME} |while read s_line; - do - - sample=$(echo $s_line |cut -d " " -f 2); - if [[ ! -f $sample ]]; then - pop=$(echo $s_line |cut -f 1 -d " "); - link=$(echo $s_line |cut -f 3 -d " "); - wget -O "$PW/$sample".bam "$link"; ##this part can be adjusted maybe - # TODO: stop after one download for testing - exit 0 - fi -done; +fi \ No newline at end of file diff --git a/bio/hashes/CHS_HG00614_1.hash b/bio/hashes/CHS_HG00614_1.hash new file mode 100644 index 000000000..c0ad467f3 --- /dev/null +++ b/bio/hashes/CHS_HG00614_1.hash @@ -0,0 +1 @@ +527eb9f7611542301ec372574053e22f0437ee03a79d3e1bf6617b4b0bd18008 diff --git a/bio/hashes/CHS_HG00614_10.hash b/bio/hashes/CHS_HG00614_10.hash new file mode 100644 index 000000000..70a90575b --- /dev/null +++ b/bio/hashes/CHS_HG00614_10.hash @@ -0,0 +1 @@ +a7943043dcb526544ea0ec3dbe0bb2482e85d8997c603b28ec95d3e399bd7874 diff --git a/bio/hashes/CHS_HG00614_11.hash b/bio/hashes/CHS_HG00614_11.hash new file mode 100644 index 000000000..33372cbe0 --- /dev/null +++ b/bio/hashes/CHS_HG00614_11.hash @@ -0,0 +1 @@ +9fa8cc4dcaeb984e8c5acdf3f32e1859938f241c3949afbfb81c87cba4017215 diff --git a/bio/hashes/CHS_HG00614_17.hash b/bio/hashes/CHS_HG00614_17.hash new file mode 100644 index 000000000..2ff1e63f9 --- /dev/null +++ b/bio/hashes/CHS_HG00614_17.hash @@ -0,0 +1 @@ +8f155d4f0f892416c01ff79e9fbccefd255cbcac19e9df6c0df3768cb1588923 diff --git a/bio/hashes/CHS_HG00614_19.hash b/bio/hashes/CHS_HG00614_19.hash new file mode 100644 index 000000000..29d9bd467 --- /dev/null +++ b/bio/hashes/CHS_HG00614_19.hash @@ -0,0 +1 @@ +4322e91cde058dadffd216cb19cbb07696a828def8db10f23cddee49c12f1c04 diff --git a/bio/hashes/CHS_HG00614_2.hash b/bio/hashes/CHS_HG00614_2.hash new file mode 100644 index 000000000..2ecad9c98 --- /dev/null +++ b/bio/hashes/CHS_HG00614_2.hash @@ -0,0 +1 @@ +bd1f91aca35c988469d8d9ebb7fb131349b3145a1b25950b18d3b7673a455f9a diff --git a/bio/hashes/CHS_HG00614_4.hash b/bio/hashes/CHS_HG00614_4.hash new file mode 100644 index 000000000..1980fb34e --- /dev/null +++ b/bio/hashes/CHS_HG00614_4.hash @@ -0,0 +1 @@ +b7e8d5f93d15a4f10556277c2841ee588782f485f09c33dcad2a4504e4433d65 diff --git a/bio/hashes/CHS_HG00614_7.hash b/bio/hashes/CHS_HG00614_7.hash new file mode 100644 index 000000000..e75d6b776 --- /dev/null +++ b/bio/hashes/CHS_HG00614_7.hash @@ -0,0 +1 @@ +d0c05e98f971cf18a7f0fa61a45ee4eb5c42b9c8ddf5b7f7d8b977e4232c463c diff --git a/bio/hashes/CHS_HG00614_X.hash b/bio/hashes/CHS_HG00614_X.hash new file mode 100644 index 000000000..0623bb98c --- /dev/null +++ b/bio/hashes/CHS_HG00614_X.hash @@ -0,0 +1 @@ +10f504af43506138056d60c3947de00bb7fc3b8b872c1eedc63d66a4d5586ead diff --git a/bio/hashes/CHS_HG00614_Y.hash b/bio/hashes/CHS_HG00614_Y.hash new file mode 100644 index 000000000..80e372eeb --- /dev/null +++ b/bio/hashes/CHS_HG00614_Y.hash @@ -0,0 +1 @@ +8cff7e9ec2c537298d6de6c0a1749d003c895549e32933dfd6eec94d376e0041 diff --git a/bio/hashes/HG00614_corrected.hash b/bio/hashes/HG00614_corrected.hash new file mode 100644 index 000000000..73b8a0699 --- /dev/null +++ b/bio/hashes/HG00614_corrected.hash @@ -0,0 +1 @@ +5b87bb001a505d9e0fe22b60d4c9303b90d4af001bbdc5e2038a0dcf7245dc76 diff --git a/bio/hashes/HG01942_corrected.hash b/bio/hashes/HG01942_corrected.hash new file mode 100644 index 000000000..c30680402 --- /dev/null +++ b/bio/hashes/HG01942_corrected.hash @@ -0,0 +1 @@ +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 diff --git a/bio/hashes/PEL_HG01942_1.hash b/bio/hashes/PEL_HG01942_1.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_1.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/hashes/PEL_HG01942_10.hash b/bio/hashes/PEL_HG01942_10.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_10.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/hashes/PEL_HG01942_11.hash b/bio/hashes/PEL_HG01942_11.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_11.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/hashes/PEL_HG01942_17.hash b/bio/hashes/PEL_HG01942_17.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_17.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/hashes/PEL_HG01942_19.hash b/bio/hashes/PEL_HG01942_19.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_19.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/hashes/PEL_HG01942_2.hash b/bio/hashes/PEL_HG01942_2.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_2.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/hashes/PEL_HG01942_4.hash b/bio/hashes/PEL_HG01942_4.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_4.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/hashes/PEL_HG01942_7.hash b/bio/hashes/PEL_HG01942_7.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_7.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/hashes/PEL_HG01942_X.hash b/bio/hashes/PEL_HG01942_X.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_X.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/hashes/PEL_HG01942_Y.hash b/bio/hashes/PEL_HG01942_Y.hash new file mode 100644 index 000000000..b7a79cecf --- /dev/null +++ b/bio/hashes/PEL_HG01942_Y.hash @@ -0,0 +1 @@ +fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf diff --git a/bio/input.sh b/bio/input.sh new file mode 100755 index 000000000..de169425a --- /dev/null +++ b/bio/input.sh @@ -0,0 +1,31 @@ +# red color +RED='\033[0;31m' +# reset the color +NC='\033[0m' + +IN=${BIO4:-$PASH_TOP/benchmarks/bio} +IN_NAME=${IN_N:-input.txt} +if [[ $1 == "-c" ]]; then + rm -rf *.bam + rm -rf *.sam + rm -rf ../output + exit +fi + +PW=${PASH_TOP}/benchmarks/bio/input +echo $PW +mkdir -p $PW +mkdir -p ${PASH_TOP}/benchmarks/bio/output + +cat ${IN}/${IN_NAME} |while read s_line; + do + + sample=$(echo $s_line |cut -d " " -f 2); + if [[ ! -f $sample ]]; then + pop=$(echo $s_line |cut -f 1 -d " "); + link=$(echo $s_line |cut -f 3 -d " "); + wget -O "$PW/$sample".bam "$link"; ##this part can be adjusted maybe + # stop after one download + exit 0 + fi +done; diff --git a/bio/input.txt b/bio/input.txt index c73de65c0..d83169d32 100644 --- a/bio/input.txt +++ b/bio/input.txt @@ -1,2 +1 @@ CHS HG00614 ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00614/cg_data/HG00614_lcl_SRR821831.mapped.COMPLETE_GENOMICS.CGworkflow2_2_evidenceOnly.CHS.high_coverage.20130401.bam -PEL HG01942 ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG01942/cg_data/HG01942_blood_SRR801093.mapped.COMPLETE_GENOMICS.CGworkflow2_2_evidenceOnly.PEL.high_coverage.20130401.bam diff --git a/bio/bio4.sh b/bio/run.sh similarity index 96% rename from bio/bio4.sh rename to bio/run.sh index 5e955fbd3..981ae4624 100755 --- a/bio/bio4.sh +++ b/bio/run.sh @@ -1,7 +1,7 @@ # create bam files with regions ################### 1KG SAMPLES IN=${INPUT:-$PASH_TOP/benchmarks/bio} -IN_NAME=${IN_N:-input_all.txt} +IN_NAME=${IN_N:-input.txt} OUT=${OUTPUT:-$PASH_TOP/benchmarks/bio/output} cat ${IN}/${IN_NAME}|while read s_line; do @@ -22,7 +22,6 @@ cat ${IN}/${IN_NAME}|while read s_line; samtools view -b "${OUT}/$sample"_corrected.bam chr"$chr" > "${OUT}/$pop"_"$sample"_"$chr".bam ; echo 'Indexing Sample '$pop'_'${OUT}/$sample' '; samtools index -b "${OUT}/$pop"_"$sample"_"$chr".bam; - sleep 2 done; #rm "${OUT}/$sample"_corrected.bam; #rm "${OUT}/$sample"_corrected.bam.bai; diff --git a/bio/verify.sh b/bio/verify.sh index 7e283e3a9..0f11d5bc6 100755 --- a/bio/verify.sh +++ b/bio/verify.sh @@ -5,7 +5,9 @@ cd "$(realpath $(dirname "$0"))" -mkdir -p hashes +hash_folder="hashes" + +mkdir -p $hash_folder if [[ "$@" == *"--generate"* ]]; then directory="output" @@ -50,7 +52,7 @@ do for file in "$folder"/*.bam do current_file=$((current_file + 1)) - echo "Processing file $current_file of $total_files..." + # echo "Processing file $current_file of $total_files..." # Extract the filename without the directory path and extension filename=$(basename "$file" .bam) @@ -66,7 +68,8 @@ do # Compare the hash with the hash in the hashes directory diff "$hash_folder/$filename.hash" "$folder/$filename.hash" - # Print the filename and hash - echo "File: $folder/$filename | SHA-256 Hash: $(cat "$folder/$filename.hash")" + # # Print the filename and hash + # echo "File: $folder/$filename | SHA-256 Hash: $(cat "$folder/$filename.hash")" done -done \ No newline at end of file +done +exit 0 \ No newline at end of file