Skip to content

Commit

Permalink
Refactor bio scripts and input setup
Browse files Browse the repository at this point in the history
- Renamed `bio/inputs.sh` to `bio/deps.sh` and removed unnecessary code.
- Updated `bio/run.sh` to use the correct input file name.
- Moved the input-download logic out of the old `bio/inputs.sh` into a new `bio/input.sh` (the default sample list is now `input.txt`).
- Cleaned up `bio/verify.sh` and added comments.

Closes binpash#27
  • Loading branch information
Zhuoxuan Zhang authored and vagos committed Jan 3, 2025
1 parent c96477a commit f72c4ec
Show file tree
Hide file tree
Showing 28 changed files with 64 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, media-conv, sklearn, covid-mts, riker, oneliners, web-index]
benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, media-conv, sklearn, covid-mts, riker, oneliners, web-index, bio]

steps:
- name: Checkout code
Expand Down
34 changes: 1 addition & 33 deletions bio/inputs.sh → bio/deps.sh
Original file line number Diff line number Diff line change
@@ -1,22 +1,3 @@
# red color
RED='\033[0;31m'
# reset the color
NC='\033[0m'

IN=${BIO4:-$PASH_TOP/benchmarks/bio}
IN_NAME=${IN_N:-input_all.txt}
if [[ $1 == "-c" ]]; then
rm -rf *.bam
rm -rf *.sam
rm -rf ../output
exit
fi

PW=${PASH_TOP}/benchmarks/bio/input
echo $PW
mkdir -p $PW
mkdir -p ${PASH_TOP}/benchmarks/bio/output

# install dependencies
required_version="1.7"

Expand Down Expand Up @@ -62,17 +43,4 @@ else
echo "Failed to install the correct version of Samtools."
exit 1
fi
fi

cat ${IN}/${IN_NAME} |while read s_line;
do

sample=$(echo $s_line |cut -d " " -f 2);
if [[ ! -f $sample ]]; then
pop=$(echo $s_line |cut -f 1 -d " ");
link=$(echo $s_line |cut -f 3 -d " ");
wget -O "$PW/$sample".bam "$link"; ##this part can be adjusted maybe
# TODO: stop after one download for testing
exit 0
fi
done;
fi
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_1.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
527eb9f7611542301ec372574053e22f0437ee03a79d3e1bf6617b4b0bd18008
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_10.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
a7943043dcb526544ea0ec3dbe0bb2482e85d8997c603b28ec95d3e399bd7874
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_11.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
9fa8cc4dcaeb984e8c5acdf3f32e1859938f241c3949afbfb81c87cba4017215
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_17.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
8f155d4f0f892416c01ff79e9fbccefd255cbcac19e9df6c0df3768cb1588923
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_19.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
4322e91cde058dadffd216cb19cbb07696a828def8db10f23cddee49c12f1c04
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_2.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
bd1f91aca35c988469d8d9ebb7fb131349b3145a1b25950b18d3b7673a455f9a
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_4.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
b7e8d5f93d15a4f10556277c2841ee588782f485f09c33dcad2a4504e4433d65
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_7.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
d0c05e98f971cf18a7f0fa61a45ee4eb5c42b9c8ddf5b7f7d8b977e4232c463c
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_X.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
10f504af43506138056d60c3947de00bb7fc3b8b872c1eedc63d66a4d5586ead
1 change: 1 addition & 0 deletions bio/hashes/CHS_HG00614_Y.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
8cff7e9ec2c537298d6de6c0a1749d003c895549e32933dfd6eec94d376e0041
1 change: 1 addition & 0 deletions bio/hashes/HG00614_corrected.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
5b87bb001a505d9e0fe22b60d4c9303b90d4af001bbdc5e2038a0dcf7245dc76
1 change: 1 addition & 0 deletions bio/hashes/HG01942_corrected.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_1.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_10.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_11.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_17.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_19.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_2.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_4.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_7.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_X.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
1 change: 1 addition & 0 deletions bio/hashes/PEL_HG01942_Y.hash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fec413b3018fec127ff33d93219582195fd5286c9e6f01f1d58f365a62fbaecf
31 changes: 31 additions & 0 deletions bio/input.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Download the BAM inputs used by the bio benchmark.
#
# Usage:
#   input.sh       fetch the first sample from the list that is not present yet
#   input.sh -c    clean up: remove *.bam and *.sam from the current directory
#                  and the ../output directory, then exit
#
# Environment:
#   PASH_TOP  root under which benchmarks/bio/input and benchmarks/bio/output
#             are created
#   BIO4      directory holding the sample list
#             (default: $PASH_TOP/benchmarks/bio)
#   IN_N      file name of the sample list inside that directory
#             (default: input.txt)
#
# Each line of the sample list has three whitespace-separated fields:
#   <population> <sample> <url>
# The file at <url> is saved as $PASH_TOP/benchmarks/bio/input/<sample>.bam.

IN=${BIO4:-$PASH_TOP/benchmarks/bio}
IN_NAME=${IN_N:-input.txt}

if [[ "${1:-}" == "-c" ]]; then
  # These paths are relative to the caller's working directory.
  rm -rf -- *.bam
  rm -rf -- *.sam
  rm -rf ../output
  exit
fi

PW=${PASH_TOP}/benchmarks/bio/input
printf '%s\n' "$PW"
mkdir -p "$PW" || exit 1
mkdir -p "${PASH_TOP}/benchmarks/bio/output" || exit 1

sample_list="${IN}/${IN_NAME}"
if [[ ! -r "$sample_list" ]]; then
  printf 'input.sh: cannot read sample list: %s\n' "$sample_list" >&2
  exit 1
fi

# Field 1 (population) is not needed for the download, and anything after the
# URL is ignored. The `|| [[ -n ... ]]` keeps a final line that has no
# trailing newline.
while read -r _ sample link _ || [[ -n "$sample" ]]; do
  # Skip blank or malformed lines instead of calling wget with empty arguments.
  if [[ -z "$sample" || -z "$link" ]]; then
    continue
  fi

  target="$PW/$sample.bam"

  # Check the path the download is actually written to, so an existing input
  # is not fetched again.
  if [[ -f "$target" ]]; then
    continue
  fi

  if ! wget -O "$target" "$link"; then
    # wget -O creates the file even when the transfer fails; remove it so a
    # later run does not mistake a partial file for a complete download.
    rm -f -- "$target"
    printf 'input.sh: failed to download %s\n' "$link" >&2
    exit 1
  fi

  # stop after one download
  break
done < "$sample_list"
1 change: 0 additions & 1 deletion bio/input.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
CHS HG00614 ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00614/cg_data/HG00614_lcl_SRR821831.mapped.COMPLETE_GENOMICS.CGworkflow2_2_evidenceOnly.CHS.high_coverage.20130401.bam
PEL HG01942 ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG01942/cg_data/HG01942_blood_SRR801093.mapped.COMPLETE_GENOMICS.CGworkflow2_2_evidenceOnly.PEL.high_coverage.20130401.bam
3 changes: 1 addition & 2 deletions bio/bio4.sh → bio/run.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# create bam files with regions
################### 1KG SAMPLES
IN=${INPUT:-$PASH_TOP/benchmarks/bio}
IN_NAME=${IN_N:-input_all.txt}
IN_NAME=${IN_N:-input.txt}
OUT=${OUTPUT:-$PASH_TOP/benchmarks/bio/output}
cat ${IN}/${IN_NAME}|while read s_line;
do
Expand All @@ -22,7 +22,6 @@ cat ${IN}/${IN_NAME}|while read s_line;
samtools view -b "${OUT}/$sample"_corrected.bam chr"$chr" > "${OUT}/$pop"_"$sample"_"$chr".bam ;
echo 'Indexing Sample '$pop'_'${OUT}/$sample' ';
samtools index -b "${OUT}/$pop"_"$sample"_"$chr".bam;
sleep 2
done;
#rm "${OUT}/$sample"_corrected.bam;
#rm "${OUT}/$sample"_corrected.bam.bai;
Expand Down
13 changes: 8 additions & 5 deletions bio/verify.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@

cd "$(realpath $(dirname "$0"))"

mkdir -p hashes
hash_folder="hashes"

mkdir -p $hash_folder

if [[ "$@" == *"--generate"* ]]; then
directory="output"
Expand Down Expand Up @@ -50,7 +52,7 @@ do
for file in "$folder"/*.bam
do
current_file=$((current_file + 1))
echo "Processing file $current_file of $total_files..."
# echo "Processing file $current_file of $total_files..."

# Extract the filename without the directory path and extension
filename=$(basename "$file" .bam)
Expand All @@ -66,7 +68,8 @@ do
# Compare the hash with the hash in the hashes directory
diff "$hash_folder/$filename.hash" "$folder/$filename.hash"

# Print the filename and hash
echo "File: $folder/$filename | SHA-256 Hash: $(cat "$folder/$filename.hash")"
# # Print the filename and hash
# echo "File: $folder/$filename | SHA-256 Hash: $(cat "$folder/$filename.hash")"
done
done
done
exit 0

0 comments on commit f72c4ec

Please sign in to comment.