From 9817343163da870550005cf63601c7c2ca276462 Mon Sep 17 00:00:00 2001
From: Zakariyya Mughal
Date: Wed, 7 Feb 2024 13:24:35 -0500
Subject: [PATCH] Adjust to PubChemRDF from uniprot-kg

---
Review notes (below the `---` separator, so not part of the commit message):

* stages/02_build.sh: the earlier revision of this patch removed the
  `RDF=...` assignment but still expanded "$RDF" (and stripped a stale
  `.rdf` suffix) when naming the output file. Each parallel job runs with
  `set -euo pipefail`, so that expansion aborts the job. In addition,
  RDF_HDT_DIR_REL held the relative path of the input *file*, not of its
  directory, and the output directory was never created.
  This revision derives the directory with dirname, derives the output name
  from the input basename with `.ttl.gz` removed, and creates the parent
  directory before the `-s` check and the conversion.
  The postimage blob id on the index line of stages/02_build.sh predates
  this revision.
* stages/01_download.sh: NOTE(review): aria2c is handed a bare FTP directory
  URL; confirm that this fetches the whole tree, otherwise switch to a
  recursive downloader. Left unchanged here.
* The line structure of this patch was reconstructed from a
  whitespace-collapsed copy. Indentation and blank context lines are a
  best effort; check them against the tree before applying.

 dvc.lock                | 61 -----------------------------------------
 dvc.yaml                |  4 +--
 stages/00_invalidate.sh | 16 +++++------
 stages/01_download.sh   |  8 ++++--
 stages/02_build.sh      | 23 +++++++---------
 5 files changed, 25 insertions(+), 87 deletions(-)
 delete mode 100644 dvc.lock

diff --git a/dvc.lock b/dvc.lock
deleted file mode 100644
index 74ef1c2..0000000
--- a/dvc.lock
+++ /dev/null
@@ -1,61 +0,0 @@
-schema: '2.0'
-stages:
-  1-stage:
-    cmd: Rscript stages/01-stage.R
-    deps:
-    - path: stages/01-stage.R
-      md5: c468dc14527e00c339f146e1dba2bd3b
-      size: 118
-    outs:
-    - path: data/mtcars.parquet
-      md5: 77cf29accf9535522fad7db1486eff9a
-      size: 6243
-  download:
-    cmd: stages/01_download.sh
-    deps:
-    - path: checksum
-      hash: md5
-      md5: c4873befa69781f8b5de7355dd84718a.dir
-      size: 476280
-      nfiles: 1
-    - path: stages/01_download.sh
-      hash: md5
-      md5: 381e0091de2585e5a6f6c033ddea923a
-      size: 460
-    outs:
-    - path: download
-      hash: md5
-      md5: 9361da160c0d2f37fba41ec102ff2c8e.dir
-      size: 979163990986
-      nfiles: 657
-  invalidate:
-    cmd: stages/00_invalidate.sh
-    deps:
-    - path: stages/00_invalidate.sh
-      hash: md5
-      md5: 615f8d20a400c764ec4f50527f4a76de
-      size: 477
-    outs:
-    - path: checksum
-      hash: md5
-      md5: c4873befa69781f8b5de7355dd84718a.dir
-      size: 476280
-      nfiles: 1
-  build:
-    cmd: stages/02_build.sh
-    deps:
-    - path: download
-      hash: md5
-      md5: 9361da160c0d2f37fba41ec102ff2c8e.dir
-      size: 979163990986
-      nfiles: 657
-    - path: stages/02_build.sh
-      hash: md5
-      md5: be9e06c03537c9b972d7cac882b8c463
-      size: 1517
-    outs:
-    - path: brick
-      hash: md5
-      md5: 4942db9f6e6374ff5efdc939fbbf1818.dir
-      size: 1777322130952
-      nfiles: 656
diff --git a/dvc.yaml b/dvc.yaml
index d781667..ecead11 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -14,12 +14,12 @@ stages:
     deps:
       - stages/00_invalidate.sh
     outs:
-      - checksum
+      - void
   download:
     cmd: stages/01_download.sh
     deps:
       - stages/01_download.sh
-      - checksum
+      - void
     outs:
       - download:
           persist: true
diff --git a/stages/00_invalidate.sh b/stages/00_invalidate.sh
index e1cd672..fb21a7d 100755
--- a/stages/00_invalidate.sh
+++ b/stages/00_invalidate.sh
@@ -6,16 +6,16 @@
 localpath=$(pwd)
 echo "Local path: $localpath"
 
-# Define the release URL for the dataset
-checksum_url="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/RELEASE.metalink"
+# Define the VoID URL for the dataset
+void_url="https://ftp.ncbi.nlm.nih.gov/pubchem/RDF/void.ttl"
 
-# Create the checksum directory
-checksumpath="$localpath/checksum"
-echo "Checksum path: $checksumpath"
-mkdir -p "$checksumpath"
-cd $checksumpath;
+# Create the VoID directory
+voidpath="$localpath/void"
+echo "VoID path: $voidpath"
+mkdir -p "$voidpath"
+cd $voidpath;
 
 # Download file
-wget -P $checksumpath $checksum_url
+wget -P $voidpath $void_url
 
 echo "Download done."
diff --git a/stages/01_download.sh b/stages/01_download.sh
index badaecb..a878781 100755
--- a/stages/01_download.sh
+++ b/stages/01_download.sh
@@ -6,8 +6,10 @@
 localpath=$(pwd)
 echo "Local path: $localpath"
 
-# Define the release URL for the dataset
-metalink_url="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/RELEASE.metalink"
+# Define the FTP URL for the dataset
+# https://ftp.ncbi.nlm.nih.gov/pubchem/RDF/
+# ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/
+ftp_url="ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/"
 
 # Create the download directory
 downloadpath="$localpath/download"
@@ -16,4 +18,4 @@ mkdir -p "$downloadpath"
 cd $downloadpath;
 
 # Download files
-aria2c -c -d $downloadpath $metalink_url
+aria2c -c -d $downloadpath $ftp_url
diff --git a/stages/02_build.sh b/stages/02_build.sh
index 24effe6..d5cc2f0 100755
--- a/stages/02_build.sh
+++ b/stages/02_build.sh
@@ -2,7 +2,7 @@
 
 set -euo pipefail
 
-# Script to convert RDF/XML to RDF HDT
+# Script to convert Turtle to RDF HDT
 
 # Get local path
 localpath=$(pwd)
@@ -24,7 +24,7 @@ brickpath="$localpath/brick"
 mkdir -p $brickpath
 echo "Brick path: $brickpath"
 
-base_uri="https://www.uniprot.org/"
+base_uri="http://rdf.ncbi.nlm.nih.gov/pubchem/"
 
 # Set TMPDIR on same filesystem
 export BUILD_TMPDIR=$buildpath/tmp
@@ -37,29 +37,26 @@ export TMPDIR=$BUILD_TMPDIR
 export buildpath_prestage=$buildpath/prestage
 mkdir -p $buildpath_prestage
 
-export buildpath brickpath base_uri
-find $downloadpath -type f -name '*.rdf.xz' | sort \
-  | grep -vFf <( cat <<'EOF' # remove empty files
-uniparc_patents.rdf.xz
-EOF
-  ) \
+export downloadpath buildpath brickpath base_uri
+find $downloadpath -type f -name '*.ttl.gz' | sort \
   | parallel -J ./parallel.prf --bar '
   set -euo pipefail;
 
-  RDF=$buildpath/{/.};
-  RDF_HDT="$buildpath_prestage"/"$(basename "$RDF" .rdf).hdt";
+  RDF_HDT_DIR_REL="$(dirname "$(realpath -s --relative-to="$downloadpath" {})")";
+  RDF_HDT="$buildpath_prestage"/"$RDF_HDT_DIR_REL"/"$(basename {} .ttl.gz).hdt";
+  mkdir -p "$(dirname "$RDF_HDT")";
   export RDF2HDTCAT_JAVA_OPTS="-Xmx24g";
 
   if [ ! -s $RDF_HDT ]; then
     echo "Processing {}"
-    xz -T1 -dk < {} \
-      | rapper --input rdfxml --output ntriples - "$base_uri" \
+    gzip -dk < {} \
+      | rapper --input turtle --output ntriples - "$base_uri" \
       | rdf2hdtcat-parpipe $base_uri $RDF_HDT
   fi
   '
 
 find $downloadpath/ -maxdepth 1 \
-  -type f \! -name '*.rdf.xz' \
+  -type f \! -name '*.ttl.gz' \
   -exec cp -v {} $brickpath/ \;
 
 mv -v $buildpath_prestage/* $brickpath/