Skip to content

Commit

Permalink
cleaning the cache
Browse files (browse the repository at this point in the history)
  • Loading branch information
Sulstice committed Jul 9, 2024
1 parent 31c0a47 commit 5b4f501
Show file tree
Hide file tree
Showing 208 changed files with 13 additions and 6,302 deletions.
18 changes: 9 additions & 9 deletions stages/01_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@

# Download links to cache/{source_name}/annotations.json
safe_sources = [re.sub(r'[\\/*?:"<>|]', '_', name) for name in source_names]
for link, name in tqdm.tqdm(zip(links, safe_sources), total=len(links), desc="Downloading annotations"):
annotation_path = os.path.join('cache/01_download', name, 'annotations.json')
os.makedirs(os.path.dirname(annotation_path), exist_ok=True)
response = requests.get(link, stream=True)
time.sleep(3) # Be nice to the server
with open(annotation_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
_ = f.write(chunk)
# for link, name in tqdm.tqdm(zip(links, safe_sources), total=len(links), desc="Downloading annotations"):
# annotation_path = os.path.join('cache/01_download', name, 'annotations.json')
# os.makedirs(os.path.dirname(annotation_path), exist_ok=True)
# response = requests.get(link, stream=True)
# time.sleep(3) # Be nice to the server
# with open(annotation_path, 'wb') as f:
# for chunk in response.iter_content(chunk_size=8192):
# _ = f.write(chunk)

# Make a 'headings' table by reading all the annotations.json files
headings = []
Expand Down Expand Up @@ -75,6 +75,7 @@
'response_type': 'save',
'response_basename': f'PubChemAnnotations_{row["source"]}_heading={safe_heading}'
}

# Initial URL and download path setup
download_path = os.path.join('cache/01_download', row['safe_source'], f'{safe_heading}.json')
os.makedirs(os.path.dirname(download_path), exist_ok=True)
Expand All @@ -93,7 +94,6 @@
if response.status_code == 200:
data = response.json()
annotations = data.get('Annotations', {})
print ("Annotations: %s" % annotations)
all_data.extend(annotations.get('Annotation', []))
# Check if more pages exist
if current_page >= annotations.get('TotalPages', 1):
Expand Down
6 changes: 4 additions & 2 deletions stages/02_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

# Create a dictionary for quick lookup
json_file_dict = {os.path.splitext(os.path.basename(file))[0]: file for file in all_json_files}

print (json_file_dict)
# Debugging: Log the total number of JSON files found
print(f"Total JSON files found: {len(all_json_files)}")

Expand All @@ -25,10 +25,12 @@
source = row['source']
heading = row['heading']
data_type = row['type']

# Construct the expected file name without path
file_name = f"{heading}.json"

# Check if the file is in the dictionary
if heading in json_file_dict:
if heading in json_file_dict['annotations']:
file_path = json_file_dict[heading]
# Debugging: Log the found file path
print(f"Processing file: {file_path}")
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

10 changes: 0 additions & 10 deletions stages/cache/01_download/Athena Minerals/annotations.json

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

22 changes: 0 additions & 22 deletions stages/cache/01_download/BindingDB/annotations.json

This file was deleted.

This file was deleted.

18 changes: 0 additions & 18 deletions stages/cache/01_download/BioGRID/annotations.json

This file was deleted.

This file was deleted.

118 changes: 0 additions & 118 deletions stages/cache/01_download/CAMEO Chemicals/annotations.json

This file was deleted.

14 changes: 0 additions & 14 deletions stages/cache/01_download/CAS Common Chemistry/annotations.json

This file was deleted.

10 changes: 0 additions & 10 deletions stages/cache/01_download/CCSbase/annotations.json

This file was deleted.

14 changes: 0 additions & 14 deletions stages/cache/01_download/COVID-19 Disease Map/annotations.json

This file was deleted.

Loading

0 comments on commit 5b4f501

Please sign in to comment.