added the fixes to methylthiolate, acetamide, and acetate
Sulstice committed Jul 17, 2024
1 parent 5b4f501 commit 21944e4
Showing 2 changed files with 42,520 additions and 15 deletions.
stages/01_download.py: 31 changes (16 additions, 15 deletions)
@@ -30,16 +30,18 @@
     args = urlencode({'sourcename': name, 'response_type': 'save', 'response_basename': f'{name}_PubChemAnnotationTopics'})
     links += [f"{base_url}?{args}"]
 
+out_file = open('links.txt', 'w')
+
 # Download links to cache/{source_name}/annotations.json
 safe_sources = [re.sub(r'[\\/*?:"<>|]', '_', name) for name in source_names]
 # for link, name in tqdm.tqdm(zip(links, safe_sources), total=len(links), desc="Downloading annotations"):
 #     annotation_path = os.path.join('cache/01_download', name, 'annotations.json')
 #     os.makedirs(os.path.dirname(annotation_path), exist_ok=True)
 #     response = requests.get(link, stream=True)
 #     time.sleep(3) # Be nice to the server
 #     with open(annotation_path, 'wb') as f:
 #         for chunk in response.iter_content(chunk_size=8192):
 #             _ = f.write(chunk)
 
 # Make a 'headings' table by reading all the annotations.json files
 headings = []
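For reference, a runnable version of the per-source download that the commented-out block above sketches. It assumes `links` and `safe_sources` are the parallel lists built earlier in this script (one annotation URL and one filesystem-safe name per source); the cache layout comes straight from the comments.

# Minimal runnable sketch of the disabled per-source download.
# Assumes `links` and `safe_sources` from the surrounding script.
import os
import time

import requests
import tqdm

for link, name in tqdm.tqdm(zip(links, safe_sources), total=len(links), desc="Downloading annotations"):
    annotation_path = os.path.join('cache/01_download', name, 'annotations.json')
    os.makedirs(os.path.dirname(annotation_path), exist_ok=True)
    response = requests.get(link, stream=True)
    time.sleep(3)  # be nice to the server
    with open(annotation_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            _ = f.write(chunk)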
@@ -66,7 +68,6 @@
 base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/heading/JSON/?"
 
 for index, row in tqdm.tqdm(heading_df.iterrows(), total=len(heading_df), desc="Downloading headings"):
-    time.sleep(3)
     safe_heading = re.sub(r'[\\/*?:"<>|]', '_', row['heading'])
     args = {
         'source': row['source'],
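The heading loop sanitizes each heading for use as a file name and builds the request arguments. A self-contained sketch of that setup; the `row` values are hypothetical, and the 'heading' key is an assumption since only 'source' is visible in the hunk above.

# Sketch of the per-heading setup; `row` is hypothetical and the
# 'heading' key is an assumption (only 'source' appears above).
import re
from urllib.parse import urlencode

base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/heading/JSON/?"
row = {'source': 'CAMEO Chemicals', 'heading': 'Boiling Point'}

safe_heading = re.sub(r'[\\/*?:"<>|]', '_', row['heading'])  # filesystem-safe name
args = {'source': row['source'], 'heading': row['heading']}
print(safe_heading)                    # no illegal characters in this example
print(f"{base_url}{urlencode(args)}")  # heading URL before 'page' is added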
@@ -87,20 +88,20 @@
         # Add the current page to the args and construct the URL
         args['page'] = current_page
         paginated_url = f"{base_url}{urlencode(args)}"
 
+        out_file.write(paginated_url + '\n')
         try:
             # Fetch data for the current page
             response = requests.get(paginated_url)
             if response.status_code == 200:
                 data = response.json()
                 annotations = data.get('Annotations', {})
-                all_data.extend(annotations.get('Annotation', []))
-                # Check if more pages exist
-                if current_page >= annotations.get('TotalPages', 1):
-                    break
-                # Move to the next page
-                current_page += 1
-                time.sleep(1) # be nice to the server
+                total_pages = annotations.get('TotalPages', 1)
+                for i in range(int(total_pages)):
+                    args['page'] = i
+                    paginated_url = f"{base_url}{urlencode(args)}"
+                    out_file.write(paginated_url + '\n')
+                break
             else:
                 print(f"Failed to download page {current_page} for {row['heading']} from {row['source']}")
         except Exception as e:
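Taken together, the change replaces page-by-page fetching with a link manifest: request a heading once, read `TotalPages` from the response, and write one URL per page to links.txt for a later stage to download. A self-contained sketch of that pattern, assuming 1-based pages and hypothetical source/heading values:

# Self-contained sketch of the link-manifest pattern: fetch a heading once,
# read TotalPages, and write one URL per page to links.txt.
# Source/heading values are hypothetical; pages are assumed 1-based.
from urllib.parse import urlencode

import requests

base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/heading/JSON/?"
args = {'source': 'CAMEO Chemicals', 'heading': 'Boiling Point'}

with open('links.txt', 'w') as out_file:
    response = requests.get(f"{base_url}{urlencode(args)}")
    if response.status_code == 200:
        annotations = response.json().get('Annotations', {})
        total_pages = int(annotations.get('TotalPages', 1))
        for page in range(1, total_pages + 1):
            args['page'] = page
            out_file.write(f"{base_url}{urlencode(args)}\n")
    else:
        print(f"Failed to fetch {args['heading']} from {args['source']}")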
