scrape_arxiv_for_subject.py
"""
this crawls arvix and fetches the subject for each file
"""
import csv
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://arxiv.org/abs/"
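# Abstract pages live at BASE_URL + <arXiv id>; new-style IDs look like
# "2104.00001" (YYMM.number, illustrative), which is what the leading-digit
# check in read_in_for_scraper() filters for.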

def read_in_for_scraper():
    """Read arXiv IDs from results/file_ids, one filename per line."""
    with open('results/file_ids') as f:
        content = [line.strip() for line in f]
    # str.strip('.tex') would remove any of the characters '.', 't', 'e', 'x'
    # from both ends and can mangle IDs, so drop the suffix explicitly instead.
    content = [x[:-len('.tex')] if x.endswith('.tex') else x for x in content]
    # Keep only entries that start with two digits (new-style arXiv IDs).
    content = [x for x in content if len(x) >= 2 and x[:2].isdigit()]
    return content


def pull_subject_single_id(arxiv_id):
    """Fetch the abstract page for one arXiv ID; return raw HTML or None."""
    try:
        req = requests.get(BASE_URL + arxiv_id, timeout=30)
        return req.content
    except requests.RequestException as exc:
        print("request didn't complete: " + str(exc))
        return None


def pull_subject_from_content(content):
    """Extract the primary-subject text from an abstract page's HTML."""
    if content is None:
        return None
    soup = BeautifulSoup(content, 'html.parser')
    subject = soup.find_all("span", {'class': 'primary-subject'})
    if not subject:
        return None
    return subject[0].get_text()

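# On an abstract page the primary subject appears as, for example,
# <span class="primary-subject">Machine Learning (cs.LG)</span>;
# the first such span's text is what gets recorded.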
if __name__ == "__main__":
test_ids = read_in_for_scraper()
indexes = [0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000]
for index in indexes:
print(str(index) + " is the starting index")
id_to_subject = []
current_ids = test_ids[index:index + 500]
for current_id in current_ids:
print(str(current_id) + " is the current ID")
content = pull_subject_single_id(current_id)
subject = pull_subject_from_content(content)
if subject != None:
id_to_subject.append((current_id, subject))
with open('all_subjects.csv','a') as out:
csv_out=csv.writer(out)
for row in id_to_subject:
csv_out.writerow(row)
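
# Each batch appends (id, primary subject) rows to all_subjects.csv;
# illustrative output (these IDs are hypothetical):
#   2104.00001,Machine Learning (cs.LG)
#   2104.00002,Quantum Physics (quant-ph)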