-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathoclcHoldingsSearch.py
91 lines (82 loc) · 3.21 KB
/
oclcHoldingsSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import argparse
import csv
import datetime
import os
import time

import requests
from bs4 import BeautifulSoup

# Expected to be the project-local secrets.py that supplies wskey and
# oclcSymbols; note that it shadows the standard-library module of that name.
import secrets
# command line arguments
# The input file may be given with -f/--fileName; when it is absent (or
# empty) the user is prompted for it interactively instead.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-f', '--fileName',
    help=('the file of data to be searched. '
          'optional - if not provided, the script will ask for input'))
args = parser.parse_args()
fileName = args.fileName or input('Enter the file of data to be searched: ')
# run time start, establish variables
startTime = time.time()

# API key and the institution symbols to query, read from the secrets module.
wskey = secrets.wskey
oclcSymbols = secrets.oclcSymbols

# Prefix for the output file names: the input path minus its final extension.
# os.path.splitext copes with names that have no extension and with dots in
# directory components (e.g. './data.csv'), where slicing at the first '.'
# either raised ValueError or produced an empty/truncated prefix.
fileNameWithoutExtension = os.path.splitext(fileName)[0]

# NOTE: baseURL and baseURL2 are not referenced anywhere else in this script.
baseURL = 'http://www.worldcat.org/webservices/catalog/search/opensearch?q='
baseURL2 = 'http://www.worldcat.org/webservices/catalog/content/'

# Comma-separated symbol list for the "oclcsymbol" query parameter (no
# trailing comma).
oclcSymbolsString = ','.join(oclcSymbols)

# Count the data rows up front so the main loop can report items remaining.
# newline='' is the csv-module convention for files handed to a reader.
with open(fileName, newline='') as csvfile:
    rowCount = sum(1 for _ in csv.DictReader(csvfile))
# script content
# For every input row, query the WorldCat "libraries" endpoint for the row's
# OCLC number and write the outcome to one of two CSV reports:
#   <input>oclcSearchMatches.csv    - records found, with holding institutions
#   <input>oclcSearchNonMatches.csv - records not found or not retrievable
# All three files are managed by one ``with`` so the reports are flushed and
# closed even if the run stops part-way; newline='' follows the csv-module
# convention and avoids blank lines between rows on Windows.
matchesPath = fileNameWithoutExtension + 'oclcSearchMatches.csv'
nonMatchesPath = fileNameWithoutExtension + 'oclcSearchNonMatches.csv'
with open(matchesPath, 'w', newline='') as matchesFile, \
        open(nonMatchesPath, 'w', newline='') as nonMatchesFile, \
        open(fileName, newline='') as csvfile:
    f = csv.writer(matchesFile)
    f.writerow(['bartonId', 'searchOclcNum', 'heldByMIT',
                'holdingsCountNonMIT', 'holdingInstitutions'])
    f2 = csv.writer(nonMatchesFile)
    # Header previously read 'searchOoclcNum' (typo); it now agrees with the
    # column name used in the matches report.
    f2.writerow(['searchOclcNum', 'holdingsCount'])

    reader = csv.DictReader(csvfile)
    for row in reader:
        rowCount -= 1
        # Throttle the requests: a 5 second pause every 200 rows and an
        # additional 5 minute pause every 3000 rows (never after the last).
        if rowCount != 0 and rowCount % 200 == 0:
            time.sleep(5)
        if rowCount != 0 and rowCount % 3000 == 0:
            print('sleep 5 min')
            time.sleep(300)
        print('Items remaining: ', rowCount)

        bartonId = row['bartonId']
        # Keep only what follows the first ')' (dropping a parenthesised
        # prefix when there is one) and trim stray whitespace so it does not
        # end up inside the URL path.
        searchOclcNum = row['oclcNum'].split(')', 1)[-1].strip()
        print(searchOclcNum)

        # The request asks for at most 100 libraries, so the holdings count
        # written below can never exceed that.
        # NOTE(review): the wskey is sent over plain http - confirm whether
        # the service accepts https.
        searchUrl = ('http://www.worldcat.org/'
                     'webservices/catalog/content/libraries/'
                     + searchOclcNum
                     + '?maximumLibraries=100&oclcsymbol='
                     + oclcSymbolsString + '&wskey=' + wskey)
        try:
            # Without a timeout a stalled connection would hang the run.
            response = requests.get(searchUrl, timeout=60)
        except requests.exceptions.RequestException as err:
            # Record the failure and carry on with the next row. Only the
            # exception type is printed because the message would contain the
            # request URL, and with it the wskey.
            print('Request failed: ', type(err).__name__)
            f2.writerow([searchOclcNum, 'Request failed'])
            continue
        print(response)

        # Tag names are looked up in lowercase below; the "lxml" HTML parser
        # lowercases them while parsing.
        records = BeautifulSoup(response.content, "lxml")
        if records.find_all('diagnostics'):
            # The response carries a <diagnostics> element, which this script
            # treats as "no match".
            print('No match')
            f2.writerow([searchOclcNum, 'No match'])
        elif not response.ok:
            # An HTTP error without diagnostics must not be reported as a
            # record with zero holdings.
            print('HTTP error: ', response.status_code)
            f2.writerow([searchOclcNum, 'HTTP ' + str(response.status_code)])
        else:
            heldByMIT = False
            recordInstCodes = []
            for record in records.find_all('holding'):
                instCode = record.find('institutionidentifier')
                instCode = instCode.find('value').text
                # 'MYG' is the symbol this script treats as MIT; it sets the
                # flag and is left out of the non-MIT list.
                if instCode == 'MYG':
                    heldByMIT = True
                else:
                    recordInstCodes.append(instCode)
            holdingsCount = len(recordInstCodes)
            print(recordInstCodes)
            # The institution list is written as a single cell (the list's
            # string form), unchanged from the established report format.
            f.writerow([bartonId, searchOclcNum, heldByMIT,
                        holdingsCount, recordInstCodes])
# print script run time
# Wall-clock seconds since startTime, shown as a timedelta (H:MM:SS.ffffff).
elapsedSeconds = time.time() - startTime
td = datetime.timedelta(seconds=elapsedSeconds)
print("Elapsed time: {}".format(td))