-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathoclcTitleDateSearch.py
108 lines (107 loc) · 5.14 KB
/
oclcTitleDateSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import requests
from bs4 import BeautifulSoup
import csv
import secrets
import urllib
# Look up each local bib record in the WorldCat OpenSearch web service by
# title (plus year, when the input row has one) and write out the candidate
# OCLC records.
#
# Input:  oclcRecordsTitle.csv, read through the columns 'bib#',
#         'Date 1 from 008' and '245 - all subfields'.
# Output: oclcTitleDateSearchMatches.csv    - one row per accepted candidate
#         oclcTitleDateSearchNonMatches.csv - bibs whose search returned no
#                                             items at all

try:  # Python 3
    from urllib.parse import quote
except ImportError:  # Python 2 keeps quote() on the urllib module itself
    from urllib import quote

baseURL = 'http://www.worldcat.org/webservices/catalog/search/opensearch?q='
baseURL2 = 'http://www.worldcat.org/webservices/catalog/content/'
wskey = secrets.wskey
# Query-string options for the full-record request; identical for every call.
serviceLevel = '?servicelevel=full'
classScheme = '&classificationScheme=LibraryOfCongress'


def _subfield_text(record, tag, code):
    """Return the text of subfield `code` in datafield `tag`, or ''.

    Only the first matching datafield and the first matching subfield are
    consulted. A missing datafield or subfield yields '' instead of raising
    (``find`` returns None for a miss, so chaining ``.find``/``.text`` on it
    would raise AttributeError).
    """
    datafield = record.find('datafield', {'tag': tag})
    if datafield is None:
        return ''
    subfield = datafield.find('subfield', {'code': code})
    if subfield is None:
        return ''
    return subfield.text


def _author_name(item):
    """Return the text of <author><name> inside a search item, or ''."""
    authorNode = item.find('author')
    if authorNode is None:
        return ''
    nameNode = authorNode.find('name')
    if nameNode is None:
        return ''
    return nameNode.text


# The `with` blocks guarantee both output files are flushed and closed, even
# if a request or a parse step raises part-way through the input.
with open('oclcTitleDateSearchMatches.csv', 'w') as matchFile, \
        open('oclcTitleDateSearchNonMatches.csv', 'w') as nonMatchFile:
    f = csv.writer(matchFile)
    f.writerow(['bibNumber', 'searchTitle', 'searchDate', 'searchType',
                'oclcTitle', 'date', 'oclcNum', 'url', 'author',
                'publisher', 'physDesc', 'encoding'])
    f2 = csv.writer(nonMatchFile)
    f2.writerow(['bibNumber', 'searchTitle', 'searchDate'])

    with open('oclcRecordsTitle.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            bibNumber = row['bib#']
            print(bibNumber)
            searchType = 'date & title'

            # The year comes straight from the 'Date 1 from 008' column (an
            # earlier approach that parsed it out of 260/264 $c was retired).
            # searchDate is reset on every row so an empty date can neither
            # raise NameError nor leak the previous row's year into the CSV.
            searchDate = row['Date 1 from 008'] or ''
            if searchDate:
                query = 'srw.yr+%3D+"' + searchDate + '"+and+'
            else:
                query = ''

            # Drop the first two characters of the 245 string (presumably the
            # leading subfield marker - confirm against the export format).
            searchTitle = row['245 - all subfields'][2:]
            # NOTE(review): these test for the bare letters 'b' and 'c', so
            # any title containing those letters is spliced/truncated. A
            # subfield delimiter probably preceded them originally - confirm
            # against the input file before relying on the results.
            if 'b' in searchTitle:
                cut = searchTitle.index('b')
                # Replace the 'b' and the character after it with a space.
                searchTitle = searchTitle[:cut] + ' ' + searchTitle[cut + 2:]
            if 'c' in searchTitle:
                # Everything from the first 'c' onward is discarded.
                searchTitle = searchTitle[:searchTitle.index('c')]

            searchTitleURL = quote(searchTitle).strip()
            query = (baseURL + query + 'srw.ti+%3D+"' + searchTitleURL
                     + '"&format=rss&wskey=' + wskey)
            print(query)
            response = requests.get(query).content
            records = BeautifulSoup(response, 'lxml').find_all('item')
            if not records:
                f2.writerow([bibNumber, searchTitle, searchDate])
                continue

            # NOTE(review): a bib whose items are all rejected by the filter
            # below ends up in neither output file - confirm that is wanted.
            for record in records:
                oclcTitle = record.find('title').text
                url = record.find('guid').text
                oclcNum = url.replace('http://worldcat.org/oclc/', '')
                author = _author_name(record)

                # Second request: fetch the full record for this OCLC number.
                response2 = requests.get(baseURL2 + oclcNum + serviceLevel
                                         + classScheme
                                         + '&wskey=' + wskey).content
                record2 = BeautifulSoup(response2, 'lxml').find('record')
                if record2 is None:
                    # No <record> element in the response; skip this
                    # candidate rather than abort the whole run.
                    print('no record returned for OCLC number ' + oclcNum)
                    continue

                encoding = record2.find('leader').text[17]
                fixedField = record2.find('controlfield',
                                          {'tag': '008'}).text
                # Character 23 of the 008 (presumably MARC "form of item" -
                # confirm); named to avoid shadowing the builtin `type`.
                formOfItem = fixedField[23:24]
                date = fixedField[7:11]

                # Publisher: 260 $b first, then 264 $b, otherwise blank.
                publisher = (_subfield_text(record2, '260', 'b')
                             or _subfield_text(record2, '264', 'b'))
                catLang = _subfield_text(record2, '040', 'b')
                physDesc = _subfield_text(record2, '300', 'a')

                # Keep only records whose 008/23 is blank and whose 040 $b
                # is 'eng' or absent.
                if formOfItem == ' ' and catLang in ('eng', ''):
                    f.writerow([bibNumber, searchTitle, searchDate,
                                searchType, oclcTitle, date, oclcNum, url,
                                author, publisher, physDesc, encoding])

            # Row of ten blank cells written after each bib's candidates.
            # NOTE(review): the original indentation was lost, so placing
            # this after the record loop (not inside it) is an assumption.
            f.writerow([''] * 10)