-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathoclcTitlePhrase.py
80 lines (79 loc) · 3.79 KB
/
oclcTitlePhrase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import requests
from bs4 import BeautifulSoup
import csv
import secrets
import urllib
baseURL = 'http://www.worldcat.org/webservices/catalog/search/opensearch?q='
baseURL2 = 'http://www.worldcat.org/webservices/catalog/content/'
wskey = secrets.wskey
f = csv.writer(open('oclcTitleSearchMatches.csv', 'w'))
f.writerow(['bibNumber'] + ['searchTitle'] + ['oclcTitle'] + ['oclcNum']
+ ['url'] + ['author'] + ['publisher'] + ['physDesc']
+ ['encoding'] + ['date'])
f2 = csv.writer(open('oclcTitleSearchNonMatches.csv', 'w'))
f2.writerow(['bibNumber'] + ['searchTitle'])
with open('oclcRecordsTitle.csv') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
bibNumber = row['bib#']
print(bibNumber)
searchTitle = row['245 - all subfields'][2:]
originalTitle = searchTitle
if 'b' in searchTitle:
searchTitle = searchTitle[:searchTitle.index('b')] + ' '\
+ searchTitle[searchTitle.index('b') + 2:]
if 'c' in searchTitle:
searchTitle = searchTitle[:searchTitle.index('c')]
else:
pass
elif 'c' in searchTitle:
searchTitle = searchTitle[:searchTitle.index('c')]
else:
pass
search = urllib.quote(searchTitle)
response = requests.get(baseURL + search.strip() + '&format=rss&wskey='
+ wskey).content
records = BeautifulSoup(response, "lxml").findAll('item')
if records != []:
for record in records:
oclcTitle = record.find('title').text
url = record.find('guid').text
oclcNum = url.replace('http://worldcat.org/oclc/', '')
author = record.find('author').find('name').text
wskey = '&wskey=' + wskey
serviceLevel = '?servicelevel=full'
classScheme = '&classificationScheme=LibraryOfCongress'
response2 = requests.get(baseURL2 + oclcNum + serviceLevel
+ classScheme + wskey).content
record2 = BeautifulSoup(response2, "lxml").find('record')
encoding = record2.find('leader').text[17]
type = record2.find('controlfield', {'tag': '008'}).text[23:24]
date = record2.find('controlfield', {'tag': '008'}).text[7:11]
try:
publisher = record2.find('datafield', {'tag': '260'})
publisher = publisher.find('subfield', {'code': 'b'}).text
except ValueError:
try:
publisher = record2.find('datafield', {'tag': '264'})
publisher = publisher.find('subfield', {'code': 'b'})
publisher = publisher.text
except ValueError:
publisher = ''
try:
catLang = record2.find('datafield', {'tag': '040'})
catLang = catLang.find('subfield', {'code': 'b'}).text
except ValueError:
catLang = ''
try:
physDesc = record2.find('datafield', {'tag': '300'})
physDesc = physDesc.find('subfield', {'code': 'a'}).text
except ValueError:
physDesc = ''
if type == ' ' and (catLang == 'eng' or catLang == ''):
f.writerow([bibNumber] + [searchTitle] + [oclcTitle]
+ [oclcNum] + [url] + [author] + [publisher]
+ [physDesc] + [encoding] + [date])
f.writerow([''] + [''] + [''] + [''] + [''] + [''] + [''] + ['']
+ [''] + [''])
else:
f2.writerow([bibNumber] + [searchTitle])