-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
executable file
·115 lines (97 loc) · 4.97 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
import re
import os
import pandas as pd
import csv
import argparse
from nerodia.browser import Browser # Needs selenium == 4.9.1
from amazoncaptcha import AmazonCaptcha
class AmazonReviews():
    """Scrape the reviews of one Amazon product (ASIN) into a CSV file.

    The scraper drives a headless Chrome browser through nerodia, visits the
    review pages of the product on every domain in ``self.countries`` for
    every filter in ``self.review_filter``, and appends the extracted reviews
    to ``amz-reviews-<asin>.csv`` in the current working directory.
    """

    # Column order of the CSV file; also the keys of the dicts built by scrape().
    CSV_HEADERS = ['date', 'country', 'stars', 'reviewer', 'title', 'body']

    # Number of scraped reviews buffered in memory before they are written out.
    BATCH_SIZE = 10

    # Matches the "... in <country> on <date>" text of a review-date element.
    # The first group is greedy, so the split happens at the LAST " on ".
    _DATE_AND_COUNTRY_RE = re.compile(r'in (.*) on (.*)')

    def __init__(self, asin, page_start = None, page_end = None) -> None:
        """Store the scraping parameters.

        Args:
            asin: Amazon product identifier used in every URL and in the
                output file name.
            page_start: First review page number to request. ``None`` keeps
                the historical default of 0.
            page_end: Last review page number to request (inclusive).
                ``None`` keeps the historical default of 10.
        """
        self.asin = asin
        self.page_start = page_start
        self.file_name = f'amz-reviews-{self.asin}.csv'
        self.page_end = page_end
        # Domain suffixes substituted into "https://www.amazon.<suffix>/...".
        self.countries = ['co.uk','ca','com']
        # Values sent as the "filterByStar" URL parameter of the review pages.
        self.review_filter = ['positive','critical', 'five_star', 'one_star', 'two_star', 'three_star', 'four_star']

    def pages_numbers(self) -> list[int]:
        """Return the review page numbers to visit, in ascending order.

        Without ``page_start``/``page_end`` this is ``0..10``, exactly what
        the scraper always requested. When either bound was given to the
        constructor it replaces the corresponding default; ``page_end`` is
        inclusive.
        """
        first_page = 0 if self.page_start is None else self.page_start
        last_page = 10 if self.page_end is None else self.page_end
        return list(range(first_page, last_page + 1))

    @staticmethod
    def _parse_review_date(text, default_country):
        """Split the text of a review-date element into ``(date, country)``.

        Args:
            text: Raw element text, e.g.
                ``"Reviewed in the United Kingdom on 12 June 2023"``.
            default_country: Value returned as the country when the text does
                not name one.

        Returns:
            A ``(date, country)`` tuple of strings.
        """
        match = AmazonReviews._DATE_AND_COUNTRY_RE.search(text)
        if match is not None:
            # Splitting at the last " on " keeps country names that end in
            # "on" (e.g. "Lebanon") intact instead of yielding an empty date.
            return match.group(2).strip(), match.group(1)
        # No "in <country> on <date>" shape: keep whatever follows the last
        # "on ", or the whole text when there is none, rather than raising.
        _, separator, tail = text.rpartition('on ')
        date = tail if separator else text
        return date.strip(), default_country

    def scrape(self, review, country):
        """Extract the fields of one review element.

        Args:
            review: Browser element of a single review (a ``div`` with class
                ``review`` as collected by ``fetch``).
            country: Fallback country, used when the review-date text does
                not name one.

        Returns:
            A dict with the keys listed in ``CSV_HEADERS``.
        """
        date_text = review.span(class_name = 'review-date').text
        date_clean, country_clean = self._parse_review_date(date_text, country)
        # Keep only the part of the rating text that precedes " out".
        stars_clean = review.span(class_name = 'a-icon-alt').innertext.split(' out')[0]
        title_link = review.link(class_name = 'review-title-content')
        title = title_link.text if title_link.exist else "no title found"
        return {
            'date': date_clean,
            'country': country_clean,
            'stars': stars_clean,
            'reviewer': review.span(class_name = 'a-profile-name').text,
            'title': title,
            # Flatten multi-line bodies so each review stays on one CSV row.
            'body': review.span(class_name = 'review-text-content').text.replace('\n',' ')
        }

    def save(self, review):
        """Append review dicts to the CSV file, creating it when needed.

        Args:
            review: Iterable of dicts keyed by ``CSV_HEADERS`` (despite the
                singular name, a list of reviews is expected).

        The header row is written only when the file does not exist yet.
        """
        is_new_file = not os.path.isfile(self.file_name)
        # Mode 'a' creates a missing file, so one code path serves both cases.
        with open(self.file_name, 'a', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.CSV_HEADERS)
            if is_new_file:
                writer.writeheader()
            writer.writerows(review)

    def check_product_exist(self, browser, country):
        """Open the product page on one Amazon domain and report if it exists.

        Args:
            browser: Browser instance used for navigation.
            country: Domain suffix, e.g. ``'co.uk'``.

        Returns:
            ``False`` when the page title is "Page Not Found", else ``True``.
        """
        browser.goto(f'https://www.amazon.{country}/-/dp/{self.asin}/?_encoding=UTF8')
        if browser.title == "Page Not Found":
            print(f"This product doesnt exist with this ASIN in this country, skipping country {country}")
            return False
        return True

    def _solve_captcha(self, browser):
        """Solve the captcha on the current page, if one is displayed."""
        text_field = browser.text_field(id='captchacharacters')
        if not text_field.exist:
            return
        for img in browser.images():
            if 'captcha' in img.src:
                solution = AmazonCaptcha.fromlink(img.src).solve()
                text_field.value = solution
                browser.button(class_name='a-button-text').click()
                # The click submits the form; stop here instead of touching
                # the remaining images of a page that may be navigating away.
                break

    def fetch(self):
        """Scrape every country / filter / page combination into the CSV file.

        Reviews are buffered and written in batches of ``BATCH_SIZE``. The
        remaining partial batch is written and the browser is closed even
        when scraping stops with an exception.

        Returns:
            0, for compatibility with existing callers.
        """
        browser = Browser(browser='chrome', options={'headless': True}) # Set this to False to see what the browser is doing
        pending = []
        try:
            for country in self.countries:
                if not self.check_product_exist(browser, country):
                    continue
                self._solve_captcha(browser)
                for filt in self.review_filter:
                    for page in self.pages_numbers():
                        browser.goto(
                            f'https://www.amazon.{country}/product-reviews/{self.asin}/'
                            f'?reviewerType=all_reviews&sortBy=recent&pageNumber={page}'
                            f'&filterByStar={filt}'
                        )
                        print(f'fetching page {page} for product {self.asin} in {country} (using filter: {filt})')
                        reviews_in_page = browser.divs(class_name='review')
                        if len(reviews_in_page) == 0:
                            print(f"error, found no reviews in page {page}")
                            continue
                        for review in reviews_in_page:
                            pending.append(self.scrape(review, country))
                            if len(pending) >= self.BATCH_SIZE:
                                self.save(pending)
                                pending.clear()
        finally:
            try:
                # Flush the last partial batch; it used to be dropped.
                if pending:
                    self.save(pending)
            finally:
                # Always release the browser / driver process.
                browser.close()
        return 0

    def deduplicate(self):
        """Rewrite the CSV file without duplicated rows (no-op if missing)."""
        if not os.path.isfile(self.file_name):
            return
        data = pd.read_csv(self.file_name)
        data = data.drop_duplicates()
        data.to_csv(self.file_name, index=False)
if __name__ == '__main__':
    # Command-line entry point: scrape every review of the product given via
    # --asin, then strip duplicated rows from the resulting CSV file.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--asin",
        type=str,
        required = True,
        help="ASIN (unique Amazon product ID), such as: B08SJXX6YY",
    )
    product_asin = cli.parse_args().asin
    scraper = AmazonReviews(product_asin)
    scraper.fetch()
    # The same review can be collected more than once during a run, so the
    # file is de-duplicated once scraping is finished.
    scraper.deduplicate()