From d62f3572ea39348ccfbb01a075140a3891206b15 Mon Sep 17 00:00:00 2001
From: Adam Kariv
Date: Thu, 20 Feb 2025 13:36:00 +0200
Subject: [PATCH] Avoid reading duplicate files

---
 odds/backend/scanner/website/website_scanner.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/odds/backend/scanner/website/website_scanner.py b/odds/backend/scanner/website/website_scanner.py
index d672363..674e8ce 100644
--- a/odds/backend/scanner/website/website_scanner.py
+++ b/odds/backend/scanner/website/website_scanner.py
@@ -44,6 +44,7 @@ class Scraper:
     WORKER_COUNT = 5
     PERIOD = 0.25
     CACHE = CACHE_DIR / 'web-scraper'
+    CACHE_HASHES = CACHE / 'hashes'
     WS = re.compile(r'\s+', re.UNICODE | re.MULTILINE)
     ALLOWED_TAGS = {'a', 'abbr', 'acronym', 'b', 'blockquote', 'code', 'em', 'i', 'li', 'ol', 'strong', 'ul',
                     'table', 'tr', 'td', 'th', 'tbody', 'thead', 'title'}
@@ -57,6 +58,7 @@ def __init__(self, base_urls) -> None:
         self.all_urls = set()
         self.all_hashes = set()
         self.CACHE.mkdir(parents=True, exist_ok=True)
+        self.CACHE_HASHES.mkdir(parents=True, exist_ok=True)

     async def queue(self, url: str) -> None:
         if url not in self.all_urls:
@@ -97,6 +99,10 @@ async def scrape(self, url: str) -> list[str]:
             content = data.get('content')
             content_type = data.get('content_type')
             final_url = data.get('final_url')
+            content_hash = sha256(content_.encode()).hexdigest()
+            content_hash_file = self.CACHE_HASHES / f'{content_hash}.touch'
+            if not content_hash_file.exists():
+                content_hash_file.open('w').write(content_hash).close()
             print(f'GOT FROM CACHE: {url} -> {final_url}')

         if content is None:
@@ -109,7 +115,7 @@
         if content_type.startswith('text/html'):
             content_ = r.text
             content_hash = sha256(content_.encode()).hexdigest()
-            content_hash_file = self.CACHE / f'{content_hash}.touch'
+            content_hash_file = self.CACHE_HASHES / f'{content_hash}.touch'
             if not content_hash_file.exists():
                 content = content_
                 content_hash_file.open('w').write(content_hash).close()
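
Note: the sketch below illustrates the deduplication pattern this patch applies: each content body that has been seen is marked with a '<sha256>.touch' file in a dedicated hashes directory, so identical content reached through different URLs is processed only once. This is a minimal standalone sketch; the function name, the hashes directory location, and the usage lines are illustrative assumptions, not code from the repository.

from hashlib import sha256
from pathlib import Path

# Illustrative location; the patch keeps its markers under CACHE / 'hashes'.
HASHES_DIR = Path('web-scraper-cache') / 'hashes'

def is_new_content(content: str, hashes_dir: Path = HASHES_DIR) -> bool:
    """Return True the first time a content body is seen, False afterwards.

    A '<sha256>.touch' marker file is created on first sight, so later
    scrapes of identical content can be skipped.
    """
    hashes_dir.mkdir(parents=True, exist_ok=True)
    content_hash = sha256(content.encode()).hexdigest()
    marker = hashes_dir / f'{content_hash}.touch'
    if marker.exists():
        return False
    marker.write_text(content_hash)
    return True

# Usage sketch (hypothetical caller):
# if is_new_content(page_html):
#     index_page(page_html)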