Avoid reading duplicate files
akariv committed Feb 20, 2025
1 parent 971fc03 commit d62f357
Showing 1 changed file with 7 additions and 1 deletion: odds/backend/scanner/website/website_scanner.py

@@ -44,6 +44,7 @@ class Scraper:
     WORKER_COUNT = 5
     PERIOD = 0.25
     CACHE = CACHE_DIR / 'web-scraper'
+    CACHE_HASHES = CACHE / 'hashes'
     WS = re.compile(r'\s+', re.UNICODE | re.MULTILINE)
     ALLOWED_TAGS = {'a', 'abbr', 'acronym', 'b', 'blockquote', 'code',
                     'em', 'i', 'li', 'ol', 'strong', 'ul', 'table', 'tr', 'td', 'th', 'tbody', 'thead', 'title'}
@@ -57,6 +58,7 @@ def __init__(self, base_urls) -> None:
         self.all_urls = set()
         self.all_hashes = set()
         self.CACHE.mkdir(parents=True, exist_ok=True)
+        self.CACHE_HASHES.mkdir(parents=True, exist_ok=True)
 
     async def queue(self, url: str) -> None:
         if url not in self.all_urls:
@@ -97,6 +99,10 @@ async def scrape(self, url: str) -> list[str]:
             content = data.get('content')
             content_type = data.get('content_type')
             final_url = data.get('final_url')
+            content_hash = sha256(content_.encode()).hexdigest()
+            content_hash_file = self.CACHE_HASHES / f'{content_hash}.touch'
+            if not content_hash_file.exists():
+                content_hash_file.open('w').write(content_hash).close()
             print(f'GOT FROM CACHE: {url} -> {final_url}')
 
         if content is None:
@@ -109,7 +115,7 @@ async def scrape(self, url: str) -> list[str]:
             if content_type.startswith('text/html'):
                 content_ = r.text
                 content_hash = sha256(content_.encode()).hexdigest()
-                content_hash_file = self.CACHE / f'{content_hash}.touch'
+                content_hash_file = self.CACHE_HASHES / f'{content_hash}.touch'
                 if not content_hash_file.exists():
                     content = content_
                     content_hash_file.open('w').write(content_hash).close()
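
In short, whenever page text is obtained (loaded from the local cache or freshly fetched), its SHA-256 digest is recorded as a marker file under a dedicated CACHE_HASHES directory, so content already seen under another URL can be recognized and skipped instead of being read again. Below is a minimal, self-contained sketch of that marker-file idea, assuming only the Python standard library; the class and method names are illustrative and not from the repository, it hashes whatever content string is in hand, and it writes the marker with Path.write_text rather than the chained open('w').write(...).close() call shown in the diff.

    # A sketch of marker-file deduplication, assuming only the standard library.
    # HashDedup and seen_before are hypothetical names, not taken from the repository.
    from hashlib import sha256
    from pathlib import Path


    class HashDedup:

        def __init__(self, cache_dir: Path) -> None:
            # One marker file per content hash lives under <cache_dir>/hashes.
            self.hashes_dir = cache_dir / 'hashes'
            self.hashes_dir.mkdir(parents=True, exist_ok=True)

        def seen_before(self, content: str) -> bool:
            # Identical page text fetched from different URLs produces the same
            # digest, and therefore the same marker path.
            content_hash = sha256(content.encode()).hexdigest()
            marker = self.hashes_dir / f'{content_hash}.touch'
            if marker.exists():
                return True
            # Record the digest so later fetches of the same content are skipped.
            marker.write_text(content_hash)
            return False


    if __name__ == '__main__':
        dedup = HashDedup(Path('/tmp/web-scraper'))
        print(dedup.seen_before('<html>hello</html>'))  # False: first time seen
        print(dedup.seen_before('<html>hello</html>'))  # True: duplicate, skip it

With this scheme a repeated page costs only an exists() check, and the marker files persist across runs, so previously processed content stays deduplicated between scraper invocations.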