Avoid reading duplicate files
akariv committed Feb 20, 2025
1 parent 971fc03 commit d62f357
Showing 1 changed file with 7 additions and 1 deletion: odds/backend/scanner/website/website_scanner.py

@@ -44,6 +44,7 @@ class Scraper:
     WORKER_COUNT = 5
     PERIOD = 0.25
     CACHE = CACHE_DIR / 'web-scraper'
+    CACHE_HASHES = CACHE / 'hashes'
     WS = re.compile(r'\s+', re.UNICODE | re.MULTILINE)
     ALLOWED_TAGS = {'a', 'abbr', 'acronym', 'b', 'blockquote', 'code',
                     'em', 'i', 'li', 'ol', 'strong', 'ul', 'table', 'tr', 'td', 'th', 'tbody', 'thead', 'title'}
@@ -57,6 +58,7 @@ def __init__(self, base_urls) -> None:
         self.all_urls = set()
         self.all_hashes = set()
         self.CACHE.mkdir(parents=True, exist_ok=True)
+        self.CACHE_HASHES.mkdir(parents=True, exist_ok=True)
 
     async def queue(self, url: str) -> None:
         if url not in self.all_urls:
@@ -97,6 +99,10 @@ async def scrape(self, url: str) -> list[str]:
             content = data.get('content')
             content_type = data.get('content_type')
             final_url = data.get('final_url')
+            content_hash = sha256(content_.encode()).hexdigest()
+            content_hash_file = self.CACHE_HASHES / f'{content_hash}.touch'
+            if not content_hash_file.exists():
+                content_hash_file.open('w').write(content_hash).close()
             print(f'GOT FROM CACHE: {url} -> {final_url}')
 
         if content is None:
@@ -109,7 +115,7 @@ async def scrape(self, url: str) -> list[str]:
             if content_type.startswith('text/html'):
                 content_ = r.text
                 content_hash = sha256(content_.encode()).hexdigest()
-                content_hash_file = self.CACHE / f'{content_hash}.touch'
+                content_hash_file = self.CACHE_HASHES / f'{content_hash}.touch'
                 if not content_hash_file.exists():
                     content = content_
                     content_hash_file.open('w').write(content_hash).close()
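
In short, whenever page text is obtained (loaded from the local cache or freshly fetched), its SHA-256 digest is recorded as a marker file under a dedicated CACHE_HASHES directory, so content already seen under another URL can be recognized and skipped instead of being read again. Below is a minimal, self-contained sketch of that marker-file idea, assuming only the Python standard library; the class and method names are illustrative and not from the repository, it hashes whatever content string is in hand, and it writes the marker with Path.write_text rather than the chained open('w').write(...).close() call shown in the diff.

    # A sketch of marker-file deduplication, assuming only the standard library.
    # HashDedup and seen_before are hypothetical names, not taken from the repository.
    from hashlib import sha256
    from pathlib import Path


    class HashDedup:

        def __init__(self, cache_dir: Path) -> None:
            # One marker file per content hash lives under <cache_dir>/hashes.
            self.hashes_dir = cache_dir / 'hashes'
            self.hashes_dir.mkdir(parents=True, exist_ok=True)

        def seen_before(self, content: str) -> bool:
            # Identical page text fetched from different URLs produces the same
            # digest, and therefore the same marker path.
            content_hash = sha256(content.encode()).hexdigest()
            marker = self.hashes_dir / f'{content_hash}.touch'
            if marker.exists():
                return True
            # Record the digest so later fetches of the same content are skipped.
            marker.write_text(content_hash)
            return False


    if __name__ == '__main__':
        dedup = HashDedup(Path('/tmp/web-scraper'))
        print(dedup.seen_before('<html>hello</html>'))  # False: first time seen
        print(dedup.seen_before('<html>hello</html>'))  # True: duplicate, skip it

With this scheme a repeated page costs only an exists() check, and the marker files persist across runs, so previously processed content stays deduplicated between scraper invocations.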