new: Shodan MM3H indexing
Rafiot committed Feb 26, 2024
1 parent 7e25747 commit decf887
Showing 9 changed files with 234 additions and 69 deletions.
28 changes: 28 additions & 0 deletions bin/background_indexer.py
@@ -33,6 +33,7 @@ def _to_run_forever(self) -> None:
        all_done = self._build_missing_pickles()
        if all_done:
            self._check_indexes()
            self._check_probabilistic_indexes()
        self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)

    def _build_missing_pickles(self) -> bool:
@@ -168,6 +169,33 @@ def _check_indexes(self) -> None:
        index_redis.delete('ongoing_indexing')
        self.logger.info('... done.')

    def _check_probabilistic_indexes(self) -> None:
        index_redis = self.lookyloo.indexing.redis
        can_index = index_redis.set('ongoing_probabilistic_indexing', 1, ex=3600, nx=True)
        if not can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Probabilistic indexing already ongoing in another process.')
            return None
        self.logger.info('Check probabilistic indexes...')
        algorithms = ['mmh3-shodan']
        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
            if self.lookyloo.is_public_instance and cache.no_index:
                # The capture is flagged as not-to-be-indexed on a public instance, skip it.
                continue
            p = index_redis.pipeline()
            for algorithm in algorithms:
                p.sismember(f'indexed_favicons_probabilistic|{algorithm}', cache.uuid)
            indexed = p.execute()
            if all(indexed):
                continue
            for i, algorithm in enumerate(algorithms):
                if not indexed[i]:
                    self.logger.info(f'Probabilistic indexing favicons for {cache.uuid} with {algorithm}')
                    favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
                    self.lookyloo.indexing.index_favicons_probabilistic(cache.uuid, favicons, algorithm)
        index_redis.delete('ongoing_probabilistic_indexing')
        self.logger.info('... done.')


def main() -> None:
    i = BackgroundIndexer()
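The guard at the top of _check_probabilistic_indexes is a classic best-effort distributed lock: a Redis SET with NX (only set the key if it does not already exist) and EX (auto-expire, here after an hour, so a crashed process cannot hold the lock forever). A minimal standalone sketch of that pattern, assuming a local Redis instance reachable with redis-py defaults:

# Minimal sketch of the SET NX/EX locking pattern used by
# _check_probabilistic_indexes; assumes a local Redis instance.
from redis import Redis

redis = Redis()

# set(..., nx=True) returns a truthy value only if the key did not exist.
if redis.set('ongoing_probabilistic_indexing', 1, ex=3600, nx=True):
    try:
        ...  # run the indexing pass
    finally:
        # The method above deletes the key unconditionally when done;
        # a try/finally makes that explicit in this sketch.
        redis.delete('ongoing_probabilistic_indexing')
else:
    print('Indexing already ongoing in another process, skipping.')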
2 changes: 1 addition & 1 deletion bin/background_processing.py
@@ -99,7 +99,7 @@ def _retry_failed_enqueue(self) -> None:
                to_requeue.append(uuid)

        for uuid in to_requeue:
            if self.lookyloo.redis.zscore('to_capture', uuid) is None:
                # The capture has been processed in the meantime.
                continue
            self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.')
52 changes: 52 additions & 0 deletions lookyloo/indexing.py
@@ -2,6 +2,7 @@

from __future__ import annotations

import base64
import hashlib
import logging
# import re
@@ -11,6 +12,8 @@
from urllib.parse import urlsplit
from zipfile import ZipFile

import mmh3

from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection
@@ -372,6 +375,55 @@ def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
    def get_favicon(self, favicon_sha512: str) -> bytes | None:
        return self.redis_bytes.get(f'favicons|{favicon_sha512}')

    # ###### favicons probabilistic hashes ######

    def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None:
        return self.redis.zscore(f'favicons|{algorithm}', phash)

    def index_favicons_probabilistic(self, capture_uuid: str, favicons: BytesIO, algorithm: str) -> None:
        if self.redis.sismember(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid)
        pipeline = self.redis.pipeline()
        with ZipFile(favicons, 'r') as myzip:
            for name in myzip.namelist():
                if not name.endswith('.ico'):
                    continue
                favicon = myzip.read(name)
                if not favicon:
                    # Empty file, ignore.
                    continue
                sha = hashlib.sha512(favicon).hexdigest()
                if algorithm == 'mmh3-shodan':
                    # Shodan uses a weird technique:
                    # 1. encode the image to base64, with newlines every 76 characters (as per RFC 2045)
                    # 2. hash the base64 string with mmh3
                    b64 = base64.encodebytes(favicon)
                    h = str(mmh3.hash(b64))
                else:
                    raise NotImplementedError(f'Unknown algorithm: {algorithm}')
                pipeline.zincrby(f'favicons|{algorithm}', 1, h)
                # All captures with this hash for this algorithm
                pipeline.sadd(f'favicons|{algorithm}|{h}|captures', capture_uuid)
                # All favicons (sha512) with this hash for this algorithm
                pipeline.sadd(f'favicons|{algorithm}|{h}|favicons', sha)
                # Reverse lookup to get the probabilistic hashes related to a specific favicon
                pipeline.sadd(f'favicons|{algorithm}|{sha}', h)
        pipeline.execute()

    def get_hashes_favicon_probabilistic(self, algorithm: str, phash: str) -> set[str]:
        '''All the favicon sha512s for this probabilistic hash for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{phash}|favicons')

    def get_probabilistic_hashes_favicon(self, algorithm: str, favicon_sha512: str) -> set[str]:
        '''All the probabilistic hashes for this favicon sha512 for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{favicon_sha512}')

    def get_captures_favicon_probabilistic(self, algorithm: str, phash: str) -> set[str]:
        '''All the captures with this probabilistic hash for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{phash}|captures')

    # ###### Categories ######

    @property
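The mmh3-shodan branch above can be reproduced outside Lookyloo. A hedged standalone sketch, assuming a favicon.ico file on disk and the mmh3 package installed: base64-encode the raw bytes with base64.encodebytes (which wraps lines at 76 characters, per RFC 2045) and hash the result with MurmurHash3.

import base64

import mmh3  # pip install mmh3

# Hypothetical input file; any raw favicon bytes work.
with open('favicon.ico', 'rb') as f:
    favicon = f.read()

b64 = base64.encodebytes(favicon)  # bytes, newline every 76 chars, trailing newline
shodan_hash = mmh3.hash(b64)       # signed 32-bit integer, as Shodan reports it
print(shodan_hash)

The value should match what Shodan exposes for the same icon, e.g. in a query like http.favicon.hash:<value>. In Redis, the indexing method then counts each hash in the sorted set favicons|mmh3-shodan (via zincrby) and keeps three sets per hash linking it to capture UUIDs, favicon sha512s, and back from each sha512 to its probabilistic hashes.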
39 changes: 37 additions & 2 deletions lookyloo/lookyloo.py
@@ -34,6 +34,7 @@
                                  CaptureSettings as CaptureSettingsCore)
from PIL import Image, UnidentifiedImageError
from playwrightcapture import get_devices
from puremagic import from_string  # type: ignore[import-untyped]
from pylacus import (PyLacus,
                     CaptureStatus as CaptureStatusPy
                     # CaptureResponse as CaptureResponsePy,
@@ -1055,12 +1056,46 @@ def get_cookie_name_investigator(self, cookie_name: str, /) -> tuple[list[tuple[
                   for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
        return captures, domains

    def get_favicon_investigator(self, favicon_sha512: str,
                                 /,
                                 get_probabilistic: bool=True) -> tuple[list[tuple[str, str, str, datetime]],
                                                                        tuple[str, str],
                                                                        dict[str, dict[str, dict[str, tuple[str, str]]]]]:
        '''Returns all the captures related to a favicon, used in the web interface.'''
        cached_captures = self.sorted_capture_cache([uuid for uuid in self.indexing.get_captures_favicon(favicon_sha512)])
        captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
        favicon = self.indexing.get_favicon(favicon_sha512)
        if favicon:
            mimetype = from_string(favicon, mime=True)
            b64_favicon = base64.b64encode(favicon).decode()
        else:
            mimetype = ''
            b64_favicon = ''

        # For now, there is only one probabilistic hash algorithm for favicons, keeping it simple
        probabilistic_hash_algos = ['mmh3-shodan']
        probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {}
        if get_probabilistic:
            for algo in probabilistic_hash_algos:
                probabilistic_favicons[algo] = {}
                for mm3hash in self.indexing.get_probabilistic_hashes_favicon(algo, favicon_sha512):
                    probabilistic_favicons[algo][mm3hash] = {}
                    for sha512 in self.indexing.get_hashes_favicon_probabilistic(algo, mm3hash):
                        if sha512 == favicon_sha512:
                            # Skip the entry if it is the favicon we are investigating
                            continue
                        similar_favicon = self.indexing.get_favicon(sha512)
                        if similar_favicon:
                            # Use dedicated names so we do not clobber the mimetype and
                            # base64 body of the favicon under investigation.
                            similar_mimetype = from_string(similar_favicon, mime=True)
                            similar_b64 = base64.b64encode(similar_favicon).decode()
                            probabilistic_favicons[algo][mm3hash][sha512] = (similar_mimetype, similar_b64)
                    if not probabilistic_favicons[algo][mm3hash]:
                        # Remove the entry if it has no favicon
                        probabilistic_favicons[algo].pop(mm3hash)
                if not probabilistic_favicons[algo]:
                    # Remove the entry if it has no hash
                    probabilistic_favicons.pop(algo)
        return captures, (mimetype, b64_favicon), probabilistic_favicons

    def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
        '''Returns all the captures related to an hhh entry, used in the web interface.'''
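A hedged usage sketch of the new return value, assuming an already-instantiated Lookyloo object and a known favicon SHA512 (both hypothetical names here): the second element is the investigated favicon itself, and the third maps algorithm to probabilistic hash to sha512 to a (mimetype, base64 body) pair.

# 'lookyloo' and 'favicon_sha512' are assumed to exist in scope.
captures, (mimetype, b64_favicon), probabilistic = lookyloo.get_favicon_investigator(favicon_sha512)

if b64_favicon:
    # A data URI the web interface could render directly.
    data_uri = f'data:{mimetype};base64,{b64_favicon}'

for algo, hashes in probabilistic.items():           # e.g. 'mmh3-shodan'
    for phash, similar in hashes.items():            # probabilistic hash -> similar favicons
        for sha512, (mime, b64) in similar.items():
            print(f'{algo} {phash}: favicon {sha512[:16]}... ({mime})')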
