new: Shodan MM3H indexing
Rafiot committed Feb 26, 2024
1 parent 7e25747 commit decf887
Showing 9 changed files with 234 additions and 69 deletions.
28 changes: 28 additions & 0 deletions bin/background_indexer.py
@@ -33,6 +33,7 @@ def _to_run_forever(self) -> None:
        all_done = self._build_missing_pickles()
        if all_done:
            self._check_indexes()
            self._check_probabilistic_indexes()
        self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)

    def _build_missing_pickles(self) -> bool:
@@ -168,6 +169,33 @@ def _check_indexes(self) -> None:
        index_redis.delete('ongoing_indexing')
        self.logger.info('... done.')

    def _check_probabilistic_indexes(self) -> None:
        index_redis = self.lookyloo.indexing.redis
        can_index = index_redis.set('ongoing_probabilistic_indexing', 1, ex=3600, nx=True)
        if not can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Probabilistic indexing already ongoing in another process.')
            return None
        self.logger.info('Check probabilistic indexes...')
        algorithms = ['mmh3-shodan']
        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
            if self.lookyloo.is_public_instance and cache.no_index:
                # The capture is flagged as not-to-be-indexed on a public instance, skip it.
                continue
            p = index_redis.pipeline()
            for algorithm in algorithms:
                p.sismember(f'indexed_favicons_probabilistic|{algorithm}', cache.uuid)
            indexed = p.execute()
            if all(indexed):
                continue
            for i, algorithm in enumerate(algorithms):
                if not indexed[i]:
                    self.logger.info(f'Probabilistic indexing favicons for {cache.uuid} with {algorithm}')
                    favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
                    self.lookyloo.indexing.index_favicons_probabilistic(cache.uuid, favicons, algorithm)
        index_redis.delete('ongoing_probabilistic_indexing')
        self.logger.info('... done.')


def main() -> None:
    i = BackgroundIndexer()
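The guard at the top of _check_probabilistic_indexes is a classic best-effort distributed lock: a Redis SET with NX (only set the key if it does not already exist) and EX (auto-expire, here after an hour, so a crashed process cannot hold the lock forever). A minimal standalone sketch of that pattern, assuming a local Redis instance reachable with redis-py defaults:

# Minimal sketch of the SET NX/EX locking pattern used by
# _check_probabilistic_indexes; assumes a local Redis instance.
from redis import Redis

redis = Redis()

# set(..., nx=True) returns a truthy value only if the key did not exist.
if redis.set('ongoing_probabilistic_indexing', 1, ex=3600, nx=True):
    try:
        ...  # run the indexing pass
    finally:
        # The method above deletes the key unconditionally when done;
        # a try/finally makes that explicit in this sketch.
        redis.delete('ongoing_probabilistic_indexing')
else:
    print('Indexing already ongoing in another process, skipping.')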
2 changes: 1 addition & 1 deletion bin/background_processing.py
@@ -99,7 +99,7 @@ def _retry_failed_enqueue(self) -> None:
                to_requeue.append(uuid)

        for uuid in to_requeue:
            if self.lookyloo.redis.zscore('to_capture', uuid) is None:
                # The capture has been processed in the meantime.
                continue
            self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.')
52 changes: 52 additions & 0 deletions lookyloo/indexing.py
@@ -2,6 +2,7 @@

from __future__ import annotations

import base64
import hashlib
import logging
# import re
@@ -11,6 +12,8 @@
from urllib.parse import urlsplit
from zipfile import ZipFile

import mmh3

from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection
@@ -372,6 +375,55 @@ def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
    def get_favicon(self, favicon_sha512: str) -> bytes | None:
        return self.redis_bytes.get(f'favicons|{favicon_sha512}')

    # ###### favicons probabilistic hashes ######

    def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None:
        return self.redis.zscore(f'favicons|{algorithm}', phash)

    def index_favicons_probabilistic(self, capture_uuid: str, favicons: BytesIO, algorithm: str) -> None:
        if self.redis.sismember(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid)
        pipeline = self.redis.pipeline()
        with ZipFile(favicons, 'r') as myzip:
            for name in myzip.namelist():
                if not name.endswith('.ico'):
                    continue
                favicon = myzip.read(name)
                if not favicon:
                    # Empty file, ignore.
                    continue
                sha = hashlib.sha512(favicon).hexdigest()
                if algorithm == 'mmh3-shodan':
                    # Shodan uses a weird technique:
                    # 1. encode the image to base64, with newlines every 76 characters (as per RFC 2045)
                    # 2. hash the base64 string with mmh3
                    b64 = base64.encodebytes(favicon)
                    h = str(mmh3.hash(b64))
                else:
                    raise NotImplementedError(f'Unknown algorithm: {algorithm}')
                pipeline.zincrby(f'favicons|{algorithm}', 1, h)
                # All captures with this hash for this algorithm
                pipeline.sadd(f'favicons|{algorithm}|{h}|captures', capture_uuid)
                # All favicons (sha512) with this hash for this algorithm
                pipeline.sadd(f'favicons|{algorithm}|{h}|favicons', sha)
                # Reverse lookup to get the probabilistic hashes related to a specific favicon
                pipeline.sadd(f'favicons|{algorithm}|{sha}', h)
        pipeline.execute()

    def get_hashes_favicon_probabilistic(self, algorithm: str, phash: str) -> set[str]:
        '''All the favicon sha512s for this probabilistic hash for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{phash}|favicons')

    def get_probabilistic_hashes_favicon(self, algorithm: str, favicon_sha512: str) -> set[str]:
        '''All the probabilistic hashes for this favicon sha512 for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{favicon_sha512}')

    def get_captures_favicon_probabilistic(self, algorithm: str, phash: str) -> set[str]:
        '''All the captures with this probabilistic hash for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{phash}|captures')

    # ###### Categories ######

    @property
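The mmh3-shodan branch above can be reproduced outside Lookyloo. A hedged standalone sketch, assuming a favicon.ico file on disk and the mmh3 package installed: base64-encode the raw bytes with base64.encodebytes (which wraps lines at 76 characters, per RFC 2045) and hash the result with MurmurHash3.

import base64

import mmh3  # pip install mmh3

# Hypothetical input file; any raw favicon bytes work.
with open('favicon.ico', 'rb') as f:
    favicon = f.read()

b64 = base64.encodebytes(favicon)  # bytes, newline every 76 chars, trailing newline
shodan_hash = mmh3.hash(b64)       # signed 32-bit integer, as Shodan reports it
print(shodan_hash)

The value should match what Shodan exposes for the same icon, e.g. in a query like http.favicon.hash:<value>. In Redis, the indexing method then counts each hash in the sorted set favicons|mmh3-shodan (via zincrby) and keeps three sets per hash linking it to capture UUIDs, favicon sha512s, and back from each sha512 to its probabilistic hashes.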
39 changes: 37 additions & 2 deletions lookyloo/lookyloo.py
@@ -34,6 +34,7 @@
                                  CaptureSettings as CaptureSettingsCore)
from PIL import Image, UnidentifiedImageError
from playwrightcapture import get_devices
from puremagic import from_string  # type: ignore[import-untyped]
from pylacus import (PyLacus,
                     CaptureStatus as CaptureStatusPy
                     # CaptureResponse as CaptureResponsePy,
@@ -1055,12 +1056,46 @@ def get_cookie_name_investigator(self, cookie_name: str, /) -> tuple[list[tuple[
                   for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
        return captures, domains

    def get_favicon_investigator(self, favicon_sha512: str,
                                 /,
                                 get_probabilistic: bool=True) -> tuple[list[tuple[str, str, str, datetime]],
                                                                        tuple[str, str],
                                                                        dict[str, dict[str, dict[str, tuple[str, str]]]]]:
        '''Returns all the captures related to a favicon, used in the web interface.'''
        cached_captures = self.sorted_capture_cache([uuid for uuid in self.indexing.get_captures_favicon(favicon_sha512)])
        captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
        favicon = self.indexing.get_favicon(favicon_sha512)
        if favicon:
            mimetype = from_string(favicon, mime=True)
            b64_favicon = base64.b64encode(favicon).decode()
        else:
            mimetype = ''
            b64_favicon = ''

        # For now, there is only one probabilistic hash algorithm for favicons, keeping it simple
        probabilistic_hash_algos = ['mmh3-shodan']
        probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {}
        if get_probabilistic:
            for algo in probabilistic_hash_algos:
                probabilistic_favicons[algo] = {}
                for mm3hash in self.indexing.get_probabilistic_hashes_favicon(algo, favicon_sha512):
                    probabilistic_favicons[algo][mm3hash] = {}
                    for sha512 in self.indexing.get_hashes_favicon_probabilistic(algo, mm3hash):
                        if sha512 == favicon_sha512:
                            # Skip the entry if it is the favicon we are investigating
                            continue
                        similar_favicon = self.indexing.get_favicon(sha512)
                        if similar_favicon:
                            # Use dedicated names so we do not clobber the mimetype and
                            # base64 body of the favicon under investigation.
                            similar_mimetype = from_string(similar_favicon, mime=True)
                            similar_b64 = base64.b64encode(similar_favicon).decode()
                            probabilistic_favicons[algo][mm3hash][sha512] = (similar_mimetype, similar_b64)
                    if not probabilistic_favicons[algo][mm3hash]:
                        # Remove the entry if it has no favicon
                        probabilistic_favicons[algo].pop(mm3hash)
                if not probabilistic_favicons[algo]:
                    # Remove the entry if it has no hash
                    probabilistic_favicons.pop(algo)
        return captures, (mimetype, b64_favicon), probabilistic_favicons

    def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
        '''Returns all the captures related to an hhh entry, used in the web interface.'''
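A hedged usage sketch of the new return value, assuming an already-instantiated Lookyloo object and a known favicon SHA512 (both hypothetical names here): the second element is the investigated favicon itself, and the third maps algorithm to probabilistic hash to sha512 to a (mimetype, base64 body) pair.

# 'lookyloo' and 'favicon_sha512' are assumed to exist in scope.
captures, (mimetype, b64_favicon), probabilistic = lookyloo.get_favicon_investigator(favicon_sha512)

if b64_favicon:
    # A data URI the web interface could render directly.
    data_uri = f'data:{mimetype};base64,{b64_favicon}'

for algo, hashes in probabilistic.items():           # e.g. 'mmh3-shodan'
    for phash, similar in hashes.items():            # probabilistic hash -> similar favicons
        for sha512, (mime, b64) in similar.items():
            print(f'{algo} {phash}: favicon {sha512[:16]}... ({mime})')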
