Skip to content

Commit

Permalink
new: Add favicons in indexer
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafiot committed Feb 19, 2024
1 parent a795d08 commit 4153138
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 0 deletions.
5 changes: 5 additions & 0 deletions bin/background_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def _check_indexes(self) -> None:
p.sismember('indexed_body_hashes', cache.uuid)
p.sismember('indexed_cookies', cache.uuid)
p.sismember('indexed_hhhashes', cache.uuid)
p.sismember('indexed_favicons', cache.uuid)
indexed = p.execute()
if all(indexed):
continue
Expand All @@ -158,6 +159,10 @@ def _check_indexes(self) -> None:
if not indexed[3]:
self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
if not indexed[4]:
self.logger.info(f'Indexing favicons for {cache.uuid}')
favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
self.lookyloo.indexing.index_favicons_capture(cache.uuid, favicons)
# NOTE: categories aren't taken into account here, should be fixed(?)
# see indexing.index_categories_capture(capture_uuid, categories)
index_redis.delete('ongoing_indexing')
Expand Down
44 changes: 44 additions & 0 deletions lookyloo/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
import hashlib
import logging
# import re
from io import BytesIO
from collections import defaultdict
from typing import Iterable
from urllib.parse import urlsplit
from zipfile import ZipFile

from har2tree import CrawledTree
from redis import ConnectionPool, Redis
Expand All @@ -22,12 +24,18 @@ class Indexing():
def __init__(self) -> None:
    '''Set up the logger and two connection pools on the 'indexing' unix socket.

    ``redis_pool_bytes`` is created without ``decode_responses``, while
    ``redis_pool`` is created with ``decode_responses=True``.
    '''
    self.logger = logging.getLogger(type(self).__name__)
    self.logger.setLevel(get_config('generic', 'loglevel'))
    # Pool without decode_responses; get_favicon reads through it and is annotated as returning bytes.
    self.redis_pool_bytes: ConnectionPool = ConnectionPool(
        connection_class=UnixDomainSocketConnection,
        path=get_socket_path('indexing'),
    )
    # Pool with decode_responses=True, used by the `redis` property.
    self.redis_pool: ConnectionPool = ConnectionPool(
        connection_class=UnixDomainSocketConnection,
        path=get_socket_path('indexing'),
        decode_responses=True,
    )

def clear_indexes(self) -> None:
    '''Call ``flushdb()`` on the connection returned by ``self.redis``.'''
    connection = self.redis
    connection.flushdb()

@property
def redis_bytes(self) -> Redis:  # type: ignore[type-arg]
    '''Return a new Redis client bound to ``self.redis_pool_bytes``.'''
    pool = self.redis_pool_bytes
    return Redis(connection_pool=pool)

@property
def redis(self) -> Redis:  # type: ignore[type-arg]
    '''Return a new Redis client bound to ``self.redis_pool``.'''
    pool = self.redis_pool
    return Redis(connection_pool=pool)
Expand Down Expand Up @@ -325,6 +333,42 @@ def get_captures_url(self, url: str) -> set[str]:
def get_captures_hostname(self, hostname: str) -> set[str]:
    '''Return the members of the Redis set ``hostnames|<hostname>|captures``.'''
    captures_key = f'hostnames|{hostname}|captures'
    return self.redis.smembers(captures_key)

# ###### favicons ######

@property
def favicons(self) -> list[tuple[str, float]]:
    '''Return entries 0 to 200 of the ``favicons`` sorted set, in reverse order, with their scores.'''
    connection = self.redis
    return connection.zrevrange('favicons', 0, 200, withscores=True)

def favicon_number_captures(self, favicon_sha512: str) -> int:
    '''Return the cardinality of the Redis set ``favicons|<favicon_sha512>|captures``.'''
    captures_key = f'favicons|{favicon_sha512}|captures'
    return self.redis.scard(captures_key)

def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
    '''Index the favicons of a capture.

    :param capture_uuid: UUID of the capture the favicons belong to.
    :param favicons: In-memory zip archive. Only non-empty members whose
                     name ends with ``.ico`` are indexed.

    A capture already listed in ``indexed_favicons`` is skipped. Each distinct
    favicon (by SHA512) is counted once per capture, even if the archive
    holds the same content under several names.
    '''
    if self.redis.sismember('indexed_favicons', capture_uuid):
        # Do not reindex
        return
    self.redis.sadd('indexed_favicons', capture_uuid)
    pipeline = self.redis.pipeline()
    # Hashes already queued for this capture, so duplicates in the archive
    # do not inflate the score in the 'favicons' sorted set.
    seen_hashes: set[str] = set()
    with ZipFile(favicons, 'r') as myzip:
        for name in myzip.namelist():
            if not name.endswith('.ico'):
                continue
            favicon = myzip.read(name)
            if not favicon:
                # Empty file, ignore.
                continue
            sha = hashlib.sha512(favicon).hexdigest()
            if sha in seen_hashes:
                # Same favicon stored under another name, already counted.
                continue
            seen_hashes.add(sha)
            pipeline.zincrby('favicons', 1, sha)
            pipeline.sadd(f'favicons|{sha}|captures', capture_uuid)
            # There is no easy access to the favicons unless we store them in redis
            pipeline.set(f'favicons|{sha}', favicon)
    pipeline.execute()

def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
    '''Return the members of the Redis set ``favicons|<favicon_sha512>|captures``.'''
    captures_key = f'favicons|{favicon_sha512}|captures'
    return self.redis.smembers(captures_key)

def get_favicon(self, favicon_sha512: str) -> bytes | None:
return self.redis_bytes.get(f'favicons|{favicon_sha512}')

# ###### Categories ######

@property
Expand Down
7 changes: 7 additions & 0 deletions lookyloo/lookyloo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1046,6 +1046,13 @@ def get_cookie_name_investigator(self, cookie_name: str, /) -> tuple[list[tuple[
for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
return captures, domains

def get_favicon_investigator(self, favicon_sha512: str, /) -> tuple[list[tuple[str, str]], bytes | None]:
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
cached_captures = self.sorted_capture_cache([uuid for uuid in self.indexing.get_captures_favicon(favicon_sha512)])
captures = [(cache.uuid, cache.title) for cache in cached_captures]
favicon = self.indexing.get_favicon(favicon_sha512)
return captures, favicon

def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
all_captures = dict(self.indexing.get_http_headers_hashes_captures(hhh))
Expand Down
24 changes: 24 additions & 0 deletions website/web/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -945,6 +945,19 @@ def hhhashes_lookup() -> str:
return render_template('hhhashes.html', hhhashes=hhhashes)


@app.route('/favicons', methods=['GET'])
def favicons_lookup() -> str:
    '''Render favicons.html with one (sha512, frequency, number of captures, base64 favicon) tuple per indexed favicon.'''
    entries = []
    for favicon_hash, frequency in lookyloo.indexing.favicons:
        raw_icon = lookyloo.indexing.get_favicon(favicon_hash)
        if raw_icon:
            # Only favicons whose content is available are listed.
            encoded_icon = base64.b64encode(raw_icon).decode()
            capture_count = lookyloo.indexing.favicon_number_captures(favicon_hash)
            entries.append((favicon_hash, frequency, capture_count, encoded_icon))
    return render_template('favicons.html', favicons=entries)


@app.route('/ressources', methods=['GET'])
def ressources() -> str:
ressources = []
Expand Down Expand Up @@ -1206,6 +1219,17 @@ def hhh_detail(hhh: str) -> str:
return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers)


@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
def favicon_detail(favicon_sha512: str) -> str:
    '''Render favicon_details.html for one favicon hash: the related captures and the favicon, base64-encoded.'''
    captures, raw_favicon = lookyloo.get_favicon_investigator(favicon_sha512.strip())
    # An empty string is passed to the template when no favicon content was returned.
    encoded_favicon = base64.b64encode(raw_favicon).decode() if raw_favicon else ''
    return render_template('favicon_details.html', favicon_sha512=favicon_sha512,
                           captures=captures, b64_favicon=encoded_favicon)


@app.route('/body_hashes/<string:body_hash>', methods=['GET'])
def body_hash_details(body_hash: str) -> str:
from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False
Expand Down

0 comments on commit 4153138

Please sign in to comment.