Skip to content

Commit

Permalink
new: Index and views for identifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafiot committed Mar 13, 2024
1 parent 54ef3bf commit 0f4ef01
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 28 deletions.
5 changes: 4 additions & 1 deletion bin/background_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _to_run_forever(self) -> None:
# Don't need the cache in this class.
self.lookyloo.clear_tree_cache()

def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool], str], None, None]:
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool], str], None, None]:
# NOTE: only get the non-archived captures for now.
for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
if not self.full_indexer:
Expand Down Expand Up @@ -85,6 +85,9 @@ def _check_indexes(self) -> None:
self.logger.info(f'Indexing favicons for {uuid_to_index}')
favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
self.indexing.index_favicons_capture(uuid_to_index, favicons)
if not indexed[5]:
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
self.indexing.index_identifiers_capture(ct)
# NOTE: categories aren't taken into account here, should be fixed(?)
# see indexing.index_categories_capture(capture_uuid, categories)
self.indexing.indexing_done()
Expand Down
57 changes: 55 additions & 2 deletions lookyloo/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,16 +65,18 @@ def force_reindex(self, capture_uuid: str) -> None:
p.srem('indexed_cookies', capture_uuid)
p.srem('indexed_hhhashes', capture_uuid)
p.srem('indexed_favicons', capture_uuid)
p.srem('indexed_identifiers', capture_uuid)
p.execute()

def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool]:
    """Report which indexes already contain the given capture.

    One SISMEMBER per index set is queued on a single pipeline, so all six
    checks go out in one batch.

    :param capture_uuid: UUID of the capture to look up.
    :return: six booleans, in this order: urls, body hashes, cookies,
        hhhashes, favicons, identifiers.
    :raises ValueError: if the pipeline does not answer with exactly six values.
    """
    p = self.redis.pipeline()
    # NOTE: the order of this tuple is the order of the returned booleans.
    for index_name in ('indexed_urls', 'indexed_body_hashes', 'indexed_cookies',
                       'indexed_hhhashes', 'indexed_favicons', 'indexed_identifiers'):
        p.sismember(index_name, capture_uuid)
    # execute() hands back a list: unpack it so the method really returns the
    # 6-tuple it is annotated with (and fails loudly if the count is off).
    urls, body_hashes, cookies, hhhashes, favicons, identifiers = p.execute()
    return (bool(urls), bool(body_hashes), bool(cookies),
            bool(hhhashes), bool(favicons), bool(identifiers))

# ###### Cookies ######
Expand Down Expand Up @@ -365,6 +367,57 @@ def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
def get_favicon(self, favicon_sha512: str) -> bytes | None:
    """Fetch the favicon stored under its SHA-512 digest.

    Looks up the ``favicons|<favicon_sha512>`` key through the ``redis_bytes``
    client and returns the result of that lookup unchanged.
    """
    favicon_key = f'favicons|{favicon_sha512}'
    return self.redis_bytes.get(favicon_key)

# ###### identifiers ######

def identifiers_types(self) -> set[str]:
    """Return every member of the ``identifiers_types`` Redis set."""
    known_types = self.redis.smembers('identifiers_types')
    return known_types

def identifiers(self, identifier_type: str) -> list[tuple[str, float]]:
    """List the highest-scored identifiers of one type, with their scores.

    Queries the sorted set ``identifiers|<identifier_type>`` in descending
    score order for ranks 0 through 200. Both bounds are inclusive in
    ZREVRANGE, so at most 201 (identifier, score) pairs come back.
    """
    ranking_key = f'identifiers|{identifier_type}'
    return self.redis.zrevrange(ranking_key, 0, 200, withscores=True)

def identifier_frequency(self, identifier_type: str, identifier: str) -> float | None:
    """Return the score of *identifier* in the ``identifiers|<identifier_type>`` sorted set.

    The value is whatever ZSCORE answers; per the annotation that is a float,
    or ``None`` when there is no score to report.
    """
    ranking_key = f'identifiers|{identifier_type}'
    return self.redis.zscore(ranking_key, identifier)

def identifier_number_captures(self, identifier_type: str, identifier: str) -> int:
    """Count the captures recorded for one identifier.

    This is the cardinality of the set
    ``identifiers|<identifier_type>|<identifier>|captures``.
    """
    captures_key = f'identifiers|{identifier_type}|{identifier}|captures'
    return self.redis.scard(captures_key)

def index_identifiers_capture(self, crawled_tree: CrawledTree) -> None:
    """Index the identifiers attached to the rendered node of a capture.

    For every identifier type found on ``root_hartree.rendered_node.identifiers``
    the following keys are updated:

    * ``identifiers_types``: set of all known identifier types
    * ``indexed_identifiers|<type>|captures``: captures already processed for that type
    * ``identifiers|<capture_uuid>``: identifier types seen in the capture
    * ``identifiers|<capture_uuid>|<type>``: identifiers of that type in the capture
    * ``identifiers|<type>|<identifier>|captures``: captures containing the identifier
    * ``identifiers|<type>``: sorted set counting captures per identifier

    A capture already present in ``indexed_identifiers`` is left untouched.

    :param crawled_tree: tree of the capture to index.
    """
    capture_uuid = crawled_tree.uuid
    if self.redis.sismember('indexed_identifiers', capture_uuid):
        # Do not reindex
        return

    identifiers = getattr(crawled_tree.root_hartree.rendered_node, 'identifiers', None)
    if not identifiers:
        # Nothing to index, but remember that this capture was processed.
        self.redis.sadd('indexed_identifiers', capture_uuid)
        return

    pipeline = self.redis.pipeline()
    # Queue the "done" flag with the index writes instead of setting it upfront:
    # if anything fails before execute(), the capture isn't flagged as indexed
    # and will be picked up again on the next run.
    pipeline.sadd('indexed_identifiers', capture_uuid)
    # We have multiple identifiers types, this is the difference with the other indexes
    for identifier_type, id_values in identifiers.items():
        pipeline.sadd('identifiers_types', identifier_type)  # no-op if already there
        if self.redis.sismember(f'indexed_identifiers|{identifier_type}|captures', capture_uuid):
            # Do not reindex the same identifier type for the same capture
            continue
        pipeline.sadd(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
        self.logger.debug(f'Indexing identifiers {identifier_type} for {capture_uuid} ... ')
        # The membership check below reads Redis *before* the queued commands
        # run, so it cannot catch a value repeated within this capture.
        # Deduplicate locally (keeping the order) to count each identifier once.
        for identifier in dict.fromkeys(id_values):
            if self.redis.sismember(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid):
                # Already counted this specific identifier for this capture
                continue
            pipeline.sadd(f'identifiers|{capture_uuid}', identifier_type)
            pipeline.sadd(f'identifiers|{capture_uuid}|{identifier_type}', identifier)
            pipeline.sadd(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid)
            pipeline.zincrby(f'identifiers|{identifier_type}', 1, identifier)
    pipeline.execute()

def get_identifiers_capture(self, capture_uuid: str) -> dict[str, set[str]]:
    """Collect the identifiers recorded for one capture, grouped by type.

    The identifier types come from the set ``identifiers|<capture_uuid>``; for
    each of them the members of ``identifiers|<capture_uuid>|<type>`` are
    fetched with a separate SMEMBERS call.
    """
    types_seen = self.redis.smembers(f'identifiers|{capture_uuid}')
    return {
        identifier_type: self.redis.smembers(f'identifiers|{capture_uuid}|{identifier_type}')
        for identifier_type in types_seen
    }

def get_captures_identifier(self, identifier_type: str, identifier: str) -> set[str]:
    """Return the capture UUIDs stored in ``identifiers|<identifier_type>|<identifier>|captures``."""
    captures_key = f'identifiers|{identifier_type}|{identifier}|captures'
    return self.redis.smembers(captures_key)

# ###### favicons probabilistic hashes ######

def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None:
Expand Down
48 changes: 24 additions & 24 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ pyhashlookup = "^1.2.2"
lief = "^0.14"
ua-parser = "^0.18.0"
Flask-Login = "^0.6.3"
har2tree = "^1.23.0"
har2tree = "^1.23.1"
passivetotal = "^2.5.9"
werkzeug = "^3.0.1"
filetype = "^1.2.0"
Expand Down
24 changes: 24 additions & 0 deletions website/web/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,11 @@ def get_cookie_name_investigator(cookie_name: str, /) -> tuple[list[tuple[str, s
return captures, domains


def get_identifier_investigator(identifier_type: str, identifier: str) -> list[tuple[str, str, str, datetime]]:
    """List the captures in which a given identifier was found.

    The capture UUIDs come from the index available to the current user and
    are passed through ``lookyloo.sorted_capture_cache``. Each returned entry
    is ``(uuid, title, last redirect, timestamp)``.
    """
    indexing = get_indexing(flask_login.current_user)
    capture_uuids = list(indexing.get_captures_identifier(identifier_type=identifier_type,
                                                          identifier=identifier))
    captures = []
    for cache in lookyloo.sorted_capture_cache(capture_uuids):
        # NOTE(review): assumes cache.redirects is never empty - TODO confirm
        captures.append((cache.uuid, cache.title, cache.redirects[-1], cache.timestamp))
    return captures


def get_favicon_investigator(favicon_sha512: str,
/,
get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
Expand Down Expand Up @@ -1187,6 +1192,17 @@ def mark_as_legitimate(tree_uuid: str) -> Response:
return jsonify({'message': 'Legitimate entry added.'})


@app.route('/tree/<string:tree_uuid>/identifiers', methods=['GET'])
def tree_identifiers(tree_uuid: str) -> str:
    """Render the list of identifiers indexed for a capture.

    Every entry handed to the ``tree_identifiers.html`` template is a
    ``(number of captures, identifier type, identifier)`` tuple.

    :param tree_uuid: UUID of the capture.
    """
    # The index only depends on the current user: resolve it once instead of
    # once per identifier.
    indexing = get_indexing(flask_login.current_user)
    to_return: list[tuple[int, str, str]] = []

    for id_type, identifiers in indexing.get_identifiers_capture(tree_uuid).items():
        for identifier in identifiers:
            nb_captures = indexing.identifier_number_captures(id_type, identifier)
            to_return.append((nb_captures, id_type, identifier))
    return render_template('tree_identifiers.html', tree_uuid=tree_uuid, identifiers=to_return)


@app.route('/tree/<string:tree_uuid>/favicons', methods=['GET'])
def tree_favicons(tree_uuid: str) -> str:
favicons = []
Expand Down Expand Up @@ -1605,6 +1621,14 @@ def hhh_detail(hhh: str) -> str:
return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers)


@app.route('/identifier_details/<string:identifier_type>/<string:identifier>', methods=['GET'])
def identifier_details(identifier_type: str, identifier: str) -> str:
    """Render ``identifier_details.html`` with the captures in which the identifier appears."""
    template_context = {
        'identifier_type': identifier_type,
        'identifier': identifier,
        'captures': get_identifier_investigator(identifier_type, identifier),
    }
    return render_template('identifier_details.html', **template_context)


@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
@app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:
Expand Down
55 changes: 55 additions & 0 deletions website/web/templates/tree.html
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,20 @@
});
</script>
<script>
// When the identifiers modal is about to be shown, read the URL stored in the
// data-remote attribute of the element that triggered it (e.relatedTarget)
// and load the content of that URL into the modal body.
$('#identifiersModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);
modal.find('.modal-body').load(button.data("remote"));
});
</script>
<script>
// When the identifier details modal is about to be shown, read the URL stored
// in the data-remote attribute of the element that triggered it
// (e.relatedTarget) and load the content of that URL into the modal body.
$('#identifierDetailsModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);
modal.find('.modal-body').load(button.data("remote"));
});
</script>
<script>
$('#faviconsModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);
Expand Down Expand Up @@ -320,6 +334,10 @@
<a href="#faviconsModal" data-remote="{{ url_for('tree_favicons', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#faviconsModal" role="button">Favicons Capture</a>
</li>
<li class="list-group-item">
<a href="#identifiersModal" data-remote="{{ url_for('tree_identifiers', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#identifiersModal" role="button">Identifiers Capture</a>
</li>
</ul>
</div>
{% if current_user.is_authenticated %}
Expand Down Expand Up @@ -542,6 +560,43 @@ <h5 class="modal-title" id="statsModalLabel">Statistics</h5>
</div>
</div>

<div class="modal fade" id="identifiersModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="identifiersModalLabel">Identifiers found on the rendered page</h5>
<button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
... loading identifiers ...
</div>
<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>

<div class="modal fade" id="identifierDetailsModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="identifierDetailsModalLabel">Other occurrences of the identifier</h5>
<button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
... loading identifier details ...
</div>
<div class="modal-footer">
<a class="btn btn-primary" href="#identifiersModal"
data-remote="{{ url_for('tree_identifiers', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#identifiersModal" role="button">Back to capture's identifiers</a>
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>

<div class="modal fade" id="faviconsModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
Expand Down

0 comments on commit 0f4ef01

Please sign in to comment.