new: Index IPs (v4 & v6)
Rafiot committed Jan 27, 2025
1 parent d9d5781 commit 022d08e
Showing 8 changed files with 320 additions and 9 deletions.
106 changes: 104 additions & 2 deletions lookyloo/indexing.py
@@ -3,11 +3,13 @@
from __future__ import annotations

import hashlib
import ipaddress
import logging
import re
from collections.abc import Iterator

from datetime import datetime, timedelta
from ipaddress import IPv4Address, IPv6Address

from pathlib import Path

@@ -71,6 +73,7 @@ def force_reindex(self, capture_uuid: str) -> None:
p.srem('indexed_identifiers', capture_uuid)
p.srem('indexed_categories', capture_uuid)
p.srem('indexed_tlds', capture_uuid)
p.srem('indexed_ips', capture_uuid)
for identifier_type in self.identifiers_types():
p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
for hash_type in self.captures_hashes_types():
@@ -93,7 +96,7 @@ def force_reindex(self, capture_uuid: str) -> None:
p.delete(f'capture_indexes|{capture_uuid}')
p.execute()

def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool, bool]:
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool, bool, bool]:
p = self.redis.pipeline()
p.sismember('indexed_urls', capture_uuid)
p.sismember('indexed_body_hashes', capture_uuid)
@@ -103,11 +106,12 @@ def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool, bool, bool]:
p.sismember('indexed_identifiers', capture_uuid)
p.sismember('indexed_categories', capture_uuid)
p.sismember('indexed_tlds', capture_uuid)
p.sismember('indexed_ips', capture_uuid)
# We also need to check that all the hash types are indexed for this capture
hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
to_return: list[bool] = p.execute()
to_return.append(hash_types_indexed)
# This call for sure returns a tuple of 8 booleans
# This call for sure returns a tuple of 10 booleans
return tuple(to_return) # type: ignore[return-value]

def index_capture(self, uuid_to_index: str, directory: Path) -> None:
@@ -160,6 +164,9 @@ def index_capture(self, uuid_to_index: str, directory: Path) -> None:
self.logger.info(f'Indexing TLDs for {uuid_to_index}')
self.index_tld_capture(ct)
if not indexed[8]:
self.logger.info(f'Indexing IPs for {uuid_to_index}')
self.index_ips_capture(ct)
if not indexed[9]:
self.logger.info(f'Indexing hash types for {uuid_to_index}')
self.index_capture_hashes_types(ct)

@@ -453,6 +460,101 @@ def get_node_for_headers(self, hhh: str) -> tuple[str, str] | None:
return None
return capture_uuid, nodes.pop()

# ###### IPv4 & IPv6 ######

@property
def ipv4(self) -> set[str]:
return self.redis.smembers('ipv4')

@property
def ipv6(self) -> set[str]:
return self.redis.smembers('ipv6')

def index_ips_capture(self, crawled_tree: CrawledTree) -> None:
if self.redis.sismember('indexed_ips', crawled_tree.uuid):
# Do not reindex
return
self.redis.sadd('indexed_ips', crawled_tree.uuid)
self.logger.debug(f'Indexing IPs for {crawled_tree.uuid} ... ')
pipeline = self.redis.pipeline()

# Add the ips key in internal indexes set
internal_index = f'capture_indexes|{crawled_tree.uuid}'
pipeline.sadd(internal_index, 'ipv4')
pipeline.sadd(internal_index, 'ipv6')

already_indexed_global: set[IPv4Address | IPv6Address] = set()
for urlnode in crawled_tree.root_hartree.url_tree.traverse():
ip_to_index: IPv4Address | IPv6Address | None = None
if 'hostname_is_ip' in urlnode.features and urlnode.hostname_is_ip:
ip_to_index = ipaddress.ip_address(urlnode.hostname)
elif 'ip_address' in urlnode.features:
# The IP address from the HAR file, this is the one used for the connection
ip_to_index = urlnode.ip_address

if not ip_to_index:
# No IP available, skip
continue
ip_version_key = f'ipv{ip_to_index.version}'

# Only index each IP once per capture
if ip_to_index not in already_indexed_global:
# The IP hasn't been indexed in that run yet
already_indexed_global.add(ip_to_index)
pipeline.sadd(f'{internal_index}|{ip_version_key}', ip_to_index.compressed)
pipeline.sadd(ip_version_key, ip_to_index.compressed)
pipeline.zadd(f'{ip_version_key}|{ip_to_index.compressed}|captures',
mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})

# Add urlnode UUID in internal index
pipeline.sadd(f'{internal_index}|{ip_version_key}|{ip_to_index.compressed}', urlnode.uuid)

for hostnode in crawled_tree.root_hartree.hostname_tree.traverse():
if 'resolved_ips' in hostnode.features:
for ip_version, ips in hostnode.resolved_ips.items():
for ip in ips:
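# resolved_ips is keyed by 'v4'/'v6' (har2tree convention), so this
# yields the same 'ipv4'/'ipv6' keys as in the URL-node loop above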
ip_version_key = f'ip{ip_version}'
if ip not in already_indexed_global:
# The IP hasn't been indexed in that run yet
already_indexed_global.add(ip)
pipeline.sadd(f'{internal_index}|{ip_version_key}', ip)
pipeline.sadd(ip_version_key, ip)
pipeline.zadd(f'{ip_version_key}|{ip}|captures',
mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})

# Add urlnodes UUIDs in internal index
pipeline.sadd(f'{internal_index}|{ip_version_key}|{ip}', *[urlnode.uuid for urlnode in hostnode.urls])

pipeline.execute()
self.logger.debug(f'done with IPs for {crawled_tree.uuid}.')

def get_captures_ip(self, ip: str, most_recent_capture: datetime | None = None,
oldest_capture: datetime | None = None,
offset: int | None=None, limit: int | None=None) -> list[str]:
"""Get all the captures for a specific IP, on a time interval starting from the most recent one.
:param ip: The IP address
:param most_recent_capture: The capture time of the most recent capture to consider
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)
return self.redis.zrevrangebyscore(f'ipv{ipaddress.ip_address(ip).version}|{ip}|captures', max_score, min_score, start=offset, num=limit)

def scan_captures_ip(self, ip: str) -> Iterator[tuple[str, float]]:
yield from self.redis.zscan_iter(f'ipv{ipaddress.ip_address(ip).version}|{ip}|captures')

def get_captures_ip_count(self, ip: str) -> int:
return self.redis.zcard(f'ipv{ipaddress.ip_address(ip).version}|{ip}|captures')

def get_capture_ip_counter(self, capture_uuid: str, ip: str) -> int:
return self.redis.scard(f'capture_indexes|{capture_uuid}|ipv{ipaddress.ip_address(ip).version}|{ip}')

def get_capture_ip_nodes(self, capture_uuid: str, ip: str) -> set[str]:
if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|ipv{ipaddress.ip_address(ip).version}|{ip}'):
return set(url_nodes)
return set()
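
For quick reference, the Redis key layout built by index_ips_capture can also be queried directly; a minimal sketch with redis-py, assuming a local instance populated by the indexer (host/port/db are placeholders, not Lookyloo's actual config):

import ipaddress

import redis

# Minimal sketch, assuming a local Redis populated by index_ips_capture.
r = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

def recent_captures_for_ip(ip: str, limit: int = 10) -> list[str]:
    # Key layout, as written by the methods above:
    #   ipv4 / ipv6                -> sets of all indexed IPs
    #   ipv{version}|{ip}|captures -> zset of capture UUIDs, scored by capture start time
    version = ipaddress.ip_address(ip).version
    return r.zrevrangebyscore(f'ipv{version}|{ip}|captures', '+Inf', '-Inf',
                              start=0, num=limit)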

# ###### URLs and Domains ######

def _reindex_urls_domains(self, hostname: str, md5_url: str) -> None:
94 changes: 90 additions & 4 deletions website/web/__init__.py
@@ -7,6 +7,7 @@
import functools
import hashlib
import http
import ipaddress
import json
import logging
import logging.config
@@ -456,6 +457,50 @@ def get_hostname_investigator(hostname: str, offset: int | None=None, limit: int | None=None, search: str | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
return total, captures


def get_ip_investigator(ip: str, offset: int | None=None, limit: int | None=None, search: str | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
'''Returns all the captures loading content from that ip, used in the web interface.'''
total = get_indexing(flask_login.current_user).get_captures_ip_count(ip)
if search:
cached_captures = [capture for capture in lookyloo.sorted_capture_cache(
[uuid for uuid, _ in get_indexing(flask_login.current_user).scan_captures_ip(ip)], cached_captures_only=False) if capture.search(search)]
else:
cached_captures = lookyloo.sorted_capture_cache(
get_indexing(flask_login.current_user).get_captures_ip(ip=ip, offset=offset, limit=limit), cached_captures_only=False)
_captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp, get_indexing(flask_login.current_user).get_capture_ip_nodes(cache.uuid, ip)) for cache in cached_captures]
captures = []
for capture_uuid, capture_title, landing_page, capture_ts, nodes in _captures:
nodes_info: list[tuple[str, str]] = []
for urlnode_uuid in nodes:
try:
urlnode = lookyloo.get_urlnode_from_tree(capture_uuid, urlnode_uuid)
nodes_info.append((urlnode.name, urlnode_uuid))
except IndexError:
continue
captures.append((capture_uuid, capture_title, landing_page, capture_ts, nodes_info))
return total, captures


def get_all_ips(capture_uuid: str, /) -> dict[str, dict[str, int | list[URLNode]]]:
ct = lookyloo.get_crawled_tree(capture_uuid)
to_return: dict[str, dict[str, list[URLNode] | int]] = defaultdict()
for urlnode in ct.root_hartree.url_tree.traverse():
ip: ipaddress.IPv4Address | ipaddress.IPv6Address | None = None
if 'hostname_is_ip' in urlnode.features and urlnode.hostname_is_ip:
ip = ipaddress.ip_address(urlnode.hostname)
elif 'ip_address' in urlnode.features:
ip = urlnode.ip_address

if not ip:
continue

captures_count = get_indexing(flask_login.current_user).get_captures_ip_count(ip.compressed)
# Note for future: maybe get the URL and capture title, something better than just the IP, to show to the user
if ip.compressed not in to_return:
to_return[ip.compressed] = {'total_captures': captures_count, 'nodes': []}
to_return[ip.compressed]['nodes'].append(urlnode) # type: ignore[union-attr]
return to_return
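
A hypothetical consumer of this helper (names as defined in this module; capture_uuid is a placeholder) could flatten the result like so:

# Hypothetical usage sketch of get_all_ips.
for ip, info in get_all_ips(capture_uuid).items():
    urls = [node.name for node in info['nodes']]
    print(f"{ip}: {info['total_captures']} capture(s) overall, "
          f"{len(urls)} URL node(s) in this capture")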


def get_all_hostnames(capture_uuid: str, /) -> dict[str, dict[str, int | list[URLNode]]]:
ct = lookyloo.get_crawled_tree(capture_uuid)
to_return: dict[str, dict[str, list[URLNode] | int]] = defaultdict()
@@ -1371,6 +1416,11 @@ def tree_body_hashes(tree_uuid: str) -> str:
return render_template('tree_body_hashes.html', tree_uuid=tree_uuid)


@app.route('/tree/<string:tree_uuid>/ips', methods=['GET'])
def tree_ips(tree_uuid: str) -> str:
return render_template('tree_ips.html', tree_uuid=tree_uuid)


@app.route('/tree/<string:tree_uuid>/hostnames', methods=['GET'])
def tree_hostnames(tree_uuid: str) -> str:
return render_template('tree_hostnames.html', tree_uuid=tree_uuid)
@@ -1861,6 +1911,12 @@ def hostname_details(hostname: str) -> str:
return render_template('hostname.html', hostname=hostname, from_popup=from_popup)


@app.route('/ips/<string:ip>', methods=['GET'])
def ip_details(ip: str) -> str:
from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False
return render_template('ip.html', ip=ip, from_popup=from_popup)


@app.route('/stats', methods=['GET'])
def statsfull() -> str:
stats = lookyloo.get_stats()
@@ -2220,6 +2276,21 @@ def post_table(table_name: str, value: str) -> Response:
prepared_captures.append(to_append)
return jsonify({'draw': draw, 'recordsTotal': total, 'recordsFiltered': total if not search else total_filtered, 'data': prepared_captures})

if table_name == 'ipTable':
total, captures = get_ip_investigator(value.strip(), offset=start, limit=length, search=search)
if search and start is not None and length is not None:
total_filtered = len(captures)
captures = captures[start:start + length]
prepared_captures = []
for capture_uuid, title, landing_page, capture_time, nodes in captures:
to_append = {
'capture_time': capture_time.isoformat(),
'capture_title': f'{__prepare_title_in_modal(capture_uuid, title, from_popup)}</br>{__prepare_node_view(capture_uuid, nodes, from_popup)}',
'landing_page': __prepare_landings_in_modal(landing_page)
}
prepared_captures.append(to_append)
return jsonify({'draw': draw, 'recordsTotal': total, 'recordsFiltered': total if not search else total_filtered, 'data': prepared_captures})
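
For reference, this branch follows the standard DataTables server-side protocol; an illustrative response (all values are made up) has this shape:

# Illustrative ipTable response shape (example values only):
{
    'draw': 1,              # echoed back from the client request
    'recordsTotal': 42,     # all captures indexed for this IP
    'recordsFiltered': 7,   # count after the optional search filter
    'data': [{
        'capture_time': '2025-01-27T10:00:00+00:00',
        'capture_title': '<a href="#">Example title</a>',
        'landing_page': '<a href="#">https://landing.example/</a>',
    }],
}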

if table_name == 'hostnameTable':
total, captures = get_hostname_investigator(value.strip(), offset=start, limit=length, search=search)
if search and start is not None and length is not None:
@@ -2335,14 +2406,29 @@ def post_table(table_name: str, value: str) -> Response:
prepared_captures.append(to_append)
return jsonify(prepared_captures)

if table_name == 'ipsTable':
tree_uuid = value.strip()
prepared_captures = []
for _ip, _info in get_all_ips(tree_uuid).items(): # type: ignore[assignment]
nodes = [(node.name, node.uuid) for node in _info['nodes']] # type: ignore[union-attr]
to_append = {
'total_captures': _info['total_captures'], # type: ignore[dict-item]
'ip': details_modal_button(target_modal_id='#ipDetailsModal',
data_remote=url_for('ip_details', ip=_ip),
button_string=shorten_string(_ip, 100, with_title=True)),
'urls': __prepare_node_view(tree_uuid, nodes, from_popup)
}
prepared_captures.append(to_append)
return jsonify(prepared_captures)

if table_name == 'bodyHashesTable':
tree_uuid = value.strip()
prepared_captures = []
for body_hash, info in get_all_body_hashes(tree_uuid).items():
nodes = [(node[0].name, node[0].uuid) for node in info['nodes']] # type: ignore[union-attr]
for body_hash, _bh_info in get_all_body_hashes(tree_uuid).items():
nodes = [(node[0].name, node[0].uuid) for node in _bh_info['nodes']] # type: ignore[union-attr]
to_append = {
'total_captures': info['total_captures'], # type: ignore[dict-item]
'file_type': hash_icon_render(tree_uuid, info['nodes'][0][0].uuid, info['mimetype'], body_hash), # type: ignore[index,union-attr,arg-type]
'total_captures': _bh_info['total_captures'], # type: ignore[dict-item]
'file_type': hash_icon_render(tree_uuid, _bh_info['nodes'][0][0].uuid, _bh_info['mimetype'], body_hash), # type: ignore[index,union-attr,arg-type]
'urls': __prepare_node_view(tree_uuid, nodes, from_popup),
'sha512': details_modal_button(target_modal_id='#bodyHashDetailsModal',
data_remote=url_for('body_hash_details', body_hash=body_hash),
4 changes: 2 additions & 2 deletions website/web/sri.txt
@@ -32,13 +32,13 @@
"loader.gif": "ZZKD5vLSKBWKeUpa2KI9qheUJ49iTI/UULmVU/AX28fBfH00K3lLc2v5pVJZ4qXG1BbB13LTXzRKKU35H2XfNg==",
"lookyloo.jpeg": "i6wBj8CsIM5YAQLEMQfhs3CNOSKkErF8AMqqM6ZygSwCyQgv9CU8xt94veMZhM/ufBWoz7kAXmR+yywmxsTxug==",
"redirect.png": "PAjzlPV97rEFvH55mG1ZC9wRl98be3yMeX/nENuFkJcds6/AXgSR2ig/QyPULgobSnNgiYieLVWY/oqsgeywrQ==",
"render_tables.js": "X1/IFdIdZxIJh7Bg11tjzkvsOLax6Tjj1g4CGcd34adOw5ydk9czK5MIhdZo4HnhDozQqUwzicHY6cj17Wt53w==",
"render_tables.js": "ga4nPLppQmG1UEJ5zLIF6NNa6FtKlVVU4O+QV6368Vitmi4bKhEucBAB12YWzgRz3WTK2hwJEwoGpk2yQZ+WLw==",
"secure.svg": "H8ni7t0d60nCJDVGuZpuxC+RBy/ipAjWT627D12HlZGg6LUmjSwPTQTUekm3UJupEP7TUkhXyq6WHc5gy7QBjg==",
"stats.css": "/kY943FwWBTne4IIyf7iBROSfbGd82TeBicEXqKkRwawMVRIvM/Pk5MRa7okUyGIxaDjFQGmV/U1vy+PhN6Jbw==",
"stats_graph.js": "S/sMNQK1UMMLD0xQeEa7sq3ce8o6oPxwxGlyKVtaHOODjair86dbBDm7cu6pa/elMRDJT1j09jEFjWp+5GbhTw==",
"tree.css": "jc7+RiJaZy7utfMu7iMWicpt0y0ZFiEQlB4c7MFNdlWcZf0czi3LgSQUFlDWt828Mx463V+JP1RalXuRjbGcEg==",
"tree.js": "5dHZ3npV2YHsPlng1OtxPCOcTjTx1/N0KjrwDoIp4+NS7JMTu/pgaQoDVgtISjZEm1Vb0mra+oQ4eY2arZfbyA==",
"tree_modals.js": "JaufDDoCYLrfyw3BLTWA42kI6uJC7YdtiDng08BdLAUMTc2MqEhPzh3aZpFywlfIsSxXnZJnO1vVTJUypDSfqw==",
"tree_modals.js": "McGe2C6t76h1vzyrAr9CrvsZgrGy4BLAfPp/dPZHH5xqSJhPuUn0q1V3XBIqtRSKny3M04honHmfwhqlnTrZCw==",
"up.jpg": "d1ljZJ9f5JekyM6RLFFH2Ua44j6neiQBdUIXOenRTjGppQr3JaeglpQIH6BjPCJL177+TH52U3UIRNS5YAyKIg==",
"up_right.jpg": "OMmz+n+MxR34P8/fn5t4DkqKqdJRzQbXQ7fAi2lhkZIJGhVs2vIyY1f2hpYoBxDAX1OcYsSE2lqIR2vXNDGZsA==",
"video.png": "gJtmkfr8I1Kw43pYEKjg6CAjgmhl1vIBKBQ3ZkxCu3wvxQm+6kf93iLrrFiY2WuiXzxEn2Leu52GJzmVN5id0g==",
42 changes: 42 additions & 0 deletions website/web/static/render_tables.js
@@ -184,6 +184,27 @@
{ data: 'urls', width: '50%', orderable: false }],
});
}
if (document.getElementById('ipsTable')) {
treeUUID = document.getElementById('ipsTable').dataset.treeuuid;
new DataTable('#ipsTable', {
processing: true,
retrieve: true,
searching: true,
drawCallback: function (settings) {
newTabClickListener();
$('[data-bs-toggle="tooltip"]').tooltip({html: true});
},
order: [[ 0, "desc" ]],
ajax: {
url: `/tables/ipsTable/${treeUUID}${window.location.search}`,
type: 'POST',
dataSrc: ""
},
columns: [{ data: 'total_captures', width: '10%' },
{ data: 'ip', width: '40%', orderable: false },
{ data: 'urls', width: '50%', orderable: false }],
});
}
if (document.getElementById('identifiersTable')) {
treeUUID = document.getElementById('identifiersTable').dataset.treeuuid;
new DataTable('#identifiersTable', {
@@ -240,6 +261,27 @@
});
}

if (document.getElementById('ipTable')) {
ip = document.getElementById('ipTable').dataset.ip;
new DataTable('#ipTable', {
processing: true,
serverSide: true,
retrieve: true,
ordering: false,
searching: true,
drawCallback: function (settings) { newTabClickListener() },
ajax: {
url: `/tables/ipTable/${ip}${window.location.search}`,
type: 'POST'
},
columns : [
{ data: 'capture_time', width: '20%', render: DataTable.render.datetime_with_tz() },
{ data: 'capture_title', width: '40%' },
{ data: 'landing_page', width: '40%' }
],
});
}
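
A hedged smoke-test sketch for the endpoint above; the base URL and IP are placeholders, and the form fields follow DataTables' usual server-side parameters rather than a documented Lookyloo API:

import requests

# Hypothetical smoke test against a local Lookyloo instance.
resp = requests.post('http://127.0.0.1:5100/tables/ipTable/203.0.113.7',
                     data={'draw': 1, 'start': 0, 'length': 10})
resp.raise_for_status()
print(resp.json().get('recordsTotal'))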

if (document.getElementById('hostnameTable')) {
hostname = document.getElementById('hostnameTable').dataset.hostname;
new DataTable('#hostnameTable', {
3 changes: 2 additions & 1 deletion website/web/static/tree_modals.js
@@ -8,7 +8,8 @@ document.addEventListener("DOMContentLoaded", () => {
"#bodyHashesModal", "#bodyHashDetailsModal",
"#hostnamesModal", "#hostnameDetailsModal",
"#urlsModal", "#urlDetailsModal",
"#urlsInPageModal"].forEach(modal => {
"#urlsInPageModal",
"#ipsModal", "#ipDetailsModal"].forEach(modal => {
$(modal).on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);