chg: Use new annotations
Rafiot committed Jan 12, 2024
1 parent 0b5128e commit ee1ad48
Showing 49 changed files with 749 additions and 657 deletions.
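Note on the pattern applied throughout: every touched module gains `from __future__ import annotations`, `Optional[X]` and `Union[X, Y]` become the PEP 604 spellings `X | None` and `X | Y`, and the `typing.Dict`/`List`/`Set` aliases are replaced by the PEP 585 built-in generics. A minimal sketch of the resulting style (the `load_index` function below is illustrative, not taken from the diff):

#!/usr/bin/env python3

# Illustrative module, not part of the repository: it only demonstrates the
# annotation style this commit converges on.
from __future__ import annotations


def load_index(path: str | None = None) -> dict[str, str]:
    # previously written as: def load_index(path: Optional[str] = None) -> Dict[str, str]:
    return {} if path is None else {'path': path}


if __name__ == '__main__':
    print(load_index())         # {}
    print(load_index('index'))  # {'path': 'index'}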
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -3,14 +3,14 @@
exclude: "user_agents|website/web/sri.txt"
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/asottile/pyupgrade
rev: v2.31.1
rev: v3.15.0
hooks:
- id: pyupgrade
args: [--py38-plus]
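Keeping --py38-plus while bumping pyupgrade to v3.15.0 is what automates the conversion: per pyupgrade's documented behaviour, its PEP 585/604 annotation rewrites apply on a 3.8 target only in files that already contain `from __future__ import annotations`, which is precisely what the rest of this commit adds. A hedged sketch of the kind of rewrite involved (the `cached_capture` function is invented for illustration):

# Valid on Python 3.8; with the __future__ import present, pyupgrade
# --py38-plus is expected to rewrite the annotations as noted below.
from __future__ import annotations

from typing import Dict, Optional


def cached_capture(uuid: Optional[str]) -> Dict[str, str]:
    # Optional[str]  ->  str | None
    # Dict[str, str] ->  dict[str, str]
    return {'uuid': uuid or 'unknown'}


print(cached_capture(None))  # {'uuid': 'unknown'}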
28 changes: 15 additions & 13 deletions bin/archiver.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

import csv
import gzip
import logging
@@ -23,7 +25,7 @@

class Archiver(AbstractManager):

def __init__(self, loglevel: Optional[int]=None):
def __init__(self, loglevel: int | None=None) -> None:
super().__init__(loglevel)
self.script_name = 'archiver'
self.redis = Redis(unix_socket_path=get_socket_path('cache'))
@@ -54,7 +56,7 @@ def __init__(self, loglevel: Optional[int]=None):
self.s3fs_bucket = s3fs_config['config']['bucket_name']
self.s3fs_client.clear_multipart_uploads(self.s3fs_bucket)

def _to_run_forever(self):
def _to_run_forever(self) -> None:
archiving_done = False
# NOTE: When we archive a big directory, moving *a lot* of files, especially to MinIO
# can take a very long time. In order to avoid being stuck on the archiving, we break that in chunks
@@ -71,14 +73,14 @@ def _to_run_forever(self):
# This call takes a very long time on MinIO
self._update_all_capture_indexes()

def _update_index(self, root_dir: Path, *, s3fs_parent_dir: Optional[str]=None) -> Optional[Path]:
def _update_index(self, root_dir: Path, *, s3fs_parent_dir: str | None=None) -> Path | None:
# returns a path to the index for the given directory
logmsg = f'Updating index for {root_dir}'
if s3fs_parent_dir:
logmsg = f'{logmsg} (s3fs)'
self.logger.info(logmsg)

current_index: Dict[str, str] = {}
current_index: dict[str, str] = {}
index_file = root_dir / 'index'
if index_file.exists():
try:
@@ -91,11 +93,11 @@ def _update_index(self, root_dir: Path, *, s3fs_parent_dir: Optional[str]=None)
# NOTE: should we remove if it has subs?
index_file.unlink()

sub_indexes: List[Path] = []
current_index_dirs: Set[str] = set(current_index.values())
new_captures: Set[Path] = set()
sub_indexes: list[Path] = []
current_index_dirs: set[str] = set(current_index.values())
new_captures: set[Path] = set()
# Directories that are actually in the listing.
current_dirs: Set[str] = set()
current_dirs: set[str] = set()

if s3fs_parent_dir:
s3fs_dir = '/'.join([s3fs_parent_dir, root_dir.name])
@@ -212,7 +214,7 @@ def _update_index(self, root_dir: Path, *, s3fs_parent_dir: Optional[str]=None)

return index_file

def _update_all_capture_indexes(self, *, recent_only: bool=False):
def _update_all_capture_indexes(self, *, recent_only: bool=False) -> None:
'''Run that after the captures are in the proper directories'''
# Recent captures
self.logger.info('Update recent indexes')
@@ -278,7 +280,7 @@ def __archive_single_capture(self, capture_path: Path) -> Path:

return dest_dir / capture_path.name

def _archive(self):
def _archive(self) -> bool:
archive_interval = timedelta(days=get_config('generic', 'archive'))
cut_time = (datetime.now() - archive_interval)
self.logger.info(f'Archiving all captures older than {cut_time.isoformat()}.')
@@ -340,7 +342,7 @@ def _archive(self):
self.logger.info('Archiving done.')
return archiving_done

def __load_index(self, index_path: Path, ignore_sub: bool=False) -> Dict[str, str]:
def __load_index(self, index_path: Path, ignore_sub: bool=False) -> dict[str, str]:
'''Loads the given index file and all the subsequent ones if they exist'''
# NOTE: this method is used on recent and archived captures, it must never trigger a dir listing
indexed_captures = {}
@@ -359,7 +361,7 @@ def __load_index(self, index_path: Path, ignore_sub: bool=False) -> Dict[str, str]:
indexed_captures[key] = str(index_path.parent / path_name)
return indexed_captures

def _load_indexes(self):
def _load_indexes(self) -> None:
# capture_dir / Year / Month / index <- should always exist. If not, created by _update_index
# Initialize recent index
for index in sorted(get_captures_dir().glob('*/*/index'), reverse=True):
@@ -391,7 +393,7 @@ def _load_indexes(self):
self.logger.info(f'Archived indexes loaded: {total_archived_captures} entries.')


def main():
def main() -> None:
a = Archiver()
a.run(sleep_in_sec=3600)

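The `from __future__ import annotations` line added at the top of archiver.py is what keeps `dict[str, str]`, `set[Path]` or `Path | None` compatible with the declared Python 3.8 floor: under PEP 563 the annotations are stored as strings and never evaluated at runtime, so only mypy has to understand the new syntax. A small sketch, using a hypothetical function rather than one from the file:

from __future__ import annotations

from pathlib import Path


def newest_capture(root: Path | None = None) -> dict[str, str]:
    # Without the __future__ import, Python 3.8 would raise a TypeError while
    # evaluating `Path | None` and `dict[str, str]` at definition time.
    return {}


# The annotations stay plain strings until something resolves them explicitly:
print(newest_capture.__annotations__)  # {'root': 'Path | None', 'return': 'dict[str, str]'}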
31 changes: 17 additions & 14 deletions bin/async_capture.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

import asyncio
import json
import logging
@@ -10,7 +12,7 @@
from typing import Optional, Set, Union

from lacuscore import LacusCore, CaptureStatus as CaptureStatusCore, CaptureResponse as CaptureResponseCore
from pylacus import PyLacus, CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy
from pylacus import PyLacus, CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy # type: ignore[attr-defined]

from lookyloo.lookyloo import Lookyloo, CaptureSettings
from lookyloo.default import AbstractManager, get_config
@@ -23,15 +25,15 @@

class AsyncCapture(AbstractManager):

def __init__(self, loglevel: Optional[int]=None):
def __init__(self, loglevel: int | None=None) -> None:
super().__init__(loglevel)
self.script_name = 'async_capture'
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
self.capture_dir: Path = get_captures_dir()
self.lookyloo = Lookyloo()

if isinstance(self.lookyloo.lacus, LacusCore):
self.captures: Set[asyncio.Task] = set()
self.captures: set[asyncio.Task] = set() # type: ignore[type-arg]

self.fox = FOX(config_name='FOX')
if not self.fox.available:
@@ -41,23 +43,24 @@ def thirdparty_submit(self, url: str) -> None:
if self.fox.available:
self.fox.capture_default_trigger(url, auto_trigger=True)

async def _trigger_captures(self):
async def _trigger_captures(self) -> None:
# Only called if LacusCore is used
max_new_captures = get_config('generic', 'async_capture_processes') - len(self.captures)
self.logger.debug(f'{len(self.captures)} ongoing captures.')
if max_new_captures <= 0:
self.logger.info(f'Max amount of captures in parallel reached ({len(self.captures)})')
return
for capture_task in self.lookyloo.lacus.consume_queue(max_new_captures):
return None
for capture_task in self.lookyloo.lacus.consume_queue(max_new_captures): # type: ignore[union-attr]
self.captures.add(capture_task)
capture_task.add_done_callback(self.captures.discard)

def uuids_ready(self):
def uuids_ready(self) -> list[str]:
return [uuid for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf')
if uuid and self.lookyloo.lacus.get_capture_status(uuid) in [CaptureStatusPy.DONE, CaptureStatusCore.DONE]]

def process_capture_queue(self) -> None:
'''Process a query from the capture queue'''
entries: Union[CaptureResponseCore, CaptureResponsePy]
entries: CaptureResponseCore | CaptureResponsePy
for uuid in self.uuids_ready():
if isinstance(self.lookyloo.lacus, LacusCore):
entries = self.lookyloo.lacus.get_capture(uuid, decode=True)
@@ -71,9 +74,9 @@ def process_capture_queue(self) -> None:
self.logger.info(log)

self.lookyloo.redis.sadd('ongoing', uuid)
queue: Optional[str] = self.lookyloo.redis.getdel(f'{uuid}_mgmt')
queue: str | None = self.lookyloo.redis.getdel(f'{uuid}_mgmt')

to_capture: CaptureSettings = self.lookyloo.redis.hgetall(uuid)
to_capture: CaptureSettings = self.lookyloo.redis.hgetall(uuid) # type: ignore[assignment]

if get_config('generic', 'default_public'):
# By default, the captures are on the index, unless the user mark them as un-listed
@@ -123,9 +126,9 @@ def process_capture_queue(self) -> None:
self.unset_running()
self.logger.info(f'Done with {uuid}')

async def _to_run_forever_async(self):
async def _to_run_forever_async(self) -> None:
if self.force_stop:
return
return None

if isinstance(self.lookyloo.lacus, LacusCore):
await self._trigger_captures()
@@ -135,7 +138,7 @@ async def _to_run_forever_async(self):

self.process_capture_queue()

async def _wait_to_finish_async(self):
async def _wait_to_finish_async(self) -> None:
if isinstance(self.lookyloo.lacus, LacusCore):
while self.captures:
self.logger.info(f'Waiting for {len(self.captures)} capture(s) to finish...')
@@ -147,7 +150,7 @@ async def _wait_to_finish_async(self):
self.logger.info('No more captures')


def main():
def main() -> None:
m = AsyncCapture()

loop = asyncio.new_event_loop()
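One detail worth noting in async_capture.py: the suppressions added next to the new annotations use mypy's error-code form (`# type: ignore[attr-defined]`, `[type-arg]`, `[union-attr]`, `[assignment]`) rather than a bare `# type: ignore`, so only the named check is silenced and any other error on the same line still gets reported. A short, assumed example mirroring the `captures` attribute above:

from __future__ import annotations

import asyncio

# asyncio.Task is generic; leaving it unparameterised trips mypy's
# "missing type parameters" check, so only that error code is ignored.
# A bare "# type: ignore" would hide every error mypy finds on this line.
captures: set[asyncio.Task] = set()  # type: ignore[type-arg]

print(captures)  # set()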
14 changes: 8 additions & 6 deletions bin/background_indexer.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

import logging
import logging.config
import os
@@ -20,15 +22,15 @@

class BackgroundIndexer(AbstractManager):

def __init__(self, loglevel: Optional[int]=None):
def __init__(self, loglevel: int | None=None):
super().__init__(loglevel)
self.lookyloo = Lookyloo()
self.script_name = 'background_indexer'
# make sure discarded captures dir exists
self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

def _to_run_forever(self):
def _to_run_forever(self) -> None:
all_done = self._build_missing_pickles()
if all_done:
self._check_indexes()
@@ -72,7 +74,7 @@ def _build_missing_pickles(self) -> bool:
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
else:
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) # type: ignore[arg-type]
if cached_path != path:
# we have a duplicate UUID, it is probably related to some bad copy/paste
if cached_path.exists():
@@ -118,13 +120,13 @@ def _build_missing_pickles(self) -> bool:
return True
return False

def _check_indexes(self):
def _check_indexes(self) -> None:
index_redis = self.lookyloo.indexing.redis
can_index = index_redis.set('ongoing_indexing', 1, ex=3600, nx=True)
if not can_index:
# There is no reason to run this method in multiple scripts.
self.logger.info('Indexing already ongoing in another process.')
return
return None
self.logger.info('Check indexes...')
for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
if self.lookyloo.is_public_instance and cache.no_index:
@@ -163,7 +165,7 @@ def _check_indexes(self):
self.logger.info('... done.')


def main():
def main() -> None:
i = BackgroundIndexer()
i.run(sleep_in_sec=60)

16 changes: 9 additions & 7 deletions bin/background_processing.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

import json
import time
import logging
@@ -8,7 +10,7 @@
from datetime import date, timedelta
from typing import Any, Dict, Optional

from lookyloo.lookyloo import Lookyloo, CaptureStatusCore, CaptureStatusPy
from lookyloo.lookyloo import Lookyloo, CaptureStatusCore, CaptureStatusPy # type: ignore[attr-defined]
from lookyloo.default import AbstractManager, get_config, get_homedir, safe_create_dir
from lookyloo.helpers import ParsedUserAgent, serialize_to_json

@@ -17,19 +19,19 @@

class Processing(AbstractManager):

def __init__(self, loglevel: Optional[int]=None):
def __init__(self, loglevel: int | None=None):
super().__init__(loglevel)
self.script_name = 'processing'
self.lookyloo = Lookyloo()

self.use_own_ua = get_config('generic', 'use_user_agents_users')

def _to_run_forever(self):
def _to_run_forever(self) -> None:
if self.use_own_ua:
self._build_ua_file()
self._retry_failed_enqueue()

def _build_ua_file(self):
def _build_ua_file(self) -> None:
'''Build a file in a format compatible with the capture page'''
yesterday = (date.today() - timedelta(days=1))
self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
@@ -44,7 +46,7 @@ def _build_ua_file(self):
self.logger.info(f'No User-agent file for {yesterday} to generate.')
return

to_store: Dict[str, Any] = {'by_frequency': []}
to_store: dict[str, Any] = {'by_frequency': []}
uas = Counter([entry.split('|', 1)[1] for entry in entries])
for ua, _ in uas.most_common():
parsed_ua = ParsedUserAgent(ua)
@@ -71,7 +73,7 @@ def _build_ua_file(self):
self.lookyloo.redis.delete(f'user_agents|{yesterday.isoformat()}')
self.logger.info(f'User-agent file for {yesterday} generated.')

def _retry_failed_enqueue(self):
def _retry_failed_enqueue(self) -> None:
'''If enqueuing failed, the settings are added, with a UUID in the 'to_capture key', and they have a UUID'''
for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
try_reenqueue = False
@@ -131,7 +133,7 @@ def _retry_failed_enqueue(self):
self.logger.info(f'{uuid} enqueued.')


def main():
def main() -> None:
p = Processing()
p.run(sleep_in_sec=30)
