chg: Use new annotations
Rafiot committed Jan 12, 2024
1 parent 0b5128e commit ee1ad48
Showing 49 changed files with 749 additions and 657 deletions.
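Note on the pattern applied throughout: every touched module gains `from __future__ import annotations`, `Optional[X]` and `Union[X, Y]` become the PEP 604 spellings `X | None` and `X | Y`, and the `typing.Dict`/`List`/`Set` aliases are replaced by the PEP 585 built-in generics. A minimal sketch of the resulting style (the `load_index` function below is illustrative, not taken from the diff):

#!/usr/bin/env python3

# Illustrative module, not part of the repository: it only demonstrates the
# annotation style this commit converges on.
from __future__ import annotations


def load_index(path: str | None = None) -> dict[str, str]:
    # previously written as: def load_index(path: Optional[str] = None) -> Dict[str, str]:
    return {} if path is None else {'path': path}


if __name__ == '__main__':
    print(load_index())         # {}
    print(load_index('index'))  # {'path': 'index'}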
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -3,14 +3,14 @@
exclude: "user_agents|website/web/sri.txt"
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/asottile/pyupgrade
rev: v2.31.1
rev: v3.15.0
hooks:
- id: pyupgrade
args: [--py38-plus]
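Keeping --py38-plus while bumping pyupgrade to v3.15.0 is what automates the conversion: per pyupgrade's documented behaviour, its PEP 585/604 annotation rewrites apply on a 3.8 target only in files that already contain `from __future__ import annotations`, which is precisely what the rest of this commit adds. A hedged sketch of the kind of rewrite involved (the `cached_capture` function is invented for illustration):

# Valid on Python 3.8; with the __future__ import present, pyupgrade
# --py38-plus is expected to rewrite the annotations as noted below.
from __future__ import annotations

from typing import Dict, Optional


def cached_capture(uuid: Optional[str]) -> Dict[str, str]:
    # Optional[str]  ->  str | None
    # Dict[str, str] ->  dict[str, str]
    return {'uuid': uuid or 'unknown'}


print(cached_capture(None))  # {'uuid': 'unknown'}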
28 changes: 15 additions & 13 deletions bin/archiver.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

import csv
import gzip
import logging
@@ -23,7 +25,7 @@

class Archiver(AbstractManager):

def __init__(self, loglevel: Optional[int]=None):
def __init__(self, loglevel: int | None=None) -> None:
super().__init__(loglevel)
self.script_name = 'archiver'
self.redis = Redis(unix_socket_path=get_socket_path('cache'))
@@ -54,7 +56,7 @@ def __init__(self, loglevel: Optional[int]=None):
self.s3fs_bucket = s3fs_config['config']['bucket_name']
self.s3fs_client.clear_multipart_uploads(self.s3fs_bucket)

def _to_run_forever(self):
def _to_run_forever(self) -> None:
archiving_done = False
# NOTE: When we archive a big directory, moving *a lot* of files, especially to MinIO
# can take a very long time. In order to avoid being stuck on the archiving, we break that in chunks
@@ -71,14 +73,14 @@ def _to_run_forever(self):
# This call takes a very long time on MinIO
self._update_all_capture_indexes()

def _update_index(self, root_dir: Path, *, s3fs_parent_dir: Optional[str]=None) -> Optional[Path]:
def _update_index(self, root_dir: Path, *, s3fs_parent_dir: str | None=None) -> Path | None:
# returns a path to the index for the given directory
logmsg = f'Updating index for {root_dir}'
if s3fs_parent_dir:
logmsg = f'{logmsg} (s3fs)'
self.logger.info(logmsg)

current_index: Dict[str, str] = {}
current_index: dict[str, str] = {}
index_file = root_dir / 'index'
if index_file.exists():
try:
@@ -91,11 +93,11 @@ def _update_index(self, root_dir: Path, *, s3fs_parent_dir: Optional[str]=None)
# NOTE: should we remove if it has subs?
index_file.unlink()

sub_indexes: List[Path] = []
current_index_dirs: Set[str] = set(current_index.values())
new_captures: Set[Path] = set()
sub_indexes: list[Path] = []
current_index_dirs: set[str] = set(current_index.values())
new_captures: set[Path] = set()
# Directories that are actually in the listing.
current_dirs: Set[str] = set()
current_dirs: set[str] = set()

if s3fs_parent_dir:
s3fs_dir = '/'.join([s3fs_parent_dir, root_dir.name])
@@ -212,7 +214,7 @@ def _update_index(self, root_dir: Path, *, s3fs_parent_dir: Optional[str]=None)

return index_file

def _update_all_capture_indexes(self, *, recent_only: bool=False):
def _update_all_capture_indexes(self, *, recent_only: bool=False) -> None:
'''Run that after the captures are in the proper directories'''
# Recent captures
self.logger.info('Update recent indexes')
@@ -278,7 +280,7 @@ def __archive_single_capture(self, capture_path: Path) -> Path:

return dest_dir / capture_path.name

def _archive(self):
def _archive(self) -> bool:
archive_interval = timedelta(days=get_config('generic', 'archive'))
cut_time = (datetime.now() - archive_interval)
self.logger.info(f'Archiving all captures older than {cut_time.isoformat()}.')
@@ -340,7 +342,7 @@ def _archive(self):
self.logger.info('Archiving done.')
return archiving_done

def __load_index(self, index_path: Path, ignore_sub: bool=False) -> Dict[str, str]:
def __load_index(self, index_path: Path, ignore_sub: bool=False) -> dict[str, str]:
'''Loads the given index file and all the subsequent ones if they exist'''
# NOTE: this method is used on recent and archived captures, it must never trigger a dir listing
indexed_captures = {}
@@ -359,7 +361,7 @@ def __load_index(self, index_path: Path, ignore_sub: bool=False) -> Dict[str, str]:
indexed_captures[key] = str(index_path.parent / path_name)
return indexed_captures

def _load_indexes(self):
def _load_indexes(self) -> None:
# capture_dir / Year / Month / index <- should always exist. If not, created by _update_index
# Initialize recent index
for index in sorted(get_captures_dir().glob('*/*/index'), reverse=True):
@@ -391,7 +393,7 @@ def _load_indexes(self):
self.logger.info(f'Archived indexes loaded: {total_archived_captures} entries.')


def main():
def main() -> None:
a = Archiver()
a.run(sleep_in_sec=3600)

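The `from __future__ import annotations` line added at the top of archiver.py is what keeps `dict[str, str]`, `set[Path]` or `Path | None` compatible with the declared Python 3.8 floor: under PEP 563 the annotations are stored as strings and never evaluated at runtime, so only mypy has to understand the new syntax. A small sketch, using a hypothetical function rather than one from the file:

from __future__ import annotations

from pathlib import Path


def newest_capture(root: Path | None = None) -> dict[str, str]:
    # Without the __future__ import, Python 3.8 would raise a TypeError while
    # evaluating `Path | None` and `dict[str, str]` at definition time.
    return {}


# The annotations stay plain strings until something resolves them explicitly:
print(newest_capture.__annotations__)  # {'root': 'Path | None', 'return': 'dict[str, str]'}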
31 changes: 17 additions & 14 deletions bin/async_capture.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

import asyncio
import json
import logging
@@ -10,7 +12,7 @@
from typing import Optional, Set, Union

from lacuscore import LacusCore, CaptureStatus as CaptureStatusCore, CaptureResponse as CaptureResponseCore
from pylacus import PyLacus, CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy
from pylacus import PyLacus, CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy # type: ignore[attr-defined]

from lookyloo.lookyloo import Lookyloo, CaptureSettings
from lookyloo.default import AbstractManager, get_config
@@ -23,15 +25,15 @@

class AsyncCapture(AbstractManager):

def __init__(self, loglevel: Optional[int]=None):
def __init__(self, loglevel: int | None=None) -> None:
super().__init__(loglevel)
self.script_name = 'async_capture'
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
self.capture_dir: Path = get_captures_dir()
self.lookyloo = Lookyloo()

if isinstance(self.lookyloo.lacus, LacusCore):
self.captures: Set[asyncio.Task] = set()
self.captures: set[asyncio.Task] = set() # type: ignore[type-arg]

self.fox = FOX(config_name='FOX')
if not self.fox.available:
@@ -41,23 +43,24 @@ def thirdparty_submit(self, url: str) -> None:
if self.fox.available:
self.fox.capture_default_trigger(url, auto_trigger=True)

async def _trigger_captures(self):
async def _trigger_captures(self) -> None:
# Only called if LacusCore is used
max_new_captures = get_config('generic', 'async_capture_processes') - len(self.captures)
self.logger.debug(f'{len(self.captures)} ongoing captures.')
if max_new_captures <= 0:
self.logger.info(f'Max amount of captures in parallel reached ({len(self.captures)})')
return
for capture_task in self.lookyloo.lacus.consume_queue(max_new_captures):
return None
for capture_task in self.lookyloo.lacus.consume_queue(max_new_captures): # type: ignore[union-attr]
self.captures.add(capture_task)
capture_task.add_done_callback(self.captures.discard)

def uuids_ready(self):
def uuids_ready(self) -> list[str]:
return [uuid for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf')
if uuid and self.lookyloo.lacus.get_capture_status(uuid) in [CaptureStatusPy.DONE, CaptureStatusCore.DONE]]

def process_capture_queue(self) -> None:
'''Process a query from the capture queue'''
entries: Union[CaptureResponseCore, CaptureResponsePy]
entries: CaptureResponseCore | CaptureResponsePy
for uuid in self.uuids_ready():
if isinstance(self.lookyloo.lacus, LacusCore):
entries = self.lookyloo.lacus.get_capture(uuid, decode=True)
@@ -71,9 +74,9 @@ def process_capture_queue(self) -> None:
self.logger.info(log)

self.lookyloo.redis.sadd('ongoing', uuid)
queue: Optional[str] = self.lookyloo.redis.getdel(f'{uuid}_mgmt')
queue: str | None = self.lookyloo.redis.getdel(f'{uuid}_mgmt')

to_capture: CaptureSettings = self.lookyloo.redis.hgetall(uuid)
to_capture: CaptureSettings = self.lookyloo.redis.hgetall(uuid) # type: ignore[assignment]

if get_config('generic', 'default_public'):
# By default, the captures are on the index, unless the user mark them as un-listed
@@ -123,9 +126,9 @@ def process_capture_queue(self) -> None:
self.unset_running()
self.logger.info(f'Done with {uuid}')

async def _to_run_forever_async(self):
async def _to_run_forever_async(self) -> None:
if self.force_stop:
return
return None

if isinstance(self.lookyloo.lacus, LacusCore):
await self._trigger_captures()
@@ -135,7 +138,7 @@ async def _to_run_forever_async(self):

self.process_capture_queue()

async def _wait_to_finish_async(self):
async def _wait_to_finish_async(self) -> None:
if isinstance(self.lookyloo.lacus, LacusCore):
while self.captures:
self.logger.info(f'Waiting for {len(self.captures)} capture(s) to finish...')
@@ -147,7 +150,7 @@ async def _wait_to_finish_async(self):
self.logger.info('No more captures')


def main():
def main() -> None:
m = AsyncCapture()

loop = asyncio.new_event_loop()
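One detail worth noting in async_capture.py: the suppressions added next to the new annotations use mypy's error-code form (`# type: ignore[attr-defined]`, `[type-arg]`, `[union-attr]`, `[assignment]`) rather than a bare `# type: ignore`, so only the named check is silenced and any other error on the same line still gets reported. A short, assumed example mirroring the `captures` attribute above:

from __future__ import annotations

import asyncio

# asyncio.Task is generic; leaving it unparameterised trips mypy's
# "missing type parameters" check, so only that error code is ignored.
# A bare "# type: ignore" would hide every error mypy finds on this line.
captures: set[asyncio.Task] = set()  # type: ignore[type-arg]

print(captures)  # set()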
14 changes: 8 additions & 6 deletions bin/background_indexer.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

import logging
import logging.config
import os
@@ -20,15 +22,15 @@

class BackgroundIndexer(AbstractManager):

def __init__(self, loglevel: Optional[int]=None):
def __init__(self, loglevel: int | None=None):
super().__init__(loglevel)
self.lookyloo = Lookyloo()
self.script_name = 'background_indexer'
# make sure discarded captures dir exists
self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

def _to_run_forever(self):
def _to_run_forever(self) -> None:
all_done = self._build_missing_pickles()
if all_done:
self._check_indexes()
@@ -72,7 +74,7 @@ def _build_missing_pickles(self) -> bool:
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
else:
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) # type: ignore[arg-type]
if cached_path != path:
# we have a duplicate UUID, it is probably related to some bad copy/paste
if cached_path.exists():
@@ -118,13 +120,13 @@ def _build_missing_pickles(self) -> bool:
return True
return False

def _check_indexes(self):
def _check_indexes(self) -> None:
index_redis = self.lookyloo.indexing.redis
can_index = index_redis.set('ongoing_indexing', 1, ex=3600, nx=True)
if not can_index:
# There is no reason to run this method in multiple scripts.
self.logger.info('Indexing already ongoing in another process.')
return
return None
self.logger.info('Check indexes...')
for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
if self.lookyloo.is_public_instance and cache.no_index:
@@ -163,7 +165,7 @@ def _check_indexes(self):
self.logger.info('... done.')


def main():
def main() -> None:
i = BackgroundIndexer()
i.run(sleep_in_sec=60)

16 changes: 9 additions & 7 deletions bin/background_processing.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from __future__ import annotations

import json
import time
import logging
@@ -8,7 +10,7 @@
from datetime import date, timedelta
from typing import Any, Dict, Optional

from lookyloo.lookyloo import Lookyloo, CaptureStatusCore, CaptureStatusPy
from lookyloo.lookyloo import Lookyloo, CaptureStatusCore, CaptureStatusPy # type: ignore[attr-defined]
from lookyloo.default import AbstractManager, get_config, get_homedir, safe_create_dir
from lookyloo.helpers import ParsedUserAgent, serialize_to_json

@@ -17,19 +19,19 @@

class Processing(AbstractManager):

def __init__(self, loglevel: Optional[int]=None):
def __init__(self, loglevel: int | None=None):
super().__init__(loglevel)
self.script_name = 'processing'
self.lookyloo = Lookyloo()

self.use_own_ua = get_config('generic', 'use_user_agents_users')

def _to_run_forever(self):
def _to_run_forever(self) -> None:
if self.use_own_ua:
self._build_ua_file()
self._retry_failed_enqueue()

def _build_ua_file(self):
def _build_ua_file(self) -> None:
'''Build a file in a format compatible with the capture page'''
yesterday = (date.today() - timedelta(days=1))
self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
@@ -44,7 +46,7 @@ def _build_ua_file(self):
self.logger.info(f'No User-agent file for {yesterday} to generate.')
return

to_store: Dict[str, Any] = {'by_frequency': []}
to_store: dict[str, Any] = {'by_frequency': []}
uas = Counter([entry.split('|', 1)[1] for entry in entries])
for ua, _ in uas.most_common():
parsed_ua = ParsedUserAgent(ua)
@@ -71,7 +73,7 @@ def _build_ua_file(self):
self.lookyloo.redis.delete(f'user_agents|{yesterday.isoformat()}')
self.logger.info(f'User-agent file for {yesterday} generated.')

def _retry_failed_enqueue(self):
def _retry_failed_enqueue(self) -> None:
'''If enqueuing failed, the settings are added, with a UUID in the 'to_capture key', and they have a UUID'''
for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
try_reenqueue = False
@@ -131,7 +133,7 @@ def _retry_failed_enqueue(self):
self.logger.info(f'{uuid} enqueued.')


def main():
def main() -> None:
p = Processing()
p.run(sleep_in_sec=30)
