From 911e8dac5452244ea3ae0b77f459b0bb8792d20e Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Mon, 24 Feb 2025 12:40:21 +0100
Subject: [PATCH] set `allow_all=True` when robots file cannot be loaded

---
 src/fundus/publishers/base_objects.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/fundus/publishers/base_objects.py b/src/fundus/publishers/base_objects.py
index b6afffc1..797d2561 100644
--- a/src/fundus/publishers/base_objects.py
+++ b/src/fundus/publishers/base_objects.py
@@ -5,11 +5,14 @@
 
 import requests
 
+from fundus.logging import create_logger
 from fundus.parser.base_parser import ParserProxy
 from fundus.scraping.filter import URLFilter
 from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource
 from fundus.utils.iteration import iterate_all_subclasses
 
+logger = create_logger(__name__)
+
 
 class CustomRobotFileParser(RobotFileParser):
     """Monkey patch RobotFileParse
@@ -41,7 +44,11 @@ def __init__(self, url: str):
         self.ready: bool = False
 
     def read(self, headers: Optional[Dict[str, str]] = None) -> None:
-        self.robots_file_parser.read(headers=headers)
+        try:
+            self.robots_file_parser.read(headers=headers)
+        except requests.exceptions.ConnectionError as err:
+            logger.warning(f"Could not load robots {self.url!r}. Ignoring robots and continuing.")
+            self.robots_file_parser.allow_all = True
         self.ready = True
 
     def can_fetch(self, useragent: str, url: str) -> bool:
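
Note (not part of the patch): a minimal sketch of the standard-library behavior
the except-branch relies on. `urllib.robotparser.RobotFileParser.can_fetch()`
checks the `allow_all` attribute before consulting any parsed rules, so setting
the flag makes the scraper proceed as if no robots.txt existed. The URLs below
are placeholders, and the sketch uses the plain stdlib parser rather than
fundus's `CustomRobotFileParser`, which additionally patches `read()` to send
custom headers via `requests`.

    # Sketch: effect of `allow_all` on `can_fetch` in the standard library.
    from urllib.robotparser import RobotFileParser

    # Placeholder URL, not taken from the patch.
    parser = RobotFileParser(url="https://example.com/robots.txt")

    # Simulate the failure path: instead of calling parser.read(), which the
    # patch wraps in try/except for connection errors, flip the flag the
    # except-branch sets.
    parser.allow_all = True

    # Every URL is now reported as fetchable for any user agent, because
    # can_fetch() short-circuits on allow_all before reading any rules.
    assert parser.can_fetch("fundus", "https://example.com/any/article") is True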