set allow_all=True when robots file cannot be loaded
MaxDall committed Feb 24, 2025
1 parent fa4342a commit 911e8da
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion src/fundus/publishers/base_objects.py
@@ -5,11 +5,14 @@

 import requests

+from fundus.logging import create_logger
 from fundus.parser.base_parser import ParserProxy
 from fundus.scraping.filter import URLFilter
 from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource
 from fundus.utils.iteration import iterate_all_subclasses

+logger = create_logger(__name__)
+

 class CustomRobotFileParser(RobotFileParser):
     """Monkey patch RobotFileParse
@@ -41,7 +44,11 @@ def __init__(self, url: str):
         self.ready: bool = False

     def read(self, headers: Optional[Dict[str, str]] = None) -> None:
-        self.robots_file_parser.read(headers=headers)
+        try:
+            self.robots_file_parser.read(headers=headers)
+        except requests.exceptions.ConnectionError as err:
+            logger.warning(f"Could not load robots {self.url!r}. Ignoring robots and continuing.")
+            self.robots_file_parser.allow_all = True
         self.ready = True

     def can_fetch(self, useragent: str, url: str) -> bool:
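
For context, the fallback leans on a standard-library escape hatch: urllib.robotparser.RobotFileParser exposes an allow_all attribute, and once it is set, can_fetch() returns True for every user agent and URL. Below is a minimal sketch of that stdlib behavior (plain urllib.robotparser, not the Fundus wrapper; the crawler name and URLs are illustrative):

# Minimal sketch of the stdlib semantics this commit relies on. When
# allow_all is True, can_fetch() short-circuits to True for every user
# agent and URL -- the "ignore robots and continue" fallback the commit
# enables when the robots file cannot be fetched.
from urllib.robotparser import RobotFileParser

parser = RobotFileParser("https://example.com/robots.txt")

# Simulate the failure path: instead of calling parser.read() (which
# would fail if the host is unreachable), flip the fallback flag the
# same way the patched read() does on a ConnectionError.
parser.allow_all = True

assert parser.can_fetch("my-crawler", "https://example.com/any/path")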
