Merge pull request #709 from flairNLP/fix-a-bug-with-robots-parser
Set `allow_all=True` when robots cannot be loaded
MaxDall authored Feb 27, 2025
2 parents fa4342a + 8dced33 commit fc5b68a
Showing 2 changed files with 26 additions and 18 deletions.
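
The behavioural effect of the fix is easiest to see with the standard library's `urllib.robotparser` on its own: once `allow_all` is set on a `RobotFileParser`, every `can_fetch()` call answers `True`, so a publisher whose robots.txt cannot be reached is crawled without robots restrictions instead of failing. A minimal sketch, independent of Fundus (URL and user agent are placeholders):

```python
from urllib.robotparser import RobotFileParser

parser = RobotFileParser(url="https://example.com/robots.txt")

# Simulate the failure case: robots.txt could not be loaded, so instead of
# raising we fall back to an "allow everything" policy.
parser.allow_all = True

# With allow_all set, can_fetch() is permissive for any user agent and URL.
print(parser.can_fetch("*", "https://example.com/news/article.html"))  # True
```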
9 changes: 8 additions & 1 deletion src/fundus/publishers/base_objects.py
@@ -5,11 +5,14 @@

import requests

from fundus.logging import create_logger
from fundus.parser.base_parser import ParserProxy
from fundus.scraping.filter import URLFilter
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource
from fundus.utils.iteration import iterate_all_subclasses

logger = create_logger(__name__)


class CustomRobotFileParser(RobotFileParser):
"""Monkey patch RobotFileParse
@@ -41,7 +44,11 @@ def __init__(self, url: str):
self.ready: bool = False

def read(self, headers: Optional[Dict[str, str]] = None) -> None:
self.robots_file_parser.read(headers=headers)
try:
self.robots_file_parser.read(headers=headers)
except requests.exceptions.ConnectionError as err:
logger.warning(f"Could not load robots {self.url!r}. Ignoring robots and continuing.")
self.robots_file_parser.allow_all = True
self.ready = True

def can_fetch(self, useragent: str, url: str) -> bool:
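
For context, `CustomRobotFileParser` appears to fetch robots.txt via `requests` rather than `urllib` (it passes request headers to `read()`, and the new code catches `requests.exceptions.ConnectionError`). The combined behaviour — fetch robots.txt over HTTP, ignore robots when the host is unreachable — can be sketched as a standalone helper. The function name `read_robots`, the timeout, and the error-handling granularity are illustrative assumptions, not Fundus's actual implementation:

```python
from typing import Dict, Optional
from urllib.robotparser import RobotFileParser

import requests


def read_robots(url: str, headers: Optional[Dict[str, str]] = None) -> RobotFileParser:
    """Fetch and parse robots.txt, allowing everything if the host is unreachable."""
    parser = RobotFileParser(url=url)
    try:
        response = requests.get(url, headers=headers, timeout=30)
        parser.parse(response.text.splitlines())
    except requests.exceptions.ConnectionError:
        # Same idea as the patch above: if robots.txt cannot be loaded,
        # ignore robots rather than aborting the whole crawl.
        parser.allow_all = True
    return parser
```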
35 changes: 18 additions & 17 deletions src/fundus/scraping/html.py
@@ -14,7 +14,7 @@
from requests import ConnectionError, HTTPError

from fundus.logging import create_logger
from fundus.publishers.base_objects import Publisher
from fundus.publishers.base_objects import Publisher, Robots
from fundus.scraping.delay import Delay
from fundus.scraping.filter import URLFilter
from fundus.scraping.session import _default_header, session_handler
@@ -112,9 +112,23 @@ def __init__(
self.query_parameters = query_parameters or {}
if isinstance(url_source, URLSource):
url_source.set_header(self.request_header)

self.delay = delay
self.ignore_robots = ignore_robots
self.ignore_crawl_delay = ignore_crawl_delay

# parse robots:
self.robots: Optional[Robots] = None
if not ignore_robots:
self.robots = self.publisher.robots
if not self.robots.ready:
self.publisher.robots.read(headers=self.request_header)

if not ignore_crawl_delay:
if robots_delay := self.robots.crawl_delay(self.request_header.get("user-agent") or "*"):
logger.debug(
f"Found crawl-delay of {robots_delay} seconds in robots.txt for {self.publisher.name}. "
f"Overwriting existing delay."
)
self.delay = lambda: robots_delay

def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]:
combined_filters: List[URLFilter] = ([self.url_filter] if self.url_filter else []) + (
@@ -123,19 +137,6 @@ def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]:

timestamp = time.time() + self.delay() if self.delay is not None else time.time()

robots = self.publisher.robots

if not robots.ready:
robots.read(headers=self.request_header)

if not (self.ignore_robots or self.ignore_crawl_delay):
if delay := robots.crawl_delay(self.request_header.get("user-agent") or "*"):
logger.debug(
f"Found crawl-delay of {delay} seconds in robots.txt for {self.publisher.name}. "
f"Overwriting existing delay."
)
self.delay = lambda: delay

def filter_url(u: str) -> bool:
return any(f(u) for f in combined_filters)

@@ -148,7 +149,7 @@ def filter_url(u: str) -> bool:
logger.debug(f"Skipped requested URL {url!r} because of URL filter")
continue

if not (self.ignore_robots or robots.can_fetch(self.request_header.get("user-agent") or "*", url)):
if not (self.robots is None or self.robots.can_fetch(self.request_header.get("user-agent") or "*", url)):
logger.debug(f"Skipped requested URL {url!r} because of robots.txt")
continue

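
With this change the robots handling moves from `fetch()` into the constructor: robots.txt is read once when the source is created, a crawl-delay found there overrides the configured delay, and `fetch()` only consults `self.robots.can_fetch()` (skipped entirely when `ignore_robots` is set, since `self.robots` stays `None`). The interplay of `crawl_delay()` and `can_fetch()` can be shown with the standard library's parser; the rules, user agent, and delay values below are made up for the example:

```python
from urllib.robotparser import RobotFileParser

robots = RobotFileParser(url="https://example.com/robots.txt")
robots.parse(
    [
        "User-agent: *",
        "Crawl-delay: 5",
        "Disallow: /private/",
    ]
)

user_agent = "fundus"
delay = lambda: 0.0  # the delay configured by the caller

# Prefer the crawl-delay advertised in robots.txt over the configured delay.
if robots_delay := robots.crawl_delay(user_agent):
    delay = lambda: float(robots_delay)

# Before each request, ask whether the URL may be fetched at all.
print(robots.can_fetch(user_agent, "https://example.com/private/page.html"))  # False
print(delay())  # 5.0
```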
