diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py
index 2f3de7f8..5e94aff2 100644
--- a/src/fundus/scraping/html.py
+++ b/src/fundus/scraping/html.py
@@ -14,7 +14,7 @@
 from requests import ConnectionError, HTTPError
 
 from fundus.logging import create_logger
-from fundus.publishers.base_objects import Publisher
+from fundus.publishers.base_objects import Publisher, Robots
 from fundus.scraping.delay import Delay
 from fundus.scraping.filter import URLFilter
 from fundus.scraping.session import _default_header, session_handler
@@ -112,9 +112,23 @@ def __init__(
         self.query_parameters = query_parameters or {}
         if isinstance(url_source, URLSource):
             url_source.set_header(self.request_header)
+
         self.delay = delay
-        self.ignore_robots = ignore_robots
-        self.ignore_crawl_delay = ignore_crawl_delay
+
+        # parse robots:
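+        # self.robots stays None when ignore_robots is set, so fetch() skips all robots.txt checks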
+        self.robots: Optional[Robots] = None
+        if not ignore_robots:
+            self.robots = self.publisher.robots
+            if not self.robots.ready:
+                self.publisher.robots.read(headers=self.request_header)
+
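+            # a crawl-delay from robots.txt takes precedence over the delay passed to __init__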
+            if not ignore_crawl_delay:
+                if robots_delay := self.robots.crawl_delay(self.request_header.get("user-agent") or "*"):
+                    logger.debug(
+                        f"Found crawl-delay of {robots_delay} seconds in robots.txt for {self.publisher.name}. "
+                        f"Overwriting existing delay."
+                    )
+                    self.delay = lambda: robots_delay
 
     def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]:
         combined_filters: List[URLFilter] = ([self.url_filter] if self.url_filter else []) + (
@@ -123,19 +137,6 @@ def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]:
 
         timestamp = time.time() + self.delay() if self.delay is not None else time.time()
 
-        robots = self.publisher.robots
-
-        if not robots.ready:
-            robots.read(headers=self.request_header)
-
-        if not (self.ignore_robots or self.ignore_crawl_delay):
-            if delay := robots.crawl_delay(self.request_header.get("user-agent") or "*"):
-                logger.debug(
-                    f"Found crawl-delay of {delay} seconds in robots.txt for {self.publisher.name}. "
-                    f"Overwriting existing delay."
-                )
-                self.delay = lambda: delay
-
         def filter_url(u: str) -> bool:
             return any(f(u) for f in combined_filters)
 
@@ -148,7 +149,7 @@ def filter_url(u: str) -> bool:
logger.debug(f"Skipped requested URL {url!r} because of URL filter")
continue
- if not (self.ignore_robots or robots.can_fetch(self.request_header.get("user-agent") or "*", url)):
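+            # self.robots is None only when robots.txt handling is disabled, so the can_fetch check is bypassed in that case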
+            if not (self.robots is None or self.robots.can_fetch(self.request_header.get("user-agent") or "*", url)):
                 logger.debug(f"Skipped requested URL {url!r} because of robots.txt")
                 continue