From dc2e84bb58379b562e9f718687b0b859a3c6b888 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Wed, 24 Jan 2024 15:10:24 +0100 Subject: [PATCH] Apply suggestions from code review Co-authored-by: Conrad Dobberstein <29147025+dobbersc@users.noreply.github.com> --- src/fundus/scraping/common_crawl/pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fundus/scraping/common_crawl/pipeline.py b/src/fundus/scraping/common_crawl/pipeline.py index 2e2d35c1c..4f73a2595 100644 --- a/src/fundus/scraping/common_crawl/pipeline.py +++ b/src/fundus/scraping/common_crawl/pipeline.py @@ -116,8 +116,8 @@ def _get_warc_paths(self, start: datetime, end: datetime) -> List[str]: raise ValueError("The specified end date is in the future. We don't want to give spoilers, do we?") date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=start, until=end)) - urls = [ - self.server_address + f"crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence + urls: List[str] = [ + f"{self.server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence ] def load_paths(url: str) -> List[str]: @@ -144,7 +144,7 @@ def filter_warc_path_by_date(path: str) -> bool: return start_strf <= match["date"] <= end_strf return sorted( - [self.server_address + warc_path for warc_path in filter(filter_warc_path_by_date, warc_paths)], + (f"{self.server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)), reverse=True, )