Skip to content

Commit

Permalink
Apply suggestions from code review
Browse files (browse the repository at this point in the history)
Co-authored-by: Conrad Dobberstein <29147025+dobbersc@users.noreply.github.com>
  • Loading branch information
MaxDall and dobbersc authored Jan 24, 2024
1 parent e716fa3 commit dc2e84b
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/fundus/scraping/common_crawl/pipeline.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -116,8 +116,8 @@ def _get_warc_paths(self, start: datetime, end: datetime) -> List[str]:
raise ValueError("The specified end date is in the future. We don't want to give spoilers, do we?")

date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=start, until=end))
urls = [
self.server_address + f"crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence
urls: List[str] = [
f"{self.server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence
]

def load_paths(url: str) -> List[str]:
Expand All @@ -144,7 +144,7 @@ def filter_warc_path_by_date(path: str) -> bool:
return start_strf <= match["date"] <= end_strf

return sorted(
[self.server_address + warc_path for warc_path in filter(filter_warc_path_by_date, warc_paths)],
(f"{self.server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)),
reverse=True,
)

Expand Down

0 comments on commit dc2e84b

Please sign in to comment.