Commit b37114a: Add ban urls

akariv committed Feb 20, 2025
1 parent 2ce7866
Showing 3 changed files with 12 additions and 4 deletions.
odds/backend/scanner/website/website_scanner.py (8 additions, 2 deletions)
@@ -50,8 +50,9 @@ class Scraper:
         'em', 'i', 'li', 'ol', 'strong', 'ul', 'table', 'tr', 'td', 'th', 'tbody', 'thead', 'title'}
     CLEAN_TAGS = {'script', 'style', 'meta', 'iframe', 'nav', 'header', 'footer', 'form'}
 
-    def __init__(self, base_urls) -> None:
+    def __init__(self, base_urls, ban_urls) -> None:
         self.base_urls = base_urls
+        self.ban_urls = ban_urls
         self.q = asyncio.Queue()
         self.out_q = asyncio.Queue()
         self.outstanding = set()
@@ -135,6 +136,8 @@ async def scrape(self, url: str) -> list[str]:
             if final_url != url:
                 if not any(final_url.startswith(base_url) for base_url in self.base_urls):
                     links = []
+                elif any(final_url.startswith(ban_url) for ban_url in self.ban_urls):
+                    links = []
                 else:
                     links = [final_url]
             # print(f'{url}: GOT STATUS', r.status_code)
@@ -186,6 +189,8 @@ async def scrape(self, url: str) -> list[str]:
 
         _links = []
         for link in links:
+            if any(link.startswith(ban_url) for ban_url in self.ban_urls):
+                continue
             if any(link.startswith(base_url) for base_url in self.base_urls):
                 link = link.split('#')[0]
                 link = link.strip()
@@ -238,7 +243,8 @@ async def scan(self) -> AsyncIterator[Dataset]:
         bases = self.catalog.url
         if not isinstance(bases, list):
             bases = [bases]
-        scraper = Scraper(bases)
+        ban = self.catalog.ban or []
+        scraper = Scraper(bases, ban)
         count = 0
         async for item in scraper():
             count += 1
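Taken together, the scanner changes thread a ban_urls list through Scraper: both redirect targets (final_url) and links extracted from a page are dropped when they start with any banned prefix, so a ban can carve a subtree out of an otherwise allowed site. A minimal standalone sketch of that prefix logic follows; the helper name and sample URLs are hypothetical, and in the commit the checks live inline in Scraper.scrape():

def filter_links(links, base_urls, ban_urls):
    # Keep only links under an allowed base URL and not under a banned prefix.
    kept = []
    for link in links:
        if any(link.startswith(ban_url) for ban_url in ban_urls):
            continue
        if any(link.startswith(base_url) for base_url in base_urls):
            kept.append(link.split('#')[0].strip())
    return kept

links = [
    'https://data.example.org/dataset/air-quality',
    'https://data.example.org/user/login',
    'https://other.example.net/page',
]
print(filter_links(
    links,
    base_urls=['https://data.example.org/'],
    ban_urls=['https://data.example.org/user/'],
))
# ['https://data.example.org/dataset/air-quality']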
odds/common/catalog_repo/config_catalog_repo.py (2 additions, 1 deletion)
@@ -19,7 +19,8 @@ def load_catalogs(self) -> list[DataCatalog]:
                 catalog['id'], catalog['kind'], catalog['url'], catalog['title'],
                 description=catalog.get('description'),
                 geo=catalog.get('geo'),
-                http_headers=catalog.get('http_headers') or {}
+                http_headers=catalog.get('http_headers') or {},
+                ban=catalog.get('ban') or [],
             )
             for catalog in catalogs
         ]
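For reference, this is roughly what a catalog entry using the new key looks like to load_catalogs(), shown as the plain dict the loader reads. Every value below is made up; only the keys mirror the code above. Entries without a ban key fall back to an empty list:

catalog = {
    'id': 'example-city',                      # hypothetical catalog id
    'kind': 'website',
    'url': ['https://data.example-city.gov/'],
    'title': 'Example City Open Data',
    # URL prefixes the website scanner should never follow
    'ban': [
        'https://data.example-city.gov/user/',
        'https://data.example-city.gov/search',
    ],
}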
odds/common/datatypes.py (2 additions, 1 deletion)
@@ -88,12 +88,13 @@ def merge(self, updates: 'Dataset'):
 @dataclass
 class DataCatalog:
     id: str
-    kind: Literal['CKAN', 'Socrata', 'data.json', 'other', 'website']
+    kind: Literal['CKAN', 'Socrata', 'data.json', 'other', 'website', 'arcgis']
     url: str | List[str]
     title: str
     description: str = None
     geo: str = None
     http_headers: dict = field(default_factory=dict)
+    ban: List[str] = None
 
 
 @dataclass
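A hedged construction example for the extended dataclass; the argument values are invented, and note that ban defaults to None, which is why the scanner normalizes it with self.catalog.ban or []:

from odds.common.datatypes import DataCatalog

catalog = DataCatalog(
    'example-city', 'website',
    ['https://data.example-city.gov/'], 'Example City Open Data',
    ban=['https://data.example-city.gov/user/'],
)
print(catalog.ban or [])  # ['https://data.example-city.gov/user/']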
