Commit b37114a: Add ban urls

akariv committed Feb 20, 2025
1 parent 2ce7866
Showing 3 changed files with 12 additions and 4 deletions.
odds/backend/scanner/website/website_scanner.py (8 additions, 2 deletions)
@@ -50,8 +50,9 @@ class Scraper:
         'em', 'i', 'li', 'ol', 'strong', 'ul', 'table', 'tr', 'td', 'th', 'tbody', 'thead', 'title'}
     CLEAN_TAGS = {'script', 'style', 'meta', 'iframe', 'nav', 'header', 'footer', 'form'}
 
-    def __init__(self, base_urls) -> None:
+    def __init__(self, base_urls, ban_urls) -> None:
         self.base_urls = base_urls
+        self.ban_urls = ban_urls
         self.q = asyncio.Queue()
         self.out_q = asyncio.Queue()
         self.outstanding = set()
@@ -135,6 +136,8 @@ async def scrape(self, url: str) -> list[str]:
             if final_url != url:
                 if not any(final_url.startswith(base_url) for base_url in self.base_urls):
                     links = []
+                elif any(final_url.startswith(ban_url) for ban_url in self.ban_urls):
+                    links = []
                 else:
                     links = [final_url]
             # print(f'{url}: GOT STATUS', r.status_code)
@@ -186,6 +189,8 @@ async def scrape(self, url: str) -> list[str]:
 
         _links = []
         for link in links:
+            if any(link.startswith(ban_url) for ban_url in self.ban_urls):
+                continue
             if any(link.startswith(base_url) for base_url in self.base_urls):
                 link = link.split('#')[0]
                 link = link.strip()
@@ -238,7 +243,8 @@ async def scan(self) -> AsyncIterator[Dataset]:
         bases = self.catalog.url
         if not isinstance(bases, list):
             bases = [bases]
-        scraper = Scraper(bases)
+        ban = self.catalog.ban or []
+        scraper = Scraper(bases, ban)
         count = 0
         async for item in scraper():
             count += 1
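Taken together, the scanner changes thread a ban_urls list through Scraper: both redirect targets (final_url) and links extracted from a page are dropped when they start with any banned prefix, so a ban can carve a subtree out of an otherwise allowed site. A minimal standalone sketch of that prefix logic follows; the helper name and sample URLs are hypothetical, and in the commit the checks live inline in Scraper.scrape():

def filter_links(links, base_urls, ban_urls):
    # Keep only links under an allowed base URL and not under a banned prefix.
    kept = []
    for link in links:
        if any(link.startswith(ban_url) for ban_url in ban_urls):
            continue
        if any(link.startswith(base_url) for base_url in base_urls):
            kept.append(link.split('#')[0].strip())
    return kept

links = [
    'https://data.example.org/dataset/air-quality',
    'https://data.example.org/user/login',
    'https://other.example.net/page',
]
print(filter_links(
    links,
    base_urls=['https://data.example.org/'],
    ban_urls=['https://data.example.org/user/'],
))
# ['https://data.example.org/dataset/air-quality']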
odds/common/catalog_repo/config_catalog_repo.py (2 additions, 1 deletion)
@@ -19,7 +19,8 @@ def load_catalogs(self) -> list[DataCatalog]:
                 catalog['id'], catalog['kind'], catalog['url'], catalog['title'],
                 description=catalog.get('description'),
                 geo=catalog.get('geo'),
-                http_headers=catalog.get('http_headers') or {}
+                http_headers=catalog.get('http_headers') or {},
+                ban=catalog.get('ban') or [],
             )
             for catalog in catalogs
         ]
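For reference, this is roughly what a catalog entry using the new key looks like to load_catalogs(), shown as the plain dict the loader reads. Every value below is made up; only the keys mirror the code above. Entries without a ban key fall back to an empty list:

catalog = {
    'id': 'example-city',                      # hypothetical catalog id
    'kind': 'website',
    'url': ['https://data.example-city.gov/'],
    'title': 'Example City Open Data',
    # URL prefixes the website scanner should never follow
    'ban': [
        'https://data.example-city.gov/user/',
        'https://data.example-city.gov/search',
    ],
}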
odds/common/datatypes.py (2 additions, 1 deletion)
@@ -88,12 +88,13 @@ def merge(self, updates: 'Dataset'):
 @dataclass
 class DataCatalog:
     id: str
-    kind: Literal['CKAN', 'Socrata', 'data.json', 'other', 'website']
+    kind: Literal['CKAN', 'Socrata', 'data.json', 'other', 'website', 'arcgis']
     url: str | List[str]
     title: str
     description: str = None
     geo: str = None
     http_headers: dict = field(default_factory=dict)
+    ban: List[str] = None
 
 
 @dataclass
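A hedged construction example for the extended dataclass; the argument values are invented, and note that ban defaults to None, which is why the scanner normalizes it with self.catalog.ban or []:

from odds.common.datatypes import DataCatalog

catalog = DataCatalog(
    'example-city', 'website',
    ['https://data.example-city.gov/'], 'Example City Open Data',
    ban=['https://data.example-city.gov/user/'],
)
print(catalog.ban or [])  # ['https://data.example-city.gov/user/']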
