Skip to content

Commit

Permalink
Add arcgis scanner
Browse files Browse the repository at this point in the history
  • Loading branch information
akariv committed Feb 19, 2025
1 parent 4952c49 commit 82e5ba2
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 0 deletions.
Empty file.
86 changes: 86 additions & 0 deletions odds/backend/scanner/arcgis/arcgis_catalog_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from typing import AsyncIterator
import httpx

from ....common.config import config
from ....common.datatypes import Dataset, DataCatalog, Resource
from ....common.retry import Retry
from ....common.realtime_status import realtime_status as rts
from ..catalog_scanner import CatalogScanner

# Default HTTP headers for all catalog requests. A browser-like
# User-Agent is sent — presumably to avoid bot blocking by ArcGIS Hub
# instances (TODO confirm). NOTE(review): scan() merges per-catalog
# headers into this module-level dict, so it is shared mutable state.
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0'
}

class ArcGISCatalogScanner(CatalogScanner):
    """Scanner for ArcGIS Hub data catalogs.

    Pages through the catalog's search endpoint
    (``/api/search/v1/collections/dataset/items``), keeping only CSV
    items, and yields one ``Dataset`` per unique item id.
    """

    def __init__(self, catalog: DataCatalog, ctx: str):
        # catalog: the DataCatalog describing the ArcGIS Hub instance to scan
        # ctx: context string used for realtime status reporting
        self.catalog = catalog
        self.ctx = ctx

    def done(self, num_rows: int) -> bool:
        """Return True once the configured per-catalog dataset limit is hit.

        A falsy ``config.limit_catalog_datasets`` means "no limit".
        """
        return bool(
            config.limit_catalog_datasets
            and num_rows >= config.limit_catalog_datasets
        )

    async def scan(self) -> AsyncIterator[Dataset]:
        """Yield a ``Dataset`` for every unique CSV item in the catalog.

        Raises whatever HTTP/parse error the fetch produces, after first
        reporting it on the realtime status channel.
        """
        num_rows = 0
        startindex = 1  # the search API's paging offset is 1-based
        used_ids = set()
        # Merge catalog-specific headers into a local copy so the
        # module-level default `headers` dict is never mutated
        # (the original updated the shared global in place).
        request_headers = dict(headers)
        request_headers.update(self.catalog.http_headers)
        async with httpx.AsyncClient() as client:
            while True:
                if config.debug:
                    rts.set(self.ctx, f"Getting offset {startindex-1} of datasets from {self.catalog.url}")
                try:
                    r = await Retry()(client, 'get',
                        f"{self.catalog.url}/api/search/v1/collections/dataset/items",
                        params={"startindex": startindex, "filter": "type='CSV'"},
                        headers=request_headers,
                        timeout=60
                    )
                    r.raise_for_status()
                    r = r.json()
                except Exception as e:
                    rts.set(self.ctx, f"Error getting offset {startindex-1} of datasets from {self.catalog.url}: {e!r}", 'error')
                    raise
                rows = r['features']
                if not rows:
                    break  # no more pages
                for row in rows:
                    # Advance the paging offset for every row returned,
                    # including rows we skip below.
                    startindex += 1
                    item_id = row['id']  # renamed: `id` shadowed the builtin
                    properties = row['properties']
                    # Defensive re-check: the server-side filter should
                    # already restrict results to CSV items.
                    if properties['type'] != 'CSV':
                        continue
                    if item_id in used_ids:
                        # Report through rts like the rest of the scanner,
                        # instead of a bare print().
                        rts.set(self.ctx, f"Skipping duplicate resource {item_id}")
                        continue
                    data_url = f'https://www.arcgis.com/sharing/rest/content/items/{item_id}/data'
                    title = properties['title']
                    description = properties['description']
                    filename = properties['name']
                    publisher = properties['source']
                    link = f'{self.catalog.url}/datasets/{item_id}/about'

                    used_ids.add(item_id)
                    num_rows += 1

                    resources = [
                        Resource(
                            f'{data_url}#(unknown)',
                            'csv',
                            title=filename,
                        )
                    ]
                    dataset = Dataset(
                        self.catalog.id, item_id, title,
                        description=description,
                        publisher=publisher,
                        resources=resources,
                        link=link
                    )
                    yield dataset
                    # `return` (not `break`) so the generator stops
                    # immediately at the limit regardless of loop nesting.
                    if self.done(num_rows):
                        return
3 changes: 3 additions & 0 deletions odds/backend/scanner/scanner_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .socrata.socrata_catalog_scanner import SocrataCatalogScanner
from .website.website_scanner import WebsiteCatalogScanner
from .worldbank.worldbank_catalog_scanner import WorldBankCatalogScanner
from .arcgis.arcgis_catalog_scanner import ArcGISCatalogScanner


class ScannerFactory:
Expand All @@ -18,3 +19,5 @@ def create_scanner(self, catalog: DataCatalog, ctx: str) -> CatalogScanner:
return WebsiteCatalogScanner(catalog, ctx)
if catalog.kind == 'worldbank':
return WorldBankCatalogScanner(catalog, ctx)
if catalog.kind == 'arcgis':
return ArcGISCatalogScanner(catalog, ctx)

0 comments on commit 82e5ba2

Please sign in to comment.