Skip to content

Commit

Permalink
Scrape Excel files as well
Browse files Browse the repository at this point in the history
  • Loading branch information
akariv committed Feb 21, 2025
1 parent a161edc commit b11b810
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
14 changes: 10 additions & 4 deletions odds/backend/scanner/arcgis/arcgis_catalog_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,18 @@ def done(self, num_rows):
return True
return False

async def scan(self) -> AsyncIterator[Dataset]:
async def scan_aux(self, collection, filetype) -> AsyncIterator[Dataset]:
num_rows = 0
startindex = 1
used_ids = set()
async with httpx.AsyncClient() as client:
headers.update(self.catalog.http_headers)
domain = self.catalog.url.split('//')[1].split('/')[0]
while True:
if config.debug:
rts.set(self.ctx, f"Getting offset {startindex-1} of datasets from {self.catalog.url}")
try:
r = await Retry()(client, 'get',
f"{self.catalog.url}/api/search/v1/collections/dataset/items", params={"startindex": startindex, "filter": "type='CSV'"},
f"{self.catalog.url}/api/search/v1/collections/{collection}/items", params={"startindex": startindex, "filter": f"type='{filetype}'"},
headers=headers,
timeout=60
)
Expand All @@ -60,6 +59,7 @@ async def scan(self) -> AsyncIterator[Dataset]:
title = row['properties']['title']
description = row['properties']['description']
filename = properties['name']
file_format = filename.split('.')[-1].lower()
publisher = properties['source']
link = f'{self.catalog.url}/datasets/{id}/about'

Expand All @@ -69,7 +69,7 @@ async def scan(self) -> AsyncIterator[Dataset]:
resources = [
Resource(
f'{data_url}#{filename}',
'csv',
file_format,
title=filename,
)
]
Expand All @@ -84,3 +84,9 @@ async def scan(self) -> AsyncIterator[Dataset]:
yield dataset
if self.done(num_rows):
break

async def scan(self) -> AsyncIterator[Dataset]:
    """Yield datasets from each ArcGIS search collection this scanner covers.

    Delegates to ``scan_aux`` once per (collection, file type) pair, in
    order: CSV items of the ``dataset`` collection first, followed by
    Microsoft Excel items of the ``document`` collection.
    """
    # Order matters: results are streamed collection by collection.
    sources = (
        ('dataset', 'CSV'),
        ('document', 'Microsoft Excel'),
    )
    for collection, filetype in sources:
        async for item in self.scan_aux(collection, filetype):
            yield item
2 changes: 1 addition & 1 deletion ui/projects/ask/src/app/home/home.component.less
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@
a {
cursor: pointer;
text-decoration: underline;
color: @color-blue-2;
color: @color-blue-2;
}
}
}
Expand Down

0 comments on commit b11b810

Please sign in to comment.