-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmultiplex.py
270 lines (238 loc) · 14.6 KB
/
multiplex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
import argparse
import sys, time, ckanapi, re
from icecream import ic
from pprint import pprint
from gadgets import (set_resource_parameters_to_values, set_package_parameters_to_values,
get_value_from_extras, set_package_extras_parameter_to_value,
get_package_parameter,
clear_package_groups, assign_package_to_group,
add_view)
from credentials import site, API_key
# Somehow apply a gadget function to multiple entities.
# For instance, we need to update the links on 29 identically named resources (which are just hyperlinks).
# We want to invoke
# set_resource_parameters_to_values,
# iterating over it for different resource_id values that have resources that match a certain search term.
def guess_parameter_type(parameter, value, mode):
    """Coerce a command-line string to the Python type used for `parameter`.

    Values arrive from argparse as strings (or None when --value is omitted),
    so they have to be converted from the default (string) to the correct
    type (like Boolean) before being used.

    Args:
        parameter: Name of the metadata field the value belongs to.
        value: The raw value; a string, or None when no value was supplied.
        mode: The operating mode ('get', 'set', 'add', ...). Only 'add'
            changes the behavior here: list-like parameters are then left
            as plain strings.

    Returns:
        None for a missing value or the strings 'None'/'null'; a decoded
        JSON object for list-like parameters whose value starts with '[' or
        '{' (outside 'add' mode); a bool for the Boolean parameters; an int
        for 'position'; otherwise `value` unchanged.

    Raises:
        ValueError: If a Boolean parameter is given a string other than
            'True'/'true'/'False'/'false', if JSON decoding fails, or if
            'position' is not an integer string.
    """
    import json  # Function-scope import: json is not imported at module level.

    if value is None:
        # Nothing was supplied (e.g., 'get' mode without --value), so there is
        # nothing to coerce. Without this guard the Boolean branch raised
        # ValueError and int(None) raised TypeError.
        return None
    if value in ['None', 'null']:
        print("Coercing parameter to a null value.")
        return None
    if parameter in ['relationships_as_object', 'relationships_as_subject', 'groups']:  # There are other
        # lists that could be added here, like 'tags' and 'extras'.
        # Note that relationships_as_object and relationships_as_subject should not need to be manually set.
        # The correct way of setting a relationship is through the separate relationships CKAN API endpoints.
        # I noticed that deleting one of those relationships did not result in immediate deletion of the
        # relevant package-level metadata, but within a day they automatically updated, suggesting that there
        # is some infrequent background process that eventually updates the package-level metadata.
        # Since groups and tags are strictly lists, it seems more convenient sometimes to use 'add' mode,
        # to add a new element to the list.
        if mode == 'add':
            return value
        # Slicing (rather than indexing) keeps an empty string from raising IndexError.
        if value[:1] in ('[', '{'):
            return json.loads(value)  # We need to convert '[]' to [] (a proper empty list)
        return value
    if parameter in ['datastore_active', 'private', 'isopen']:
        # bool(value) would not work here: any non-empty string, including 'False', is truthy.
        if value in ['False', 'false']:
            return False
        if value in ['True', 'true']:
            return True
        raise ValueError(f"guess_parameter_type doesn't know what to do for parameter == {parameter} and value == {value}.")
    if parameter in ['position']:  # We shouldn't be messing with these without at least some more effort: 'num_resources', 'num_tags'
        return int(value)
    return value
def act_on_parameter(entity, entity_type, mode, parameter, parameter_value):
    """Get, set, add to, or delete one parameter of a single entity.

    Args:
        entity: The package or resource dict to act on. Every mode other
            than a plain 'get' reads entity['id'].
        entity_type: 'dataset' or 'resource'.
        mode: One of 'get', 'add', 'add_view', 'set', or 'delete'.
        parameter: The field name. The form 'extras:<key>' addresses a single
            key inside a dataset's "extras".
        parameter_value: The value to apply. For 'set' on a dataset's
            'groups' this is either a '|'-delimited string of groups or a
            list of groups; for 'add_view' it is the view type.

    Returns:
        'get': the current value (None when `parameter` contains more than
        one ':'). 'add'/'set' on groups: the value re-read through
        get_package_parameter. 'set' on an extras key: the value read back
        from the package returned by the update. 'set' on a resource:
        `parameter_value` as passed in. 'add_view': whatever add_view
        returns. 'delete': None.

    Raises:
        ValueError: For an unknown mode or entity type, or for a
            parameter/value combination that a mode does not support.
        AssertionError: For 'delete' on a non-dataset entity or a parameter
            with more than one ':', and for 'set' on a two-part dataset
            parameter whose first part is not 'extras'.
    """
    if mode == 'get':
        if ':' not in parameter:
            return entity[parameter]
        # Handle sub-parameters (for "extras")
        params = parameter.split(':')
        if len(params) == 2:
            first = entity[params[0]]
            if params[0] == "extras":
                return get_value_from_extras(extras=first, key=params[1], default=None)
            raise ValueError(f'act_on_parameter is not yet designed to handle parameters like {parameter}')
        # A parameter with more than one ':' is not handled and yields None.
        return None

    if mode == 'add':
        if entity_type != 'dataset':
            raise ValueError(f'act_on_parameter is not yet designed to add to entities of type {entity_type}')
        if parameter != 'groups':
            raise ValueError(f'act_on_parameter is not yet designed to add dataset parameters like {parameter}')
        assign_package_to_group(site, entity, entity['id'], parameter_value, API_key)
        return get_package_parameter(site, entity['id'], parameter=parameter, API_key=API_key)

    if mode == 'add_view':
        if entity_type != 'resource':
            raise ValueError(f'act_on_parameter is not yet designed to add views to entities of type {entity_type}')
        if parameter_value not in ['pdf_view', 'geo_view', 'text_view', 'webpage_view', 'image_view', 'datatables_view']:
            raise ValueError(f'act_on_parameter is not yet designed to add views like {parameter_value}')
        return add_view(entity['id'], parameter_value)

    if mode == 'set':
        print(f"(This is where the value of {parameter} should be set to {parameter_value}.)")
        if entity_type == 'resource':
            set_resource_parameters_to_values(site, entity['id'], [parameter], [parameter_value], API_key)
            return parameter_value
        if entity_type != 'dataset':
            raise ValueError(f'Unknown entity_type == {entity_type}')

        if ':' in parameter:  # Handle sub-parameters (for "extras")
            params = parameter.split(':')
            if len(params) != 2:
                raise ValueError(f'act_on_parameter is not yet designed to handle {len(params)} parameters like in {parameter}')
            assert params[0] == "extras"
            if params[0] != "extras":  # Only reachable when assertions are disabled (python -O).
                raise ValueError(f'act_on_parameter is not yet designed to handle parameters like {parameter}')
            package = set_package_extras_parameter_to_value(site, entity['id'], params[1], parameter_value, API_key)
            return get_value_from_extras(extras=package['extras'], key=params[1], default=None)

        if parameter != 'groups':
            raise ValueError(f'act_on_parameter is not yet designed to set dataset parameters like {parameter}')

        # Validate the value before clearing anything, so that a bad value
        # cannot leave the package stripped of its groups.
        if isinstance(parameter_value, str):
            group_values = parameter_value.split('|')
        elif isinstance(parameter_value, list):
            # guess_parameter_type turns a JSON string such as '[]' into a list.
            # NOTE(review): assumes assign_package_to_group accepts each list
            # element in the same form as a '|'-separated name; confirm in gadgets.
            group_values = parameter_value
        else:
            raise ValueError(f'act_on_parameter cannot set groups from a value like {parameter_value!r}')
        package = entity
        package_id = package['id']
        clear_package_groups(site, package, package_id, API_key)
        for group_value in group_values:
            # Pass the individual group, not the whole '|'-delimited string
            # (the original passed parameter_value on every iteration).
            package = assign_package_to_group(site, package, package_id, group_value, API_key)
        return get_package_parameter(site, package_id, parameter=parameter, API_key=API_key)

    if mode == 'delete':
        if ':' not in parameter or parameter.split(':')[0] != 'extras':
            raise ValueError(f'Not programmed to handle deleting parameters like "{parameter}".')
        params = parameter.split(':')
        key = params[1]
        assert entity_type == 'dataset'  # Since resources don't have extras metadata.
        assert len(params) == 2
        assert params[0] == "extras"
        # Extras are iterated as a list of dicts, each with a 'key' entry; a
        # package without extras is treated as having none.
        extras_list = entity.get(params[0], [])
        new_extras_list = [d for d in extras_list if d['key'] != key]
        set_package_parameters_to_values(site, entity['id'], ['extras'], [new_extras_list], API_key)
        return None

    raise ValueError(f'Unknown mode == {mode}')
def multiplex_with_functional_selection(mode, entity_type, parameter, parameter_value, dataset_filter, resource_filter):
    """Apply act_on_parameter to every dataset or resource selected by the filters.

    Fetches the full package list (with resources) from the CKAN site, filters
    by dataset and resource with the passed filter functions, and then, based
    on `mode`, either selects the corresponding parameter or acts on it with
    `parameter_value`. When `parameter` is None the matching entity is
    pretty-printed and collected as is.

    Args:
        mode: Passed through to act_on_parameter.
        entity_type: 'dataset' to act on packages, 'resource' to act on the
            resources of the packages accepted by `dataset_filter`.
        parameter: The field to act on, or None to just print the entities.
        parameter_value: The value handed to act_on_parameter.
        dataset_filter: Predicate taking a package dict.
        resource_filter: Predicate taking a resource dict (used only when
            entity_type is 'resource').

    Returns:
        A list with one dict per matched entity, with the keys
        'parameter_val', 'dataset' or 'resource', 'name' and 'id'.

    Raises:
        AssertionError: If `entity_type` is neither 'dataset' nor 'resource'
            (checked while iterating, so only when there are packages).
    """
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    try:
        packages = ckan.action.current_package_list_with_resources(limit=999999)
    except Exception:
        # Retry once after a short pause. Catching Exception (not a bare
        # except) lets KeyboardInterrupt/SystemExit propagate immediately.
        time.sleep(0.01)
        packages = ckan.action.current_package_list_with_resources(limit=999999)

    collected = []
    for dataset in packages:
        if entity_type == 'dataset':
            # Operate on the dataset level
            if dataset_filter(dataset):
                if parameter is None:
                    pprint(dataset)
                    after_param = dataset
                else:
                    after_param = act_on_parameter(dataset, entity_type, mode, parameter, parameter_value)
                collected.append({'parameter_val': after_param, 'dataset': dataset, 'name': dataset['title'], 'id': dataset['id']})
        elif entity_type == 'resource':
            # Honor the dataset selection as well; previously the dataset
            # filter was ignored here, so a resource filter such as 'all'
            # reached the resources of every dataset on the site.
            if not dataset_filter(dataset):
                continue
            # Find all matching resources
            for resource in dataset['resources']:
                if resource_filter(resource):
                    if parameter is None:
                        pprint(resource)
                        after_param = resource
                    else:
                        after_param = act_on_parameter(resource, entity_type, mode, parameter, parameter_value)
                    collected.append({'parameter_val': after_param, 'resource': resource, 'name': resource['name'], 'id': resource['id']})
        else:
            assert entity_type in ['dataset', 'resource']

    # Sort by name for a stable report; a missing (None) name sorts as '' so
    # that it cannot break the comparison with string names.
    for c in sorted(collected, key=lambda d: d['name'] or ''):
        print(f"{c['name']} ({c['id']}){'' if parameter is None else '[' + parameter + ']'}: {c['parameter_val']}")
    # Note that every mode other than 'set' is reported as 'Got'.
    print(f"{'Set' if mode == 'set' else 'Got'} parameters for {len(collected)} {entity_type}{'s' if len(collected) != 1 else ''}.")
    return collected
def is_uuid(s):
    """Return True if `s` is a lowercase, hyphenated 8-4-4-4-12 hexadecimal ID.

    None is accepted and yields False. The whole string must match:
    re.fullmatch is used because '$' with re.search would also accept a
    trailing newline.
    """
    if s is None:
        return False
    return re.fullmatch(r'[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}', s) is not None
def construct_function(pattern, entity_type):
    """Build a predicate that tells whether an entity dict matches `pattern`.

    Args:
        pattern: A UUID (compared for equality with the entity's 'id'),
            'all' or None (matches everything), or a regular expression that
            is searched for in the dataset's 'title' or the resource's 'name'.
        entity_type: 'dataset' selects the 'title' field for regex matching;
            anything else selects 'name'.

    Returns:
        A one-argument function returning True or False for an entity dict.

    Raises:
        re.error: If `pattern` is not a valid regular expression. It is
            compiled here, so a bad pattern fails before any entity is visited.
    """
    if is_uuid(pattern):
        return lambda x: x['id'] == pattern
    if pattern in ['all', None]:
        return lambda x: True
    # Compile once instead of on every call of the predicate.
    regex = re.compile(pattern)
    # This needs to be generalized so it can handle cases where either the title or the name is given for datasets.
    field = 'title' if entity_type == 'dataset' else 'name'
    # A field that is present but None is treated as an empty string, so that
    # an unnamed entity is simply not matched instead of raising TypeError.
    return lambda x: regex.search(x[field] or '') is not None
# In principle, we might want to select on other metadata values.
def multi(mode, parameter, parameter_value, dataset_selector, resource_selector, tag_selector):
    """Validate the selection arguments and run the requested operation.

    Args:
        mode: The operation, passed through to the worker functions.
        parameter: The metadata field to act on, or None to just print the
            selected entities. It is dataset-level when `resource_selector`
            is None and resource-level otherwise.
        parameter_value: The raw value; it is coerced with
            guess_parameter_type when `parameter` is given.
        dataset_selector: 'all'/None, a regex, or a package ID.
        resource_selector: None for dataset-level work; otherwise 'all', a
            regex, or a resource ID.
        tag_selector: 'all'/None, or a tag name. When a tag name is given,
            datasets are selected by tag and `dataset_selector` is not used.

    Raises:
        AssertionError: If `parameter` is not a known field for the level.
        ValueError: If both a resource selector and a tag selector are given
            (this combination is not supported and used to fail with an
            UnboundLocalError).
    """
    if resource_selector is None:  # It's a dataset metadata field.
        assert (parameter in ['id', 'title', 'name', 'geographic_unit', 'owner_org', 'maintainer',
                              'tags', 'relationships_as_object', 'access_level_comment',
                              'frequency_publishing', 'maintainer_email', 'num_tags',
                              'metadata_created', 'group', 'metadata_modified', 'author',
                              'author_email', 'state', 'version', 'department', 'license_id',
                              'type', 'resources', 'num_resources', 'data_steward_name', 'data_steward_email',
                              'frequency_data_change', 'private', 'groups',
                              'creator_user_id', 'relationships_as_subject', 'data_notes',
                              'isopen', 'url', 'notes', 'license_title',
                              'temporal_coverage', 'related_documents', 'license_url',
                              'organization', 'revision_id', 'extras', None]) or re.match('extras:', parameter), \
            f"{parameter} is not a recognized dataset-level parameter."
    else:
        assert parameter in ['id', 'cache_last_updated', 'package_id', 'webstore_last_updated',
                             'datastore_active', 'size', 'state', 'hash',
                             'description', 'format', 'last_modified', 'url_type',
                             'mimetype', 'cache_url', 'name', 'created', 'url',
                             'webstore_url', 'mimetype_inner', 'position',
                             'revision_id', 'resource_type', None] + ['view_type'], \
            f"{parameter} is not a recognized resource-level parameter."
    # [ ] Which fields have non-string values (and would need to be cast)?

    if tag_selector == 'all':
        tag_selector = None

    # Reject the unsupported combination up front, before any filter is built
    # or any request is made.
    if resource_selector is not None and tag_selector is not None:
        raise ValueError("Selecting resources (--resource) within tagged datasets (--tag) is not supported.")

    dataset_filter = construct_function(dataset_selector, 'dataset')
    resource_filter = construct_function(resource_selector, 'resource')

    if resource_selector is not None:
        entity_type = 'resource'
    else:
        entity_type = 'dataset'
        if tag_selector is not None:
            # Make a dataset_filter based on the tag (this replaces the
            # filter built from dataset_selector).
            dataset_filter = lambda x: any(t['name'] == tag_selector for t in x['tags'])

    if parameter is not None:
        parameter_value = guess_parameter_type(parameter, parameter_value, mode)
    multiplex_with_functional_selection(mode, entity_type, parameter, parameter_value, dataset_filter, resource_filter)
# The package (dataset) parameters you can fetch from the WPRDC with this
# script are the ones enumerated in the dataset-level validation list in multi().
# A full command-line specification would be like
# > multiplex.py change resources "DASH Data Guide" url <new url>
# > multichange.py resources "DASH Data Guide" url <new url>
# It would also be nice to be able to change all the resources (or resources matching a regex) in one dataset.
# > multiplex.py set dataset all resource "DASH Data Guide" url <new url>
# > multiplex.py set dataset <package_id> resource all url <new url>
# > multiplex.py set dataset (all|regex|package_id) resource (all|regex|resource_id) <parameter> <parameter value>
# > multiplex.py (set|get) <parameter> <parameter value> --dataset (all|regex|package_id) --resource (all|regex|resource_id)
# Command-line entry point: build the parser, read the arguments, and hand
# them to multi(). This runs at module level (there is no __main__ guard).
parser = argparse.ArgumentParser(
    description='Select dataset packages/resources to set/get parameters on',
)
# A positional argument without nargs is always required, so the default
# given here never takes effect.
parser.add_argument(
    'mode',
    default='get',
    choices=['set', 'get', 'add', 'delete', 'add_view'],
    help='Either "set" or "get" or "add" (or "delete" for extras keys) or "add_view".',
)
# All of the options below are optional and default to None.
parser.add_argument(
    '--parameter',
    help='The parameter of interest (resource-level if the --resource parameter is given, else dataset-level). [Use "view_type" with add_view.]',
)
parser.add_argument(
    '--value',
    dest='parameter_value',
    help='The parameter value to set the parameter to (resource-level if the --resource parameter is given, else dataset-level)',
)
parser.add_argument(
    '--dataset',
    dest='dataset_selector',
    help='(all|<search term to match>|<package ID or name>)',
)
parser.add_argument(
    '--resource',
    dest='resource_selector',
    help='(all|<search term to match>|<resource ID>)',
)
parser.add_argument(
    '--tag',
    dest='tag_selector',
    help='(all|<search term to match>)',
)  # We could add support for tag IDs.
args = parser.parse_args()
multi(
    mode=args.mode,
    parameter=args.parameter,
    parameter_value=args.parameter_value,
    dataset_selector=args.dataset_selector,
    resource_selector=args.resource_selector,
    tag_selector=args.tag_selector,
)