@@ -19,12 +19,9 @@
 
 help_msg = """
 
-NOTE: This is a beta feature. The user model, interface and options
-are subject to change.
-
 This tool is intended for the purpose of expanding the size of the
-data corpus associated an OSB workload. Currently, this capability is
-implemented only for the http_logs workload.
+data corpus associated with an OSB workload. Currently, this capability
+is implemented only for the http_logs workload.
 
 TLDR: to generate a 100 GB corpus and then run a test against it:
 
@@ -106,6 +103,12 @@
 def handler(signum, frame):
     sys.exit(1)
 
+
+def error_exit(script_name, message):
+    print(f'{script_name}: {message}', file=sys.stderr)
+    sys.exit(1)
+
+
 class DocGenerator:
 
     def __init__(self,
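
The new error_exit helper gives the script a single fail-fast path: a message on stderr prefixed with the script name, then a non-zero exit. A minimal standalone sketch of what a caller sees; the script name and message here are illustrative, not taken from the diff:

import sys

def error_exit(script_name, message):
    print(f'{script_name}: {message}', file=sys.stderr)
    sys.exit(1)

# Writes "expand-data-corpus.py: no config found" to stderr and
# terminates with exit status 1, so shell callers can test $?.
error_exit('expand-data-corpus.py', 'no config found')
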
@@ -150,7 +153,8 @@ def error(self, message):
         self.usage_msg()
 
 
-def generate_docs(workload: str,
+def generate_docs(script_name: str,
+                  workload: str,
                   repository: str,
                   input_file: str,
                   output_file_suffix: str,
@@ -165,12 +169,17 @@ def generate_docs(workload: str,
     #
     config = configparser.ConfigParser()
     benchmark_home = os.environ.get('BENCHMARK_HOME') or os.environ['HOME']
-    config.read(benchmark_home + '/.benchmark/benchmark.ini')
+    benchmark_ini = benchmark_home + '/.benchmark/benchmark.ini'
+    if not os.path.isfile(benchmark_ini):
+        error_exit(script_name, f"could not find OSB config file {benchmark_ini}, run a workload first to create it")
+    config.read(benchmark_ini)
 
     root_dir = config['node']['root.dir']
     workload_dir = root_dir + '/workloads/' + repository + '/' + workload
     data_dir = config['benchmarks']['local.dataset.cache'] + '/' + workload
 
+    if not os.path.exists(data_dir):
+        error_exit(script_name, f"workload data directory {data_dir} does not exist, run the appropriate workload first to create it")
     output_file = data_dir + '/documents-' + output_file_suffix + '.json'
     if '/' not in input_file:
         input_file = data_dir + '/' + input_file
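
Isolated from the function above, the config preflight amounts to the following runnable sketch. The guard and the two keys it protects (node/root.dir and benchmarks/local.dataset.cache) come straight from the diff; the hard-coded script name is an assumption for illustration:

import configparser
import os
import sys

def error_exit(script_name, message):
    print(f'{script_name}: {message}', file=sys.stderr)
    sys.exit(1)

script_name = 'expand-data-corpus.py'  # assumed; normally derived from sys.argv[0]
benchmark_home = os.environ.get('BENCHMARK_HOME') or os.environ['HOME']
benchmark_ini = benchmark_home + '/.benchmark/benchmark.ini'

# Fail with a clear message instead of letting configparser silently
# return an empty config and raise KeyError later on config['node'].
if not os.path.isfile(benchmark_ini):
    error_exit(script_name,
               f"could not find OSB config file {benchmark_ini}, "
               "run a workload first to create it")

config = configparser.ConfigParser()
config.read(benchmark_ini)
root_dir = config['node']['root.dir']
data_cache = config['benchmarks']['local.dataset.cache']
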
@@ -274,8 +283,6 @@ def main(args: list) -> None:
     output_file_suffix = args.output_file_suffix
     n_docs = args.number_of_docs
     corpus_size = args.corpus_size
-    interval = args.interval if args.interval is not None else \
-        corpus_size * -2
     start_timestamp = args.start_timestamp
     batch_size = args.batch_size
 
@@ -286,12 +293,14 @@ def main(args: list) -> None:
     elif not n_docs and not corpus_size:
         parser.usage_msg(script_name +
                          ": must specify number of documents or corpus size")
-
+    interval = args.interval if args.interval is not None else \
+        corpus_size * -2
     if workload != 'http_logs':
         parser.usage_msg(script_name +
                          ': only the "http_logs" workload is currently supported')
 
-    generate_docs(workload,
+    generate_docs(script_name,
+                  workload,
                   repository,
                   input_file,
                   output_file_suffix,
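
Relocating the interval default below the argument validation is a behavioral fix, not a cosmetic one: in the old position, corpus_size * -2 executed before the "must specify number of documents or corpus size" check, so running the script with neither option produced a TypeError traceback instead of the usage message. A minimal repro of the old ordering; the variable names stand in for the parsed arguments:

# Both values are None when the corresponding options are absent,
# as implied by the `not corpus_size` and `is not None` checks above.
corpus_size = None
cli_interval = None
try:
    interval = cli_interval if cli_interval is not None else corpus_size * -2
except TypeError as err:
    print(err)  # unsupported operand type(s) for *: 'NoneType' and 'int'
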