Skip to content

Commit fa1adf2

Browse files
authored
Updates to the expand-data-corpus script (#612)
Signed-off-by: Govind Kamat <govkamat@amazon.com>
1 parent e79599f commit fa1adf2

File tree

2 files changed

+58
-11
lines changed

2 files changed

+58
-11
lines changed

scripts/expand-data-corpus.py

+20-11
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,9 @@
1919

2020
help_msg = """
2121
22-
NOTE: This is a beta feature. The user model, interface and options
23-
are subject to change.
24-
2522
This tool is intended for the purpose of expanding the size of the
26-
data corpus associated an OSB workload. Currently, this capability is
27-
implemented only for the http_logs workload.
23+
data corpus associated with an OSB workload. Currently, this capability
24+
is implemented only for the http_logs workload.
2825
2926
TLDR: to generate a 100 GB corpus and then run a test against it:
3027
@@ -106,6 +103,12 @@
106103
def handler(signum, frame):
    """Terminate the process with exit status 1.

    The (signum, frame) signature matches the callback form used by
    ``signal.signal``; presumably this is registered as a signal handler
    elsewhere in the file -- confirm against the registration site.
    Both arguments are ignored.
    """
    raise SystemExit(1)
108105

106+
107+
def error_exit(script_name, message):
    """Report a fatal error on stderr and terminate with exit status 1.

    The line written to stderr has the form ``<script_name>: <message>``
    followed by a newline.  This function never returns; it always raises
    ``SystemExit(1)``.
    """
    sys.stderr.write(f'{script_name}: {message}' + '\n')
    raise SystemExit(1)
110+
111+
109112
class DocGenerator:
110113

111114
def __init__(self,
@@ -150,7 +153,8 @@ def error(self, message):
150153
self.usage_msg()
151154

152155

153-
def generate_docs(workload: str,
156+
def generate_docs(script_name: str,
157+
workload: str,
154158
repository: str,
155159
input_file: str,
156160
output_file_suffix: str,
@@ -165,12 +169,17 @@ def generate_docs(workload: str,
165169
#
166170
config = configparser.ConfigParser()
167171
benchmark_home = os.environ.get('BENCHMARK_HOME') or os.environ['HOME']
168-
config.read(benchmark_home + '/.benchmark/benchmark.ini')
172+
benchmark_ini = benchmark_home + '/.benchmark/benchmark.ini'
173+
if not os.path.isfile(benchmark_ini):
174+
error_exit(script_name, f"could not find OSB config file {benchmark_ini}, run a workload first to create it")
175+
config.read(benchmark_ini)
169176

170177
root_dir = config['node']['root.dir']
171178
workload_dir= root_dir + '/workloads/' + repository + '/' + workload
172179
data_dir = config['benchmarks']['local.dataset.cache'] + '/' + workload
173180

181+
if not os.path.exists(data_dir):
182+
error_exit(script_name, f"workload data directory {data_dir} does not exist, run the appropriate workload first to create it")
174183
output_file = data_dir + '/documents-' + output_file_suffix + '.json'
175184
if '/' not in input_file:
176185
input_file = data_dir + '/' + input_file
@@ -274,8 +283,6 @@ def main(args: list) -> None:
274283
output_file_suffix = args.output_file_suffix
275284
n_docs = args.number_of_docs
276285
corpus_size = args.corpus_size
277-
interval = args.interval if args.interval is not None else \
278-
corpus_size * -2
279286
start_timestamp = args.start_timestamp
280287
batch_size = args.batch_size
281288

@@ -286,12 +293,14 @@ def main(args: list) -> None:
286293
elif not n_docs and not corpus_size:
287294
parser.usage_msg(script_name +
288295
": must specify number of documents or corpus size")
289-
296+
interval = args.interval if args.interval is not None else \
297+
corpus_size * -2
290298
if workload != 'http_logs':
291299
parser.usage_msg(script_name +
292300
': only the "http_logs" workload is currently supported')
293301

294-
generate_docs(workload,
302+
generate_docs(script_name,
303+
workload,
295304
repository,
296305
input_file,
297306
output_file_suffix,

tests/scripts_test.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# The OpenSearch Contributors require contributions made to
4+
# this file be licensed under the Apache-2.0 license or a
5+
# compatible open source license.
6+
# Modifications Copyright OpenSearch Contributors. See
7+
# GitHub history for details.
8+
# Licensed to Elasticsearch B.V. under one or more contributor
9+
# license agreements. See the NOTICE file distributed with
10+
# this work for additional information regarding copyright
11+
# ownership. Elasticsearch B.V. licenses this file to you under
12+
# the Apache License, Version 2.0 (the "License"); you may
13+
# not use this file except in compliance with the License.
14+
# You may obtain a copy of the License at
15+
#
16+
# http://www.apache.org/licenses/LICENSE-2.0
17+
#
18+
# Unless required by applicable law or agreed to in writing,
19+
# software distributed under the License is distributed on an
20+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
21+
# KIND, either express or implied. See the License for the
22+
# specific language governing permissions and limitations
23+
# under the License.
24+
25+
import os
import pathlib
import subprocess
import sys
import tempfile
from unittest import TestCase
29+
30+
class ScriptsTests(TestCase):
    """Smoke tests for the standalone helper scripts under ``scripts/``."""

    def test_scr(self):
        """expand-data-corpus.py reports a missing OSB config file on stderr.

        The script is run with ``-c 10`` and BENCHMARK_HOME pointing at an
        empty directory, so ``<BENCHMARK_HOME>/.benchmark/benchmark.ini``
        cannot exist and the script is expected to fail with the
        "could not find OSB config file" diagnostic.
        """
        script = pathlib.Path(__file__).parent.parent / "scripts" / "expand-data-corpus.py"

        # A fresh temporary directory guarantees there is no stale
        # .benchmark/benchmark.ini (a shared /tmp might contain one) and
        # keeps the test portable to platforms without /tmp.
        with tempfile.TemporaryDirectory() as benchmark_home:
            # Pass the override through the child's environment only, so the
            # test process's own os.environ is left untouched for other tests.
            env = dict(os.environ, BENCHMARK_HOME=benchmark_home)
            # Run through the current interpreter instead of executing the
            # file directly: this does not depend on the executable bit or
            # on which python the script's shebang resolves to.
            result = subprocess.run([sys.executable, str(script), "-c", "10"],
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                    env=env, check=False, timeout=60)

        stderr = result.stderr.decode('UTF-8')
        self.assertNotEqual(result.returncode, 0)
        # assertIn prints both operands on failure, unlike assertTrue(a in b).
        self.assertIn("could not find OSB config file", stderr)

0 commit comments

Comments
 (0)