Skip to content

Commit 822f31d

Browse files
committed
Added support for a data corpora source-url field to specify a file directly.
Signed-off-by: Govind Kamat <govkamat@amazon.com>
1 parent 38cf15e commit 822f31d

File tree

4 files changed

+127
-38
lines changed

4 files changed

+127
-38
lines changed

it/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import random
3030
import socket
3131
import time
32+
import datetime
3233

3334
import pytest
3435

@@ -87,7 +88,7 @@ def osbenchmark(cfg, command_line):
8788
These commands may have different CLI options than test_execution.
8889
"""
8990
cmd = osbenchmark_command_line_for(cfg, command_line)
90-
print("\nInvoking OSB:", cmd)
91+
print(f'\n{datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")} Invoking OSB: {cmd}')
9192
err, retcode = process.run_subprocess_with_stderr(cmd)
9293
if retcode != 0:
9394
print(err)

osbenchmark/workload/loader.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -464,20 +464,23 @@ def __init__(self, offline, test_mode):
464464
self.test_mode = test_mode
465465
self.logger = logging.getLogger(__name__)
466466

467-
def download(self, base_url, target_path, size_in_bytes):
467+
def download(self, base_url, source_url, target_path, size_in_bytes):
468468
file_name = os.path.basename(target_path)
469469

470470
if not base_url:
471471
raise exceptions.DataError("Cannot download data because no base URL is provided.")
472472
if self.offline:
473473
raise exceptions.SystemSetupError(f"Cannot find [{target_path}]. Please disable offline mode and retry.")
474474

475-
if base_url.endswith("/"):
476-
separator = ""
475+
if source_url:
476+
data_url = source_url
477477
else:
478-
separator = "/"
479-
# join manually as `urllib.parse.urljoin` does not work with S3 or GS URL schemes.
480-
data_url = f"{base_url}{separator}{file_name}"
478+
if base_url.endswith("/"):
479+
separator = ""
480+
else:
481+
separator = "/"
482+
# join manually as `urllib.parse.urljoin` does not work with S3 or GS URL schemes.
483+
data_url = f"{base_url}{separator}{file_name}"
481484
try:
482485
io.ensure_dir(os.path.dirname(target_path))
483486
if size_in_bytes:
@@ -573,7 +576,7 @@ def prepare_document_set(self, document_set, data_root):
573576
raise exceptions.BenchmarkAssertionError(f"Workload {self.workload_name} specifies documents but no corpus")
574577

575578
try:
576-
self.downloader.download(document_set.base_url, target_path, expected_size)
579+
self.downloader.download(document_set.base_url, document_set.source_url, target_path, expected_size)
577580
except exceptions.DataError as e:
578581
if e.message == "Cannot download data because no base URL is provided." and \
579582
self.is_locally_available(target_path):
@@ -1489,6 +1492,7 @@ def _create_corpora(self, corpora_specs, indices, data_streams):
14891492
source_format = self._r(doc_spec, "source-format", mandatory=False, default_value=default_source_format)
14901493

14911494
if source_format in workload.Documents.SUPPORTED_SOURCE_FORMAT:
1495+
source_url = self._r(doc_spec, "source-url", mandatory=False)
14921496
docs = self._r(doc_spec, "source-file")
14931497
if io.is_archive(docs):
14941498
document_archive = docs
@@ -1541,6 +1545,7 @@ def _create_corpora(self, corpora_specs, indices, data_streams):
15411545
document_file=document_file,
15421546
document_archive=document_archive,
15431547
base_url=base_url,
1548+
source_url=source_url,
15441549
includes_action_and_meta_data=includes_action_and_meta_data,
15451550
number_of_documents=num_docs,
15461551
compressed_size_in_bytes=compressed_bytes,

osbenchmark/workload/workload.py

+13-11
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ class Documents:
190190
SOURCE_FORMAT_BIG_ANN = "big-ann"
191191
SUPPORTED_SOURCE_FORMAT = [SOURCE_FORMAT_BULK, SOURCE_FORMAT_HDF5, SOURCE_FORMAT_BIG_ANN]
192192

193-
def __init__(self, source_format, document_file=None, document_archive=None, base_url=None,
193+
def __init__(self, source_format, document_file=None, document_archive=None, base_url=None, source_url=None,
194194
includes_action_and_meta_data=False,
195195
number_of_documents=0, compressed_size_in_bytes=0, uncompressed_size_in_bytes=0, target_index=None,
196196
target_data_stream=None, target_type=None, meta_data=None):
@@ -201,7 +201,8 @@ def __init__(self, source_format, document_file=None, document_archive=None, bas
201201
just need a mapping but no documents)
202202
:param document_archive: The file name of the compressed benchmark document name on the remote server. Optional (e.g. for
203203
percolation we just need a mapping but no documents)
204-
:param base_url: The URL from which to load data if they are not available locally. Optional.
204+
:param base_url: The URL from which to load data if they are not available locally. Excludes the file or object name. Optional.
205+
:param source_url: The full URL to the file or object from which to load data if not available locally. Optional.
205206
:param includes_action_and_meta_data: True, if the source file already includes the action and meta-data line. False, if it only
206207
contains documents.
207208
:param number_of_documents: The number of documents
@@ -224,6 +225,7 @@ def __init__(self, source_format, document_file=None, document_archive=None, bas
224225
self.document_file = document_file
225226
self.document_archive = document_archive
226227
self.base_url = base_url
228+
self.source_url = source_url
227229
self.includes_action_and_meta_data = includes_action_and_meta_data
228230
self._number_of_documents = number_of_documents
229231
self._compressed_size_in_bytes = compressed_size_in_bytes
@@ -295,18 +297,18 @@ def __repr__(self):
295297

296298
def __hash__(self):
297299
return hash(self.source_format) ^ hash(self.document_file) ^ hash(self.document_archive) ^ hash(self.base_url) ^ \
298-
hash(self.includes_action_and_meta_data) ^ hash(self.number_of_documents) ^ hash(self.compressed_size_in_bytes) ^ \
299-
hash(self.uncompressed_size_in_bytes) ^ hash(self.target_index) ^ hash(self.target_data_stream) ^ hash(self.target_type) ^ \
300-
hash(frozenset(self.meta_data.items()))
300+
hash(self.source_url) ^ hash(self.includes_action_and_meta_data) ^ hash(self.number_of_documents) ^ \
301+
hash(self.compressed_size_in_bytes) ^ hash(self.uncompressed_size_in_bytes) ^ hash(self.target_index) ^ \
302+
hash(self.target_data_stream) ^ hash(self.target_type) ^ hash(frozenset(self.meta_data.items()))
301303

302304
def __eq__(self, othr):
303305
return (isinstance(othr, type(self)) and
304-
(self.source_format, self.document_file, self.document_archive, self.base_url, self.includes_action_and_meta_data,
305-
self.number_of_documents, self.compressed_size_in_bytes, self.uncompressed_size_in_bytes,
306-
self.target_type, self.target_data_stream, self.target_type, self.meta_data) ==
307-
(othr.source_format, othr.document_file, othr.document_archive, othr.base_url, othr.includes_action_and_meta_data,
308-
othr.number_of_documents, othr.compressed_size_in_bytes, othr.uncompressed_size_in_bytes,
309-
othr.target_type, othr.target_data_stream, othr.target_type, othr.meta_data))
306+
(self.source_format, self.document_file, self.document_archive, self.base_url, self.source_url,
307+
self.includes_action_and_meta_data, self.number_of_documents, self.compressed_size_in_bytes,
308+
self.uncompressed_size_in_bytes, self.target_index, self.target_data_stream, self.target_type, self.meta_data) ==
309+
(othr.source_format, othr.document_file, othr.document_archive, othr.base_url, othr.source_url,
310+
othr.includes_action_and_meta_data, othr.number_of_documents, othr.compressed_size_in_bytes,
311+
othr.uncompressed_size_in_bytes, othr.target_index, othr.target_data_stream, othr.target_type, othr.meta_data))
310312

311313

312314
class DocumentCorpus:

0 commit comments

Comments
 (0)