@@ -190,7 +190,7 @@ class Documents:
190
190
SOURCE_FORMAT_BIG_ANN = "big-ann"
191
191
SUPPORTED_SOURCE_FORMAT = [SOURCE_FORMAT_BULK , SOURCE_FORMAT_HDF5 , SOURCE_FORMAT_BIG_ANN ]
192
192
193
- def __init__ (self , source_format , document_file = None , document_archive = None , base_url = None ,
193
+ def __init__ (self , source_format , document_file = None , document_archive = None , base_url = None , source_url = None ,
194
194
includes_action_and_meta_data = False ,
195
195
number_of_documents = 0 , compressed_size_in_bytes = 0 , uncompressed_size_in_bytes = 0 , target_index = None ,
196
196
target_data_stream = None , target_type = None , meta_data = None ):
@@ -201,7 +201,8 @@ def __init__(self, source_format, document_file=None, document_archive=None, bas
201
201
just need a mapping but no documents)
202
202
:param document_archive: The file name of the compressed benchmark documents on the remote server. Optional (e.g. for
203
203
percolation we just need a mapping but no documents)
204
- :param base_url: The URL from which to load data if they are not available locally. Optional.
204
+ :param base_url: The URL from which to load data if they are not available locally. Excludes the file or object name. Optional.
205
+ :param source_url: The full URL to the file or object from which to load data if not available locally. Optional.
205
206
:param includes_action_and_meta_data: True, if the source file already includes the action and meta-data line. False, if it only
206
207
contains documents.
207
208
:param number_of_documents: The number of documents
@@ -224,6 +225,7 @@ def __init__(self, source_format, document_file=None, document_archive=None, bas
224
225
self .document_file = document_file
225
226
self .document_archive = document_archive
226
227
self .base_url = base_url
228
+ self .source_url = source_url
227
229
self .includes_action_and_meta_data = includes_action_and_meta_data
228
230
self ._number_of_documents = number_of_documents
229
231
self ._compressed_size_in_bytes = compressed_size_in_bytes
@@ -295,18 +297,18 @@ def __repr__(self):
295
297
296
298
def __hash__ (self ):
297
299
return hash (self .source_format ) ^ hash (self .document_file ) ^ hash (self .document_archive ) ^ hash (self .base_url ) ^ \
298
- hash (self .includes_action_and_meta_data ) ^ hash (self .number_of_documents ) ^ hash (self .compressed_size_in_bytes ) ^ \
299
- hash (self .uncompressed_size_in_bytes ) ^ hash (self .target_index ) ^ hash (self .target_data_stream ) ^ hash ( self . target_type ) ^ \
300
- hash (frozenset (self .meta_data .items ()))
300
+ hash (self .source_url ) ^ hash (self .includes_action_and_meta_data ) ^ hash (self .number_of_documents ) ^ \
301
+ hash (self .compressed_size_in_bytes ) ^ hash (self .uncompressed_size_in_bytes ) ^ hash (self .target_index ) ^ \
302
+ hash (self . target_data_stream ) ^ hash ( self . target_type ) ^ hash ( frozenset (self .meta_data .items ()))
301
303
302
304
def __eq__ (self , othr ):
303
305
return (isinstance (othr , type (self )) and
304
- (self .source_format , self .document_file , self .document_archive , self .base_url , self .includes_action_and_meta_data ,
305
- self .number_of_documents , self .compressed_size_in_bytes , self .uncompressed_size_in_bytes ,
306
- self .target_type , self .target_data_stream , self .target_type , self .meta_data ) ==
307
- (othr .source_format , othr .document_file , othr .document_archive , othr .base_url , othr . includes_action_and_meta_data ,
308
- othr .number_of_documents , othr .compressed_size_in_bytes , othr .uncompressed_size_in_bytes ,
309
- othr .target_type , othr .target_data_stream , othr .target_type , othr .meta_data ))
306
+ (self .source_format , self .document_file , self .document_archive , self .base_url , self .source_url ,
307
+ self .includes_action_and_meta_data , self .number_of_documents , self .compressed_size_in_bytes ,
308
+ self .uncompressed_size_in_bytes , self . target_type , self .target_data_stream , self .target_type , self .meta_data ) ==
309
+ (othr .source_format , othr .document_file , othr .document_archive , othr .base_url , self . source_url ,
310
+ othr .includes_action_and_meta_data , othr .number_of_documents , othr .compressed_size_in_bytes ,
311
+ othr .uncompressed_size_in_bytes , othr . target_type , othr .target_data_stream , othr .target_type , othr .meta_data ))
310
312
311
313
312
314
class DocumentCorpus :
0 commit comments