From 721e43c5c70e87d27c7b60a1e46fe130d46c2cdf Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 22 Jan 2025 17:35:15 +0100 Subject: [PATCH 01/25] add abstract method for streaming data entities --- rocrate/model/data_entity.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/rocrate/model/data_entity.py b/rocrate/model/data_entity.py index 22e2f01e..96f5a360 100644 --- a/rocrate/model/data_entity.py +++ b/rocrate/model/data_entity.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from typing import Generator # Copyright 2019-2024 The University of Manchester, UK # Copyright 2020-2024 Vlaams Instituut voor Biotechnologie (VIB), BE @@ -28,3 +29,14 @@ class DataEntity(Entity): def write(self, base_path): pass + + def stream(self) -> Generator[tuple[str, bytes], None, None]: + """ Stream the data from the source. Each chunk of the content is yielded as a tuple + containing the name of the destination file relative to the crate and the chunk of data. + The destination file name is required because a DataEntity can be a file or a + collection of files (Dataset) and the caller needs to know to which file a chunk belongs. + For collection of files, the caller can assume that files are streamed one after another, + meaning once the destination name changes, a file can be closed and the next one can be + opened. 
+ """ + raise NotImplementedError From 939abf402390d137877ed21d856ffd77ea73f21e Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 22 Jan 2025 17:35:38 +0100 Subject: [PATCH 02/25] implement streaming for file --- rocrate/model/file.py | 120 +++++++++++++++++++++++++++++++----------- 1 file changed, 89 insertions(+), 31 deletions(-) diff --git a/rocrate/model/file.py b/rocrate/model/file.py index 8cd95286..829c4627 100644 --- a/rocrate/model/file.py +++ b/rocrate/model/file.py @@ -1,5 +1,4 @@ #!/usr/bin/env python - # Copyright 2019-2024 The University of Manchester, UK # Copyright 2020-2024 Vlaams Instituut voor Biotechnologie (VIB), BE # Copyright 2020-2024 Barcelona Supercomputing Center (BSC), ES @@ -22,6 +21,8 @@ from pathlib import Path import requests +from typing import Generator + import shutil import urllib.request import warnings @@ -40,41 +41,98 @@ def _empty(self): } return val + def _has_writeable_stream(self): + if isinstance(self.source, (BytesIO, StringIO)): + return True + elif is_url(str(self.source)): + return self.fetch_remote + else: + return self.source is not None + + def _write_from_stream(self, out_file_path): + if not self._has_writeable_stream(): + # is this does not correspond to a writeable stream (i.e. it is a url but fetch_remote is False), + # we still want to consume the stream to consume file headers, run the size calculation, etc. 
+ all(self.stream()) + return + + out_file_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_file_path, 'wb') as out_file: + for _, chunk in self.stream(): + out_file.write(chunk) + + def _copy_file(self, path, out_file_path): + out_file_path.parent.mkdir(parents=True, exist_ok=True) + if not out_file_path.exists() or not out_file_path.samefile(path): + shutil.copy(path, out_file_path) + if self.record_size: + self._jsonld['contentSize'] = str(out_file_path.stat().st_size) + def write(self, base_path): out_file_path = Path(base_path) / self.id - if isinstance(self.source, (BytesIO, StringIO)): - out_file_path.parent.mkdir(parents=True, exist_ok=True) - mode = 'w' + ('b' if isinstance(self.source, BytesIO) else 't') - kw = {} if isinstance(self.source, BytesIO) else {'encoding': 'utf-8'} - with open(out_file_path, mode, **kw) as out_file: - content = self.source.getvalue() - out_file.write(content) + if isinstance(self.source, (BytesIO, StringIO)) or is_url(str(self.source)): + self._write_from_stream(out_file_path) + elif self.source is None: + # Allows to record a File entity whose @id does not exist, see #73 + warnings.warn(f"No source for {self.id}") + else: + self._copy_file(self.source, out_file_path) + + def _stream_from_stream(self, stream): + size = 0 + read = stream.read() + if isinstance(self.source, StringIO): + read = read.encode('utf-8') + while len(read) > 0: + yield self.id, read + size += len(read) + read = stream.read() + if isinstance(self.source, StringIO): + read = read.encode('utf-8') + + if self.record_size: + self._jsonld['contentSize'] = str(size) + + def _stream_from_url(self, url) -> Generator[tuple[str, bytes], None, None]: + if self.fetch_remote or self.validate_url: + if self.validate_url: + if url.startswith("http"): + with requests.head(url) as response: + self._jsonld.update({ + 'contentSize': response.headers.get('Content-Length'), + 'encodingFormat': response.headers.get('Content-Type') + }) + if not self.fetch_remote: 
+ date_published = response.headers.get("Last-Modified", iso_now()) + self._jsonld['sdDatePublished'] = date_published + if self.fetch_remote: + size = 0 + self._jsonld['contentUrl'] = str(url) + with urllib.request.urlopen(url) as response: + chunk_size = 8192 + while chunk := response.read(chunk_size): + yield self.id, chunk + size += len(chunk) + if self.record_size: - self._jsonld['contentSize'] = str(len(content)) + self._jsonld['contentSize'] = str(size) + + def _stream_from_file(self, path): + size = 0 + with open(path, 'rb') as f: + for chunk in f: + yield self.id, chunk + size += len(chunk) + if self.record_size: + self._jsonld['contentSize'] = str(size) + + def stream(self) -> Generator[tuple[str, bytes], None, None]: + if isinstance(self.source, (BytesIO, StringIO)): + yield from self._stream_from_stream(self.source) elif is_url(str(self.source)): - if self.fetch_remote or self.validate_url: - if self.validate_url: - if self.source.startswith("http"): - with requests.head(self.source) as response: - self._jsonld.update({ - 'contentSize': response.headers.get('Content-Length'), - 'encodingFormat': response.headers.get('Content-Type') - }) - if not self.fetch_remote: - date_published = response.headers.get("Last-Modified", iso_now()) - self._jsonld['sdDatePublished'] = date_published - if self.fetch_remote: - out_file_path.parent.mkdir(parents=True, exist_ok=True) - urllib.request.urlretrieve(self.source, out_file_path) - self._jsonld['contentUrl'] = str(self.source) - if self.record_size: - self._jsonld['contentSize'] = str(out_file_path.stat().st_size) + yield from self._stream_from_url(self.source) elif self.source is None: # Allows to record a File entity whose @id does not exist, see #73 warnings.warn(f"No source for {self.id}") else: - out_file_path.parent.mkdir(parents=True, exist_ok=True) - if not out_file_path.exists() or not out_file_path.samefile(self.source): - shutil.copy(self.source, out_file_path) - if self.record_size: - 
self._jsonld['contentSize'] = str(out_file_path.stat().st_size) + yield from self._stream_from_file(self.source) From a23da9cb2ff91578b12406e8e91cfe098aac9a6d Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 22 Jan 2025 18:53:08 +0100 Subject: [PATCH 03/25] implement streaming for metadata --- rocrate/model/metadata.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/rocrate/model/metadata.py b/rocrate/model/metadata.py index aa7cf801..81146eec 100644 --- a/rocrate/model/metadata.py +++ b/rocrate/model/metadata.py @@ -22,6 +22,7 @@ import json from pathlib import Path +from typing import Generator from .file import File from .dataset import Dataset @@ -74,11 +75,16 @@ def generate(self): context = context[0] return {'@context': context, '@graph': graph} - def write(self, base_path): - write_path = Path(base_path) / self.id - as_jsonld = self.generate() - with open(write_path, 'w', encoding='utf-8') as outfile: - json.dump(as_jsonld, outfile, indent=4, sort_keys=True) + def stream(self) -> Generator[tuple[str, bytes], None, None]: + content = self.generate() + yield self.id, str.encode(json.dumps(content, indent=4, sort_keys=True), encoding='utf-8') + + def _has_writeable_stream(self): + return True + + def write(self, dest_base): + write_path = Path(dest_base) / self.id + super()._write_from_stream(write_path) @property def root(self) -> Dataset: From f34dad87d58e1017e6e40fe18a407b1d57713e37 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 22 Jan 2025 18:53:19 +0100 Subject: [PATCH 04/25] implement streaming for preview --- rocrate/model/preview.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/rocrate/model/preview.py b/rocrate/model/preview.py index 12abf817..7b98522d 100644 --- a/rocrate/model/preview.py +++ b/rocrate/model/preview.py @@ -22,6 +22,7 @@ import os from pathlib import Path +from typing import Generator from jinja2 import Template from .file import File @@ -90,11 
+91,15 @@ def is_object_list(a): out_html = src.render(crate=self.crate, context=context_entities, data=data_entities) return out_html - def write(self, dest_base): + def stream(self) -> Generator[tuple[str, bytes], None, None]: if self.source: - super().write(dest_base) + yield from super().stream() else: - write_path = Path(dest_base) / self.id - out_html = self.generate_html() - with open(write_path, 'w', encoding='utf-8') as outfile: - outfile.write(out_html) + yield self.id, str.encode(self.generate_html(), encoding='utf-8') + + def _has_writeable_stream(self): + return True + + def write(self, dest_base): + write_path = Path(dest_base) / self.id + super()._write_from_stream(write_path) From f909fbb2b85fee901dd0b7290044dd2e4b0d8cb9 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 22 Jan 2025 20:00:12 +0100 Subject: [PATCH 05/25] implement streaming for dataset --- rocrate/model/dataset.py | 104 ++++++++++++++++++++++++++++----------- 1 file changed, 75 insertions(+), 29 deletions(-) diff --git a/rocrate/model/dataset.py b/rocrate/model/dataset.py index 0e0e52ff..64062cc1 100644 --- a/rocrate/model/dataset.py +++ b/rocrate/model/dataset.py @@ -22,8 +22,9 @@ import errno import os -import shutil +import warnings from pathlib import Path +from typing import Generator from urllib.request import urlopen from .file_or_dir import FileOrDir @@ -43,37 +44,82 @@ def _empty(self): def format_id(self, identifier): return identifier.rstrip("/") + "/" + def _write_from_url(self, base_path): + if self.validate_url and not self.fetch_remote: + with urlopen(self.source) as _: + self._jsonld['sdDatePublished'] = iso_now() + if self.fetch_remote: + out_file_path, out_file = None, None + for rel_path, chunk in self._stream_folder_from_url(): + path = base_path / rel_path + if path != out_file_path: + if out_file: + out_file.close() + out_file_path = Path(path) + out_file_path.parent.mkdir(parents=True, exist_ok=True) + out_file = open(out_file_path, 'wb') + 
out_file.write(chunk) + if out_file: + out_file.close() + + def _copy_folder(self, base_path): + abs_out_path = base_path / self.id + if self.source is None: + abs_out_path.mkdir(parents=True, exist_ok=True) + else: + if not Path(self.source).exists(): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) + ) + abs_out_path.mkdir(parents=True, exist_ok=True) + if not self.crate.source: + self.crate._copy_unlisted(self.source, abs_out_path) + def write(self, base_path): - out_path = Path(base_path) / self.id + base_path = Path(base_path) if is_url(str(self.source)): - if self.validate_url and not self.fetch_remote: + self._write_from_url(base_path) + else: + self._copy_folder(base_path) + + def stream(self) -> Generator[tuple[str, bytes], None, None]: + if is_url(str(self.source)): + yield from self._stream_folder_from_url() + else: + yield from self._stream_folder_from_path() + + def _stream_folder_from_path(self) -> Generator[tuple[str, bytes], None, None]: + if not Path(self.source).exists(): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) + ) + if not self.crate.source: + for root, _, files in os.walk(self.source): + root = Path(root) + for name in files: + source = root / name + dest = source.relative_to(self.source.parent) + with open(source, 'rb') as f: + yield dest, f.read() + + def _stream_folder_from_url(self) -> Generator[tuple[str, bytes], None, None]: + if not self.fetch_remote: + if self.validate_url: with urlopen(self.source) as _: self._jsonld['sdDatePublished'] = iso_now() - if self.fetch_remote: - self.__get_parts(out_path) else: - if self.source is None: - out_path.mkdir(parents=True, exist_ok=True) - else: - if not Path(self.source).exists(): - raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) - ) - out_path.mkdir(parents=True, exist_ok=True) - if not self.crate.source: - self.crate._copy_unlisted(self.source, out_path) + base = 
self.source.rstrip("/") + for entry in self._jsonld.get("hasPart", []): + try: + part = entry["@id"] + if is_url(part) or part.startswith("/"): + raise RuntimeError(f"'{self.source}': part '{part}' is not a relative path") + part_uri = f"{base}/{part}" + rel_out_path = Path(self.id) / part - def __get_parts(self, out_path): - out_path.mkdir(parents=True, exist_ok=True) - base = self.source.rstrip("/") - for entry in self._jsonld.get("hasPart", []): - try: - part = entry["@id"] - except KeyError: - continue - if is_url(part) or part.startswith("/"): - raise RuntimeError(f"'{self.source}': part '{part}' is not a relative path") - part_uri = f"{base}/{part}" - part_out_path = out_path / part - with urlopen(part_uri) as r, open(part_out_path, 'wb') as f: - shutil.copyfileobj(r, f) + with urlopen(part_uri) as response: + chunk_size = 8192 + while chunk := response.read(chunk_size): + yield rel_out_path, chunk + except KeyError: + warnings.warn(f"'hasPart' entry in {self.id} is missing '@id'. 
Skipping.") From 883c439de9ff18fb80ba7b29b0e07abf53ae734e Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 22 Jan 2025 20:51:37 +0100 Subject: [PATCH 06/25] fix: dataset should not stream if root entity --- rocrate/model/dataset.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/rocrate/model/dataset.py b/rocrate/model/dataset.py index 64062cc1..e5828995 100644 --- a/rocrate/model/dataset.py +++ b/rocrate/model/dataset.py @@ -83,13 +83,15 @@ def write(self, base_path): self._copy_folder(base_path) def stream(self) -> Generator[tuple[str, bytes], None, None]: - if is_url(str(self.source)): + if self.source is None: + return + elif is_url(str(self.source)): yield from self._stream_folder_from_url() else: yield from self._stream_folder_from_path() def _stream_folder_from_path(self) -> Generator[tuple[str, bytes], None, None]: - if not Path(self.source).exists(): + if not Path(str(self.source)).exists(): raise FileNotFoundError( errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) ) @@ -100,7 +102,7 @@ def _stream_folder_from_path(self) -> Generator[tuple[str, bytes], None, None]: source = root / name dest = source.relative_to(self.source.parent) with open(source, 'rb') as f: - yield dest, f.read() + yield str(dest), f.read() def _stream_folder_from_url(self) -> Generator[tuple[str, bytes], None, None]: if not self.fetch_remote: @@ -120,6 +122,6 @@ def _stream_folder_from_url(self) -> Generator[tuple[str, bytes], None, None]: with urlopen(part_uri) as response: chunk_size = 8192 while chunk := response.read(chunk_size): - yield rel_out_path, chunk + yield str(rel_out_path), chunk except KeyError: warnings.warn(f"'hasPart' entry in {self.id} is missing '@id'. 
Skipping.") From 77eee32a8bc9d6f11769c7cfc532c4f52ef7d24a Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 22 Jan 2025 20:52:13 +0100 Subject: [PATCH 07/25] feat: add method to stream zip --- rocrate/memory_buffer.py | 23 +++++++++++++++++++++++ rocrate/rocrate.py | 21 +++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 rocrate/memory_buffer.py diff --git a/rocrate/memory_buffer.py b/rocrate/memory_buffer.py new file mode 100644 index 00000000..8d51f085 --- /dev/null +++ b/rocrate/memory_buffer.py @@ -0,0 +1,23 @@ +from io import BytesIO + + +class MemoryBuffer(BytesIO): + """ Memory buffer provides a writable stream that can be read back. + Automatically resets after reading. """ + def __init__(self): + super().__init__() + self._buffer = b"" + + def writable(self): + return True + + def write(self, b): + if self.closed: + raise RuntimeError("Stream was closed before writing!") + self._buffer += b + return len(b) + + def read(self, **kwargs): + chunk = self._buffer + self._buffer = b"" + return chunk diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index b489c948..8833774c 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -31,6 +31,7 @@ from pathlib import Path from urllib.parse import urljoin +from .memory_buffer import MemoryBuffer from .model import ( ComputationalWorkflow, ComputerLanguage, @@ -479,6 +480,26 @@ def write_zip(self, out_path): shutil.rmtree(tmp_dir) return archive + def stream_zip(self): + """ Create a stream of bytes representing the RO-Crate as a ZIP file. 
""" + buffer = MemoryBuffer() + with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive: + entities = self.data_entities + self.default_entities + for writeable_entity in entities: #self.data_entities + self.default_entities: + current_file_path, current_out_file = None, None + for path, chunk in writeable_entity.stream(): + if path != current_file_path: + if current_out_file: + current_out_file.close() + current_file_path = path + current_out_file = archive.open(path, mode='w') + current_out_file.write(chunk) + yield buffer.read() + if current_out_file: + current_out_file.close() + yield buffer.read() + buffer.close() + def add_workflow( self, source=None, dest_path=None, fetch_remote=False, validate_url=False, properties=None, main=False, lang="cwl", lang_version=None, gen_cwl=False, cls=ComputationalWorkflow, From 08eeb027228bd00a80266926b7244192e204d1ad Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 22 Jan 2025 21:56:51 +0100 Subject: [PATCH 08/25] chore: remove memory buffer class in favor of BytesIO --- rocrate/memory_buffer.py | 23 ----------------------- rocrate/rocrate.py | 9 +++++---- 2 files changed, 5 insertions(+), 27 deletions(-) delete mode 100644 rocrate/memory_buffer.py diff --git a/rocrate/memory_buffer.py b/rocrate/memory_buffer.py deleted file mode 100644 index 8d51f085..00000000 --- a/rocrate/memory_buffer.py +++ /dev/null @@ -1,23 +0,0 @@ -from io import BytesIO - - -class MemoryBuffer(BytesIO): - """ Memory buffer provides a writable stream that can be read back. - Automatically resets after reading. 
""" - def __init__(self): - super().__init__() - self._buffer = b"" - - def writable(self): - return True - - def write(self, b): - if self.closed: - raise RuntimeError("Stream was closed before writing!") - self._buffer += b - return len(b) - - def read(self, **kwargs): - chunk = self._buffer - self._buffer = b"" - return chunk diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 8833774c..491aac1b 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -28,10 +28,10 @@ import warnings from collections import OrderedDict +from io import BytesIO from pathlib import Path from urllib.parse import urljoin -from .memory_buffer import MemoryBuffer from .model import ( ComputationalWorkflow, ComputerLanguage, @@ -482,10 +482,9 @@ def write_zip(self, out_path): def stream_zip(self): """ Create a stream of bytes representing the RO-Crate as a ZIP file. """ - buffer = MemoryBuffer() + buffer = BytesIO() with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive: - entities = self.data_entities + self.default_entities - for writeable_entity in entities: #self.data_entities + self.default_entities: + for writeable_entity in self.data_entities + self.default_entities: current_file_path, current_out_file = None, None for path, chunk in writeable_entity.stream(): if path != current_file_path: @@ -494,9 +493,11 @@ def stream_zip(self): current_file_path = path current_out_file = archive.open(path, mode='w') current_out_file.write(chunk) + buffer.seek(0) yield buffer.read() if current_out_file: current_out_file.close() + buffer.seek(0) yield buffer.read() buffer.close() From c034c666281fd3237f4f87a9906bb41c8afce1ab Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Fri, 24 Jan 2025 11:03:32 +0100 Subject: [PATCH 09/25] fix: files from datasets should also be streamed in chunks --- rocrate/model/dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rocrate/model/dataset.py b/rocrate/model/dataset.py index 
e5828995..c850f44b 100644 --- a/rocrate/model/dataset.py +++ b/rocrate/model/dataset.py @@ -100,9 +100,10 @@ def _stream_folder_from_path(self) -> Generator[tuple[str, bytes], None, None]: root = Path(root) for name in files: source = root / name - dest = source.relative_to(self.source.parent) + dest = source.relative_to(Path(self.source).parent) with open(source, 'rb') as f: - yield str(dest), f.read() + for chunk in f: + yield str(dest), chunk def _stream_folder_from_url(self) -> Generator[tuple[str, bytes], None, None]: if not self.fetch_remote: From 27457d8061dd745edb2ddcb95e7bb4cba3ed3837 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Fri, 24 Jan 2025 11:07:20 +0100 Subject: [PATCH 10/25] fix: zip stream repeats initial bytes --- rocrate/memory_buffer.py | 28 ++++++++++++++++++++++++++++ rocrate/rocrate.py | 7 +++---- 2 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 rocrate/memory_buffer.py diff --git a/rocrate/memory_buffer.py b/rocrate/memory_buffer.py new file mode 100644 index 00000000..1cbd03f2 --- /dev/null +++ b/rocrate/memory_buffer.py @@ -0,0 +1,28 @@ +from io import RawIOBase + + +class MemoryBuffer(RawIOBase): + """ + A buffer class that supports reading and writing binary data. + The buffer automatically resets upon reading to make sure all data is read only once. 
+ """ + + def __init__(self): + self._buffer = b'' + + def write(self, data): + if self.closed: + raise ValueError('write to closed file') + self._buffer += data + return len(data) + + def read(self, size=-1): + if self.closed: + raise ValueError('read from closed file') + if size < 0: + data = self._buffer + self._buffer = b'' + else: + data = self._buffer[:size] + self._buffer = self._buffer[size:] + return data diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 491aac1b..32ecc764 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -28,10 +28,10 @@ import warnings from collections import OrderedDict -from io import BytesIO from pathlib import Path from urllib.parse import urljoin +from .memory_buffer import MemoryBuffer from .model import ( ComputationalWorkflow, ComputerLanguage, @@ -482,7 +482,7 @@ def write_zip(self, out_path): def stream_zip(self): """ Create a stream of bytes representing the RO-Crate as a ZIP file. """ - buffer = BytesIO() + buffer = MemoryBuffer() with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive: for writeable_entity in self.data_entities + self.default_entities: current_file_path, current_out_file = None, None @@ -493,11 +493,10 @@ def stream_zip(self): current_file_path = path current_out_file = archive.open(path, mode='w') current_out_file.write(chunk) - buffer.seek(0) yield buffer.read() if current_out_file: current_out_file.close() - buffer.seek(0) + yield buffer.read() buffer.close() From 0a1c4a16992da8d6e2b5b10df0d560f52abe8154 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Fri, 24 Jan 2025 11:09:25 +0100 Subject: [PATCH 11/25] fix: make sure zip stream buffer is always properly closed --- rocrate/rocrate.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 32ecc764..be42cb31 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -482,23 +482,22 @@ def write_zip(self, out_path): 
def stream_zip(self): """ Create a stream of bytes representing the RO-Crate as a ZIP file. """ - buffer = MemoryBuffer() - with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive: - for writeable_entity in self.data_entities + self.default_entities: - current_file_path, current_out_file = None, None - for path, chunk in writeable_entity.stream(): - if path != current_file_path: - if current_out_file: - current_out_file.close() - current_file_path = path - current_out_file = archive.open(path, mode='w') - current_out_file.write(chunk) - yield buffer.read() - if current_out_file: - current_out_file.close() - - yield buffer.read() - buffer.close() + with MemoryBuffer() as buffer: + with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive: + for writeable_entity in self.data_entities + self.default_entities: + current_file_path, current_out_file = None, None + for path, chunk in writeable_entity.stream(): + if path != current_file_path: + if current_out_file: + current_out_file.close() + current_file_path = path + current_out_file = archive.open(path, mode='w') + current_out_file.write(chunk) + yield buffer.read() + if current_out_file: + current_out_file.close() + + yield buffer.read() def add_workflow( self, source=None, dest_path=None, fetch_remote=False, validate_url=False, properties=None, From e229ccd1469e90130bf021586b92d36a8660ebd0 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Fri, 24 Jan 2025 18:12:15 +0100 Subject: [PATCH 12/25] feat: zip stream now yields predictable chunk sizes --- rocrate/memory_buffer.py | 3 +++ rocrate/model/data_entity.py | 2 +- rocrate/model/dataset.py | 13 ++++++------- rocrate/model/file.py | 13 ++++++------- rocrate/model/metadata.py | 2 +- rocrate/model/preview.py | 2 +- rocrate/rocrate.py | 10 ++++++---- 7 files changed, 24 insertions(+), 21 deletions(-) diff --git a/rocrate/memory_buffer.py b/rocrate/memory_buffer.py index 1cbd03f2..68fe53ed 100644 --- 
a/rocrate/memory_buffer.py +++ b/rocrate/memory_buffer.py @@ -26,3 +26,6 @@ def read(self, size=-1): data = self._buffer[:size] self._buffer = self._buffer[size:] return data + + def __len__(self): + return len(self._buffer) diff --git a/rocrate/model/data_entity.py b/rocrate/model/data_entity.py index 96f5a360..4441875d 100644 --- a/rocrate/model/data_entity.py +++ b/rocrate/model/data_entity.py @@ -30,7 +30,7 @@ class DataEntity(Entity): def write(self, base_path): pass - def stream(self) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: """ Stream the data from the source. Each chunk of the content is yielded as a tuple containing the name of the destination file relative to the crate and the chunk of data. The destination file name is required because a DataEntity can be a file or a diff --git a/rocrate/model/dataset.py b/rocrate/model/dataset.py index c850f44b..0d047ddb 100644 --- a/rocrate/model/dataset.py +++ b/rocrate/model/dataset.py @@ -82,15 +82,15 @@ def write(self, base_path): else: self._copy_folder(base_path) - def stream(self) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: if self.source is None: return elif is_url(str(self.source)): - yield from self._stream_folder_from_url() + yield from self._stream_folder_from_url(chunk_size) else: - yield from self._stream_folder_from_path() + yield from self._stream_folder_from_path(chunk_size) - def _stream_folder_from_path(self) -> Generator[tuple[str, bytes], None, None]: + def _stream_folder_from_path(self, chunk_size=8192): if not Path(str(self.source)).exists(): raise FileNotFoundError( errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) @@ -102,10 +102,10 @@ def _stream_folder_from_path(self) -> Generator[tuple[str, bytes], None, None]: source = root / name dest = source.relative_to(Path(self.source).parent) with open(source, 'rb') as f: - for 
chunk in f: + while chunk := f.read(chunk_size): yield str(dest), chunk - def _stream_folder_from_url(self) -> Generator[tuple[str, bytes], None, None]: + def _stream_folder_from_url(self, chunk_size=8192): if not self.fetch_remote: if self.validate_url: with urlopen(self.source) as _: @@ -121,7 +121,6 @@ def _stream_folder_from_url(self) -> Generator[tuple[str, bytes], None, None]: rel_out_path = Path(self.id) / part with urlopen(part_uri) as response: - chunk_size = 8192 while chunk := response.read(chunk_size): yield str(rel_out_path), chunk except KeyError: diff --git a/rocrate/model/file.py b/rocrate/model/file.py index 829c4627..72b2c808 100644 --- a/rocrate/model/file.py +++ b/rocrate/model/file.py @@ -93,7 +93,7 @@ def _stream_from_stream(self, stream): if self.record_size: self._jsonld['contentSize'] = str(size) - def _stream_from_url(self, url) -> Generator[tuple[str, bytes], None, None]: + def _stream_from_url(self, url, chunk_size=8192): if self.fetch_remote or self.validate_url: if self.validate_url: if url.startswith("http"): @@ -109,7 +109,6 @@ def _stream_from_url(self, url) -> Generator[tuple[str, bytes], None, None]: size = 0 self._jsonld['contentUrl'] = str(url) with urllib.request.urlopen(url) as response: - chunk_size = 8192 while chunk := response.read(chunk_size): yield self.id, chunk size += len(chunk) @@ -117,22 +116,22 @@ def _stream_from_url(self, url) -> Generator[tuple[str, bytes], None, None]: if self.record_size: self._jsonld['contentSize'] = str(size) - def _stream_from_file(self, path): + def _stream_from_file(self, path, chunk_size=8192): size = 0 with open(path, 'rb') as f: - for chunk in f: + while chunk := f.read(chunk_size): yield self.id, chunk size += len(chunk) if self.record_size: self._jsonld['contentSize'] = str(size) - def stream(self) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: if isinstance(self.source, (BytesIO, StringIO)): yield from 
self._stream_from_stream(self.source) elif is_url(str(self.source)): - yield from self._stream_from_url(self.source) + yield from self._stream_from_url(self.source, chunk_size) elif self.source is None: # Allows to record a File entity whose @id does not exist, see #73 warnings.warn(f"No source for {self.id}") else: - yield from self._stream_from_file(self.source) + yield from self._stream_from_file(self.source, chunk_size) diff --git a/rocrate/model/metadata.py b/rocrate/model/metadata.py index 81146eec..2523a045 100644 --- a/rocrate/model/metadata.py +++ b/rocrate/model/metadata.py @@ -75,7 +75,7 @@ def generate(self): context = context[0] return {'@context': context, '@graph': graph} - def stream(self) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: content = self.generate() yield self.id, str.encode(json.dumps(content, indent=4, sort_keys=True), encoding='utf-8') diff --git a/rocrate/model/preview.py b/rocrate/model/preview.py index 7b98522d..da5d4866 100644 --- a/rocrate/model/preview.py +++ b/rocrate/model/preview.py @@ -91,7 +91,7 @@ def is_object_list(a): out_html = src.render(crate=self.crate, context=context_entities, data=data_entities) return out_html - def stream(self) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: if self.source: yield from super().stream() else: diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index be42cb31..53c814f6 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -480,24 +480,26 @@ def write_zip(self, out_path): shutil.rmtree(tmp_dir) return archive - def stream_zip(self): + def stream_zip(self, chunk_size=8192): """ Create a stream of bytes representing the RO-Crate as a ZIP file. 
""" with MemoryBuffer() as buffer: with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive: for writeable_entity in self.data_entities + self.default_entities: current_file_path, current_out_file = None, None - for path, chunk in writeable_entity.stream(): + for path, chunk in writeable_entity.stream(chunk_size=chunk_size): if path != current_file_path: if current_out_file: current_out_file.close() current_file_path = path current_out_file = archive.open(path, mode='w') current_out_file.write(chunk) - yield buffer.read() + while len(buffer) >= chunk_size: + yield buffer.read(chunk_size) if current_out_file: current_out_file.close() - yield buffer.read() + while chunk := buffer.read(chunk_size): + yield chunk def add_workflow( self, source=None, dest_path=None, fetch_remote=False, validate_url=False, properties=None, From 37f24303fd6e6e8245f0493484ef2836595aa7c5 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Fri, 24 Jan 2025 18:41:28 +0100 Subject: [PATCH 13/25] chore: remove type hints for consistency --- rocrate/model/data_entity.py | 3 +-- rocrate/model/dataset.py | 3 +-- rocrate/model/file.py | 5 ++--- rocrate/model/metadata.py | 3 +-- rocrate/model/preview.py | 3 +-- 5 files changed, 6 insertions(+), 11 deletions(-) diff --git a/rocrate/model/data_entity.py b/rocrate/model/data_entity.py index 4441875d..573f08a6 100644 --- a/rocrate/model/data_entity.py +++ b/rocrate/model/data_entity.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from typing import Generator # Copyright 2019-2024 The University of Manchester, UK # Copyright 2020-2024 Vlaams Instituut voor Biotechnologie (VIB), BE @@ -30,7 +29,7 @@ class DataEntity(Entity): def write(self, base_path): pass - def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192): """ Stream the data from the source. 
Each chunk of the content is yielded as a tuple containing the name of the destination file relative to the crate and the chunk of data. The destination file name is required because a DataEntity can be a file or a diff --git a/rocrate/model/dataset.py b/rocrate/model/dataset.py index 0d047ddb..b2aaaf91 100644 --- a/rocrate/model/dataset.py +++ b/rocrate/model/dataset.py @@ -24,7 +24,6 @@ import os import warnings from pathlib import Path -from typing import Generator from urllib.request import urlopen from .file_or_dir import FileOrDir @@ -82,7 +81,7 @@ def write(self, base_path): else: self._copy_folder(base_path) - def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192): if self.source is None: return elif is_url(str(self.source)): diff --git a/rocrate/model/file.py b/rocrate/model/file.py index 72b2c808..d612bd71 100644 --- a/rocrate/model/file.py +++ b/rocrate/model/file.py @@ -1,4 +1,5 @@ #!/usr/bin/env python + # Copyright 2019-2024 The University of Manchester, UK # Copyright 2020-2024 Vlaams Instituut voor Biotechnologie (VIB), BE # Copyright 2020-2024 Barcelona Supercomputing Center (BSC), ES @@ -21,8 +22,6 @@ from pathlib import Path import requests -from typing import Generator - import shutil import urllib.request import warnings @@ -125,7 +124,7 @@ def _stream_from_file(self, path, chunk_size=8192): if self.record_size: self._jsonld['contentSize'] = str(size) - def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192): if isinstance(self.source, (BytesIO, StringIO)): yield from self._stream_from_stream(self.source) elif is_url(str(self.source)): diff --git a/rocrate/model/metadata.py b/rocrate/model/metadata.py index 2523a045..2703e8f0 100644 --- a/rocrate/model/metadata.py +++ b/rocrate/model/metadata.py @@ -22,7 +22,6 @@ import json from pathlib import Path -from typing import Generator from .file import File from .dataset import 
Dataset @@ -75,7 +74,7 @@ def generate(self): context = context[0] return {'@context': context, '@graph': graph} - def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192): content = self.generate() yield self.id, str.encode(json.dumps(content, indent=4, sort_keys=True), encoding='utf-8') diff --git a/rocrate/model/preview.py b/rocrate/model/preview.py index da5d4866..9f4d6e97 100644 --- a/rocrate/model/preview.py +++ b/rocrate/model/preview.py @@ -22,7 +22,6 @@ import os from pathlib import Path -from typing import Generator from jinja2 import Template from .file import File @@ -91,7 +90,7 @@ def is_object_list(a): out_html = src.render(crate=self.crate, context=context_entities, data=data_entities) return out_html - def stream(self, chunk_size=8192) -> Generator[tuple[str, bytes], None, None]: + def stream(self, chunk_size=8192): if self.source: yield from super().stream() else: From bf1b990f9acc6014a0702db37014ad8c50fa35da Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Sun, 26 Jan 2025 13:47:57 +0100 Subject: [PATCH 14/25] feat: include unlisted files in zip stream --- rocrate/rocrate.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 53c814f6..1c3ad459 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -480,8 +480,11 @@ def write_zip(self, out_path): shutil.rmtree(tmp_dir) return archive - def stream_zip(self, chunk_size=8192): - """ Create a stream of bytes representing the RO-Crate as a ZIP file. """ + def stream_zip(self, chunk_size=8192, out_path=None): + """ Create a stream of bytes representing the RO-Crate as a ZIP file. + The out_path argument is used to exclude the file from the ZIP stream if the output is inside the crate folder + and can be omitted if the stream is not written into a file inside the crate dir. 
+ """ with MemoryBuffer() as buffer: with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive: for writeable_entity in self.data_entities + self.default_entities: @@ -498,6 +501,27 @@ def stream_zip(self, chunk_size=8192): if current_out_file: current_out_file.close() + # add additional unlisted files to stream + listed_files = [archived_file for archived_file in archive.namelist()] + for root, dirs, files in walk(str(self.source), exclude=self.exclude): + root = Path(root) + for name in dirs: + source = root / name + dest = source.relative_to(self.source) + dest.mkdir(parents=True, exist_ok=True) + for name in files: + source = root / name + rel = source.relative_to(self.source) + if not self.dereference(str(rel)) and not out_path.samefile(source): + dest = rel + if not str(dest) in listed_files: + with archive.open(str(dest), mode='w') as f: + with open(source, 'rb') as r: + while chunk := r.read(chunk_size): + f.write(chunk) + while len(buffer) >= chunk_size: + yield buffer.read(chunk_size) + while chunk := buffer.read(chunk_size): yield chunk From 3634e4b58546f61692f8448d755bd3bd7c3cc81a Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Fri, 24 Jan 2025 19:00:30 +0100 Subject: [PATCH 15/25] feat: write_zip now uses zip streaming --- rocrate/rocrate.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 1c3ad459..44b2b958 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -470,15 +470,10 @@ def write(self, base_path): def write_zip(self, out_path): out_path = Path(out_path) - if out_path.suffix == ".zip": - out_path = out_path.parent / out_path.stem - tmp_dir = tempfile.mkdtemp(prefix="rocrate_") - try: - self.write(tmp_dir) - archive = shutil.make_archive(out_path, "zip", tmp_dir) - finally: - shutil.rmtree(tmp_dir) - return archive + with open(out_path, "wb") as f: + for chunk in self.stream_zip(out_path=out_path): + f.write(chunk) + return out_path 
def stream_zip(self, chunk_size=8192, out_path=None): """ Create a stream of bytes representing the RO-Crate as a ZIP file. From 1a4253f1432ec92a0951a99e43afc0da16722fb7 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Mon, 27 Jan 2025 12:39:03 +0100 Subject: [PATCH 16/25] feat: add streaming example --- examples/fastapi/main.py | 45 +++++++++++++++++++++++++++++++ examples/fastapi/requirements.txt | 3 +++ 2 files changed, 48 insertions(+) create mode 100644 examples/fastapi/main.py create mode 100644 examples/fastapi/requirements.txt diff --git a/examples/fastapi/main.py b/examples/fastapi/main.py new file mode 100644 index 00000000..6c5acf0e --- /dev/null +++ b/examples/fastapi/main.py @@ -0,0 +1,45 @@ +""" +Streaming RO-Crates from a web server + +This example demonstrates how to create an RO-Crate on-the-fly +and stream the result to the client. +By using `stream_zip`, the RO-Crate is not written to disk and remote +data is only fetched on the fly. + +To run: `fastapi dev main.py`, then visit http://localhost:8000/crate +""" + +from fastapi import FastAPI +from fastapi.responses import StreamingResponse +from rocrate.rocrate import ROCrate +from io import StringIO + +app = FastAPI() + +@app.get("/crate") +async def get(): + crate = ROCrate() + + # Add a remote file + crate.add_file( + "https://raw.githubusercontent.com/ResearchObject/ro-crate-py/refs/heads/master/test/test-data/sample_file.txt", + fetch_remote=True + ) + + # Add a file containing a string to the crate + crate.add_file( + source=StringIO("Hello, World!"), + dest_path="test-data/hello.txt" + ) + + # Stream crate to client as a zip file + return StreamingResponse( + crate.stream_zip(), + media_type="application/rocrate+zip", + headers={ + "Content-Disposition": "attachment; filename=crate.zip", + } + ) + + + diff --git a/examples/fastapi/requirements.txt b/examples/fastapi/requirements.txt new file mode 100644 index 00000000..09eae111 --- /dev/null +++ b/examples/fastapi/requirements.txt @@ -0,0 
+1,3 @@ +../../ +fastapi +fastapi-cli From 2fc8fc842be4e0cce9bb7aa5ea3de363938f241b Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Mon, 27 Jan 2025 12:45:08 +0100 Subject: [PATCH 17/25] fix: flake8 --- examples/fastapi/main.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/fastapi/main.py b/examples/fastapi/main.py index 6c5acf0e..f8e99be0 100644 --- a/examples/fastapi/main.py +++ b/examples/fastapi/main.py @@ -16,6 +16,7 @@ app = FastAPI() + @app.get("/crate") async def get(): crate = ROCrate() @@ -40,6 +41,3 @@ async def get(): "Content-Disposition": "attachment; filename=crate.zip", } ) - - - From 781052f0c9fbe53ee522a3b5728cf2edf0038759 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Mon, 27 Jan 2025 16:52:55 +0100 Subject: [PATCH 18/25] fix: NPE when no out_path is given for straming --- rocrate/rocrate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 44b2b958..03e572d5 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -507,7 +507,7 @@ def stream_zip(self, chunk_size=8192, out_path=None): for name in files: source = root / name rel = source.relative_to(self.source) - if not self.dereference(str(rel)) and not out_path.samefile(source): + if not self.dereference(str(rel)) and (out_path and not out_path.samefile(source)): dest = rel if not str(dest) in listed_files: with archive.open(str(dest), mode='w') as f: From 0d81d7056eff08e49504c5cd57eef7f611feb99b Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Mon, 27 Jan 2025 16:56:40 +0100 Subject: [PATCH 19/25] feat: hide out_path parameter of streaming api in an internal wrapper --- rocrate/rocrate.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 03e572d5..be557ba0 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -471,11 +471,15 @@ def write(self, base_path): def write_zip(self, out_path): out_path = Path(out_path) with 
open(out_path, "wb") as f: - for chunk in self.stream_zip(out_path=out_path): + for chunk in self._stream_zip(out_path=out_path): f.write(chunk) return out_path - def stream_zip(self, chunk_size=8192, out_path=None): + def stream_zip(self, chunk_size=8192): + """ Create a stream of bytes representing the RO-Crate as a ZIP file. """ + yield from self._stream_zip(chunk_size=chunk_size) + + def _stream_zip(self, chunk_size=8192, out_path=None): """ Create a stream of bytes representing the RO-Crate as a ZIP file. The out_path argument is used to exclude the file from the ZIP stream if the output is inside the crate folder and can be omitted if the stream is not written into a file inside the crate dir. From c636220ea7339c70831902b82133b56fb32e086d Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Mon, 27 Jan 2025 17:13:19 +0100 Subject: [PATCH 20/25] feat: add test for streaming without write_zip --- test/test_write.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/test_write.py b/test/test_write.py index 94835dab..5161e38b 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -462,3 +462,18 @@ def test_http_header(tmpdir): assert "sdDatePublished" in props with requests.head(url) as response: assert props["sdDatePublished"] == response.headers.get("last-modified") + + +def test_stream(test_data_dir, tmpdir): + source = test_data_dir / "read_crate" + crate = ROCrate(source) + + out_path = tmpdir / 'ro_crate_out.zip' + with open(out_path, "wb") as out: + for chunk in crate.stream_zip(): + out.write(chunk) + + with zipfile.ZipFile(out_path, "r") as zf: + assert not zf.testzip() + for info in zf.infolist(): + assert info.file_size > 0 From 6b33a9066c425b4fb05409359e34a55e6e339db4 Mon Sep 17 00:00:00 2001 From: simleo Date: Wed, 29 Jan 2025 12:02:09 +0100 Subject: [PATCH 21/25] test for unlisted file presence when writing zip --- test/test_write.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/test_write.py 
b/test/test_write.py index 5161e38b..9ed885ec 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -395,6 +395,21 @@ def test_no_parts(tmpdir, helpers): assert "hasPart" not in json_entities["./"] +def test_write_zip_copy_unlisted(test_data_dir, tmpdir): + crate_dir = test_data_dir / 'ro-crate-galaxy-sortchangecase' + crate = ROCrate(crate_dir) + + zip_name = 'ro_crate_out.crate.zip' + zip_path = tmpdir / zip_name + crate.write_zip(zip_path) + out_path = tmpdir / 'ro_crate_out' + with zipfile.ZipFile(zip_path, "r") as zf: + zf.extractall(out_path) + + assert (out_path / "test" / "test1" / "input.bed").is_file() + assert (out_path / "test" / "test1" / "output_exp.bed").is_file() + + def test_no_zip_in_zip(test_data_dir, tmpdir): crate_dir = test_data_dir / 'ro-crate-galaxy-sortchangecase' crate = ROCrate(crate_dir) @@ -477,3 +492,10 @@ def test_stream(test_data_dir, tmpdir): assert not zf.testzip() for info in zf.infolist(): assert info.file_size > 0 + + extract_path = tmpdir / 'ro_crate_out' + with zipfile.ZipFile(out_path, "r") as zf: + zf.extractall(extract_path) + assert (extract_path / "ro-crate-metadata.jsonld").is_file() + assert (extract_path / "examples" / "README.txt").is_file() + assert (extract_path / "test" / "test-metadata.json").is_file() From 5d49dab9250a2f422d279f6246635ba35cb11247 Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Wed, 29 Jan 2025 15:49:37 +0100 Subject: [PATCH 22/25] fix+refactor: streaming should not ignore unlisted files --- rocrate/rocrate.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index be557ba0..8d4cb37b 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -503,23 +503,20 @@ def _stream_zip(self, chunk_size=8192, out_path=None): # add additional unlisted files to stream listed_files = [archived_file for archived_file in archive.namelist()] for root, dirs, files in walk(str(self.source), exclude=self.exclude): - root = 
Path(root) - for name in dirs: - source = root / name - dest = source.relative_to(self.source) - dest.mkdir(parents=True, exist_ok=True) for name in files: - source = root / name + source = Path(root) / name + + # ignore out_path to not include a zip in itself + if out_path and out_path.samefile(source): + continue + rel = source.relative_to(self.source) - if not self.dereference(str(rel)) and (out_path and not out_path.samefile(source)): - dest = rel - if not str(dest) in listed_files: - with archive.open(str(dest), mode='w') as f: - with open(source, 'rb') as r: - while chunk := r.read(chunk_size): - f.write(chunk) - while len(buffer) >= chunk_size: - yield buffer.read(chunk_size) + if not self.dereference(str(rel)) and not str(rel) in listed_files: + with archive.open(str(rel), mode='w') as out_file, open(source, 'rb') as in_file: + while chunk := in_file.read(chunk_size): + out_file.write(chunk) + while len(buffer) >= chunk_size: + yield buffer.read(chunk_size) while chunk := buffer.read(chunk_size): yield chunk From 2178b420eff8aff00f23a9ab15ca89d7a91fce29 Mon Sep 17 00:00:00 2001 From: simleo Date: Mon, 3 Feb 2025 15:06:58 +0100 Subject: [PATCH 23/25] adjustments for plain DataEntity and large files --- rocrate/model/data_entity.py | 2 +- rocrate/rocrate.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rocrate/model/data_entity.py b/rocrate/model/data_entity.py index 573f08a6..b1c0644b 100644 --- a/rocrate/model/data_entity.py +++ b/rocrate/model/data_entity.py @@ -38,4 +38,4 @@ def stream(self, chunk_size=8192): meaning once the destination name changes, a file can be closed and the next one can be openend. 
""" - raise NotImplementedError + yield from () diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 8d4cb37b..7c71d4f4 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -493,7 +493,7 @@ def _stream_zip(self, chunk_size=8192, out_path=None): if current_out_file: current_out_file.close() current_file_path = path - current_out_file = archive.open(path, mode='w') + current_out_file = archive.open(path, mode='w', force_zip64=True) current_out_file.write(chunk) while len(buffer) >= chunk_size: yield buffer.read(chunk_size) From 09bff64fa1cb8216aabb2ac49e0aed739463938d Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Mon, 3 Feb 2025 17:22:32 +0100 Subject: [PATCH 24/25] add author Daniel Bauer --- CITATION.cff | 5 ++++- rocrate/__init__.py | 1 + setup.py | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index b6bfb95f..e2e64b23 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,6 +1,9 @@ cff-version: 1.1.0 message: "Cite as" -authors: +author: + - family-names: Bauer + given-names: Daniel + orcid: https://orcid.org/0000-0001-9447-460X - family-names: Chadwick given-names: Eli orcid: https://orcid.org/0000-0002-0035-6475 diff --git a/rocrate/__init__.py b/rocrate/__init__.py index b4ef8599..33effd18 100644 --- a/rocrate/__init__.py +++ b/rocrate/__init__.py @@ -30,6 +30,7 @@ """ __author__ = ", ".join(( + 'Daniel Bauer', 'Eli Chadwick', 'Paul De Geest', 'Bert Droesbeke', diff --git a/setup.py b/setup.py index c4fc1236..66c6066e 100755 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ long_description_content_type='text/markdown', long_description=long_description, author=", ".join(( + 'Daniel Bauer', 'Eli Chadwick', 'Paul De Geest', 'Bert Droesbeke', From 04128fa77c04650e56bf0164be706fc82bd0536d Mon Sep 17 00:00:00 2001 From: Daniel Bauer Date: Mon, 3 Feb 2025 17:23:46 +0100 Subject: [PATCH 25/25] add copyright for Senckenberg, SGN --- README.md | 1 + examples/fastapi/main.py | 21 +++++++++++++++++++++ 
examples/read_test_metadata.py | 1 + rocrate/__init__.py | 2 ++ rocrate/cli.py | 1 + rocrate/memory_buffer.py | 21 +++++++++++++++++++++ rocrate/metadata.py | 1 + rocrate/model/__init__.py | 1 + rocrate/model/computationalworkflow.py | 1 + rocrate/model/computerlanguage.py | 1 + rocrate/model/contextentity.py | 1 + rocrate/model/creativework.py | 1 + rocrate/model/data_entity.py | 1 + rocrate/model/dataset.py | 1 + rocrate/model/entity.py | 1 + rocrate/model/file.py | 1 + rocrate/model/file_or_dir.py | 1 + rocrate/model/metadata.py | 1 + rocrate/model/person.py | 1 + rocrate/model/preview.py | 1 + rocrate/model/root_dataset.py | 1 + rocrate/model/softwareapplication.py | 1 + rocrate/model/testdefinition.py | 1 + rocrate/model/testinstance.py | 1 + rocrate/model/testservice.py | 1 + rocrate/model/testsuite.py | 1 + rocrate/rocrate.py | 1 + rocrate/utils.py | 1 + rocrate/vocabs.py | 1 + setup.py | 1 + test/conftest.py | 1 + test/test_cli.py | 1 + test/test_jsonld.py | 1 + test/test_metadata.py | 1 + test/test_model.py | 1 + test/test_read.py | 1 + test/test_readwrite.py | 1 + test/test_test_metadata.py | 1 + test/test_utils.py | 1 + test/test_workflow_ro_crate.py | 1 + test/test_write.py | 1 + test/test_wrroc.py | 1 + tools/add_boilerplate.py | 1 + 43 files changed, 84 insertions(+) diff --git a/README.md b/README.md index ed67a13b..50488e96 100644 --- a/README.md +++ b/README.md @@ -451,6 +451,7 @@ Options: * Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH * Copyright 2024 Data Centre, SciLifeLab, SE * Copyright 2024 National Institute of Informatics (NII), JP +* Copyright 2025 Senckenberg Society for Nature Research (SGN), DE Licensed under the Apache License, version 2.0 , diff --git a/examples/fastapi/main.py b/examples/fastapi/main.py index f8e99be0..a5c5ea57 100644 --- a/examples/fastapi/main.py +++ b/examples/fastapi/main.py @@ -1,3 +1,24 @@ +# Copyright 2019-2024 The University of Manchester, UK +# Copyright 2020-2024 Vlaams Instituut voor 
Biotechnologie (VIB), BE +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), ES +# Copyright 2020-2024 Center for Advanced Studies, Research and Development in Sardinia (CRS4), IT +# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH +# Copyright 2024 Data Centre, SciLifeLab, SE +# Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Streaming RO-Crates from a web server diff --git a/examples/read_test_metadata.py b/examples/read_test_metadata.py index 722d0a16..88b77d52 100644 --- a/examples/read_test_metadata.py +++ b/examples/read_test_metadata.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/__init__.py b/rocrate/__init__.py index 33effd18..53905bd9 100644 --- a/rocrate/__init__.py +++ b/rocrate/__init__.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -53,6 +54,7 @@ Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH Copyright 2024 Data Centre, SciLifeLab, SE Copyright 2024 National Institute of Informatics (NII), JP +Copyright 2025 Senckenberg Society for Nature Research (SGN), DE """ __license__ = ("Apache License, version 2.0 " "") diff --git a/rocrate/cli.py b/rocrate/cli.py index a042dbd9..64693906 100644 --- a/rocrate/cli.py +++ b/rocrate/cli.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/memory_buffer.py b/rocrate/memory_buffer.py index 68fe53ed..9ed5b789 100644 --- a/rocrate/memory_buffer.py +++ b/rocrate/memory_buffer.py @@ -1,3 +1,24 @@ +# Copyright 2019-2024 The University of Manchester, UK +# Copyright 2020-2024 Vlaams Instituut voor Biotechnologie (VIB), BE +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), ES +# Copyright 2020-2024 Center for Advanced Studies, Research and Development in Sardinia (CRS4), IT +# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH +# Copyright 2024 Data Centre, SciLifeLab, SE +# Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from io import RawIOBase diff --git a/rocrate/metadata.py b/rocrate/metadata.py index 29cdb218..7a5ea720 100644 --- a/rocrate/metadata.py +++ b/rocrate/metadata.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/__init__.py b/rocrate/model/__init__.py index 5ae3c862..2c482d59 100644 --- a/rocrate/model/__init__.py +++ b/rocrate/model/__init__.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/computationalworkflow.py b/rocrate/model/computationalworkflow.py index 1ca93773..6754583e 100644 --- a/rocrate/model/computationalworkflow.py +++ b/rocrate/model/computationalworkflow.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/computerlanguage.py b/rocrate/model/computerlanguage.py index 88546f97..511c53ce 100644 --- a/rocrate/model/computerlanguage.py +++ b/rocrate/model/computerlanguage.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/contextentity.py b/rocrate/model/contextentity.py index 9ccc2fb9..1bd94db8 100644 --- a/rocrate/model/contextentity.py +++ b/rocrate/model/contextentity.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/creativework.py b/rocrate/model/creativework.py index 32e4341d..1b1b2498 100644 --- a/rocrate/model/creativework.py +++ b/rocrate/model/creativework.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/data_entity.py b/rocrate/model/data_entity.py index b1c0644b..2c44e5ab 100644 --- a/rocrate/model/data_entity.py +++ b/rocrate/model/data_entity.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/dataset.py b/rocrate/model/dataset.py index b2aaaf91..77f4f93c 100644 --- a/rocrate/model/dataset.py +++ b/rocrate/model/dataset.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/entity.py b/rocrate/model/entity.py index 0e504f06..d0cbcd62 100644 --- a/rocrate/model/entity.py +++ b/rocrate/model/entity.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/file.py b/rocrate/model/file.py index d612bd71..d0363b76 100644 --- a/rocrate/model/file.py +++ b/rocrate/model/file.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/file_or_dir.py b/rocrate/model/file_or_dir.py index 4193b53e..a66e3c50 100644 --- a/rocrate/model/file_or_dir.py +++ b/rocrate/model/file_or_dir.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/metadata.py b/rocrate/model/metadata.py index 2703e8f0..5d432a87 100644 --- a/rocrate/model/metadata.py +++ b/rocrate/model/metadata.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/person.py b/rocrate/model/person.py index c6b6e6df..cfe7ec1f 100644 --- a/rocrate/model/person.py +++ b/rocrate/model/person.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/preview.py b/rocrate/model/preview.py index 9f4d6e97..3f5e08b8 100644 --- a/rocrate/model/preview.py +++ b/rocrate/model/preview.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/root_dataset.py b/rocrate/model/root_dataset.py index 2d52b2ee..ebef3814 100644 --- a/rocrate/model/root_dataset.py +++ b/rocrate/model/root_dataset.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/softwareapplication.py b/rocrate/model/softwareapplication.py index 874294b4..2cd3c530 100644 --- a/rocrate/model/softwareapplication.py +++ b/rocrate/model/softwareapplication.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/testdefinition.py b/rocrate/model/testdefinition.py index 3de24afd..a17677c1 100644 --- a/rocrate/model/testdefinition.py +++ b/rocrate/model/testdefinition.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/testinstance.py b/rocrate/model/testinstance.py index 94d9f0ef..956f4a33 100644 --- a/rocrate/model/testinstance.py +++ b/rocrate/model/testinstance.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/testservice.py b/rocrate/model/testservice.py index c5f44c12..c2b65988 100644 --- a/rocrate/model/testservice.py +++ b/rocrate/model/testservice.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/testsuite.py b/rocrate/model/testsuite.py index b99b103c..ba0442e0 100644 --- a/rocrate/model/testsuite.py +++ b/rocrate/model/testsuite.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 7c71d4f4..6694c081 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/utils.py b/rocrate/utils.py index aa1aeab2..5f565187 100644 --- a/rocrate/utils.py +++ b/rocrate/utils.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/vocabs.py b/rocrate/vocabs.py index e492294f..902a682d 100644 --- a/rocrate/vocabs.py +++ b/rocrate/vocabs.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/setup.py b/setup.py index 66c6066e..bc4a0765 100755 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/conftest.py b/test/conftest.py index 3dfee400..b7488d51 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/test/test_cli.py b/test/test_cli.py index b4fbb94c..353e9aea 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_jsonld.py b/test/test_jsonld.py index 7c9759c9..44ee9492 100644 --- a/test/test_jsonld.py +++ b/test/test_jsonld.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_metadata.py b/test/test_metadata.py index e6ba0cea..ffacb5c1 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/test/test_model.py b/test/test_model.py index 165084fd..08400871 100644 --- a/test/test_model.py +++ b/test/test_model.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_read.py b/test/test_read.py index c3b7a608..1b60b876 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_readwrite.py b/test/test_readwrite.py index 0fab0d90..0c894049 100644 --- a/test/test_readwrite.py +++ b/test/test_readwrite.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/test/test_test_metadata.py b/test/test_test_metadata.py index 71c5e0ab..5645fe58 100644 --- a/test/test_test_metadata.py +++ b/test/test_test_metadata.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_utils.py b/test/test_utils.py index a73b4a1e..1476e9a5 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_workflow_ro_crate.py b/test/test_workflow_ro_crate.py index eb4f5471..edad0018 100644 --- a/test/test_workflow_ro_crate.py +++ b/test/test_workflow_ro_crate.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/test/test_write.py b/test/test_write.py index 9ed885ec..8c09debb 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_wrroc.py b/test/test_wrroc.py index 82405d07..66ab79aa 100644 --- a/test/test_wrroc.py +++ b/test/test_wrroc.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/add_boilerplate.py b/tools/add_boilerplate.py index 22ad428f..73ba0e58 100644 --- a/tools/add_boilerplate.py +++ b/tools/add_boilerplate.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.