[Refactoring] File description building/hash saving part. #55

Merged · 4 commits · Jul 31, 2024
28 changes: 24 additions & 4 deletions src/background_ingestor.py
@@ -13,7 +13,11 @@
build_background_ingestor_arg_parser,
build_scicat_background_ingester_config,
)
-from scicat_dataset import convert_to_type
+from scicat_dataset import (
+    build_single_data_file_desc,
+    convert_to_type,
+    save_and_build_single_hash_file_desc,
+)
from scicat_logging import build_logger
from scicat_metadata import collect_schemas, select_applicable_schema
from system_helpers import exit_at_exceptions
@@ -133,7 +137,6 @@ def create_scicat_dataset(dataset: str, config: dict, logger: logging.Logger) ->
return result


-def prepare_files_list(nexus_file, done_writing_message_file, config): ...
def prepare_scicat_origdatablock(files_list, config): ...
def create_scicat_origdatablock(
scicat_dataset_pid, nexus_file=None, done_writing_message_file=None
@@ -146,6 +149,7 @@ def main() -> None:
arg_namespace = arg_parser.parse_args()
config = build_scicat_background_ingester_config(arg_namespace)
ingestion_options = config.ingestion_options
file_handling_options = ingestion_options.file_handling_options
logger = build_logger(config)

# Log the configuration as dictionary so that it is easier to read from the logs
@@ -184,8 +188,24 @@ def main() -> None:
metadata_schema['variables'], h5file, config
)

-        # create files list with b2blake hash of all the files
-        _ = prepare_files_list(nexus_file_path, done_writing_message_file, config)
# Collect data-file descriptions
data_file_list = [
build_single_data_file_desc(nexus_file_path, file_handling_options),
build_single_data_file_desc(
done_writing_message_file, file_handling_options
),
# TODO: Add nexus structure file
]
# Create hash of all the files if needed
if file_handling_options.save_file_hash:
data_file_list += [
save_and_build_single_hash_file_desc(
data_file_dict, file_handling_options
)
for data_file_dict in data_file_list
]
        # Collect all data-file and hash-file descriptions
_ = [json.dumps(file_dict, indent=2) for file_dict in data_file_list]

# create and populate scicat dataset entry
scicat_dataset = prepare_scicat_dataset(metadata_schema, variables_values)
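Taken together, the new steps in `main()` form a small pipeline: describe each data file, then (optionally) write a hash file per entry and describe those too. A minimal sketch of that flow (`collect_file_descriptions` is a hypothetical helper written for illustration, not part of this PR; only the two imported functions and the `save_file_hash` option come from the diff):

```python
import pathlib

from scicat_dataset import (
    build_single_data_file_desc,
    save_and_build_single_hash_file_desc,
)


def collect_file_descriptions(paths: list[pathlib.Path], options) -> list[dict]:
    """Describe each data file, then append hash-file descriptions if enabled."""
    descriptions = [build_single_data_file_desc(path, options) for path in paths]
    if options.save_file_hash:
        # Building a hash-file description also writes the hash file to disk.
        descriptions += [
            save_and_build_single_hash_file_desc(desc, options)
            for desc in descriptions
        ]
    return descriptions
```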
98 changes: 97 additions & 1 deletion src/scicat_dataset.py
@@ -1,9 +1,11 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
import datetime
import pathlib
from types import MappingProxyType
from typing import Any

from scicat_configuration import FileHandlingOptions
from scicat_schemas import (
load_dataset_schema_template,
load_origdatablock_schema_template,
@@ -57,6 +59,7 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:

def build_dataset_description(
*,
dataset_pid_prefix: str,
nxs_dataset_pid: str,
dataset_name: str,
dataset_description: str,
@@ -78,6 +81,7 @@ def build_dataset_description(
access_groups: list[str],
) -> str:
return load_dataset_schema_template().render(
dataset_pid_prefix=dataset_pid_prefix,
nxs_dataset_pid=nxs_dataset_pid,
dataset_name=dataset_name,
dataset_description=dataset_description,
@@ -105,10 +109,10 @@ def build_single_datafile_description(
file_absolute_path: str,
file_size: int,
datetime_isoformat: str,
-    checksum: str,
uid: str,
gid: str,
perm: str,
checksum: str = "",
) -> str:
return load_single_datafile_template().render(
file_absolute_path=file_absolute_path,
@@ -123,14 +127,106 @@

def build_orig_datablock_description(
*,
dataset_pid_prefix: str,
nxs_dataset_pid: str,
dataset_size: int,
check_algorithm: str,
data_file_desc_list: list[str],
) -> str:
return load_origdatablock_schema_template().render(
dataset_pid_prefix=dataset_pid_prefix,
nxs_dataset_pid=nxs_dataset_pid,
dataset_size=dataset_size,
check_algorithm=check_algorithm,
data_file_desc_list=data_file_desc_list,
)


def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str:
"""Calculate the checksum of a file."""
import hashlib

    if algorithm_name != "blake2b":
        raise ValueError(
            f"Only the blake2b hash algorithm is supported for now. "
            f"Got: {algorithm_name}"
        )

chk = hashlib.new(algorithm_name, usedforsecurity=False)
buffer = memoryview(bytearray(128 * 1024))
with open(file_path, "rb", buffering=0) as file:
for n in iter(lambda: file.readinto(buffer), 0):
chk.update(buffer[:n])

return chk.hexdigest()


def build_single_data_file_desc(
file_path: pathlib.Path, config: FileHandlingOptions
) -> dict[str, Any]:
"""Build the description of a single data file."""
import datetime
import json

from scicat_schemas import load_single_datafile_template

single_file_template = load_single_datafile_template()

return json.loads(
single_file_template.render(
file_absolute_path=file_path.absolute(),
file_size=(file_stats := file_path.stat()).st_size,
datetime_isoformat=datetime.datetime.fromtimestamp(
file_stats.st_ctime, tz=datetime.UTC
).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
            checksum=_calculate_checksum(file_path, config.file_hash_algorithm),
uid=str(file_stats.st_uid),
gid=str(file_stats.st_gid),
perm=oct(file_stats.st_mode),
)
)


def _build_hash_file_path(
*,
original_file_path: str,
ingestor_files_directory: str,
hash_file_extension: str,
) -> pathlib.Path:
"""Build the path for the hash file."""
original_path = pathlib.Path(original_file_path)
dir_path = pathlib.Path(ingestor_files_directory)
file_name = ".".join([original_path.name, hash_file_extension])
return dir_path / pathlib.Path(file_name)


def save_and_build_single_hash_file_desc(
    original_file_description: dict, config: FileHandlingOptions
) -> dict:
"""Save the hash of the file and build the description."""
import datetime
import json

from scicat_schemas import load_single_datafile_template

single_file_template = load_single_datafile_template()
    file_hash: str = original_file_description["chk"]
hash_path = _build_hash_file_path(
        original_file_path=original_file_description["path"],
ingestor_files_directory=config.ingestor_files_directory,
hash_file_extension=config.hash_file_extension,
)
hash_path.write_text(file_hash)

return json.loads(
single_file_template.render(
file_absolute_path=hash_path.absolute(),
file_size=(file_stats := hash_path.stat()).st_size,
datetime_isoformat=datetime.datetime.fromtimestamp(
file_stats.st_ctime, tz=datetime.UTC
).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
uid=str(file_stats.st_uid),
gid=str(file_stats.st_gid),
perm=oct(file_stats.st_mode),
)
)
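A quick way to sanity-check `_calculate_checksum` is to compare its chunked `readinto()` loop against hashing the whole file in one shot. The temporary file below is invented for illustration; "blake2b" matches the algorithm name used in this PR's tests:

```python
import hashlib
import pathlib
import tempfile

from scicat_dataset import _calculate_checksum

# Throwaway file to hash; the contents are arbitrary.
with tempfile.NamedTemporaryFile(suffix=".nxs", delete=False) as tmp:
    tmp.write(b"example payload")
file_path = pathlib.Path(tmp.name)

# The chunked loop must agree with hashing the file all at once.
expected = hashlib.blake2b(file_path.read_bytes()).hexdigest()
assert _calculate_checksum(file_path, "blake2b") == expected
```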
9 changes: 9 additions & 0 deletions src/scicat_path_helpers.py
@@ -13,3 +13,12 @@ def select_target_directory(
return file_path.parent / pathlib.Path(fh_options.ingestor_files_directory)
else:
return pathlib.Path(fh_options.local_output_directory)


def compose_checksum_file_path(
fh_options: FileHandlingOptions, file_path: pathlib.Path
) -> pathlib.Path:
"""Compose the path for the checksum file."""
return pathlib.Path(fh_options.ingestor_files_directory) / pathlib.Path(
file_path.name + fh_options.hash_file_extension
)
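Note that `compose_checksum_file_path` appends `hash_file_extension` verbatim, while `_build_hash_file_path` above inserts the dot via `".".join`, so here the configured extension decides whether a dot appears. A small illustration with invented option values (a `SimpleNamespace` stands in for `FileHandlingOptions`):

```python
import pathlib
from types import SimpleNamespace

from scicat_path_helpers import compose_checksum_file_path

# Stand-in for FileHandlingOptions; only the two fields used here are set.
fh_options = SimpleNamespace(
    ingestor_files_directory="/ess/data/ingestor",
    hash_file_extension=".b2b",
)

print(compose_checksum_file_path(fh_options, pathlib.Path("/ess/raw/0002.nxs")))
# -> /ess/data/ingestor/0002.nxs.b2b
```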
2 changes: 1 addition & 1 deletion src/scicat_schemas/dataset.schema.json.jinja
@@ -1,5 +1,5 @@
{
"pid": "{{ nxs_dataset_pid }}",
"pid": "{{ dataset_pid }}",
"datasetName": "{{ dataset_name }}",
"description": "{{ dataset_description }}",
"principalInvestigator": "{{ principal_investigator }}",
2 changes: 1 addition & 1 deletion src/scicat_schemas/origdatablock.schema.json.jinja
@@ -1,5 +1,5 @@
{
"datasetId": "{{ nxs_dataset_pid }}",
"datasetId": "{{ dataset_pid }}",
"size": {{ dataset_size }},
"chkAlg": "{{ check_algorithm }}",
"dataFileList": [
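Both templates now read a single `dataset_pid`, while the Python side passes `dataset_pid_prefix` and `nxs_dataset_pid` separately, so presumably the two are joined somewhere not shown in this view. A hedged sketch of that composition (the `jinja2.Template` string and the join are assumptions; the values mirror this PR's tests):

```python
import jinja2

# Assumed composition of the prefixed PID; it matches the combined
# "prefix/uuid" values the tests used before this refactoring.
dataset_pid = "/".join(
    ["12.234.34567", "e3690b21-ee8c-40d6-9409-6b6fdca776d2"]
)

template = jinja2.Template('"pid": "{{ dataset_pid }}"')
print(template.render(dataset_pid=dataset_pid))
# "pid": "12.234.34567/e3690b21-ee8c-40d6-9409-6b6fdca776d2"
```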
4 changes: 2 additions & 2 deletions src/scicat_schemas/single_datafile.json.jinja
@@ -1,9 +1,9 @@
{
"path": "{{ file_absolute_path }}",
"size": {{ file_size }},
"time": "{{ datetime_isoformat }}",
"time": "{{ datetime_isoformat }}",{% if checksum %}
"chk": "{{ checksum }}",
"uid": "{{ uid }}",
{% endif %}"uid": "{{ uid }}",
"gid": "{{ gid }}",
"perm": "{{ perm }}"
}
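The `{% if checksum %}` guard keeps the rendered JSON valid whether or not a checksum is supplied, which the new no-checksum test below exercises. A standalone check with plain `jinja2`, abridged to just the affected lines (the template string here is a reconstruction, not the project file):

```python
import json

import jinja2

snippet = jinja2.Template(
    '{\n'
    '  "time": "{{ datetime_isoformat }}",{% if checksum %}\n'
    '  "chk": "{{ checksum }}",\n'
    '  {% endif %}"uid": "{{ uid }}"\n'
    '}'
)

with_chk = snippet.render(
    datetime_isoformat="2024-07-16T10:00:00.000Z",
    checksum="1234567890abcdef",
    uid="1004",
)
without_chk = snippet.render(
    datetime_isoformat="2024-07-16T10:00:00.000Z", uid="1004"
)

# Both variants must parse as JSON; "chk" appears only when a checksum is given.
assert "chk" in json.loads(with_chk)
assert "chk" not in json.loads(without_chk)
```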
24 changes: 21 additions & 3 deletions tests/test_scicat_schema.py
@@ -84,7 +84,8 @@ def test_dataset_schema_rendering() -> None:
from scicat_dataset import build_dataset_description

dataset_schema = build_dataset_description(
nxs_dataset_pid="12.234.34567/e3690b21-ee8c-40d6-9409-6b6fdca776d2",
dataset_pid_prefix="12.234.34567",
nxs_dataset_pid="e3690b21-ee8c-40d6-9409-6b6fdca776d2",
dataset_name="this is a dataset",
dataset_description="this is the description of the dataset",
principal_investigator="Somebodys Name",
@@ -147,14 +148,30 @@ def test_single_file_description_rendering() -> None:
"path": "/ess/data/coda/2024/616254/0002.nxs",
"size": 1231231,
"time": "2024-07-16T10:00:00.000Z",
"chk": "1234567890abcdef",
"uid": "1004",
"gid": "1005",
"perm": "33188"
}
"""


def test_single_file_description_rendering_no_checksum() -> None:
import json

from scicat_dataset import build_single_datafile_description

file_description = build_single_datafile_description(
file_absolute_path="/ess/data/coda/2024/616254/0002.nxs",
file_size=1231231,
datetime_isoformat="2024-07-16T10:00:00.000Z",
uid="1004",
gid="1005",
perm="33188",
)

assert json.loads(file_description) == json.loads(_example_file_description_2)


_example_file_description_3 = """
{
"path": "/ess/data/coda/2024/616254/0003.nxs",
@@ -195,7 +212,8 @@ def test_orig_datablock_rendering() -> None:
from scicat_dataset import build_orig_datablock_description

orig_datablock = build_orig_datablock_description(
nxs_dataset_pid="20.500.12269/53fd2786-3729-11ef-83e5-fa163e9aae0a",
dataset_pid_prefix="20.500.12269",
nxs_dataset_pid="53fd2786-3729-11ef-83e5-fa163e9aae0a",
dataset_size=446630741,
check_algorithm="blake2b",
data_file_desc_list=[