[Refactoring] File description building/hash saving part. #55

Merged · 4 commits · Jul 31, 2024
28 changes: 24 additions & 4 deletions src/background_ingestor.py
@@ -13,7 +13,11 @@
build_background_ingestor_arg_parser,
build_scicat_background_ingester_config,
)
-from scicat_dataset import convert_to_type
+from scicat_dataset import (
+    build_single_data_file_desc,
+    convert_to_type,
+    save_and_build_single_hash_file_desc,
+)
from scicat_logging import build_logger
from scicat_metadata import collect_schemas, select_applicable_schema
from system_helpers import exit_at_exceptions
@@ -133,7 +137,6 @@ def create_scicat_dataset(dataset: str, config: dict, logger: logging.Logger) ->
return result


-def prepare_files_list(nexus_file, done_writing_message_file, config): ...
def prepare_scicat_origdatablock(files_list, config): ...
def create_scicat_origdatablock(
scicat_dataset_pid, nexus_file=None, done_writing_message_file=None
@@ -146,6 +149,7 @@ def main() -> None:
arg_namespace = arg_parser.parse_args()
config = build_scicat_background_ingester_config(arg_namespace)
ingestion_options = config.ingestion_options
file_handling_options = ingestion_options.file_handling_options
logger = build_logger(config)

# Log the configuration as dictionary so that it is easier to read from the logs
@@ -184,8 +188,24 @@ def main() -> None:
metadata_schema['variables'], h5file, config
)

-        # create files list with b2blake hash of all the files
-        _ = prepare_files_list(nexus_file_path, done_writing_message_file, config)
# Collect data-file descriptions
data_file_list = [
build_single_data_file_desc(nexus_file_path, file_handling_options),
build_single_data_file_desc(
done_writing_message_file, file_handling_options
),
# TODO: Add nexus structure file
]
# Create hash of all the files if needed
if file_handling_options.save_file_hash:
data_file_list += [
save_and_build_single_hash_file_desc(
data_file_dict, file_handling_options
)
for data_file_dict in data_file_list
]
        # Collect all data-file and hash-file descriptions
_ = [json.dumps(file_dict, indent=2) for file_dict in data_file_list]

# create and populate scicat dataset entry
scicat_dataset = prepare_scicat_dataset(metadata_schema, variables_values)
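Taken together, the new steps in `main()` form a small pipeline: describe each data file, then (optionally) write a hash file per entry and describe those too. A minimal sketch of that flow (`collect_file_descriptions` is a hypothetical helper written for illustration, not part of this PR; only the two imported functions and the `save_file_hash` option come from the diff):

```python
import pathlib

from scicat_dataset import (
    build_single_data_file_desc,
    save_and_build_single_hash_file_desc,
)


def collect_file_descriptions(paths: list[pathlib.Path], options) -> list[dict]:
    """Describe each data file, then append hash-file descriptions if enabled."""
    descriptions = [build_single_data_file_desc(path, options) for path in paths]
    if options.save_file_hash:
        # Building a hash-file description also writes the hash file to disk.
        descriptions += [
            save_and_build_single_hash_file_desc(desc, options)
            for desc in descriptions
        ]
    return descriptions
```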
98 changes: 97 additions & 1 deletion src/scicat_dataset.py
@@ -1,9 +1,11 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
import datetime
import pathlib
from types import MappingProxyType
from typing import Any

from scicat_configuration import FileHandlingOptions
from scicat_schemas import (
load_dataset_schema_template,
load_origdatablock_schema_template,
@@ -57,6 +59,7 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:

def build_dataset_description(
*,
dataset_pid_prefix: str,
nxs_dataset_pid: str,
dataset_name: str,
dataset_description: str,
@@ -78,6 +81,7 @@ def build_dataset_description(
access_groups: list[str],
) -> str:
return load_dataset_schema_template().render(
dataset_pid_prefix=dataset_pid_prefix,
nxs_dataset_pid=nxs_dataset_pid,
dataset_name=dataset_name,
dataset_description=dataset_description,
@@ -105,10 +109,10 @@ def build_single_datafile_description(
file_absolute_path: str,
file_size: int,
datetime_isoformat: str,
-    checksum: str,
uid: str,
gid: str,
perm: str,
checksum: str = "",
) -> str:
return load_single_datafile_template().render(
file_absolute_path=file_absolute_path,
@@ -123,14 +127,106 @@

def build_orig_datablock_description(
*,
dataset_pid_prefix: str,
nxs_dataset_pid: str,
dataset_size: int,
check_algorithm: str,
data_file_desc_list: list[str],
) -> str:
return load_origdatablock_schema_template().render(
dataset_pid_prefix=dataset_pid_prefix,
nxs_dataset_pid=nxs_dataset_pid,
dataset_size=dataset_size,
check_algorithm=check_algorithm,
data_file_desc_list=data_file_desc_list,
)


def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str:
"""Calculate the checksum of a file."""
import hashlib

    if algorithm_name != "blake2b":
        raise ValueError(
            f"Only the blake2b hash algorithm is supported for now. "
            f"Got: {algorithm_name}"
        )

chk = hashlib.new(algorithm_name, usedforsecurity=False)
buffer = memoryview(bytearray(128 * 1024))
with open(file_path, "rb", buffering=0) as file:
for n in iter(lambda: file.readinto(buffer), 0):
chk.update(buffer[:n])

return chk.hexdigest()


def build_single_data_file_desc(
file_path: pathlib.Path, config: FileHandlingOptions
) -> dict[str, Any]:
"""Build the description of a single data file."""
import datetime
import json

from scicat_schemas import load_single_datafile_template

single_file_template = load_single_datafile_template()

return json.loads(
single_file_template.render(
file_absolute_path=file_path.absolute(),
file_size=(file_stats := file_path.stat()).st_size,
datetime_isoformat=datetime.datetime.fromtimestamp(
file_stats.st_ctime, tz=datetime.UTC
).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
            checksum=_calculate_checksum(file_path, config.file_hash_algorithm),
uid=str(file_stats.st_uid),
gid=str(file_stats.st_gid),
perm=oct(file_stats.st_mode),
)
)


def _build_hash_file_path(
*,
original_file_path: str,
ingestor_files_directory: str,
hash_file_extension: str,
) -> pathlib.Path:
"""Build the path for the hash file."""
original_path = pathlib.Path(original_file_path)
dir_path = pathlib.Path(ingestor_files_directory)
file_name = ".".join([original_path.name, hash_file_extension])
return dir_path / pathlib.Path(file_name)


def save_and_build_single_hash_file_desc(
    original_file_description: dict, config: FileHandlingOptions
) -> dict:
"""Save the hash of the file and build the description."""
import datetime
import json

from scicat_schemas import load_single_datafile_template

single_file_template = load_single_datafile_template()
    file_hash: str = original_file_description["chk"]
hash_path = _build_hash_file_path(
        original_file_path=original_file_description["path"],
ingestor_files_directory=config.ingestor_files_directory,
hash_file_extension=config.hash_file_extension,
)
hash_path.write_text(file_hash)

return json.loads(
single_file_template.render(
file_absolute_path=hash_path.absolute(),
file_size=(file_stats := hash_path.stat()).st_size,
datetime_isoformat=datetime.datetime.fromtimestamp(
file_stats.st_ctime, tz=datetime.UTC
).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
uid=str(file_stats.st_uid),
gid=str(file_stats.st_gid),
perm=oct(file_stats.st_mode),
)
)
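A quick way to sanity-check `_calculate_checksum` is to compare its chunked `readinto()` loop against hashing the whole file in one shot. The temporary file below is invented for illustration; "blake2b" matches the algorithm name used in this PR's tests:

```python
import hashlib
import pathlib
import tempfile

from scicat_dataset import _calculate_checksum

# Throwaway file to hash; the contents are arbitrary.
with tempfile.NamedTemporaryFile(suffix=".nxs", delete=False) as tmp:
    tmp.write(b"example payload")
file_path = pathlib.Path(tmp.name)

# The chunked loop must agree with hashing the file all at once.
expected = hashlib.blake2b(file_path.read_bytes()).hexdigest()
assert _calculate_checksum(file_path, "blake2b") == expected
```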
9 changes: 9 additions & 0 deletions src/scicat_path_helpers.py
@@ -13,3 +13,12 @@ def select_target_directory(
return file_path.parent / pathlib.Path(fh_options.ingestor_files_directory)
else:
return pathlib.Path(fh_options.local_output_directory)


def compose_checksum_file_path(
fh_options: FileHandlingOptions, file_path: pathlib.Path
) -> pathlib.Path:
"""Compose the path for the checksum file."""
return pathlib.Path(fh_options.ingestor_files_directory) / pathlib.Path(
file_path.name + fh_options.hash_file_extension
)
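Note that `compose_checksum_file_path` appends `hash_file_extension` verbatim, while `_build_hash_file_path` above inserts the dot via `".".join`, so here the configured extension decides whether a dot appears. A small illustration with invented option values (a `SimpleNamespace` stands in for `FileHandlingOptions`):

```python
import pathlib
from types import SimpleNamespace

from scicat_path_helpers import compose_checksum_file_path

# Stand-in for FileHandlingOptions; only the two fields used here are set.
fh_options = SimpleNamespace(
    ingestor_files_directory="/ess/data/ingestor",
    hash_file_extension=".b2b",
)

print(compose_checksum_file_path(fh_options, pathlib.Path("/ess/raw/0002.nxs")))
# -> /ess/data/ingestor/0002.nxs.b2b
```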
2 changes: 1 addition & 1 deletion src/scicat_schemas/dataset.schema.json.jinja
@@ -1,5 +1,5 @@
{
"pid": "{{ nxs_dataset_pid }}",
"pid": "{{ dataset_pid }}",
"datasetName": "{{ dataset_name }}",
"description": "{{ dataset_description }}",
"principalInvestigator": "{{ principal_investigator }}",
2 changes: 1 addition & 1 deletion src/scicat_schemas/origdatablock.schema.json.jinja
@@ -1,5 +1,5 @@
{
"datasetId": "{{ nxs_dataset_pid }}",
"datasetId": "{{ dataset_pid }}",
"size": {{ dataset_size }},
"chkAlg": "{{ check_algorithm }}",
"dataFileList": [
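Both templates now read a single `dataset_pid`, while the Python side passes `dataset_pid_prefix` and `nxs_dataset_pid` separately, so presumably the two are joined somewhere not shown in this view. A hedged sketch of that composition (the `jinja2.Template` string and the join are assumptions; the values mirror this PR's tests):

```python
import jinja2

# Assumed composition of the prefixed PID; it matches the combined
# "prefix/uuid" values the tests used before this refactoring.
dataset_pid = "/".join(
    ["12.234.34567", "e3690b21-ee8c-40d6-9409-6b6fdca776d2"]
)

template = jinja2.Template('"pid": "{{ dataset_pid }}"')
print(template.render(dataset_pid=dataset_pid))
# "pid": "12.234.34567/e3690b21-ee8c-40d6-9409-6b6fdca776d2"
```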
4 changes: 2 additions & 2 deletions src/scicat_schemas/single_datafile.json.jinja
@@ -1,9 +1,9 @@
{
"path": "{{ file_absolute_path }}",
"size": {{ file_size }},
"time": "{{ datetime_isoformat }}",
"time": "{{ datetime_isoformat }}",{% if checksum %}
"chk": "{{ checksum }}",
"uid": "{{ uid }}",
{% endif %}"uid": "{{ uid }}",
"gid": "{{ gid }}",
"perm": "{{ perm }}"
}
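The `{% if checksum %}` guard keeps the rendered JSON valid whether or not a checksum is supplied, which the new no-checksum test below exercises. A standalone check with plain `jinja2`, abridged to just the affected lines (the template string here is a reconstruction, not the project file):

```python
import json

import jinja2

snippet = jinja2.Template(
    '{\n'
    '  "time": "{{ datetime_isoformat }}",{% if checksum %}\n'
    '  "chk": "{{ checksum }}",\n'
    '  {% endif %}"uid": "{{ uid }}"\n'
    '}'
)

with_chk = snippet.render(
    datetime_isoformat="2024-07-16T10:00:00.000Z",
    checksum="1234567890abcdef",
    uid="1004",
)
without_chk = snippet.render(
    datetime_isoformat="2024-07-16T10:00:00.000Z", uid="1004"
)

# Both variants must parse as JSON; "chk" appears only when a checksum is given.
assert "chk" in json.loads(with_chk)
assert "chk" not in json.loads(without_chk)
```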
24 changes: 21 additions & 3 deletions tests/test_scicat_schema.py
@@ -84,7 +84,8 @@ def test_dataset_schema_rendering() -> None:
from scicat_dataset import build_dataset_description

dataset_schema = build_dataset_description(
nxs_dataset_pid="12.234.34567/e3690b21-ee8c-40d6-9409-6b6fdca776d2",
dataset_pid_prefix="12.234.34567",
nxs_dataset_pid="e3690b21-ee8c-40d6-9409-6b6fdca776d2",
dataset_name="this is a dataset",
dataset_description="this is the description of the dataset",
principal_investigator="Somebodys Name",
@@ -147,14 +148,30 @@ def test_single_file_description_rendering() -> None:
"path": "/ess/data/coda/2024/616254/0002.nxs",
"size": 1231231,
"time": "2024-07-16T10:00:00.000Z",
"chk": "1234567890abcdef",
"uid": "1004",
"gid": "1005",
"perm": "33188"
}
"""


def test_single_file_description_rendering_no_checksum() -> None:
import json

from scicat_dataset import build_single_datafile_description

file_description = build_single_datafile_description(
file_absolute_path="/ess/data/coda/2024/616254/0002.nxs",
file_size=1231231,
datetime_isoformat="2024-07-16T10:00:00.000Z",
uid="1004",
gid="1005",
perm="33188",
)

assert json.loads(file_description) == json.loads(_example_file_description_2)


_example_file_description_3 = """
{
"path": "/ess/data/coda/2024/616254/0003.nxs",
@@ -195,7 +212,8 @@ def test_orig_datablock_rendering() -> None:
from scicat_dataset import build_orig_datablock_description

orig_datablock = build_orig_datablock_description(
nxs_dataset_pid="20.500.12269/53fd2786-3729-11ef-83e5-fa163e9aae0a",
dataset_pid_prefix="20.500.12269",
nxs_dataset_pid="53fd2786-3729-11ef-83e5-fa163e9aae0a",
dataset_size=446630741,
check_algorithm="blake2b",
data_file_desc_list=[