diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 25f2253d..e6cb0dd2 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -18,7 +18,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 100
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'
@@ -35,7 +35,7 @@ jobs:
 
       - name: 'Install requirements (standard or constraints ${{ matrix.python-version }})'
         run: |
-          pip install --upgrade pip wheel
+          pip install --upgrade pip wheel setuptools
           if [ ${{ steps.changed-requirements-txt.outputs.any_changed }} != 'true' ] && [ -f constraints-${{ matrix.python-version }}.txt ] ; then
            pip install -r requirements.txt -c constraints-${{ matrix.python-version }}.txt
           else
@@ -73,8 +73,9 @@ jobs:
 
       - name: Print licences report
         if: ${{ always() }}
         run: echo "${{ steps.license_check_report.outputs.report }}"
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
+          name: pre-commit-${{ matrix.python-version }}
           retention-days: 2
           path: constraints-${{ matrix.python-version }}.txt
@@ -88,7 +89,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 100
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'
@@ -105,7 +106,7 @@ jobs:
 
       - name: 'Install requirements (standard or constraints ${{ matrix.python-version }})'
         run: |
-          pip install --upgrade pip wheel
+          pip install --upgrade pip wheel setuptools
           if [ ${{ steps.changed-requirements-txt.outputs.any_changed }} != 'true' ] && [ -f constraints-${{ matrix.python-version }}.txt ] ; then
            pip install -r requirements.txt -c constraints-${{ matrix.python-version }}.txt
           else
@@ -143,8 +144,9 @@ jobs:
 
       - name: Print licences report
         if: ${{ always() }}
         run: echo "${{ steps.license_check_report.outputs.report }}"
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
+          name: pre-commit-${{ matrix.python-version }}
           retention-days: 2
           path: constraints-${{ matrix.python-version }}.txt
@@ -157,24 +159,46 @@ jobs:
       - pre-commit
       - pre-commit-22_04
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/download-artifact@v3
+      - name: Get analysis timestamp
+        id: timestamp
+        run: echo "timestamp=$(date -Is)" >> "$GITHUB_OUTPUT"
+      - uses: actions/checkout@v4
+      - uses: actions/download-artifact@v4
+        id: download
         with:
+          pattern: pre-commit-*
+          merge-multiple: true
           path: changes-dir
       - name: Move artifacts to their right place
+        id: move
         run: |
-          cp -dpr changes-dir/artifact/* .
-          rm -r changes-dir/artifact
+          skip=true
+          if [ -d "${{steps.download.outputs.download-path}}" ] ; then
+            for con in "${{steps.download.outputs.download-path}}"/constraints-*.txt ; do
+              case "$con" in
+                */constraints-\*.txt)
+                  break
+                  ;;
+                *)
+                  cp -p "$con" .
+                  skip=false
+                  ;;
+              esac
+            done
+          fi
+          echo "skip=$skip" >> "$GITHUB_OUTPUT"
       - name: Create Pull Request
         id: cpr
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v7
+        if: steps.move.outputs.skip == 'false'
         with:
-          title: Updated constraints (triggered by ${{ github.sha }})
+          title: Updated constraints (triggered on ${{ steps.timestamp.outputs.timestamp }} by ${{ github.sha }})
           branch: create-pull-request/patch-constraints
+          add-paths: constraints-*.txt
           delete-branch: true
           commit-message: "[create-pull-request] Automatically commit updated contents (constraints)"
       - name: Check outputs
         if: ${{ steps.cpr.outputs.pull-request-number }}
         run: |
-          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
-          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
+          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" >> "$GITHUB_STEP_SUMMARY"
+          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" >> "$GITHUB_STEP_SUMMARY"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 84d02686..851a4f9b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -107,6 +107,10 @@ repos:
   - repo: https://github.com/ambv/black.git
     rev: 23.3.0
     hooks:
+      - id: black
+        name: black_apply
+        exclude: "^[^/]*env/|development-[^/]*/|docs/"
+        stages: [manual]
       - id: black
         exclude: "^[^/]*env/|development-[^/]*/|docs/"
         args: [--diff, --check]
diff --git a/requirements.txt b/requirements.txt
index a7f58bca..f2f31fdf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,7 @@ wiktionary-term-fetcher >= 0.1.1
 funny-passphrase >= 0.2.3
 pyxdg
 groovy-parser == 0.1.1
-data-url
+data-url >= 1.1.1
 pgzip
 defusedxml
 # This is needed for exception groups
diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py
index 3702cc5c..27e2d142 100644
--- a/tests/fetchers/test_git.py
+++ b/tests/fetchers/test_git.py
@@ -29,6 +29,8 @@
 if TYPE_CHECKING:
     from typing import (
         Optional,
+        Type,
+        Union,
     )
 
     from wfexs_backend.common import (
@@ -38,14 +40,24 @@
         URIType,
     )
 
+from wfexs_backend.scheme_catalog import (
+    SchemeCatalog,
+)
+
 from wfexs_backend.fetchers import (
     RemoteRepo,
+    RepoGuessException,
     RepoGuessFlavor,
     RepoType,
 )
+
+from wfexs_backend.fetchers.http import HTTPFetcher
+
 from wfexs_backend.fetchers.git import GitFetcher
 
-WfExS_basedir = Path(__file__).parent.parent
+import wfexs_backend
+
+WfExS_basedir = Path(wfexs_backend.__file__).parent.parent
 WfExS_basedir_file_uri = WfExS_basedir.as_uri()
 WfExS_git_basedir = WfExS_basedir / ".git"
 WfExS_git_basedir_file_uri = WfExS_git_basedir.as_uri()
@@ -54,7 +66,7 @@
 logger.setLevel(logging.INFO)
 
 GIT_TESTBED = pytest.mark.parametrize(
-    ["url", "remote_repo", "repo_pid"],
+    ["url", "remote_repo_or_exception_class", "repo_pid"],
     [
         (
             "https://github.com/inab/WfExS-backend.git",
@@ -116,12 +128,9 @@
             "git+ssh://git@github.com:inab/WfExS-backend.git@main",
         ),
         (
+            # This tag does not exist!
             "ssh://git@github.com:inab/WfExS-backend.git@0.1.2",
-            RemoteRepo(
-                repo_url=cast("RepoURL", "ssh://git@github.com/inab/WfExS-backend.git"),
-                repo_type=RepoType.Git,
-                tag=cast("RepoTag", "0.1.2"),
-            ),
+            RepoGuessException,
             "git+ssh://git@github.com:inab/WfExS-backend.git@0.1.2",
         ),
         (
@@ -153,12 +162,9 @@
             "git+" + WfExS_git_basedir_file_uri,
         ),
         (
+            # This tag does not exist!
WfExS_git_basedir_file_uri + "@0.1.2", - RemoteRepo( - repo_url=cast("RepoURL", WfExS_git_basedir_file_uri), - repo_type=RepoType.Git, - tag=cast("RepoTag", "0.1.2"), - ), + RepoGuessException, "git+" + WfExS_git_basedir_file_uri + "@0.1.2", ), ( @@ -233,28 +239,48 @@ @GIT_TESTBED def test_guess_git_repo_params( - url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]" + url: "str", + remote_repo_or_exception_class: "Optional[Union[RemoteRepo, Type[Exception]]]", + repo_pid: "Optional[str]", ) -> "None": - output = GitFetcher.GuessRepoParams(cast("URIType", url), logger=logger) + if ( + isinstance(remote_repo_or_exception_class, RemoteRepo) + or remote_repo_or_exception_class is None + ): + output = GitFetcher.GuessRepoParams(cast("URIType", url), logger=logger) + + # When no tag is given, ignore what it was discovered + if output is not None and remote_repo_or_exception_class is not None: + if remote_repo_or_exception_class.tag is None: + output = output._replace(tag=None) + # For now, patch this + if remote_repo_or_exception_class.checkout is not None: + output = output._replace( + checkout=remote_repo_or_exception_class.checkout + ) + assert output == remote_repo_or_exception_class - # When no tag is given, ignore what it was discovered - if output is not None and remote_repo is not None: - if remote_repo.tag is None: - output = output._replace(tag=None) - # For now, patch this - if remote_repo.checkout is not None: - output = output._replace(checkout=remote_repo.checkout) - assert output == remote_repo + else: + with pytest.raises(remote_repo_or_exception_class): + output = GitFetcher.GuessRepoParams(cast("URIType", url), logger=logger) @GIT_TESTBED def test_build_git_pid_from_repo( - url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]" + url: "str", + remote_repo_or_exception_class: "Optional[Union[RemoteRepo, Type[Exception]]]", + repo_pid: "Optional[str]", ) -> "None": - if remote_repo is None: + if remote_repo_or_exception_class is None or not isinstance( + remote_repo_or_exception_class, RemoteRepo + ): pytest.skip("Skipped test because no remote repo was provided") else: - fetcher = GitFetcher({}) - output = fetcher.build_pid_from_repo(remote_repo) + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), + ) + + fetcher = GitFetcher(scheme_catalog, progs={}) + output = fetcher.build_pid_from_repo(remote_repo_or_exception_class) assert output == repo_pid diff --git a/tests/fetchers/test_swh.py b/tests/fetchers/test_swh.py index 0bc34a90..c170a5aa 100644 --- a/tests/fetchers/test_swh.py +++ b/tests/fetchers/test_swh.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -38,17 +38,19 @@ URIType, ) +from wfexs_backend.scheme_catalog import ( + SchemeCatalog, +) + from wfexs_backend.fetchers import ( RemoteRepo, RepoGuessFlavor, RepoType, ) -from wfexs_backend.fetchers.swh import SoftwareHeritageFetcher -WfExS_basedir = Path(__file__).parent.parent -WfExS_basedir_file_uri = WfExS_basedir.as_uri() -WfExS_git_basedir = WfExS_basedir / ".git" -WfExS_git_basedir_file_uri = WfExS_git_basedir.as_uri() +from wfexs_backend.fetchers.http import HTTPFetcher + +from wfexs_backend.fetchers.swh import SoftwareHeritageFetcher logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -139,7 +141,11 @@ def test_build_swh_pid_from_repo( if remote_repo is None: pytest.skip("Skipped test because no remote repo was provided") else: - fetcher = SoftwareHeritageFetcher({}) + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), + ) + + fetcher = SoftwareHeritageFetcher(scheme_catalog, progs={}) output = fetcher.build_pid_from_repo(remote_repo) assert output == repo_pid diff --git a/tests/fetchers/test_trs.py b/tests/fetchers/test_trs.py new file mode 100644 index 00000000..019eb0b1 --- /dev/null +++ b/tests/fetchers/test_trs.py @@ -0,0 +1,386 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import logging + +import pathlib + +from typing import ( + cast, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from typing import ( + Optional, + ) + + from wfexs_backend.common import ( + RelPath, + RepoTag, + RepoURL, + TRS_Workflow_Descriptor, + URIType, + ) + + from wfexs_backend.workflow import ( + WFVersionId, + WorkflowId, + ) + +from wfexs_backend.scheme_catalog import ( + SchemeCatalog, +) + +from wfexs_backend.fetchers import ( + RemoteRepo, + RepoGuessFlavor, + RepoType, +) + +from wfexs_backend.fetchers.http import HTTPFetcher + +from wfexs_backend.fetchers.trs_files import GA4GHTRSFetcher + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +TRS_PARAMS_TESTBED = pytest.mark.parametrize( + [ + "trs_endpoint", + "workflow_id", + "version_id", + "url", + "remote_repo", + "repo_pid", + "upstream_repo", + ], + [ + ( + "https://dockstore.org/api/ga4gh/trs/v2/", + cast( + "WorkflowId", + "#workflow/github.com/sevenbridges-openworkflows/Broad-Best-Practice-Somatic-CNV-Workflows/GATK-Somatic-CNV-Panel-Workflow", + ), + cast("Optional[WFVersionId]", "master"), + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2Fsevenbridges-openworkflows%2FBroad-Best-Practice-Somatic-CNV-Workflows%2FGATK-Somatic-CNV-Panel-Workflow/versions/master", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2Fsevenbridges-openworkflows%2FBroad-Best-Practice-Somatic-CNV-Workflows%2FGATK-Somatic-CNV-Panel-Workflow/versions/master", + ), + tag=cast("RepoTag", "master"), + repo_type=RepoType.TRS, + ), + "trs://dockstore.org/api/%23workflow%2Fgithub.com%2Fsevenbridges-openworkflows%2FBroad-Best-Practice-Somatic-CNV-Workflows%2FGATK-Somatic-CNV-Panel-Workflow/master", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://raw.githubusercontent.com/sevenbridges-openworkflows/Broad-Best-Practice-Somatic-CNV-Workflows/master/BroadCNVPanelWorkflow/gatk-cnv-panel-workflow_decomposed.cwl", + ), + ), + ), + ( + "https://dockstore.org/api/ga4gh/trs/v2/", + cast( + "WorkflowId", "#workflow/github.com/NCI-GDC/gdc-dnaseq-cwl/GDC_DNASeq" + ), + cast("Optional[WFVersionId]", "master"), + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/versions/master", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/versions/master", + ), + tag=cast("RepoTag", "master"), + repo_type=RepoType.TRS, + ), + "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/master", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://raw.githubusercontent.com/NCI-GDC/gdc-dnaseq-cwl/master/workflows/dnaseq/transform.cwl", + ), + ), + ), + ( + "https://dockstore.org/api/ga4gh/trs/v2/", + cast( + "WorkflowId", "#workflow/github.com/NCI-GDC/gdc-dnaseq-cwl/GDC_DNASeq" + ), + None, + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/versions/release", + ), + tag=cast("RepoTag", 
"release"), + repo_type=RepoType.TRS, + ), + "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/release", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://raw.githubusercontent.com/NCI-GDC/gdc-dnaseq-cwl/release/workflows/dnaseq/transform.cwl", + ), + ), + ), + ( + "https://workflowhub.eu/ga4gh/trs/v2/tools/", + cast("WorkflowId", 107), + cast("Optional[WFVersionId]", 1), + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://workflowhub.eu/ga4gh/trs/v2/tools/107/versions/1", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/107/versions/1", + ), + tag=cast("RepoTag", "1"), + repo_type=RepoType.TRS, + ), + "trs://workflowhub.eu/107/1", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/107/versions/1/CWL/files?format=zip", + ), + repo_type=RepoType.Raw, + ), + ), + ( + "https://workflowhub.eu/ga4gh/trs/v2/tools/", + cast("WorkflowId", 106), + cast("Optional[WFVersionId]", 3), + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://workflowhub.eu/ga4gh/trs/v2/tools/106/versions/3", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/106/versions/3", + ), + tag=cast("RepoTag", "3"), + repo_type=RepoType.TRS, + ), + "trs://workflowhub.eu/106/3", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/106/versions/3/NFL/files?format=zip", + ), + repo_type=RepoType.Raw, + ), + ), + ( + "https://workflowhub.eu/ga4gh/trs/v2/", + cast("WorkflowId", 119), + cast("Optional[WFVersionId]", 1), + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://workflowhub.eu/ga4gh/trs/v2/tools/119/versions/1", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/119/versions/1", + ), + tag=cast("RepoTag", "1"), + repo_type=RepoType.TRS, + ), + "trs://workflowhub.eu/119/1", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/119/versions/1/NFL/files?format=zip", + ), + repo_type=RepoType.Raw, + ), + ), + ( + "https://workflowhub.eu/ga4gh/trs/v2/tools/", + cast("WorkflowId", 244), + cast("Optional[WFVersionId]", 4), + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://workflowhub.eu/ga4gh/trs/v2/tools/244/versions/4", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/244/versions/4", + ), + tag=cast("RepoTag", "4"), + repo_type=RepoType.TRS, + ), + "trs://workflowhub.eu/244/4", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/244/versions/4/NFL/files?format=zip", + ), + repo_type=RepoType.Raw, + ), + ), + ( + "https://ddbj.github.io/workflow-registry/", + cast("WorkflowId", "0d2ae4c2-fe4c-48f7-811a-ac277776533e"), + cast("Optional[WFVersionId]", "1.0.0"), + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://ddbj.github.io/workflow-registry/tools/0d2ae4c2-fe4c-48f7-811a-ac277776533e/versions/1.0.0", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://ddbj.github.io/workflow-registry/tools/0d2ae4c2-fe4c-48f7-811a-ac277776533e/versions/1.0.0", + ), + tag=cast("RepoTag", "1.0.0"), + repo_type=RepoType.TRS, + ), + "trs://ddbj.github.io/workflow-registry/0d2ae4c2-fe4c-48f7-811a-ac277776533e/1.0.0", + RemoteRepo( + repo_url=cast( + "RepoURL", + 
"https://zenodo.org/api/files/2422dda0-1bd9-4109-aa44-53d55fd934de/download-sra.cwl", + ), + ), + ), + ], +) + + +@TRS_PARAMS_TESTBED +def test_guess_trs_repo_params( + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + url: "str", + remote_repo: "Optional[RemoteRepo]", + repo_pid: "Optional[str]", + upstream_repo: "Optional[RemoteRepo]", +) -> "None": + output = GA4GHTRSFetcher.GuessRepoParams(cast("URIType", url), logger=logger) + + # When no web url is given, ignore what it was discovered + if output is not None and remote_repo is not None: + if remote_repo.web_url is None: + output = output._replace(web_url=None) + # For now, patch this + if remote_repo.checkout is None: + output = output._replace(checkout=None) + assert output == remote_repo + + +@TRS_PARAMS_TESTBED +def test_build_trs_internal_url_from_repo( + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + url: "str", + remote_repo: "Optional[RemoteRepo]", + repo_pid: "Optional[str]", + upstream_repo: "Optional[RemoteRepo]", +) -> "None": + output = GA4GHTRSFetcher.BuildRepoPIDFromTRSParams( + trs_endpoint, + workflow_id, + version_id, + ) + + assert output == url + + +@TRS_PARAMS_TESTBED +def test_build_trs_pid_from_repo( + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + url: "str", + remote_repo: "Optional[RemoteRepo]", + repo_pid: "Optional[str]", + upstream_repo: "Optional[RemoteRepo]", +) -> "None": + if remote_repo is None: + pytest.skip("Skipped test because no remote repo was provided") + else: + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), + ) + + fetcher = GA4GHTRSFetcher(scheme_catalog, progs={}) + output = fetcher.build_pid_from_repo(remote_repo) + + assert output in (url, repo_pid) + + +@TRS_PARAMS_TESTBED +def test_materialize_repo_from_repo( + tmppath: "pathlib.Path", + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + url: "str", + remote_repo: "Optional[RemoteRepo]", + repo_pid: "Optional[str]", + upstream_repo: "Optional[RemoteRepo]", +) -> "None": + if remote_repo is None: + pytest.skip("Skipped test because no remote repo was provided") + else: + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), + ) + + fetcher = GA4GHTRSFetcher(scheme_catalog, progs={}) + materialized_repo = fetcher.materialize_repo_from_repo( + remote_repo, base_repo_destdir=tmppath + ) + + # Let's check the guessed repo' + assert materialized_repo.upstream_repo == upstream_repo diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index ef5d9fa6..51fa3e83 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -749,7 +749,7 @@ def processCacheCommand( cached_content = wfBackend.cacheFetch( cast("URIType", uri_to_fetch), args.cache_type, - offline=False, + offline=args.doCacheOffline, vault=vault, sec_context_name=secContextName, default_clonable=default_clonable, @@ -1256,6 +1256,13 @@ def _get_wfexs_argparse_internal( action="store_true", default=False, ) + ap_c.add_argument( + "--offline", + dest="doCacheOffline", + help="Try checking the offline behaviour of cache management", + action="store_true", + default=False, + ) ap_c.add_argument( "-g", "--glob", diff --git a/wfexs_backend/cache_handler.py b/wfexs_backend/cache_handler.py index ed0cf932..ee7a80a9 100644 --- a/wfexs_backend/cache_handler.py +++ b/wfexs_backend/cache_handler.py @@ -126,6 +126,10 @@ class 
CacheMetadataDict(TypedDict): InvalidFetcherException, ) +from .scheme_catalog import ( + SchemeCatalog, +) + from .utils.contents import link_or_copy from .utils.digests import ( ComputeDigestFromDirectory, @@ -161,13 +165,13 @@ class CacheHandlerSchemeException(CacheHandlerException): pass -class SchemeHandlerCacheHandler: +class CacheHandler: CACHE_METADATA_SCHEMA = cast("RelPath", "cache-metadata.json") def __init__( self, cacheDir: "pathlib.Path", - schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" = dict(), + scheme_catalog: "Optional[SchemeCatalog]" = None, ): # Getting a logger focused on specific classes self.logger = logging.getLogger( @@ -178,109 +182,9 @@ def __init__( # TODO: create caching database self.cacheDir = cacheDir - self.schemeHandlers: "MutableMapping[str, DocumentedProtocolFetcher]" = dict() - - self.addRawSchemeHandlers(schemeHandlers) - - def addRawSchemeHandlers( - self, schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" - ) -> None: - # No validation is done here about validness of schemes - if isinstance(schemeHandlers, dict): - self.schemeHandlers.update(schemeHandlers) - else: - raise InvalidFetcherException("Unable to add raw scheme handlers") - - def bypassSchemeHandler( - self, - scheme: "str", - handler: "Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]", - progs: "ProgsMapping" = dict(), - setup_block: "Optional[Mapping[str, Any]]" = None, - ) -> None: - """ - This method adds and overwrites a scheme handler, - instantiating it if it is a stateful one. - - :param scheme: - :param handler: - """ - the_handler: "DocumentedProtocolFetcher" - if isinstance(handler, DocumentedStatefulProtocolFetcher): - inst_handler = self.instantiateStatefulFetcher( - handler.fetcher_class, progs=progs, setup_block=setup_block - ) - the_handler = DocumentedProtocolFetcher( - fetcher=inst_handler.fetch, - description=inst_handler.description - if handler.description is None - else handler.description, - priority=handler.priority, - ) - elif isinstance(handler, DocumentedProtocolFetcher) and isinstance( - handler.fetcher, - ( - types.FunctionType, - types.LambdaType, - types.MethodType, - types.BuiltinFunctionType, - types.BuiltinMethodType, - ), - ): - the_handler = handler - else: - raise InvalidFetcherException( - "Trying to set for scheme {} a invalid handler".format(scheme) - ) - - self.schemeHandlers[scheme.lower()] = the_handler - - def bypassSchemeHandlers( - self, - schemeHandlers: "Mapping[str, Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]]", - ) -> None: - # No validation is done here about validness of schemes - if isinstance(schemeHandlers, dict): - for scheme, clazz in schemeHandlers.items(): - self.bypassSchemeHandler(scheme, clazz) - else: - raise InvalidFetcherException( - "Unable to instantiate to add scheme handlers" - ) - - def instantiateStatefulFetcher( - self, - statefulFetcher: "Type[StatefulFetcher]", - progs: "ProgsMapping" = dict(), - setup_block: "Optional[Mapping[str, Any]]" = None, - ) -> "StatefulFetcher": - """ - Method to instantiate stateful fetchers - """ - instStatefulFetcher = None - if inspect.isclass(statefulFetcher): - if issubclass(statefulFetcher, AbstractStatefulFetcher): - try: - instStatefulFetcher = statefulFetcher( - progs=progs, setup_block=setup_block - ) - except Exception as e: - raise FetcherInstanceException( - f"Error while instantiating {statefulFetcher.__name__}" - ) from e - - if instStatefulFetcher is None: - raise InvalidFetcherException( - "Unable to instantiate 
something which is not a class inheriting from AbstractStatefulFetcher" - ) - - return cast("StatefulFetcher", instStatefulFetcher) - - def describeRegisteredSchemes(self) -> "Sequence[Tuple[str, str, int]]": - return [ - (scheme, desc_fetcher.description, desc_fetcher.priority) - for scheme, desc_fetcher in self.schemeHandlers.items() - ] + if scheme_catalog is None: + scheme_catalog = SchemeCatalog() + self.scheme_catalog: "SchemeCatalog" = scheme_catalog def _genUriMetaCachedFilename( self, hashDir: "pathlib.Path", the_remote_file: "URIType" @@ -1092,9 +996,11 @@ def fetch( # Content is fetched here # As of RFC3986, schemes are case insensitive theScheme = parsedInputURL.scheme.lower() - schemeHandler = self.schemeHandlers.get(theScheme) + schemeHandler = self.scheme_catalog.get(theScheme) try: + # TODO: this code is redundant with the one in + # SchemeHandler method getSchemeHandler if schemeHandler is None: errmsg = f"No {theScheme} scheme handler for {the_remote_file} (while processing {remote_file}). Was this data injected in the cache?" self.logger.error(errmsg) @@ -1104,6 +1010,8 @@ def fetch( else: raise che + # TODO: this code is partially redundant with + # the one in SchemeHandler method fetch try: # Content is fetched here pfr = schemeHandler.fetcher( diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index bd4cf27d..bc7640b9 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -70,6 +70,10 @@ URIWithMetadata, ) + from ..scheme_catalog import ( + SchemeCatalog, + ) + class RepoDesc(TypedDict): repo: Required[RepoURL] tag: Required[Optional[RepoTag]] @@ -97,13 +101,21 @@ class ProtocolFetcherReturn(NamedTuple): ProtocolFetcherReturn, ] + ProtocolStreamFetcher: TypeAlias = Callable[ + [ + URIType, + IO[bytes], + DefaultNamedArg(Optional[SecurityContextConfig], "secContext"), + ], + ProtocolFetcherReturn, + ] + from urllib import parse from ..common import ( AbstractWfExSException, ) - # Default priority DEFAULT_PRIORITY: "Final[int]" = 0 @@ -158,6 +170,7 @@ def __init__( self, progs: "ProgsMapping" = dict(), setup_block: "Optional[Mapping[str, Any]]" = None, + scheme_catalog: "Optional[SchemeCatalog]" = None, ): import inspect @@ -169,6 +182,7 @@ def __init__( # This is used to resolve program names self.progs = progs self.setup_block = setup_block if isinstance(setup_block, dict) else dict() + self.scheme_catalog = scheme_catalog @abc.abstractmethod def fetch( @@ -236,6 +250,10 @@ class RepoGuessException(FetcherException): pass +class OfflineRepoGuessException(RepoGuessException): + pass + + class RepoType(enum.Enum): Git = "git" Raw = "raw" @@ -295,18 +313,64 @@ def get_checkout(self) -> "RepoTag": ) -class AbstractRepoFetcher(AbstractStatefulFetcher): +class MaterializedRepo(NamedTuple): + local: "pathlib.Path" + repo: "RemoteRepo" + metadata_array: "Sequence[URIWithMetadata]" + upstream_repo: "Optional[RemoteRepo]" = None + recommends_upstream: "bool" = False + + +class AbstractSchemeRepoFetcher(AbstractStatefulFetcher): PRIORITY: "ClassVar[int]" = DEFAULT_PRIORITY + 10 + """ + This abstract subclass is used to force the initialization of the + scheme catalog instance + """ + + def __init__( + self, + scheme_catalog: "SchemeCatalog", + progs: "ProgsMapping" = dict(), + setup_block: "Optional[Mapping[str, Any]]" = None, + ): + """ + The scheme catalog is enforced + """ + super().__init__( + progs=progs, setup_block=setup_block, scheme_catalog=scheme_catalog + ) + self.scheme_catalog: "SchemeCatalog" + @abc.abstractmethod - def materialize_repo( + def materialize_repo_from_repo( self, - repoURL: "RepoURL", - repoTag: "Optional[RepoTag]" = None, + repo: "RemoteRepo", repo_tag_destdir: "Optional[PathLikePath]" = None, base_repo_destdir: "Optional[PathLikePath]" = None, doUpdate: "Optional[bool]" = True, - ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": + ) -> "MaterializedRepo": + """ + Subclasses have to implement this method, which is used to materialize + a repository described by a RemoteRepo instance. + + :param repo: The description of the repository to be materialized. + :type repo: class: `wfexs_backend.fetchers.RemoteRepo` + :param repo_tag_destdir: Destination of the materialized repo. + :type repo_tag_destdir: str, `os.PathLike[str]`, optional + :param base_repo_destdir: If repo_tag_destdir is None, parent directory of the newly created destination directory for the repo. + :type base_repo_destdir: str, `os.PathLike[str]`, optional + :param doUpdate: Should the code try updating an already materialized repo? Defaults to False + :type doUpdate: bool + + The returned tuple has next elements: + * The local path where the repo was materialized. + * A RemoteRepo instance. + * The metadata gathered through the materialisation process. + * An optional, upstream URI representing the repo. For instance, + in the case of a TRS or a SWH hosted repo, the registered upstream URL. 
+ """ pass @abc.abstractmethod @@ -314,8 +378,11 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": """ This method is required to generate a PID which usually represents an element (usually a workflow) in a repository. - If the fetcher does not recognize the type of repo, it should + If the fetcher does not recognize the type of repo, either using + repo_url content or the repo type in the worst case, it should return None + + Calling this method in offline mode should be safe """ pass @@ -326,12 +393,13 @@ def GuessRepoParams( orig_wf_url: "Union[URIType, parse.ParseResult]", logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, + offline: "bool" = False, ) -> "Optional[RemoteRepo]": pass if TYPE_CHECKING: - RepoFetcher = TypeVar("RepoFetcher", bound=AbstractRepoFetcher) + SchemeRepoFetcher = TypeVar("SchemeRepoFetcher", bound=AbstractSchemeRepoFetcher) class AbstractStatefulStreamingFetcher(AbstractStatefulFetcher): @@ -353,6 +421,6 @@ def streamfetch( ) -> "ProtocolFetcherReturn": """ This is the method to be implemented by the stateful streaming fetcher - which can receive as destination either a file + which can receive as destination a byte stream """ pass diff --git a/wfexs_backend/fetchers/b2share.py b/wfexs_backend/fetchers/b2share.py index c2829fa5..b31d4429 100644 --- a/wfexs_backend/fetchers/b2share.py +++ b/wfexs_backend/fetchers/b2share.py @@ -35,7 +35,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( ContentKind, @@ -94,11 +94,12 @@ def fetchB2SHARE( metadata_url = cast("URIType", parse.urljoin(B2SHARE_RECORD_REST, b2share_id)) + http_fetcher = HTTPFetcher() gathered_meta = {"fetched": metadata_url} metadata_array = [URIWithMetadata(remote_file, gathered_meta)] try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = http_fetcher.streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) @@ -178,12 +179,14 @@ def fetchB2SHARE( the_file_local_path = cast( "AbsPath", os.path.join(cachedFilename, relpath) ) - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_file["ePIC_PID"], the_file_local_path ) metadata_array.extend(metacont) else: - _, metacont, _ = fetchClassicURL(the_files[0]["ePIC_PID"], cachedFilename) + _, metacont, _ = http_fetcher.fetch( + the_files[0]["ePIC_PID"], cachedFilename + ) metadata_array.extend(metacont) except FetcherException as fe: raise FetcherException( diff --git a/wfexs_backend/fetchers/doi.py b/wfexs_backend/fetchers/doi.py index d99d1cc2..367852f3 100644 --- a/wfexs_backend/fetchers/doi.py +++ b/wfexs_backend/fetchers/doi.py @@ -34,7 +34,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( URIWithMetadata, @@ -95,9 +95,10 @@ def fetchDOI( gathered_ra_meta = {"fetched": metadata_ra_url} metadata_array = [URIWithMetadata(remote_file, gathered_ra_meta)] + http_fetcher = HTTPFetcher() try: metaio = io.BytesIO() - _, metametaraio, _ = fetchClassicURL(metadata_ra_url, metaio) + _, metametaraio, _ = http_fetcher.streamfetch(metadata_ra_url, metaio) metadata_ra = json.loads(metaio.getvalue().decode("utf-8")) gathered_ra_meta["payload"] = metadata_ra metadata_array.extend(metametaraio) @@ -120,7 +121,7 @@ def fetchDOI( metadata_array.append(URIWithMetadata(remote_file, 
gathered_meta)) try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = http_fetcher.streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) diff --git a/wfexs_backend/fetchers/drs.py b/wfexs_backend/fetchers/drs.py index fe0f1831..be809007 100644 --- a/wfexs_backend/fetchers/drs.py +++ b/wfexs_backend/fetchers/drs.py @@ -52,7 +52,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( LicensedURI, @@ -73,7 +73,7 @@ def query_n2t( gathered_meta = {"fetched": query_url} n2t_io = io.BytesIO() - _, meta_n2t_io, _ = fetchClassicURL(query_url, n2t_io) + _, meta_n2t_io, _ = HTTPFetcher().streamfetch(query_url, n2t_io) answer = yaml.safe_load(n2t_io.getvalue().decode("utf-8")) gathered_meta["payload"] = answer @@ -197,11 +197,12 @@ def downloadContentFromDRS( "URIType", drs_service_prefix + "objects/" + object_id ) + http_fetcher = HTTPFetcher() gathered_meta = {"fetched": object_metadata_url} metadata = None try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL( + _, metametaio, _ = http_fetcher.streamfetch( object_metadata_url, metaio, secContext=upperSecContext ) object_metadata = json.loads(metaio.getvalue().decode("utf-8")) @@ -234,7 +235,7 @@ def downloadContentFromDRS( try: metaaccio = io.BytesIO() - _, metametaaccio, _ = fetchClassicURL( + _, metametaaccio, _ = http_fetcher.streamfetch( object_access_metadata_url, metaaccio, secContext=upperSecContext, diff --git a/wfexs_backend/fetchers/fasp.py b/wfexs_backend/fetchers/fasp.py index 6e99156a..4014f985 100644 --- a/wfexs_backend/fetchers/fasp.py +++ b/wfexs_backend/fetchers/fasp.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -45,6 +45,10 @@ URIType, ) + from ..scheme_catalog import ( + SchemeCatalog, + ) + from . import ( AbstractStatefulFetcher, DocumentedStatefulProtocolFetcher, @@ -64,9 +68,14 @@ class FASPFetcher(AbstractStatefulFetcher): DEFAULT_ASPERA_CMD: "Final[SymbolicName]" = cast("SymbolicName", "ascp") def __init__( - self, progs: "ProgsMapping", setup_block: "Optional[Mapping[str, Any]]" = None + self, + progs: "ProgsMapping", + setup_block: "Optional[Mapping[str, Any]]" = None, + scheme_catalog: "Optional[SchemeCatalog]" = None, ): - super().__init__(progs=progs, setup_block=setup_block) + super().__init__( + progs=progs, setup_block=setup_block, scheme_catalog=scheme_catalog + ) self.ascp_cmd = self.progs.get( self.DEFAULT_ASPERA_CMD, cast("RelPath", self.DEFAULT_ASPERA_CMD) diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index d019937b..6830b7e2 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,9 +20,12 @@ import hashlib import os import pathlib +import re import shutil import subprocess import tempfile +import warnings + from typing import ( cast, TYPE_CHECKING, @@ -33,6 +36,7 @@ from typing import ( Any, + ClassVar, Mapping, MutableMapping, MutableSequence, @@ -47,6 +51,10 @@ Final, ) + from ..scheme_catalog import ( + SchemeCatalog, + ) + from ..common import ( AbsPath, AnyPath, @@ -67,12 +75,17 @@ from urllib import parse, request +from dulwich.client import get_transport_and_path + import dulwich.porcelain +import dulwich.repo from . import ( - AbstractRepoFetcher, + AbstractSchemeRepoFetcher, DocumentedStatefulProtocolFetcher, FetcherException, + MaterializedRepo, + OfflineRepoGuessException, ProtocolFetcherReturn, RemoteRepo, RepoGuessException, @@ -90,7 +103,9 @@ GITHUB_NETLOC = "github.com" -class GitFetcher(AbstractRepoFetcher): +class GitFetcher(AbstractSchemeRepoFetcher): + PRIORITY: "ClassVar[int]" = AbstractSchemeRepoFetcher.PRIORITY + 10 + GIT_PROTO: "Final[str]" = "git" GIT_PROTO_PREFIX: "Final[str]" = GIT_PROTO + "+" GITHUB_SCHEME: "Final[str]" = "github" @@ -102,9 +117,12 @@ class GitFetcher(AbstractRepoFetcher): GIT_SCHEMES: "Final[Sequence[str]]" = ["https", "git", "ssh", "file"] def __init__( - self, progs: "ProgsMapping", setup_block: "Optional[Mapping[str, Any]]" = None + self, + scheme_catalog: "SchemeCatalog", + progs: "ProgsMapping" = dict(), + setup_block: "Optional[Mapping[str, Any]]" = None, ): - super().__init__(progs=progs, setup_block=setup_block) + super().__init__(scheme_catalog, progs=progs, setup_block=setup_block) self.git_cmd = self.progs.get( self.DEFAULT_GIT_CMD, cast("RelPath", self.DEFAULT_GIT_CMD) @@ -138,12 +156,23 @@ def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": def _find_git_repo_in_uri( cls, remote_file: "Union[URIType, parse.ParseResult]", + offline: "bool" = False, ) -> "Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]": if isinstance(remote_file, parse.ParseResult): parsedInputURL = remote_file else: parsedInputURL = parse.urlparse(remote_file) - sp_path = parsedInputURL.path.split("/") + + # Getting the tag or branch in cases like https://github.com/inab/WfExS-backend.git@0.2.0 + repoTag: "Optional[str]" = None + if "@" in parsedInputURL.path: + gitPath, repoTag = parsedInputURL.path.rsplit("@", 1) + reparsedInputURL = parsedInputURL._replace(path=gitPath) + else: + gitPath = parsedInputURL.path + reparsedInputURL = parsedInputURL + + sp_path = reparsedInputURL.path.split("/") shortest_pre_path: "Optional[URIType]" = None longest_post_path: "Optional[Sequence[str]]" = None @@ -156,70 +185,126 @@ def _find_git_repo_in_uri( pre_path = "/".join(sp_path[:pos]) if pre_path == "": pre_path = "/" - remote_uri_anc = parse.urlunparse(parsedInputURL._replace(path=pre_path)) + # Remove fragments + remote_uri_anc = parse.urlunparse( + reparsedInputURL._replace(path=pre_path, fragment="") + ) remote_refs_dict: "Mapping[bytes, bytes]" + if offline: + raise OfflineRepoGuessException( + f"Query to {remote_uri_anc} is not allowed in offline mode" + ) try: # Dulwich works both with file, ssh, git and http(s) protocols remote_refs_dict = dulwich.porcelain.ls_remote(remote_uri_anc) repo_type = RepoType.Git + + break except ( dulwich.errors.NotGitRepository, dulwich.errors.GitProtocolError, ) as ngr: # Skip and continue continue + else: + # Metadata is all we really need + remote_uri_anc = parse.urlunparse(reparsedInputURL._replace(fragment="")) + req = request.Request(remote_uri_anc, method="HEAD") + try: + with 
request.urlopen(req) as resp: + # Is it gitlab? + if list( + filter( + lambda c: "gitlab" in c, + resp.headers.get_all("Set-Cookie"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.GitLab + elif list( + filter( + lambda c: GITHUB_NETLOC in c, + resp.headers.get_all("Set-Cookie"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.GitHub + elif list( + filter( + lambda c: "bitbucket" in c, + resp.headers.get_all("X-View-Name"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.BitBucket + except Exception as e: + pass - the_remote_uri = remote_uri_anc + if repo_type != RepoType.Git: + raise RepoGuessException(f"Unable to identify {remote_file} as a git repo") - head_remote_ref = remote_refs_dict[cls.HEAD_LABEL] - repo_branches = [] - b_default_repo_tag = None - for remote_label, remote_ref in remote_refs_dict.items(): - if remote_label.startswith(cls.REFS_HEADS_PREFIX): - b_repo_tag = remote_label[len(cls.REFS_HEADS_PREFIX) :].decode( - "utf-8", errors="continue" - ) - repo_branches.append(cast("RepoTag", b_repo_tag)) - if b_default_repo_tag is None and remote_ref == head_remote_ref: + the_remote_uri = remote_uri_anc + + # Now, try matching either a branch or a tag + head_remote_ref = remote_refs_dict[cls.HEAD_LABEL] + repo_branches = [] + b_default_repo_tag = None + b_checkout: "Optional[RepoTag]" = None + for remote_label, remote_ref in remote_refs_dict.items(): + b_repo_tag: "Optional[str]" = None + if remote_label.startswith(cls.REFS_HEADS_PREFIX): + b_repo_tag = remote_label[len(cls.REFS_HEADS_PREFIX) :].decode( + "utf-8", errors="continue" + ) + elif remote_label.startswith(cls.REFS_TAGS_PREFIX): + b_repo_tag = remote_label[len(cls.REFS_TAGS_PREFIX) :].decode( + "utf-8", errors="continue" + ) + + if b_repo_tag is not None: + repo_branches.append(cast("RepoTag", b_repo_tag)) + if b_default_repo_tag is None: + b_remote_ref = remote_ref.decode("utf-8", errors="continue") + if repoTag is None and remote_ref == head_remote_ref: b_default_repo_tag = b_repo_tag + b_checkout = cast("RepoTag", b_remote_ref) + elif repoTag in (b_repo_tag, b_remote_ref): + b_default_repo_tag = repoTag + b_checkout = cast("RepoTag", b_remote_ref) - # It is considered a git repo! - shortest_pre_path = cast("URIType", pre_path) - longest_post_path = sp_path[pos:] - if repo_type is None: - # Metadata is all we really need - repo_type = RepoType.Raw - req = request.Request(remote_uri_anc, method="HEAD") - try: - with request.urlopen(req) as resp: - # Is it gitlab? - if list( - filter( - lambda c: "gitlab" in c, - resp.headers.get_all("Set-Cookie"), - ) - ): - repo_type = RepoType.Git - guessed_repo_flavor = RepoGuessFlavor.GitLab - elif list( - filter( - lambda c: GITHUB_NETLOC in c, - resp.headers.get_all("Set-Cookie"), - ) - ): - repo_type = RepoType.Git - guessed_repo_flavor = RepoGuessFlavor.GitHub - elif list( - filter( - lambda c: "bitbucket" in c, - resp.headers.get_all("X-View-Name"), - ) - ): - repo_type = RepoType.Git - guessed_repo_flavor = RepoGuessFlavor.BitBucket - except Exception as e: - pass + if b_default_repo_tag is not None: + break + + if b_default_repo_tag is None: + if repoTag is None: + raise RepoGuessException( + f"No tag was obtained while getting default branch name from {remote_file}" + ) + + if len(repoTag) != 40 or re.search(r"[^a-f0-9]", repoTag): + raise RepoGuessException( + f"Tried to use ill formed {repoTag} SHA to query {remote_file} repo. It should be a 40 characters alphanumeric code." 
+ ) + + # Possible sha in repoTag + memory_repo = dulwich.repo.MemoryRepo() + transport, path = get_transport_and_path(remote_uri_anc) + fetch_pack_result = transport.fetch( + path, cast("dulwich.repo.Repo", memory_repo) + ) + try: + memory_repo.get_object(repoTag.encode("utf-8")) + b_default_repo_tag = repoTag + b_checkout = cast("RepoTag", repoTag) + except (Exception, ValueError) as e: + raise RepoGuessException( + f"Git repo {remote_uri_anc} does not have either a reference or a commit identified as {repoTag}" + ) from e + + # It is considered a git repo! + shortest_pre_path = cast("URIType", pre_path) + longest_post_path = sp_path[pos:] if repo_type is None: raise RepoGuessException(f"Unable to identify {remote_file} as a git repo") @@ -227,16 +312,32 @@ def _find_git_repo_in_uri( if b_default_repo_tag is None: raise RepoGuessException( f"No tag was obtained while getting default branch name from {remote_file}" + if repoTag is None + else f"No tag matched {repoTag} from {remote_file}" ) assert longest_post_path is not None assert repo_branches is not None + # Getting the repoRelPath (if available) + the_rel_path: "Optional[RelPath]" = None + if len(reparsedInputURL.fragment) > 0: + frag_qs = parse.parse_qs(reparsedInputURL.fragment) + subDirArr = frag_qs.get("subdirectory", []) + if len(subDirArr) > 0: + the_rel_path = cast( + "RelPath", "/".join([*longest_post_path, subDirArr[0]]) + ) + elif len(longest_post_path) > 0: + the_rel_path = cast("RelPath", "/".join(longest_post_path)) + repo = RemoteRepo( repo_url=cast("RepoURL", the_remote_uri), + rel_path=the_rel_path, tag=cast("RepoTag", b_default_repo_tag), repo_type=repo_type, guess_flavor=guessed_repo_flavor, + checkout=b_checkout, ) return repo, longest_post_path, repo_branches @@ -246,7 +347,12 @@ def GuessRepoParams( wf_url: "Union[URIType, parse.ParseResult]", logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, + offline: "bool" = False, ) -> "Optional[RemoteRepo]": + """ + When it is in offline mode, GuessRepoParams can raise an OfflineRepoGuessException + """ + repoURL = None repoTag = None repoRelPath = None @@ -288,7 +394,9 @@ def GuessRepoParams( fragment="", ) ) - found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) + found_params = cls._find_git_repo_in_uri( + cast("URIType", repoURL), offline=offline + ) elif ( parsed_wf_url.scheme in ("http", "https") @@ -296,7 +404,7 @@ def GuessRepoParams( and "@" not in parsed_wf_url.path and parsed_wf_url.fragment == "" ): - found_params = cls._find_git_repo_in_uri(parsed_wf_url) + found_params = cls._find_git_repo_in_uri(parsed_wf_url, offline=offline) repoURL = found_params[0].repo_url repoType = RepoType.Git guessedRepoFlavor = RepoGuessFlavor.GitHub @@ -343,7 +451,9 @@ def GuessRepoParams( # And now, guessing the tag/checkout and the relative path # WARNING! 
This code can have problems with tags which contain slashes - found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) + found_params = cls._find_git_repo_in_uri( + cast("URIType", repoURL), offline=offline + ) if len(wf_path) >= 4: repo_branches_tags = found_params[2] # Validate against existing branch and tag names @@ -366,10 +476,12 @@ def GuessRepoParams( repoTag = wf_path[3] if len(wf_path) > 4: repoRelPath = "/".join(wf_path[4:]) + # TODO handling other popular cases, like bitbucket elif ( parsed_wf_url.scheme == "" or (parsed_wf_url.scheme in cls.GetSchemeHandlers()) or (parsed_wf_url.scheme in cls.GIT_SCHEMES) + or parsed_wf_url.scheme == "http" ): if parsed_wf_url.scheme == "": # It could be a checkout uri in the form of 'git@github.com:inab/WfExS-backend.git' @@ -420,35 +532,23 @@ def GuessRepoParams( parse.urlunparse(denorm_parsed_wf_url) ) - # Getting the tag or branch - if "@" in parsed_wf_url.path: - gitPath, repoTag = parsed_wf_url.path.split("@", 1) - else: - gitPath = parsed_wf_url.path - - # Getting the repoRelPath (if available) - if len(parsed_wf_url.fragment) > 0: - frag_qs = parse.parse_qs(parsed_wf_url.fragment) - subDirArr = frag_qs.get("subdirectory", []) - if len(subDirArr) > 0: - repoRelPath = subDirArr[0] + found_params = cls._find_git_repo_in_uri(parsed_wf_url, offline=offline) + if found_params is not None: + repoURL = found_params[0].repo_url + repoRelPath = found_params[0].rel_path + guessedRepoFlavor = found_params[0].guess_flavor - # Now, reassemble the repoURL - repoURL = parse.urlunparse( - (gitScheme, parsed_wf_url.netloc, gitPath, "", "", "") - ) - found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) - guessedRepoFlavor = found_params[0].guess_flavor - # TODO handling other popular cases, like bitbucket else: - found_params = cls._find_git_repo_in_uri(parsed_wf_url) + return None + except OfflineRepoGuessException as ogge: + raise except RepoGuessException as gge: if not fail_ok: import traceback traceback.print_exc() - raise FetcherException( + raise RepoGuessException( f"FIXME: Unsupported http(s) git repository {wf_url} (see cascade exception)" ) from gge @@ -459,7 +559,7 @@ def GuessRepoParams( if guessedRepoFlavor is None: guessedRepoFlavor = found_params[0].guess_flavor elif not fail_ok: - raise FetcherException( + raise RepoGuessException( f"FIXME: Unsupported git repository {wf_url}. (Is it really a git repo???)" ) @@ -501,9 +601,12 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": """ This method is required to generate a PID which usually represents an element (usually a workflow) in a repository. - If the fetcher does not recognize the type of repo, it should + If the fetcher does not recognize the type of repo, either using + repo_url content or the repo type in the worst case, it should return None """ + if remote_repo.repo_type not in (RepoType.Git, None): + return None parsed_wf_url = parse.urlparse(remote_repo.repo_url) retval: "Optional[str]" = None @@ -651,14 +754,13 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": return retval - def materialize_repo( + def materialize_repo_from_repo( self, - repoURL: "RepoURL", - repoTag: "Optional[RepoTag]" = None, + repo: "RemoteRepo", repo_tag_destdir: "Optional[PathLikePath]" = None, base_repo_destdir: "Optional[PathLikePath]" = None, doUpdate: "Optional[bool]" = True, - ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": + ) -> "MaterializedRepo": """ :param repoURL: The URL to the repository. 
@@ -667,6 +769,13 @@ def materialize_repo( :param doUpdate: :return: """ + if repo.repo_type not in (RepoType.Git, None): + raise FetcherException( + f"Input RemoteRepo instance is not recognized as a fetchable URI (type {repo.repo_type})" + ) + + repoURL = repo.repo_url + repoTag = repo.tag # Assure directory exists before next step if repo_tag_destdir is None: @@ -746,14 +855,17 @@ def materialize_repo( with tempfile.NamedTemporaryFile() as git_stdout, tempfile.NamedTemporaryFile() as git_stderr: # First, (bare) clone retval = 0 + failed_command = [] if gitclone_params is not None: self.logger.debug(f'Running "{" ".join(gitclone_params)}"') + failed_command = gitclone_params retval = subprocess.call( gitclone_params, stdout=git_stdout, stderr=git_stderr ) # Then, checkout (which can be optional) if retval == 0 and (gitcheckout_params is not None): self.logger.debug(f'Running "{" ".join(gitcheckout_params)}"') + failed_command = gitcheckout_params retval = subprocess.Popen( gitcheckout_params, stdout=git_stdout, @@ -772,6 +884,7 @@ def materialize_repo( ] self.logger.debug(f'Running "{" ".join(gitsubmodule_params)}"') + failed_command = gitsubmodule_params retval = subprocess.Popen( gitsubmodule_params, stdout=git_stdout, @@ -787,9 +900,21 @@ def materialize_repo( with open(git_stderr.name, "r") as c_stF: git_stderr_v = c_stF.read() - errstr = "ERROR: Unable to pull '{}' (tag '{}'). Retval {}\n======\nSTDOUT\n======\n{}\n======\nSTDERR\n======\n{}".format( - repoURL, repoTag, retval, git_stdout_v, git_stderr_v + errstr = "ERROR: Unable to pull '{}' (tag '{}').\nFailed command: {}\nRetval {}\n======\nSTDOUT\n======\n{}\n======\nSTDERR\n======\n{}".format( + repoURL, + repoTag, + " ".join(failed_command), + retval, + git_stdout_v, + git_stderr_v, ) + + if repo_tag_destpath.exists(): + self.logger.warning( + f"Failed git command, removing incomplete path {repo_tag_destpath.as_posix()}" + ) + shutil.rmtree(repo_tag_destpath, ignore_errors=True) + raise FetcherException(errstr) # Last, we have to obtain the effective checkout @@ -808,17 +933,15 @@ def materialize_repo( "RepoTag", revproc.stdout.read().rstrip() ) - remote_repo = RemoteRepo( - repo_url=repoURL, - tag=repoTag, + remote_repo = repo._replace( repo_type=RepoType.Git, checkout=repo_effective_checkout, ) - return ( - repo_tag_destpath, - remote_repo, - [], + return MaterializedRepo( + local=repo_tag_destpath, + repo=remote_repo, + metadata_array=[], ) def fetch( @@ -872,7 +995,7 @@ def fetch( repoTag: "Optional[RepoTag]" if "@" in parsedInputURL.path: gitPath, repoTag = cast( - "Tuple[str, RepoTag]", tuple(parsedInputURL.path.split("@", 1)) + "Tuple[str, RepoTag]", tuple(parsedInputURL.path.rsplit("@", 1)) ) else: gitPath = parsedInputURL.path @@ -892,9 +1015,13 @@ def fetch( parse.urlunparse((gitScheme, parsedInputURL.netloc, gitPath, "", "", "")), ) - repo_tag_destdir, remote_repo, metadata_array = self.materialize_repo( - repoURL, repoTag=repoTag + materialized_repo_return = self.materialize_repo_from_repo( + RemoteRepo(repo_url=repoURL, tag=repoTag), ) + repo_tag_destdir = materialized_repo_return.local + remote_repo = materialized_repo_return.repo + metadata_array = materialized_repo_return.metadata_array + if repoRelPath is not None: remote_repo = remote_repo._replace(rel_path=cast("RelPath", repoRelPath)) diff --git a/wfexs_backend/fetchers/http.py b/wfexs_backend/fetchers/http.py index c88cd90b..d630b468 100644 --- a/wfexs_backend/fetchers/http.py +++ b/wfexs_backend/fetchers/http.py @@ -30,6 +30,7 @@ from typing import ( Any, 
Callable, + ClassVar, Iterable, IO, Mapping, @@ -42,6 +43,10 @@ Union, ) + from typing_extensions import ( + Final, + ) + from _typeshed import SupportsRead from ssl import SSLContext from mypy_extensions import DefaultNamedArg @@ -63,7 +68,9 @@ from . import ( AbstractStatefulFetcher, + AbstractStatefulStreamingFetcher, DocumentedProtocolFetcher, + DocumentedStatefulProtocolFetcher, FetcherException, ProtocolFetcherReturn, ) @@ -78,128 +85,133 @@ ) -def fetchClassicURL( - remote_file: "URIType", - cachedFilename: "Union[PathLikePath, IO[bytes]]", - secContext: "Optional[SecurityContextConfig]" = None, -) -> "ProtocolFetcherReturn": - """ - Method to fetch contents from http, https and ftp - - :param remote_file: - :param cachedFilename: - :param secContext: - """ - - # This is needed to remove possible embedded credentials, - # which should not be stored in the cache - orig_remote_file = remote_file - parsedInputURL, remote_file = AbstractStatefulFetcher.ParseAndRemoveCredentials( - orig_remote_file - ) - # Now the credentials are properly removed from remote_file - # we get them from the parsed url - username = parsedInputURL.username - password = parsedInputURL.password - - if isinstance(secContext, dict): - headers = secContext.get("headers", {}).copy() - token = secContext.get("token") - token_header = secContext.get("token_header") - username = secContext.get("username", username) - password = secContext.get("password", password) - - method = secContext.get("method") - data = secContext.get("data") - else: - headers = {} - method = None - data = None - token = None - token_header = None - - # Callable[[Union[str, Request], Union[bytes, SupportsRead[bytes], Iterable[bytes], None], Optional[float]], Any] - # Callable[[Union[str, Request], Optional[Union[bytes, SupportsRead[bytes], Iterable[bytes], None]], Optional[float], DefaultNamedArg(Optional[str], 'cafile'), DefaultNamedArg(Optional[str], 'capath'), DefaultNamedArg(bool, 'cadefault'), DefaultNamedArg(Optional[SSLContext], 'context')], Any] - opener: "Union[Callable[[Union[str, request.Request], Union[bytes, SupportsRead[bytes], Iterable[bytes], None], Optional[float]], Any], Callable[[Union[str, request.Request], Optional[Union[bytes, SupportsRead[bytes], Iterable[bytes]]], Optional[float], DefaultNamedArg(Optional[str], 'cafile'), DefaultNamedArg(Optional[str], 'capath'), DefaultNamedArg(bool, 'cadefault'), DefaultNamedArg(Optional[SSLContext], 'context')], Any]]" - opener = request.urlopen - if token is not None: - if token_header is not None: - headers[token_header] = token - else: - headers["Authorization"] = f"Bearer {token}" - elif username is not None: - if password is None: - password = "" - - opener = get_opener_with_auth(remote_file, username, password).open - - # # Time to set up user and password in URL - # parsedInputURL = parse.urlparse(remote_file) - # - # netloc = parse.quote(username, safe='') + ':' + parse.quote(password, - # safe='') + '@' + parsedInputURL.hostname - # if parsedInputURL.port is not None: - # netloc += ':' + str(parsedInputURL.port) - # - # # Now the credentials are properly set up - # remote_file = cast("URIType", parse.urlunparse((parsedInputURL.scheme, netloc, parsedInputURL.path, - # parsedInputURL.params, parsedInputURL.query, parsedInputURL.fragment))) - - # Preparing where it is going to be written - download_file: "IO[bytes]" - if isinstance(cachedFilename, (str, os.PathLike)): - download_file = open(cachedFilename, "wb") - else: - download_file = cachedFilename - - uri_with_metadata = 
None - try: - req_remote = request.Request( - remote_file, headers=headers, data=data, method=method - ) - with opener(req_remote) as url_response: - uri_with_metadata = URIWithMetadata( - uri=url_response.url, metadata=dict(url_response.headers.items()) - ) +class HTTPFetcher(AbstractStatefulStreamingFetcher): + PRIORITY: "ClassVar[int]" = 20 + HTTP_PROTO: "Final[str]" = "http" + HTTPS_PROTO: "Final[str]" = "https" - while True: - try: - # Try getting it - shutil.copyfileobj(url_response, download_file) - except http.client.IncompleteRead as icread: - download_file.write(icread.partial) - # Restarting the copy - continue - break - - except urllib.error.HTTPError as he: - raise FetcherException( - "Error fetching {} : {} {}\n{}".format( - orig_remote_file, he.code, he.reason, he.read().decode() + @classmethod + def GetSchemeHandlers(cls) -> "Mapping[str, DocumentedStatefulProtocolFetcher]": + # These are de-facto schemes supported by pip and git client + return { + cls.HTTP_PROTO: DocumentedStatefulProtocolFetcher( + fetcher_class=cls, + description="HTTP download URLs", + priority=cls.PRIORITY, ), - code=he.code, - reason=he.reason, - ) from he - finally: - # Closing files opened by this code - if download_file != cachedFilename: - download_file.close() - - return ProtocolFetcherReturn( - kind_or_resolved=ContentKind.File, - metadata_array=[uri_with_metadata], - ) - - -SCHEME_HANDLERS: "Mapping[str, DocumentedProtocolFetcher]" = { - "http": DocumentedProtocolFetcher( - fetcher=fetchClassicURL, - description="HTTP download URLs", - priority=20, - ), - "https": DocumentedProtocolFetcher( - fetcher=fetchClassicURL, - description="HTTPS download URLs", - priority=20, - ), -} + cls.HTTPS_PROTO: DocumentedStatefulProtocolFetcher( + fetcher_class=cls, + description="HTTPS download URLs", + priority=cls.PRIORITY, + ), + } + + @property + def description(self) -> "str": + return "HTTP and HTTPS download URLs" + + @classmethod + def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": + return tuple() + + def streamfetch( + self, + remote_file: "URIType", + dest_stream: "IO[bytes]", + secContext: "Optional[SecurityContextConfig]" = None, + ) -> "ProtocolFetcherReturn": + """ + Method to fetch contents from http and https. 
+ This is the method to be implemented by the stateful streaming fetcher + which can receive as destination a byte stream + + :param remote_file: + :param dest_stream: + :param secContext: + """ + + # This is needed to remove possible embedded credentials, + # which should not be stored in the cache + orig_remote_file = remote_file + parsedInputURL, remote_file = self.ParseAndRemoveCredentials(orig_remote_file) + # Now the credentials are properly removed from remote_file + # we get them from the parsed url + username = parsedInputURL.username + password = parsedInputURL.password + + if isinstance(secContext, dict): + headers = secContext.get("headers", {}).copy() + token = secContext.get("token") + token_header = secContext.get("token_header") + username = secContext.get("username", username) + password = secContext.get("password", password) + + method = secContext.get("method") + data = secContext.get("data") + else: + headers = {} + method = None + data = None + token = None + token_header = None + + # Callable[[Union[str, Request], Union[bytes, SupportsRead[bytes], Iterable[bytes], None], Optional[float]], Any] + # Callable[[Union[str, Request], Optional[Union[bytes, SupportsRead[bytes], Iterable[bytes], None]], Optional[float], DefaultNamedArg(Optional[str], 'cafile'), DefaultNamedArg(Optional[str], 'capath'), DefaultNamedArg(bool, 'cadefault'), DefaultNamedArg(Optional[SSLContext], 'context')], Any] + opener: "Union[Callable[[Union[str, request.Request], Union[bytes, SupportsRead[bytes], Iterable[bytes], None], Optional[float]], Any], Callable[[Union[str, request.Request], Optional[Union[bytes, SupportsRead[bytes], Iterable[bytes]]], Optional[float], DefaultNamedArg(Optional[str], 'cafile'), DefaultNamedArg(Optional[str], 'capath'), DefaultNamedArg(bool, 'cadefault'), DefaultNamedArg(Optional[SSLContext], 'context')], Any]]" + opener = request.urlopen + if token is not None: + if token_header is not None: + headers[token_header] = token + else: + headers["Authorization"] = f"Bearer {token}" + elif username is not None: + if password is None: + password = "" + + opener = get_opener_with_auth(remote_file, username, password).open + + # # Time to set up user and password in URL + # parsedInputURL = parse.urlparse(remote_file) + # + # netloc = parse.quote(username, safe='') + ':' + parse.quote(password, + # safe='') + '@' + parsedInputURL.hostname + # if parsedInputURL.port is not None: + # netloc += ':' + str(parsedInputURL.port) + # + # # Now the credentials are properly set up + # remote_file = cast("URIType", parse.urlunparse((parsedInputURL.scheme, netloc, parsedInputURL.path, + # parsedInputURL.params, parsedInputURL.query, parsedInputURL.fragment))) + + uri_with_metadata = None + try: + req_remote = request.Request( + remote_file, headers=headers, data=data, method=method + ) + with opener(req_remote) as url_response: + uri_with_metadata = URIWithMetadata( + uri=url_response.url, metadata=dict(url_response.headers.items()) + ) + + while True: + try: + # Try getting it + shutil.copyfileobj(url_response, dest_stream) + except http.client.IncompleteRead as icread: + dest_stream.write(icread.partial) + # Restarting the copy + continue + break + + except urllib.error.HTTPError as he: + raise FetcherException( + "Error fetching {} : {} {}\n{}".format( + orig_remote_file, he.code, he.reason, he.read().decode() + ), + code=he.code, + reason=he.reason, + ) from he + + return ProtocolFetcherReturn( + kind_or_resolved=ContentKind.File, + metadata_array=[uri_with_metadata], + ) diff --git 
a/wfexs_backend/fetchers/osf_io.py b/wfexs_backend/fetchers/osf_io.py index 60ff9fef..de108ebd 100644 --- a/wfexs_backend/fetchers/osf_io.py +++ b/wfexs_backend/fetchers/osf_io.py @@ -35,7 +35,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( ContentKind, @@ -93,11 +93,12 @@ def fetchOSFIO( metadata_url = cast("URIType", parse.urljoin(OSF_IO_RECORD_REST, osf_io_id)) + http_fetcher = HTTPFetcher() gathered_meta = {"fetched": metadata_url} metadata_array = [URIWithMetadata(remote_file, gathered_meta)] try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = http_fetcher.streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) @@ -131,7 +132,7 @@ def fetchOSFIO( metadata_array.append(URIWithMetadata(remote_file, gathered_l_meta)) try: metaio = io.BytesIO() - _, metametalicio, _ = fetchClassicURL(osf_io_lic_link, metaio) + _, metametalicio, _ = http_fetcher.streamfetch(osf_io_lic_link, metaio) l_metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_l_meta["payload"] = l_metadata metadata_array.extend(metametalicio) @@ -166,7 +167,7 @@ def fetchOSFIO( metadata_array.append(URIWithMetadata(remote_file, gathered_fm_meta)) try: metaio = io.BytesIO() - _, metametafmio, _ = fetchClassicURL(osf_io_files_meta_link, metaio) + _, metametafmio, _ = http_fetcher.streamfetch(osf_io_files_meta_link, metaio) fm_metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_fm_meta["payload"] = fm_metadata metadata_array.extend(metametafmio) @@ -199,7 +200,7 @@ def fetchOSFIO( metadata_array.append(URIWithMetadata(remote_file, gathered_s_meta)) try: metaio = io.BytesIO() - _, metametasio, _ = fetchClassicURL(osf_io_store_link, metaio) + _, metametasio, _ = http_fetcher.streamfetch(osf_io_store_link, metaio) s_metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_s_meta["payload"] = s_metadata metadata_array.extend(metametasio) @@ -265,12 +266,12 @@ def fetchOSFIO( the_file_local_path = cast( "AbsPath", os.path.join(cachedFilename, relpath) ) - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_file["links"]["download"], the_file_local_path ) metadata_array.extend(metacont) elif kind == ContentKind.File: - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_files[0]["links"]["download"], cachedFilename ) metadata_array.extend(metacont) diff --git a/wfexs_backend/fetchers/pride.py b/wfexs_backend/fetchers/pride.py index b6117476..ce71ccd7 100644 --- a/wfexs_backend/fetchers/pride.py +++ b/wfexs_backend/fetchers/pride.py @@ -50,7 +50,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher PRIDE_PROJECT_SCHEME = "pride.project" @@ -92,7 +92,7 @@ def fetchPRIDEProject( metadata = None try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = HTTPFetcher().streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index 6744dd73..a26203c3 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 
Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -37,6 +37,7 @@ from typing import ( Any, + ClassVar, IO, Mapping, MutableSequence, @@ -65,16 +66,18 @@ ) from . import ( - AbstractRepoFetcher, + AbstractSchemeRepoFetcher, DocumentedStatefulProtocolFetcher, FetcherException, + MaterializedRepo, + OfflineRepoGuessException, ProtocolFetcherReturn, RemoteRepo, RepoGuessException, RepoType, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( ContentKind, @@ -87,7 +90,9 @@ ) -class SoftwareHeritageFetcher(AbstractRepoFetcher): +class SoftwareHeritageFetcher(AbstractSchemeRepoFetcher): + PRIORITY: "ClassVar[int]" = AbstractSchemeRepoFetcher.PRIORITY + 20 + SOFTWARE_HERITAGE_SCHEME: "Final[str]" = "swh" SWH_API_REST: "Final[str]" = "https://archive.softwareheritage.org/api/1/" SWH_API_REST_KNOWN: "Final[URIType]" = cast( @@ -136,7 +141,7 @@ def _resolve_swh_id( # urljoin cannot be used due working with URIs resolve_uri = cast("URIType", cls.SWH_API_REST_RESOLVE + swh_quoted_id + "/") try: - _, metaresio, _ = fetchClassicURL( + _, metaresio, _ = HTTPFetcher().streamfetch( resolve_uri, resio, secContext={ @@ -172,6 +177,7 @@ def GuessRepoParams( orig_wf_url: "Union[URIType, parse.ParseResult]", logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, + offline: "bool" = False, ) -> "Optional[RemoteRepo]": # Deciding which is the input wf_url: "RepoURL" @@ -183,6 +189,14 @@ def GuessRepoParams( wf_url = cast("RepoURL", orig_wf_url) parsed_wf_url = parse.urlparse(orig_wf_url) + if fail_ok and parsed_wf_url.scheme not in cls.GetSchemeHandlers(): + return None + + if offline: + raise OfflineRepoGuessException( + f"Queries related to {wf_url} are not allowed in offline mode" + ) + if parsed_wf_url.scheme not in cls.GetSchemeHandlers(): return None @@ -191,7 +205,7 @@ def GuessRepoParams( putative_core_swhid = wf_url.split(";", 1)[0] try: valio = io.BytesIO() - _, metavalio, _ = fetchClassicURL( + _, metavalio, _ = HTTPFetcher().streamfetch( cls.SWH_API_REST_KNOWN, valio, secContext={ @@ -207,7 +221,9 @@ def GuessRepoParams( except Exception as e: if fail_ok: return None - raise + raise RepoGuessException( + f"Errors while querying {wf_url} for guessing purposes" + ) from e # It could be a valid swh identifier, but it is not registered if not isinstance(val_doc, dict) or not val_doc.get( @@ -230,24 +246,40 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": """ This method is required to generate a PID which usually represents an element (usually a workflow) in a repository. 
- If the fetcher does not recognize the type of repo, it should + If the fetcher does not recognize the type of repo, either using + repo_url content or the repo type in the worst case, it should return None """ + parsed_wf_url = parse.urlparse(remote_repo.repo_url) - if parsed_wf_url.scheme not in self.GetSchemeHandlers(): + if ( + parsed_wf_url.scheme not in self.GetSchemeHandlers() + or remote_repo.repo_type not in (RepoType.SoftwareHeritage, None) + ): return None # FIXME: improve this return remote_repo.repo_url - def materialize_repo( + def materialize_repo_from_repo( self, - repoURL: "RepoURL", - repoTag: "Optional[RepoTag]" = None, + repo: "RemoteRepo", repo_tag_destdir: "Optional[PathLikePath]" = None, base_repo_destdir: "Optional[PathLikePath]" = None, doUpdate: "Optional[bool]" = True, - ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": + ) -> "MaterializedRepo": + repoURL = cast("RepoURL", repo.tag) if repo.tag is not None else repo.repo_url + repoTag = repo.tag + + parsed_wf_url = parse.urlparse(repoURL) + if ( + parsed_wf_url.scheme not in self.GetSchemeHandlers() + or repo.repo_type not in (RepoType.SoftwareHeritage, None) + ): + raise FetcherException( + f"Input RemoteRepo instance is not recognized as a fetchable URI (repo {repoURL} , type {repo.repo_type})" + ) + # If we are here is because the repo is valid # as it should have been checked by GuessRepoParams @@ -298,10 +330,10 @@ def materialize_repo( object_id + "/", ), ) - _, metarelio, _ = fetchClassicURL( + _, metarelio, _ = self.scheme_catalog.streamfetch( release_uri, relio, - secContext={ + sec_context={ "headers": { "Accept": "application/json", }, @@ -337,10 +369,10 @@ def materialize_repo( ) try: revio = io.BytesIO() - _, metarevio, _ = fetchClassicURL( + _, metarevio, _ = self.scheme_catalog.streamfetch( cast("URIType", revision_uri), revio, - secContext={ + sec_context={ "headers": { "Accept": "application/json", }, @@ -390,10 +422,10 @@ def materialize_repo( time.sleep(self.WAIT_SECS) try: dirio = io.BytesIO() - _, metadirio, _ = fetchClassicURL( + _, metadirio, _ = self.scheme_catalog.streamfetch( cast("URIType", directory_url), dirio, - secContext={ + sec_context={ "headers": { "Accept": "application/json", }, @@ -434,7 +466,7 @@ def materialize_repo( with tempfile.NamedTemporaryFile() as tmp_targz_filename: try: - _, metafetchio, _ = fetchClassicURL( + _, metafetchio, _ = self.scheme_catalog.fetch( dir_fetch_url, cast("AbsPath", tmp_targz_filename.name), ) @@ -520,10 +552,10 @@ def materialize_repo( try: contentio = io.BytesIO() - _, metacontentio, _ = fetchClassicURL( + _, metacontentio, _ = self.scheme_catalog.streamfetch( cast("URIType", content_url), contentio, - secContext={ + sec_context={ "headers": { "Accept": "application/json", }, @@ -550,14 +582,15 @@ def materialize_repo( # Assure base directory exists before next step # here repo_tag_destdir is a file - repo_tag_destfile: "Union[PathLikePath, IO[bytes]]" + repo_tag_destfile: "Optional[PathLikePath]" = None + repo_tag_deststream: "Optional[IO[bytes]]" = None if repo_tag_destdir is None: if base_repo_destdir is None: temp_file_descriptor, repo_tag_destdir = cast( "Tuple[int, AbsPath]", tempfile.mkstemp(prefix="wfexs", suffix=".swh"), ) - repo_tag_destfile = os.fdopen(temp_file_descriptor, mode="wb") + repo_tag_deststream = os.fdopen(temp_file_descriptor, mode="wb") atexit.register(os.unlink, repo_tag_destdir) else: repo_hashed_id = hashlib.sha1(repoURL.encode("utf-8")).hexdigest() @@ -586,17 +619,29 @@ def materialize_repo( 
repo_tag_destpath = pathlib.Path(repo_tag_destdir) try: - _, metafetchio, _ = fetchClassicURL( - content_fetch_url, - repo_tag_destfile, - ) + if repo_tag_destfile is not None: + _, metafetchio, _ = self.scheme_catalog.fetch( + content_fetch_url, + repo_tag_destfile, + ) + elif repo_tag_deststream is not None: + _, metafetchio, _ = self.scheme_catalog.streamfetch( + content_fetch_url, + repo_tag_deststream, + ) + else: + raise FetcherException( + f"No fetch of {content_fetch_url} (assertion?)" + ) + except FetcherException as fe: + raise except Exception as e: raise FetcherException( f"HTTP REST call {content_fetch_url} failed" ) from e finally: - if not isinstance(repo_tag_destfile, (str, os.PathLike)): - repo_tag_destfile.close() + if repo_tag_deststream is not None: + repo_tag_deststream.close() gathered_meta = { "fetched": content_fetch_url, @@ -608,17 +653,28 @@ def materialize_repo( f"Unexpected Software Heritage object type {object_type} for {repoURL}" ) - remote_repo = RemoteRepo( - repo_url=repoURL, - tag=repoTag, + remote_repo = repo._replace( repo_type=RepoType.SoftwareHeritage, checkout=cast("RepoTag", repo_effective_checkout), ) - return ( - repo_tag_destpath, - remote_repo, - metadata_array, + upstream_repo: "Optional[RemoteRepo]" = None + origin: "Optional[str]" = res_doc.get("metadata", {}).get("origin") + # This is an heuristic to build a git scheme uri + if origin is not None: + upstream_repo = RemoteRepo( + repo_url=cast("RepoURL", origin), + rel_path=cast("Optional[RelPath]", res_doc["metadata"].get("path")), + repo_type=RepoType.Git + if ("git" in origin) or ("bitbucket" in origin) + else None, + ) + + return MaterializedRepo( + local=repo_tag_destpath, + repo=remote_repo, + metadata_array=metadata_array, + upstream_repo=upstream_repo, ) def fetch( @@ -646,9 +702,12 @@ def fetch( repoRelPath = None # It is materialized in a temporary location - repo_tag_destdir, remote_repo, metadata_array = self.materialize_repo( - cast("RepoURL", remote_file) + materialized_repo_return = self.materialize_repo_from_repo( + RemoteRepo(repo_url=cast("RepoURL", remote_file)), ) + repo_tag_destdir = materialized_repo_return.local + remote_repo = materialized_repo_return.repo + metadata_array = materialized_repo_return.metadata_array preferredName: "Optional[RelPath]" # repoRelPath is only acknowledged when the resolved repo diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 2bc1f532..8f135495 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,9 +18,20 @@ from __future__ import absolute_import +import atexit +import copy +import hashlib +import inspect import io import json +import logging import os +import pathlib +import shutil +import tempfile +import urllib.parse +import sys +import warnings from typing import ( cast, @@ -29,287 +40,1176 @@ from urllib import parse +# This code needs exception groups +if sys.version_info[:2] < (3, 11): + from exceptiongroup import ExceptionGroup + from . 
import ( - AbstractStatefulFetcher, + AbstractSchemeRepoFetcher, DocumentedProtocolFetcher, + DocumentedStatefulProtocolFetcher, FetcherException, + MaterializedRepo, + OfflineRepoGuessException, ProtocolFetcherReturn, + RemoteRepo, + RepoGuessException, + RepoType, +) + +from .. import ( + get_WfExS_version_str, ) -from .http import fetchClassicURL from ..common import ( ContentKind, URIWithMetadata, ) +from ..utils.contents import ( + link_or_copy_pathlib, +) + +from ..utils.misc import ( + urlresolv, +) + +from .http import HTTPFetcher + +from ..scheme_catalog import ( + SchemeCatalog, +) + if TYPE_CHECKING: from typing import ( + Any, Mapping, + MutableMapping, MutableSequence, Optional, Sequence, + Tuple, + Union, + ) + + from typing_extensions import ( + Final, ) from ..common import ( AbsPath, PathLikePath, + ProgsMapping, + RelPath, + RepoTag, + RepoURL, SecurityContextConfig, + SymbolicName, + TRS_Workflow_Descriptor, URIType, ) -INTERNAL_TRS_SCHEME_PREFIX = "wfexs.trs.files" -TRS_SCHEME_PREFIX = "trs" + from ..workflow import ( + WFVersionId, + WorkflowId, + ) -TRS_FILES_SUFFIX = "/files" -TRS_DESCRIPTOR_INFIX = "/descriptor/" +class GA4GHTRSFetcher(AbstractSchemeRepoFetcher): + INTERNAL_TRS_SCHEME_PREFIX: "Final[str]" = "wfexs.trs.files" + TRS_SCHEME_PREFIX: "Final[str]" = "trs" -def fetchTRSFiles( - remote_file: "URIType", - cachedFilename: "PathLikePath", - secContext: "Optional[SecurityContextConfig]" = None, -) -> "ProtocolFetcherReturn": - """ - Method to download contents from TRS files related to a tool + TRS_TOOLS_SUFFIX: "Final[str]" = "tools/" + TRS_FILES_SUFFIX: "Final[str]" = "/files" + TRS_DESCRIPTOR_INFIX: "Final[str]" = "/descriptor/" - :param remote_file: - :param cachedFilename: Destination filename for the fetched content - :param secContext: The security context containing the credentials - """ + @classmethod + def GetSchemeHandlers(cls) -> "Mapping[str, DocumentedStatefulProtocolFetcher]": + # These are de-facto schemes supported by Software Heritage + # libraries and other implementations + return { + cls.INTERNAL_TRS_SCHEME_PREFIX: DocumentedStatefulProtocolFetcher( + fetcher_class=cls, + description="WfExS internal pseudo-scheme used to materialize files from pure TRS servers", + priority=cls.PRIORITY, + ), + cls.TRS_SCHEME_PREFIX: DocumentedStatefulProtocolFetcher( + fetcher_class=cls, + description="GA4GH TRS metadata is fetched using the APIs described at https://ga4gh.github.io/tool-registry-service-schemas/. 
Contents are downloaded delegating their associated URIs to other fetchers", + priority=cls.PRIORITY, + ), + } - parsedInputURL = parse.urlparse(remote_file) - path_steps: "Sequence[str]" = parsedInputURL.path.split("/") - embedded_remote_file = parsedInputURL.path + @property + def description(self) -> "str": + return "Fetcher for GA4GH TRSv2 tools" - metadata_array: "MutableSequence[URIWithMetadata]" = [] - if parsedInputURL.scheme == INTERNAL_TRS_SCHEME_PREFIX: - # TODO: Improve this code - if not embedded_remote_file.endswith(TRS_FILES_SUFFIX): - metadata_url = cast("URIType", embedded_remote_file + TRS_FILES_SUFFIX) - descriptor_base_url = embedded_remote_file + TRS_DESCRIPTOR_INFIX - else: - metadata_url = cast("URIType", embedded_remote_file) - descriptor_base_url = ( - embedded_remote_file[0 : -len(TRS_FILES_SUFFIX)] + TRS_DESCRIPTOR_INFIX + @classmethod + def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": + return tuple() + + @classmethod + def GuessTRSParams( + cls, + orig_wf_url: "Union[URIType, parse.ParseResult]", + override_version_id: "Optional[WFVersionId]" = None, + logger: "Optional[logging.Logger]" = None, + fail_ok: "bool" = False, + scheme_catalog: "Optional[SchemeCatalog]" = None, + offline: "bool" = False, + ) -> "Optional[Tuple[RepoURL, str, Sequence[str], WorkflowId, WFVersionId, str, Sequence[URIWithMetadata], Optional[Mapping[str, Any]]]]": + if scheme_catalog is None: + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers() ) - elif parsedInputURL.scheme == TRS_SCHEME_PREFIX: - # TRS official scheme - if len(path_steps) < 3 or path_steps[0] != "": - raise FetcherException( - f"Ill-formed TRS CURIE {remote_file}. It should be in the format of {TRS_SCHEME_PREFIX}://id/version or {TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" + + if logger is None: + logger = logging.getLogger( + dict(inspect.getmembers(cls))["__module__"] + "::" + cls.__name__ ) - version_steps = cast("MutableSequence[str]", path_steps[0:-2]) - version_steps.extend( - ["ga4gh", "trs", "v2", "tools", path_steps[-2], "versions", path_steps[-1]] - ) - version_metadata_url = cast( - "URIType", - parse.urlunparse( - parse.ParseResult( + # Deciding which is the input + wf_url: "RepoURL" + parsed_wf_url: "parse.ParseResult" + if isinstance(orig_wf_url, parse.ParseResult): + parsed_wf_url = orig_wf_url + wf_url = cast("RepoURL", parse.urlunparse(orig_wf_url)) + else: + wf_url = cast("RepoURL", orig_wf_url) + parsed_wf_url = parse.urlparse(orig_wf_url) + + if parsed_wf_url.scheme in HTTPFetcher.GetSchemeHandlers(): + wf_url = cast("RepoURL", cls.INTERNAL_TRS_SCHEME_PREFIX + ":" + wf_url) + parsed_wf_url = parse.urlparse(wf_url) + + metadata_array: "MutableSequence[URIWithMetadata]" = [] + putative_tool_uri: "Optional[URIType]" = None + descriptor: "Optional[str]" = None + service_info_metadata: "Optional[MutableMapping[str, Any]]" = None + trs_tool_uri: "URIType" + trs_tool_meta: "Optional[Mapping[str, Any]]" = None + version_id: "Optional[WFVersionId]" = None + if parsed_wf_url.scheme == cls.TRS_SCHEME_PREFIX: + if offline: + raise OfflineRepoGuessException( + f"Queries related to {wf_url} are not allowed in offline mode" + ) + # Duplication of code + path_steps: "Sequence[str]" = parsed_wf_url.path.split("/") + if len(path_steps) < 3 or path_steps[0] != "": + if fail_ok: + return None + raise RepoGuessException( + f"Ill-formed TRS CURIE {wf_url}. 
It should be in the format of {cls.TRS_SCHEME_PREFIX}://server/id/version or {cls.TRS_SCHEME_PREFIX}://server-plus-prefix-with-slashes/id/version" + ) + + trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) + trs_steps.extend(["ga4gh", "trs", "v2", "service-info"]) + + trs_service_netloc = parsed_wf_url.netloc + trs_service_info = urllib.parse.urlunparse( + urllib.parse.ParseResult( scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(version_steps), + netloc=trs_service_netloc, + path="/".join(trs_steps), params="", query="", fragment="", ) - ), + ) + + service_info_wfexs_meta = { + "fetched": trs_service_info, + "payload": cast("Optional[Mapping[str, Any]]", None), + } + metadata_array.append(URIWithMetadata(wf_url, service_info_wfexs_meta)) + try: + metaio = io.BytesIO() + _, metametaio, _ = scheme_catalog.streamfetch( + cast("URIType", trs_service_info), metaio + ) + service_info_metadata = json.loads(metaio.getvalue().decode("utf-8")) + service_info_wfexs_meta["payload"] = service_info_metadata + metadata_array.extend(metametaio) + + trs_endpoint = trs_service_info[0 : -len("service-info")] + except Exception as e1: + non_standard_trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) + non_standard_trs_steps.extend(["service-info"]) + + non_standard_trs_service_info = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme="https", + netloc=trs_service_netloc, + path="/".join(non_standard_trs_steps), + params="", + query="", + fragment="", + ) + ) + + try: + metaio = io.BytesIO() + _, metametaio, _ = scheme_catalog.streamfetch( + cast("URIType", non_standard_trs_service_info), metaio + ) + service_info_metadata = json.loads( + metaio.getvalue().decode("utf-8") + ) + service_info_wfexs_meta["payload"] = service_info_metadata + metadata_array.extend(metametaio) + trs_endpoint = non_standard_trs_service_info[ + 0 : -len("service-info") + ] + except Exception as e2: + if fail_ok: + return None + raise ExceptionGroup( # pylint: disable=possibly-used-before-assignment + f"Error fetching or processing TRS service info metadata for {wf_url} (tried both {trs_service_info} and {non_standard_trs_service_info})", + [e1, e2], + ) + + version_id = ( + urllib.parse.unquote(path_steps[-1]) + if not override_version_id + else override_version_id + ) + trs_tool_uri = cast( + "URIType", + trs_endpoint + + cls.TRS_TOOLS_SUFFIX + + path_steps[-2] + + "/versions/" + + urllib.parse.quote(cast("str", version_id), safe=""), + ) + workflow_id = urllib.parse.unquote(path_steps[-2]) + descriptor = None + elif parsed_wf_url.scheme == cls.INTERNAL_TRS_SCHEME_PREFIX: + if offline: + raise OfflineRepoGuessException( + f"Queries related to {wf_url} are not allowed in offline mode" + ) + putative_tool_uri = cast( + "URIType", + parsed_wf_url.path[0:-1] + if parsed_wf_url.path.endswith("/") + else parsed_wf_url.path, + ) + + parsed_putative_tool_uri = urllib.parse.urlparse(putative_tool_uri) + trs_service_netloc = parsed_putative_tool_uri.netloc + # Detecting workflowhub derivatives + is_wh = parsed_putative_tool_uri.netloc.endswith("workflowhub.eu") + + # Time to try guessing everything + tool_wfexs_meta = { + "fetched": putative_tool_uri, + "payload": None, + } + metadata_array.append(URIWithMetadata(wf_url, tool_wfexs_meta)) + try: + resio = io.BytesIO() + _, metaresio, _ = scheme_catalog.streamfetch( + putative_tool_uri, + resio, + sec_context={ + "headers": { + "Accept": "application/json", + # Added to avoid Cloudflare anti-bot policy + "User-Agent": get_WfExS_version_str(), + }, + }, 
+ ) + trs__meta = json.loads(resio.getvalue().decode("utf-8")) + tool_wfexs_meta["payload"] = trs__meta + metadata_array.extend(metaresio) + except Exception as e: + if fail_ok: + return None + raise RepoGuessException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (raised exception {e})" + ) from e + + if not isinstance(trs__meta, dict): + if fail_ok: + return None + raise RepoGuessException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (not returning JSON object)" + ) + + # Is this the "abstract" tool definition? + versions = trs__meta.get("versions") + if isinstance(versions, list) and "toolclass" in trs__meta: + if len(versions) == 0: + if fail_ok: + return None + raise RepoGuessException( + f"No versions found associated to TRS tool reachable through {putative_tool_uri}" + ) + + if override_version_id: + for putative_trs_tool_meta in versions: + version_id = putative_trs_tool_meta.get("id") + name = putative_trs_tool_meta.get("name") + if version_id is not None: + # Dockstore misbehaves + if ( + name is not None + and str(version_id).endswith(name) + and parsed_putative_tool_uri.netloc.endswith( + "dockstore.org" + ) + ): + version_id = name + if version_id == override_version_id: + trs_tool_meta = putative_trs_tool_meta + break + else: + if fail_ok: + return None + raise RepoGuessException( + f"Forced version {override_version_id} not found associated to TRS tool reachable through {putative_tool_uri}" + ) + + else: + # Reuse the last version + trs_tool_meta = versions[-1] + + assert trs_tool_meta is not None + + trs_endpoint = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme=parsed_putative_tool_uri.scheme, + netloc=parsed_putative_tool_uri.netloc, + path="/".join(parsed_putative_tool_uri.path.split("/")[0:-2]) + + "/", + params="", + query="", + fragment="", + ) + ) + workflow_id = urllib.parse.unquote( + parsed_putative_tool_uri.path.split("/")[-1] + ) + trs_tool_prefix = putative_tool_uri + version_id = cast("Optional[WFVersionId]", trs_tool_meta.get("id")) + name = trs_tool_meta.get("name") + if version_id is not None: + # Dockstore misbehaves + if ( + name is not None + and str(version_id).endswith(name) + and parsed_putative_tool_uri.netloc.endswith("dockstore.org") + ): + version_id = name + trs_tool_uri = cast( + "URIType", + trs_tool_prefix + + "/versions/" + + urllib.parse.quote(str(version_id), safe=""), + ) + elif fail_ok: + return None + else: + raise RepoGuessException( + f"No version id found associated to specific version of TRS tool reachable through {putative_tool_uri}" + ) + # ... or a concrete one? 
+ elif "descriptor_type" in trs__meta: + if override_version_id: + rpslash = putative_tool_uri.rfind("/") + putative_tool_uri = cast( + "URIType", + putative_tool_uri[0 : rpslash + 1] + + urllib.parse.quote(str(override_version_id), safe=""), + ) + parsed_putative_tool_uri = urllib.parse.urlparse(putative_tool_uri) + # Time to try guessing everything + tool_wfexs_meta = { + "fetched": putative_tool_uri, + "payload": None, + } + metadata_array.append(URIWithMetadata(wf_url, tool_wfexs_meta)) + try: + resio = io.BytesIO() + _, metaresio, _ = scheme_catalog.streamfetch( + putative_tool_uri, + resio, + sec_context={ + "headers": { + "Accept": "application/json", + # Added to avoid Cloudflare anti-bot policy + "User-Agent": get_WfExS_version_str(), + }, + }, + ) + trs__meta = json.loads(resio.getvalue().decode("utf-8")) + tool_wfexs_meta["payload"] = trs__meta + metadata_array.extend(metaresio) + except Exception as e: + if fail_ok: + return None + raise RepoGuessException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (forced version {override_version_id}, raised exception {e})" + ) from e + + if "descriptor_type" not in trs__meta: + if fail_ok: + return None + raise RepoGuessException( + f"trs_endpoint at {putative_tool_uri} (forced version {override_version_id}) is not answering what it is expected" + ) + + trs_tool_meta = trs__meta + trs_endpoint = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme=parsed_putative_tool_uri.scheme, + netloc=parsed_putative_tool_uri.netloc, + path="/".join(parsed_putative_tool_uri.path.split("/")[0:-4]) + + "/", + params="", + query="", + fragment="", + ) + ) + trs_tool_prefix = cast( + "URIType", + urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme=parsed_putative_tool_uri.scheme, + netloc=parsed_putative_tool_uri.netloc, + path="/".join( + parsed_putative_tool_uri.path.split("/")[0:-2] + ) + + "/", + params="", + query="", + fragment="", + ) + ), + ) + workflow_id = urllib.parse.unquote( + parsed_putative_tool_uri.path.split("/")[-3] + ) + version_id = urllib.parse.unquote( + parsed_putative_tool_uri.path.split("/")[-1] + ) + trs_tool_uri = putative_tool_uri + elif fail_ok: + return None + else: + raise RepoGuessException( + f"trs_endpoint at {putative_tool_uri} is not answering what it is expected" + ) + + parsed_trs_endpoint = urllib.parse.urlparse(trs_endpoint) + trs_steps = parsed_trs_endpoint.path[0:-1].split("/") + + # Next two elifs should *never* happen + elif fail_ok: + return None + else: + raise RepoGuessException( + f"trs_endpoint could not be guessed from {orig_wf_url} (no clues)" + ) + + # This is needed to guarantee it is always declared + assert version_id is not None + assert trs_tool_uri is not None + if trs_tool_meta is None: + trs_tool_wfexs_meta: "MutableMapping[str, Union[URIType, Optional[Mapping[str, Any]]]]" = { + "fetched": trs_tool_uri, + "payload": None, + } + metadata_array.append(URIWithMetadata(wf_url, trs_tool_wfexs_meta)) + try: + resio = io.BytesIO() + _, metaresio, _ = scheme_catalog.streamfetch( + trs_tool_uri, + resio, + sec_context={ + "headers": { + "Accept": "application/json", + # Added to avoid Cloudflare anti-bot policy + "User-Agent": get_WfExS_version_str(), + }, + }, + ) + trs_tool_meta = json.loads(resio.getvalue().decode("utf-8")) + trs_tool_wfexs_meta["payload"] = trs_tool_meta + metadata_array.extend(metaresio) + except Exception as e: + if fail_ok: + return None + raise RepoGuessException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (forced version 
{override_version_id}, raised exception {e})" + ) from e + + assert trs_tool_meta is not None + + if not isinstance(trs_tool_meta.get("descriptor_type"), list): + raise RepoGuessException( + f"Unable to obtain descriptor_type from tool descriptor obtained from {putative_tool_uri}" + ) + + descriptor_types = trs_tool_meta["descriptor_type"] + if len(descriptor_types) == 0: + raise RepoGuessException( + f"Empty list of descriptor_type from tool descriptor obtained from {putative_tool_uri}" + ) + + descriptor = descriptor_types[0] + assert descriptor is not None + if len(descriptor_types) > 1: + logger.warning( + f"Found {len(descriptor_types)} descriptor types for tool {putative_tool_uri}, using first ({descriptor})" + ) + + return ( + cast("RepoURL", trs_tool_uri), + trs_service_netloc, + trs_steps, + workflow_id, + version_id, + descriptor, + metadata_array, + service_info_metadata, ) - version_meta = { - "fetched": version_metadata_url, - "payload": None, - } - metadata_array.append(URIWithMetadata(remote_file, version_meta)) - try: - metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(version_metadata_url, metaio) - version_metadata = json.loads(metaio.getvalue().decode("utf-8")) - version_meta["payload"] = version_metadata - metadata_array.extend(metametaio) - except FetcherException as fe: - raise FetcherException( - f"Error fetching or processing TRS version metadata for {remote_file} : {fe.code} {fe.reason}" - ) from fe + @classmethod + def GuessRepoParams( + cls, + orig_wf_url: "Union[URIType, parse.ParseResult]", + logger: "Optional[logging.Logger]" = None, + fail_ok: "bool" = False, + offline: "bool" = False, + ) -> "Optional[RemoteRepo]": + trs_params = cls.GuessTRSParams( + orig_wf_url, logger=logger, fail_ok=fail_ok, offline=offline + ) - # At last, we can finish building the URL - new_path_steps = [ - *version_steps, - version_metadata["descriptor_type"][0], - "files", - ] + return ( + None + if trs_params is None + else RemoteRepo( + repo_url=trs_params[0], + tag=cast("RepoTag", trs_params[4]), + repo_type=RepoType.TRS, + ) + ) - metadata_url = cast( + @classmethod + def BuildRepoPIDFromTRSParams( + cls, + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + ) -> "URIType": + if isinstance(workflow_id, int): + workflow_id_str = str(workflow_id) + else: + workflow_id_str = workflow_id + + # The base URL must end with a slash + if trs_endpoint[-1] != "/": + trs_endpoint += "/" + + # Removing the tools suffix, which appeared in first WfExS iterations + if trs_endpoint.endswith("/" + cls.TRS_TOOLS_SUFFIX): + trs_endpoint = trs_endpoint[0 : -len(cls.TRS_TOOLS_SUFFIX)] + + trs_tools_url = cast( "URIType", - parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(new_path_steps), - params="", - query="", - fragment="", - ) + urllib.parse.urljoin( + trs_endpoint, + cls.TRS_TOOLS_SUFFIX + urllib.parse.quote(workflow_id_str, safe=""), ), ) - descriptor_steps = [ - *version_steps, - version_metadata["descriptor_type"][0], - "descriptor", - ] - descriptor_base_url = parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(descriptor_steps) + "/", - params="", - query="", - fragment="", + if version_id is not None: + trs_tool_url = ( + trs_tools_url + + "/versions/" + + urllib.parse.quote(str(version_id), safe="") ) - ) - else: - raise FetcherException(f"FIXME: Unhandled scheme {parsedInputURL.scheme}") - - topMeta = { - "fetched": metadata_url, - 
"payload": None, - "workflow_entrypoint": None, - "remote_workflow_entrypoint": None, - } - metadata_array = [URIWithMetadata(remote_file, topMeta)] - try: - metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) - metadata = json.loads(metaio.getvalue().decode("utf-8")) - topMeta["payload"] = metadata - metadata_array.extend(metametaio) - except FetcherException as fe: - raise FetcherException( - "Error fetching or processing TRS files metadata for {} : {} {}".format( - remote_file, fe.code, fe.reason + else: + trs_tool_url = trs_tools_url + + return cast("URIType", cls.INTERNAL_TRS_SCHEME_PREFIX + ":" + trs_tool_url) + + def materialize_repo_from_repo( + self, + repo: "RemoteRepo", + repo_tag_destdir: "Optional[PathLikePath]" = None, + base_repo_destdir: "Optional[PathLikePath]" = None, + doUpdate: "Optional[bool]" = True, + ) -> "MaterializedRepo": + if repo.repo_type not in (RepoType.TRS, None): + raise FetcherException( + f"Remote repository {repo} is not of type TRS. Unable to fulfil request" ) - ) from fe - - os.makedirs(cachedFilename, exist_ok=True) - absdirs = set() - emptyWorkflow = True - # First pass, identify primary descriptor / workflow entrypoint - # and learn whether the destination paths should be sanitized - deepest_file_rel = 1 - for file_desc in metadata: - file_rel_path = file_desc.get("path") - if file_rel_path is not None: - frp_parsed = parse.urlparse(file_rel_path) - if frp_parsed.scheme in ("http", "https", "ftp"): - # An absolute URL, like in the case of DDBJ TRS implementation - # A mixure of resource might be catastrophic, the code is doing - # its best effort - file_rel_path = os.path.join(frp_parsed.netloc, frp_parsed.params) + remote_file = repo.repo_url + repoTag = repo.tag - # BEWARE! The relpath could contain references to parent directories - # escaping from the URL to be built and from the download "sandbox" - # Avoid absolute paths corner case before splitting - file_rel_path_steps = file_rel_path.lstrip("/").split("/") - - file_rel_depth = ( - len(file_rel_path_steps) - - file_rel_path_steps.count(".") - - file_rel_path_steps.count("") - - 2 * file_rel_path_steps.count("..") + guessed_trs_params = self.GuessTRSParams( + remote_file, + logger=self.logger, + scheme_catalog=self.scheme_catalog, + override_version_id=repoTag, + ) + if guessed_trs_params is None: + raise FetcherException(f"Unable to guess TRS params from {repo}") + + ( + trs_tool_url, + trs_service_netloc, + trs_steps, + workflow_id, + version_id, + descriptor, + guessed_metadata_array, + service_info_metadata, + ) = guessed_trs_params + files_metadata_url = ( + trs_tool_url + + "/" + + urllib.parse.quote(descriptor, safe="") + + self.TRS_FILES_SUFFIX + ) + descriptor_base_url = ( + trs_tool_url + + "/" + + urllib.parse.quote(descriptor, safe="") + + self.TRS_DESCRIPTOR_INFIX + ) + + # Assure directory exists before next step + if repo_tag_destdir is None: + if base_repo_destdir is None: + repo_tag_destpath = pathlib.Path( + tempfile.mkdtemp(prefix="wfexs", suffix=".trs") + ) + atexit.register(shutil.rmtree, repo_tag_destpath, True) + else: + repo_hashed_id = hashlib.sha1(remote_file.encode("utf-8")).hexdigest() + repo_destpath = pathlib.Path(base_repo_destdir, repo_hashed_id) + # repo_destdir = pathlib.Path(self.cacheWorkflowDir, repo_hashed_id) + + if not repo_destpath.exists(): + try: + repo_destpath.mkdir(parents=True) + except IOError: + errstr = "ERROR: Unable to create intermediate directories for repo {}. 
".format( + remote_file + ) + raise FetcherException(errstr) + + repo_hashed_tag_id = hashlib.sha1( + b"" if version_id is None else str(version_id).encode("utf-8") + ).hexdigest() + repo_tag_destpath = repo_destpath / repo_hashed_tag_id + else: + repo_tag_destpath = ( + repo_tag_destdir + if isinstance(repo_tag_destdir, pathlib.Path) + else pathlib.Path(repo_tag_destdir) ) - if file_rel_depth < deepest_file_rel: - deepest_file_rel = file_rel_depth - - # We have to create anonymous directories to avoid leaving the download "sandbox" - abs_download_dir = cachedFilename - if deepest_file_rel < 1: - for depth in range(deepest_file_rel, 1): - abs_download_dir = cast( - "AbsPath", os.path.join(abs_download_dir, f"unnamed{depth}") + + self.logger.debug(f"Repo dir {repo_tag_destpath}") + + topMeta = { + "fetched": files_metadata_url, + "payload": None, + "workflow_entrypoint": None, + "remote_workflow_entrypoint": None, + } + metadata_array = [ + *guessed_metadata_array, + URIWithMetadata(remote_file, topMeta), + ] + try: + metaio = io.BytesIO() + _, metametaio, _ = self.scheme_catalog.streamfetch( + cast("URIType", files_metadata_url), metaio ) + metadata = json.loads(metaio.getvalue().decode("utf-8")) + topMeta["payload"] = metadata + metadata_array.extend(metametaio) + except FetcherException as fe: + raise FetcherException( + "Error fetching or processing TRS files metadata for {} : {} {} (offending url {})".format( + remote_file, fe.code, fe.reason, files_metadata_url + ) + ) from fe + + repo_tag_destpath.mkdir(parents=True, exist_ok=True) + absdirs = set() + emptyWorkflow = True + + # First pass, identify primary descriptor / workflow entrypoint + # and learn whether the destination paths should be sanitized + is_abs_url = False + is_anon = False + file_rel_2_url: "MutableMapping[str, str]" = dict() + for file_desc in metadata: + file_rel_path = file_desc.get("path") + if file_rel_path is None: + continue - # Second pass, fetching the contents, sanitizing the destination paths - for file_desc in metadata: - file_rel_path = file_desc.get("path") - if file_rel_path is not None: emptyWorkflow = False # BEWARE! 
The relpath could contain references to parent directories # escaping from the URL to be built and from the download "sandbox" frp_parsed = parse.urlparse(file_rel_path) is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") + if is_abs_url: - # An absolute URL, like in the case of DDBJ TRS implementation - file_url = cast("URIType", file_rel_path) - absfile = cast( - "AbsPath", - os.path.join( - abs_download_dir, frp_parsed.netloc, frp_parsed.path.lstrip("/") - ), + # This one has to be dealt with a shortcut + file_rel_2_url[file_rel_path] = urlresolv(file_rel_path) + continue + + descriptor_url = cast( + "URIType", + descriptor_base_url + parse.quote(file_rel_path, safe="/"), + ) + try: + descmetaio = io.BytesIO() + _, descmetaelem, _ = self.scheme_catalog.streamfetch( + descriptor_url, + descmetaio, + {"headers": {"Accept": "application/json"}}, ) - else: - file_url = cast( - "URIType", - descriptor_base_url + parse.quote(file_rel_path, safe="/"), + descriptor_meta = json.loads(descmetaio.getvalue().decode("utf-8")) + except FetcherException as fe: + raise FetcherException( + "Error fetching or processing TRS descriptor metadata for {} : {} {}".format( + descriptor_url, fe.code, fe.reason + ) + ) from fe + + is_anon = ( + not isinstance(descriptor_meta, dict) + or descriptor_meta.get("url") is None + ) + if is_anon: + # This one has to be dealt in a different way + break + file_rel_2_url[file_rel_path] = urlresolv(descriptor_meta["url"]) + + if emptyWorkflow: + raise FetcherException( + "Error processing TRS files for {} : no file was found.\n{}".format( + remote_file, metadata ) - absfile = cast( - "AbsPath", os.path.join(abs_download_dir, file_rel_path.lstrip("/")) + ) + + if is_anon: + prefix_url = "" + elif len(file_rel_2_url) == 1: + # FIXME?: this is not going to work in Windows + prefix_url = os.path.dirname(tuple(file_rel_2_url.values())[0]) + else: + prefix_url = os.path.commonpath(tuple(file_rel_2_url.values())) + + # Due the peversion of commonpath, double slashes are collapsed + colon_pos = prefix_url.find(":") + if colon_pos > 0: + prefix_url = ( + prefix_url[0 : colon_pos + 1] + "/" + prefix_url[colon_pos + 1 :] ) - # Intermediate path creation - absdir = os.path.dirname(absfile) - if absdir not in absdirs: - absdirs.add(absdir) - os.makedirs(absdir, exist_ok=True) - real_rel_path = os.path.relpath(os.path.normpath(absfile), cachedFilename) - - # When it is the primary descriptor, it is fetched twice - if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR": - topMeta["workflow_entrypoint"] = cast("URIType", real_rel_path) - if is_abs_url: - topMeta["remote_workflow_entrypoint"] = file_url - else: - descriptorMeta = io.BytesIO() - _, metaprimary, _ = fetchClassicURL(file_url, descriptorMeta) - metadata_array.extend(metaprimary) + # We have to create anonymous directories to avoid leaving the download "sandbox" + abs_download_dir = repo_tag_destpath + if "/" in prefix_url: + # This is needed to perform an effective work + if not prefix_url.endswith("/"): + prefix_url += "/" + + # Computing resolved relative paths + for file_desc in metadata: + file_rel_path = file_desc.get("path") + if file_rel_path is not None: + # BEWARE! 
The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + frp_parsed = parse.urlparse(file_rel_path) + is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") + if is_abs_url: + # An absolute URL, like in the case of DDBJ TRS implementation + file_url = cast("URIType", file_rel_path) + else: + file_url = cast( + "URIType", + descriptor_base_url + parse.quote(file_rel_path, safe="/"), + ) + local_rel_path = file_rel_2_url[file_rel_path][len(prefix_url) :] + absfile = (abs_download_dir / local_rel_path).resolve() + + # Intermediate path creation + absdir = absfile.parent + if absdir not in absdirs: + absdirs.add(absdir) + os.makedirs(absdir, exist_ok=True) + real_rel_path = absfile.relative_to(repo_tag_destpath) + + # When it is the primary descriptor, it is fetched twice + if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR": + topMeta["workflow_entrypoint"] = cast( + "URIType", real_rel_path.as_posix() + ) + if is_abs_url: + topMeta["remote_workflow_entrypoint"] = file_url + else: + topMeta["remote_workflow_entrypoint"] = cast( + "URIType", file_rel_2_url[file_rel_path] + ) + + # Getting the raw content + accept_val = "*/*" if is_abs_url else "text/plain" + _, metaelem, _ = self.scheme_catalog.fetch( + file_url, absfile, {"headers": {"Accept": accept_val}} + ) + metadata_array.extend(metaelem) + else: + # First pass, identify primary descriptor / workflow entrypoint + # and learn whether the destination paths should be sanitized + deepest_file_rel = 0 + for file_desc in metadata: + file_rel_path = file_desc.get("path") + if file_rel_path is not None: + frp_parsed = parse.urlparse(file_rel_path) + if frp_parsed.scheme in ("http", "https", "ftp"): + # An absolute URL, like in the case of DDBJ TRS implementation + # A mixure of resource might be catastrophic, the code is doing + # its best effort + file_rel_path = os.path.join( + frp_parsed.netloc, frp_parsed.params + ) + + # BEWARE! The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + # Avoid absolute paths corner case before splitting + file_rel_path_steps = file_rel_path.lstrip("/").split("/") + + deepest = 0 + depth = 0 + for step in file_rel_path_steps: + if step == "..": + depth -= 1 + if depth < deepest: + deepest = depth + elif step not in (".", ""): + depth += 1 + + if deepest < deepest_file_rel: + deepest_file_rel = deepest - # This metadata can help a lot to get the workflow repo - metadataPD = json.loads(descriptorMeta.getvalue().decode("utf-8")) - topMeta["remote_workflow_entrypoint"] = metadataPD.get("url") + if deepest_file_rel < 0: + for depth in range(-deepest_file_rel): + abs_download_dir = abs_download_dir / f"unnamed{depth}" - del descriptorMeta - del metadataPD + # Second pass, fetching the contents, sanitizing the destination paths + for file_desc in metadata: + file_rel_path = file_desc.get("path") + if file_rel_path is not None: + emptyWorkflow = False - accept_val = "*/*" if is_abs_url else "text/plain" - # Getting the raw content - _, metaelem, _ = fetchClassicURL( - file_url, absfile, {"headers": {"Accept": accept_val}} + # BEWARE! 
The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + frp_parsed = parse.urlparse(file_rel_path) + is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") + if is_abs_url: + # An absolute URL, like in the case of DDBJ TRS implementation + file_url = cast("URIType", file_rel_path) + absfile = ( + abs_download_dir + / frp_parsed.netloc + / frp_parsed.path.lstrip("/") + ) + else: + file_url = cast( + "URIType", + descriptor_base_url + parse.quote(file_rel_path, safe="/"), + ) + absfile = abs_download_dir / file_rel_path.lstrip("/") + + absfile = absfile.resolve() + + # Intermediate path creation + absdir = absfile.parent + if absdir not in absdirs: + absdirs.add(absdir) + absdir.mkdir(parents=True, exist_ok=True) + real_rel_path = absfile.relative_to(repo_tag_destpath) + + # When it is the primary descriptor, it is fetched twice + if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR": + topMeta["workflow_entrypoint"] = cast( + "URIType", real_rel_path.as_posix() + ) + if is_abs_url: + topMeta["remote_workflow_entrypoint"] = file_url + else: + descriptorMeta = io.BytesIO() + _, metaprimary, _ = self.scheme_catalog.streamfetch( + file_url, descriptorMeta + ) + metadata_array.extend(metaprimary) + + # This metadata can help a lot to get the workflow repo + metadataPD = json.loads( + descriptorMeta.getvalue().decode("utf-8") + ) + topMeta["remote_workflow_entrypoint"] = metadataPD.get( + "url" + ) + + del descriptorMeta + del metadataPD + + # Getting the raw content + accept_val = "*/*" if is_abs_url else "text/plain" + try: + _, metaelem, _ = self.scheme_catalog.fetch( + file_url, absfile, {"headers": {"Accept": accept_val}} + ) + metadata_array.extend(metaelem) + except FetcherException as fe: + if file_desc.get("file_type") in ( + "PRIMARY_DESCRIPTOR", + "SECONDARY_DESCRIPTOR", + ): + raise + else: + self.logger.warning( + f"Unable to fetch {file_url}. 
TRS Dataset {files_metadata_url} might be incomplete" + ) + + if emptyWorkflow: + raise FetcherException( + "Error processing TRS files for {} : no file was found.\n{}".format( + remote_file, metadata + ) + ) + + upstream_repo: "Optional[RemoteRepo]" = None + recommends_upstream: "bool" = False + + if service_info_metadata is None: + parsed_trs_tool_url = urllib.parse.urlparse(trs_tool_url) + trs_service_info = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme=parsed_trs_tool_url.scheme, + netloc=parsed_trs_tool_url.netloc, + path="/".join(parsed_trs_tool_url.path.split("/")[0:-4]) + + "/service-info", + params="", + query="", + fragment="", + ) ) - metadata_array.extend(metaelem) - if emptyWorkflow: - raise FetcherException( - "Error processing TRS files for {} : no file was found.\n{}".format( - remote_file, metadata + service_info_wfexs_meta = { + "fetched": trs_service_info, + "payload": cast("Optional[Mapping[str, Any]]", None), + } + metadata_array.append( + URIWithMetadata(trs_tool_url, service_info_wfexs_meta) + ) + try: + metaio = io.BytesIO() + _, metametaio, _ = self.scheme_catalog.streamfetch( + cast("URIType", trs_service_info), metaio + ) + service_info_metadata = json.loads(metaio.getvalue().decode("utf-8")) + service_info_wfexs_meta["payload"] = service_info_metadata + metadata_array.extend(metametaio) + + except Exception as e: + raise FetcherException( + f"Unable to fetch service info metadata {trs_service_info} (affects tool {trs_tool_url})" + ) from e + + # Checking whether it is WorkflowHub + # to recommend the generated Workflow RO-Crate + if service_info_metadata.get("organization", {}).get("name") == "WorkflowHub": + recommends_upstream = True + upstream_repo = RemoteRepo( + repo_url=cast( + "RepoURL", + files_metadata_url + + "?" + + urllib.parse.urlencode({"format": "zip"}), + ), + repo_type=RepoType.Raw, ) + elif topMeta["remote_workflow_entrypoint"] is not None: + upstream_repo = RemoteRepo( + repo_url=cast("RepoURL", topMeta["remote_workflow_entrypoint"]), + ) + + return MaterializedRepo( + local=repo_tag_destpath, + repo=RemoteRepo( + repo_url=remote_file, + tag=cast("RepoTag", str(version_id)), + rel_path=cast("Optional[RelPath]", topMeta["workflow_entrypoint"]), + repo_type=RepoType.TRS, + ), + metadata_array=metadata_array, + upstream_repo=upstream_repo, + recommends_upstream=recommends_upstream, ) - return ProtocolFetcherReturn( - kind_or_resolved=ContentKind.Directory, - metadata_array=metadata_array, - ) + def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": + """ + This method is required to generate a PID which usually + represents an element (usually a workflow) in a repository. 
+        If the fetcher does not recognize the type of repo, either using
+        repo_url content or the repo type in the worst case, it should
+        return None
+        """
+
+        # TODO: improve this to cover the different cases
+        parsedInputURL = parse.urlparse(remote_repo.repo_url)
+        if remote_repo.repo_type is None and parsedInputURL.scheme in (
+            self.INTERNAL_TRS_SCHEME_PREFIX,
+            self.TRS_SCHEME_PREFIX,
+        ):
+            return remote_repo.repo_url
+        elif remote_repo.repo_type == RepoType.TRS:
+            try:
+                guessed_trs_params = self.GuessTRSParams(
+                    parsedInputURL,
+                    override_version_id=remote_repo.tag,
+                    logger=self.logger,
+                    fail_ok=True,
+                    offline=True,
+                )
+            except OfflineRepoGuessException as orge:
+                self.logger.error(
+                    f"While building pid for {remote_repo.repo_url} called code which should be safe offline"
+                )
+                guessed_trs_params = None
+
+            if guessed_trs_params is not None:
+                (
+                    trs_tool_url,
+                    trs_service_netloc,
+                    trs_steps,
+                    workflow_id,
+                    version_id,
+                    descriptor,
+                    guessed_metadata_array,
+                    service_info_metadata,
+                ) = guessed_trs_params
+
+                # Remove /ga4gh/trs/v2 from the end
+                if (
+                    len(trs_steps) >= 3
+                    and trs_steps[-1] == "v2"
+                    and trs_steps[-2] == "trs"
+                    and trs_steps[-3] == "ga4gh"
+                ):
+                    trs_steps = trs_steps[0:-3]
+                new_steps = [*trs_steps, urllib.parse.quote(str(workflow_id), safe="")]
+                if version_id is not None:
+                    new_steps.append(urllib.parse.quote(str(version_id), safe=""))
+
+                computed_trs_endpoint = urllib.parse.urlunparse(
+                    urllib.parse.ParseResult(
+                        scheme=self.TRS_SCHEME_PREFIX,
+                        netloc=trs_service_netloc,
+                        path="/".join(new_steps),
+                        params="",
+                        query="",
+                        fragment="",
+                    )
+                )
+
+                return computed_trs_endpoint
+
+        return None
+
+    def fetch(
+        self,
+        remote_file: "URIType",
+        cachedFilename: "PathLikePath",
+        secContext: "Optional[SecurityContextConfig]" = None,
+    ) -> "ProtocolFetcherReturn":
+        """
+        Method to download contents from TRS files related to a tool
+
+        :param remote_file:
+        :param cachedFilename: Destination filename for the fetched content
+        :param secContext: The security context containing the credentials
+        """
+
+        parsedInputURL = parse.urlparse(remote_file)
+
+        # For cases where the URI is not one of the native schemes
+        # fallback to INTERNAL_TRS_SCHEME_PREFIX
+        if parsedInputURL.scheme not in self.GetSchemeHandlers():
+            the_remote_file = self.INTERNAL_TRS_SCHEME_PREFIX + ":" + remote_file
+        else:
+            the_remote_file = remote_file
+
+        # Getting the repoRelPath (if available)
+        params = parse.parse_qs(parsedInputURL.path, separator=";")
+        repoRelPath_l = params.get("path", [])
+        repoRelPath: "Optional[str]"
+        if len(repoRelPath_l) > 0:
+            repoRelPath = repoRelPath_l[0]
+            # Directories also end with slashes
+            repoRelPath = repoRelPath.strip("/")
+        else:
+            repoRelPath = None
+
+        # It is materialized in a temporary location
+        materialized_repo_return = self.materialize_repo_from_repo(
+            RemoteRepo(repo_url=cast("RepoURL", remote_file), repo_type=RepoType.TRS),
+        )
+        repo_tag_destdir = materialized_repo_return.local
+        remote_repo = materialized_repo_return.repo
+        metadata_array = materialized_repo_return.metadata_array
+        preferredName: "Optional[RelPath]"
+        # repoRelPath is only acknowledged when the resolved repo
+        # is translated to a directory
+        if repoRelPath is not None and repo_tag_destdir.is_dir():
+            cachedContentPath = repo_tag_destdir / repoRelPath
+            preferredName = cast("RelPath", cachedContentPath.name)
+        else:
+            cachedContentPath = repo_tag_destdir
+            preferredName = None
+            # This is to remove spurious detections
+            repoRelPath = None

-# These are schemes from 
identifiers.org -SCHEME_HANDLERS: "Mapping[str, DocumentedProtocolFetcher]" = { - INTERNAL_TRS_SCHEME_PREFIX: DocumentedProtocolFetcher( - fetcher=fetchTRSFiles, - description="WfExS internal pseudo-scheme used to materialize files from pure TRS servers", - ), - TRS_SCHEME_PREFIX: DocumentedProtocolFetcher( - fetcher=fetchTRSFiles, - description="GA4GH TRS metadata is fetched using the APIs described at https://ga4gh.github.io/tool-registry-service-schemas/. Contents are downloaded delegating their associated URIs to other fetchers", - ), -} + remote_repo = remote_repo._replace(rel_path=cast("RelPath", repoRelPath)) + + if cachedContentPath.is_dir(): + kind = ContentKind.Directory + elif cachedContentPath.is_file(): + kind = ContentKind.File + else: + raise FetcherException( + f"Remote {remote_file} is neither a file nor a directory (does it exist?)" + ) + + # shutil.move(cachedContentPath, cachedFilename) + link_or_copy_pathlib(cachedContentPath, pathlib.Path(cachedFilename)) + + repo_desc: "Optional[Mapping[str, Any]]" = remote_repo.gen_repo_desc() + if repo_desc is None: + repo_desc = {} + augmented_metadata_array = [ + URIWithMetadata( + uri=remote_file, metadata=repo_desc, preferredName=preferredName + ), + *metadata_array, + ] + return ProtocolFetcherReturn( + kind_or_resolved=kind, + metadata_array=augmented_metadata_array, + # TODO: Integrate licences from TRS report?? + licences=None, + ) diff --git a/wfexs_backend/fetchers/wiktionary.py b/wfexs_backend/fetchers/wiktionary.py index ea434974..7921bbbf 100644 --- a/wfexs_backend/fetchers/wiktionary.py +++ b/wfexs_backend/fetchers/wiktionary.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
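The zenodo.py and orcid.py hunks below replace the function-style fetchClassicURL calls with per-instance HTTPFetcher calls. A minimal sketch of that calling convention, assuming nothing beyond the three-element return unpacking visible in those hunks; the URLs and the local path are placeholders, not values from the patch:

    import io
    import json

    from wfexs_backend.fetchers.http import HTTPFetcher

    http_fetcher = HTTPFetcher()

    # streamfetch() writes the payload into the supplied stream; the hunks
    # below unpack its result as a three-element tuple and keep only the
    # metadata array in the middle position.
    buf = io.BytesIO()
    _, metadata_array, _ = http_fetcher.streamfetch(
        "https://example.org/api/record.json",  # placeholder URL
        buf,
    )
    record = json.loads(buf.getvalue().decode("utf-8"))

    # fetch() is the file-destination counterpart used for the payload
    # downloads in zenodo.py (both arguments are placeholders here).
    _, file_metadata, _ = http_fetcher.fetch(
        "https://example.org/files/data.bin",
        "/tmp/data.bin",
    )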
@@ -64,11 +64,6 @@ class WiktionaryFetcher(AbstractStatefulFetcher): WIKTIONARY_PROTO: "Final[str]" = "wfexs.wiktionary" - def __init__( - self, progs: "ProgsMapping", setup_block: "Optional[Mapping[str, Any]]" = None - ): - super().__init__(progs=progs, setup_block=setup_block) - @classmethod def GetSchemeHandlers(cls) -> "Mapping[str, DocumentedStatefulProtocolFetcher]": # These are de-facto schemes supported by pip and git client diff --git a/wfexs_backend/fetchers/zenodo.py b/wfexs_backend/fetchers/zenodo.py index b33bb22c..737d01e9 100644 --- a/wfexs_backend/fetchers/zenodo.py +++ b/wfexs_backend/fetchers/zenodo.py @@ -35,7 +35,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( ContentKind, @@ -95,11 +95,12 @@ def fetchZenodo( metadata_url = cast("URIType", parse.urljoin(ZENODO_RECORD_REST, zenodo_id)) + http_fetcher = HTTPFetcher() gathered_meta = {"fetched": metadata_url} metadata_array = [URIWithMetadata(remote_file, gathered_meta)] try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = http_fetcher.streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) @@ -128,7 +129,7 @@ def fetchZenodo( metadata_array.append(URIWithMetadata(remote_file, gathered_l_meta)) try: metaio = io.BytesIO() - _, metametalicio, _ = fetchClassicURL(licence_meta_url, metaio) + _, metametalicio, _ = http_fetcher.streamfetch(licence_meta_url, metaio) l_metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_l_meta["payload"] = l_metadata metadata_array.extend(metametalicio) @@ -208,12 +209,12 @@ def fetchZenodo( the_file_local_path = cast( "AbsPath", os.path.join(cachedFilename, relpath) ) - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_file["links"]["self"], the_file_local_path ) metadata_array.extend(metacont) else: - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_files[0]["links"]["self"], cachedFilename ) metadata_array.extend(metacont) diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index c41ea6a1..e5f9971f 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -1302,7 +1302,7 @@ def _add_containers( the_size = os.stat(container.localPath).st_size if container.image_signature is not None: digest, algo = extract_digest(container.image_signature) - if digest is None: + if digest is None or digest == False: digest, algo = unstringifyDigest(container.image_signature) assert algo is not None the_signature = hexDigest(algo, digest) @@ -1561,7 +1561,7 @@ def addWorkflowInputs( the_signature: "Optional[Fingerprint]" = None if itemInValues.fingerprint is not None: digest, algo = extract_digest(itemInValues.fingerprint) - if digest is not None: + if digest is not None and digest != False: assert algo is not None the_signature = hexDigest(algo, digest) @@ -1796,7 +1796,7 @@ def addWorkflowInputs( sec_digest, sec_algo = extract_digest( secInput.fingerprint ) - if sec_digest is not None: + if sec_digest is not None and sec_digest != False: assert sec_algo is not None the_sec_signature = hexDigest( sec_algo, sec_digest @@ -2161,7 +2161,7 @@ def _add_workflow_to_crate( ) else: - raise ROCrateGenerationException( + self.logger.warning( "FIXME: Unsupported http(s) git repository {}".format( remote_repo.repo_url ) @@ -2977,7 +2977,7 @@ def _add_GeneratedContent_to_crate( 
assert the_content.signature is not None digest, algo = extract_digest(the_content.signature) - if digest is None: + if digest is None or digest == False: digest, algo = unstringifyDigest(the_content.signature) assert algo is not None dest_path = os.path.relpath(the_content.local, self.work_dir) diff --git a/wfexs_backend/schemas/stage-definition.json b/wfexs_backend/schemas/stage-definition.json index b391cfa3..60b9a148 100644 --- a/wfexs_backend/schemas/stage-definition.json +++ b/wfexs_backend/schemas/stage-definition.json @@ -575,6 +575,12 @@ } ] }, + "prefer_upstream_source": { + "title": "Prefer upstream source for the workflow, if available", + "description": "Prefer discovered and recommended upstream source for the workflow (if available) instead of the initially requested one, which can happen in cascade. This is needed for cases where the workflow is incomplete in the initially proposed source, and it is a somewhat known fact", + "type": "boolean", + "default": true + }, "nickname": { "title": "A friendly nickname (prefix) for the instances", "type": "string" diff --git a/wfexs_backend/scheme_catalog.py b/wfexs_backend/scheme_catalog.py new file mode 100644 index 00000000..109e5b9b --- /dev/null +++ b/wfexs_backend/scheme_catalog.py @@ -0,0 +1,520 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
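The new scheme_catalog.py module that follows centralizes the scheme-to-fetcher dispatch previously spread across SchemeHandlerCacheHandler and WfExSBackend.addSchemeHandlers. A minimal usage sketch, assuming only the constructor and streamfetch signatures defined further down and the wiring shown later for utils/licences.py and utils/passphrase_wrapper.py; the URL is a placeholder:

    import io

    from wfexs_backend.fetchers.http import HTTPFetcher
    from wfexs_backend.scheme_catalog import SchemeCatalog

    # Seed the catalog with the HTTP(S) handlers, as the LicenceMatcher and
    # passphrase generator singletons do later in this patch.
    catalog = SchemeCatalog(scheme_handlers=HTTPFetcher.GetSchemeHandlers())

    # streamfetch() dispatches on the (case-insensitive) URI scheme and
    # raises SchemeCatalogException when no registered handler can stream
    # the requested content.
    buf = io.BytesIO()
    catalog.streamfetch("https://example.org/spdx-licences.json", buf)  # placeholder URL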
+ +from __future__ import absolute_import + +import copy +import datetime +import hashlib +import importlib +import inspect +import json +import logging +import os +import os.path +import pathlib +import re +import shutil +import traceback +import types +import urllib.parse +import uuid + +from typing import ( + cast, + NamedTuple, + Pattern, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from types import ModuleType + + from typing import ( + Any, + IO, + Iterator, + Mapping, + MutableMapping, + MutableSequence, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, + ) + + from typing_extensions import ( + Final, + NotRequired, + TypedDict, + ) + + from .common import ( + AbsPath, + AnyURI, + Fingerprint, + PathLikePath, + ProgsMapping, + RelPath, + SecurityContextConfig, + WritableSecurityContextConfig, + URIType, + ) + + from .fetchers import ( + ProtocolFetcherReturn, + StatefulFetcher, + ) + + from .security_context import ( + SecurityContextVault, + ) + + class RelAbsDict(TypedDict): + relative: RelPath + absolute: AbsPath + + class PathMetaDict(TypedDict): + meta: NotRequired[RelAbsDict] + relative: NotRequired[RelPath] + absolute: NotRequired[AbsPath] + + class MetadataEntryMetaDict(TypedDict): + injected: bool + + class MetadataEntryDict(TypedDict): + uri: URIType + metadata: MetadataEntryMetaDict + preferredName: RelPath + + class CacheMetadataDict(TypedDict): + stamp: datetime.datetime + path: PathMetaDict + kind: str + metadata_array: Sequence[MetadataEntryDict] + resolves_to: Sequence[URIType] + licences: Tuple[URIType, ...] + attributions: Sequence[Mapping[str, Any]] + fingerprint: Fingerprint + clonable: bool + + +from .common import ( + AbstractWfExSException, + Attribution, + ContentKind, + DefaultNoLicenceTuple, + LicenceDescription, + LicensedURI, + META_JSON_POSTFIX, + URIWithMetadata, +) + +from .fetchers import ( + AbstractSchemeRepoFetcher, + AbstractStatefulFetcher, + AbstractStatefulStreamingFetcher, + DocumentedProtocolFetcher, + DocumentedStatefulProtocolFetcher, + FetcherException, + FetcherInstanceException, + InvalidFetcherException, + RemoteRepo, +) + +from .utils.contents import link_or_copy +from .utils.digests import ( + ComputeDigestFromDirectory, + ComputeDigestFromFile, + stringifyFilenameDigest, +) +from .utils.misc import ( + config_validate, + DatetimeEncoder, + iter_namespace, + jsonFilterDecodeFromStream, + translate_glob_args, +) + + +class SchemeCatalogException(AbstractWfExSException): + pass + + +class SchemeCatalogImportException(SchemeCatalogException): + pass + + +class SchemeCatalog: + def __init__( + self, + scheme_handlers: "Mapping[str, Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]]" = dict(), + ): + # Getting a logger focused on specific classes + self.logger = logging.getLogger( + dict(inspect.getmembers(self))["__module__"] + + "::" + + self.__class__.__name__ + ) + + self.schemeHandlers: "MutableMapping[str, DocumentedProtocolFetcher]" = dict() + + self.bypassSchemeHandlers(scheme_handlers) + + def addRawSchemeHandlers( + self, schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" + ) -> None: + # No validation is done here about validness of schemes + if isinstance(schemeHandlers, dict): + self.schemeHandlers.update(schemeHandlers) + else: + raise InvalidFetcherException("Unable to add raw scheme handlers") + + def bypassSchemeHandler( + self, + scheme: "str", + handler: "Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]", + progs: "ProgsMapping" = dict(), + setup_block: 
"Optional[Mapping[str, Any]]" = None, + ) -> None: + """ + This method adds and overwrites a scheme handler, + instantiating it if it is a stateful one. + + :param scheme: + :param handler: + """ + the_handler: "DocumentedProtocolFetcher" + if isinstance(handler, DocumentedStatefulProtocolFetcher): + inst_handler = self.instantiateStatefulFetcher( + handler.fetcher_class, progs=progs, setup_block=setup_block + ) + the_handler = DocumentedProtocolFetcher( + fetcher=inst_handler.fetch, + description=inst_handler.description + if handler.description is None + else handler.description, + priority=handler.priority, + ) + elif isinstance(handler, DocumentedProtocolFetcher) and isinstance( + handler.fetcher, + ( + types.FunctionType, + types.LambdaType, + types.MethodType, + types.BuiltinFunctionType, + types.BuiltinMethodType, + ), + ): + the_handler = handler + else: + raise InvalidFetcherException( + "Trying to set for scheme {} a invalid handler".format(scheme) + ) + + self.schemeHandlers[scheme.lower()] = the_handler + + def bypassSchemeHandlers( + self, + schemeHandlers: "Mapping[str, Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]]", + ) -> None: + # No validation is done here about validness of schemes + if isinstance(schemeHandlers, dict): + for scheme, clazz in schemeHandlers.items(): + self.bypassSchemeHandler(scheme, clazz) + else: + raise InvalidFetcherException( + "Unable to instantiate to add scheme handlers" + ) + + def instantiateStatefulFetcher( + self, + statefulFetcher: "Type[StatefulFetcher]", + progs: "ProgsMapping" = dict(), + setup_block: "Optional[Mapping[str, Any]]" = None, + ) -> "StatefulFetcher": + """ + Method to instantiate stateful fetchers + """ + instStatefulFetcher: "Optional[AbstractStatefulFetcher]" = None + if inspect.isclass(statefulFetcher): + if issubclass(statefulFetcher, AbstractStatefulFetcher): + # Setting the default list of programs + mutable_progs = copy.copy(progs) + for prog in statefulFetcher.GetNeededPrograms(): + mutable_progs.setdefault(prog, cast("RelPath", prog)) + try: + if issubclass(statefulFetcher, AbstractSchemeRepoFetcher): + instStatefulFetcher = statefulFetcher( + self, progs=mutable_progs, setup_block=setup_block + ) + else: + instStatefulFetcher = statefulFetcher( + progs=progs, + setup_block=setup_block, + scheme_catalog=self, + ) + except Exception as e: + raise FetcherInstanceException( + f"Error while instantiating {statefulFetcher.__name__}" + ) from e + + if instStatefulFetcher is None: + raise InvalidFetcherException( + "Unable to instantiate something which is not a class inheriting from AbstractStatefulFetcher" + ) + + return cast("StatefulFetcher", instStatefulFetcher) + + def describeRegisteredSchemes(self) -> "Sequence[Tuple[str, str, int]]": + return [ + (scheme, desc_fetcher.description, desc_fetcher.priority) + for scheme, desc_fetcher in self.schemeHandlers.items() + ] + + def findAndAddSchemeHandlersFromModuleName( + self, + the_module_name: "str" = "wfexs_backend.fetchers", + fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, + progs: "ProgsMapping" = dict(), + ) -> "Sequence[AbstractSchemeRepoFetcher]": + try: + the_module = importlib.import_module(the_module_name) + return self.findAndAddSchemeHandlersFromModule( + the_module, + fetchers_setup_block=fetchers_setup_block, + progs=progs, + ) + except Exception as e: + errmsg = f"Unable to import module {the_module_name} in order to gather scheme handlers, due errors:" + self.logger.exception(errmsg) + raise 
SchemeCatalogImportException(errmsg) from e + + def findAndAddSchemeHandlersFromModule( + self, + the_module: "ModuleType", + fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, + progs: "ProgsMapping" = dict(), + ) -> "Sequence[AbstractSchemeRepoFetcher]": + repo_fetchers: "MutableSequence[AbstractSchemeRepoFetcher]" = [] + + for finder, module_name, ispkg in iter_namespace(the_module): + try: + named_module = importlib.import_module(module_name) + except: + self.logger.exception( + f"Skipping module {module_name} in order to gather scheme handlers, due errors:" + ) + continue + + # First, try locating a variable named SCHEME_HANDLERS + # then, the different class declarations inheriting + # from AbstractStatefulFetcher + skipit = True + for name, obj in inspect.getmembers(named_module): + if name == "SCHEME_HANDLERS": + if isinstance(obj, dict): + self.addSchemeHandlers( + obj, + fetchers_setup_block=fetchers_setup_block, + ) + skipit = False + elif ( + inspect.isclass(obj) + and not inspect.isabstract(obj) + and issubclass(obj, AbstractStatefulFetcher) + ): + # Now, let's learn whether the class is enabled + if getattr(obj, "ENABLED", False): + repo_fetchers.extend( + self.addStatefulSchemeHandlers( + obj, + fetchers_setup_block=fetchers_setup_block, + progs=progs, + ) + ) + skipit = False + + if skipit: + self.logger.debug( + f"Fetch module {named_module} was not eligible (no SCHEME_HANDLERS dictionary or subclass of {AbstractStatefulFetcher.__name__})" + ) + + return repo_fetchers + + def addStatefulSchemeHandlers( + self, + statefulSchemeHandler: "Type[AbstractStatefulFetcher]", + fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, + progs: "ProgsMapping" = dict(), + ) -> "Sequence[AbstractSchemeRepoFetcher]": + """ + This method adds scheme handlers (aka "fetchers") from + a given stateful fetcher, also adding the needed programs + """ + + # Get the scheme handlers from this fetcher + schemeHandlers = statefulSchemeHandler.GetSchemeHandlers() + + return self.addSchemeHandlers( + schemeHandlers, + fetchers_setup_block=fetchers_setup_block, + progs=progs, + ) + + def get(self, scheme: "str") -> "Optional[DocumentedProtocolFetcher]": + return self.schemeHandlers.get(scheme) + + def getSchemeHandler( + self, the_remote_file: "URIType" + ) -> "DocumentedProtocolFetcher": + # Content is fetched here + # As of RFC3986, schemes are case insensitive + parsedInputURL = urllib.parse.urlparse(the_remote_file) + the_scheme = parsedInputURL.scheme.lower() + scheme_handler = self.get(the_scheme) + + if scheme_handler is None: + errmsg = f"No {the_scheme} scheme handler for {the_remote_file}. Was this URI injected in the cache? Is it a supported one?" 
+ self.logger.error(errmsg) + raise SchemeCatalogException(errmsg) + + return scheme_handler + + def fetch( + self, + the_remote_file: "URIType", + cached_filename: "PathLikePath", + sec_context: "Optional[SecurityContextConfig]" = None, + ) -> "ProtocolFetcherReturn": + scheme_handler = self.getSchemeHandler(the_remote_file) + + # Content is fetched here + return scheme_handler.fetcher( + the_remote_file, + cached_filename, + secContext=sec_context, + ) + + def streamfetch( + self, + the_remote_file: "URIType", + the_stream: "IO[bytes]", + sec_context: "Optional[SecurityContextConfig]" = None, + ) -> "ProtocolFetcherReturn": + scheme_handler = self.getSchemeHandler(the_remote_file) + + stream_fetcher = ( + scheme_handler.fetcher.__self__ + if hasattr(scheme_handler.fetcher, "__self__") + else None + ) + + if not isinstance(stream_fetcher, AbstractStatefulStreamingFetcher): + errmsg = f"Scheme handler for {the_remote_file} does not offer streaming capabilities." + self.logger.error(errmsg) + raise SchemeCatalogException(errmsg) + + # Content is fetched here + return stream_fetcher.streamfetch( + the_remote_file, + the_stream, + secContext=sec_context, + ) + + # This pattern is used to validate the schemes + SCHEME_PAT: "Final[Pattern[str]]" = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*$") + + def addSchemeHandlers( + self, + schemeHandlers: "Mapping[str, Union[DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher]]", + fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, + progs: "ProgsMapping" = dict(), + ) -> "Sequence[AbstractSchemeRepoFetcher]": + """ + This method adds scheme handlers (aka "fetchers") + or instantiates stateful scheme handlers (aka "stateful fetchers") + """ + instSchemeHandlers = dict() + fetchers_mapping: "MutableMapping[Type[AbstractStatefulFetcher], DocumentedProtocolFetcher]" = ( + dict() + ) + repo_fetchers: "MutableSequence[AbstractSchemeRepoFetcher]" = [] + if fetchers_setup_block is None: + fetchers_setup_block = dict() + for scheme, schemeHandler in schemeHandlers.items(): + if self.SCHEME_PAT.search(scheme) is None: + self.logger.warning( + f"Fetcher associated to scheme {scheme} has been skipped, as the scheme does not comply with RFC3986" + ) + continue + + lScheme = scheme.lower() + # When no setup block is available for the scheme fetcher, + # provide an empty one + setup_block = fetchers_setup_block.get(lScheme, dict()) + + instSchemeHandler: "Optional[DocumentedProtocolFetcher]" = None + if isinstance(schemeHandler, DocumentedStatefulProtocolFetcher): + instSchemeHandler = fetchers_mapping.get(schemeHandler.fetcher_class) + if instSchemeHandler is None: + try: + instSchemeInstance = self.instantiateStatefulFetcher( + schemeHandler.fetcher_class, + setup_block=setup_block, + progs=progs, + ) + if instSchemeInstance is not None: + instSchemeHandler = DocumentedProtocolFetcher( + fetcher=instSchemeInstance.fetch, + description=instSchemeInstance.description + if schemeHandler.description is None + else schemeHandler.description, + priority=schemeHandler.priority, + ) + fetchers_mapping[ + schemeHandler.fetcher_class + ] = instSchemeHandler + if isinstance( + instSchemeInstance, AbstractSchemeRepoFetcher + ): + repo_fetchers.append(instSchemeInstance) + except Exception as e: + self.logger.exception( + f"Error while instantiating handler implemented at {schemeHandler.fetcher_class} for scheme {lScheme}" + ) + elif isinstance(schemeHandler, DocumentedProtocolFetcher) and callable( + schemeHandler.fetcher + ): + instSchemeHandler = 
schemeHandler + + # Only the ones which have overcome the sanity checks + if instSchemeHandler is not None: + # Schemes are case insensitive, so register only + # the lowercase version + instSchemeHandlers[lScheme] = instSchemeHandler + else: + self.logger.warning( + f"Scheme {lScheme} could not be properly instantiated" + ) + + self.addRawSchemeHandlers(instSchemeHandlers) + + return repo_fetchers diff --git a/wfexs_backend/utils/contents.py b/wfexs_backend/utils/contents.py index e81a78fc..ac8b7676 100644 --- a/wfexs_backend/utils/contents.py +++ b/wfexs_backend/utils/contents.py @@ -561,7 +561,6 @@ def bin2dataurl(content: "bytes") -> "URIType": return cast( "URIType", - data_url.construct_data_url( - mime_type=mime_type, base64_encode=True, data=content - ), + # mime_type=mime_type, base64_encoded=True, data=content + data_url.construct_data_url(mime_type, True, content), ) diff --git a/wfexs_backend/utils/licences.py b/wfexs_backend/utils/licences.py index f5160c0c..e60a3f44 100644 --- a/wfexs_backend/utils/licences.py +++ b/wfexs_backend/utils/licences.py @@ -54,9 +54,13 @@ import xdg.BaseDirectory +from ..scheme_catalog import ( + SchemeCatalog, +) + from ..cache_handler import ( CacheHandlerException, - SchemeHandlerCacheHandler, + CacheHandler, ) from ..common import ( @@ -65,7 +69,7 @@ NoLicenceDescription, ) -from ..fetchers.http import SCHEME_HANDLERS as HTTP_SCHEME_HANDLERS +from ..fetchers.http import HTTPFetcher # Licences @@ -518,7 +522,7 @@ class LicenceMatcher: def __init__( self, - cacheHandler: "SchemeHandlerCacheHandler", + cacheHandler: "CacheHandler", cacheDir: "Optional[pathlib.Path]" = None, spdx_version: "str" = DEFAULT_SPDX_VERSION, ): @@ -627,10 +631,15 @@ def __new__(cls) -> "LicenceMatcher": # type: ignore xdg.BaseDirectory.save_cache_path("es.elixir.WfExSLicenceMatcher") ) + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), + ) + # Private cache handler instance # with LicenceMatcher - cacheHandler = SchemeHandlerCacheHandler( - cachePath, schemeHandlers=HTTP_SCHEME_HANDLERS + cacheHandler = CacheHandler( + cachePath, + scheme_catalog=scheme_catalog, ) cls.__instance = LicenceMatcher(cacheHandler) diff --git a/wfexs_backend/utils/misc.py b/wfexs_backend/utils/misc.py index a7041f29..c2412b67 100644 --- a/wfexs_backend/utils/misc.py +++ b/wfexs_backend/utils/misc.py @@ -369,3 +369,14 @@ def get_maximum_file_descriptors() -> "int": result = MAXFD return result + + +def urlresolv(url: "str") -> "str": + if url.endswith("/"): + wident = "." 
+ else: + rslash_idx = url.rfind("/") + if rslash_idx == -1: + return url + wident = url[rslash_idx + 1 :] + return urllib.parse.urljoin(url, wident) diff --git a/wfexs_backend/utils/orcid.py b/wfexs_backend/utils/orcid.py index b1d851d7..9628815e 100644 --- a/wfexs_backend/utils/orcid.py +++ b/wfexs_backend/utils/orcid.py @@ -51,7 +51,7 @@ from ..common import ( ResolvedORCID, ) -from ..fetchers.http import fetchClassicURL +from ..fetchers.http import HTTPFetcher from ..fetchers import FetcherException ORCID_HOST: "Final[str]" = "orcid.org" @@ -95,7 +95,7 @@ def validate_orcid( public_record_b = io.BytesIO() public_orcid_url = cast("URIType", f"{ORCID_URL_PREFIX}/{possible_orcid}") # If there is any issue fetching, next call should raise an exception - _, meta_public_record, _ = fetchClassicURL( + _, meta_public_record, _ = HTTPFetcher().streamfetch( cast("URIType", f"{public_orcid_url}/public-record.json"), public_record_b ) try: diff --git a/wfexs_backend/utils/passphrase_wrapper.py b/wfexs_backend/utils/passphrase_wrapper.py index 823fed21..42494898 100644 --- a/wfexs_backend/utils/passphrase_wrapper.py +++ b/wfexs_backend/utils/passphrase_wrapper.py @@ -53,11 +53,15 @@ import xdg.BaseDirectory +from ..scheme_catalog import ( + SchemeCatalog, +) + from ..cache_handler import ( CacheOfflineException, - SchemeHandlerCacheHandler, + CacheHandler, ) -from ..fetchers.http import SCHEME_HANDLERS as HTTP_SCHEME_HANDLERS +from ..fetchers.http import HTTPFetcher from ..fetchers.wiktionary import WiktionaryFetcher @@ -115,7 +119,7 @@ class WfExSPassphraseGenerator: def __init__( self, - cacheHandler: "SchemeHandlerCacheHandler", + cacheHandler: "CacheHandler", cacheDir: "Optional[pathlib.Path]" = None, word_sets: "Mapping[str, Sequence[RemoteWordlistResource]]" = DEFAULT_WORD_SETS, ): @@ -289,10 +293,14 @@ def __new__(cls) -> "WfExSPassphraseGenerator": # type: ignore # Private cache handler instance # with Wiktionary - cacheHandler = SchemeHandlerCacheHandler( - cachePath, schemeHandlers=HTTP_SCHEME_HANDLERS + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers() + ) + scheme_catalog.bypassSchemeHandlers(WiktionaryFetcher.GetSchemeHandlers()) + cacheHandler = CacheHandler( + cachePath, + scheme_catalog=scheme_catalog, ) - cacheHandler.bypassSchemeHandlers(WiktionaryFetcher.GetSchemeHandlers()) cls.__instance = WfExSPassphraseGenerator(cacheHandler) return cls.__instance diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 822b7a4c..81c67593 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
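Looping back to the urlresolv() helper added to utils/misc.py above: it re-joins the last path component of a URL onto itself through urllib.parse.urljoin, which removes dot segments such as ".." while keeping slash-terminated URLs as they are and returning URLs without any "/" unchanged. A rough illustration with hypothetical inputs:

    from wfexs_backend.utils.misc import urlresolv

    print(urlresolv("https://example.org/a/b/../c"))   # expected: https://example.org/a/c
    print(urlresolv("https://example.org/a/b/"))       # trailing slash kept: https://example.org/a/b/
    print(urlresolv("mailto:someone@example.org"))     # no "/", returned as-is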
@@ -83,9 +83,13 @@ ) +from .scheme_catalog import ( + SchemeCatalog, +) + from .cache_handler import ( CachedContent, - SchemeHandlerCacheHandler, + CacheHandler, ) from .container_factories import ( @@ -107,6 +111,7 @@ ) from .utils.marshalling_handling import ( + marshall_namedtuple, unmarshall_namedtuple, ) @@ -124,15 +129,19 @@ from .utils.rocrate import ( ReadROCrateMetadata, + ROCRATE_JSONLD_FILENAME, ROCrateToolbox, ) from .fetchers import ( - AbstractRepoFetcher, + AbstractSchemeRepoFetcher, AbstractStatefulFetcher, DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher, + FetcherException, + MaterializedRepo, RemoteRepo, + RepoGuessFlavor, # This is needed for proper unmarshalling of cached repository guesses RepoType, ) @@ -160,8 +169,7 @@ ) from .fetchers.trs_files import ( - TRS_SCHEME_PREFIX, - INTERNAL_TRS_SCHEME_PREFIX, + GA4GHTRSFetcher, ) @@ -216,7 +224,7 @@ ) from .fetchers import ( - RepoFetcher, + SchemeRepoFetcher, StatefulFetcher, ) @@ -479,11 +487,19 @@ def FromDescription( # It should not happen enabled_profiles = [str(profiles)] + parsed_workflow_id = urllib.parse.urlparse(workflow_meta["workflow_id"]) + trs_endpoint: "Optional[str]" + if parsed_workflow_id.scheme != "": + trs_endpoint = workflow_meta.get("trs_endpoint") + else: + trs_endpoint = workflow_meta.get("trs_endpoint", WF.DEFAULT_TRS_ENDPOINT) + return cls(updated_local_config, config_directory=config_directory).newSetup( workflow_meta["workflow_id"], workflow_meta.get("version"), descriptor_type=workflow_meta.get("workflow_type"), - trs_endpoint=workflow_meta.get("trs_endpoint", WF.DEFAULT_TRS_ENDPOINT), + trs_endpoint=trs_endpoint, + prefer_upstream_source=workflow_meta.get("prefer_upstream_source"), params=workflow_meta.get("params", {}), enabled_profiles=enabled_profiles, environment=workflow_meta.get("environment", {}), @@ -677,18 +693,24 @@ def __init__( self.baseWorkDir = baseWorkDir self.defaultParanoidMode = False - self._sngltn: "MutableMapping[Type[AbstractStatefulFetcher], AbstractStatefulFetcher]" = ( + self._sngltn_fetcher: "MutableMapping[Type[AbstractStatefulFetcher], AbstractStatefulFetcher]" = ( dict() ) - self.repo_fetchers: "MutableSequence[AbstractRepoFetcher]" = list() + # scheme_catalog is created on first use + self.scheme_catalog = SchemeCatalog() # cacheHandler is created on first use - self.cacheHandler = SchemeHandlerCacheHandler(self.cacheDir) + self.cacheHandler = CacheHandler( + self.cacheDir, scheme_catalog=self.scheme_catalog + ) fetchers_setup_block = local_config.get("fetchers-setup") # All the scheme handlers should be added here - self.findAndAddSchemeHandlersFromModuleName( - fetchers_setup_block=fetchers_setup_block + self._repo_fetchers = ( + self.scheme_catalog.findAndAddSchemeHandlersFromModuleName( + fetchers_setup_block=fetchers_setup_block, + progs=self.progs, + ) ) # Registry of export plugins is created here @@ -745,43 +767,15 @@ def cacheTRSFilesDir(self) -> "pathlib.Path": def cacheWorkflowInputsDir(self) -> "pathlib.Path": return self.cachePathMap[CacheType.Input] + @property + def repo_fetchers(self) -> "Sequence[AbstractSchemeRepoFetcher]": + return sorted(self._repo_fetchers, key=lambda f: f.PRIORITY, reverse=True) + def getCacheHandler( self, cache_type: "CacheType" - ) -> "Tuple[SchemeHandlerCacheHandler, Optional[pathlib.Path]]": + ) -> "Tuple[CacheHandler, Optional[pathlib.Path]]": return self.cacheHandler, self.cachePathMap.get(cache_type) - def instantiateStatefulFetcher( - self, - statefulFetcher: "Type[StatefulFetcher]", - setup_block: 
"Optional[Mapping[str, Any]]" = None, - ) -> "StatefulFetcher": - """ - Method to instantiate stateful fetchers once - """ - instStatefulFetcher = self._sngltn.get(statefulFetcher) - if instStatefulFetcher is None: - # Setting the default list of programs - for prog in statefulFetcher.GetNeededPrograms(): - self.progs.setdefault(prog, cast("RelPath", prog)) - # Let's augment the list of needed progs by this - # stateful fetcher - instStatefulFetcher = self.cacheHandler.instantiateStatefulFetcher( - statefulFetcher, progs=self.progs, setup_block=setup_block - ) - self._sngltn[statefulFetcher] = instStatefulFetcher - - return cast("StatefulFetcher", instStatefulFetcher) - - def instantiateRepoFetcher( - self, - repoFetcher: "Type[RepoFetcher]", - setup_block: "Optional[Mapping[str, Any]]" = None, - ) -> "RepoFetcher": - """ - Method to instantiate repo fetchers once - """ - return self.instantiateStatefulFetcher(repoFetcher, setup_block=setup_block) - def findAndAddWorkflowEnginesFromModuleName( self, the_module_name: "str" = "wfexs_backend.workflow_engines", @@ -994,141 +988,6 @@ def getExportPluginClass( ) -> "Optional[Type[AbstractExportPlugin]]": return self._export_plugins.get(plugin_id) - def findAndAddSchemeHandlersFromModuleName( - self, - the_module_name: "str" = "wfexs_backend.fetchers", - fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: - try: - the_module = importlib.import_module(the_module_name) - self.findAndAddSchemeHandlersFromModule( - the_module, - fetchers_setup_block=fetchers_setup_block, - ) - except Exception as e: - errmsg = f"Unable to import module {the_module_name} in order to gather scheme handlers, due errors:" - self.logger.exception(errmsg) - raise WfExSBackendException(errmsg) from e - - def findAndAddSchemeHandlersFromModule( - self, - the_module: "ModuleType", - fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: - for finder, module_name, ispkg in iter_namespace(the_module): - try: - named_module = importlib.import_module(module_name) - except: - self.logger.exception( - f"Skipping module {module_name} in order to gather scheme handlers, due errors:" - ) - continue - - # First, try locating a variable named SCHEME_HANDLERS - # then, the different class declarations inheriting - # from AbstractStatefulFetcher - skipit = True - for name, obj in inspect.getmembers(named_module): - if name == "SCHEME_HANDLERS": - if isinstance(obj, dict): - self.addSchemeHandlers( - obj, - fetchers_setup_block=fetchers_setup_block, - ) - skipit = False - elif ( - inspect.isclass(obj) - and not inspect.isabstract(obj) - and issubclass(obj, AbstractStatefulFetcher) - ): - # Now, let's learn whether the class is enabled - if getattr(obj, "ENABLED", False): - self.addStatefulSchemeHandlers( - obj, - fetchers_setup_block=fetchers_setup_block, - ) - skipit = False - - if skipit: - self.logger.debug( - f"Fetch module {named_module} was not eligible (no SCHEME_HANDLERS dictionary or subclass of {AbstractStatefulFetcher.__name__})" - ) - - def addStatefulSchemeHandlers( - self, - statefulSchemeHandler: "Type[AbstractStatefulFetcher]", - fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: - """ - This method adds scheme handlers (aka "fetchers") from - a given stateful fetcher, also adding the needed programs - """ - - # Get the scheme handlers from this fetcher - schemeHandlers = statefulSchemeHandler.GetSchemeHandlers() - - self.addSchemeHandlers( - schemeHandlers, 
fetchers_setup_block=fetchers_setup_block - ) - - # This pattern is used to validate the schemes - SCHEME_PAT: "Final[Pattern[str]]" = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*$") - - def addSchemeHandlers( - self, - schemeHandlers: "Mapping[str, Union[DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher]]", - fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: - """ - This method adds scheme handlers (aka "fetchers") - or instantiates stateful scheme handlers (aka "stateful fetchers") - """ - if isinstance(schemeHandlers, dict): - instSchemeHandlers = dict() - if fetchers_setup_block is None: - fetchers_setup_block = dict() - for scheme, schemeHandler in schemeHandlers.items(): - if self.SCHEME_PAT.search(scheme) is None: - self.logger.warning( - f"Fetcher associated to scheme {scheme} has been skipped, as the scheme does not comply with RFC3986" - ) - continue - - lScheme = scheme.lower() - # When no setup block is available for the scheme fetcher, - # provide an empty one - setup_block = fetchers_setup_block.get(lScheme, dict()) - - instSchemeHandler = None - if isinstance(schemeHandler, DocumentedStatefulProtocolFetcher): - instSchemeInstance = self.instantiateStatefulFetcher( - schemeHandler.fetcher_class, setup_block=setup_block - ) - if instSchemeInstance is not None: - instSchemeHandler = DocumentedProtocolFetcher( - fetcher=instSchemeInstance.fetch, - description=instSchemeInstance.description - if schemeHandler.description is None - else schemeHandler.description, - priority=schemeHandler.priority, - ) - - # Also, if it is a repository fetcher, record it separately - if isinstance(instSchemeInstance, AbstractRepoFetcher): - self.repo_fetchers.append(instSchemeInstance) - elif isinstance(schemeHandler, DocumentedProtocolFetcher) and callable( - schemeHandler.fetcher - ): - instSchemeHandler = schemeHandler - - # Only the ones which have overcome the sanity checks - if instSchemeHandler is not None: - # Schemes are case insensitive, so register only - # the lowercase version - instSchemeHandlers[lScheme] = instSchemeHandler - - self.cacheHandler.addRawSchemeHandlers(instSchemeHandlers) - def gen_workflow_pid(self, remote_repo: "RemoteRepo") -> "str": """ This method tries generating the workflow pid passing the remote @@ -1146,14 +1005,15 @@ def gen_workflow_pid(self, remote_repo: "RemoteRepo") -> "str": return remote_repo.repo_url if retval is None else retval def describeFetchableSchemes(self) -> "Sequence[Tuple[str, str, int]]": - return self.cacheHandler.describeRegisteredSchemes() + return self.scheme_catalog.describeRegisteredSchemes() def newSetup( self, workflow_id: "WorkflowId", version_id: "Optional[WFVersionId]" = None, descriptor_type: "Optional[TRS_Workflow_Descriptor]" = None, - trs_endpoint: "str" = WF.DEFAULT_TRS_ENDPOINT, + trs_endpoint: "Optional[str]" = None, + prefer_upstream_source: "Optional[bool]" = None, params: "Optional[ParamsBlock]" = None, enabled_profiles: "Optional[Sequence[str]]" = None, environment: "Optional[EnvironmentBlock]" = None, @@ -1174,6 +1034,7 @@ def newSetup( version_id=version_id, descriptor_type=descriptor_type, trs_endpoint=trs_endpoint, + prefer_upstream_source=prefer_upstream_source, params=params, enabled_profiles=enabled_profiles, environment=environment, @@ -2015,26 +1876,105 @@ def instantiateEngine( config_directory=self.config_directory, ) + def matchRepoFetcherByClassname( + self, clazzname: "str" + ) -> "Optional[AbstractSchemeRepoFetcher]": + for fetcher in self._repo_fetchers: + if 
fetcher.__class__.__name__ == clazzname: + return fetcher + + return None + def guess_repo_params( self, wf_url: "Union[URIType, parse.ParseResult]", fail_ok: "bool" = False, - ) -> "Optional[RemoteRepo]": + offline: "bool" = False, + ignoreCache: "bool" = False, + registerInCache: "bool" = True, + ) -> "Optional[Tuple[RemoteRepo, AbstractSchemeRepoFetcher]]": + remote_repo: "Optional[RemoteRepo]" = None + fetcher: "Optional[AbstractSchemeRepoFetcher]" = None + guess_cache = self.cacheWorkflowDir / "guess-cache" + + if not ignoreCache: + try: + # Let's check whether the workflow was registered + # kind: "ContentKind" + # path: "pathlib.Path" + # metadata_array: "Sequence[URIWithMetadata]" + # licences: "Tuple[URIType, ...]" + # fingerprint: "Optional[Fingerprint]" = None + # clonable: "bool" = True + cached_content = self.cacheHandler.fetch( + cast("URIType", wf_url), + offline=True, + destdir=guess_cache, + ) + # Always a cached metadata file + assert cached_content.kind == ContentKind.File + with cached_content.path.open(mode="r", encoding="utf-8") as ccH: + guessed_repo_payload = json.load(ccH) + + if isinstance(guessed_repo_payload, (tuple, list)): + remote_repo, fetcher_class_name = unmarshall_namedtuple( + guessed_repo_payload + ) + # Now, time to find the fetcher itself + if remote_repo is not None: + fetcher = self.matchRepoFetcherByClassname(fetcher_class_name) + if fetcher is not None: + return remote_repo, fetcher + self.logger.debug( + f"Cached empty guessing elements associated to {wf_url}. Ignoring" + ) + elif offline: + # Do not try again if it is in offline mode + return None + except Exception as e: + self.logger.debug(f"Guessed {wf_url} not cached (exception {e})") + if isinstance(wf_url, parse.ParseResult): parsedRepoURL = wf_url else: parsedRepoURL = urllib.parse.urlparse(wf_url) - remote_repo = SoftwareHeritageFetcher.GuessRepoParams( - parsedRepoURL, logger=self.logger, fail_ok=fail_ok - ) - if remote_repo is None: - # Assume it might be a git repo or a link to a git repo - remote_repo = GitFetcher.GuessRepoParams( - parsedRepoURL, logger=self.logger, fail_ok=fail_ok + for fetcher in self.repo_fetchers: + remote_repo = fetcher.GuessRepoParams( + parsedRepoURL, + logger=self.logger, + fail_ok=fail_ok, + offline=offline, ) + if remote_repo is not None: + if registerInCache: + temp_cached = guess_cache / ("caching-" + str(uuid.uuid4())) + try: + with temp_cached.open(mode="w", encoding="utf-8") as tC: + json.dump( + marshall_namedtuple( + (remote_repo, fetcher.__class__.__name__) + ), + tC, + ) + self.cacheHandler.inject( + cast("URIType", wf_url), + destdir=guess_cache, + tempCachedFilename=temp_cached, + inputKind=ContentKind.File, + ) + except Exception as e: + self.logger.exception( + f"Unable to register guess cache for {wf_url} (see exception trace)" + ) + finally: + # Removing the leftovers, whether they worked or not + if temp_cached.exists(): + temp_cached.unlink() - return remote_repo + return remote_repo, fetcher + + return None def cacheWorkflow( self, @@ -2042,6 +1982,7 @@ def cacheWorkflow( version_id: "Optional[WFVersionId]" = None, trs_endpoint: "Optional[str]" = None, descriptor_type: "Optional[TRS_Workflow_Descriptor]" = None, + prefer_upstream_source: "bool" = True, ignoreCache: "bool" = False, registerInCache: "bool" = True, offline: "bool" = False, @@ -2070,126 +2011,102 @@ def cacheWorkflow( if requested_workflow_type is None: self.logger.warning( - f"Workflow of type {descriptor_type} is not supported by this version of WfExS-backend" + f"Workflow 
of type {descriptor_type} is not supported by this version of WfExS-backend. Switching to guess mode." ) - putative_repo_url = str(workflow_id) + if (trs_endpoint is not None) and len(trs_endpoint) > 0: + putative_repo_url = GA4GHTRSFetcher.BuildRepoPIDFromTRSParams( + trs_endpoint, workflow_id, version_id + ) + else: + putative_repo_url = cast("URIType", str(workflow_id)) + parsedRepoURL = urllib.parse.urlparse(putative_repo_url) # It is not an absolute URL, so it is being an identifier in the workflow i_workflow: "Optional[IdentifiedWorkflow]" = None - engineDesc: "Optional[WorkflowType]" = None + workflow_type: "Optional[WorkflowType]" = None guessedRepo: "Optional[RemoteRepo]" = None repoDir: "Optional[pathlib.Path]" = None putative: "bool" = False cached_putative_path: "Optional[pathlib.Path]" = None - if parsedRepoURL.scheme in ("", TRS_SCHEME_PREFIX): - # Extracting the TRS endpoint details from the parsedRepoURL - if parsedRepoURL.scheme == TRS_SCHEME_PREFIX: - # Duplication of code borrowed from trs_files.py - path_steps: "Sequence[str]" = parsedRepoURL.path.split("/") - if len(path_steps) < 3 or path_steps[0] != "": - raise WfExSBackendException( - f"Ill-formed TRS CURIE {putative_repo_url}. It should be in the format of {TRS_SCHEME_PREFIX}://id/version or {TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" - ) - trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) - trs_steps.extend(["ga4gh", "trs", "v2", ""]) - trs_endpoint = urllib.parse.urlunparse( - urllib.parse.ParseResult( - scheme="https", - netloc=parsedRepoURL.netloc, - path="/".join(trs_steps), - params="", - query="", - fragment="", - ) - ) + if parsedRepoURL.scheme == "": + raise WFException("trs_endpoint was not provided") - workflow_id = urllib.parse.unquote(path_steps[-2]) - version_id = urllib.parse.unquote(path_steps[-1]) - if (trs_endpoint is not None) and len(trs_endpoint) > 0: - i_workflow, repoDir = self.getWorkflowRepoFromTRS( - trs_endpoint, - workflow_id, - version_id, - descriptor_type, - ignoreCache=ignoreCache, - offline=offline, - meta_dir=meta_dir, + # Trying to be smarter + guessed = self.guess_repo_params( + parsedRepoURL, + offline=offline, + ignoreCache=ignoreCache, + registerInCache=registerInCache, + fail_ok=True, + ) + if guessed is not None: + guessedRepo = guessed[0] + if guessedRepo.tag is None and version_id is not None: + guessedRepo = RemoteRepo( + repo_url=guessedRepo.repo_url, + tag=cast("RepoTag", str(version_id)), + rel_path=guessedRepo.rel_path, + repo_type=guessedRepo.repo_type, + web_url=guessedRepo.web_url, ) - # For the cases of pure TRS repos, like Dockstore - # repoDir contains the cached path - else: - raise WFException("trs_endpoint was not provided") else: - # Trying to be smarter - guessedRepo = self.guess_repo_params(parsedRepoURL, fail_ok=True) - - if guessedRepo is not None: - if guessedRepo.tag is None and version_id is not None: - guessedRepo = RemoteRepo( - repo_url=guessedRepo.repo_url, - tag=cast("RepoTag", version_id), - rel_path=guessedRepo.rel_path, - repo_type=guessedRepo.repo_type, - web_url=guessedRepo.web_url, - ) - else: - repoRelPath: "Optional[str]" = None - ( - i_workflow, - cached_putative_path, - metadata_array, - repoRelPath, - ) = self.getWorkflowBundleFromURI( - cast("URIType", workflow_id), - offline=offline, - ignoreCache=ignoreCache, - ) - - if i_workflow is None: - repoDir = cached_putative_path - if not repoRelPath: - if repoDir.is_dir(): - if len(parsedRepoURL.fragment) > 0: - frag_qs = urllib.parse.parse_qs(parsedRepoURL.fragment) - 
subDirArr = frag_qs.get("subdirectory", []) - if len(subDirArr) > 0: - repoRelPath = subDirArr[0] - elif len(metadata_array) > 0: - # Let's try getting a pretty filename - # when the workflow is a single file - repoRelPath = metadata_array[0].preferredName - - # It can be either a relative path to a directory or to a file - # It could be even empty! - if repoRelPath == "": - repoRelPath = None - # raise WFException('Unable to guess repository from RO-Crate manifest') - guessedRepo = RemoteRepo( - repo_url=cast("RepoURL", workflow_id), - tag=cast("RepoTag", version_id), - rel_path=cast("Optional[RelPath]", repoRelPath), - ) - putative = True + repoRelPath: "Optional[str]" = None + ( + i_workflow, + cached_putative_path, + metadata_array, + repoRelPath, + ) = self.getWorkflowBundleFromURI( + putative_repo_url, + prefer_upstream_source=prefer_upstream_source, + offline=offline, + ignoreCache=ignoreCache, + registerInCache=registerInCache, + ) - # This can be incorrect, but let it be for now - if i_workflow is not None: - if ( - requested_workflow_type is not None - and requested_workflow_type != i_workflow.workflow_type - ): - message = f"Fetched workflow is of type {i_workflow.workflow_type.shortname} , but it was explicitly requested to be of type {requested_workflow_type.shortname}" - self.logger.error(message) - raise WfExSBackendException(message) + if i_workflow is None: + repoDir = cached_putative_path + if not repoRelPath: + if repoDir.is_dir(): + if len(parsedRepoURL.fragment) > 0: + frag_qs = urllib.parse.parse_qs(parsedRepoURL.fragment) + subDirArr = frag_qs.get("subdirectory", []) + if len(subDirArr) > 0: + repoRelPath = subDirArr[0] + elif len(metadata_array) > 0: + # Let's try getting a pretty filename + # when the workflow is a single file + repoRelPath = metadata_array[0].preferredName + + # It can be either a relative path to a directory or to a file + # It could be even empty! 
+ if repoRelPath == "": + repoRelPath = None + # raise WFException('Unable to guess repository from RO-Crate manifest') + guessedRepo = RemoteRepo( + repo_url=cast("RepoURL", workflow_id), + tag=cast("RepoTag", version_id), + rel_path=cast("Optional[RelPath]", repoRelPath), + ) + putative = True + else: + # This can be incorrect, but let it be for now + if ( + requested_workflow_type is not None + and requested_workflow_type != i_workflow.workflow_type + ): + message = f"Fetched workflow is of type {i_workflow.workflow_type.shortname} , but it was explicitly requested to be of type {requested_workflow_type.shortname}" + self.logger.error(message) + raise WfExSBackendException(message) - guessedRepo = i_workflow.remote_repo - engineDesc = i_workflow.workflow_type - if cached_putative_path is not None: - self.cacheROCrateFilename = cached_putative_path + guessedRepo = i_workflow.remote_repo + workflow_type = i_workflow.workflow_type assert guessedRepo is not None assert guessedRepo.repo_url is not None + repo: "RemoteRepo" = guessedRepo repoEffectiveCheckout: "Optional[RepoTag]" = None # A putative workflow is one which is already materialized @@ -2200,426 +2117,243 @@ def cacheWorkflow( len(parsedRepoURL.scheme) > 0 ), f"Repository id {guessedRepo.repo_url} should be a parsable URI" - repoDir, repoEffectiveCheckout = self.doMaterializeRepo( + ( + repoDir, + materialized_repo, + workflow_type, + downstream_repos, + ) = self.doMaterializeRepo( guessedRepo, + fetcher=guessed[1] if guessed is not None else None, + prefer_upstream_source=prefer_upstream_source, doUpdate=ignoreCache, registerInCache=registerInCache, + offline=offline, ) + assert len(downstream_repos) > 0 + repo = materialized_repo.repo + repoEffectiveCheckout = repo.get_checkout() + # TODO: should we preserve the chain of repos? - return repoDir, guessedRepo, engineDesc, repoEffectiveCheckout + return repoDir, repo, workflow_type, repoEffectiveCheckout TRS_METADATA_FILE: "Final[RelPath]" = cast("RelPath", "trs_metadata.json") TRS_QUERY_CACHE_FILE: "Final[RelPath]" = cast("RelPath", "trs_result.json") - def getWorkflowRepoFromTRS( + def doMaterializeRepo( self, - trs_endpoint: "str", - workflow_id: "WorkflowId", - version_id: "Optional[WFVersionId]", - descriptor_type: "Optional[TRS_Workflow_Descriptor]", + repo: "RemoteRepo", + fetcher: "Optional[AbstractSchemeRepoFetcher]" = None, + prefer_upstream_source: "bool" = True, + doUpdate: "bool" = True, + registerInCache: "bool" = True, offline: "bool" = False, - ignoreCache: "bool" = False, - meta_dir: "Optional[pathlib.Path]" = None, - ) -> "Tuple[IdentifiedWorkflow, Optional[pathlib.Path]]": + ) -> "Tuple[pathlib.Path, MaterializedRepo, Optional[WorkflowType], Sequence[RemoteRepo]]": """ - - :return: + This method is used to materialize repos described using instances + of RemoteRepo. It starts asking all the known repo fetchers whether + they recognize the URI as consumable by them. + + Later, they fulfil the materialization task, answering the local + path where the repo was cloned, an updated instance of RemoteRepo, + the metadata array of all the requests, and whether their copy + came from another upstream repo (and whether it is recommended). + + If the upstream repo is recommended, then doMaterializeRepo calls + itself using it in order to fetch the contents of the upstream repo. + + If no repo fetcher is able to materialize the repo, then it is + considered a "raw" one, so it is fetched using standard fetchers. 
+ With the fetched content, it is detected whether it is an RO-Crate. + If it is so, and the associated upstream repo is obtained, then + doMaterializeRepo calls itself in order to materialize it. + + At the end of the process the path to the repo, the identified + tag, a MaterializedRepo instance and the list of repos which brought + to this one is returned. """ - # If nothing is set, just create a temporary directory - if meta_dir is None: - meta_dir = pathlib.Path( - tempfile.mkdtemp(prefix="WfExS", suffix="TRSFetched") - ) - # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, meta_dir, True) - else: - # Assuring the destination directory does exist - meta_dir.mkdir(parents=True, exist_ok=True) - - if isinstance(workflow_id, int): - workflow_id_str = str(workflow_id) - else: - workflow_id_str = workflow_id - - # The base URL must end with a slash - if trs_endpoint[-1] != "/": - trs_endpoint += "/" - # Now, time to check whether it is a TRSv2 - trs_endpoint_v2_meta_url = cast("URIType", trs_endpoint + "service-info") - trs_endpoint_v2_beta2_meta_url = cast("URIType", trs_endpoint + "metadata") - trs_endpoint_meta_url = None - - # Needed to store this metadata - trsMetadataCache = meta_dir / self.TRS_METADATA_FILE - - try: - trs_cached_content = self.cacheHandler.fetch( - trs_endpoint_v2_meta_url, - destdir=meta_dir, - offline=offline, - ignoreCache=ignoreCache, + # This is needed in case a proposed fetcher is already set + # by the caller of this method (discouraged) + if fetcher is None: + for fetcher in self.repo_fetchers: + if fetcher.build_pid_from_repo(repo) is not None: + break + else: + fetcher = None + + workflow_type: "Optional[WorkflowType]" = None + # An specialized fetcher is used + downstream_repos: "MutableSequence[RemoteRepo]" + if fetcher is not None: + materialized_repo = fetcher.materialize_repo_from_repo( + repo, + doUpdate=doUpdate, + base_repo_destdir=self.cacheWorkflowDir, ) - trs_endpoint_meta_url = trs_endpoint_v2_meta_url - except WFException as wfe: - try: - trs_cached_content = self.cacheHandler.fetch( - trs_endpoint_v2_beta2_meta_url, - destdir=meta_dir, - offline=offline, - ignoreCache=ignoreCache, - ) - trs_endpoint_meta_url = trs_endpoint_v2_beta2_meta_url - except WFException as wfebeta: - raise WFException( - "Unable to fetch metadata from {} in order to identify whether it is a working GA4GH TRSv2 endpoint. 
Exceptions:\n{}\n{}".format( - trs_endpoint, wfe, wfebeta - ) - ) - # Giving a friendly name - if not trsMetadataCache.exists(): - os.symlink(trs_cached_content.path.name, trsMetadataCache) + downstream_repos = [repo] + repo_path = materialized_repo.local + materialized_repo_repo = materialized_repo.repo + metadata_array = materialized_repo.metadata_array - with trsMetadataCache.open(mode="r", encoding="utf-8") as ctmf: - trs_endpoint_meta = json.load(ctmf) + # Now, let's register the checkout with cache structures + # using its public URI + remote_url: "str" = repo.repo_url + if fetcher.__class__ == GitFetcher: + if not repo.repo_url.startswith("git"): + remote_url = "git+" + repo.repo_url - # Minimal check - trs_version = trs_endpoint_meta.get("api_version") - if trs_version is None: - trs_version = trs_endpoint_meta.get("type", {}).get("version") + if repo.tag is not None: + remote_url += "@" + repo.tag - if trs_version is None: - raise WFException( - "Unable to identify TRS version from {}".format(trs_endpoint_meta_url) + repo_desc: "Optional[Mapping[str, Any]]" = ( + materialized_repo_repo.gen_repo_desc() ) - - # Now, check the tool does exist in the TRS, and the version - trs_tools_url = cast( - "URIType", - urllib.parse.urljoin( - trs_endpoint, - WF.TRS_TOOLS_PATH + urllib.parse.quote(workflow_id_str, safe=""), - ), - ) - - trsQueryCache = meta_dir / self.TRS_QUERY_CACHE_FILE - trs_cached_tool = self.cacheHandler.fetch( - trs_tools_url, destdir=meta_dir, offline=offline, ignoreCache=ignoreCache - ) - # Giving a friendly name - if not trsQueryCache.exists(): - os.symlink(trs_cached_tool.path.name, trsQueryCache) - - with trsQueryCache.open(mode="r", encoding="utf-8") as tQ: - rawToolDesc = tQ.read() - - # If the tool does not exist, an exception will be thrown before - jd = json.JSONDecoder() - toolDesc = jd.decode(rawToolDesc) - - # If the tool is not a workflow, complain - if toolDesc.get("toolclass", {}).get("name", "") != "Workflow": - raise WFException( - "Tool {} from {} is not labelled as a workflow. Raw answer:\n{}".format( - workflow_id_str, trs_endpoint, rawToolDesc + if repo_desc is None: + repo_desc = {} + augmented_metadata_array = [ + URIWithMetadata( + uri=cast("URIType", remote_url), + metadata=repo_desc, + ), + *metadata_array, + ] + + # Give the chance to register the current fetched repo in the corresponding cache + if registerInCache: + kind = ContentKind.Directory if repo_path.is_dir() else ContentKind.File + self.cacheHandler.inject( + cast("URIType", remote_url), + destdir=self.cacheWorkflowDir, + fetched_metadata_array=augmented_metadata_array, + finalCachedFilename=repo_path, + inputKind=kind, ) - ) - - possibleToolVersions = toolDesc.get("versions", []) - if len(possibleToolVersions) == 0: - raise WFException( - "Version {} not found in workflow {} from {} . Raw answer:\n{}".format( - version_id, workflow_id_str, trs_endpoint, rawToolDesc - ) - ) - toolVersion = None - toolVersionId = str(version_id) if isinstance(version_id, int) else version_id - if (toolVersionId is not None) and len(toolVersionId) > 0: - for possibleToolVersion in possibleToolVersions: - if isinstance(possibleToolVersion, dict): - possibleId = str(possibleToolVersion.get("id", "")) - possibleName = str(possibleToolVersion.get("name", "")) - if version_id in (possibleId, possibleName): - toolVersion = possibleToolVersion - break - else: - raise WFException( - "Version {} not found in workflow {} from {} . 
Raw answer:\n{}".format( - version_id, workflow_id_str, trs_endpoint, rawToolDesc + # Go to the next repo only if it is recommended + if ( + prefer_upstream_source + and materialized_repo.recommends_upstream + and materialized_repo.upstream_repo is not None + ): + try: + ( + upstream_repo_path, + upstream_materialized_repo, + upstream_workflow_type, + upstream_downstream_repos, + ) = self.doMaterializeRepo( + materialized_repo.upstream_repo, + prefer_upstream_source=prefer_upstream_source, + doUpdate=doUpdate, + registerInCache=registerInCache, + offline=offline, ) - ) - else: - toolVersionId = "" - for possibleToolVersion in possibleToolVersions: - possibleToolVersionId = str(possibleToolVersion.get("id", "")) - if ( - len(possibleToolVersionId) > 0 - and toolVersionId < possibleToolVersionId - ): - toolVersion = possibleToolVersion - toolVersionId = possibleToolVersionId - - if toolVersion is None: - raise WFException( - "No valid version was found in workflow {} from {} . Raw answer:\n{}".format( - workflow_id_str, trs_endpoint, rawToolDesc - ) - ) - - # The version has been found - toolDescriptorTypes = toolVersion.get("descriptor_type", []) - if not isinstance(toolDescriptorTypes, list): - raise WFException( - 'Version {} of workflow {} from {} has no valid "descriptor_type" (should be a list). Raw answer:\n{}'.format( - version_id, workflow_id_str, trs_endpoint, rawToolDesc - ) - ) - - # Now, realize whether it matches - chosenDescriptorType = descriptor_type - if chosenDescriptorType is None: - for candidateDescriptorType in self.RECOGNIZED_TRS_DESCRIPTORS.keys(): - if candidateDescriptorType in toolDescriptorTypes: - chosenDescriptorType = candidateDescriptorType - break - else: - raise WFException( - 'Version {} of workflow {} from {} has no acknowledged "descriptor_type". Raw answer:\n{}'.format( - version_id, workflow_id_str, trs_endpoint, rawToolDesc + downstream_repos.extend(upstream_downstream_repos) + return ( + upstream_repo_path, + upstream_materialized_repo, + upstream_workflow_type, + downstream_repos, ) - ) - elif chosenDescriptorType not in toolVersion["descriptor_type"]: - raise WFException( - "Descriptor type {} not available for version {} of workflow {} from {} . Raw answer:\n{}".format( - descriptor_type, - version_id, - workflow_id_str, - trs_endpoint, - rawToolDesc, - ) - ) - elif chosenDescriptorType not in self.RECOGNIZED_TRS_DESCRIPTORS: - raise WFException( - "Descriptor type {} is not among the acknowledged ones by this backend. Version {} of workflow {} from {} . Raw answer:\n{}".format( - descriptor_type, - version_id, - workflow_id_str, - trs_endpoint, - rawToolDesc, - ) - ) - - toolFilesURL = ( - trs_tools_url - + "/versions/" - + urllib.parse.quote(toolVersionId, safe="") - + "/" - + urllib.parse.quote(chosenDescriptorType, safe="") - + "/files" - ) - - # Detecting whether RO-Crate trick will work - if trs_endpoint_meta.get("organization", {}).get("name") == "WorkflowHub": - self.logger.debug("WorkflowHub workflow") - # And this is the moment where the RO-Crate must be fetched - roCrateURL = cast( - "URIType", - toolFilesURL + "?" + urllib.parse.urlencode({"format": "zip"}), + except Exception as e: + self.logger.warning( + f"Recommended upstream repo {materialized_repo.upstream_repo} from repo {repo} could not be fetched, skipping. 
Exception: {e}" + ) + elif repo.repo_type not in (RepoType.Raw, None): + raise WfExSBackendException( + f"Don't know how to materialize {repo.repo_url} (of type {repo.repo_type}) as a repository" ) - + else: + downstream_repos = [] + # Let's try guessing whether it is an RO-Crate ( i_workflow, - self.cacheROCrateFilename, + cached_putative_path, metadata_array, - _, + repo_rel_path, ) = self.getWorkflowBundleFromURI( - roCrateURL, - expectedEngineDesc=self.RECOGNIZED_TRS_DESCRIPTORS[ - chosenDescriptorType - ], + repo.repo_url, + prefer_upstream_source=prefer_upstream_source, + ignoreCache=doUpdate, + registerInCache=registerInCache, offline=offline, - ignoreCache=ignoreCache, ) - assert i_workflow is not None - return i_workflow, None - else: - self.logger.debug("TRS workflow") - # Learning the available files and maybe - # which is the entrypoint to the workflow - cached_trs_files = self.cacheFetch( - cast("URIType", INTERNAL_TRS_SCHEME_PREFIX + ":" + toolFilesURL), - CacheType.TRS, - offline=offline, - ignoreCache=ignoreCache, - ) - - expectedEngineDesc = self.RECOGNIZED_TRS_DESCRIPTORS[chosenDescriptorType] - trs_meta = cached_trs_files.metadata_array[0] - remote_workflow_entrypoint = trs_meta.metadata.get( - "remote_workflow_entrypoint" - ) - if remote_workflow_entrypoint is not None: - # Give it a chance to identify the original repo of the workflow - repo = self.guess_repo_params(remote_workflow_entrypoint, fail_ok=True) - if repo is not None: - self.logger.debug( - "Derived repository {} ({} , rel {}) from {}".format( - repo.repo_url, repo.tag, repo.rel_path, trs_tools_url - ) + if i_workflow is not None: + # It is an RO-Crate + downstream_repos.append(repo) + i_workflow_repo = i_workflow.remote_repo + workflow_type = i_workflow.workflow_type + if repo_rel_path is not None: + i_workflow_repo = i_workflow_repo._replace(rel_path=repo_rel_path) + downstream_repos.append(i_workflow_repo) + + # We are assuming it is always recommended + try: + ( + upstream_repo_path, + upstream_materialized_repo, + upstream_workflow_type, + upstream_downstream_repos, + ) = self.doMaterializeRepo( + i_workflow_repo, + prefer_upstream_source=prefer_upstream_source, + doUpdate=doUpdate, + registerInCache=registerInCache, + offline=offline, ) + downstream_repos.extend(upstream_downstream_repos) return ( - IdentifiedWorkflow( - workflow_type=expectedEngineDesc, remote_repo=repo - ), - None, + upstream_repo_path, + upstream_materialized_repo, + upstream_workflow_type, + downstream_repos, ) - - workflow_entrypoint = trs_meta.metadata.get("workflow_entrypoint") - if workflow_entrypoint is not None: - self.logger.debug( - "Using raw files from TRS tool {}".format(trs_tools_url) + except Exception as e: + raise + # TODO: extract and use payload workflow from RO-Crate as a fallback + else: + # It was not an RO-Crate, so it is a raw workflow + repo_path = cached_putative_path + parsed_repo_url = urllib.parse.urlparse(repo.repo_url) + if not repo_rel_path: + if repo_path.is_dir(): + if len(parsed_repo_url.fragment) > 0: + frag_qs = urllib.parse.parse_qs(parsed_repo_url.fragment) + subDirArr = frag_qs.get("subdirectory", []) + if len(subDirArr) > 0: + repo_rel_path = cast("RelPath", subDirArr[0]) + elif len(metadata_array) > 0: + # Let's try getting a pretty filename + # when the workflow is a single file + repo_rel_path = metadata_array[0].preferredName + + # It can be either a relative path to a directory or to a file + # It could be even empty! 
+ if repo_rel_path == "": + repo_rel_path = None + # raise WFException('Unable to guess repository from RO-Crate manifest') + guessed_repo = RemoteRepo( + repo_url=repo.repo_url, + rel_path=repo_rel_path, + repo_type=RepoType.Raw, ) - return ( - IdentifiedWorkflow( - workflow_type=expectedEngineDesc, - remote_repo=RemoteRepo( - repo_url=cast("RepoURL", toolFilesURL), - rel_path=workflow_entrypoint, - repo_type=RepoType.TRS, - ), - ), - cached_trs_files.path, + downstream_repos.append(guessed_repo) + materialized_repo = MaterializedRepo( + local=repo_path, + repo=guessed_repo, + metadata_array=metadata_array, ) - raise WFException("Unable to find a workflow in {}".format(trs_tools_url)) - - def doMaterializeRepo( - self, - repo: "RemoteRepo", - doUpdate: "bool" = True, - registerInCache: "bool" = True, - ) -> "Tuple[pathlib.Path, RepoTag]": - if repo.repo_type not in (RepoType.Other, RepoType.SoftwareHeritage): - ( - remote_url, - repo_effective_checkout, - repo_path, - metadata_array, - ) = self._doMaterializeGitRepo(repo, doUpdate=doUpdate) - elif repo.repo_type == RepoType.SoftwareHeritage: - ( - remote_url, - repo_effective_checkout, - repo_path, - metadata_array, - ) = self._doMaterializeSoftwareHeritageDirOrContent(repo, doUpdate=doUpdate) - else: - raise WfExSBackendException( - f"Don't know how to materialize {repo.repo_url} as a repository" - ) - - if registerInCache: - kind = ContentKind.Directory if repo_path.is_dir() else ContentKind.File - self.cacheHandler.inject( - remote_url, - destdir=self.cacheWorkflowDir, - fetched_metadata_array=metadata_array, - finalCachedFilename=repo_path, - inputKind=kind, - ) - - return repo_path, repo_effective_checkout - - def _doMaterializeGitRepo( - self, - repo: "RemoteRepo", - doUpdate: "bool" = True, - ) -> "Tuple[URIType, RepoTag, pathlib.Path, Sequence[URIWithMetadata]]": - """ - - :param repoURL: - :param repoTag: - :param doUpdate: - :return: - """ - gitFetcherInst = self.instantiateRepoFetcher(GitFetcher) - repoDir, materialized_repo, metadata_array = gitFetcherInst.materialize_repo( - repo.repo_url, - repoTag=repo.tag, - doUpdate=doUpdate, - base_repo_destdir=self.cacheWorkflowDir, - ) - - # Now, let's register the checkout with cache structures - # using its public URI - if not repo.repo_url.startswith("git"): - remote_url = "git+" + repo.repo_url - else: - remote_url = repo.repo_url - - if repo.tag is not None: - remote_url += "@" + repo.tag - - repo_desc: "Optional[Mapping[str, Any]]" = materialized_repo.gen_repo_desc() - if repo_desc is None: - repo_desc = {} - augmented_metadata_array = [ - URIWithMetadata( - uri=cast("URIType", remote_url), - metadata=repo_desc, - ), - *metadata_array, - ] - return ( - cast("URIType", remote_url), - materialized_repo.get_checkout(), - repoDir, - augmented_metadata_array, - ) - - def _doMaterializeSoftwareHeritageDirOrContent( - self, - repo: "RemoteRepo", - doUpdate: "bool" = True, - ) -> "Tuple[URIType, RepoTag, pathlib.Path, Sequence[URIWithMetadata]]": - """ - - :param repoURL: - :param repoTag: - :param doUpdate: - :return: - """ - swhFetcherInst = self.instantiateRepoFetcher(SoftwareHeritageFetcher) - repoDir, materialized_repo, metadata_array = swhFetcherInst.materialize_repo( - cast("RepoURL", repo.tag) if repo.tag is not None else repo.repo_url, - doUpdate=doUpdate, - base_repo_destdir=self.cacheWorkflowDir, - ) - - repo_desc: "Optional[Mapping[str, Any]]" = materialized_repo.gen_repo_desc() - if repo_desc is None: - repo_desc = {} - augmented_metadata_array = [ - URIWithMetadata( - 
uri=cast("URIType", repo.repo_url), - metadata=repo_desc, - ), - *metadata_array, - ] - return ( - repo.repo_url, - materialized_repo.get_checkout(), - repoDir, - augmented_metadata_array, - ) + return repo_path, materialized_repo, workflow_type, downstream_repos def getWorkflowBundleFromURI( self, remote_url: "URIType", expectedEngineDesc: "Optional[WorkflowType]" = None, + prefer_upstream_source: "bool" = True, offline: "bool" = False, ignoreCache: "bool" = False, registerInCache: "bool" = True, @@ -2644,15 +2378,29 @@ def getWorkflowBundleFromURI( if cached_content.path.is_file(): # Now, let's guess whether it is a possible RO-Crate or a bare file - encoding = magic.from_file(cached_content.path.as_posix(), mime=True) + metadata_file = cached_content.path + encoding = magic.from_file(metadata_file.as_posix(), mime=True) + elif cached_content.path.is_dir(): + metadata_file = cached_content.path / ROCRATE_JSONLD_FILENAME + if metadata_file.is_file(): + encoding = magic.from_file(metadata_file.as_posix(), mime=True) + else: + # A directory does not have mime type + encoding = "" else: - # A directory does not have mime type - encoding = "" - if encoding == "application/zip": - self.logger.info( - "putative workflow {} seems to be a packed RO-Crate".format(remote_url) + raise WfExSBackendException( + f"Unexpected cached path {cached_content.path}, which is neither file nor directory" ) + if encoding in ("application/zip", "application/json"): + if encoding == "application/zip": + info_message = ( + f"putative workflow {remote_url} seems to be a packed RO-Crate" + ) + else: + info_message = f"putative workflow from {remote_url} seems to be an unpacked RO-Crate" + self.logger.info(info_message) + crate_hashed_id = hashlib.sha1(remote_url.encode("utf-8")).hexdigest() roCrateFile = pathlib.Path(self.cacheROCrateDir) / ( crate_hashed_id + self.DEFAULT_RO_EXTENSION @@ -2665,27 +2413,43 @@ def getWorkflowBundleFromURI( roCrateFile, ) - identified_workflow = self.getWorkflowRepoFromROCrateFile( - roCrateFile, expectedEngineDesc - ) - return ( - identified_workflow, - roCrateFile, - cached_content.metadata_array, - identified_workflow.remote_repo.rel_path, - ) - else: - return ( - None, - cached_content.path, - cached_content.metadata_array, - None, - ) + try: + identified_workflow = self.getWorkflowRepoFromROCrateFile( + roCrateFile, + expectedEngineDesc=expectedEngineDesc, + prefer_upstream_source=prefer_upstream_source, + offline=offline, + ignoreCache=ignoreCache, + registerInCache=registerInCache, + ) + return ( + identified_workflow, + roCrateFile, + cached_content.metadata_array, + identified_workflow.remote_repo.rel_path, + ) + except Exception as e: + self.logger.info( + f"Putative workflow from {remote_url} is considered a raw one." 
+ ) + self.logger.debug(f"Rejection traces {e}") + + # Default return + return ( + None, + cached_content.path, + cached_content.metadata_array, + None, + ) def getWorkflowRepoFromROCrateFile( self, roCrateFile: "pathlib.Path", expectedEngineDesc: "Optional[WorkflowType]" = None, + prefer_upstream_source: "bool" = True, + offline: "bool" = False, + ignoreCache: "bool" = False, + registerInCache: "bool" = True, ) -> "IdentifiedWorkflow": """ @@ -2721,6 +2485,9 @@ def getWorkflowRepoFromROCrateFile( # the branch/tag/checkout , and the relative directory in the # fetched content (needed by Nextflow) + # TODO: honour prefer_upstream_source parameter when it is false + # and the payload of the RO-Crate contains a copy of the workflow + # Some RO-Crates might have this value missing or ill-built repo, workflow_type, _ = self.rocrate_toolbox.extractWorkflowMetadata( g, @@ -2737,11 +2504,18 @@ def getWorkflowRepoFromROCrateFile( ) # We need this additional step to guess the repo type - guessedRepo = self.guess_repo_params(repo.repo_url, fail_ok=True) - if guessedRepo is None or guessedRepo.repo_type is None: + guessed = self.guess_repo_params( + repo.repo_url, + offline=offline, + ignoreCache=ignoreCache, + registerInCache=registerInCache, + fail_ok=True, + ) + if guessed is None or guessed[0].repo_type is None: raise WfExSBackendException( f"Unable to guess repository from RO-Crate manifest obtained from {public_name}" ) + guessedRepo = guessed[0] # Rescuing some values if repo.tag is not None and guessedRepo.tag is None: diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 5a982aba..7564b0b9 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -345,7 +345,7 @@ from .utils.zipfile_path import path_relative_to from .fetchers.trs_files import ( - TRS_SCHEME_PREFIX, + GA4GHTRSFetcher, ) if TYPE_CHECKING: @@ -449,7 +449,6 @@ class WF: DEFAULT_TRS_ENDPOINT: "Final[str]" = ( "https://dev.workflowhub.eu/ga4gh/trs/v2/" # root of GA4GH TRS API ) - TRS_TOOLS_PATH: "Final[str]" = "tools/" def __init__( self, @@ -457,7 +456,8 @@ def __init__( workflow_id: "Optional[WorkflowId]" = None, version_id: "Optional[WFVersionId]" = None, descriptor_type: "Optional[TRS_Workflow_Descriptor]" = None, - trs_endpoint: "str" = DEFAULT_TRS_ENDPOINT, + trs_endpoint: "Optional[str]" = None, + prefer_upstream_source: "Optional[bool]" = None, params: "Optional[ParamsBlock]" = None, enabled_profiles: "Optional[Sequence[str]]" = None, environment: "Optional[EnvironmentBlock]" = None, @@ -604,6 +604,8 @@ def __init__( workflow_meta["workflow_type"] = descriptor_type if trs_endpoint is not None: workflow_meta["trs_endpoint"] = trs_endpoint + if prefer_upstream_source is not None: + workflow_meta["prefer_upstream_source"] = prefer_upstream_source if workflow_config is not None: workflow_meta["workflow_config"] = workflow_config if params is not None: @@ -642,6 +644,9 @@ def __init__( self.id = str(workflow_id) if workflow_id is not None else None self.version_id = str(version_id) if version_id is not None else None self.descriptor_type = descriptor_type + self.prefer_upstream_source = ( + prefer_upstream_source if prefer_upstream_source is not None else True + ) self.params = params self.enabled_profiles = enabled_profiles self.environment = environment @@ -654,21 +659,19 @@ def __init__( [] if default_actions is None else default_actions ) + # We are assuming here the provided TRS endpoint is right # The endpoint should always end with a slash if isinstance(trs_endpoint, str): if trs_endpoint[-1] != "/": trs_endpoint += "/" - # Removing the tools suffix, which appeared in first WfExS iterations - if trs_endpoint.endswith("/" + self.TRS_TOOLS_PATH): - trs_endpoint = trs_endpoint[0 : -len(self.TRS_TOOLS_PATH)] - self.trs_endpoint = trs_endpoint else: self.trs_endpoint = None self.id = None self.version_id = None self.descriptor_type = None + self.prefer_upstream_source = True if instanceId is not None: self.instanceId = instanceId @@ -897,7 +900,6 @@ def __init__( self.arch: "Optional[ProcessorArchitecture]" = None self.stagedExecutions: "Optional[MutableSequence[StagedExecution]]" = None - self.cacheROCrateFilename: "Optional[pathlib.Path]" = None self.runExportActions: "Optional[MutableSequence[MaterializedExportAction]]" = ( None @@ -933,7 +935,7 @@ def getPID(self) -> "Optional[str]": the_pid = urllib.parse.urlunparse( urllib.parse.ParseResult( - scheme=TRS_SCHEME_PREFIX, + scheme=GA4GHTRSFetcher.TRS_SCHEME_PREFIX, netloc=parsedTRSURL.netloc, path="/".join(pid_steps), params="", @@ -1833,12 +1835,19 @@ def FromDescription( # It should not happen enabled_profiles = [str(profiles)] + parsed_workflow_id = urllib.parse.urlparse(workflow_meta["workflow_id"]) + if parsed_workflow_id.scheme != "": + trs_endpoint = workflow_meta.get("trs_endpoint") + else: + trs_endpoint = workflow_meta.get("trs_endpoint", cls.DEFAULT_TRS_ENDPOINT) + return cls( wfexs, workflow_meta["workflow_id"], workflow_meta.get("version"), descriptor_type=workflow_meta.get("workflow_type"), - trs_endpoint=workflow_meta.get("trs_endpoint", cls.DEFAULT_TRS_ENDPOINT), + trs_endpoint=trs_endpoint, + prefer_upstream_source=workflow_meta.get("prefer_upstream_source"), params=workflow_meta.get("params", 
dict()), enabled_profiles=enabled_profiles, environment=workflow_meta.get("environment", dict()), @@ -1897,12 +1906,19 @@ def FromForm( # It should not happen enabled_profiles = [str(profiles)] + parsed_workflow_id = urllib.parse.urlparse(workflow_meta["workflow_id"]) + if parsed_workflow_id.scheme != "": + trs_endpoint = workflow_meta.get("trs_endpoint") + else: + trs_endpoint = workflow_meta.get("trs_endpoint", cls.DEFAULT_TRS_ENDPOINT) + return cls( wfexs, workflow_meta["workflow_id"], workflow_meta.get("version"), descriptor_type=workflow_meta.get("workflow_type"), - trs_endpoint=workflow_meta.get("trs_endpoint", cls.DEFAULT_TRS_ENDPOINT), + trs_endpoint=trs_endpoint, + prefer_upstream_source=workflow_meta.get("prefer_upstream_source"), params=workflow_meta.get("params", dict()), enabled_profiles=enabled_profiles, environment=workflow_meta.get("environment", dict()), @@ -1923,6 +1939,7 @@ def fetchWorkflow( version_id: "Optional[WFVersionId]", trs_endpoint: "Optional[str]", descriptor_type: "Optional[TRS_Workflow_Descriptor]", + prefer_upstream_source: "bool" = True, offline: "bool" = False, ignoreCache: "bool" = False, injectable_repo: "Optional[Tuple[RemoteRepo, WorkflowType]]" = None, @@ -1944,6 +1961,7 @@ def fetchWorkflow( repoDir: "Optional[pathlib.Path]" = None injected_workflow: "Optional[LocalWorkflow]" = None rel_path_files: "Optional[Sequence[Union[RelPath, URIType]]]" = None + # Materialize the workflow, even if it was already materialized if self.remote_repo is None or ignoreCache: repoEffectiveCheckout: "Optional[RepoTag]" # Injectable repo info is a precondition for injectable local workflow @@ -1996,11 +2014,21 @@ def fetchWorkflow( f"Injected workflow has a different relPath from the injected repo" ) else: - repoDir, repoEffectiveCheckout = self.wfexs.doMaterializeRepo( + ( + repoDir, + materialized_repo, + workflow_type, + downstream_repos, + ) = self.wfexs.doMaterializeRepo( repo, + prefer_upstream_source=prefer_upstream_source, doUpdate=ignoreCache, # registerInCache=True, + offline=offline, ) + assert len(downstream_repos) > 0 + repo = materialized_repo.repo + repoEffectiveCheckout = repo.get_checkout() else: ( repoDir, @@ -2011,6 +2039,7 @@ def fetchWorkflow( workflow_id=workflow_id, version_id=version_id, trs_endpoint=trs_endpoint, + prefer_upstream_source=prefer_upstream_source, descriptor_type=descriptor_type, ignoreCache=ignoreCache, offline=offline, @@ -2120,7 +2149,7 @@ def fetchWorkflow( ) else: raise WFException( - "No engine recognized a valid workflow at {}".format(self.repoURL) + f"No engine recognized a valid workflow at {self.repoURL} ({localWorkflow})" ) else: self.logger.debug("Fixed engine " + self.engineDesc.trs_descriptor) @@ -2178,6 +2207,7 @@ def setupEngine( self.version_id, self.trs_endpoint, self.descriptor_type, + prefer_upstream_source=self.prefer_upstream_source, offline=offline, ignoreCache=ignoreCache, injectable_repo=injectable_repo, @@ -4285,6 +4315,7 @@ def exportResults( def staging_recipe(self) -> "WritableWorkflowMetaConfigBlock": workflow_meta: "WritableWorkflowMetaConfigBlock" = { "workflow_id": self.id, + "prefer_upstream_source": self.prefer_upstream_source, "paranoid_mode": self.paranoidMode, } if self.nickname is not None: @@ -4407,6 +4438,9 @@ def unmarshallConfig( self.version_id = workflow_meta.get("version") self.descriptor_type = workflow_meta.get("workflow_type") self.trs_endpoint = workflow_meta.get("trs_endpoint") + self.prefer_upstream_source = workflow_meta.get( + "prefer_upstream_source", True + ) 
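Both the FromDescription and FromForm hunks above apply the same defaulting rule: when workflow_id parses with a URI scheme, trs_endpoint is taken verbatim from the metadata (possibly None), and only scheme-less identifiers fall back to the default GA4GH TRS endpoint. A self-contained restatement of that rule follows; DEFAULT_TRS_ENDPOINT is copied from the class constant shown earlier in the patch, while resolve_trs_endpoint and the sample identifiers are illustrative only.

# Hypothetical restatement of the trs_endpoint defaulting introduced above.
import urllib.parse
from typing import Any, Mapping, Optional

DEFAULT_TRS_ENDPOINT = "https://dev.workflowhub.eu/ga4gh/trs/v2/"  # root of GA4GH TRS API


def resolve_trs_endpoint(workflow_meta: Mapping[str, Any]) -> Optional[str]:
    parsed_workflow_id = urllib.parse.urlparse(str(workflow_meta["workflow_id"]))
    if parsed_workflow_id.scheme != "":
        # Full URI (e.g. https://raw.githubusercontent.com/.../workflow.cwl):
        # keep whatever the metadata says, even if that is no endpoint at all.
        return workflow_meta.get("trs_endpoint")
    # Bare identifiers and Dockstore-style '#workflow/...' ids have no scheme,
    # so they keep the default endpoint unless one is given explicitly.
    return workflow_meta.get("trs_endpoint", DEFAULT_TRS_ENDPOINT)


assert resolve_trs_endpoint({"workflow_id": "123"}) == DEFAULT_TRS_ENDPOINT
assert resolve_trs_endpoint({"workflow_id": "https://example.org/wf.cwl"}) is None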
         self.workflow_config = workflow_meta.get("workflow_config")
         self.params = workflow_meta.get("params")
         profiles: "Optional[Union[str, Sequence[str]]]" = workflow_meta.get(
@@ -5443,7 +5477,10 @@ def createStageResearchObject(
         assert self.localWorkflow is not None
         assert self.materializedEngine is not None
         assert self.remote_repo is not None
-        assert self.remote_repo.tag is not None
+        assert self.remote_repo.tag is not None or self.remote_repo.repo_type in (
+            RepoType.Raw,
+            None,
+        )
         assert self.materializedParams is not None
         assert self.materializedEnvironment is not None
         assert self.staged_setup.work_dir is not None
@@ -5517,7 +5554,10 @@ def createResultsResearchObject(
         assert self.localWorkflow is not None
         assert self.materializedEngine is not None
         assert self.remote_repo is not None
-        assert self.remote_repo.tag is not None
+        assert self.remote_repo.tag is not None or self.remote_repo.repo_type in (
+            RepoType.Raw,
+            None,
+        )
         assert self.staged_setup.work_dir is not None
         assert (
             isinstance(self.stagedExecutions, list) and len(self.stagedExecutions) > 0
diff --git a/wfexs_backend/workflow_engines/nextflow_engine.py b/wfexs_backend/workflow_engines/nextflow_engine.py
index b6332bca..ce0a148b 100644
--- a/wfexs_backend/workflow_engines/nextflow_engine.py
+++ b/wfexs_backend/workflow_engines/nextflow_engine.py
@@ -123,7 +123,7 @@
     NoContainerFactory,
 )

-from ..fetchers.http import fetchClassicURL
+from ..fetchers.http import HTTPFetcher
 from ..utils.contents import (
     copy2_nofollow,
 )
@@ -862,7 +862,7 @@ def runLocalNextflowCommand(
                     nextflow_version, nextflow_script_url, cachedScript
                 )
             )
-            fetchClassicURL(nextflow_script_url, cachedScript)
+            HTTPFetcher().fetch(nextflow_script_url, cachedScript)

         # Checking the installer has execution permissions
         if not os.access(cachedScript, os.R_OK | os.X_OK):
diff --git a/workflow_examples/somatic_cnv_dockstore_cwl.wfex.stage b/workflow_examples/somatic_cnv_dockstore_cwl.wfex.stage
index dc6850eb..9e4adf4a 100644
--- a/workflow_examples/somatic_cnv_dockstore_cwl.wfex.stage
+++ b/workflow_examples/somatic_cnv_dockstore_cwl.wfex.stage
@@ -1,7 +1,7 @@
 # THIS IS AN INCOMPLETE EXAMPLE (use as template)
 # Use this example only to test TRS access to Dockstore works in stage,
 # as its parameters are not properly set
-trs_endpoint: https://dockstore.org/api/api/ga4gh/v2/
+trs_endpoint: https://dockstore.org/api/ga4gh/v2/
 workflow_id: '#workflow/github.com/sevenbridges-openworkflows/Broad-Best-Practice-Somatic-CNV-Workflows/GATK-Somatic-CNV-Panel-Workflow'
 version: master
 workflow_config:
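The corrected Dockstore endpoint in the stage example above is paired with workflow_id through the GA4GH TRS "tools/" path, the same composition the removed inline TRS code performed with urljoin and a URL-escaped identifier (a job now presumably delegated to GA4GHTRSFetcher). A rough sketch with the values from somatic_cnv_dockstore_cwl.wfex.stage follows; only the trailing-slash normalisation and the quoting mirror the patch, the rest is illustrative.

# Sketch of a GA4GH TRS v2 tool lookup URL built from the stage example values.
import urllib.parse

trs_endpoint = "https://dockstore.org/api/ga4gh/v2/"
workflow_id = (
    "#workflow/github.com/sevenbridges-openworkflows/"
    "Broad-Best-Practice-Somatic-CNV-Workflows/GATK-Somatic-CNV-Panel-Workflow"
)

# The endpoint must end with a slash so urljoin keeps its path prefix
if not trs_endpoint.endswith("/"):
    trs_endpoint += "/"

tool_url = urllib.parse.urljoin(
    trs_endpoint, "tools/" + urllib.parse.quote(workflow_id, safe="")
)
print(tool_url)
# https://dockstore.org/api/ga4gh/v2/tools/%23workflow%2Fgithub.com%2F... (abbreviated)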