From 9937bb1d947b3abd262a7693e6c1ec8138ca4aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 23 Oct 2024 22:24:08 +0200 Subject: [PATCH 01/60] Added method to resolv URLs with `..` , to be used in wfexs_backend.fetchers.trs_files --- wfexs_backend/utils/misc.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/wfexs_backend/utils/misc.py b/wfexs_backend/utils/misc.py index a7041f29..c2412b67 100644 --- a/wfexs_backend/utils/misc.py +++ b/wfexs_backend/utils/misc.py @@ -369,3 +369,14 @@ def get_maximum_file_descriptors() -> "int": result = MAXFD return result + + +def urlresolv(url: "str") -> "str": + if url.endswith("/"): + wident = "." + else: + rslash_idx = url.rfind("/") + if rslash_idx == -1: + return url + wident = url[rslash_idx + 1 :] + return urllib.parse.urljoin(url, wident) From eedcdf176a31dd5c5db33016548895514a3c0b30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 23 Oct 2024 22:25:43 +0200 Subject: [PATCH 02/60] Better error reporting and handling on incomplete git repository materialisation --- wfexs_backend/fetchers/git.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index d019937b..a9dc153c 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -746,14 +746,17 @@ def materialize_repo( with tempfile.NamedTemporaryFile() as git_stdout, tempfile.NamedTemporaryFile() as git_stderr: # First, (bare) clone retval = 0 + failed_command = [] if gitclone_params is not None: self.logger.debug(f'Running "{" ".join(gitclone_params)}"') + failed_command = gitclone_params retval = subprocess.call( gitclone_params, stdout=git_stdout, stderr=git_stderr ) # Then, checkout (which can be optional) if retval == 0 and (gitcheckout_params is not None): self.logger.debug(f'Running "{" ".join(gitcheckout_params)}"') + failed_command = gitcheckout_params retval = subprocess.Popen( gitcheckout_params, stdout=git_stdout, @@ -772,6 +775,7 @@ def materialize_repo( ] self.logger.debug(f'Running "{" ".join(gitsubmodule_params)}"') + failed_command = gitsubmodule_params retval = subprocess.Popen( gitsubmodule_params, stdout=git_stdout, @@ -787,9 +791,21 @@ def materialize_repo( with open(git_stderr.name, "r") as c_stF: git_stderr_v = c_stF.read() - errstr = "ERROR: Unable to pull '{}' (tag '{}'). Retval {}\n======\nSTDOUT\n======\n{}\n======\nSTDERR\n======\n{}".format( - repoURL, repoTag, retval, git_stdout_v, git_stderr_v + errstr = "ERROR: Unable to pull '{}' (tag '{}').\nFailed command: {}\nRetval {}\n======\nSTDOUT\n======\n{}\n======\nSTDERR\n======\n{}".format( + repoURL, + repoTag, + " ".join(failed_command), + retval, + git_stdout_v, + git_stderr_v, ) + + if repo_tag_destpath.exists(): + self.logger.warning( + f"Failed git command, removing incomplete path {repo_tag_destpath.as_posix()}" + ) + shutil.rmtree(repo_tag_destpath, ignore_errors=True) + raise FetcherException(errstr) # Last, we have to obtain the effective checkout From 56f40d5ee7f6c6b1b6497deca16a62859c850542 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 23 Oct 2024 22:26:55 +0200 Subject: [PATCH 03/60] Improved handling of GA4GH TRS v2 materialisation. 
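The `url` fields returned by the TRS endpoints are now normalised with the
`urlresolv` helper introduced in PATCH 01/60, which collapses `.` and `..`
path segments through `urllib.parse.urljoin`. A doctest-style sketch
(editor's illustration; the URL is made up):

    >>> from wfexs_backend.utils.misc import urlresolv
    >>> urlresolv("https://example.org/ga4gh/trs/v2/tools/x/../y")
    'https://example.org/ga4gh/trs/v2/tools/y'
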
Tested with Dockstore, WorkflowHub and Yevis --- wfexs_backend/fetchers/trs_files.py | 364 ++++++++++++++++++++++------ 1 file changed, 287 insertions(+), 77 deletions(-) diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 2bc1f532..5a8696a5 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -18,9 +18,11 @@ from __future__ import absolute_import +import copy import io import json import os +import warnings from typing import ( cast, @@ -42,9 +44,14 @@ URIWithMetadata, ) +from ..utils.misc import ( + urlresolv, +) + if TYPE_CHECKING: from typing import ( Mapping, + MutableMapping, MutableSequence, Optional, Sequence, @@ -99,10 +106,74 @@ def fetchTRSFiles( f"Ill-formed TRS CURIE {remote_file}. It should be in the format of {TRS_SCHEME_PREFIX}://id/version or {TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" ) - version_steps = cast("MutableSequence[str]", path_steps[0:-2]) - version_steps.extend( - ["ga4gh", "trs", "v2", "tools", path_steps[-2], "versions", path_steps[-1]] + trs_base_steps = cast("MutableSequence[str]", path_steps[0:-2]) + trs_base_steps.extend(["ga4gh", "trs", "v2"]) + + # Performing some sanity checks about the API + service_info_steps = copy.copy(trs_base_steps) + service_info_steps.append("service-info") + service_info_metadata_url = cast( + "URIType", + parse.urlunparse( + parse.ParseResult( + scheme="https", + netloc=parsedInputURL.netloc, + path="/".join(service_info_steps), + params="", + query="", + fragment="", + ) + ), ) + service_info_meta = { + "fetched": service_info_metadata_url, + "payload": None, + } + metadata_array.append(URIWithMetadata(remote_file, service_info_meta)) + try: + metaio = io.BytesIO() + _, metametaio, _ = fetchClassicURL(service_info_metadata_url, metaio) + service_info_metadata = json.loads(metaio.getvalue().decode("utf-8")) + service_info_meta["payload"] = service_info_metadata + metadata_array.extend(metametaio) + except FetcherException as fe: + raise FetcherException( + f"Error fetching or processing TRS service info metadata for {remote_file} : {fe.code} {fe.reason}" + ) from fe + + trs_version_str: "Optional[str]" = None + trs_artifact: "Optional[str]" = None + trs_group: "Optional[str]" = None + trs_endpoint_meta_type: "Optional[Mapping[str, str]]" = ( + service_info_metadata.get("type") + ) + if trs_endpoint_meta_type is not None: + trs_version_str = trs_endpoint_meta_type.get("version") + trs_artifact = trs_endpoint_meta_type.get("artifact") + trs_group = trs_endpoint_meta_type.get("group") + + if trs_version_str is None: + errstr = f"Unable to identify TRS version from {service_info_metadata_url}. Is this a TRS endpoint?" 
+ raise FetcherException(errstr) + + # Avoiding querying a GA4GH DRS service, for instance + if trs_artifact is not None and trs_artifact.lower() not in ("trs", "yevis"): + errstr = f"Unsupported GA4GH service {trs_artifact} (group {trs_group}) from {service_info_metadata_url}" + raise FetcherException(errstr) + + # Warning about potentially unsupported versions + trs_version_tuple = tuple(map(int, trs_version_str.split("."))) + if trs_version_tuple < (2, 0, 1): + warnings.warn( + f"{service_info_metadata_url} is offering old TRS version {trs_version_str}, which diverges from what this implementation supports" + ) + elif trs_version_tuple > (3, 0): + warnings.warn( + f"{service_info_metadata_url} is offering TRS version {trs_version_str}, which might diverge from what this implementation supports" + ) + + version_steps = copy.copy(trs_base_steps) + version_steps.extend(["tools", path_steps[-2], "versions", path_steps[-1]]) version_metadata_url = cast( "URIType", parse.urlunparse( @@ -195,99 +266,238 @@ def fetchTRSFiles( os.makedirs(cachedFilename, exist_ok=True) absdirs = set() emptyWorkflow = True + # First pass, identify primary descriptor / workflow entrypoint # and learn whether the destination paths should be sanitized - deepest_file_rel = 1 + is_abs_url = False + is_anon = False + file_rel_2_url: "MutableMapping[str, str]" = dict() for file_desc in metadata: file_rel_path = file_desc.get("path") - if file_rel_path is not None: - frp_parsed = parse.urlparse(file_rel_path) - if frp_parsed.scheme in ("http", "https", "ftp"): - # An absolute URL, like in the case of DDBJ TRS implementation - # A mixure of resource might be catastrophic, the code is doing - # its best effort - file_rel_path = os.path.join(frp_parsed.netloc, frp_parsed.params) - - # BEWARE! The relpath could contain references to parent directories - # escaping from the URL to be built and from the download "sandbox" - # Avoid absolute paths corner case before splitting - file_rel_path_steps = file_rel_path.lstrip("/").split("/") - - file_rel_depth = ( - len(file_rel_path_steps) - - file_rel_path_steps.count(".") - - file_rel_path_steps.count("") - - 2 * file_rel_path_steps.count("..") + if file_rel_path is None: + continue + + emptyWorkflow = False + + # BEWARE! 
The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + frp_parsed = parse.urlparse(file_rel_path) + is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") + + if is_abs_url: + # This one has to be dealt with a shortcut + file_rel_2_url[file_rel_path] = urlresolv(file_rel_path) + continue + + descriptor_url = cast( + "URIType", + descriptor_base_url + parse.quote(file_rel_path, safe="/"), + ) + try: + descmetaio = io.BytesIO() + _, descmetaelem, _ = fetchClassicURL( + descriptor_url, descmetaio, {"headers": {"Accept": "application/json"}} + ) + descriptor_meta = json.loads(descmetaio.getvalue().decode("utf-8")) + except FetcherException as fe: + raise FetcherException( + "Error fetching or processing TRS descriptor metadata for {} : {} {}".format( + descriptor_url, fe.code, fe.reason + ) + ) from fe + + is_anon = ( + not isinstance(descriptor_meta, dict) or descriptor_meta.get("url") is None + ) + if is_anon: + # This one has to be dealt in a different way + break + file_rel_2_url[file_rel_path] = urlresolv(descriptor_meta["url"]) + + if emptyWorkflow: + raise FetcherException( + "Error processing TRS files for {} : no file was found.\n{}".format( + remote_file, metadata ) - if file_rel_depth < deepest_file_rel: - deepest_file_rel = file_rel_depth + ) + + if is_anon: + prefix_url = "" + else: + prefix_url = os.path.commonpath(tuple(file_rel_2_url.values())) # We have to create anonymous directories to avoid leaving the download "sandbox" abs_download_dir = cachedFilename - if deepest_file_rel < 1: - for depth in range(deepest_file_rel, 1): - abs_download_dir = cast( - "AbsPath", os.path.join(abs_download_dir, f"unnamed{depth}") + if "/" in prefix_url: + # This is needed to perform an effective work + prefix_url += "/" + # Due the peversion of commonpath, double slashes are collapsed + colon_pos = prefix_url.find(":") + if colon_pos > 0: + prefix_url = ( + prefix_url[0 : colon_pos + 1] + "/" + prefix_url[colon_pos + 1 :] ) - # Second pass, fetching the contents, sanitizing the destination paths - for file_desc in metadata: - file_rel_path = file_desc.get("path") - if file_rel_path is not None: - emptyWorkflow = False - - # BEWARE! The relpath could contain references to parent directories - # escaping from the URL to be built and from the download "sandbox" - frp_parsed = parse.urlparse(file_rel_path) - is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") - if is_abs_url: - # An absolute URL, like in the case of DDBJ TRS implementation - file_url = cast("URIType", file_rel_path) + # Computing resolved relative paths + for file_desc in metadata: + file_rel_path = file_desc.get("path") + if file_rel_path is not None: + # BEWARE! 
The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + frp_parsed = parse.urlparse(file_rel_path) + is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") + if is_abs_url: + # An absolute URL, like in the case of DDBJ TRS implementation + file_url = cast("URIType", file_rel_path) + else: + file_url = cast( + "URIType", + descriptor_base_url + parse.quote(file_rel_path, safe="/"), + ) + local_rel_path = file_rel_2_url[file_rel_path][len(prefix_url) :] absfile = cast( - "AbsPath", - os.path.join( - abs_download_dir, frp_parsed.netloc, frp_parsed.path.lstrip("/") - ), + "AbsPath", os.path.join(abs_download_dir, local_rel_path) ) - else: - file_url = cast( - "URIType", - descriptor_base_url + parse.quote(file_rel_path, safe="/"), + + # Intermediate path creation + absdir = os.path.dirname(absfile) + if absdir not in absdirs: + absdirs.add(absdir) + os.makedirs(absdir, exist_ok=True) + real_rel_path = os.path.relpath( + os.path.normpath(absfile), cachedFilename ) - absfile = cast( - "AbsPath", os.path.join(abs_download_dir, file_rel_path.lstrip("/")) + + # When it is the primary descriptor, it is fetched twice + if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR": + topMeta["workflow_entrypoint"] = cast("URIType", real_rel_path) + if is_abs_url: + topMeta["remote_workflow_entrypoint"] = file_url + else: + topMeta["remote_workflow_entrypoint"] = cast( + "URIType", file_rel_2_url[file_rel_path] + ) + + # Getting the raw content + accept_val = "*/*" if is_abs_url else "text/plain" + _, metaelem, _ = fetchClassicURL( + file_url, absfile, {"headers": {"Accept": accept_val}} + ) + metadata_array.extend(metaelem) + else: + # First pass, identify primary descriptor / workflow entrypoint + # and learn whether the destination paths should be sanitized + deepest_file_rel = 0 + for file_desc in metadata: + file_rel_path = file_desc.get("path") + if file_rel_path is not None: + frp_parsed = parse.urlparse(file_rel_path) + if frp_parsed.scheme in ("http", "https", "ftp"): + # An absolute URL, like in the case of DDBJ TRS implementation + # A mixure of resource might be catastrophic, the code is doing + # its best effort + file_rel_path = os.path.join(frp_parsed.netloc, frp_parsed.params) + + # BEWARE! 
The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + # Avoid absolute paths corner case before splitting + file_rel_path_steps = file_rel_path.lstrip("/").split("/") + + deepest = 0 + depth = 0 + for step in file_rel_path_steps: + if step == "..": + depth -= 1 + if depth < deepest: + deepest = depth + elif step not in (".", ""): + depth += 1 + + if deepest < deepest_file_rel: + deepest_file_rel = deepest + + if deepest_file_rel < 0: + for depth in range(-deepest_file_rel): + abs_download_dir = cast( + "AbsPath", os.path.join(abs_download_dir, f"unnamed{depth}") ) - # Intermediate path creation - absdir = os.path.dirname(absfile) - if absdir not in absdirs: - absdirs.add(absdir) - os.makedirs(absdir, exist_ok=True) - real_rel_path = os.path.relpath(os.path.normpath(absfile), cachedFilename) + # Second pass, fetching the contents, sanitizing the destination paths + for file_desc in metadata: + file_rel_path = file_desc.get("path") + if file_rel_path is not None: + emptyWorkflow = False - # When it is the primary descriptor, it is fetched twice - if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR": - topMeta["workflow_entrypoint"] = cast("URIType", real_rel_path) + # BEWARE! The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + frp_parsed = parse.urlparse(file_rel_path) + is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") if is_abs_url: - topMeta["remote_workflow_entrypoint"] = file_url + # An absolute URL, like in the case of DDBJ TRS implementation + file_url = cast("URIType", file_rel_path) + absfile = cast( + "AbsPath", + os.path.join( + abs_download_dir, + frp_parsed.netloc, + frp_parsed.path.lstrip("/"), + ), + ) else: - descriptorMeta = io.BytesIO() - _, metaprimary, _ = fetchClassicURL(file_url, descriptorMeta) - metadata_array.extend(metaprimary) - - # This metadata can help a lot to get the workflow repo - metadataPD = json.loads(descriptorMeta.getvalue().decode("utf-8")) - topMeta["remote_workflow_entrypoint"] = metadataPD.get("url") - - del descriptorMeta - del metadataPD + file_url = cast( + "URIType", + descriptor_base_url + parse.quote(file_rel_path, safe="/"), + ) + absfile = cast( + "AbsPath", + os.path.join(abs_download_dir, file_rel_path.lstrip("/")), + ) + + # Intermediate path creation + absdir = os.path.dirname(absfile) + if absdir not in absdirs: + absdirs.add(absdir) + os.makedirs(absdir, exist_ok=True) + real_rel_path = os.path.relpath( + os.path.normpath(absfile), cachedFilename + ) - accept_val = "*/*" if is_abs_url else "text/plain" - # Getting the raw content - _, metaelem, _ = fetchClassicURL( - file_url, absfile, {"headers": {"Accept": accept_val}} - ) - metadata_array.extend(metaelem) + # When it is the primary descriptor, it is fetched twice + if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR": + topMeta["workflow_entrypoint"] = cast("URIType", real_rel_path) + if is_abs_url: + topMeta["remote_workflow_entrypoint"] = file_url + else: + descriptorMeta = io.BytesIO() + _, metaprimary, _ = fetchClassicURL(file_url, descriptorMeta) + metadata_array.extend(metaprimary) + + # This metadata can help a lot to get the workflow repo + metadataPD = json.loads( + descriptorMeta.getvalue().decode("utf-8") + ) + topMeta["remote_workflow_entrypoint"] = metadataPD.get("url") + + del descriptorMeta + del metadataPD + + # Getting the raw content + accept_val = "*/*" if is_abs_url else "text/plain" + try: + _, 
metaelem, _ = fetchClassicURL( + file_url, absfile, {"headers": {"Accept": accept_val}} + ) + metadata_array.extend(metaelem) + except FetcherException as fe: + if file_desc.get("file_type") in ( + "PRIMARY_DESCRIPTOR", + "SECONDARY_DESCRIPTOR", + ): + raise + else: + warnings.warn( + f"Unable to fetch {file_url}. TRS Dataset {metadata_url} might be incomplete" + ) if emptyWorkflow: raise FetcherException( From c740509e72f5a178e4b13c7d115d8d2acb8e6776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 23 Oct 2024 22:29:23 +0200 Subject: [PATCH 04/60] Better identification and error handling of workflows coming from GA4GH repositories. Also, added fallback to fetched TRS files for cases where the original repository does not contain the workflow any more. --- wfexs_backend/wfexs_backend.py | 186 +++++++++++++++++++++++++++++++-- 1 file changed, 176 insertions(+), 10 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 822b7a4c..511c8b5b 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -132,6 +132,7 @@ AbstractStatefulFetcher, DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher, + FetcherException, RemoteRepo, RepoType, ) @@ -2083,7 +2084,7 @@ def cacheWorkflow( repoDir: "Optional[pathlib.Path]" = None putative: "bool" = False cached_putative_path: "Optional[pathlib.Path]" = None - if parsedRepoURL.scheme in ("", TRS_SCHEME_PREFIX): + if parsedRepoURL.scheme in ("", TRS_SCHEME_PREFIX, INTERNAL_TRS_SCHEME_PREFIX): # Extracting the TRS endpoint details from the parsedRepoURL if parsedRepoURL.scheme == TRS_SCHEME_PREFIX: # Duplication of code borrowed from trs_files.py @@ -2107,6 +2108,113 @@ def cacheWorkflow( workflow_id = urllib.parse.unquote(path_steps[-2]) version_id = urllib.parse.unquote(path_steps[-1]) + elif parsedRepoURL.scheme == INTERNAL_TRS_SCHEME_PREFIX: + # Time to try guessing everything + try: + internal_trs_cached_content = self.cacheHandler.fetch( + cast("URIType", parsedRepoURL.path), + destdir=meta_dir, + offline=offline, + ignoreCache=ignoreCache, + ) + + with internal_trs_cached_content.path.open( + mode="r", encoding="utf-8" + ) as ctmf: + trs__meta = json.load(ctmf) + except Exception as e: + raise WFException( + f"trs_endpoint could not be guessed from {parsedRepoURL.path} (raised exception)" + ) from e + + if not isinstance(trs__meta, dict): + raise WFException( + f"trs_endpoint could not be guessed from {parsedRepoURL.path} (not returning JSON object)" + ) + + trs__meta__id: "Optional[str]" = trs__meta.get("id") + if trs__meta__id is None: + raise WFException( + f"trs_endpoint could not be guessed from {parsedRepoURL.path} (not returning id)" + ) + + trs__meta__url: "Optional[str]" = trs__meta.get("url") + if trs__meta__url is None: + raise WFException( + f"trs_endpoint could not be guessed from {parsedRepoURL.path} (not returning url)" + ) + + if "descriptor_type" in trs__meta: + version_id = trs__meta__id + # Now we need to backtrack in the url to get the workflow id + tool_url_suffix = "/versions/" + urllib.parse.quote( + version_id, safe="" + ) + + # If this happens, this implementation is not so compliant with standard + dockstore_tool_url_suffix = "/versions/" + urllib.parse.quote( + trs__meta.get("name", ""), safe="" + ) + if trs__meta__url.endswith(dockstore_tool_url_suffix): + tool_url_suffix = dockstore_tool_url_suffix + version_id = trs__meta.get("name", "") + + if not trs__meta__url.endswith(tool_url_suffix): + raise WFException( 
+ f"trs_endpoint could not be guessed from {parsedRepoURL.path} and {trs__meta__url} (version {version_id}, mismatched API route)" + ) + + trs_tool_url = trs__meta__url[0 : -len(tool_url_suffix)] + try: + internal_trs_cached_content = self.cacheHandler.fetch( + cast("URIType", trs_tool_url), + destdir=meta_dir, + offline=offline, + ignoreCache=ignoreCache, + ) + + with internal_trs_cached_content.path.open( + mode="r", encoding="utf-8" + ) as ctmf: + trs__meta = json.load(ctmf) + except Exception as e: + raise WFException( + f"trs_endpoint could not be guessed from {trs_tool_url} (came from {parsedRepoURL.path}, raised exception)" + ) from e + + trs__meta__id = trs__meta.get("id") + if trs__meta__id is None: + raise WFException( + f"trs_endpoint could not be guessed from {trs_tool_url} (came from {parsedRepoURL.path}, not returning id)" + ) + + trs__meta__url = trs__meta.get("url") + if trs__meta__url is None: + raise WFException( + f"trs_endpoint could not be guessed from {trs_tool_url} (came from {parsedRepoURL.path}, not returning url)" + ) + else: + trs_tool_url = parsedRepoURL.path + version_id = None + + if "toolclass" in trs__meta: + workflow_id = trs__meta__id + + # Now we need to backtrack in the url to get the workflow id + tool_url_suffix = "/tools/" + urllib.parse.quote( + workflow_id, safe="" + ) + if not trs__meta__url.endswith(tool_url_suffix): + raise WFException( + f"trs_endpoint could not be guessed from {trs_tool_url} and {trs__meta__url} (mismatched API route)" + ) + + trs_endpoint = trs__meta__url[0 : -len(tool_url_suffix)] + else: + raise WFException( + f"trs_endpoint could not be guessed from {parsedRepoURL.path} (no clues)" + ) + if (trs_endpoint is not None) and len(trs_endpoint) > 0: i_workflow, repoDir = self.getWorkflowRepoFromTRS( trs_endpoint, @@ -2284,14 +2392,41 @@ def getWorkflowRepoFromTRS( with trsMetadataCache.open(mode="r", encoding="utf-8") as ctmf: trs_endpoint_meta = json.load(ctmf) - # Minimal check - trs_version = trs_endpoint_meta.get("api_version") - if trs_version is None: - trs_version = trs_endpoint_meta.get("type", {}).get("version") - - if trs_version is None: - raise WFException( - "Unable to identify TRS version from {}".format(trs_endpoint_meta_url) + # Minimal checks + trs_version_str: "Optional[str]" = None + trs_artifact: "Optional[str]" = None + trs_group: "Optional[str]" = None + trs_endpoint_meta_type: "Optional[Mapping[str, str]]" = trs_endpoint_meta.get( + "type" + ) + if trs_endpoint_meta_type is not None: + trs_version_str = trs_endpoint_meta_type.get("version") + trs_artifact = trs_endpoint_meta_type.get("artifact") + trs_group = trs_endpoint_meta_type.get("group") + else: + # Supporting 2.0beta2 + trs_version_str = trs_endpoint_meta.get("api_version") + + if trs_version_str is None: + errstr = f"Unable to identify TRS version from {trs_endpoint_meta_url}. Is this a TRS endpoint?" 
+ self.logger.error(errstr) + raise WFException(errstr) + + # Avoiding querying a GA4GH DRS service, for instance + if trs_artifact is not None and trs_artifact.lower() not in ("trs", "yevis"): + errstr = f"Unsupported GA4GH service {trs_artifact} (group {trs_group}) from {trs_endpoint_meta_url}" + self.logger.error(errstr) + raise WFException(errstr) + + # Warning about potentially unsupported versions + trs_version_tuple = tuple(map(int, trs_version_str.split("."))) + if trs_version_tuple < (2, 0, 1): + self.logger.warning( + f"{trs_endpoint_meta_url} is offering old TRS version {trs_version_str}, which diverges from what this implementation supports" + ) + elif trs_version_tuple > (3, 0): + self.logger.warning( + f"{trs_endpoint_meta_url} is offering TRS version {trs_version_str}, which might diverge from what this implementation supports" ) # Now, check the tool does exist in the TRS, and the version @@ -2460,11 +2595,42 @@ def getWorkflowRepoFromTRS( remote_workflow_entrypoint = trs_meta.metadata.get( "remote_workflow_entrypoint" ) + trs_files_path: "Optional[pathlib.Path]" = None if remote_workflow_entrypoint is not None: # Give it a chance to identify the original repo of the workflow repo = self.guess_repo_params(remote_workflow_entrypoint, fail_ok=True) + self.logger.error( + f"Now guessing from {remote_workflow_entrypoint} {repo}" + ) if repo is not None: + try: + # This is really, really needed to recognize + # when to fall back to the safe path of what + # we already have + repoDir, repoEffectiveCheckout = self.doMaterializeRepo( + repo, + doUpdate=ignoreCache, + ) + except FetcherException as fe: + self.logger.warning( + f"Repo for {remote_workflow_entrypoint} was guessed, but some element was unreachable. Falling back to GA4GH TRS contents from {toolFilesURL}" + ) + self.logger.warning(f"(nested exception was {fe})") + repo = RemoteRepo(repo_url=cast("RepoURL", toolFilesURL)) + + if repo.repo_type is None: + workflow_entrypoint = trs_meta.metadata.get( + "workflow_entrypoint" + ) + if workflow_entrypoint is not None: + repo = RemoteRepo( + repo_url=cast("RepoURL", toolFilesURL), + rel_path=workflow_entrypoint, + repo_type=RepoType.TRS, + ) + trs_files_path = cached_trs_files.path + self.logger.debug( "Derived repository {} ({} , rel {}) from {}".format( repo.repo_url, repo.tag, repo.rel_path, trs_tools_url @@ -2474,7 +2640,7 @@ def getWorkflowRepoFromTRS( IdentifiedWorkflow( workflow_type=expectedEngineDesc, remote_repo=repo ), - None, + trs_files_path, ) workflow_entrypoint = trs_meta.metadata.get("workflow_entrypoint") From 0496859a183b168d0290d3d176280c60c901ae87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 24 Oct 2024 15:27:26 +0200 Subject: [PATCH 05/60] Added manual step to apply `black` PEP8 styling fixes, so the local instance of black can be discontinued in the future from the development dependencies --- .pre-commit-config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fc75ce2a..c82a90e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -101,6 +101,10 @@ repos: - id: black exclude: "^[^/]*env/|development-[^/]*/|docs/" args: [--diff, --check] + - id: black + name: black_apply + exclude: "^[^/]*env/|development-[^/]*/|docs/" + stages: [manual] - repo: https://github.com/jmfernandez/citation-cff-checker.git rev: v0.1.0 From df8cabacab98886843dba971096c1605eb9fcc9f Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 24 Oct 2024 17:31:49 +0200 Subject: [PATCH 06/60] Added method `materialize_repo_from_repo` to AbstractRepoFetcher, so some code currently living at wfexs_backend.wfexs_backend can be simplified --- wfexs_backend/fetchers/__init__.py | 15 +++++++++++++++ wfexs_backend/fetchers/swh.py | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index bd4cf27d..e0edf1b7 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -309,6 +309,21 @@ def materialize_repo( ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": pass + def materialize_repo_from_repo( + self, + repo: "RemoteRepo", + repo_tag_destdir: "Optional[PathLikePath]" = None, + base_repo_destdir: "Optional[PathLikePath]" = None, + doUpdate: "Optional[bool]" = True, + ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": + return self.materialize_repo( + repoURL=repo.repo_url, + repoTag=repo.tag, + repo_tag_destdir=repo_tag_destdir, + base_repo_destdir=base_repo_destdir, + doUpdate=doUpdate, + ) + @abc.abstractmethod def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": """ diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index 6744dd73..a6acca6e 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -226,6 +226,22 @@ def GuessRepoParams( web_url=web_url, ) + def materialize_repo_from_repo( + self, + repo: "RemoteRepo", + repo_tag_destdir: "Optional[PathLikePath]" = None, + base_repo_destdir: "Optional[PathLikePath]" = None, + doUpdate: "Optional[bool]" = True, + ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": + return self.materialize_repo( + repoURL=cast("RepoURL", repo.tag) + if repo.tag is not None + else repo.repo_url, + repo_tag_destdir=repo_tag_destdir, + base_repo_destdir=base_repo_destdir, + doUpdate=doUpdate, + ) + def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": """ This method is required to generate a PID which usually From ac7df29698d25f83b4d7ad1e694bd2d197a1c47d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 24 Oct 2024 18:51:06 +0200 Subject: [PATCH 07/60] Changed order of manual `black` step in pre-commit, so the fix is applied before the check is performed. 
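For reference: because hooks run in the order they are declared, selecting
the manual stage with `pre-commit run --all-files --hook-stage manual` now
applies the formatting fix before the `--diff --check` hook inspects the
files.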
--- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c82a90e5..42a0018d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -98,13 +98,13 @@ repos: - repo: https://github.com/ambv/black.git rev: 23.3.0 hooks: - - id: black - exclude: "^[^/]*env/|development-[^/]*/|docs/" - args: [--diff, --check] - id: black name: black_apply exclude: "^[^/]*env/|development-[^/]*/|docs/" stages: [manual] + - id: black + exclude: "^[^/]*env/|development-[^/]*/|docs/" + args: [--diff, --check] - repo: https://github.com/jmfernandez/citation-cff-checker.git rev: v0.1.0 From f520726c92fb80ef7b51362d5570c5d878bbaf66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 24 Oct 2024 19:13:54 +0200 Subject: [PATCH 08/60] Partially generalized repo materialization, in preparation of a more flexible mechanism --- wfexs_backend/wfexs_backend.py | 115 ++++++++------------------------- 1 file changed, 28 insertions(+), 87 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 511c8b5b..2aa6442a 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -2668,66 +2668,37 @@ def doMaterializeRepo( doUpdate: "bool" = True, registerInCache: "bool" = True, ) -> "Tuple[pathlib.Path, RepoTag]": + fetcher_clazz: "Optional[Type[AbstractRepoFetcher]]" = None if repo.repo_type not in (RepoType.Other, RepoType.SoftwareHeritage): - ( - remote_url, - repo_effective_checkout, - repo_path, - metadata_array, - ) = self._doMaterializeGitRepo(repo, doUpdate=doUpdate) + fetcher_clazz = GitFetcher elif repo.repo_type == RepoType.SoftwareHeritage: - ( - remote_url, - repo_effective_checkout, - repo_path, - metadata_array, - ) = self._doMaterializeSoftwareHeritageDirOrContent(repo, doUpdate=doUpdate) - else: + fetcher_clazz = SoftwareHeritageFetcher + + if fetcher_clazz is None: raise WfExSBackendException( f"Don't know how to materialize {repo.repo_url} as a repository" ) - if registerInCache: - kind = ContentKind.Directory if repo_path.is_dir() else ContentKind.File - self.cacheHandler.inject( - remote_url, - destdir=self.cacheWorkflowDir, - fetched_metadata_array=metadata_array, - finalCachedFilename=repo_path, - inputKind=kind, - ) - - return repo_path, repo_effective_checkout - - def _doMaterializeGitRepo( - self, - repo: "RemoteRepo", - doUpdate: "bool" = True, - ) -> "Tuple[URIType, RepoTag, pathlib.Path, Sequence[URIWithMetadata]]": - """ - - :param repoURL: - :param repoTag: - :param doUpdate: - :return: - """ - gitFetcherInst = self.instantiateRepoFetcher(GitFetcher) - repoDir, materialized_repo, metadata_array = gitFetcherInst.materialize_repo( - repo.repo_url, - repoTag=repo.tag, + fetcher = self.instantiateRepoFetcher(fetcher_clazz) + ( + repo_path, + materialized_repo, + metadata_array, + ) = fetcher.materialize_repo_from_repo( + repo, doUpdate=doUpdate, base_repo_destdir=self.cacheWorkflowDir, ) # Now, let's register the checkout with cache structures # using its public URI - if not repo.repo_url.startswith("git"): - remote_url = "git+" + repo.repo_url - else: - remote_url = repo.repo_url + remote_url: "str" = repo.repo_url + if fetcher_clazz == GitFetcher: + if not repo.repo_url.startswith("git"): + remote_url = "git+" + repo.repo_url - if repo.tag is not None: - remote_url += "@" + repo.tag + if repo.tag is not None: + remote_url += "@" + repo.tag repo_desc: "Optional[Mapping[str, Any]]" = 
materialized_repo.gen_repo_desc() if repo_desc is None: @@ -2739,48 +2710,18 @@ def _doMaterializeGitRepo( ), *metadata_array, ] - return ( - cast("URIType", remote_url), - materialized_repo.get_checkout(), - repoDir, - augmented_metadata_array, - ) - def _doMaterializeSoftwareHeritageDirOrContent( - self, - repo: "RemoteRepo", - doUpdate: "bool" = True, - ) -> "Tuple[URIType, RepoTag, pathlib.Path, Sequence[URIWithMetadata]]": - """ - - :param repoURL: - :param repoTag: - :param doUpdate: - :return: - """ - swhFetcherInst = self.instantiateRepoFetcher(SoftwareHeritageFetcher) - repoDir, materialized_repo, metadata_array = swhFetcherInst.materialize_repo( - cast("RepoURL", repo.tag) if repo.tag is not None else repo.repo_url, - doUpdate=doUpdate, - base_repo_destdir=self.cacheWorkflowDir, - ) + if registerInCache: + kind = ContentKind.Directory if repo_path.is_dir() else ContentKind.File + self.cacheHandler.inject( + cast("URIType", remote_url), + destdir=self.cacheWorkflowDir, + fetched_metadata_array=augmented_metadata_array, + finalCachedFilename=repo_path, + inputKind=kind, + ) - repo_desc: "Optional[Mapping[str, Any]]" = materialized_repo.gen_repo_desc() - if repo_desc is None: - repo_desc = {} - augmented_metadata_array = [ - URIWithMetadata( - uri=cast("URIType", repo.repo_url), - metadata=repo_desc, - ), - *metadata_array, - ] - return ( - repo.repo_url, - materialized_repo.get_checkout(), - repoDir, - augmented_metadata_array, - ) + return repo_path, materialized_repo.get_checkout() def getWorkflowBundleFromURI( self, From 3a2a30bb04ba5ae35bab02723eeaac8c172d4feb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 17:21:23 +0100 Subject: [PATCH 09/60] Applied janitorial work in order to be able to split cummulative set of changes --- wfexs_backend/fetchers/__init__.py | 2 +- wfexs_backend/fetchers/swh.py | 2 +- wfexs_backend/wfexs_backend.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index e0edf1b7..a8758c8c 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -309,7 +309,7 @@ def materialize_repo( ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": pass - def materialize_repo_from_repo( + def materialize_repo_from_repo_transient( self, repo: "RemoteRepo", repo_tag_destdir: "Optional[PathLikePath]" = None, diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index a6acca6e..4b0a2b50 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -226,7 +226,7 @@ def GuessRepoParams( web_url=web_url, ) - def materialize_repo_from_repo( + def materialize_repo_from_repo_transient( self, repo: "RemoteRepo", repo_tag_destdir: "Optional[PathLikePath]" = None, diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 2aa6442a..c37540c6 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -2684,7 +2684,7 @@ def doMaterializeRepo( repo_path, materialized_repo, metadata_array, - ) = fetcher.materialize_repo_from_repo( + ) = fetcher.materialize_repo_from_repo_transient( repo, doUpdate=doUpdate, base_repo_destdir=self.cacheWorkflowDir, From a6677f395079d66cdc2177a55cba7256261024c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 17:26:44 +0100 Subject: [PATCH 10/60] Created both the scheme catalog and the definitive AbstractSchemeRepoFetcher 
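(Editor's aside, not part of the original message.) The new MaterializedRepo
named tuple bundles everything a repo fetcher returns, so callers can unpack
it by field name instead of by position. A minimal sketch, assuming `fetcher`
is an instance of a concrete AbstractSchemeRepoFetcher subclass and
`remote_repo` is a RemoteRepo:

    # mrepo is a MaterializedRepo instance
    mrepo = fetcher.materialize_repo_from_repo(remote_repo)
    print(mrepo.local, mrepo.repo.get_checkout(), len(mrepo.metadata_array))
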
AbstractSchemeRepoFetcher enforces the declaration of the scheme parameter, needed for the next commits.
---
 wfexs_backend/fetchers/__init__.py | 121 +++++++-
 wfexs_backend/scheme_catalog.py    | 477 +++++++++++++++++++++++++++++
 2 files changed, 594 insertions(+), 4 deletions(-)
 create mode 100644 wfexs_backend/scheme_catalog.py

diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py
index a8758c8c..45e55fc4 100644
--- a/wfexs_backend/fetchers/__init__.py
+++ b/wfexs_backend/fetchers/__init__.py
@@ -97,12 +97,24 @@ class ProtocolFetcherReturn(NamedTuple):
         ProtocolFetcherReturn,
     ]
 
+    ProtocolStreamFetcher: TypeAlias = Callable[
+        [
+            URIType,
+            IO[bytes],
+            DefaultNamedArg(Optional[SecurityContextConfig], "secContext"),
+        ],
+        ProtocolFetcherReturn,
+    ]
+
 from urllib import parse
 
 from ..common import (
     AbstractWfExSException,
 )
 
+from ..scheme_catalog import (
+    SchemeCatalog,
+)
 
 # Default priority
 DEFAULT_PRIORITY: "Final[int]" = 0
@@ -158,6 +170,7 @@ def __init__(
         self,
         progs: "ProgsMapping" = dict(),
         setup_block: "Optional[Mapping[str, Any]]" = None,
+        scheme_catalog: "Optional[SchemeCatalog]" = None,
     ):
         import inspect
 
@@ -169,6 +182,7 @@ def __init__(
         # This is used to resolve program names
         self.progs = progs
         self.setup_block = setup_block if isinstance(setup_block, dict) else dict()
+        self.scheme_catalog = scheme_catalog
 
     @abc.abstractmethod
     def fetch(
@@ -295,6 +309,14 @@ def get_checkout(self) -> "RepoTag":
         )
 
 
+class MaterializedRepo(NamedTuple):
+    local: "pathlib.Path"
+    repo: "RemoteRepo"
+    metadata_array: "Sequence[URIWithMetadata]"
+    upstream_repo: "Optional[RemoteRepo]" = None
+    recommends_upstream: "bool" = False
+
+
 class AbstractRepoFetcher(AbstractStatefulFetcher):
     PRIORITY: "ClassVar[int]" = DEFAULT_PRIORITY + 10
 
@@ -307,6 +329,24 @@ def materialize_repo(
         base_repo_destdir: "Optional[PathLikePath]" = None,
         doUpdate: "Optional[bool]" = True,
     ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]":
+        """
+        Subclasses have to implement this method, which is used to materialize
+        a repository described by a RemoteRepo instance.
+
+        :param repo: The description of the repository to be materialized.
+        :type repo: class: `wfexs_backend.fetchers.RemoteRepo`
+        :param repo_tag_destdir: Destination of the materialized repo.
+        :type repo_tag_destdir: str, `os.PathLike[str]`, optional
+        :param base_repo_destdir: If repo_tag_destdir is None, parent directory of the newly created destination directory for the repo.
+        :type base_repo_destdir: str, `os.PathLike[str]`, optional
+        :param doUpdate: Should the code try updating an already materialized repo? Defaults to True
+        :type doUpdate: bool
+
+        The returned tuple has the following elements:
+        * The local path where the repo was materialized.
+        * A RemoteRepo instance.
+        * The metadata gathered through the materialisation process.
+        """
         pass
 
     def materialize_repo_from_repo_transient(
@@ -317,8 +357,8 @@ def materialize_repo_from_repo_transient(
         doUpdate: "Optional[bool]" = True,
     ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]":
         return self.materialize_repo(
-            repoURL=repo.repo_url,
-            repoTag=repo.tag,
+            repo.repo_url,
+            repo.tag,
             repo_tag_destdir=repo_tag_destdir,
             base_repo_destdir=base_repo_destdir,
             doUpdate=doUpdate,
         )
 
     @abc.abstractmethod
     def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]":
         """
         This method is required to generate a PID which usually
         represents an element (usually a workflow) in a repository.
- If the fetcher does not recognize the type of repo, it should + If the fetcher does not recognize the type of repo, either using + repo_url content or the repo type in the worst case, it should return None """ pass @@ -345,8 +386,80 @@ def GuessRepoParams( pass +class AbstractSchemeRepoFetcher(AbstractRepoFetcher): + """ + This abstract subclass is used to force the initialization of the + scheme catalog instance + """ + + def __init__( + self, + scheme_catalog: "SchemeCatalog", + progs: "ProgsMapping" = dict(), + setup_block: "Optional[Mapping[str, Any]]" = None, + ): + """ + The scheme catalog is enforced + """ + super().__init__( + progs=progs, setup_block=setup_block, scheme_catalog=scheme_catalog + ) + self.scheme_catalog: "SchemeCatalog" + + def materialize_repo( + self, + repoURL: "RepoURL", + repoTag: "Optional[RepoTag]" = None, + repo_tag_destdir: "Optional[PathLikePath]" = None, + base_repo_destdir: "Optional[PathLikePath]" = None, + doUpdate: "Optional[bool]" = True, + ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": + mrepo = self.materialize_repo_from_repo( + RemoteRepo( + repo_url=repoURL, + tag=repoTag, + ), + repo_tag_destdir=repo_tag_destdir, + base_repo_destdir=base_repo_destdir, + doUpdate=doUpdate, + ) + + return mrepo.local, mrepo.repo, mrepo.metadata_array + + @abc.abstractmethod + def materialize_repo_from_repo( + self, + repo: "RemoteRepo", + repo_tag_destdir: "Optional[PathLikePath]" = None, + base_repo_destdir: "Optional[PathLikePath]" = None, + doUpdate: "Optional[bool]" = True, + ) -> "MaterializedRepo": + """ + Subclasses have to implement this method, which is used to materialize + a repository described by a RemoteRepo instance. + + :param repo: The description of the repository to be materialized. + :type repo: class: `wfexs_backend.fetchers.RemoteRepo` + :param repo_tag_destdir: Destination of the materialized repo. + :type repo_tag_destdir: str, `os.PathLike[str]`, optional + :param base_repo_destdir: If repo_tag_destdir is None, parent directory of the newly created destination directory for the repo. + :type base_repo_destdir: str, `os.PathLike[str]`, optional + :param doUpdate: Should the code try updating an already materialized repo? Defaults to False + :type doUpdate: bool + + The returned tuple has next elements: + * The local path where the repo was materialized. + * A RemoteRepo instance. + * The metadata gathered through the materialisation process. + * An optional, upstream URI representing the repo. For instance, + in the case of a TRS or a SWH hosted repo, the registered upstream URL. 
+ """ + pass + + if TYPE_CHECKING: RepoFetcher = TypeVar("RepoFetcher", bound=AbstractRepoFetcher) + SchemeRepoFetcher = TypeVar("SchemeRepoFetcher", bound=AbstractSchemeRepoFetcher) class AbstractStatefulStreamingFetcher(AbstractStatefulFetcher): @@ -368,6 +481,6 @@ def streamfetch( ) -> "ProtocolFetcherReturn": """ This is the method to be implemented by the stateful streaming fetcher - which can receive as destination either a file + which can receive as destination a byte stream """ pass diff --git a/wfexs_backend/scheme_catalog.py b/wfexs_backend/scheme_catalog.py new file mode 100644 index 00000000..0691abba --- /dev/null +++ b/wfexs_backend/scheme_catalog.py @@ -0,0 +1,477 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +import copy +import datetime +import hashlib +import importlib +import inspect +import json +import logging +import os +import os.path +import pathlib +import re +import shutil +import traceback +import types +import urllib.parse +import uuid + +from typing import ( + cast, + NamedTuple, + Pattern, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from types import ModuleType + + from typing import ( + Any, + IO, + Iterator, + Mapping, + MutableMapping, + MutableSequence, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, + ) + + from typing_extensions import ( + Final, + NotRequired, + TypedDict, + ) + + from .common import ( + AbsPath, + AnyURI, + Fingerprint, + PathLikePath, + ProgsMapping, + RelPath, + SecurityContextConfig, + WritableSecurityContextConfig, + URIType, + ) + + from .fetchers import ( + ProtocolFetcherReturn, + StatefulFetcher, + ) + + from .security_context import ( + SecurityContextVault, + ) + + class RelAbsDict(TypedDict): + relative: RelPath + absolute: AbsPath + + class PathMetaDict(TypedDict): + meta: NotRequired[RelAbsDict] + relative: NotRequired[RelPath] + absolute: NotRequired[AbsPath] + + class MetadataEntryMetaDict(TypedDict): + injected: bool + + class MetadataEntryDict(TypedDict): + uri: URIType + metadata: MetadataEntryMetaDict + preferredName: RelPath + + class CacheMetadataDict(TypedDict): + stamp: datetime.datetime + path: PathMetaDict + kind: str + metadata_array: Sequence[MetadataEntryDict] + resolves_to: Sequence[URIType] + licences: Tuple[URIType, ...] 
+ attributions: Sequence[Mapping[str, Any]] + fingerprint: Fingerprint + clonable: bool + + +from .common import ( + AbstractWfExSException, + Attribution, + ContentKind, + DefaultNoLicenceTuple, + LicenceDescription, + LicensedURI, + META_JSON_POSTFIX, + URIWithMetadata, +) + +from .fetchers import ( + AbstractSchemeRepoFetcher, + AbstractStatefulFetcher, + AbstractStatefulStreamingFetcher, + DocumentedProtocolFetcher, + DocumentedStatefulProtocolFetcher, + FetcherException, + FetcherInstanceException, + InvalidFetcherException, + RemoteRepo, +) + +from .utils.contents import link_or_copy +from .utils.digests import ( + ComputeDigestFromDirectory, + ComputeDigestFromFile, + stringifyFilenameDigest, +) +from .utils.misc import ( + config_validate, + DatetimeEncoder, + iter_namespace, + jsonFilterDecodeFromStream, + translate_glob_args, +) + + +class SchemeCatalogException(AbstractWfExSException): + pass + + +class SchemeCatalogImportException(SchemeCatalogException): + pass + + +class SchemeCatalog: + def __init__( + self, + scheme_handlers: "Mapping[str, Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]]" = dict(), + ): + # Getting a logger focused on specific classes + self.logger = logging.getLogger( + dict(inspect.getmembers(self))["__module__"] + + "::" + + self.__class__.__name__ + ) + + self.schemeHandlers: "MutableMapping[str, DocumentedProtocolFetcher]" = dict() + + self.bypassSchemeHandlers(scheme_handlers) + + def addRawSchemeHandlers( + self, schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" + ) -> None: + # No validation is done here about validness of schemes + if isinstance(schemeHandlers, dict): + self.schemeHandlers.update(schemeHandlers) + else: + raise InvalidFetcherException("Unable to add raw scheme handlers") + + def bypassSchemeHandler( + self, + scheme: "str", + handler: "Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]", + progs: "ProgsMapping" = dict(), + setup_block: "Optional[Mapping[str, Any]]" = None, + ) -> None: + """ + This method adds and overwrites a scheme handler, + instantiating it if it is a stateful one. 
+ + :param scheme: + :param handler: + """ + the_handler: "DocumentedProtocolFetcher" + if isinstance(handler, DocumentedStatefulProtocolFetcher): + inst_handler = self.instantiateStatefulFetcher( + handler.fetcher_class, progs=progs, setup_block=setup_block + ) + the_handler = DocumentedProtocolFetcher( + fetcher=inst_handler.fetch, + description=inst_handler.description + if handler.description is None + else handler.description, + priority=handler.priority, + ) + elif isinstance(handler, DocumentedProtocolFetcher) and isinstance( + handler.fetcher, + ( + types.FunctionType, + types.LambdaType, + types.MethodType, + types.BuiltinFunctionType, + types.BuiltinMethodType, + ), + ): + the_handler = handler + else: + raise InvalidFetcherException( + "Trying to set for scheme {} a invalid handler".format(scheme) + ) + + self.schemeHandlers[scheme.lower()] = the_handler + + def bypassSchemeHandlers( + self, + schemeHandlers: "Mapping[str, Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]]", + ) -> None: + # No validation is done here about validness of schemes + if isinstance(schemeHandlers, dict): + for scheme, clazz in schemeHandlers.items(): + self.bypassSchemeHandler(scheme, clazz) + else: + raise InvalidFetcherException( + "Unable to instantiate to add scheme handlers" + ) + + def instantiateStatefulFetcher( + self, + statefulFetcher: "Type[StatefulFetcher]", + progs: "ProgsMapping" = dict(), + setup_block: "Optional[Mapping[str, Any]]" = None, + ) -> "StatefulFetcher": + """ + Method to instantiate stateful fetchers + """ + instStatefulFetcher: "Optional[AbstractStatefulFetcher]" = None + if inspect.isclass(statefulFetcher): + if issubclass(statefulFetcher, AbstractStatefulFetcher): + try: + if issubclass(statefulFetcher, AbstractSchemeRepoFetcher): + instStatefulFetcher = statefulFetcher( + self, progs=progs, setup_block=setup_block + ) + else: + instStatefulFetcher = statefulFetcher( + progs=progs, + setup_block=setup_block, + scheme_catalog=self, + ) + except Exception as e: + raise FetcherInstanceException( + f"Error while instantiating {statefulFetcher.__name__}" + ) from e + + if instStatefulFetcher is None: + raise InvalidFetcherException( + "Unable to instantiate something which is not a class inheriting from AbstractStatefulFetcher" + ) + + return cast("StatefulFetcher", instStatefulFetcher) + + def describeRegisteredSchemes(self) -> "Sequence[Tuple[str, str, int]]": + return [ + (scheme, desc_fetcher.description, desc_fetcher.priority) + for scheme, desc_fetcher in self.schemeHandlers.items() + ] + + def findAndAddSchemeHandlersFromModuleName( + self, + the_module_name: "str" = "wfexs_backend.fetchers", + fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, + ) -> None: + try: + the_module = importlib.import_module(the_module_name) + self.findAndAddSchemeHandlersFromModule( + the_module, + fetchers_setup_block=fetchers_setup_block, + ) + except Exception as e: + errmsg = f"Unable to import module {the_module_name} in order to gather scheme handlers, due errors:" + self.logger.exception(errmsg) + raise SchemeCatalogImportException(errmsg) from e + + def findAndAddSchemeHandlersFromModule( + self, + the_module: "ModuleType", + fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, + ) -> None: + for finder, module_name, ispkg in iter_namespace(the_module): + try: + named_module = importlib.import_module(module_name) + except: + self.logger.exception( + f"Skipping module {module_name} in order to gather scheme handlers, due 
errors:" + ) + continue + + # First, try locating a variable named SCHEME_HANDLERS + # then, the different class declarations inheriting + # from AbstractStatefulFetcher + skipit = True + for name, obj in inspect.getmembers(named_module): + if name == "SCHEME_HANDLERS": + if isinstance(obj, dict): + self.addSchemeHandlers( + obj, + fetchers_setup_block=fetchers_setup_block, + ) + skipit = False + elif ( + inspect.isclass(obj) + and not inspect.isabstract(obj) + and issubclass(obj, AbstractStatefulFetcher) + ): + # Now, let's learn whether the class is enabled + if getattr(obj, "ENABLED", False): + self.addStatefulSchemeHandlers( + obj, + fetchers_setup_block=fetchers_setup_block, + ) + skipit = False + + if skipit: + self.logger.debug( + f"Fetch module {named_module} was not eligible (no SCHEME_HANDLERS dictionary or subclass of {AbstractStatefulFetcher.__name__})" + ) + + def addStatefulSchemeHandlers( + self, + statefulSchemeHandler: "Type[AbstractStatefulFetcher]", + fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, + ) -> None: + """ + This method adds scheme handlers (aka "fetchers") from + a given stateful fetcher, also adding the needed programs + """ + + # Get the scheme handlers from this fetcher + schemeHandlers = statefulSchemeHandler.GetSchemeHandlers() + + self.addSchemeHandlers( + schemeHandlers, fetchers_setup_block=fetchers_setup_block + ) + + def get(self, scheme: "str") -> "Optional[DocumentedProtocolFetcher]": + return self.schemeHandlers.get(scheme) + + def getSchemeHandler( + self, the_remote_file: "URIType" + ) -> "DocumentedProtocolFetcher": + # Content is fetched here + # As of RFC3986, schemes are case insensitive + parsedInputURL = urllib.parse.urlparse(the_remote_file) + the_scheme = parsedInputURL.scheme.lower() + scheme_handler = self.get(the_scheme) + + if scheme_handler is None: + errmsg = f"No {the_scheme} scheme handler for {the_remote_file}. Was this URI injected in the cache? Is it a supported one?" + self.logger.error(errmsg) + raise SchemeCatalogException(errmsg) + + return scheme_handler + + def fetch( + self, + the_remote_file: "URIType", + cached_filename: "PathLikePath", + sec_context: "Optional[SecurityContextConfig]" = None, + ) -> "ProtocolFetcherReturn": + scheme_handler = self.getSchemeHandler(the_remote_file) + + # Content is fetched here + return scheme_handler.fetcher( + the_remote_file, + cached_filename, + secContext=sec_context, + ) + + def streamfetch( + self, + the_remote_file: "URIType", + the_stream: "IO[bytes]", + sec_context: "Optional[SecurityContextConfig]" = None, + ) -> "ProtocolFetcherReturn": + scheme_handler = self.getSchemeHandler(the_remote_file) + + stream_fetcher = ( + scheme_handler.fetcher.__self__ + if hasattr(scheme_handler.fetcher, "__self__") + else None + ) + + if not isinstance(stream_fetcher, AbstractStatefulStreamingFetcher): + errmsg = f"Scheme handler for {the_remote_file} does not offer streaming capabilities." 
+ self.logger.error(errmsg) + raise SchemeCatalogException(errmsg) + + # Content is fetched here + return stream_fetcher.streamfetch( + the_remote_file, + the_stream, + secContext=sec_context, + ) + + # This pattern is used to validate the schemes + SCHEME_PAT: "Final[Pattern[str]]" = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*$") + + def addSchemeHandlers( + self, + schemeHandlers: "Mapping[str, Union[DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher]]", + fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, + ) -> None: + """ + This method adds scheme handlers (aka "fetchers") + or instantiates stateful scheme handlers (aka "stateful fetchers") + """ + if isinstance(schemeHandlers, dict): + instSchemeHandlers = dict() + if fetchers_setup_block is None: + fetchers_setup_block = dict() + for scheme, schemeHandler in schemeHandlers.items(): + if self.SCHEME_PAT.search(scheme) is None: + self.logger.warning( + f"Fetcher associated to scheme {scheme} has been skipped, as the scheme does not comply with RFC3986" + ) + continue + + lScheme = scheme.lower() + # When no setup block is available for the scheme fetcher, + # provide an empty one + setup_block = fetchers_setup_block.get(lScheme, dict()) + + instSchemeHandler = None + if isinstance(schemeHandler, DocumentedStatefulProtocolFetcher): + instSchemeInstance = self.instantiateStatefulFetcher( + schemeHandler.fetcher_class, setup_block=setup_block + ) + if instSchemeInstance is not None: + instSchemeHandler = DocumentedProtocolFetcher( + fetcher=instSchemeInstance.fetch, + description=instSchemeInstance.description + if schemeHandler.description is None + else schemeHandler.description, + priority=schemeHandler.priority, + ) + elif isinstance(schemeHandler, DocumentedProtocolFetcher) and callable( + schemeHandler.fetcher + ): + instSchemeHandler = schemeHandler + + # Only the ones which have overcome the sanity checks + if instSchemeHandler is not None: + # Schemes are case insensitive, so register only + # the lowercase version + instSchemeHandlers[lScheme] = instSchemeHandler + + self.addRawSchemeHandlers(instSchemeHandlers) From 302e34cab335d1c41fd6a2cb14ed10501fa194ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 17:41:04 +0100 Subject: [PATCH 11/60] SchemeHandlerCacheHandler has been split into the SchemeCatalog and the CacheHandler --- wfexs_backend/cache_handler.py | 156 +++++++++++---------------------- 1 file changed, 50 insertions(+), 106 deletions(-) diff --git a/wfexs_backend/cache_handler.py b/wfexs_backend/cache_handler.py index ed0cf932..ba462d5d 100644 --- a/wfexs_backend/cache_handler.py +++ b/wfexs_backend/cache_handler.py @@ -126,6 +126,10 @@ class CacheMetadataDict(TypedDict): InvalidFetcherException, ) +from .scheme_catalog import ( + SchemeCatalog, +) + from .utils.contents import link_or_copy from .utils.digests import ( ComputeDigestFromDirectory, @@ -161,13 +165,13 @@ class CacheHandlerSchemeException(CacheHandlerException): pass -class SchemeHandlerCacheHandler: +class CacheHandler: CACHE_METADATA_SCHEMA = cast("RelPath", "cache-metadata.json") def __init__( self, cacheDir: "pathlib.Path", - schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" = dict(), + scheme_catalog: "Optional[SchemeCatalog]" = None, ): # Getting a logger focused on specific classes self.logger = logging.getLogger( @@ -178,109 +182,9 @@ def __init__( # TODO: create caching database self.cacheDir = cacheDir - self.schemeHandlers: "MutableMapping[str, 
DocumentedProtocolFetcher]" = dict() - - self.addRawSchemeHandlers(schemeHandlers) - - def addRawSchemeHandlers( - self, schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" - ) -> None: - # No validation is done here about validness of schemes - if isinstance(schemeHandlers, dict): - self.schemeHandlers.update(schemeHandlers) - else: - raise InvalidFetcherException("Unable to add raw scheme handlers") - - def bypassSchemeHandler( - self, - scheme: "str", - handler: "Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]", - progs: "ProgsMapping" = dict(), - setup_block: "Optional[Mapping[str, Any]]" = None, - ) -> None: - """ - This method adds and overwrites a scheme handler, - instantiating it if it is a stateful one. - - :param scheme: - :param handler: - """ - the_handler: "DocumentedProtocolFetcher" - if isinstance(handler, DocumentedStatefulProtocolFetcher): - inst_handler = self.instantiateStatefulFetcher( - handler.fetcher_class, progs=progs, setup_block=setup_block - ) - the_handler = DocumentedProtocolFetcher( - fetcher=inst_handler.fetch, - description=inst_handler.description - if handler.description is None - else handler.description, - priority=handler.priority, - ) - elif isinstance(handler, DocumentedProtocolFetcher) and isinstance( - handler.fetcher, - ( - types.FunctionType, - types.LambdaType, - types.MethodType, - types.BuiltinFunctionType, - types.BuiltinMethodType, - ), - ): - the_handler = handler - else: - raise InvalidFetcherException( - "Trying to set for scheme {} a invalid handler".format(scheme) - ) - - self.schemeHandlers[scheme.lower()] = the_handler - - def bypassSchemeHandlers( - self, - schemeHandlers: "Mapping[str, Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]]", - ) -> None: - # No validation is done here about validness of schemes - if isinstance(schemeHandlers, dict): - for scheme, clazz in schemeHandlers.items(): - self.bypassSchemeHandler(scheme, clazz) - else: - raise InvalidFetcherException( - "Unable to instantiate to add scheme handlers" - ) - - def instantiateStatefulFetcher( - self, - statefulFetcher: "Type[StatefulFetcher]", - progs: "ProgsMapping" = dict(), - setup_block: "Optional[Mapping[str, Any]]" = None, - ) -> "StatefulFetcher": - """ - Method to instantiate stateful fetchers - """ - instStatefulFetcher = None - if inspect.isclass(statefulFetcher): - if issubclass(statefulFetcher, AbstractStatefulFetcher): - try: - instStatefulFetcher = statefulFetcher( - progs=progs, setup_block=setup_block - ) - except Exception as e: - raise FetcherInstanceException( - f"Error while instantiating {statefulFetcher.__name__}" - ) from e - - if instStatefulFetcher is None: - raise InvalidFetcherException( - "Unable to instantiate something which is not a class inheriting from AbstractStatefulFetcher" - ) - - return cast("StatefulFetcher", instStatefulFetcher) - - def describeRegisteredSchemes(self) -> "Sequence[Tuple[str, str, int]]": - return [ - (scheme, desc_fetcher.description, desc_fetcher.priority) - for scheme, desc_fetcher in self.schemeHandlers.items() - ] + if scheme_catalog is None: + scheme_catalog = SchemeCatalog() + self.scheme_catalog: "SchemeCatalog" = scheme_catalog def _genUriMetaCachedFilename( self, hashDir: "pathlib.Path", the_remote_file: "URIType" @@ -1092,9 +996,11 @@ def fetch( # Content is fetched here # As of RFC3986, schemes are case insensitive theScheme = parsedInputURL.scheme.lower() - schemeHandler = self.schemeHandlers.get(theScheme) + schemeHandler = 
self.scheme_catalog.get(theScheme) try: + # TODO: this code is redundant with the one in + # SchemeHandler method getSchemeHandler if schemeHandler is None: errmsg = f"No {theScheme} scheme handler for {the_remote_file} (while processing {remote_file}). Was this data injected in the cache?" self.logger.error(errmsg) @@ -1104,6 +1010,8 @@ def fetch( else: raise che + # TODO: this code is partially redundant with + # the one in SchemeHandler method fetch try: # Content is fetched here pfr = schemeHandler.fetcher( @@ -1227,3 +1135,39 @@ def fetch( fingerprint=final_fingerprint, clonable=clonable, ) + + +class SchemeHandlerCacheHandler(CacheHandler): + def __init__( + self, + cacheDir: "pathlib.Path", + schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" = dict(), + ): + scheme_catalog = SchemeCatalog(scheme_handlers=schemeHandlers) + super().__init__(cacheDir, scheme_catalog) + + def addRawSchemeHandlers( + self, schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" + ) -> None: + self.scheme_catalog.addRawSchemeHandlers(schemeHandlers) + + def bypassSchemeHandlers( + self, + schemeHandlers: "Mapping[str, Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]]", + ) -> None: + self.scheme_catalog.bypassSchemeHandlers(schemeHandlers) + + def instantiateStatefulFetcher( + self, + statefulFetcher: "Type[StatefulFetcher]", + progs: "ProgsMapping" = dict(), + setup_block: "Optional[Mapping[str, Any]]" = None, + ) -> "StatefulFetcher": + return self.scheme_catalog.instantiateStatefulFetcher( + statefulFetcher, + progs=progs, + setup_block=setup_block, + ) + + def describeRegisteredSchemes(self) -> "Sequence[Tuple[str, str, int]]": + return self.scheme_catalog.describeRegisteredSchemes() From 6ba96f394dd9e6d388a4db971ad0d972134b1ef9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 17:51:30 +0100 Subject: [PATCH 12/60] Migrated HTTP fetchClassicURL to an instance of AbstractStatefulStreamingFetcher --- wfexs_backend/fetchers/http.py | 245 +++++++++++++++++++-------------- 1 file changed, 142 insertions(+), 103 deletions(-) diff --git a/wfexs_backend/fetchers/http.py b/wfexs_backend/fetchers/http.py index c88cd90b..da2f81e1 100644 --- a/wfexs_backend/fetchers/http.py +++ b/wfexs_backend/fetchers/http.py @@ -30,6 +30,7 @@ from typing import ( Any, Callable, + ClassVar, Iterable, IO, Mapping, @@ -42,6 +43,10 @@ Union, ) + from typing_extensions import ( + Final, + ) + from _typeshed import SupportsRead from ssl import SSLContext from mypy_extensions import DefaultNamedArg @@ -63,7 +68,9 @@ from . 
import ( AbstractStatefulFetcher, + AbstractStatefulStreamingFetcher, DocumentedProtocolFetcher, + DocumentedStatefulProtocolFetcher, FetcherException, ProtocolFetcherReturn, ) @@ -78,117 +85,149 @@ ) +class HTTPFetcher(AbstractStatefulStreamingFetcher): + PRIORITY: "ClassVar[int]" = 20 + HTTP_PROTO: "Final[str]" = "http" + HTTPS_PROTO: "Final[str]" = "https" + + @classmethod + def GetSchemeHandlers(cls) -> "Mapping[str, DocumentedStatefulProtocolFetcher]": + # These are de-facto schemes supported by pip and git client + return { + cls.HTTP_PROTO: DocumentedStatefulProtocolFetcher( + fetcher_class=cls, + description="HTTP download URLs", + priority=cls.PRIORITY, + ), + cls.HTTPS_PROTO: DocumentedStatefulProtocolFetcher( + fetcher_class=cls, + description="HTTPS download URLs", + priority=cls.PRIORITY, + ), + } + + @property + def description(self) -> "str": + return "HTTP and HTTPS download URLs" + + @classmethod + def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": + return tuple() + + def streamfetch( + self, + remote_file: "URIType", + dest_stream: "IO[bytes]", + secContext: "Optional[SecurityContextConfig]" = None, + ) -> "ProtocolFetcherReturn": + """ + Method to fetch contents from http and https. + This is the method to be implemented by the stateful streaming fetcher + which can receive as destination a byte stream + + :param remote_file: + :param dest_stream: + :param secContext: + """ + + # This is needed to remove possible embedded credentials, + # which should not be stored in the cache + orig_remote_file = remote_file + parsedInputURL, remote_file = self.ParseAndRemoveCredentials(orig_remote_file) + # Now the credentials are properly removed from remote_file + # we get them from the parsed url + username = parsedInputURL.username + password = parsedInputURL.password + + if isinstance(secContext, dict): + headers = secContext.get("headers", {}).copy() + token = secContext.get("token") + token_header = secContext.get("token_header") + username = secContext.get("username", username) + password = secContext.get("password", password) + + method = secContext.get("method") + data = secContext.get("data") + else: + headers = {} + method = None + data = None + token = None + token_header = None + + # Callable[[Union[str, Request], Union[bytes, SupportsRead[bytes], Iterable[bytes], None], Optional[float]], Any] + # Callable[[Union[str, Request], Optional[Union[bytes, SupportsRead[bytes], Iterable[bytes], None]], Optional[float], DefaultNamedArg(Optional[str], 'cafile'), DefaultNamedArg(Optional[str], 'capath'), DefaultNamedArg(bool, 'cadefault'), DefaultNamedArg(Optional[SSLContext], 'context')], Any] + opener: "Union[Callable[[Union[str, request.Request], Union[bytes, SupportsRead[bytes], Iterable[bytes], None], Optional[float]], Any], Callable[[Union[str, request.Request], Optional[Union[bytes, SupportsRead[bytes], Iterable[bytes]]], Optional[float], DefaultNamedArg(Optional[str], 'cafile'), DefaultNamedArg(Optional[str], 'capath'), DefaultNamedArg(bool, 'cadefault'), DefaultNamedArg(Optional[SSLContext], 'context')], Any]]" + opener = request.urlopen + if token is not None: + if token_header is not None: + headers[token_header] = token + else: + headers["Authorization"] = f"Bearer {token}" + elif username is not None: + if password is None: + password = "" + + opener = get_opener_with_auth(remote_file, username, password).open + + # # Time to set up user and password in URL + # parsedInputURL = parse.urlparse(remote_file) + # + # netloc = parse.quote(username, safe='') + ':' + 
parse.quote(password, + # safe='') + '@' + parsedInputURL.hostname + # if parsedInputURL.port is not None: + # netloc += ':' + str(parsedInputURL.port) + # + # # Now the credentials are properly set up + # remote_file = cast("URIType", parse.urlunparse((parsedInputURL.scheme, netloc, parsedInputURL.path, + # parsedInputURL.params, parsedInputURL.query, parsedInputURL.fragment))) + + uri_with_metadata = None + try: + req_remote = request.Request( + remote_file, headers=headers, data=data, method=method + ) + with opener(req_remote) as url_response: + uri_with_metadata = URIWithMetadata( + uri=url_response.url, metadata=dict(url_response.headers.items()) + ) + + while True: + try: + # Try getting it + shutil.copyfileobj(url_response, dest_stream) + except http.client.IncompleteRead as icread: + dest_stream.write(icread.partial) + # Restarting the copy + continue + break + + except urllib.error.HTTPError as he: + raise FetcherException( + "Error fetching {} : {} {}\n{}".format( + orig_remote_file, he.code, he.reason, he.read().decode() + ), + code=he.code, + reason=he.reason, + ) from he + + return ProtocolFetcherReturn( + kind_or_resolved=ContentKind.File, + metadata_array=[uri_with_metadata], + ) + + def fetchClassicURL( remote_file: "URIType", cachedFilename: "Union[PathLikePath, IO[bytes]]", secContext: "Optional[SecurityContextConfig]" = None, ) -> "ProtocolFetcherReturn": - """ - Method to fetch contents from http, https and ftp - - :param remote_file: - :param cachedFilename: - :param secContext: - """ - - # This is needed to remove possible embedded credentials, - # which should not be stored in the cache - orig_remote_file = remote_file - parsedInputURL, remote_file = AbstractStatefulFetcher.ParseAndRemoveCredentials( - orig_remote_file - ) - # Now the credentials are properly removed from remote_file - # we get them from the parsed url - username = parsedInputURL.username - password = parsedInputURL.password - - if isinstance(secContext, dict): - headers = secContext.get("headers", {}).copy() - token = secContext.get("token") - token_header = secContext.get("token_header") - username = secContext.get("username", username) - password = secContext.get("password", password) - - method = secContext.get("method") - data = secContext.get("data") - else: - headers = {} - method = None - data = None - token = None - token_header = None - - # Callable[[Union[str, Request], Union[bytes, SupportsRead[bytes], Iterable[bytes], None], Optional[float]], Any] - # Callable[[Union[str, Request], Optional[Union[bytes, SupportsRead[bytes], Iterable[bytes], None]], Optional[float], DefaultNamedArg(Optional[str], 'cafile'), DefaultNamedArg(Optional[str], 'capath'), DefaultNamedArg(bool, 'cadefault'), DefaultNamedArg(Optional[SSLContext], 'context')], Any] - opener: "Union[Callable[[Union[str, request.Request], Union[bytes, SupportsRead[bytes], Iterable[bytes], None], Optional[float]], Any], Callable[[Union[str, request.Request], Optional[Union[bytes, SupportsRead[bytes], Iterable[bytes]]], Optional[float], DefaultNamedArg(Optional[str], 'cafile'), DefaultNamedArg(Optional[str], 'capath'), DefaultNamedArg(bool, 'cadefault'), DefaultNamedArg(Optional[SSLContext], 'context')], Any]]" - opener = request.urlopen - if token is not None: - if token_header is not None: - headers[token_header] = token - else: - headers["Authorization"] = f"Bearer {token}" - elif username is not None: - if password is None: - password = "" - - opener = get_opener_with_auth(remote_file, username, password).open - - # # Time to set 
up user and password in URL - # parsedInputURL = parse.urlparse(remote_file) - # - # netloc = parse.quote(username, safe='') + ':' + parse.quote(password, - # safe='') + '@' + parsedInputURL.hostname - # if parsedInputURL.port is not None: - # netloc += ':' + str(parsedInputURL.port) - # - # # Now the credentials are properly set up - # remote_file = cast("URIType", parse.urlunparse((parsedInputURL.scheme, netloc, parsedInputURL.path, - # parsedInputURL.params, parsedInputURL.query, parsedInputURL.fragment))) - - # Preparing where it is going to be written - download_file: "IO[bytes]" if isinstance(cachedFilename, (str, os.PathLike)): - download_file = open(cachedFilename, "wb") + return HTTPFetcher().fetch(remote_file, cachedFilename, secContext=secContext) else: - download_file = cachedFilename - - uri_with_metadata = None - try: - req_remote = request.Request( - remote_file, headers=headers, data=data, method=method + return HTTPFetcher().streamfetch( + remote_file, cachedFilename, secContext=secContext ) - with opener(req_remote) as url_response: - uri_with_metadata = URIWithMetadata( - uri=url_response.url, metadata=dict(url_response.headers.items()) - ) - - while True: - try: - # Try getting it - shutil.copyfileobj(url_response, download_file) - except http.client.IncompleteRead as icread: - download_file.write(icread.partial) - # Restarting the copy - continue - break - - except urllib.error.HTTPError as he: - raise FetcherException( - "Error fetching {} : {} {}\n{}".format( - orig_remote_file, he.code, he.reason, he.read().decode() - ), - code=he.code, - reason=he.reason, - ) from he - finally: - # Closing files opened by this code - if download_file != cachedFilename: - download_file.close() - - return ProtocolFetcherReturn( - kind_or_resolved=ContentKind.File, - metadata_array=[uri_with_metadata], - ) SCHEME_HANDLERS: "Mapping[str, DocumentedProtocolFetcher]" = { From 11ff66ef36471588f849f24fb25b99b98bf4e17f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 19:29:58 +0100 Subject: [PATCH 13/60] Migrated to new HTTP fetcher stateful implementation --- wfexs_backend/utils/licences.py | 19 +++++++++++++----- wfexs_backend/utils/orcid.py | 4 ++-- wfexs_backend/utils/passphrase_wrapper.py | 20 +++++++++++++------ .../workflow_engines/nextflow_engine.py | 4 ++-- 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/wfexs_backend/utils/licences.py b/wfexs_backend/utils/licences.py index f5160c0c..e60a3f44 100644 --- a/wfexs_backend/utils/licences.py +++ b/wfexs_backend/utils/licences.py @@ -54,9 +54,13 @@ import xdg.BaseDirectory +from ..scheme_catalog import ( + SchemeCatalog, +) + from ..cache_handler import ( CacheHandlerException, - SchemeHandlerCacheHandler, + CacheHandler, ) from ..common import ( @@ -65,7 +69,7 @@ NoLicenceDescription, ) -from ..fetchers.http import SCHEME_HANDLERS as HTTP_SCHEME_HANDLERS +from ..fetchers.http import HTTPFetcher # Licences @@ -518,7 +522,7 @@ class LicenceMatcher: def __init__( self, - cacheHandler: "SchemeHandlerCacheHandler", + cacheHandler: "CacheHandler", cacheDir: "Optional[pathlib.Path]" = None, spdx_version: "str" = DEFAULT_SPDX_VERSION, ): @@ -627,10 +631,15 @@ def __new__(cls) -> "LicenceMatcher": # type: ignore xdg.BaseDirectory.save_cache_path("es.elixir.WfExSLicenceMatcher") ) + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), + ) + # Private cache handler instance # with LicenceMatcher - cacheHandler = SchemeHandlerCacheHandler( - 
cachePath, schemeHandlers=HTTP_SCHEME_HANDLERS + cacheHandler = CacheHandler( + cachePath, + scheme_catalog=scheme_catalog, ) cls.__instance = LicenceMatcher(cacheHandler) diff --git a/wfexs_backend/utils/orcid.py b/wfexs_backend/utils/orcid.py index b1d851d7..9628815e 100644 --- a/wfexs_backend/utils/orcid.py +++ b/wfexs_backend/utils/orcid.py @@ -51,7 +51,7 @@ from ..common import ( ResolvedORCID, ) -from ..fetchers.http import fetchClassicURL +from ..fetchers.http import HTTPFetcher from ..fetchers import FetcherException ORCID_HOST: "Final[str]" = "orcid.org" @@ -95,7 +95,7 @@ def validate_orcid( public_record_b = io.BytesIO() public_orcid_url = cast("URIType", f"{ORCID_URL_PREFIX}/{possible_orcid}") # If there is any issue fetching, next call should raise an exception - _, meta_public_record, _ = fetchClassicURL( + _, meta_public_record, _ = HTTPFetcher().streamfetch( cast("URIType", f"{public_orcid_url}/public-record.json"), public_record_b ) try: diff --git a/wfexs_backend/utils/passphrase_wrapper.py b/wfexs_backend/utils/passphrase_wrapper.py index 823fed21..42494898 100644 --- a/wfexs_backend/utils/passphrase_wrapper.py +++ b/wfexs_backend/utils/passphrase_wrapper.py @@ -53,11 +53,15 @@ import xdg.BaseDirectory +from ..scheme_catalog import ( + SchemeCatalog, +) + from ..cache_handler import ( CacheOfflineException, - SchemeHandlerCacheHandler, + CacheHandler, ) -from ..fetchers.http import SCHEME_HANDLERS as HTTP_SCHEME_HANDLERS +from ..fetchers.http import HTTPFetcher from ..fetchers.wiktionary import WiktionaryFetcher @@ -115,7 +119,7 @@ class WfExSPassphraseGenerator: def __init__( self, - cacheHandler: "SchemeHandlerCacheHandler", + cacheHandler: "CacheHandler", cacheDir: "Optional[pathlib.Path]" = None, word_sets: "Mapping[str, Sequence[RemoteWordlistResource]]" = DEFAULT_WORD_SETS, ): @@ -289,10 +293,14 @@ def __new__(cls) -> "WfExSPassphraseGenerator": # type: ignore # Private cache handler instance # with Wiktionary - cacheHandler = SchemeHandlerCacheHandler( - cachePath, schemeHandlers=HTTP_SCHEME_HANDLERS + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers() + ) + scheme_catalog.bypassSchemeHandlers(WiktionaryFetcher.GetSchemeHandlers()) + cacheHandler = CacheHandler( + cachePath, + scheme_catalog=scheme_catalog, ) - cacheHandler.bypassSchemeHandlers(WiktionaryFetcher.GetSchemeHandlers()) cls.__instance = WfExSPassphraseGenerator(cacheHandler) return cls.__instance diff --git a/wfexs_backend/workflow_engines/nextflow_engine.py b/wfexs_backend/workflow_engines/nextflow_engine.py index b6332bca..ce0a148b 100644 --- a/wfexs_backend/workflow_engines/nextflow_engine.py +++ b/wfexs_backend/workflow_engines/nextflow_engine.py @@ -123,7 +123,7 @@ NoContainerFactory, ) -from ..fetchers.http import fetchClassicURL +from ..fetchers.http import HTTPFetcher from ..utils.contents import ( copy2_nofollow, ) @@ -862,7 +862,7 @@ def runLocalNextflowCommand( nextflow_version, nextflow_script_url, cachedScript ) ) - fetchClassicURL(nextflow_script_url, cachedScript) + HTTPFetcher().fetch(nextflow_script_url, cachedScript) # Checking the installer has execution permissions if not os.access(cachedScript, os.R_OK | os.X_OK): From 2f3650b68f660cb007e329adb337588d7fda8893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 19:33:12 +0100 Subject: [PATCH 14/60] Migrated to new HTTP fetcher stateful implementation (some fetchers were using it) --- wfexs_backend/fetchers/b2share.py | 11 +++++++---- 
wfexs_backend/fetchers/doi.py | 7 ++++--- wfexs_backend/fetchers/drs.py | 9 +++++---- wfexs_backend/fetchers/osf_io.py | 15 ++++++++------- wfexs_backend/fetchers/pride.py | 4 ++-- wfexs_backend/fetchers/zenodo.py | 11 ++++++----- 6 files changed, 32 insertions(+), 25 deletions(-) diff --git a/wfexs_backend/fetchers/b2share.py b/wfexs_backend/fetchers/b2share.py index c2829fa5..b31d4429 100644 --- a/wfexs_backend/fetchers/b2share.py +++ b/wfexs_backend/fetchers/b2share.py @@ -35,7 +35,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( ContentKind, @@ -94,11 +94,12 @@ def fetchB2SHARE( metadata_url = cast("URIType", parse.urljoin(B2SHARE_RECORD_REST, b2share_id)) + http_fetcher = HTTPFetcher() gathered_meta = {"fetched": metadata_url} metadata_array = [URIWithMetadata(remote_file, gathered_meta)] try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = http_fetcher.streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) @@ -178,12 +179,14 @@ def fetchB2SHARE( the_file_local_path = cast( "AbsPath", os.path.join(cachedFilename, relpath) ) - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_file["ePIC_PID"], the_file_local_path ) metadata_array.extend(metacont) else: - _, metacont, _ = fetchClassicURL(the_files[0]["ePIC_PID"], cachedFilename) + _, metacont, _ = http_fetcher.fetch( + the_files[0]["ePIC_PID"], cachedFilename + ) metadata_array.extend(metacont) except FetcherException as fe: raise FetcherException( diff --git a/wfexs_backend/fetchers/doi.py b/wfexs_backend/fetchers/doi.py index d99d1cc2..367852f3 100644 --- a/wfexs_backend/fetchers/doi.py +++ b/wfexs_backend/fetchers/doi.py @@ -34,7 +34,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( URIWithMetadata, @@ -95,9 +95,10 @@ def fetchDOI( gathered_ra_meta = {"fetched": metadata_ra_url} metadata_array = [URIWithMetadata(remote_file, gathered_ra_meta)] + http_fetcher = HTTPFetcher() try: metaio = io.BytesIO() - _, metametaraio, _ = fetchClassicURL(metadata_ra_url, metaio) + _, metametaraio, _ = http_fetcher.streamfetch(metadata_ra_url, metaio) metadata_ra = json.loads(metaio.getvalue().decode("utf-8")) gathered_ra_meta["payload"] = metadata_ra metadata_array.extend(metametaraio) @@ -120,7 +121,7 @@ def fetchDOI( metadata_array.append(URIWithMetadata(remote_file, gathered_meta)) try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = http_fetcher.streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) diff --git a/wfexs_backend/fetchers/drs.py b/wfexs_backend/fetchers/drs.py index fe0f1831..be809007 100644 --- a/wfexs_backend/fetchers/drs.py +++ b/wfexs_backend/fetchers/drs.py @@ -52,7 +52,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( LicensedURI, @@ -73,7 +73,7 @@ def query_n2t( gathered_meta = {"fetched": query_url} n2t_io = io.BytesIO() - _, meta_n2t_io, _ = fetchClassicURL(query_url, n2t_io) + _, meta_n2t_io, _ = HTTPFetcher().streamfetch(query_url, n2t_io) answer = yaml.safe_load(n2t_io.getvalue().decode("utf-8")) gathered_meta["payload"] = 
answer @@ -197,11 +197,12 @@ def downloadContentFromDRS( "URIType", drs_service_prefix + "objects/" + object_id ) + http_fetcher = HTTPFetcher() gathered_meta = {"fetched": object_metadata_url} metadata = None try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL( + _, metametaio, _ = http_fetcher.streamfetch( object_metadata_url, metaio, secContext=upperSecContext ) object_metadata = json.loads(metaio.getvalue().decode("utf-8")) @@ -234,7 +235,7 @@ def downloadContentFromDRS( try: metaaccio = io.BytesIO() - _, metametaaccio, _ = fetchClassicURL( + _, metametaaccio, _ = http_fetcher.streamfetch( object_access_metadata_url, metaaccio, secContext=upperSecContext, diff --git a/wfexs_backend/fetchers/osf_io.py b/wfexs_backend/fetchers/osf_io.py index 60ff9fef..de108ebd 100644 --- a/wfexs_backend/fetchers/osf_io.py +++ b/wfexs_backend/fetchers/osf_io.py @@ -35,7 +35,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( ContentKind, @@ -93,11 +93,12 @@ def fetchOSFIO( metadata_url = cast("URIType", parse.urljoin(OSF_IO_RECORD_REST, osf_io_id)) + http_fetcher = HTTPFetcher() gathered_meta = {"fetched": metadata_url} metadata_array = [URIWithMetadata(remote_file, gathered_meta)] try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = http_fetcher.streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) @@ -131,7 +132,7 @@ def fetchOSFIO( metadata_array.append(URIWithMetadata(remote_file, gathered_l_meta)) try: metaio = io.BytesIO() - _, metametalicio, _ = fetchClassicURL(osf_io_lic_link, metaio) + _, metametalicio, _ = http_fetcher.streamfetch(osf_io_lic_link, metaio) l_metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_l_meta["payload"] = l_metadata metadata_array.extend(metametalicio) @@ -166,7 +167,7 @@ def fetchOSFIO( metadata_array.append(URIWithMetadata(remote_file, gathered_fm_meta)) try: metaio = io.BytesIO() - _, metametafmio, _ = fetchClassicURL(osf_io_files_meta_link, metaio) + _, metametafmio, _ = http_fetcher.streamfetch(osf_io_files_meta_link, metaio) fm_metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_fm_meta["payload"] = fm_metadata metadata_array.extend(metametafmio) @@ -199,7 +200,7 @@ def fetchOSFIO( metadata_array.append(URIWithMetadata(remote_file, gathered_s_meta)) try: metaio = io.BytesIO() - _, metametasio, _ = fetchClassicURL(osf_io_store_link, metaio) + _, metametasio, _ = http_fetcher.streamfetch(osf_io_store_link, metaio) s_metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_s_meta["payload"] = s_metadata metadata_array.extend(metametasio) @@ -265,12 +266,12 @@ def fetchOSFIO( the_file_local_path = cast( "AbsPath", os.path.join(cachedFilename, relpath) ) - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_file["links"]["download"], the_file_local_path ) metadata_array.extend(metacont) elif kind == ContentKind.File: - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_files[0]["links"]["download"], cachedFilename ) metadata_array.extend(metacont) diff --git a/wfexs_backend/fetchers/pride.py b/wfexs_backend/fetchers/pride.py index b6117476..ce71ccd7 100644 --- a/wfexs_backend/fetchers/pride.py +++ b/wfexs_backend/fetchers/pride.py @@ -50,7 +50,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from 
.http import HTTPFetcher PRIDE_PROJECT_SCHEME = "pride.project" @@ -92,7 +92,7 @@ def fetchPRIDEProject( metadata = None try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = HTTPFetcher().streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) diff --git a/wfexs_backend/fetchers/zenodo.py b/wfexs_backend/fetchers/zenodo.py index b33bb22c..737d01e9 100644 --- a/wfexs_backend/fetchers/zenodo.py +++ b/wfexs_backend/fetchers/zenodo.py @@ -35,7 +35,7 @@ FetcherException, ProtocolFetcherReturn, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( ContentKind, @@ -95,11 +95,12 @@ def fetchZenodo( metadata_url = cast("URIType", parse.urljoin(ZENODO_RECORD_REST, zenodo_id)) + http_fetcher = HTTPFetcher() gathered_meta = {"fetched": metadata_url} metadata_array = [URIWithMetadata(remote_file, gathered_meta)] try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) + _, metametaio, _ = http_fetcher.streamfetch(metadata_url, metaio) metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_meta["payload"] = metadata metadata_array.extend(metametaio) @@ -128,7 +129,7 @@ def fetchZenodo( metadata_array.append(URIWithMetadata(remote_file, gathered_l_meta)) try: metaio = io.BytesIO() - _, metametalicio, _ = fetchClassicURL(licence_meta_url, metaio) + _, metametalicio, _ = http_fetcher.streamfetch(licence_meta_url, metaio) l_metadata = json.loads(metaio.getvalue().decode("utf-8")) gathered_l_meta["payload"] = l_metadata metadata_array.extend(metametalicio) @@ -208,12 +209,12 @@ def fetchZenodo( the_file_local_path = cast( "AbsPath", os.path.join(cachedFilename, relpath) ) - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_file["links"]["self"], the_file_local_path ) metadata_array.extend(metacont) else: - _, metacont, _ = fetchClassicURL( + _, metacont, _ = http_fetcher.fetch( the_files[0]["links"]["self"], cachedFilename ) metadata_array.extend(metacont) From d2ca621119bf688f4688fc7c71e2d250faa17055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 20:34:00 +0100 Subject: [PATCH 15/60] SWH and Git fetchers are now subclasses of AbstractSchemeRepoFetcher. 
Also, their usage of fetchClassicURL has been "uplifted" to new implementation --- tests/fetchers/test_git.py | 13 +++- tests/fetchers/test_swh.py | 13 +++- wfexs_backend/fetchers/git.py | 45 ++++++++----- wfexs_backend/fetchers/swh.py | 116 ++++++++++++++++++++-------------- 4 files changed, 121 insertions(+), 66 deletions(-) diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py index 3702cc5c..eaaaa8d5 100644 --- a/tests/fetchers/test_git.py +++ b/tests/fetchers/test_git.py @@ -38,11 +38,18 @@ URIType, ) +from wfexs_backend.scheme_catalog import ( + SchemeCatalog, +) + from wfexs_backend.fetchers import ( RemoteRepo, RepoGuessFlavor, RepoType, ) + +from wfexs_backend.fetchers.http import HTTPFetcher + from wfexs_backend.fetchers.git import GitFetcher WfExS_basedir = Path(__file__).parent.parent @@ -254,7 +261,11 @@ def test_build_git_pid_from_repo( if remote_repo is None: pytest.skip("Skipped test because no remote repo was provided") else: - fetcher = GitFetcher({}) + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), + ) + + fetcher = GitFetcher(scheme_catalog, progs={}) output = fetcher.build_pid_from_repo(remote_repo) assert output == repo_pid diff --git a/tests/fetchers/test_swh.py b/tests/fetchers/test_swh.py index 0bc34a90..79afd2cc 100644 --- a/tests/fetchers/test_swh.py +++ b/tests/fetchers/test_swh.py @@ -38,11 +38,18 @@ URIType, ) +from wfexs_backend.scheme_catalog import ( + SchemeCatalog, +) + from wfexs_backend.fetchers import ( RemoteRepo, RepoGuessFlavor, RepoType, ) + +from wfexs_backend.fetchers.http import HTTPFetcher + from wfexs_backend.fetchers.swh import SoftwareHeritageFetcher WfExS_basedir = Path(__file__).parent.parent @@ -139,7 +146,11 @@ def test_build_swh_pid_from_repo( if remote_repo is None: pytest.skip("Skipped test because no remote repo was provided") else: - fetcher = SoftwareHeritageFetcher({}) + scheme_catalog = SchemeCatalog( + scheme_handlers=SoftwareHeritageFetcher.GetSchemeHandlers(), + ) + + fetcher = SoftwareHeritageFetcher(scheme_catalog, progs={}) output = fetcher.build_pid_from_repo(remote_repo) assert output == repo_pid diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index a9dc153c..cf96ef23 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -47,6 +47,10 @@ Final, ) + from ..scheme_catalog import ( + SchemeCatalog, + ) + from ..common import ( AbsPath, AnyPath, @@ -70,9 +74,10 @@ import dulwich.porcelain from . 
import ( - AbstractRepoFetcher, + AbstractSchemeRepoFetcher, DocumentedStatefulProtocolFetcher, FetcherException, + MaterializedRepo, ProtocolFetcherReturn, RemoteRepo, RepoGuessException, @@ -90,7 +95,7 @@ GITHUB_NETLOC = "github.com" -class GitFetcher(AbstractRepoFetcher): +class GitFetcher(AbstractSchemeRepoFetcher): GIT_PROTO: "Final[str]" = "git" GIT_PROTO_PREFIX: "Final[str]" = GIT_PROTO + "+" GITHUB_SCHEME: "Final[str]" = "github" @@ -102,9 +107,12 @@ class GitFetcher(AbstractRepoFetcher): GIT_SCHEMES: "Final[Sequence[str]]" = ["https", "git", "ssh", "file"] def __init__( - self, progs: "ProgsMapping", setup_block: "Optional[Mapping[str, Any]]" = None + self, + scheme_catalog: "SchemeCatalog", + progs: "ProgsMapping" = dict(), + setup_block: "Optional[Mapping[str, Any]]" = None, ): - super().__init__(progs=progs, setup_block=setup_block) + super().__init__(scheme_catalog, progs=progs, setup_block=setup_block) self.git_cmd = self.progs.get( self.DEFAULT_GIT_CMD, cast("RelPath", self.DEFAULT_GIT_CMD) @@ -501,7 +509,8 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": """ This method is required to generate a PID which usually represents an element (usually a workflow) in a repository. - If the fetcher does not recognize the type of repo, it should + If the fetcher does not recognize the type of repo, either using + repo_url content or the repo type in the worst case, it should return None """ parsed_wf_url = parse.urlparse(remote_repo.repo_url) @@ -651,14 +660,13 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": return retval - def materialize_repo( + def materialize_repo_from_repo( self, - repoURL: "RepoURL", - repoTag: "Optional[RepoTag]" = None, + repo: "RemoteRepo", repo_tag_destdir: "Optional[PathLikePath]" = None, base_repo_destdir: "Optional[PathLikePath]" = None, doUpdate: "Optional[bool]" = True, - ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": + ) -> "MaterializedRepo": """ :param repoURL: The URL to the repository. @@ -668,6 +676,9 @@ def materialize_repo( :return: """ + repoURL = repo.repo_url + repoTag = repo.tag + # Assure directory exists before next step if repo_tag_destdir is None: if base_repo_destdir is None: @@ -831,10 +842,10 @@ def materialize_repo( checkout=repo_effective_checkout, ) - return ( - repo_tag_destpath, - remote_repo, - [], + return MaterializedRepo( + local=repo_tag_destpath, + repo=remote_repo, + metadata_array=[], ) def fetch( @@ -908,9 +919,13 @@ def fetch( parse.urlunparse((gitScheme, parsedInputURL.netloc, gitPath, "", "", "")), ) - repo_tag_destdir, remote_repo, metadata_array = self.materialize_repo( - repoURL, repoTag=repoTag + materialized_repo_return = self.materialize_repo_from_repo( + RemoteRepo(repo_url=repoURL, tag=repoTag), ) + repo_tag_destdir = materialized_repo_return.local + remote_repo = materialized_repo_return.repo + metadata_array = materialized_repo_return.metadata_array + if repoRelPath is not None: remote_repo = remote_repo._replace(rel_path=cast("RelPath", repoRelPath)) diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index 4b0a2b50..7f5f8556 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -65,16 +65,17 @@ ) from . 
import ( - AbstractRepoFetcher, + AbstractSchemeRepoFetcher, DocumentedStatefulProtocolFetcher, FetcherException, + MaterializedRepo, ProtocolFetcherReturn, RemoteRepo, RepoGuessException, RepoType, ) -from .http import fetchClassicURL +from .http import HTTPFetcher from ..common import ( ContentKind, @@ -87,7 +88,7 @@ ) -class SoftwareHeritageFetcher(AbstractRepoFetcher): +class SoftwareHeritageFetcher(AbstractSchemeRepoFetcher): SOFTWARE_HERITAGE_SCHEME: "Final[str]" = "swh" SWH_API_REST: "Final[str]" = "https://archive.softwareheritage.org/api/1/" SWH_API_REST_KNOWN: "Final[URIType]" = cast( @@ -136,7 +137,7 @@ def _resolve_swh_id( # urljoin cannot be used due working with URIs resolve_uri = cast("URIType", cls.SWH_API_REST_RESOLVE + swh_quoted_id + "/") try: - _, metaresio, _ = fetchClassicURL( + _, metaresio, _ = HTTPFetcher().streamfetch( resolve_uri, resio, secContext={ @@ -191,7 +192,7 @@ def GuessRepoParams( putative_core_swhid = wf_url.split(";", 1)[0] try: valio = io.BytesIO() - _, metavalio, _ = fetchClassicURL( + _, metavalio, _ = HTTPFetcher().streamfetch( cls.SWH_API_REST_KNOWN, valio, secContext={ @@ -226,29 +227,15 @@ def GuessRepoParams( web_url=web_url, ) - def materialize_repo_from_repo_transient( - self, - repo: "RemoteRepo", - repo_tag_destdir: "Optional[PathLikePath]" = None, - base_repo_destdir: "Optional[PathLikePath]" = None, - doUpdate: "Optional[bool]" = True, - ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": - return self.materialize_repo( - repoURL=cast("RepoURL", repo.tag) - if repo.tag is not None - else repo.repo_url, - repo_tag_destdir=repo_tag_destdir, - base_repo_destdir=base_repo_destdir, - doUpdate=doUpdate, - ) - def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": """ This method is required to generate a PID which usually represents an element (usually a workflow) in a repository. 
- If the fetcher does not recognize the type of repo, it should + If the fetcher does not recognize the type of repo, either using + repo_url content or the repo type in the worst case, it should return None """ + parsed_wf_url = parse.urlparse(remote_repo.repo_url) if parsed_wf_url.scheme not in self.GetSchemeHandlers(): return None @@ -256,14 +243,16 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": # FIXME: improve this return remote_repo.repo_url - def materialize_repo( + def materialize_repo_from_repo( self, - repoURL: "RepoURL", - repoTag: "Optional[RepoTag]" = None, + repo: "RemoteRepo", repo_tag_destdir: "Optional[PathLikePath]" = None, base_repo_destdir: "Optional[PathLikePath]" = None, doUpdate: "Optional[bool]" = True, - ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": + ) -> "MaterializedRepo": + repoURL = cast("RepoURL", repo.tag) if repo.tag is not None else repo.repo_url + repoTag = repo.tag + # If we are here is because the repo is valid # as it should have been checked by GuessRepoParams @@ -314,10 +303,10 @@ def materialize_repo( object_id + "/", ), ) - _, metarelio, _ = fetchClassicURL( + _, metarelio, _ = self.scheme_catalog.streamfetch( release_uri, relio, - secContext={ + sec_context={ "headers": { "Accept": "application/json", }, @@ -353,10 +342,10 @@ def materialize_repo( ) try: revio = io.BytesIO() - _, metarevio, _ = fetchClassicURL( + _, metarevio, _ = self.scheme_catalog.streamfetch( cast("URIType", revision_uri), revio, - secContext={ + sec_context={ "headers": { "Accept": "application/json", }, @@ -406,10 +395,10 @@ def materialize_repo( time.sleep(self.WAIT_SECS) try: dirio = io.BytesIO() - _, metadirio, _ = fetchClassicURL( + _, metadirio, _ = self.scheme_catalog.streamfetch( cast("URIType", directory_url), dirio, - secContext={ + sec_context={ "headers": { "Accept": "application/json", }, @@ -450,7 +439,7 @@ def materialize_repo( with tempfile.NamedTemporaryFile() as tmp_targz_filename: try: - _, metafetchio, _ = fetchClassicURL( + _, metafetchio, _ = self.scheme_catalog.fetch( dir_fetch_url, cast("AbsPath", tmp_targz_filename.name), ) @@ -536,10 +525,10 @@ def materialize_repo( try: contentio = io.BytesIO() - _, metacontentio, _ = fetchClassicURL( + _, metacontentio, _ = self.scheme_catalog.streamfetch( cast("URIType", content_url), contentio, - secContext={ + sec_context={ "headers": { "Accept": "application/json", }, @@ -566,14 +555,15 @@ def materialize_repo( # Assure base directory exists before next step # here repo_tag_destdir is a file - repo_tag_destfile: "Union[PathLikePath, IO[bytes]]" + repo_tag_destfile: "Optional[PathLikePath]" = None + repo_tag_deststream: "Optional[IO[bytes]]" = None if repo_tag_destdir is None: if base_repo_destdir is None: temp_file_descriptor, repo_tag_destdir = cast( "Tuple[int, AbsPath]", tempfile.mkstemp(prefix="wfexs", suffix=".swh"), ) - repo_tag_destfile = os.fdopen(temp_file_descriptor, mode="wb") + repo_tag_deststream = os.fdopen(temp_file_descriptor, mode="wb") atexit.register(os.unlink, repo_tag_destdir) else: repo_hashed_id = hashlib.sha1(repoURL.encode("utf-8")).hexdigest() @@ -602,17 +592,29 @@ def materialize_repo( repo_tag_destpath = pathlib.Path(repo_tag_destdir) try: - _, metafetchio, _ = fetchClassicURL( - content_fetch_url, - repo_tag_destfile, - ) + if repo_tag_destfile is not None: + _, metafetchio, _ = self.scheme_catalog.fetch( + content_fetch_url, + repo_tag_destfile, + ) + elif repo_tag_deststream is not None: + _, metafetchio, _ = 
self.scheme_catalog.streamfetch( + content_fetch_url, + repo_tag_deststream, + ) + else: + raise FetcherException( + f"No fetch of {content_fetch_url} (assertion?)" + ) + except FetcherException as fe: + raise except Exception as e: raise FetcherException( f"HTTP REST call {content_fetch_url} failed" ) from e finally: - if not isinstance(repo_tag_destfile, (str, os.PathLike)): - repo_tag_destfile.close() + if repo_tag_deststream is not None: + repo_tag_deststream.close() gathered_meta = { "fetched": content_fetch_url, @@ -631,10 +633,23 @@ def materialize_repo( checkout=cast("RepoTag", repo_effective_checkout), ) - return ( - repo_tag_destpath, - remote_repo, - metadata_array, + upstream_repo: "Optional[RemoteRepo]" = None + origin: "Optional[str]" = res_doc.get("metadata", {}).get("origin") + # This is an heuristic to build a git scheme uri + if origin is not None: + upstream_repo = RemoteRepo( + repo_url=cast("RepoURL", origin), + rel_path=cast("Optional[RelPath]", res_doc["metadata"].get("path")), + repo_type=RepoType.Git + if ("git" in origin) or ("bitbucket" in origin) + else None, + ) + + return MaterializedRepo( + local=repo_tag_destpath, + repo=remote_repo, + metadata_array=metadata_array, + upstream_repo=upstream_repo, ) def fetch( @@ -662,9 +677,12 @@ def fetch( repoRelPath = None # It is materialized in a temporary location - repo_tag_destdir, remote_repo, metadata_array = self.materialize_repo( - cast("RepoURL", remote_file) + materialized_repo_return = self.materialize_repo_from_repo( + RemoteRepo(repo_url=cast("RepoURL", remote_file)), ) + repo_tag_destdir = materialized_repo_return.local + remote_repo = materialized_repo_return.repo + metadata_array = materialized_repo_return.metadata_array preferredName: "Optional[RelPath]" # repoRelPath is only acknowledged when the resolved repo From bba5ca979d1435250b371572e06abb2bcdf5af3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 20:39:15 +0100 Subject: [PATCH 16/60] Uplifted implementation of GA4GH TRS fetcher, so it can be a workflow "repo" fetcher --- wfexs_backend/fetchers/trs_files.py | 1039 ++++++++++++++++----------- 1 file changed, 635 insertions(+), 404 deletions(-) diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 5a8696a5..97b7dd44 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -18,10 +18,17 @@ from __future__ import absolute_import +import atexit import copy +import hashlib import io import json +import logging import os +import pathlib +import shutil +import tempfile +import urllib.parse import warnings from typing import ( @@ -32,494 +39,718 @@ from urllib import parse from . 
import ( - AbstractStatefulFetcher, + AbstractSchemeRepoFetcher, DocumentedProtocolFetcher, + DocumentedStatefulProtocolFetcher, FetcherException, + MaterializedRepo, ProtocolFetcherReturn, + RemoteRepo, + RepoType, ) -from .http import fetchClassicURL from ..common import ( ContentKind, URIWithMetadata, ) +from ..utils.contents import ( + link_or_copy_pathlib, +) + from ..utils.misc import ( urlresolv, ) if TYPE_CHECKING: from typing import ( + Any, Mapping, MutableMapping, MutableSequence, Optional, Sequence, + Tuple, + Union, + ) + + from typing_extensions import ( + Final, ) from ..common import ( AbsPath, PathLikePath, + ProgsMapping, + RelPath, + RepoURL, SecurityContextConfig, + SymbolicName, URIType, ) -INTERNAL_TRS_SCHEME_PREFIX = "wfexs.trs.files" -TRS_SCHEME_PREFIX = "trs" + from ..scheme_catalog import ( + SchemeCatalog, + ) -TRS_FILES_SUFFIX = "/files" -TRS_DESCRIPTOR_INFIX = "/descriptor/" +class GA4GHTRSFetcher(AbstractSchemeRepoFetcher): + INTERNAL_TRS_SCHEME_PREFIX: "Final[str]" = "wfexs.trs.files" + TRS_SCHEME_PREFIX: "Final[str]" = "trs" -def fetchTRSFiles( - remote_file: "URIType", - cachedFilename: "PathLikePath", - secContext: "Optional[SecurityContextConfig]" = None, -) -> "ProtocolFetcherReturn": - """ - Method to download contents from TRS files related to a tool + TRS_FILES_SUFFIX: "Final[str]" = "/files" + TRS_DESCRIPTOR_INFIX: "Final[str]" = "/descriptor/" - :param remote_file: - :param cachedFilename: Destination filename for the fetched content - :param secContext: The security context containing the credentials - """ + @classmethod + def GetSchemeHandlers(cls) -> "Mapping[str, DocumentedStatefulProtocolFetcher]": + # These are de-facto schemes supported by Software Heritage + # libraries and other implementations + return { + cls.INTERNAL_TRS_SCHEME_PREFIX: DocumentedStatefulProtocolFetcher( + fetcher_class=cls, + description="WfExS internal pseudo-scheme used to materialize files from pure TRS servers", + priority=cls.PRIORITY, + ), + cls.TRS_SCHEME_PREFIX: DocumentedStatefulProtocolFetcher( + fetcher_class=cls, + description="GA4GH TRS metadata is fetched using the APIs described at https://ga4gh.github.io/tool-registry-service-schemas/. 
Contents are downloaded delegating their associated URIs to other fetchers", + priority=cls.PRIORITY, + ), + } - parsedInputURL = parse.urlparse(remote_file) - path_steps: "Sequence[str]" = parsedInputURL.path.split("/") - embedded_remote_file = parsedInputURL.path + @property + def description(self) -> "str": + return "Fetcher for GA4GH TRSv2 tools" + + @classmethod + def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": + return tuple() + + @classmethod + def GuessRepoParams( + cls, + orig_wf_url: "Union[URIType, parse.ParseResult]", + logger: "Optional[logging.Logger]" = None, + fail_ok: "bool" = False, + ) -> "Optional[RemoteRepo]": + pass + + def materialize_repo_from_repo( + self, + repo: "RemoteRepo", + repo_tag_destdir: "Optional[PathLikePath]" = None, + base_repo_destdir: "Optional[PathLikePath]" = None, + doUpdate: "Optional[bool]" = True, + ) -> "MaterializedRepo": + remote_file = repo.repo_url + repoTag = repo.tag + + parsedInputURL = parse.urlparse(remote_file) + path_steps: "Sequence[str]" = parsedInputURL.path.split("/") + embedded_remote_file = parsedInputURL.path + + metadata_array: "MutableSequence[URIWithMetadata]" = [] + if parsedInputURL.scheme == self.INTERNAL_TRS_SCHEME_PREFIX: + # TODO: Improve this code + if not embedded_remote_file.endswith(self.TRS_FILES_SUFFIX): + files_metadata_url = cast( + "URIType", embedded_remote_file + self.TRS_FILES_SUFFIX + ) + descriptor_base_url = embedded_remote_file + self.TRS_DESCRIPTOR_INFIX + else: + files_metadata_url = cast("URIType", embedded_remote_file) + descriptor_base_url = ( + embedded_remote_file[0 : -len(self.TRS_FILES_SUFFIX)] + + self.TRS_DESCRIPTOR_INFIX + ) + # TODO: fetch here service info metadata + elif parsedInputURL.scheme == self.TRS_SCHEME_PREFIX: + # TRS official scheme + if len(path_steps) < 3 or path_steps[0] != "": + raise FetcherException( + f"Ill-formed TRS CURIE {remote_file}. It should be in the format of {self.TRS_SCHEME_PREFIX}://id/version or {self.TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" + ) - metadata_array: "MutableSequence[URIWithMetadata]" = [] - if parsedInputURL.scheme == INTERNAL_TRS_SCHEME_PREFIX: - # TODO: Improve this code - if not embedded_remote_file.endswith(TRS_FILES_SUFFIX): - metadata_url = cast("URIType", embedded_remote_file + TRS_FILES_SUFFIX) - descriptor_base_url = embedded_remote_file + TRS_DESCRIPTOR_INFIX - else: - metadata_url = cast("URIType", embedded_remote_file) - descriptor_base_url = ( - embedded_remote_file[0 : -len(TRS_FILES_SUFFIX)] + TRS_DESCRIPTOR_INFIX + trs_base_steps = cast("MutableSequence[str]", path_steps[0:-2]) + trs_base_steps.extend(["ga4gh", "trs", "v2"]) + + # Performing some sanity checks about the API + service_info_steps = copy.copy(trs_base_steps) + service_info_steps.append("service-info") + service_info_metadata_url = cast( + "URIType", + parse.urlunparse( + parse.ParseResult( + scheme="https", + netloc=parsedInputURL.netloc, + path="/".join(service_info_steps), + params="", + query="", + fragment="", + ) + ), ) - elif parsedInputURL.scheme == TRS_SCHEME_PREFIX: - # TRS official scheme - if len(path_steps) < 3 or path_steps[0] != "": - raise FetcherException( - f"Ill-formed TRS CURIE {remote_file}. 
It should be in the format of {TRS_SCHEME_PREFIX}://id/version or {TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" + service_info_wfexs_meta = { + "fetched": service_info_metadata_url, + "payload": None, + } + metadata_array.append(URIWithMetadata(remote_file, service_info_wfexs_meta)) + try: + metaio = io.BytesIO() + _, metametaio, _ = self.scheme_catalog.streamfetch( + service_info_metadata_url, metaio + ) + service_info_metadata = json.loads(metaio.getvalue().decode("utf-8")) + service_info_wfexs_meta["payload"] = service_info_metadata + metadata_array.extend(metametaio) + except FetcherException as fe: + raise FetcherException( + f"Error fetching or processing TRS service info metadata for {remote_file} : {fe.code} {fe.reason}" + ) from fe + + trs_version_str: "Optional[str]" = None + trs_artifact: "Optional[str]" = None + trs_group: "Optional[str]" = None + trs_endpoint_meta_type: "Optional[Mapping[str, str]]" = ( + service_info_metadata.get("type") ) - - trs_base_steps = cast("MutableSequence[str]", path_steps[0:-2]) - trs_base_steps.extend(["ga4gh", "trs", "v2"]) - - # Performing some sanity checks about the API - service_info_steps = copy.copy(trs_base_steps) - service_info_steps.append("service-info") - service_info_metadata_url = cast( - "URIType", - parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(service_info_steps), - params="", - query="", - fragment="", + if trs_endpoint_meta_type is not None: + trs_version_str = trs_endpoint_meta_type.get("version") + trs_artifact = trs_endpoint_meta_type.get("artifact") + trs_group = trs_endpoint_meta_type.get("group") + + if trs_version_str is None: + errstr = f"Unable to identify TRS version from {service_info_metadata_url}. Is this a TRS endpoint?" 
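+                # A GA4GH service-info answer is expected to declare its
+                # version inside the "type" block; without it there is no
+                # evidence this endpoint actually speaks TRS, so give up early.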
+ raise FetcherException(errstr) + + # Avoiding querying a GA4GH DRS service, for instance + if trs_artifact is not None and trs_artifact.lower() not in ( + "trs", + "yevis", + ): + errstr = f"Unsupported GA4GH service {trs_artifact} (group {trs_group}) from {service_info_metadata_url}" + raise FetcherException(errstr) + + # Warning about potentially unsupported versions + trs_version_tuple = tuple(map(int, trs_version_str.split("."))) + if trs_version_tuple < (2, 0, 1): + self.logger.warning( + f"{service_info_metadata_url} is offering old TRS version {trs_version_str}, which diverges from what this implementation supports" + ) + elif trs_version_tuple > (3, 0): + self.logger.warning( + f"{service_info_metadata_url} is offering TRS version {trs_version_str}, which might diverge from what this implementation supports" ) - ), - ) - service_info_meta = { - "fetched": service_info_metadata_url, - "payload": None, - } - metadata_array.append(URIWithMetadata(remote_file, service_info_meta)) - try: - metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(service_info_metadata_url, metaio) - service_info_metadata = json.loads(metaio.getvalue().decode("utf-8")) - service_info_meta["payload"] = service_info_metadata - metadata_array.extend(metametaio) - except FetcherException as fe: - raise FetcherException( - f"Error fetching or processing TRS service info metadata for {remote_file} : {fe.code} {fe.reason}" - ) from fe - trs_version_str: "Optional[str]" = None - trs_artifact: "Optional[str]" = None - trs_group: "Optional[str]" = None - trs_endpoint_meta_type: "Optional[Mapping[str, str]]" = ( - service_info_metadata.get("type") - ) - if trs_endpoint_meta_type is not None: - trs_version_str = trs_endpoint_meta_type.get("version") - trs_artifact = trs_endpoint_meta_type.get("artifact") - trs_group = trs_endpoint_meta_type.get("group") - - if trs_version_str is None: - errstr = f"Unable to identify TRS version from {service_info_metadata_url}. Is this a TRS endpoint?" 
- raise FetcherException(errstr) - - # Avoiding querying a GA4GH DRS service, for instance - if trs_artifact is not None and trs_artifact.lower() not in ("trs", "yevis"): - errstr = f"Unsupported GA4GH service {trs_artifact} (group {trs_group}) from {service_info_metadata_url}" - raise FetcherException(errstr) - - # Warning about potentially unsupported versions - trs_version_tuple = tuple(map(int, trs_version_str.split("."))) - if trs_version_tuple < (2, 0, 1): - warnings.warn( - f"{service_info_metadata_url} is offering old TRS version {trs_version_str}, which diverges from what this implementation supports" + version_steps = copy.copy(trs_base_steps) + version_steps.extend(["tools", path_steps[-2], "versions", path_steps[-1]]) + version_metadata_url = cast( + "URIType", + parse.urlunparse( + parse.ParseResult( + scheme="https", + netloc=parsedInputURL.netloc, + path="/".join(version_steps), + params="", + query="", + fragment="", + ) + ), ) - elif trs_version_tuple > (3, 0): - warnings.warn( - f"{service_info_metadata_url} is offering TRS version {trs_version_str}, which might diverge from what this implementation supports" + version_meta = { + "fetched": version_metadata_url, + "payload": None, + } + metadata_array.append(URIWithMetadata(remote_file, version_meta)) + try: + metaio = io.BytesIO() + _, metametaio, _ = self.scheme_catalog.streamfetch( + version_metadata_url, metaio + ) + version_metadata = json.loads(metaio.getvalue().decode("utf-8")) + version_meta["payload"] = version_metadata + metadata_array.extend(metametaio) + + except FetcherException as fe: + raise FetcherException( + f"Error fetching or processing TRS version metadata for {remote_file} : {fe.code} {fe.reason}" + ) from fe + + # At last, we can finish building the URL + new_path_steps = [ + *version_steps, + version_metadata["descriptor_type"][0], + "files", + ] + + files_metadata_url = cast( + "URIType", + parse.urlunparse( + parse.ParseResult( + scheme="https", + netloc=parsedInputURL.netloc, + path="/".join(new_path_steps), + params="", + query="", + fragment="", + ) + ), ) - version_steps = copy.copy(trs_base_steps) - version_steps.extend(["tools", path_steps[-2], "versions", path_steps[-1]]) - version_metadata_url = cast( - "URIType", - parse.urlunparse( + descriptor_steps = [ + *version_steps, + version_metadata["descriptor_type"][0], + "descriptor", + ] + descriptor_base_url = parse.urlunparse( parse.ParseResult( scheme="https", netloc=parsedInputURL.netloc, - path="/".join(version_steps), + path="/".join(descriptor_steps) + "/", params="", query="", fragment="", ) - ), - ) - version_meta = { - "fetched": version_metadata_url, + ) + else: + raise FetcherException(f"FIXME: Unhandled scheme {parsedInputURL.scheme}") + + # Assure directory exists before next step + if repo_tag_destdir is None: + if base_repo_destdir is None: + repo_tag_destpath = pathlib.Path( + tempfile.mkdtemp(prefix="wfexs", suffix=".trs") + ) + atexit.register(shutil.rmtree, repo_tag_destpath, True) + else: + repo_hashed_id = hashlib.sha1(remote_file.encode("utf-8")).hexdigest() + repo_destpath = pathlib.Path(base_repo_destdir, repo_hashed_id) + # repo_destdir = pathlib.Path(self.cacheWorkflowDir, repo_hashed_id) + + if not repo_destpath.exists(): + try: + repo_destpath.mkdir(parents=True) + except IOError: + errstr = "ERROR: Unable to create intermediate directories for repo {}. 
".format( + remote_file + ) + raise FetcherException(errstr) + + repo_hashed_tag_id = hashlib.sha1( + b"" if repoTag is None else repoTag.encode("utf-8") + ).hexdigest() + repo_tag_destpath = repo_destpath / repo_hashed_tag_id + else: + repo_tag_destpath = ( + repo_tag_destdir + if isinstance(repo_tag_destdir, pathlib.Path) + else pathlib.Path(repo_tag_destdir) + ) + + self.logger.debug(f"Repo dir {repo_tag_destpath}") + + topMeta = { + "fetched": files_metadata_url, "payload": None, + "workflow_entrypoint": None, + "remote_workflow_entrypoint": None, } - metadata_array.append(URIWithMetadata(remote_file, version_meta)) + metadata_array = [URIWithMetadata(remote_file, topMeta)] try: metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(version_metadata_url, metaio) - version_metadata = json.loads(metaio.getvalue().decode("utf-8")) - version_meta["payload"] = version_metadata + _, metametaio, _ = self.scheme_catalog.streamfetch( + files_metadata_url, metaio + ) + metadata = json.loads(metaio.getvalue().decode("utf-8")) + topMeta["payload"] = metadata metadata_array.extend(metametaio) - except FetcherException as fe: raise FetcherException( - f"Error fetching or processing TRS version metadata for {remote_file} : {fe.code} {fe.reason}" + "Error fetching or processing TRS files metadata for {} : {} {}".format( + remote_file, fe.code, fe.reason + ) ) from fe - # At last, we can finish building the URL - new_path_steps = [ - *version_steps, - version_metadata["descriptor_type"][0], - "files", - ] + repo_tag_destpath.mkdir(parents=True, exist_ok=True) + absdirs = set() + emptyWorkflow = True - metadata_url = cast( - "URIType", - parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(new_path_steps), - params="", - query="", - fragment="", - ) - ), - ) + # First pass, identify primary descriptor / workflow entrypoint + # and learn whether the destination paths should be sanitized + is_abs_url = False + is_anon = False + file_rel_2_url: "MutableMapping[str, str]" = dict() + for file_desc in metadata: + file_rel_path = file_desc.get("path") + if file_rel_path is None: + continue - descriptor_steps = [ - *version_steps, - version_metadata["descriptor_type"][0], - "descriptor", - ] - descriptor_base_url = parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(descriptor_steps) + "/", - params="", - query="", - fragment="", - ) - ) - else: - raise FetcherException(f"FIXME: Unhandled scheme {parsedInputURL.scheme}") - - topMeta = { - "fetched": metadata_url, - "payload": None, - "workflow_entrypoint": None, - "remote_workflow_entrypoint": None, - } - metadata_array = [URIWithMetadata(remote_file, topMeta)] - try: - metaio = io.BytesIO() - _, metametaio, _ = fetchClassicURL(metadata_url, metaio) - metadata = json.loads(metaio.getvalue().decode("utf-8")) - topMeta["payload"] = metadata - metadata_array.extend(metametaio) - except FetcherException as fe: - raise FetcherException( - "Error fetching or processing TRS files metadata for {} : {} {}".format( - remote_file, fe.code, fe.reason - ) - ) from fe - - os.makedirs(cachedFilename, exist_ok=True) - absdirs = set() - emptyWorkflow = True - - # First pass, identify primary descriptor / workflow entrypoint - # and learn whether the destination paths should be sanitized - is_abs_url = False - is_anon = False - file_rel_2_url: "MutableMapping[str, str]" = dict() - for file_desc in metadata: - file_rel_path = file_desc.get("path") - if file_rel_path is 
None: - continue - - emptyWorkflow = False - - # BEWARE! The relpath could contain references to parent directories - # escaping from the URL to be built and from the download "sandbox" - frp_parsed = parse.urlparse(file_rel_path) - is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") - - if is_abs_url: - # This one has to be dealt with a shortcut - file_rel_2_url[file_rel_path] = urlresolv(file_rel_path) - continue - - descriptor_url = cast( - "URIType", - descriptor_base_url + parse.quote(file_rel_path, safe="/"), - ) - try: - descmetaio = io.BytesIO() - _, descmetaelem, _ = fetchClassicURL( - descriptor_url, descmetaio, {"headers": {"Accept": "application/json"}} - ) - descriptor_meta = json.loads(descmetaio.getvalue().decode("utf-8")) - except FetcherException as fe: - raise FetcherException( - "Error fetching or processing TRS descriptor metadata for {} : {} {}".format( - descriptor_url, fe.code, fe.reason - ) - ) from fe + emptyWorkflow = False - is_anon = ( - not isinstance(descriptor_meta, dict) or descriptor_meta.get("url") is None - ) - if is_anon: - # This one has to be dealt in a different way - break - file_rel_2_url[file_rel_path] = urlresolv(descriptor_meta["url"]) - - if emptyWorkflow: - raise FetcherException( - "Error processing TRS files for {} : no file was found.\n{}".format( - remote_file, metadata + # BEWARE! The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + frp_parsed = parse.urlparse(file_rel_path) + is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") + + if is_abs_url: + # This one has to be dealt with a shortcut + file_rel_2_url[file_rel_path] = urlresolv(file_rel_path) + continue + + descriptor_url = cast( + "URIType", + descriptor_base_url + parse.quote(file_rel_path, safe="/"), ) - ) + try: + descmetaio = io.BytesIO() + _, descmetaelem, _ = self.scheme_catalog.streamfetch( + descriptor_url, + descmetaio, + {"headers": {"Accept": "application/json"}}, + ) + descriptor_meta = json.loads(descmetaio.getvalue().decode("utf-8")) + except FetcherException as fe: + raise FetcherException( + "Error fetching or processing TRS descriptor metadata for {} : {} {}".format( + descriptor_url, fe.code, fe.reason + ) + ) from fe - if is_anon: - prefix_url = "" - else: - prefix_url = os.path.commonpath(tuple(file_rel_2_url.values())) - - # We have to create anonymous directories to avoid leaving the download "sandbox" - abs_download_dir = cachedFilename - if "/" in prefix_url: - # This is needed to perform an effective work - prefix_url += "/" - # Due the peversion of commonpath, double slashes are collapsed - colon_pos = prefix_url.find(":") - if colon_pos > 0: - prefix_url = ( - prefix_url[0 : colon_pos + 1] + "/" + prefix_url[colon_pos + 1 :] + is_anon = ( + not isinstance(descriptor_meta, dict) + or descriptor_meta.get("url") is None ) + if is_anon: + # This one has to be dealt in a different way + break + file_rel_2_url[file_rel_path] = urlresolv(descriptor_meta["url"]) - # Computing resolved relative paths - for file_desc in metadata: - file_rel_path = file_desc.get("path") - if file_rel_path is not None: - # BEWARE! 
The relpath could contain references to parent directories
-                # escaping from the URL to be built and from the download "sandbox"
-                frp_parsed = parse.urlparse(file_rel_path)
-                is_abs_url = frp_parsed.scheme in ("http", "https", "ftp")
-                if is_abs_url:
-                    # An absolute URL, like in the case of DDBJ TRS implementation
-                    file_url = cast("URIType", file_rel_path)
-                else:
-                    file_url = cast(
-                        "URIType",
-                        descriptor_base_url + parse.quote(file_rel_path, safe="/"),
-                    )
-                local_rel_path = file_rel_2_url[file_rel_path][len(prefix_url) :]
-                absfile = cast(
-                    "AbsPath", os.path.join(abs_download_dir, local_rel_path)
+            if emptyWorkflow:
+                raise FetcherException(
+                    "Error processing TRS files for {} : no file was found.\n{}".format(
+                        remote_file, metadata
                     )
+                )
 
-                # Intermediate path creation
-                absdir = os.path.dirname(absfile)
-                if absdir not in absdirs:
-                    absdirs.add(absdir)
-                    os.makedirs(absdir, exist_ok=True)
-                real_rel_path = os.path.relpath(
-                    os.path.normpath(absfile), cachedFilename
+            if is_anon:
+                prefix_url = ""
+            else:
+                prefix_url = os.path.commonpath(tuple(file_rel_2_url.values()))
+
+            # We have to create anonymous directories to avoid leaving the download "sandbox"
+            abs_download_dir = repo_tag_destpath
+            if "/" in prefix_url:
+                # This is needed for the prefix stripping below to work properly
+                prefix_url += "/"
+                # Due to the perversion of commonpath, double slashes are collapsed
+                colon_pos = prefix_url.find(":")
+                if colon_pos > 0:
+                    prefix_url = (
+                        prefix_url[0 : colon_pos + 1] + "/" + prefix_url[colon_pos + 1 :]
                     )
 
-                # When it is the primary descriptor, it is fetched twice
-                if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR":
-                    topMeta["workflow_entrypoint"] = cast("URIType", real_rel_path)
+                # Computing resolved relative paths
+                for file_desc in metadata:
+                    file_rel_path = file_desc.get("path")
+                    if file_rel_path is not None:
+                        # BEWARE! The relpath could contain references to parent directories
+                        # escaping from the URL to be built and from the download "sandbox"
+                        frp_parsed = parse.urlparse(file_rel_path)
+                        is_abs_url = frp_parsed.scheme in ("http", "https", "ftp")
                     if is_abs_url:
-                        topMeta["remote_workflow_entrypoint"] = file_url
+                            # An absolute URL, like in the case of DDBJ TRS implementation
+                            file_url = cast("URIType", file_rel_path)
                     else:
-                        topMeta["remote_workflow_entrypoint"] = cast(
-                            "URIType", file_rel_2_url[file_rel_path]
+                            file_url = cast(
+                                "URIType",
+                                descriptor_base_url + parse.quote(file_rel_path, safe="/"),
                         )
-
-                # Getting the raw content
-                accept_val = "*/*" if is_abs_url else "text/plain"
-                _, metaelem, _ = fetchClassicURL(
-                    file_url, absfile, {"headers": {"Accept": accept_val}}
-                )
-                metadata_array.extend(metaelem)
-    else:
-        # First pass, identify primary descriptor / workflow entrypoint
-        # and learn whether the destination paths should be sanitized
-        deepest_file_rel = 0
-        for file_desc in metadata:
-            file_rel_path = file_desc.get("path")
-            if file_rel_path is not None:
-                frp_parsed = parse.urlparse(file_rel_path)
-                if frp_parsed.scheme in ("http", "https", "ftp"):
-                    # An absolute URL, like in the case of DDBJ TRS implementation
-                    # A mixure of resource might be catastrophic, the code is doing
-                    # its best effort
-                    file_rel_path = os.path.join(frp_parsed.netloc, frp_parsed.params)
-
-                # BEWARE! The relpath could contain references to parent directories
-                # escaping from the URL to be built and from the download "sandbox"
-                # Avoid absolute paths corner case before splitting
-                file_rel_path_steps = file_rel_path.lstrip("/").split("/")
-
-                deepest = 0
-                depth = 0
-                for step in file_rel_path_steps:
-                    if step == "..":
-                        depth -= 1
-                        if depth < deepest:
-                            deepest = depth
-                    elif step not in (".", ""):
-                        depth += 1
-
-                if deepest < deepest_file_rel:
-                    deepest_file_rel = deepest
-
-        if deepest_file_rel < 0:
-            for depth in range(-deepest_file_rel):
-                abs_download_dir = cast(
-                    "AbsPath", os.path.join(abs_download_dir, f"unnamed{depth}")
-                )
-
-        # Second pass, fetching the contents, sanitizing the destination paths
-        for file_desc in metadata:
-            file_rel_path = file_desc.get("path")
-            if file_rel_path is not None:
-                emptyWorkflow = False
-
-                # BEWARE! The relpath could contain references to parent directories
-                # escaping from the URL to be built and from the download "sandbox"
-                frp_parsed = parse.urlparse(file_rel_path)
-                is_abs_url = frp_parsed.scheme in ("http", "https", "ftp")
-                if is_abs_url:
-                    # An absolute URL, like in the case of DDBJ TRS implementation
-                    file_url = cast("URIType", file_rel_path)
-                    absfile = cast(
-                        "AbsPath",
-                        os.path.join(
-                            abs_download_dir,
-                            frp_parsed.netloc,
-                            frp_parsed.path.lstrip("/"),
-                        ),
-                    )
-                else:
-                    file_url = cast(
-                        "URIType",
-                        descriptor_base_url + parse.quote(file_rel_path, safe="/"),
-                    )
-                    absfile = cast(
-                        "AbsPath",
-                        os.path.join(abs_download_dir, file_rel_path.lstrip("/")),
+                    local_rel_path = file_rel_2_url[file_rel_path][len(prefix_url) :]
+                    absfile = (abs_download_dir / local_rel_path).resolve()
+
+                    # Intermediate path creation
+                    absdir = absfile.parent
+                    if absdir not in absdirs:
+                        absdirs.add(absdir)
+                        os.makedirs(absdir, exist_ok=True)
+                    real_rel_path = absfile.relative_to(repo_tag_destpath)
+
+                    # When it is the primary descriptor, it is fetched twice
+                    if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR":
+                        topMeta["workflow_entrypoint"] = cast(
+                            "URIType", real_rel_path.as_posix()
+                        )
+                        if is_abs_url:
+                            topMeta["remote_workflow_entrypoint"] = file_url
+                        else:
+                            topMeta["remote_workflow_entrypoint"] = cast(
+                                "URIType", file_rel_2_url[file_rel_path]
+                            )
+
+                    # Getting the raw content
+                    accept_val = "*/*" if is_abs_url else "text/plain"
+                    _, metaelem, _ = self.scheme_catalog.fetch(
+                        file_url, absfile, {"headers": {"Accept": accept_val}}
                     )
+                    metadata_array.extend(metaelem)
+            else:
+                # First pass, identify primary descriptor / workflow entrypoint
+                # and learn whether the destination paths should be sanitized
+                deepest_file_rel = 0
+                for file_desc in metadata:
+                    file_rel_path = file_desc.get("path")
+                    if file_rel_path is not None:
+                        frp_parsed = parse.urlparse(file_rel_path)
+                        if frp_parsed.scheme in ("http", "https", "ftp"):
+                            # An absolute URL, like in the case of DDBJ TRS implementation
+                            # A mixture of resources might be catastrophic, the code is doing
+                            # its best effort
+                            file_rel_path = os.path.join(
+                                frp_parsed.netloc, frp_parsed.params
+                            )
 
-                # Intermediate path creation
-                absdir = os.path.dirname(absfile)
-                if absdir not in absdirs:
-                    absdirs.add(absdir)
-                    os.makedirs(absdir, exist_ok=True)
-                real_rel_path = os.path.relpath(
-                    os.path.normpath(absfile), cachedFilename
-                )
-
-                # When it is the primary descriptor, it is fetched twice
-                if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR":
-                    topMeta["workflow_entrypoint"] = cast("URIType", real_rel_path)
+                    # BEWARE!
The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + # Avoid absolute paths corner case before splitting + file_rel_path_steps = file_rel_path.lstrip("/").split("/") + + deepest = 0 + depth = 0 + for step in file_rel_path_steps: + if step == "..": + depth -= 1 + if depth < deepest: + deepest = depth + elif step not in (".", ""): + depth += 1 + + if deepest < deepest_file_rel: + deepest_file_rel = deepest + + if deepest_file_rel < 0: + for depth in range(-deepest_file_rel): + abs_download_dir = abs_download_dir / f"unnamed{depth}" + + # Second pass, fetching the contents, sanitizing the destination paths + for file_desc in metadata: + file_rel_path = file_desc.get("path") + if file_rel_path is not None: + emptyWorkflow = False + + # BEWARE! The relpath could contain references to parent directories + # escaping from the URL to be built and from the download "sandbox" + frp_parsed = parse.urlparse(file_rel_path) + is_abs_url = frp_parsed.scheme in ("http", "https", "ftp") if is_abs_url: - topMeta["remote_workflow_entrypoint"] = file_url + # An absolute URL, like in the case of DDBJ TRS implementation + file_url = cast("URIType", file_rel_path) + absfile = ( + abs_download_dir + / frp_parsed.netloc + / frp_parsed.path.lstrip("/") + ) else: - descriptorMeta = io.BytesIO() - _, metaprimary, _ = fetchClassicURL(file_url, descriptorMeta) - metadata_array.extend(metaprimary) - - # This metadata can help a lot to get the workflow repo - metadataPD = json.loads( - descriptorMeta.getvalue().decode("utf-8") + file_url = cast( + "URIType", + descriptor_base_url + parse.quote(file_rel_path, safe="/"), ) - topMeta["remote_workflow_entrypoint"] = metadataPD.get("url") + absfile = abs_download_dir / file_rel_path.lstrip("/") - del descriptorMeta - del metadataPD + absfile = absfile.resolve() - # Getting the raw content - accept_val = "*/*" if is_abs_url else "text/plain" - try: - _, metaelem, _ = fetchClassicURL( - file_url, absfile, {"headers": {"Accept": accept_val}} - ) - metadata_array.extend(metaelem) - except FetcherException as fe: - if file_desc.get("file_type") in ( - "PRIMARY_DESCRIPTOR", - "SECONDARY_DESCRIPTOR", - ): - raise - else: - warnings.warn( - f"Unable to fetch {file_url}. 
TRS Dataset {metadata_url} might be incomplete" + # Intermediate path creation + absdir = absfile.parent + if absdir not in absdirs: + absdirs.add(absdir) + absdir.mkdir(parents=True, exist_ok=True) + real_rel_path = absfile.relative_to(repo_tag_destpath) + + # When it is the primary descriptor, it is fetched twice + if file_desc.get("file_type") == "PRIMARY_DESCRIPTOR": + topMeta["workflow_entrypoint"] = cast( + "URIType", real_rel_path.as_posix() ) + if is_abs_url: + topMeta["remote_workflow_entrypoint"] = file_url + else: + descriptorMeta = io.BytesIO() + _, metaprimary, _ = self.scheme_catalog.streamfetch( + file_url, descriptorMeta + ) + metadata_array.extend(metaprimary) + + # This metadata can help a lot to get the workflow repo + metadataPD = json.loads( + descriptorMeta.getvalue().decode("utf-8") + ) + topMeta["remote_workflow_entrypoint"] = metadataPD.get( + "url" + ) + + del descriptorMeta + del metadataPD + + # Getting the raw content + accept_val = "*/*" if is_abs_url else "text/plain" + try: + _, metaelem, _ = self.scheme_catalog.fetch( + file_url, absfile, {"headers": {"Accept": accept_val}} + ) + metadata_array.extend(metaelem) + except FetcherException as fe: + if file_desc.get("file_type") in ( + "PRIMARY_DESCRIPTOR", + "SECONDARY_DESCRIPTOR", + ): + raise + else: + self.logger.warning( + f"Unable to fetch {file_url}. TRS Dataset {files_metadata_url} might be incomplete" + ) + + if emptyWorkflow: + raise FetcherException( + "Error processing TRS files for {} : no file was found.\n{}".format( + remote_file, metadata + ) + ) - if emptyWorkflow: - raise FetcherException( - "Error processing TRS files for {} : no file was found.\n{}".format( - remote_file, metadata + upstream_repo: "Optional[RemoteRepo]" = None + recommends_upstream: "bool" = False + # Checking whether it is WorkflowHub + # to recommend the generated Workflow RO-Crate + if service_info_metadata.get("organization", {}).get("name") == "WorkflowHub": + recommends_upstream = True + upstream_repo = RemoteRepo( + repo_url=cast( + "RepoURL", + files_metadata_url + + "?" + + urllib.parse.urlencode({"format": "zip"}), + ), + repo_type=RepoType.Raw, ) + elif topMeta["remote_workflow_entrypoint"] is not None: + upstream_repo = RemoteRepo( + repo_url=cast("RepoURL", topMeta["remote_workflow_entrypoint"]), + ) + + return MaterializedRepo( + local=repo_tag_destpath, + repo=RemoteRepo( + repo_url=remote_file, + rel_path=cast("Optional[RelPath]", topMeta["workflow_entrypoint"]), + repo_type=RepoType.TRS, + ), + metadata_array=metadata_array, + upstream_repo=upstream_repo, + recommends_upstream=recommends_upstream, ) - return ProtocolFetcherReturn( - kind_or_resolved=ContentKind.Directory, - metadata_array=metadata_array, - ) + def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": + """ + This method is required to generate a PID which usually + represents an element (usually a workflow) in a repository. 
+        If the fetcher does not recognize the type of repo, either using
+        repo_url content or the repo type in the worst case, it should
+        return None
+        """
+
+        # TODO: improve this to cover the different cases
+        parsedInputURL = parse.urlparse(remote_repo.repo_url)
+        if (
+            parsedInputURL.scheme
+            in (self.INTERNAL_TRS_SCHEME_PREFIX, self.TRS_SCHEME_PREFIX)
+            or remote_repo.repo_type == RepoType.TRS
+        ):
+            return remote_repo.repo_url
+
+        return None
+
+    def fetch(
+        self,
+        remote_file: "URIType",
+        cachedFilename: "PathLikePath",
+        secContext: "Optional[SecurityContextConfig]" = None,
+    ) -> "ProtocolFetcherReturn":
+        """
+        Method to download contents from TRS files related to a tool
+
+        :param remote_file:
+        :param cachedFilename: Destination filename for the fetched content
+        :param secContext: The security context containing the credentials
+        """
+
+        parsedInputURL = parse.urlparse(remote_file)
+
+        # For cases where the URI is not one of the native schemes,
+        # fall back to INTERNAL_TRS_SCHEME_PREFIX
+        if parsedInputURL.scheme not in self.GetSchemeHandlers():
+            the_remote_file = self.INTERNAL_TRS_SCHEME_PREFIX + ":" + remote_file
+        else:
+            the_remote_file = remote_file
+
+        # Getting the repoRelPath (if available)
+        params = parse.parse_qs(parsedInputURL.path, separator=";")
+        repoRelPath_l = params.get("path", [])
+        repoRelPath: "Optional[str]"
+        if len(repoRelPath_l) > 0:
+            repoRelPath = repoRelPath_l[0]
+            # Directories also end with slashes
+            repoRelPath = repoRelPath.strip("/")
+        else:
+            repoRelPath = None
+
+        # It is materialized in a temporary location
+        materialized_repo_return = self.materialize_repo_from_repo(
+            RemoteRepo(repo_url=cast("RepoURL", remote_file), repo_type=RepoType.TRS),
+        )
+        repo_tag_destdir = materialized_repo_return.local
+        remote_repo = materialized_repo_return.repo
+        metadata_array = materialized_repo_return.metadata_array
+
+        preferredName: "Optional[RelPath]"
+        # repoRelPath is only acknowledged when the resolved repo
+        # is translated to a directory
+        if repoRelPath is not None and repo_tag_destdir.is_dir():
+            cachedContentPath = repo_tag_destdir / repoRelPath
+            preferredName = cast("RelPath", cachedContentPath.name)
+        else:
+            cachedContentPath = repo_tag_destdir
+            preferredName = None
+            # This is to remove spurious detections
+            repoRelPath = None
+
+        remote_repo = remote_repo._replace(rel_path=cast("RelPath", repoRelPath))
+
+        if cachedContentPath.is_dir():
+            kind = ContentKind.Directory
+        elif cachedContentPath.is_file():
+            kind = ContentKind.File
+        else:
+            raise FetcherException(
+                f"Remote {remote_file} is neither a file nor a directory (does it exist?)"
+            )
+
+        # shutil.move(cachedContentPath, cachedFilename)
+        link_or_copy_pathlib(cachedContentPath, pathlib.Path(cachedFilename))
+
+        repo_desc: "Optional[Mapping[str, Any]]" = remote_repo.gen_repo_desc()
+        if repo_desc is None:
+            repo_desc = {}
+        augmented_metadata_array = [
+            URIWithMetadata(
+                uri=remote_file, metadata=repo_desc, preferredName=preferredName
+            ),
+            *metadata_array,
+        ]
+        return ProtocolFetcherReturn(
+            kind_or_resolved=kind,
+            metadata_array=augmented_metadata_array,
+            # TODO: Integrate licences from TRS report??
+ licences=None, + ) -# These are schemes from identifiers.org -SCHEME_HANDLERS: "Mapping[str, DocumentedProtocolFetcher]" = { - INTERNAL_TRS_SCHEME_PREFIX: DocumentedProtocolFetcher( - fetcher=fetchTRSFiles, - description="WfExS internal pseudo-scheme used to materialize files from pure TRS servers", - ), - TRS_SCHEME_PREFIX: DocumentedProtocolFetcher( - fetcher=fetchTRSFiles, - description="GA4GH TRS metadata is fetched using the APIs described at https://ga4gh.github.io/tool-registry-service-schemas/. Contents are downloaded delegating their associated URIs to other fetchers", - ), -} +INTERNAL_TRS_SCHEME_PREFIX = GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX +TRS_SCHEME_PREFIX = GA4GHTRSFetcher.TRS_SCHEME_PREFIX From 96c23a33a88a68a2dac88fbc9cfd3b4401a6f96b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 20:40:31 +0100 Subject: [PATCH 17/60] Switch to new constants from new implementation of GA4GHTRSFetcher --- wfexs_backend/workflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 5a982aba..624a75c5 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -345,7 +345,7 @@ from .utils.zipfile_path import path_relative_to from .fetchers.trs_files import ( - TRS_SCHEME_PREFIX, + GA4GHTRSFetcher, ) if TYPE_CHECKING: @@ -933,7 +933,7 @@ def getPID(self) -> "Optional[str]": the_pid = urllib.parse.urlunparse( urllib.parse.ParseResult( - scheme=TRS_SCHEME_PREFIX, + scheme=GA4GHTRSFetcher.TRS_SCHEME_PREFIX, netloc=parsedTRSURL.netloc, path="/".join(pid_steps), params="", From 300850f2622d0c353daa9a361dcca10f810968a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 20:43:40 +0100 Subject: [PATCH 18/60] Finally migrating to new scheme catalog and cache handler infrastructure. Also, migrated other bits, so next commit will be a cleanup. 
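
A rough sketch of the resulting wiring, for reviewers (illustrative
only, not part of the diff below; the cache directory path is a
made-up example):

    import pathlib

    from wfexs_backend.cache_handler import CacheHandler
    from wfexs_backend.scheme_catalog import SchemeCatalog

    # The catalog now owns the scheme handlers (aka fetchers),
    # gathering them from the wfexs_backend.fetchers module.
    scheme_catalog = SchemeCatalog()
    scheme_catalog.findAndAddSchemeHandlersFromModuleName(
        fetchers_setup_block=None,
    )

    # The cache handler only borrows the catalog to resolve schemes,
    # instead of registering the handlers itself (the role formerly
    # played by SchemeHandlerCacheHandler).
    cache_handler = CacheHandler(
        pathlib.Path("/tmp/wfexs-cache"),  # example cache directory
        scheme_catalog=scheme_catalog,
    )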
--- wfexs_backend/wfexs_backend.py | 206 +++++++-------------------------- 1 file changed, 44 insertions(+), 162 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index c37540c6..1134735f 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -83,9 +83,13 @@ ) +from .scheme_catalog import ( + SchemeCatalog, +) + from .cache_handler import ( CachedContent, - SchemeHandlerCacheHandler, + CacheHandler, ) from .container_factories import ( @@ -128,7 +132,7 @@ ) from .fetchers import ( - AbstractRepoFetcher, + AbstractSchemeRepoFetcher, AbstractStatefulFetcher, DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher, @@ -161,8 +165,7 @@ ) from .fetchers.trs_files import ( - TRS_SCHEME_PREFIX, - INTERNAL_TRS_SCHEME_PREFIX, + GA4GHTRSFetcher, ) @@ -217,7 +220,7 @@ ) from .fetchers import ( - RepoFetcher, + SchemeRepoFetcher, StatefulFetcher, ) @@ -678,17 +681,21 @@ def __init__( self.baseWorkDir = baseWorkDir self.defaultParanoidMode = False - self._sngltn: "MutableMapping[Type[AbstractStatefulFetcher], AbstractStatefulFetcher]" = ( + self._sngltn_fetcher: "MutableMapping[Type[AbstractStatefulFetcher], AbstractStatefulFetcher]" = ( dict() ) - self.repo_fetchers: "MutableSequence[AbstractRepoFetcher]" = list() + self.repo_fetchers: "MutableSequence[AbstractSchemeRepoFetcher]" = list() + # scheme_catalog is created on first use + self.scheme_catalog = SchemeCatalog() # cacheHandler is created on first use - self.cacheHandler = SchemeHandlerCacheHandler(self.cacheDir) + self.cacheHandler = CacheHandler( + self.cacheDir, scheme_catalog=self.scheme_catalog + ) fetchers_setup_block = local_config.get("fetchers-setup") # All the scheme handlers should be added here - self.findAndAddSchemeHandlersFromModuleName( + self.scheme_catalog.findAndAddSchemeHandlersFromModuleName( fetchers_setup_block=fetchers_setup_block ) @@ -748,7 +755,7 @@ def cacheWorkflowInputsDir(self) -> "pathlib.Path": def getCacheHandler( self, cache_type: "CacheType" - ) -> "Tuple[SchemeHandlerCacheHandler, Optional[pathlib.Path]]": + ) -> "Tuple[CacheHandler, Optional[pathlib.Path]]": return self.cacheHandler, self.cachePathMap.get(cache_type) def instantiateStatefulFetcher( @@ -759,25 +766,29 @@ def instantiateStatefulFetcher( """ Method to instantiate stateful fetchers once """ - instStatefulFetcher = self._sngltn.get(statefulFetcher) + instStatefulFetcher = self._sngltn_fetcher.get(statefulFetcher) if instStatefulFetcher is None: # Setting the default list of programs for prog in statefulFetcher.GetNeededPrograms(): self.progs.setdefault(prog, cast("RelPath", prog)) # Let's augment the list of needed progs by this # stateful fetcher - instStatefulFetcher = self.cacheHandler.instantiateStatefulFetcher( + instStatefulFetcher = self.scheme_catalog.instantiateStatefulFetcher( statefulFetcher, progs=self.progs, setup_block=setup_block ) - self._sngltn[statefulFetcher] = instStatefulFetcher + self._sngltn_fetcher[statefulFetcher] = instStatefulFetcher + + # Also, if it is a repository fetcher, record it separately + if isinstance(instStatefulFetcher, AbstractSchemeRepoFetcher): + self.repo_fetchers.append(instStatefulFetcher) return cast("StatefulFetcher", instStatefulFetcher) def instantiateRepoFetcher( self, - repoFetcher: "Type[RepoFetcher]", + repoFetcher: "Type[SchemeRepoFetcher]", setup_block: "Optional[Mapping[str, Any]]" = None, - ) -> "RepoFetcher": + ) -> "SchemeRepoFetcher": """ Method to instantiate repo fetchers once """ @@ -995,141 +1006,6 @@ def 
getExportPluginClass( ) -> "Optional[Type[AbstractExportPlugin]]": return self._export_plugins.get(plugin_id) - def findAndAddSchemeHandlersFromModuleName( - self, - the_module_name: "str" = "wfexs_backend.fetchers", - fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: - try: - the_module = importlib.import_module(the_module_name) - self.findAndAddSchemeHandlersFromModule( - the_module, - fetchers_setup_block=fetchers_setup_block, - ) - except Exception as e: - errmsg = f"Unable to import module {the_module_name} in order to gather scheme handlers, due errors:" - self.logger.exception(errmsg) - raise WfExSBackendException(errmsg) from e - - def findAndAddSchemeHandlersFromModule( - self, - the_module: "ModuleType", - fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: - for finder, module_name, ispkg in iter_namespace(the_module): - try: - named_module = importlib.import_module(module_name) - except: - self.logger.exception( - f"Skipping module {module_name} in order to gather scheme handlers, due errors:" - ) - continue - - # First, try locating a variable named SCHEME_HANDLERS - # then, the different class declarations inheriting - # from AbstractStatefulFetcher - skipit = True - for name, obj in inspect.getmembers(named_module): - if name == "SCHEME_HANDLERS": - if isinstance(obj, dict): - self.addSchemeHandlers( - obj, - fetchers_setup_block=fetchers_setup_block, - ) - skipit = False - elif ( - inspect.isclass(obj) - and not inspect.isabstract(obj) - and issubclass(obj, AbstractStatefulFetcher) - ): - # Now, let's learn whether the class is enabled - if getattr(obj, "ENABLED", False): - self.addStatefulSchemeHandlers( - obj, - fetchers_setup_block=fetchers_setup_block, - ) - skipit = False - - if skipit: - self.logger.debug( - f"Fetch module {named_module} was not eligible (no SCHEME_HANDLERS dictionary or subclass of {AbstractStatefulFetcher.__name__})" - ) - - def addStatefulSchemeHandlers( - self, - statefulSchemeHandler: "Type[AbstractStatefulFetcher]", - fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: - """ - This method adds scheme handlers (aka "fetchers") from - a given stateful fetcher, also adding the needed programs - """ - - # Get the scheme handlers from this fetcher - schemeHandlers = statefulSchemeHandler.GetSchemeHandlers() - - self.addSchemeHandlers( - schemeHandlers, fetchers_setup_block=fetchers_setup_block - ) - - # This pattern is used to validate the schemes - SCHEME_PAT: "Final[Pattern[str]]" = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*$") - - def addSchemeHandlers( - self, - schemeHandlers: "Mapping[str, Union[DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher]]", - fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: - """ - This method adds scheme handlers (aka "fetchers") - or instantiates stateful scheme handlers (aka "stateful fetchers") - """ - if isinstance(schemeHandlers, dict): - instSchemeHandlers = dict() - if fetchers_setup_block is None: - fetchers_setup_block = dict() - for scheme, schemeHandler in schemeHandlers.items(): - if self.SCHEME_PAT.search(scheme) is None: - self.logger.warning( - f"Fetcher associated to scheme {scheme} has been skipped, as the scheme does not comply with RFC3986" - ) - continue - - lScheme = scheme.lower() - # When no setup block is available for the scheme fetcher, - # provide an empty one - setup_block = fetchers_setup_block.get(lScheme, dict()) - - instSchemeHandler = 
None - if isinstance(schemeHandler, DocumentedStatefulProtocolFetcher): - instSchemeInstance = self.instantiateStatefulFetcher( - schemeHandler.fetcher_class, setup_block=setup_block - ) - if instSchemeInstance is not None: - instSchemeHandler = DocumentedProtocolFetcher( - fetcher=instSchemeInstance.fetch, - description=instSchemeInstance.description - if schemeHandler.description is None - else schemeHandler.description, - priority=schemeHandler.priority, - ) - - # Also, if it is a repository fetcher, record it separately - if isinstance(instSchemeInstance, AbstractRepoFetcher): - self.repo_fetchers.append(instSchemeInstance) - elif isinstance(schemeHandler, DocumentedProtocolFetcher) and callable( - schemeHandler.fetcher - ): - instSchemeHandler = schemeHandler - - # Only the ones which have overcome the sanity checks - if instSchemeHandler is not None: - # Schemes are case insensitive, so register only - # the lowercase version - instSchemeHandlers[lScheme] = instSchemeHandler - - self.cacheHandler.addRawSchemeHandlers(instSchemeHandlers) - def gen_workflow_pid(self, remote_repo: "RemoteRepo") -> "str": """ This method tries generating the workflow pid passing the remote @@ -1147,7 +1023,7 @@ def gen_workflow_pid(self, remote_repo: "RemoteRepo") -> "str": return remote_repo.repo_url if retval is None else retval def describeFetchableSchemes(self) -> "Sequence[Tuple[str, str, int]]": - return self.cacheHandler.describeRegisteredSchemes() + return self.scheme_catalog.describeRegisteredSchemes() def newSetup( self, @@ -2084,14 +1960,18 @@ def cacheWorkflow( repoDir: "Optional[pathlib.Path]" = None putative: "bool" = False cached_putative_path: "Optional[pathlib.Path]" = None - if parsedRepoURL.scheme in ("", TRS_SCHEME_PREFIX, INTERNAL_TRS_SCHEME_PREFIX): + if parsedRepoURL.scheme in ( + "", + GA4GHTRSFetcher.TRS_SCHEME_PREFIX, + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX, + ): # Extracting the TRS endpoint details from the parsedRepoURL - if parsedRepoURL.scheme == TRS_SCHEME_PREFIX: + if parsedRepoURL.scheme == GA4GHTRSFetcher.TRS_SCHEME_PREFIX: # Duplication of code borrowed from trs_files.py path_steps: "Sequence[str]" = parsedRepoURL.path.split("/") if len(path_steps) < 3 or path_steps[0] != "": raise WfExSBackendException( - f"Ill-formed TRS CURIE {putative_repo_url}. It should be in the format of {TRS_SCHEME_PREFIX}://id/version or {TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" + f"Ill-formed TRS CURIE {putative_repo_url}. 
It should be in the format of {GA4GHTRSFetcher.TRS_SCHEME_PREFIX}://id/version or {GA4GHTRSFetcher.TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" ) trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) trs_steps.extend(["ga4gh", "trs", "v2", ""]) @@ -2108,7 +1988,7 @@ def cacheWorkflow( workflow_id = urllib.parse.unquote(path_steps[-2]) version_id = urllib.parse.unquote(path_steps[-1]) - elif parsedRepoURL.scheme == INTERNAL_TRS_SCHEME_PREFIX: + elif parsedRepoURL.scheme == GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX: # Time to try guessing everything try: internal_trs_cached_content = self.cacheHandler.fetch( @@ -2584,7 +2464,10 @@ def getWorkflowRepoFromTRS( # Learning the available files and maybe # which is the entrypoint to the workflow cached_trs_files = self.cacheFetch( - cast("URIType", INTERNAL_TRS_SCHEME_PREFIX + ":" + toolFilesURL), + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + ":" + toolFilesURL, + ), CacheType.TRS, offline=offline, ignoreCache=ignoreCache, @@ -2668,7 +2551,7 @@ def doMaterializeRepo( doUpdate: "bool" = True, registerInCache: "bool" = True, ) -> "Tuple[pathlib.Path, RepoTag]": - fetcher_clazz: "Optional[Type[AbstractRepoFetcher]]" = None + fetcher_clazz: "Optional[Type[AbstractSchemeRepoFetcher]]" = None if repo.repo_type not in (RepoType.Other, RepoType.SoftwareHeritage): fetcher_clazz = GitFetcher elif repo.repo_type == RepoType.SoftwareHeritage: @@ -2680,15 +2563,14 @@ def doMaterializeRepo( ) fetcher = self.instantiateRepoFetcher(fetcher_clazz) - ( - repo_path, - materialized_repo, - metadata_array, - ) = fetcher.materialize_repo_from_repo_transient( + materialized_repo_return = fetcher.materialize_repo_from_repo( repo, doUpdate=doUpdate, base_repo_destdir=self.cacheWorkflowDir, ) + repo_path = materialized_repo_return.local + materialized_repo = materialized_repo_return.repo + metadata_array = materialized_repo_return.metadata_array # Now, let's register the checkout with cache structures # using its public URI From 348eb2f05597cfc2b8b325a67b4d270902d32feb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 23 Jan 2025 20:47:53 +0100 Subject: [PATCH 19/60] Cleanup of glue code which helped to perform the migration --- wfexs_backend/cache_handler.py | 36 ----------------------------- wfexs_backend/fetchers/__init__.py | 16 ------------- wfexs_backend/fetchers/http.py | 27 ---------------------- wfexs_backend/fetchers/trs_files.py | 4 ---- 4 files changed, 83 deletions(-) diff --git a/wfexs_backend/cache_handler.py b/wfexs_backend/cache_handler.py index ba462d5d..ee7a80a9 100644 --- a/wfexs_backend/cache_handler.py +++ b/wfexs_backend/cache_handler.py @@ -1135,39 +1135,3 @@ def fetch( fingerprint=final_fingerprint, clonable=clonable, ) - - -class SchemeHandlerCacheHandler(CacheHandler): - def __init__( - self, - cacheDir: "pathlib.Path", - schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" = dict(), - ): - scheme_catalog = SchemeCatalog(scheme_handlers=schemeHandlers) - super().__init__(cacheDir, scheme_catalog) - - def addRawSchemeHandlers( - self, schemeHandlers: "Mapping[str, DocumentedProtocolFetcher]" - ) -> None: - self.scheme_catalog.addRawSchemeHandlers(schemeHandlers) - - def bypassSchemeHandlers( - self, - schemeHandlers: "Mapping[str, Union[DocumentedStatefulProtocolFetcher, DocumentedProtocolFetcher]]", - ) -> None: - self.scheme_catalog.bypassSchemeHandlers(schemeHandlers) - - def instantiateStatefulFetcher( - self, - statefulFetcher: "Type[StatefulFetcher]", - progs: 
"ProgsMapping" = dict(), - setup_block: "Optional[Mapping[str, Any]]" = None, - ) -> "StatefulFetcher": - return self.scheme_catalog.instantiateStatefulFetcher( - statefulFetcher, - progs=progs, - setup_block=setup_block, - ) - - def describeRegisteredSchemes(self) -> "Sequence[Tuple[str, str, int]]": - return self.scheme_catalog.describeRegisteredSchemes() diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index 45e55fc4..d0d1f06a 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -349,21 +349,6 @@ def materialize_repo( """ pass - def materialize_repo_from_repo_transient( - self, - repo: "RemoteRepo", - repo_tag_destdir: "Optional[PathLikePath]" = None, - base_repo_destdir: "Optional[PathLikePath]" = None, - doUpdate: "Optional[bool]" = True, - ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": - return self.materialize_repo( - repo.repo_url, - repo.tag, - repo_tag_destdir=repo_tag_destdir, - base_repo_destdir=base_repo_destdir, - doUpdate=doUpdate, - ) - @abc.abstractmethod def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": """ @@ -458,7 +443,6 @@ def materialize_repo_from_repo( if TYPE_CHECKING: - RepoFetcher = TypeVar("RepoFetcher", bound=AbstractRepoFetcher) SchemeRepoFetcher = TypeVar("SchemeRepoFetcher", bound=AbstractSchemeRepoFetcher) diff --git a/wfexs_backend/fetchers/http.py b/wfexs_backend/fetchers/http.py index da2f81e1..d630b468 100644 --- a/wfexs_backend/fetchers/http.py +++ b/wfexs_backend/fetchers/http.py @@ -215,30 +215,3 @@ def streamfetch( kind_or_resolved=ContentKind.File, metadata_array=[uri_with_metadata], ) - - -def fetchClassicURL( - remote_file: "URIType", - cachedFilename: "Union[PathLikePath, IO[bytes]]", - secContext: "Optional[SecurityContextConfig]" = None, -) -> "ProtocolFetcherReturn": - if isinstance(cachedFilename, (str, os.PathLike)): - return HTTPFetcher().fetch(remote_file, cachedFilename, secContext=secContext) - else: - return HTTPFetcher().streamfetch( - remote_file, cachedFilename, secContext=secContext - ) - - -SCHEME_HANDLERS: "Mapping[str, DocumentedProtocolFetcher]" = { - "http": DocumentedProtocolFetcher( - fetcher=fetchClassicURL, - description="HTTP download URLs", - priority=20, - ), - "https": DocumentedProtocolFetcher( - fetcher=fetchClassicURL, - description="HTTPS download URLs", - priority=20, - ), -} diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 97b7dd44..9600662d 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -750,7 +750,3 @@ def fetch( # TODO: Integrate licences from TRS report?? 
licences=None, ) - - -INTERNAL_TRS_SCHEME_PREFIX = GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX -TRS_SCHEME_PREFIX = GA4GHTRSFetcher.TRS_SCHEME_PREFIX From e517b404036f0862d250fdbf3faa019c3aab7996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 24 Jan 2025 12:13:15 +0100 Subject: [PATCH 20/60] Some pending cleanups of glue code added to ease past contribution integration --- wfexs_backend/fetchers/__init__.py | 95 +++++++----------------------- 1 file changed, 22 insertions(+), 73 deletions(-) diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index d0d1f06a..ddc52e0c 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -317,61 +317,9 @@ class MaterializedRepo(NamedTuple): recommends_upstream: "bool" = False -class AbstractRepoFetcher(AbstractStatefulFetcher): +class AbstractSchemeRepoFetcher(AbstractStatefulFetcher): PRIORITY: "ClassVar[int]" = DEFAULT_PRIORITY + 10 - @abc.abstractmethod - def materialize_repo( - self, - repoURL: "RepoURL", - repoTag: "Optional[RepoTag]" = None, - repo_tag_destdir: "Optional[PathLikePath]" = None, - base_repo_destdir: "Optional[PathLikePath]" = None, - doUpdate: "Optional[bool]" = True, - ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": - """ - Subclasses have to implement this method, which is used to materialize - a repository described by a RemoteRepo instance. - - :param repo: The description of the repository to be materialized. - :type repo: class: `wfexs_backend.fetchers.RemoteRepo` - :param repo_tag_destdir: Destination of the materialized repo. - :type repo_tag_destdir: str, `os.PathLike[str]`, optional - :param base_repo_destdir: If repo_tag_destdir is None, parent directory of the newly created destination directory for the repo. - :type base_repo_destdir: str, `os.PathLike[str]`, optional - :param doUpdate: Should the code try updating an already materialized repo? Defaults to False - :type doUpdate: bool - - The returned tuple has next elements: - * The local path where the repo was materialized. - * A RemoteRepo instance. - * The metadata gathered through the materialisation process. - """ - pass - - @abc.abstractmethod - def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": - """ - This method is required to generate a PID which usually - represents an element (usually a workflow) in a repository. 
- If the fetcher does not recognize the type of repo, either using - repo_url content or the repo type in the worst case, it should - return None - """ - pass - - @classmethod - @abc.abstractmethod - def GuessRepoParams( - cls, - orig_wf_url: "Union[URIType, parse.ParseResult]", - logger: "Optional[logging.Logger]" = None, - fail_ok: "bool" = False, - ) -> "Optional[RemoteRepo]": - pass - - -class AbstractSchemeRepoFetcher(AbstractRepoFetcher): """ This abstract subclass is used to force the initialization of the scheme catalog instance @@ -391,26 +339,6 @@ def __init__( ) self.scheme_catalog: "SchemeCatalog" - def materialize_repo( - self, - repoURL: "RepoURL", - repoTag: "Optional[RepoTag]" = None, - repo_tag_destdir: "Optional[PathLikePath]" = None, - base_repo_destdir: "Optional[PathLikePath]" = None, - doUpdate: "Optional[bool]" = True, - ) -> "Tuple[pathlib.Path, RemoteRepo, Sequence[URIWithMetadata]]": - mrepo = self.materialize_repo_from_repo( - RemoteRepo( - repo_url=repoURL, - tag=repoTag, - ), - repo_tag_destdir=repo_tag_destdir, - base_repo_destdir=base_repo_destdir, - doUpdate=doUpdate, - ) - - return mrepo.local, mrepo.repo, mrepo.metadata_array - @abc.abstractmethod def materialize_repo_from_repo( self, @@ -441,6 +369,27 @@ def materialize_repo_from_repo( """ pass + @abc.abstractmethod + def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": + """ + This method is required to generate a PID which usually + represents an element (usually a workflow) in a repository. + If the fetcher does not recognize the type of repo, either using + repo_url content or the repo type in the worst case, it should + return None + """ + pass + + @classmethod + @abc.abstractmethod + def GuessRepoParams( + cls, + orig_wf_url: "Union[URIType, parse.ParseResult]", + logger: "Optional[logging.Logger]" = None, + fail_ok: "bool" = False, + ) -> "Optional[RemoteRepo]": + pass + if TYPE_CHECKING: SchemeRepoFetcher = TypeVar("SchemeRepoFetcher", bound=AbstractSchemeRepoFetcher) From b115870252fe74d049d2bb682a901b11d3e52d8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 24 Jan 2025 12:46:53 +0100 Subject: [PATCH 21/60] Fixed TRS endpoint --- workflow_examples/somatic_cnv_dockstore_cwl.wfex.stage | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow_examples/somatic_cnv_dockstore_cwl.wfex.stage b/workflow_examples/somatic_cnv_dockstore_cwl.wfex.stage index dc6850eb..9e4adf4a 100644 --- a/workflow_examples/somatic_cnv_dockstore_cwl.wfex.stage +++ b/workflow_examples/somatic_cnv_dockstore_cwl.wfex.stage @@ -1,7 +1,7 @@ # THIS IS AN INCOMPLETE EXAMPLE (use as template) # Use this example only to test TRS access to Dockstore works in stage, # as its parameters are not properly set -trs_endpoint: https://dockstore.org/api/api/ga4gh/v2/ +trs_endpoint: https://dockstore.org/api/ga4gh/v2/ workflow_id: '#workflow/github.com/sevenbridges-openworkflows/Broad-Best-Practice-Somatic-CNV-Workflows/GATK-Somatic-CNV-Panel-Workflow' version: master workflow_config: From 475a9da89e4b96d03446f692485c9befa8e6ee93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 25 Jan 2025 01:19:07 +0100 Subject: [PATCH 22/60] Avoid circular reference issue due wfexs_backend.fetchers importing wfexs_backend.scheme_catalog ... which is importing several elements from wfexs_backend.fetchers . 
---
 wfexs_backend/fetchers/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py
index ddc52e0c..c5ed9385 100644
--- a/wfexs_backend/fetchers/__init__.py
+++ b/wfexs_backend/fetchers/__init__.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 # SPDX-License-Identifier: Apache-2.0
-# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain
+# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -70,6 +70,10 @@
         URIWithMetadata,
     )
 
+    from ..scheme_catalog import (
+        SchemeCatalog,
+    )
+
     class RepoDesc(TypedDict):
         repo: Required[RepoURL]
         tag: Required[Optional[RepoTag]]
@@ -112,10 +116,6 @@ class ProtocolFetcherReturn(NamedTuple):
     AbstractWfExSException,
 )
 
-from ..scheme_catalog import (
-    SchemeCatalog,
-)
-
 # Default priority
 DEFAULT_PRIORITY: "Final[int]" = 0

From 7e6cc3b480956108760b760e2d04c8776380daed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Sat, 25 Jan 2025 01:21:25 +0100
Subject: [PATCH 23/60] Priority of git and swh fetchers has been raised

---
 wfexs_backend/fetchers/git.py | 5 ++++-
 wfexs_backend/fetchers/swh.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py
index cf96ef23..552838d9 100644
--- a/wfexs_backend/fetchers/git.py
+++ b/wfexs_backend/fetchers/git.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 # SPDX-License-Identifier: Apache-2.0
-# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain
+# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,6 +33,7 @@
 
 from typing import (
     Any,
+    ClassVar,
     Mapping,
     MutableMapping,
     MutableSequence,
@@ -96,6 +97,8 @@
 
 
 class GitFetcher(AbstractSchemeRepoFetcher):
+    PRIORITY: "ClassVar[int]" = AbstractSchemeRepoFetcher.PRIORITY + 10
+
     GIT_PROTO: "Final[str]" = "git"
     GIT_PROTO_PREFIX: "Final[str]" = GIT_PROTO + "+"
     GITHUB_SCHEME: "Final[str]" = "github"
diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py
index 7f5f8556..fb2485b5 100644
--- a/wfexs_backend/fetchers/swh.py
+++ b/wfexs_backend/fetchers/swh.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 # SPDX-License-Identifier: Apache-2.0
-# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain
+# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -37,6 +37,7 @@ from typing import ( Any, + ClassVar, IO, Mapping, MutableSequence, @@ -89,6 +90,8 @@ class SoftwareHeritageFetcher(AbstractSchemeRepoFetcher): + PRIORITY: "ClassVar[int]" = AbstractSchemeRepoFetcher.PRIORITY + 20 + SOFTWARE_HERITAGE_SCHEME: "Final[str]" = "swh" SWH_API_REST: "Final[str]" = "https://archive.softwareheritage.org/api/1/" SWH_API_REST_KNOWN: "Final[URIType]" = cast( From deecca0cf57ff4a28d9ef6aa022c1ea0331d07f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 25 Jan 2025 01:22:26 +0100 Subject: [PATCH 24/60] Fixed issue with wrongly initialized SchemeCatalog --- tests/fetchers/test_swh.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/fetchers/test_swh.py b/tests/fetchers/test_swh.py index 79afd2cc..c170a5aa 100644 --- a/tests/fetchers/test_swh.py +++ b/tests/fetchers/test_swh.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -52,11 +52,6 @@ from wfexs_backend.fetchers.swh import SoftwareHeritageFetcher -WfExS_basedir = Path(__file__).parent.parent -WfExS_basedir_file_uri = WfExS_basedir.as_uri() -WfExS_git_basedir = WfExS_basedir / ".git" -WfExS_git_basedir_file_uri = WfExS_git_basedir.as_uri() - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -147,7 +142,7 @@ def test_build_swh_pid_from_repo( pytest.skip("Skipped test because no remote repo was provided") else: scheme_catalog = SchemeCatalog( - scheme_handlers=SoftwareHeritageFetcher.GetSchemeHandlers(), + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), ) fetcher = SoftwareHeritageFetcher(scheme_catalog, progs={}) From 7960100c6fb73025326fb08046e082e655e367a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 27 Jan 2025 00:18:47 +0100 Subject: [PATCH 25/60] Added initial tests to check proper functionality of GA4GHTRSFetcher as a proper repo fetcher. Some of the tests are still failing. --- tests/fetchers/test_trs.py | 308 ++++++++++++++++++++++++++++ wfexs_backend/fetchers/trs_files.py | 239 ++++++++++++++++++++- 2 files changed, 545 insertions(+), 2 deletions(-) create mode 100644 tests/fetchers/test_trs.py diff --git a/tests/fetchers/test_trs.py b/tests/fetchers/test_trs.py new file mode 100644 index 00000000..ea05bd78 --- /dev/null +++ b/tests/fetchers/test_trs.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import logging + +from pathlib import Path + +from typing import ( + cast, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from typing import ( + Optional, + ) + + from wfexs_backend.common import ( + RelPath, + RepoTag, + RepoURL, + TRS_Workflow_Descriptor, + URIType, + ) + + from wfexs_backend.workflow import ( + WFVersionId, + WorkflowId, + ) + +from wfexs_backend.scheme_catalog import ( + SchemeCatalog, +) + +from wfexs_backend.fetchers import ( + RemoteRepo, + RepoGuessFlavor, + RepoType, +) + +from wfexs_backend.fetchers.http import HTTPFetcher + +from wfexs_backend.fetchers.trs_files import GA4GHTRSFetcher + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +TRS_PARAMS_TESTBED = pytest.mark.parametrize( + [ + "trs_endpoint", + "workflow_id", + "version_id", + "descriptor_type", + "url", + "remote_repo", + "repo_pid", + ], + [ + ( + "https://dockstore.org/api/ga4gh/trs/v2/", + cast( + "WorkflowId", + "#workflow/github.com/sevenbridges-openworkflows/Broad-Best-Practice-Somatic-CNV-Workflows/GATK-Somatic-CNV-Panel-Workflow", + ), + cast("Optional[WFVersionId]", "master"), + None, + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2Fsevenbridges-openworkflows%2FBroad-Best-Practice-Somatic-CNV-Workflows%2FGATK-Somatic-CNV-Panel-Workflow/versions/master", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2Fsevenbridges-openworkflows%2FBroad-Best-Practice-Somatic-CNV-Workflows%2FGATK-Somatic-CNV-Panel-Workflow/versions/master", + ), + repo_type=RepoType.TRS, + ), + "trs://dockstore.org/api/%23workflow%2Fgithub.com%2Fsevenbridges-openworkflows%2FBroad-Best-Practice-Somatic-CNV-Workflows%2FGATK-Somatic-CNV-Panel-Workflow/master", + ), + ( + "https://dockstore.org/api/ga4gh/trs/v2/", + cast( + "WorkflowId", "#workflow/github.com/NCI-GDC/gdc-dnaseq-cwl/GDC_DNASeq" + ), + cast("Optional[WFVersionId]", "master"), + None, + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/versions/master", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/versions/master", + ), + repo_type=RepoType.TRS, + ), + "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/master", + ), + ( + "https://dockstore.org/api/ga4gh/trs/v2/", + cast( + "WorkflowId", "#workflow/github.com/NCI-GDC/gdc-dnaseq-cwl/GDC_DNASeq" + ), + None, + None, + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq", + ), + repo_type=RepoType.TRS, + ), + "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq", + ), + ( + "https://workflowhub.eu/ga4gh/trs/v2/tools/", + cast("WorkflowId", 107), + cast("Optional[WFVersionId]", 1), + None, + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://workflowhub.eu/ga4gh/trs/v2/tools/107/versions/1", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + 
"https://workflowhub.eu/ga4gh/trs/v2/tools/107/versions/1", + ), + repo_type=RepoType.TRS, + ), + "trs://workflowhub.eu/107/1", + ), + ( + "https://workflowhub.eu/ga4gh/trs/v2/tools/", + cast("WorkflowId", 106), + cast("Optional[WFVersionId]", 3), + None, + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://workflowhub.eu/ga4gh/trs/v2/tools/106/versions/3", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/106/versions/3", + ), + repo_type=RepoType.TRS, + ), + "trs://workflowhub.eu/106/3", + ), + ( + "https://workflowhub.eu/ga4gh/trs/v2/", + cast("WorkflowId", 119), + cast("Optional[WFVersionId]", 1), + None, + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://workflowhub.eu/ga4gh/trs/v2/tools/119/versions/1", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/119/versions/1", + ), + repo_type=RepoType.TRS, + ), + "trs://workflowhub.eu/119/1", + ), + ( + "https://workflowhub.eu/ga4gh/trs/v2/tools/", + cast("WorkflowId", 244), + cast("Optional[WFVersionId]", 4), + None, + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://workflowhub.eu/ga4gh/trs/v2/tools/244/versions/4", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/244/versions/4", + ), + repo_type=RepoType.TRS, + ), + "trs://workflowhub.eu/244/4", + ), + ( + "https://ddbj.github.io/workflow-registry/", + cast("WorkflowId", "0d2ae4c2-fe4c-48f7-811a-ac277776533e"), + cast("Optional[WFVersionId]", "1.0.0"), + None, + cast( + "URIType", + GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + + ":" + + "https://ddbj.github.io/workflow-registry/tools/0d2ae4c2-fe4c-48f7-811a-ac277776533e/versions/1.0.0", + ), + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://ddbj.github.io/workflow-registry/tools/0d2ae4c2-fe4c-48f7-811a-ac277776533e/versions/1.0.0", + ), + repo_type=RepoType.TRS, + ), + "trs://ddbj.github.io/workflow-registry/0d2ae4c2-fe4c-48f7-811a-ac277776533e/1.0.0", + ), + ], +) + + +@TRS_PARAMS_TESTBED +def test_guess_trs_repo_params( + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + descriptor_type: "Optional[TRS_Workflow_Descriptor]", + url: "str", + remote_repo: "Optional[RemoteRepo]", + repo_pid: "Optional[str]", +) -> "None": + output = GA4GHTRSFetcher.GuessRepoParams(cast("URIType", url), logger=logger) + + # When no web url is given, ignore what it was discovered + if output is not None and remote_repo is not None: + if remote_repo.web_url is None: + output = output._replace(web_url=None) + # For now, patch this + if remote_repo.checkout is None: + output = output._replace(checkout=None) + assert output == remote_repo + + +@TRS_PARAMS_TESTBED +def test_build_trs_internal_url_from_repo( + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + descriptor_type: "Optional[TRS_Workflow_Descriptor]", + url: "str", + remote_repo: "Optional[RemoteRepo]", + repo_pid: "Optional[str]", +) -> "None": + output = GA4GHTRSFetcher.BuildRepoPIDFromTRSParams( + trs_endpoint, + workflow_id, + version_id, + descriptor_type, + ) + + assert output == url + + +@TRS_PARAMS_TESTBED +def test_build_trs_pid_from_repo( + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + descriptor_type: "Optional[TRS_Workflow_Descriptor]", + url: "str", + remote_repo: "Optional[RemoteRepo]", + repo_pid: "Optional[str]", +) -> 
"None": + if remote_repo is None: + pytest.skip("Skipped test because no remote repo was provided") + else: + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), + ) + + fetcher = GA4GHTRSFetcher(scheme_catalog, progs={}) + output = fetcher.build_pid_from_repo(remote_repo) + + assert output in (url, repo_pid) diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 9600662d..8d92e914 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -62,6 +62,8 @@ urlresolv, ) +from .http import HTTPFetcher + if TYPE_CHECKING: from typing import ( Any, @@ -86,9 +88,15 @@ RepoURL, SecurityContextConfig, SymbolicName, + TRS_Workflow_Descriptor, URIType, ) + from ..workflow import ( + WFVersionId, + WorkflowId, + ) + from ..scheme_catalog import ( SchemeCatalog, ) @@ -98,6 +106,7 @@ class GA4GHTRSFetcher(AbstractSchemeRepoFetcher): INTERNAL_TRS_SCHEME_PREFIX: "Final[str]" = "wfexs.trs.files" TRS_SCHEME_PREFIX: "Final[str]" = "trs" + TRS_TOOLS_SUFFIX: "Final[str]" = "tools/" TRS_FILES_SUFFIX: "Final[str]" = "/files" TRS_DESCRIPTOR_INFIX: "Final[str]" = "/descriptor/" @@ -133,7 +142,233 @@ def GuessRepoParams( logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, ) -> "Optional[RemoteRepo]": - pass + # Deciding which is the input + wf_url: "RepoURL" + parsed_wf_url: "parse.ParseResult" + if isinstance(orig_wf_url, parse.ParseResult): + parsed_wf_url = orig_wf_url + wf_url = cast("RepoURL", parse.urlunparse(orig_wf_url)) + else: + wf_url = cast("RepoURL", orig_wf_url) + parsed_wf_url = parse.urlparse(orig_wf_url) + + if parsed_wf_url.scheme in HTTPFetcher.GetSchemeHandlers(): + wf_url = cast("RepoURL", cls.INTERNAL_TRS_SCHEME_PREFIX + ":" + wf_url) + parsed_wf_url = parse.urlparse(wf_url) + + putative_tool_uri: "Optional[URIType]" = None + if parsed_wf_url.scheme == cls.TRS_SCHEME_PREFIX: + # Duplication of code borrowed from trs_files.py + path_steps: "Sequence[str]" = parsed_wf_url.path.split("/") + if len(path_steps) < 3 or path_steps[0] != "": + if fail_ok: + return None + raise FetcherException( + f"Ill-formed TRS CURIE {wf_url}. 
It should be in the format of {cls.TRS_SCHEME_PREFIX}://server/id/version or {cls.TRS_SCHEME_PREFIX}://server-plus-prefix-with-slashes/id/version" + ) + trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) + trs_steps.extend(["ga4gh", "trs", "v2", ""]) + trs_endpoint = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme="https", + netloc=parsed_wf_url.netloc, + path="/".join(trs_steps), + params="", + query="", + fragment="", + ) + ) + + workflow_id = urllib.parse.unquote(path_steps[-2]) + version_id = urllib.parse.unquote(path_steps[-1]) + putative_tool_uri = cast("URIType", wf_url) + elif parsed_wf_url.scheme == cls.INTERNAL_TRS_SCHEME_PREFIX: + putative_tool_uri = cast("URIType", parsed_wf_url.path) + parsed_putative_tool_uri = urllib.parse.urlparse(putative_tool_uri) + is_wh = parsed_putative_tool_uri.netloc.endswith("workflowhub.eu") + + # Time to try guessing everything + try: + resio = io.BytesIO() + _, metaresio, _ = HTTPFetcher().streamfetch( + putative_tool_uri, + resio, + secContext={ + "headers": { + "Accept": "application/json", + }, + }, + ) + trs__meta = json.loads(resio.getvalue().decode("utf-8")) + except Exception as e: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (raised exception)" + ) from e + + if not isinstance(trs__meta, dict): + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (not returning JSON object)" + ) + + trs__meta__id: "Optional[str]" = trs__meta.get("id") + if trs__meta__id is None: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (not returning id)" + ) + + trs__meta__url: "Optional[str]" = trs__meta.get("url") + if trs__meta__url is None: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (not returning url)" + ) + + # Non compliant emitted trs__meta__url + if is_wh: + trs__meta__url = putative_tool_uri + + if "descriptor_type" in trs__meta: + version_id = trs__meta__id + # Now we need to backtrack in the url to get the workflow id + tool_url_suffix = "/versions/" + urllib.parse.quote(version_id, safe="") + + # If this happens, this implementation is not so compliant with standard + dockstore_tool_url_suffix = "/versions/" + urllib.parse.quote( + trs__meta.get("name", ""), safe="" + ) + if trs__meta__url.endswith(dockstore_tool_url_suffix): + tool_url_suffix = dockstore_tool_url_suffix + version_id = trs__meta.get("name", "") + + if not trs__meta__url.endswith(tool_url_suffix): + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {putative_tool_uri} and {trs__meta__url} (version {version_id}, expected suffix {tool_url_suffix}, mismatched API route)" + ) + + trs_tool_url = cast( + "URIType", trs__meta__url[0 : -len(tool_url_suffix)] + ) + try: + resio = io.BytesIO() + _, metaresio, _ = HTTPFetcher().streamfetch( + trs_tool_url, + resio, + ) + trs__meta = json.loads(resio.getvalue().decode("utf-8")) + + except Exception as e: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {trs_tool_url} (came from {putative_tool_uri}, raised exception)" + ) from e + + trs__meta__id = trs__meta.get("id") + if trs__meta__id is None: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {trs_tool_url} (came from {putative_tool_uri}, not returning id)" + ) + 
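# --- Illustrative aside, not part of the patch: the backtracking just above
# recovers the tool URL by stripping the quoted "/versions/<id>" suffix from
# the version URL the service answered with. A minimal sketch, using one of
# the WorkflowHub URLs already exercised elsewhere in this patch series:
import urllib.parse

version_url = "https://workflowhub.eu/ga4gh/trs/v2/tools/107/versions/1"
version_id = "1"
suffix = "/versions/" + urllib.parse.quote(version_id, safe="")
assert version_url.endswith(suffix)
tool_url = version_url[: -len(suffix)]
# tool_url == "https://workflowhub.eu/ga4gh/trs/v2/tools/107"
# --- end of aside ---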
+ trs__meta__url = trs__meta.get("url") + if trs__meta__url is None: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {trs_tool_url} (came from {putative_tool_uri}, not returning url)" + ) + else: + trs_tool_url = putative_tool_uri + version_id = None + + if "toolclass" in trs__meta: + workflow_id = trs__meta__id + + # Now we need to backtrack in the url to get the workflow id + tool_url_suffix = "/tools/" + urllib.parse.quote(workflow_id, safe="") + if not trs_tool_url.endswith(tool_url_suffix): + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {trs_tool_url} and {trs__meta__url} (expected suffix {tool_url_suffix}, mismatched API route)" + ) + + trs_endpoint = trs_tool_url[0 : -len(tool_url_suffix)] + else: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (no clues)" + ) + + # Next two elifs should *never* happen + elif fail_ok: + return None + else: + raise FetcherException( + f"trs_endpoint could not be guessed from {orig_wf_url} (no clues)" + ) + + # This is needed to guarantee it is always declared + assert putative_tool_uri is not None + + return RemoteRepo( + repo_url=cast("RepoURL", putative_tool_uri), + repo_type=RepoType.TRS, + ) + + @classmethod + def BuildRepoPIDFromTRSParams( + cls, + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + descriptor_type: "Optional[TRS_Workflow_Descriptor]", + ) -> "URIType": + if isinstance(workflow_id, int): + workflow_id_str = str(workflow_id) + else: + workflow_id_str = workflow_id + + # The base URL must end with a slash + if trs_endpoint[-1] != "/": + trs_endpoint += "/" + + # Removing the tools suffix, which appeared in first WfExS iterations + if trs_endpoint.endswith("/" + cls.TRS_TOOLS_SUFFIX): + trs_endpoint = trs_endpoint[0 : -len(cls.TRS_TOOLS_SUFFIX)] + + trs_tools_url = cast( + "URIType", + urllib.parse.urljoin( + trs_endpoint, + cls.TRS_TOOLS_SUFFIX + urllib.parse.quote(workflow_id_str, safe=""), + ), + ) + + if version_id is not None: + trs_tool_url = ( + trs_tools_url + + "/versions/" + + urllib.parse.quote(str(version_id), safe="") + ) + + if descriptor_type is not None: + trs_tool_url += "/" + urllib.parse.quote(descriptor_type, safe="") + else: + trs_tool_url = trs_tools_url + + return cast("URIType", cls.INTERNAL_TRS_SCHEME_PREFIX + ":" + trs_tool_url) def materialize_repo_from_repo( self, From 1bf4373fddde71103c75c1ea4d454e947c8de435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 27 Jan 2025 01:20:10 +0100 Subject: [PATCH 26/60] Fixed GA4GHTRSFetcher build_pid_from_repo, so it generates proper trs links --- wfexs_backend/fetchers/trs_files.py | 95 ++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 15 deletions(-) diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 8d92e914..b63fc8a4 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -136,12 +136,12 @@ def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": return tuple() @classmethod - def GuessRepoParams( + def GuessTRSParams( cls, orig_wf_url: "Union[URIType, parse.ParseResult]", logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, - ) -> "Optional[RemoteRepo]": + ) -> "Optional[Tuple[RepoURL, str, Sequence[str], WorkflowId, Optional[WFVersionId]]]": # Deciding which is the input wf_url: "RepoURL" parsed_wf_url: 
"parse.ParseResult" @@ -158,7 +158,7 @@ def GuessRepoParams( putative_tool_uri: "Optional[URIType]" = None if parsed_wf_url.scheme == cls.TRS_SCHEME_PREFIX: - # Duplication of code borrowed from trs_files.py + # Duplication of code path_steps: "Sequence[str]" = parsed_wf_url.path.split("/") if len(path_steps) < 3 or path_steps[0] != "": if fail_ok: @@ -166,12 +166,15 @@ def GuessRepoParams( raise FetcherException( f"Ill-formed TRS CURIE {wf_url}. It should be in the format of {cls.TRS_SCHEME_PREFIX}://server/id/version or {cls.TRS_SCHEME_PREFIX}://server-plus-prefix-with-slashes/id/version" ) + trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) trs_steps.extend(["ga4gh", "trs", "v2", ""]) + + trs_service_netloc = parsed_wf_url.netloc trs_endpoint = urllib.parse.urlunparse( urllib.parse.ParseResult( scheme="https", - netloc=parsed_wf_url.netloc, + netloc=trs_service_netloc, path="/".join(trs_steps), params="", query="", @@ -185,6 +188,7 @@ def GuessRepoParams( elif parsed_wf_url.scheme == cls.INTERNAL_TRS_SCHEME_PREFIX: putative_tool_uri = cast("URIType", parsed_wf_url.path) parsed_putative_tool_uri = urllib.parse.urlparse(putative_tool_uri) + # Detecting workflowhub derivatives is_wh = parsed_putative_tool_uri.netloc.endswith("workflowhub.eu") # Time to try guessing everything @@ -263,7 +267,7 @@ def GuessRepoParams( trs_tool_url, resio, ) - trs__meta = json.loads(resio.getvalue().decode("utf-8")) + trs_tool_meta = json.loads(resio.getvalue().decode("utf-8")) except Exception as e: if fail_ok: @@ -272,7 +276,7 @@ def GuessRepoParams( f"trs_endpoint could not be guessed from {trs_tool_url} (came from {putative_tool_uri}, raised exception)" ) from e - trs__meta__id = trs__meta.get("id") + trs__meta__id = trs_tool_meta.get("id") if trs__meta__id is None: if fail_ok: return None @@ -280,7 +284,7 @@ def GuessRepoParams( f"trs_endpoint could not be guessed from {trs_tool_url} (came from {putative_tool_uri}, not returning id)" ) - trs__meta__url = trs__meta.get("url") + trs__meta__url = trs_tool_meta.get("url") if trs__meta__url is None: if fail_ok: return None @@ -289,9 +293,12 @@ def GuessRepoParams( ) else: trs_tool_url = putative_tool_uri + trs_tool_meta = None version_id = None - if "toolclass" in trs__meta: + if (version_id is not None and "toolclass" in trs_tool_meta) or ( + version_id is None and "toolclass" in trs__meta + ): workflow_id = trs__meta__id # Now we need to backtrack in the url to get the workflow id @@ -304,6 +311,17 @@ def GuessRepoParams( ) trs_endpoint = trs_tool_url[0 : -len(tool_url_suffix)] + + ga4gh_trs_suffix = "/ga4gh/trs/v2" + # This is here for not so compliant services like yevis + trs_transient_endpoint = ( + trs_endpoint[0 : -len(ga4gh_trs_suffix)] + if trs_endpoint.endswith(ga4gh_trs_suffix) + else trs_endpoint + ) + parsed_trs_endpoint = urllib.parse.urlparse(trs_transient_endpoint) + trs_service_netloc = parsed_trs_endpoint.netloc + trs_steps = parsed_trs_endpoint.path.split("/") else: if fail_ok: return None @@ -322,9 +340,30 @@ def GuessRepoParams( # This is needed to guarantee it is always declared assert putative_tool_uri is not None - return RemoteRepo( - repo_url=cast("RepoURL", putative_tool_uri), - repo_type=RepoType.TRS, + return ( + cast("RepoURL", putative_tool_uri), + trs_service_netloc, + trs_steps, + workflow_id, + version_id, + ) + + @classmethod + def GuessRepoParams( + cls, + orig_wf_url: "Union[URIType, parse.ParseResult]", + logger: "Optional[logging.Logger]" = None, + fail_ok: "bool" = False, + ) -> "Optional[RemoteRepo]": + 
trs_params = cls.GuessTRSParams(orig_wf_url, logger=logger, fail_ok=fail_ok) + + return ( + None + if trs_params is None + else RemoteRepo( + repo_url=trs_params[0], + repo_type=RepoType.TRS, + ) ) @classmethod @@ -893,12 +932,38 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": # TODO: improve this to cover the different cases parsedInputURL = parse.urlparse(remote_repo.repo_url) - if ( - parsedInputURL.scheme - in (self.INTERNAL_TRS_SCHEME_PREFIX, self.TRS_SCHEME_PREFIX) - or remote_repo.repo_type == RepoType.TRS + if parsedInputURL.scheme in ( + self.INTERNAL_TRS_SCHEME_PREFIX, + self.TRS_SCHEME_PREFIX, ): return remote_repo.repo_url + elif remote_repo.repo_type == RepoType.TRS: + guessed_trs_params = self.GuessTRSParams( + parsedInputURL, logger=self.logger, fail_ok=True + ) + if guessed_trs_params is not None: + ( + trs_tool_url, + trs_service_netloc, + trs_steps, + workflow_id, + version_id, + ) = guessed_trs_params + new_steps = [*trs_steps, urllib.parse.quote(str(workflow_id), safe="")] + if version_id is not None: + new_steps.append(urllib.parse.quote(str(version_id), safe="")) + + computed_trs_endpoint = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme=self.TRS_SCHEME_PREFIX, + netloc=trs_service_netloc, + path="/".join(new_steps), + params="", + query="", + fragment="", + ) + ) + return computed_trs_endpoint return None From b6233e68f74814e08909d425518633a54d1c175f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 28 Jan 2025 01:15:54 +0100 Subject: [PATCH 27/60] Fixed several corner cases and issues in preparation for the new upcoming test around materialize_repo_from_repo --- tests/fetchers/test_trs.py | 31 +- wfexs_backend/fetchers/trs_files.py | 582 ++++++++++++++-------------- 2 files changed, 327 insertions(+), 286 deletions(-) diff --git a/tests/fetchers/test_trs.py b/tests/fetchers/test_trs.py index ea05bd78..7e9dfd38 100644 --- a/tests/fetchers/test_trs.py +++ b/tests/fetchers/test_trs.py @@ -19,7 +19,7 @@ import pytest import logging -from pathlib import Path +import pathlib from typing import ( cast, @@ -133,11 +133,11 @@ RemoteRepo( repo_url=cast( "RepoURL", - "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq", + "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/versions/release", ), repo_type=RepoType.TRS, ), - "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq", + "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/release", ), ( "https://workflowhub.eu/ga4gh/trs/v2/tools/", @@ -306,3 +306,28 @@ def test_build_trs_pid_from_repo( output = fetcher.build_pid_from_repo(remote_repo) assert output in (url, repo_pid) + + +@TRS_PARAMS_TESTBED +def test_materialize_repo_from_repo( + tmppath: "pathlib.Path", + trs_endpoint: "str", + workflow_id: "WorkflowId", + version_id: "Optional[WFVersionId]", + descriptor_type: "Optional[TRS_Workflow_Descriptor]", + url: "str", + remote_repo: "Optional[RemoteRepo]", + repo_pid: "Optional[str]", +) -> "None": + if remote_repo is None: + pytest.skip("Skipped test because no remote repo was provided") + else: + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers(), + ) + + fetcher = GA4GHTRSFetcher(scheme_catalog, progs={}) + materialized_repo = fetcher.materialize_repo_from_repo(remote_repo) + logger.warning(materialized_repo) + + 
assert False diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index b63fc8a4..fe7c1835 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -21,6 +21,7 @@ import atexit import copy import hashlib +import inspect import io import json import logging @@ -29,6 +30,7 @@ import shutil import tempfile import urllib.parse +import sys import warnings from typing import ( @@ -38,6 +40,10 @@ from urllib import parse +# This code needs exception groups +if sys.version_info[:2] < (3, 11): + from exceptiongroup import ExceptionGroup + from . import ( AbstractSchemeRepoFetcher, DocumentedProtocolFetcher, @@ -49,6 +55,10 @@ RepoType, ) +from .. import ( + get_WfExS_version_str, +) + from ..common import ( ContentKind, URIWithMetadata, @@ -64,6 +74,10 @@ from .http import HTTPFetcher +from ..scheme_catalog import ( + SchemeCatalog, +) + if TYPE_CHECKING: from typing import ( Any, @@ -97,10 +111,6 @@ WorkflowId, ) - from ..scheme_catalog import ( - SchemeCatalog, - ) - class GA4GHTRSFetcher(AbstractSchemeRepoFetcher): INTERNAL_TRS_SCHEME_PREFIX: "Final[str]" = "wfexs.trs.files" @@ -141,7 +151,18 @@ def GuessTRSParams( orig_wf_url: "Union[URIType, parse.ParseResult]", logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, - ) -> "Optional[Tuple[RepoURL, str, Sequence[str], WorkflowId, Optional[WFVersionId]]]": + scheme_catalog: "Optional[SchemeCatalog]" = None, + ) -> "Optional[Tuple[RepoURL, str, Sequence[str], WorkflowId, WFVersionId, str, Sequence[URIWithMetadata], Optional[Mapping[str, Any]]]]": + if scheme_catalog is None: + scheme_catalog = SchemeCatalog( + scheme_handlers=HTTPFetcher.GetSchemeHandlers() + ) + + if logger is None: + logger = logging.getLogger( + dict(inspect.getmembers(cls))["__module__"] + "::" + cls.__name__ + ) + # Deciding which is the input wf_url: "RepoURL" parsed_wf_url: "parse.ParseResult" @@ -156,7 +177,10 @@ def GuessTRSParams( wf_url = cast("RepoURL", cls.INTERNAL_TRS_SCHEME_PREFIX + ":" + wf_url) parsed_wf_url = parse.urlparse(wf_url) + metadata_array: "MutableSequence[URIWithMetadata]" = [] putative_tool_uri: "Optional[URIType]" = None + descriptor: "Optional[str]" = None + service_info_metadata: "Optional[MutableMapping[str, Any]]" = None if parsed_wf_url.scheme == cls.TRS_SCHEME_PREFIX: # Duplication of code path_steps: "Sequence[str]" = parsed_wf_url.path.split("/") @@ -168,10 +192,10 @@ def GuessTRSParams( ) trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) - trs_steps.extend(["ga4gh", "trs", "v2", ""]) + trs_steps.extend(["ga4gh", "trs", "v2", "service-info"]) trs_service_netloc = parsed_wf_url.netloc - trs_endpoint = urllib.parse.urlunparse( + trs_service_info = urllib.parse.urlunparse( urllib.parse.ParseResult( scheme="https", netloc=trs_service_netloc, @@ -182,33 +206,105 @@ def GuessTRSParams( ) ) + service_info_wfexs_meta = { + "fetched": trs_service_info, + "payload": cast("Optional[Mapping[str, Any]]", None), + } + metadata_array.append(URIWithMetadata(wf_url, service_info_wfexs_meta)) + try: + metaio = io.BytesIO() + _, metametaio, _ = scheme_catalog.streamfetch( + cast("URIType", trs_service_info), metaio + ) + service_info_metadata = json.loads(metaio.getvalue().decode("utf-8")) + service_info_wfexs_meta["payload"] = service_info_metadata + metadata_array.extend(metametaio) + + trs_endpoint = trs_service_info[0 : -len("service-info")] + except Exception as e1: + non_standard_trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) + 
non_standard_trs_steps.extend(["service-info"]) + + non_standard_trs_service_info = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme="https", + netloc=trs_service_netloc, + path="/".join(non_standard_trs_steps), + params="", + query="", + fragment="", + ) + ) + + try: + metaio = io.BytesIO() + _, metametaio, _ = scheme_catalog.streamfetch( + cast("URIType", non_standard_trs_service_info), metaio + ) + service_info_metadata = json.loads( + metaio.getvalue().decode("utf-8") + ) + service_info_wfexs_meta["payload"] = service_info_metadata + metadata_array.extend(metametaio) + trs_endpoint = trs_service_info[0 : -len("service-info")] + except Exception as e2: + if fail_ok: + return None + raise ExceptionGroup( + f"Error fetching or processing TRS service info metadata for {wf_url} (tried both {trs_service_info} and {non_standard_trs_service_info})", + [e1, e2], + ) + + trs_tool_uri = ( + trs_endpoint + + cls.TRS_TOOLS_SUFFIX + + path_steps[-2] + + "/versions/" + + path_steps[-1] + ) workflow_id = urllib.parse.unquote(path_steps[-2]) version_id = urllib.parse.unquote(path_steps[-1]) - putative_tool_uri = cast("URIType", wf_url) + descriptor = None elif parsed_wf_url.scheme == cls.INTERNAL_TRS_SCHEME_PREFIX: - putative_tool_uri = cast("URIType", parsed_wf_url.path) + putative_tool_uri = cast( + "URIType", + parsed_wf_url.path[0:-1] + if parsed_wf_url.path.endswith("/") + else parsed_wf_url.path, + ) + parsed_putative_tool_uri = urllib.parse.urlparse(putative_tool_uri) + trs_service_netloc = parsed_putative_tool_uri.netloc # Detecting workflowhub derivatives is_wh = parsed_putative_tool_uri.netloc.endswith("workflowhub.eu") # Time to try guessing everything + tool_wfexs_meta = { + "fetched": putative_tool_uri, + "payload": None, + } + metadata_array.append(URIWithMetadata(wf_url, tool_wfexs_meta)) try: resio = io.BytesIO() - _, metaresio, _ = HTTPFetcher().streamfetch( + _, metaresio, _ = scheme_catalog.streamfetch( putative_tool_uri, resio, - secContext={ + sec_context={ "headers": { "Accept": "application/json", + # Added to avoid Cloudflare anti-bot policy + "User-Agent": get_WfExS_version_str(), }, }, ) trs__meta = json.loads(resio.getvalue().decode("utf-8")) + tool_wfexs_meta["payload"] = trs__meta + metadata_array.extend(metaresio) except Exception as e: if fail_ok: return None raise FetcherException( - f"trs_endpoint could not be guessed from {putative_tool_uri} (raised exception)" + f"trs_endpoint could not be guessed from {putative_tool_uri} (raised exception {e})" ) from e if not isinstance(trs__meta, dict): @@ -218,117 +314,100 @@ def GuessTRSParams( f"trs_endpoint could not be guessed from {putative_tool_uri} (not returning JSON object)" ) - trs__meta__id: "Optional[str]" = trs__meta.get("id") - if trs__meta__id is None: - if fail_ok: - return None - raise FetcherException( - f"trs_endpoint could not be guessed from {putative_tool_uri} (not returning id)" - ) - - trs__meta__url: "Optional[str]" = trs__meta.get("url") - if trs__meta__url is None: - if fail_ok: - return None - raise FetcherException( - f"trs_endpoint could not be guessed from {putative_tool_uri} (not returning url)" - ) - - # Non compliant emitted trs__meta__url - if is_wh: - trs__meta__url = putative_tool_uri - - if "descriptor_type" in trs__meta: - version_id = trs__meta__id - # Now we need to backtrack in the url to get the workflow id - tool_url_suffix = "/versions/" + urllib.parse.quote(version_id, safe="") - - # If this happens, this implementation is not so compliant with standard - 
dockstore_tool_url_suffix = "/versions/" + urllib.parse.quote( - trs__meta.get("name", ""), safe="" - ) - if trs__meta__url.endswith(dockstore_tool_url_suffix): - tool_url_suffix = dockstore_tool_url_suffix - version_id = trs__meta.get("name", "") - - if not trs__meta__url.endswith(tool_url_suffix): + # Is this the "abstract" tool definition? + versions = trs__meta.get("versions") + if isinstance(versions, list) and "toolclass" in trs__meta: + if len(versions) == 0: if fail_ok: return None raise FetcherException( - f"trs_endpoint could not be guessed from {putative_tool_uri} and {trs__meta__url} (version {version_id}, expected suffix {tool_url_suffix}, mismatched API route)" + f"No versions found associated to TRS tool reachable through {putative_tool_uri}" ) - - trs_tool_url = cast( - "URIType", trs__meta__url[0 : -len(tool_url_suffix)] - ) - try: - resio = io.BytesIO() - _, metaresio, _ = HTTPFetcher().streamfetch( - trs_tool_url, - resio, + # Reuse the last version + trs_tool_meta = versions[-1] + trs_endpoint = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme=parsed_putative_tool_uri.scheme, + netloc=parsed_putative_tool_uri.netloc, + path="/".join(parsed_putative_tool_uri.path.split("/")[0:-2]) + + "/", + params="", + query="", + fragment="", ) - trs_tool_meta = json.loads(resio.getvalue().decode("utf-8")) - - except Exception as e: - if fail_ok: - return None - raise FetcherException( - f"trs_endpoint could not be guessed from {trs_tool_url} (came from {putative_tool_uri}, raised exception)" - ) from e - - trs__meta__id = trs_tool_meta.get("id") - if trs__meta__id is None: - if fail_ok: - return None - raise FetcherException( - f"trs_endpoint could not be guessed from {trs_tool_url} (came from {putative_tool_uri}, not returning id)" + ) + workflow_id = urllib.parse.unquote( + parsed_putative_tool_uri.path.split("/")[-1] + ) + trs_tool_prefix = putative_tool_uri + version_id = trs_tool_meta.get("id") + name = trs_tool_meta.get("name") + if version_id is not None: + # Dockstore misbehaves + if ( + name is not None + and version_id.endswith(name) + and parsed_putative_tool_uri.netloc.endswith("dockstore.org") + ): + version_id = name + trs_tool_uri = ( + trs_tool_prefix + + "/versions/" + + urllib.parse.quote(version_id, safe="") ) - - trs__meta__url = trs_tool_meta.get("url") - if trs__meta__url is None: - if fail_ok: - return None + elif fail_ok: + return None + else: raise FetcherException( - f"trs_endpoint could not be guessed from {trs_tool_url} (came from {putative_tool_uri}, not returning url)" + f"No version id found associated to specific version of TRS tool reachable through {putative_tool_uri}" ) - else: - trs_tool_url = putative_tool_uri - trs_tool_meta = None - version_id = None - - if (version_id is not None and "toolclass" in trs_tool_meta) or ( - version_id is None and "toolclass" in trs__meta - ): - workflow_id = trs__meta__id - - # Now we need to backtrack in the url to get the workflow id - tool_url_suffix = "/tools/" + urllib.parse.quote(workflow_id, safe="") - if not trs_tool_url.endswith(tool_url_suffix): - if fail_ok: - return None - raise FetcherException( - f"trs_endpoint could not be guessed from {trs_tool_url} and {trs__meta__url} (expected suffix {tool_url_suffix}, mismatched API route)" + # ... or a concrete one? 
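# --- Illustrative aside, not part of the patch: the dispatch around this
# point keys on the shape of the JSON object the endpoint answers with. An
# "abstract" tool carries "toolclass" plus a "versions" list, while a
# concrete tool version carries "descriptor_type". The payloads below are
# hypothetical, trimmed to the discriminating keys:
def classify_trs_payload(meta: dict) -> str:
    if isinstance(meta.get("versions"), list) and "toolclass" in meta:
        return "tool"  # the abstract tool, listing its versions
    if "descriptor_type" in meta:
        return "tool-version"  # a concrete version of a tool
    return "unknown"

assert classify_trs_payload({"toolclass": {}, "versions": [{"id": "1"}]}) == "tool"
assert classify_trs_payload({"id": "1", "descriptor_type": ["CWL"]}) == "tool-version"
# --- end of aside ---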
+ elif "descriptor_type" in trs__meta: + trs_tool_meta = trs__meta + trs_endpoint = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme=parsed_putative_tool_uri.scheme, + netloc=parsed_putative_tool_uri.netloc, + path="/".join(parsed_putative_tool_uri.path.split("/")[0:-4]) + + "/", + params="", + query="", + fragment="", ) - - trs_endpoint = trs_tool_url[0 : -len(tool_url_suffix)] - - ga4gh_trs_suffix = "/ga4gh/trs/v2" - # This is here for not so compliant services like yevis - trs_transient_endpoint = ( - trs_endpoint[0 : -len(ga4gh_trs_suffix)] - if trs_endpoint.endswith(ga4gh_trs_suffix) - else trs_endpoint ) - parsed_trs_endpoint = urllib.parse.urlparse(trs_transient_endpoint) - trs_service_netloc = parsed_trs_endpoint.netloc - trs_steps = parsed_trs_endpoint.path.split("/") + trs_tool_prefix = cast( + "URIType", + urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme=parsed_putative_tool_uri.scheme, + netloc=parsed_putative_tool_uri.netloc, + path="/".join( + parsed_putative_tool_uri.path.split("/")[0:-2] + ) + + "/", + params="", + query="", + fragment="", + ) + ), + ) + workflow_id = urllib.parse.unquote( + parsed_putative_tool_uri.path.split("/")[-3] + ) + version_id = urllib.parse.unquote( + parsed_putative_tool_uri.path.split("/")[-1] + ) + trs_tool_uri = putative_tool_uri + elif fail_ok: + return None else: - if fail_ok: - return None raise FetcherException( - f"trs_endpoint could not be guessed from {putative_tool_uri} (no clues)" + f"trs_endpoint at {putative_tool_uri} is not answering what it is expected" ) + parsed_trs_endpoint = urllib.parse.urlparse(trs_endpoint) + trs_steps = parsed_trs_endpoint.path[0:-1].split("/") + # Next two elifs should *never* happen elif fail_ok: return None @@ -338,14 +417,36 @@ def GuessTRSParams( ) # This is needed to guarantee it is always declared - assert putative_tool_uri is not None + assert trs_tool_uri is not None + assert trs_tool_meta is not None + + if not isinstance(trs_tool_meta.get("descriptor_type"), list): + raise FetcherException( + f"Unable to obtain descriptor_type from tool descriptor obtained from {putative_tool_uri}" + ) + + descriptor_types = trs_tool_meta["descriptor_type"] + if len(descriptor_types) == 0: + raise FetcherException( + f"Empty list of descriptor_type from tool descriptor obtained from {putative_tool_uri}" + ) + + descriptor = descriptor_types[0] + assert descriptor is not None + if len(descriptor_types) > 1: + logger.warning( + f"Found {len(descriptor_types)} descriptor types for tool {putative_tool_uri}, using first ({descriptor})" + ) return ( - cast("RepoURL", putative_tool_uri), + cast("RepoURL", trs_tool_uri), trs_service_netloc, trs_steps, workflow_id, version_id, + descriptor, + metadata_array, + service_info_metadata, ) @classmethod @@ -416,178 +517,41 @@ def materialize_repo_from_repo( base_repo_destdir: "Optional[PathLikePath]" = None, doUpdate: "Optional[bool]" = True, ) -> "MaterializedRepo": + if repo.repo_type != RepoType.TRS: + raise FetcherException( + f"Remote repository {repo} is not of type TRS. 
Unable to fulfil request" + ) remote_file = repo.repo_url repoTag = repo.tag - parsedInputURL = parse.urlparse(remote_file) - path_steps: "Sequence[str]" = parsedInputURL.path.split("/") - embedded_remote_file = parsedInputURL.path - - metadata_array: "MutableSequence[URIWithMetadata]" = [] - if parsedInputURL.scheme == self.INTERNAL_TRS_SCHEME_PREFIX: - # TODO: Improve this code - if not embedded_remote_file.endswith(self.TRS_FILES_SUFFIX): - files_metadata_url = cast( - "URIType", embedded_remote_file + self.TRS_FILES_SUFFIX - ) - descriptor_base_url = embedded_remote_file + self.TRS_DESCRIPTOR_INFIX - else: - files_metadata_url = cast("URIType", embedded_remote_file) - descriptor_base_url = ( - embedded_remote_file[0 : -len(self.TRS_FILES_SUFFIX)] - + self.TRS_DESCRIPTOR_INFIX - ) - # TODO: fetch here service info metadata - elif parsedInputURL.scheme == self.TRS_SCHEME_PREFIX: - # TRS official scheme - if len(path_steps) < 3 or path_steps[0] != "": - raise FetcherException( - f"Ill-formed TRS CURIE {remote_file}. It should be in the format of {self.TRS_SCHEME_PREFIX}://id/version or {self.TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" - ) - - trs_base_steps = cast("MutableSequence[str]", path_steps[0:-2]) - trs_base_steps.extend(["ga4gh", "trs", "v2"]) - - # Performing some sanity checks about the API - service_info_steps = copy.copy(trs_base_steps) - service_info_steps.append("service-info") - service_info_metadata_url = cast( - "URIType", - parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(service_info_steps), - params="", - query="", - fragment="", - ) - ), - ) - service_info_wfexs_meta = { - "fetched": service_info_metadata_url, - "payload": None, - } - metadata_array.append(URIWithMetadata(remote_file, service_info_wfexs_meta)) - try: - metaio = io.BytesIO() - _, metametaio, _ = self.scheme_catalog.streamfetch( - service_info_metadata_url, metaio - ) - service_info_metadata = json.loads(metaio.getvalue().decode("utf-8")) - service_info_wfexs_meta["payload"] = service_info_metadata - metadata_array.extend(metametaio) - except FetcherException as fe: - raise FetcherException( - f"Error fetching or processing TRS service info metadata for {remote_file} : {fe.code} {fe.reason}" - ) from fe - - trs_version_str: "Optional[str]" = None - trs_artifact: "Optional[str]" = None - trs_group: "Optional[str]" = None - trs_endpoint_meta_type: "Optional[Mapping[str, str]]" = ( - service_info_metadata.get("type") - ) - if trs_endpoint_meta_type is not None: - trs_version_str = trs_endpoint_meta_type.get("version") - trs_artifact = trs_endpoint_meta_type.get("artifact") - trs_group = trs_endpoint_meta_type.get("group") - - if trs_version_str is None: - errstr = f"Unable to identify TRS version from {service_info_metadata_url}. Is this a TRS endpoint?" 
- raise FetcherException(errstr) - - # Avoiding querying a GA4GH DRS service, for instance - if trs_artifact is not None and trs_artifact.lower() not in ( - "trs", - "yevis", - ): - errstr = f"Unsupported GA4GH service {trs_artifact} (group {trs_group}) from {service_info_metadata_url}" - raise FetcherException(errstr) - - # Warning about potentially unsupported versions - trs_version_tuple = tuple(map(int, trs_version_str.split("."))) - if trs_version_tuple < (2, 0, 1): - self.logger.warning( - f"{service_info_metadata_url} is offering old TRS version {trs_version_str}, which diverges from what this implementation supports" - ) - elif trs_version_tuple > (3, 0): - self.logger.warning( - f"{service_info_metadata_url} is offering TRS version {trs_version_str}, which might diverge from what this implementation supports" - ) - - version_steps = copy.copy(trs_base_steps) - version_steps.extend(["tools", path_steps[-2], "versions", path_steps[-1]]) - version_metadata_url = cast( - "URIType", - parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(version_steps), - params="", - query="", - fragment="", - ) - ), - ) - version_meta = { - "fetched": version_metadata_url, - "payload": None, - } - metadata_array.append(URIWithMetadata(remote_file, version_meta)) - try: - metaio = io.BytesIO() - _, metametaio, _ = self.scheme_catalog.streamfetch( - version_metadata_url, metaio - ) - version_metadata = json.loads(metaio.getvalue().decode("utf-8")) - version_meta["payload"] = version_metadata - metadata_array.extend(metametaio) - - except FetcherException as fe: - raise FetcherException( - f"Error fetching or processing TRS version metadata for {remote_file} : {fe.code} {fe.reason}" - ) from fe - - # At last, we can finish building the URL - new_path_steps = [ - *version_steps, - version_metadata["descriptor_type"][0], - "files", - ] - - files_metadata_url = cast( - "URIType", - parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(new_path_steps), - params="", - query="", - fragment="", - ) - ), - ) + guessed_trs_params = self.GuessTRSParams( + remote_file, logger=self.logger, scheme_catalog=self.scheme_catalog + ) + if guessed_trs_params is None: + raise FetcherException(f"Unable to guess TRS params from {repo}") - descriptor_steps = [ - *version_steps, - version_metadata["descriptor_type"][0], - "descriptor", - ] - descriptor_base_url = parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=parsedInputURL.netloc, - path="/".join(descriptor_steps) + "/", - params="", - query="", - fragment="", - ) - ) - else: - raise FetcherException(f"FIXME: Unhandled scheme {parsedInputURL.scheme}") + ( + trs_tool_url, + trs_service_netloc, + trs_steps, + workflow_id, + version_id, + descriptor, + guessed_metadata_array, + service_info_metadata, + ) = guessed_trs_params + files_metadata_url = ( + trs_tool_url + + "/" + + urllib.parse.quote(descriptor, safe="") + + self.TRS_FILES_SUFFIX + ) + descriptor_base_url = ( + trs_tool_url + + "/" + + urllib.parse.quote(descriptor, safe="") + + self.TRS_DESCRIPTOR_INFIX + ) # Assure directory exists before next step if repo_tag_destdir is None: @@ -629,19 +593,22 @@ def materialize_repo_from_repo( "workflow_entrypoint": None, "remote_workflow_entrypoint": None, } - metadata_array = [URIWithMetadata(remote_file, topMeta)] + metadata_array = [ + *guessed_metadata_array, + URIWithMetadata(remote_file, topMeta), + ] try: metaio = io.BytesIO() _, metametaio, _ = 
self.scheme_catalog.streamfetch(
-                files_metadata_url, metaio
+                cast("URIType", files_metadata_url), metaio
             )
             metadata = json.loads(metaio.getvalue().decode("utf-8"))
             topMeta["payload"] = metadata
             metadata_array.extend(metametaio)
         except FetcherException as fe:
             raise FetcherException(
-                "Error fetching or processing TRS files metadata for {} : {} {}".format(
-                    remote_file, fe.code, fe.reason
+                "Error fetching or processing TRS files metadata for {} : {} {} (offending url {})".format(
+                    remote_file, fe.code, fe.reason, files_metadata_url
                 )
             ) from fe

@@ -891,6 +858,42 @@ def materialize_repo_from_repo(

         upstream_repo: "Optional[RemoteRepo]" = None
         recommends_upstream: "bool" = False
+
+        if service_info_metadata is None:
+            parsed_trs_tool_url = urllib.parse.urlparse(trs_tool_url)
+            trs_service_info = urllib.parse.urlunparse(
+                urllib.parse.ParseResult(
+                    scheme=parsed_trs_tool_url.scheme,
+                    netloc=parsed_trs_tool_url.netloc,
+                    path="/".join(parsed_trs_tool_url.path.split("/")[0:-4])
+                    + "/service-info",
+                    params="",
+                    query="",
+                    fragment="",
+                )
+            )
+
+            service_info_wfexs_meta = {
+                "fetched": trs_service_info,
+                "payload": cast("Optional[Mapping[str, Any]]", None),
+            }
+            metadata_array.append(
+                URIWithMetadata(trs_tool_url, service_info_wfexs_meta)
+            )
+            try:
+                metaio = io.BytesIO()
+                _, metametaio, _ = self.scheme_catalog.streamfetch(
+                    cast("URIType", trs_service_info), metaio
+                )
+                service_info_metadata = json.loads(metaio.getvalue().decode("utf-8"))
+                service_info_wfexs_meta["payload"] = service_info_metadata
+                metadata_array.extend(metametaio)
+
+            except Exception as e:
+                raise FetcherException(
+                    f"Unable to fetch service info metadata {trs_service_info} (affects tool {trs_tool_url})"
+                ) from e
+
         # Checking whether it is WorkflowHub
         # to recommend the generated Workflow RO-Crate
         if service_info_metadata.get("organization", {}).get("name") == "WorkflowHub":
@@ -948,7 +951,19 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]":
                     trs_steps,
                     workflow_id,
                     version_id,
+                    descriptor,
+                    guessed_metadata_array,
+                    service_info_metadata,
                 ) = guessed_trs_params
+
+                # Remove /ga4gh/trs/v2 from the end
+                if (
+                    len(trs_steps) >= 3
+                    and trs_steps[-1] == "v2"
+                    and trs_steps[-2] == "trs"
+                    and trs_steps[-3] == "ga4gh"
+                ):
+                    trs_steps = trs_steps[0:-3]
                 new_steps = [*trs_steps, urllib.parse.quote(str(workflow_id), safe="")]
                 if version_id is not None:
                     new_steps.append(urllib.parse.quote(str(version_id), safe=""))
@@ -963,6 +978,7 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]":
                         fragment="",
                     )
                 )
+                self.logger.error(f"Y FUE {computed_trs_endpoint} {parsedInputURL}")
                 return computed_trs_endpoint

         return None

From 2aa8587f6d804765b10115072431285c4443a9e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Tue, 28 Jan 2025 16:58:02 +0100
Subject: [PATCH 28/60] Added an additional unit test which has helped catch
 some corner cases in the GA4GH TRS implementation.
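One of those corner cases is the common URL prefix computed over the
fetched files: `os.path.commonpath` applied to a single entry returns
that entry itself, filename included, and it also collapses the double
slash after the scheme. A minimal sketch of the pitfall on a POSIX
system, with a made-up URL (illustrative only, not part of the patch):

    import os.path

    urls = ["https://example.org/repo/workflows/main.cwl"]
    print(os.path.commonpath(urls))  # https:/example.org/repo/workflows/main.cwl
    print(os.path.dirname(urls[0]))  # https://example.org/repo/workflows

Hence the diff below special-cases the single-file situation through
os.path.dirname and reinstates the slash swallowed after the scheme colon.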
--- tests/fetchers/test_trs.py | 65 +++++++++++++++++++++++++++-- wfexs_backend/fetchers/trs_files.py | 22 ++++++---- 2 files changed, 76 insertions(+), 11 deletions(-) diff --git a/tests/fetchers/test_trs.py b/tests/fetchers/test_trs.py index 7e9dfd38..fe06137a 100644 --- a/tests/fetchers/test_trs.py +++ b/tests/fetchers/test_trs.py @@ -70,6 +70,7 @@ "url", "remote_repo", "repo_pid", + "upstream_repo", ], [ ( @@ -94,6 +95,12 @@ repo_type=RepoType.TRS, ), "trs://dockstore.org/api/%23workflow%2Fgithub.com%2Fsevenbridges-openworkflows%2FBroad-Best-Practice-Somatic-CNV-Workflows%2FGATK-Somatic-CNV-Panel-Workflow/master", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://raw.githubusercontent.com/sevenbridges-openworkflows/Broad-Best-Practice-Somatic-CNV-Workflows/master/BroadCNVPanelWorkflow/gatk-cnv-panel-workflow_decomposed.cwl", + ), + ), ), ( "https://dockstore.org/api/ga4gh/trs/v2/", @@ -116,6 +123,12 @@ repo_type=RepoType.TRS, ), "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/master", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://raw.githubusercontent.com/NCI-GDC/gdc-dnaseq-cwl/master/workflows/dnaseq/transform.cwl", + ), + ), ), ( "https://dockstore.org/api/ga4gh/trs/v2/", @@ -138,6 +151,12 @@ repo_type=RepoType.TRS, ), "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/release", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://raw.githubusercontent.com/NCI-GDC/gdc-dnaseq-cwl/release/workflows/dnaseq/transform.cwl", + ), + ), ), ( "https://workflowhub.eu/ga4gh/trs/v2/tools/", @@ -158,6 +177,13 @@ repo_type=RepoType.TRS, ), "trs://workflowhub.eu/107/1", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/107/versions/1/CWL/files?format=zip", + ), + repo_type=RepoType.Raw, + ), ), ( "https://workflowhub.eu/ga4gh/trs/v2/tools/", @@ -178,6 +204,13 @@ repo_type=RepoType.TRS, ), "trs://workflowhub.eu/106/3", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/106/versions/3/NFL/files?format=zip", + ), + repo_type=RepoType.Raw, + ), ), ( "https://workflowhub.eu/ga4gh/trs/v2/", @@ -198,6 +231,13 @@ repo_type=RepoType.TRS, ), "trs://workflowhub.eu/119/1", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/119/versions/1/NFL/files?format=zip", + ), + repo_type=RepoType.Raw, + ), ), ( "https://workflowhub.eu/ga4gh/trs/v2/tools/", @@ -218,6 +258,13 @@ repo_type=RepoType.TRS, ), "trs://workflowhub.eu/244/4", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://workflowhub.eu/ga4gh/trs/v2/tools/244/versions/4/NFL/files?format=zip", + ), + repo_type=RepoType.Raw, + ), ), ( "https://ddbj.github.io/workflow-registry/", @@ -238,6 +285,12 @@ repo_type=RepoType.TRS, ), "trs://ddbj.github.io/workflow-registry/0d2ae4c2-fe4c-48f7-811a-ac277776533e/1.0.0", + RemoteRepo( + repo_url=cast( + "RepoURL", + "https://zenodo.org/api/files/2422dda0-1bd9-4109-aa44-53d55fd934de/download-sra.cwl", + ), + ), ), ], ) @@ -252,6 +305,7 @@ def test_guess_trs_repo_params( url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]", + upstream_repo: "Optional[RemoteRepo]", ) -> "None": output = GA4GHTRSFetcher.GuessRepoParams(cast("URIType", url), logger=logger) @@ -274,6 +328,7 @@ def test_build_trs_internal_url_from_repo( url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]", + upstream_repo: "Optional[RemoteRepo]", ) -> "None": output = 
GA4GHTRSFetcher.BuildRepoPIDFromTRSParams( trs_endpoint, @@ -294,6 +349,7 @@ def test_build_trs_pid_from_repo( url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]", + upstream_repo: "Optional[RemoteRepo]", ) -> "None": if remote_repo is None: pytest.skip("Skipped test because no remote repo was provided") @@ -318,6 +374,7 @@ def test_materialize_repo_from_repo( url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]", + upstream_repo: "Optional[RemoteRepo]", ) -> "None": if remote_repo is None: pytest.skip("Skipped test because no remote repo was provided") @@ -327,7 +384,9 @@ def test_materialize_repo_from_repo( ) fetcher = GA4GHTRSFetcher(scheme_catalog, progs={}) - materialized_repo = fetcher.materialize_repo_from_repo(remote_repo) - logger.warning(materialized_repo) + materialized_repo = fetcher.materialize_repo_from_repo( + remote_repo, base_repo_destdir=tmppath + ) - assert False + # Let's check the guessed repo' + assert materialized_repo.upstream_repo == upstream_repo diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index fe7c1835..6f7e93f0 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -517,7 +517,7 @@ def materialize_repo_from_repo( base_repo_destdir: "Optional[PathLikePath]" = None, doUpdate: "Optional[bool]" = True, ) -> "MaterializedRepo": - if repo.repo_type != RepoType.TRS: + if repo.repo_type not in (RepoType.TRS, None): raise FetcherException( f"Remote repository {repo} is not of type TRS. Unable to fulfil request" ) @@ -675,14 +675,12 @@ def materialize_repo_from_repo( if is_anon: prefix_url = "" + elif len(file_rel_2_url) == 1: + # FIXME?: this is not going to work in Windows + prefix_url = os.path.dirname(tuple(file_rel_2_url.values())[0]) else: prefix_url = os.path.commonpath(tuple(file_rel_2_url.values())) - # We have to create anonymous directories to avoid leaving the download "sandbox" - abs_download_dir = repo_tag_destpath - if "/" in prefix_url: - # This is needed to perform an effective work - prefix_url += "/" # Due the peversion of commonpath, double slashes are collapsed colon_pos = prefix_url.find(":") if colon_pos > 0: @@ -690,6 +688,13 @@ def materialize_repo_from_repo( prefix_url[0 : colon_pos + 1] + "/" + prefix_url[colon_pos + 1 :] ) + # We have to create anonymous directories to avoid leaving the download "sandbox" + abs_download_dir = repo_tag_destpath + if "/" in prefix_url: + # This is needed to perform an effective work + if not prefix_url.endswith("/"): + prefix_url += "/" + # Computing resolved relative paths for file_desc in metadata: file_rel_path = file_desc.get("path") @@ -701,6 +706,7 @@ def materialize_repo_from_repo( if is_abs_url: # An absolute URL, like in the case of DDBJ TRS implementation file_url = cast("URIType", file_rel_path) + self.logger.warning(file_rel_2_url) else: file_url = cast( "URIType", @@ -935,7 +941,7 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": # TODO: improve this to cover the different cases parsedInputURL = parse.urlparse(remote_repo.repo_url) - if parsedInputURL.scheme in ( + if remote_repo.repo_type is None and parsedInputURL.scheme in ( self.INTERNAL_TRS_SCHEME_PREFIX, self.TRS_SCHEME_PREFIX, ): @@ -978,7 +984,7 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": fragment="", ) ) - self.logger.error(f"Y FUE {computed_trs_endpoint} {parsedInputURL}") + return computed_trs_endpoint return None From 
8119c376720e45c3c9128624cfbb21a152c3ad16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Tue, 28 Jan 2025 17:39:56 +0100
Subject: [PATCH 29/60] Enforcing the conditions to admit a RemoteRepo
 instance for git and swh repos

---
 wfexs_backend/fetchers/git.py |  6 ++++++
 wfexs_backend/fetchers/swh.py | 14 +++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py
index 552838d9..7828969f 100644
--- a/wfexs_backend/fetchers/git.py
+++ b/wfexs_backend/fetchers/git.py
@@ -516,6 +516,8 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]":
         repo_url content or the repo type in the worst case, it should
         return None
         """
+        if remote_repo.repo_type not in (RepoType.Git, None):
+            return None
         parsed_wf_url = parse.urlparse(remote_repo.repo_url)

         retval: "Optional[str]" = None
@@ -678,6 +680,10 @@ def materialize_repo_from_repo(
         :param doUpdate:
         :return:
         """
+        if repo.repo_type not in (RepoType.Git, None):
+            raise FetcherException(
+                f"Input RemoteRepo instance is not recognized as a fetchable URI (type {repo.repo_type})"
+            )
         repoURL = repo.repo_url
         repoTag = repo.tag

diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py
index fb2485b5..e3fbeeb4 100644
--- a/wfexs_backend/fetchers/swh.py
+++ b/wfexs_backend/fetchers/swh.py
@@ -240,7 +240,10 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]":
         """
         parsed_wf_url = parse.urlparse(remote_repo.repo_url)

-        if parsed_wf_url.scheme not in self.GetSchemeHandlers():
+        if (
+            parsed_wf_url.scheme not in self.GetSchemeHandlers()
+            or remote_repo.repo_type not in (RepoType.SoftwareHeritage, None)
+        ):
             return None

         # FIXME: improve this
@@ -256,6 +259,15 @@ def materialize_repo_from_repo(
         repoURL = cast("RepoURL", repo.tag) if repo.tag is not None else repo.repo_url
         repoTag = repo.tag

+        parsed_wf_url = parse.urlparse(repoURL)
+        if (
+            parsed_wf_url.scheme not in self.GetSchemeHandlers()
+            or repo.repo_type not in (RepoType.SoftwareHeritage, None)
+        ):
+            raise FetcherException(
+                f"Input RemoteRepo instance is not recognized as a fetchable URI (repo {repoURL}, type {repo.repo_type})"
+            )
+
         # If we are here is because the repo is valid
         # as it should have been checked by GuessRepoParams

From 7ddcff72ef94f28c6f39aa1aa08028515b95552f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Wed, 29 Jan 2025 18:49:00 +0100
Subject: [PATCH 30/60] Values in tag field from RemoteRepo instances are both
 used and filled in on creation

---
 tests/fetchers/test_trs.py          |   8 +++
 wfexs_backend/fetchers/trs_files.py | 103 +++++++++++++++++++++++++---
 2 files changed, 102 insertions(+), 9 deletions(-)

diff --git a/tests/fetchers/test_trs.py b/tests/fetchers/test_trs.py
index fe06137a..d18f4122 100644
--- a/tests/fetchers/test_trs.py
+++ b/tests/fetchers/test_trs.py
@@ -92,6 +92,7 @@
                     "RepoURL",
                     "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2Fsevenbridges-openworkflows%2FBroad-Best-Practice-Somatic-CNV-Workflows%2FGATK-Somatic-CNV-Panel-Workflow/versions/master",
                 ),
+
tag=cast("RepoTag", "master"), repo_type=RepoType.TRS, ), "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/master", @@ -148,6 +150,7 @@ "RepoURL", "https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/versions/release", ), + tag=cast("RepoTag", "release"), repo_type=RepoType.TRS, ), "trs://dockstore.org/api/%23workflow%2Fgithub.com%2FNCI-GDC%2Fgdc-dnaseq-cwl%2FGDC_DNASeq/release", @@ -174,6 +177,7 @@ "RepoURL", "https://workflowhub.eu/ga4gh/trs/v2/tools/107/versions/1", ), + tag=cast("RepoTag", "1"), repo_type=RepoType.TRS, ), "trs://workflowhub.eu/107/1", @@ -201,6 +205,7 @@ "RepoURL", "https://workflowhub.eu/ga4gh/trs/v2/tools/106/versions/3", ), + tag=cast("RepoTag", "3"), repo_type=RepoType.TRS, ), "trs://workflowhub.eu/106/3", @@ -228,6 +233,7 @@ "RepoURL", "https://workflowhub.eu/ga4gh/trs/v2/tools/119/versions/1", ), + tag=cast("RepoTag", "1"), repo_type=RepoType.TRS, ), "trs://workflowhub.eu/119/1", @@ -255,6 +261,7 @@ "RepoURL", "https://workflowhub.eu/ga4gh/trs/v2/tools/244/versions/4", ), + tag=cast("RepoTag", "4"), repo_type=RepoType.TRS, ), "trs://workflowhub.eu/244/4", @@ -282,6 +289,7 @@ "RepoURL", "https://ddbj.github.io/workflow-registry/tools/0d2ae4c2-fe4c-48f7-811a-ac277776533e/versions/1.0.0", ), + tag=cast("RepoTag", "1.0.0"), repo_type=RepoType.TRS, ), "trs://ddbj.github.io/workflow-registry/0d2ae4c2-fe4c-48f7-811a-ac277776533e/1.0.0", diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 6f7e93f0..a963fa8b 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -99,6 +99,7 @@ PathLikePath, ProgsMapping, RelPath, + RepoTag, RepoURL, SecurityContextConfig, SymbolicName, @@ -149,6 +150,7 @@ def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": def GuessTRSParams( cls, orig_wf_url: "Union[URIType, parse.ParseResult]", + override_version_id: "Optional[WFVersionId]" = None, logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, scheme_catalog: "Optional[SchemeCatalog]" = None, @@ -255,15 +257,19 @@ def GuessTRSParams( [e1, e2], ) + version_id = ( + urllib.parse.unquote(path_steps[-1]) + if not override_version_id + else override_version_id + ) trs_tool_uri = ( trs_endpoint + cls.TRS_TOOLS_SUFFIX + path_steps[-2] + "/versions/" - + path_steps[-1] + + urllib.parse.quote(cast("str", version_id), safe="") ) workflow_id = urllib.parse.unquote(path_steps[-2]) - version_id = urllib.parse.unquote(path_steps[-1]) descriptor = None elif parsed_wf_url.scheme == cls.INTERNAL_TRS_SCHEME_PREFIX: putative_tool_uri = cast( @@ -323,8 +329,35 @@ def GuessTRSParams( raise FetcherException( f"No versions found associated to TRS tool reachable through {putative_tool_uri}" ) - # Reuse the last version - trs_tool_meta = versions[-1] + + if override_version_id: + for putative_trs_tool_meta in versions: + version_id = putative_trs_tool_meta.get("id") + name = putative_trs_tool_meta.get("name") + if version_id is not None: + # Dockstore misbehaves + if ( + name is not None + and str(version_id).endswith(name) + and parsed_putative_tool_uri.netloc.endswith( + "dockstore.org" + ) + ): + version_id = name + if version_id == override_version_id: + trs_tool_meta = putative_trs_tool_meta + break + else: + if fail_ok: + return None + raise FetcherException( + f"Forced version {override_version_id} not found associated to TRS tool reachable through {putative_tool_uri}" + ) + + else: + # Reuse the last version + 
trs_tool_meta = versions[-1] + trs_endpoint = urllib.parse.urlunparse( urllib.parse.ParseResult( scheme=parsed_putative_tool_uri.scheme, @@ -346,14 +379,14 @@ def GuessTRSParams( # Dockstore misbehaves if ( name is not None - and version_id.endswith(name) + and str(version_id).endswith(name) and parsed_putative_tool_uri.netloc.endswith("dockstore.org") ): version_id = name trs_tool_uri = ( trs_tool_prefix + "/versions/" - + urllib.parse.quote(version_id, safe="") + + urllib.parse.quote(str(version_id), safe="") ) elif fail_ok: return None @@ -363,6 +396,50 @@ def GuessTRSParams( ) # ... or a concrete one? elif "descriptor_type" in trs__meta: + if override_version_id: + rpslash = putative_tool_uri.rfind("/") + putative_tool_uri = cast( + "URIType", + putative_tool_uri[0 : rpslash + 1] + + urllib.parse.quote(str(override_version_id), safe=""), + ) + parsed_putative_tool_uri = urllib.parse.urlparse(putative_tool_uri) + # Time to try guessing everything + tool_wfexs_meta = { + "fetched": putative_tool_uri, + "payload": None, + } + metadata_array.append(URIWithMetadata(wf_url, tool_wfexs_meta)) + try: + resio = io.BytesIO() + _, metaresio, _ = scheme_catalog.streamfetch( + putative_tool_uri, + resio, + sec_context={ + "headers": { + "Accept": "application/json", + # Added to avoid Cloudflare anti-bot policy + "User-Agent": get_WfExS_version_str(), + }, + }, + ) + trs__meta = json.loads(resio.getvalue().decode("utf-8")) + tool_wfexs_meta["payload"] = trs__meta + metadata_array.extend(metaresio) + except Exception as e: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (forced version {override_version_id}, raised exception {e})" + ) from e + + if "descriptor_type" not in trs__meta: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint at {putative_tool_uri} (forced version {override_version_id}) is not answering what it is expected" + ) + trs_tool_meta = trs__meta trs_endpoint = urllib.parse.urlunparse( urllib.parse.ParseResult( @@ -463,6 +540,7 @@ def GuessRepoParams( if trs_params is None else RemoteRepo( repo_url=trs_params[0], + tag=cast("RepoTag", trs_params[4]), repo_type=RepoType.TRS, ) ) @@ -525,7 +603,10 @@ def materialize_repo_from_repo( repoTag = repo.tag guessed_trs_params = self.GuessTRSParams( - remote_file, logger=self.logger, scheme_catalog=self.scheme_catalog + remote_file, + logger=self.logger, + scheme_catalog=self.scheme_catalog, + override_version_id=repoTag, ) if guessed_trs_params is None: raise FetcherException(f"Unable to guess TRS params from {repo}") @@ -575,7 +656,7 @@ def materialize_repo_from_repo( raise FetcherException(errstr) repo_hashed_tag_id = hashlib.sha1( - b"" if repoTag is None else repoTag.encode("utf-8") + b"" if version_id is None else str(version_id).encode("utf-8") ).hexdigest() repo_tag_destpath = repo_destpath / repo_hashed_tag_id else: @@ -922,6 +1003,7 @@ def materialize_repo_from_repo( local=repo_tag_destpath, repo=RemoteRepo( repo_url=remote_file, + tag=cast("RepoTag", str(version_id)), rel_path=cast("Optional[RelPath]", topMeta["workflow_entrypoint"]), repo_type=RepoType.TRS, ), @@ -948,7 +1030,10 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": return remote_repo.repo_url elif remote_repo.repo_type == RepoType.TRS: guessed_trs_params = self.GuessTRSParams( - parsedInputURL, logger=self.logger, fail_ok=True + parsedInputURL, + override_version_id=remote_repo.tag, + logger=self.logger, + fail_ok=True, ) if guessed_trs_params is not 
None: ( From 43aae4d90816669174eaea2f102f1bc01fa5efbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 29 Jan 2025 18:56:15 +0100 Subject: [PATCH 31/60] Removing unused parameter from method BuildRepoPIDFromTRSParams --- tests/fetchers/test_trs.py | 14 -------------- wfexs_backend/fetchers/trs_files.py | 4 ---- 2 files changed, 18 deletions(-) diff --git a/tests/fetchers/test_trs.py b/tests/fetchers/test_trs.py index d18f4122..019eb0b1 100644 --- a/tests/fetchers/test_trs.py +++ b/tests/fetchers/test_trs.py @@ -66,7 +66,6 @@ "trs_endpoint", "workflow_id", "version_id", - "descriptor_type", "url", "remote_repo", "repo_pid", @@ -80,7 +79,6 @@ "#workflow/github.com/sevenbridges-openworkflows/Broad-Best-Practice-Somatic-CNV-Workflows/GATK-Somatic-CNV-Panel-Workflow", ), cast("Optional[WFVersionId]", "master"), - None, cast( "URIType", GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX @@ -109,7 +107,6 @@ "WorkflowId", "#workflow/github.com/NCI-GDC/gdc-dnaseq-cwl/GDC_DNASeq" ), cast("Optional[WFVersionId]", "master"), - None, cast( "URIType", GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX @@ -138,7 +135,6 @@ "WorkflowId", "#workflow/github.com/NCI-GDC/gdc-dnaseq-cwl/GDC_DNASeq" ), None, - None, cast( "URIType", GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX @@ -165,7 +161,6 @@ "https://workflowhub.eu/ga4gh/trs/v2/tools/", cast("WorkflowId", 107), cast("Optional[WFVersionId]", 1), - None, cast( "URIType", GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX @@ -193,7 +188,6 @@ "https://workflowhub.eu/ga4gh/trs/v2/tools/", cast("WorkflowId", 106), cast("Optional[WFVersionId]", 3), - None, cast( "URIType", GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX @@ -221,7 +215,6 @@ "https://workflowhub.eu/ga4gh/trs/v2/", cast("WorkflowId", 119), cast("Optional[WFVersionId]", 1), - None, cast( "URIType", GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX @@ -249,7 +242,6 @@ "https://workflowhub.eu/ga4gh/trs/v2/tools/", cast("WorkflowId", 244), cast("Optional[WFVersionId]", 4), - None, cast( "URIType", GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX @@ -277,7 +269,6 @@ "https://ddbj.github.io/workflow-registry/", cast("WorkflowId", "0d2ae4c2-fe4c-48f7-811a-ac277776533e"), cast("Optional[WFVersionId]", "1.0.0"), - None, cast( "URIType", GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX @@ -309,7 +300,6 @@ def test_guess_trs_repo_params( trs_endpoint: "str", workflow_id: "WorkflowId", version_id: "Optional[WFVersionId]", - descriptor_type: "Optional[TRS_Workflow_Descriptor]", url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]", @@ -332,7 +322,6 @@ def test_build_trs_internal_url_from_repo( trs_endpoint: "str", workflow_id: "WorkflowId", version_id: "Optional[WFVersionId]", - descriptor_type: "Optional[TRS_Workflow_Descriptor]", url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]", @@ -342,7 +331,6 @@ def test_build_trs_internal_url_from_repo( trs_endpoint, workflow_id, version_id, - descriptor_type, ) assert output == url @@ -353,7 +341,6 @@ def test_build_trs_pid_from_repo( trs_endpoint: "str", workflow_id: "WorkflowId", version_id: "Optional[WFVersionId]", - descriptor_type: "Optional[TRS_Workflow_Descriptor]", url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]", @@ -378,7 +365,6 @@ def test_materialize_repo_from_repo( trs_endpoint: "str", workflow_id: "WorkflowId", version_id: "Optional[WFVersionId]", - descriptor_type: "Optional[TRS_Workflow_Descriptor]", url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]", diff --git 
a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index a963fa8b..547526d5 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -551,7 +551,6 @@ def BuildRepoPIDFromTRSParams( trs_endpoint: "str", workflow_id: "WorkflowId", version_id: "Optional[WFVersionId]", - descriptor_type: "Optional[TRS_Workflow_Descriptor]", ) -> "URIType": if isinstance(workflow_id, int): workflow_id_str = str(workflow_id) @@ -580,9 +579,6 @@ def BuildRepoPIDFromTRSParams( + "/versions/" + urllib.parse.quote(str(version_id), safe="") ) - - if descriptor_type is not None: - trs_tool_url += "/" + urllib.parse.quote(descriptor_type, safe="") else: trs_tool_url = trs_tools_url From dec8781f93051d477f475c6ab5ba962a55c95ce1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Feb 2025 14:24:44 +0100 Subject: [PATCH 32/60] Latest changes about refactoring workflow discovery --- wfexs_backend/wfexs_backend.py | 906 +++++++++++---------------------- wfexs_backend/workflow.py | 18 +- 2 files changed, 305 insertions(+), 619 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 1134735f..db22ae79 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -128,6 +128,7 @@ from .utils.rocrate import ( ReadROCrateMetadata, + ROCRATE_JSONLD_FILENAME, ROCrateToolbox, ) @@ -137,6 +138,7 @@ DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher, FetcherException, + MaterializedRepo, RemoteRepo, RepoType, ) @@ -684,7 +686,7 @@ def __init__( self._sngltn_fetcher: "MutableMapping[Type[AbstractStatefulFetcher], AbstractStatefulFetcher]" = ( dict() ) - self.repo_fetchers: "MutableSequence[AbstractSchemeRepoFetcher]" = list() + self._repo_fetchers: "MutableSequence[AbstractSchemeRepoFetcher]" = list() # scheme_catalog is created on first use self.scheme_catalog = SchemeCatalog() # cacheHandler is created on first use @@ -753,6 +755,10 @@ def cacheTRSFilesDir(self) -> "pathlib.Path": def cacheWorkflowInputsDir(self) -> "pathlib.Path": return self.cachePathMap[CacheType.Input] + @property + def repo_fetchers(self) -> "Sequence[AbstractSchemeRepoFetcher]": + return sorted(self._repo_fetchers, key=lambda f: f.PRIORITY, reverse=True) + def getCacheHandler( self, cache_type: "CacheType" ) -> "Tuple[CacheHandler, Optional[pathlib.Path]]": @@ -780,7 +786,7 @@ def instantiateStatefulFetcher( # Also, if it is a repository fetcher, record it separately if isinstance(instStatefulFetcher, AbstractSchemeRepoFetcher): - self.repo_fetchers.append(instStatefulFetcher) + self._repo_fetchers.append(instStatefulFetcher) return cast("StatefulFetcher", instStatefulFetcher) @@ -1896,22 +1902,22 @@ def guess_repo_params( self, wf_url: "Union[URIType, parse.ParseResult]", fail_ok: "bool" = False, - ) -> "Optional[RemoteRepo]": + ) -> "Optional[Tuple[RemoteRepo, AbstractSchemeRepoFetcher]]": if isinstance(wf_url, parse.ParseResult): parsedRepoURL = wf_url else: parsedRepoURL = urllib.parse.urlparse(wf_url) - remote_repo = SoftwareHeritageFetcher.GuessRepoParams( - parsedRepoURL, logger=self.logger, fail_ok=fail_ok - ) - if remote_repo is None: - 
# Assume it might be a git repo or a link to a git repo - remote_repo = GitFetcher.GuessRepoParams( + remote_repo: "Optional[RemoteRepo]" = None + fetcher: "Optional[AbstractSchemeRepoFetcher]" = None + for fetcher in self.repo_fetchers: + remote_repo = fetcher.GuessRepoParams( parsedRepoURL, logger=self.logger, fail_ok=fail_ok ) + if remote_repo is not None: + return remote_repo, fetcher - return remote_repo + return None def cacheWorkflow( self, @@ -1947,10 +1953,16 @@ def cacheWorkflow( if requested_workflow_type is None: self.logger.warning( - f"Workflow of type {descriptor_type} is not supported by this version of WfExS-backend" + f"Workflow of type {descriptor_type} is not supported by this version of WfExS-backend. Switching to guess mode." ) - putative_repo_url = str(workflow_id) + if (trs_endpoint is not None) and len(trs_endpoint) > 0: + putative_repo_url = GA4GHTRSFetcher.BuildRepoPIDFromTRSParams( + trs_endpoint, workflow_id, version_id + ) + else: + putative_repo_url = cast("URIType", str(workflow_id)) + parsedRepoURL = urllib.parse.urlparse(putative_repo_url) # It is not an absolute URL, so it is being an identifier in the workflow @@ -1960,206 +1972,59 @@ def cacheWorkflow( repoDir: "Optional[pathlib.Path]" = None putative: "bool" = False cached_putative_path: "Optional[pathlib.Path]" = None - if parsedRepoURL.scheme in ( - "", - GA4GHTRSFetcher.TRS_SCHEME_PREFIX, - GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX, - ): - # Extracting the TRS endpoint details from the parsedRepoURL - if parsedRepoURL.scheme == GA4GHTRSFetcher.TRS_SCHEME_PREFIX: - # Duplication of code borrowed from trs_files.py - path_steps: "Sequence[str]" = parsedRepoURL.path.split("/") - if len(path_steps) < 3 or path_steps[0] != "": - raise WfExSBackendException( - f"Ill-formed TRS CURIE {putative_repo_url}. 
It should be in the format of {GA4GHTRSFetcher.TRS_SCHEME_PREFIX}://id/version or {GA4GHTRSFetcher.TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" - ) - trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) - trs_steps.extend(["ga4gh", "trs", "v2", ""]) - trs_endpoint = urllib.parse.urlunparse( - urllib.parse.ParseResult( - scheme="https", - netloc=parsedRepoURL.netloc, - path="/".join(trs_steps), - params="", - query="", - fragment="", - ) - ) - - workflow_id = urllib.parse.unquote(path_steps[-2]) - version_id = urllib.parse.unquote(path_steps[-1]) - elif parsedRepoURL.scheme == GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX: - # Time to try guessing everything - try: - internal_trs_cached_content = self.cacheHandler.fetch( - cast("URIType", parsedRepoURL.path), - destdir=meta_dir, - offline=offline, - ignoreCache=ignoreCache, - ) - - with internal_trs_cached_content.path.open( - mode="r", encoding="utf-8" - ) as ctmf: - trs__meta = json.load(ctmf) - except Exception as e: - raise WFException( - f"trs_endpoint could not be guessed from {parsedRepoURL.path} (raised exception)" - ) from e - - if not isinstance(trs__meta, dict): - raise WFException( - f"trs_endpoint could not be guessed from {parsedRepoURL.path} (not returning JSON object)" - ) - - trs__meta__id: "Optional[str]" = trs__meta.get("id") - if trs__meta__id is None: - raise WFException( - f"trs_endpoint could not be guessed from {parsedRepoURL.path} (not returning id)" - ) - - trs__meta__url: "Optional[str]" = trs__meta.get("url") - if trs__meta__url is None: - raise WFException( - f"trs_endpoint could not be guessed from {parsedRepoURL.path} (not returning url)" - ) - - if "descriptor_type" in trs__meta: - version_id = trs__meta__id - # Now we need to backtrack in the url to get the workflow id - tool_url_suffix = "/versions/" + urllib.parse.quote( - version_id, safe="" - ) - - # If this happens, this implementation is not so compliant with standard - dockstore_tool_url_suffix = "/versions/" + urllib.parse.quote( - trs__meta.get("name", ""), safe="" - ) - if trs__meta__url.endswith(dockstore_tool_url_suffix): - tool_url_suffix = dockstore_tool_url_suffix - version_id = trs__meta.get("name", "") - - if not trs__meta__url.endswith(tool_url_suffix): - raise WFException( - f"trs_endpoint could not be guessed from {parsedRepoURL.path} and {trs__meta__url} (version {version_id}, mismatched API route)" - ) - - trs_tool_url = trs__meta__url[0 : -len(tool_url_suffix)] - try: - internal_trs_cached_content = self.cacheHandler.fetch( - cast("URIType", trs_tool_url), - destdir=meta_dir, - offline=offline, - ignoreCache=ignoreCache, - ) - - with internal_trs_cached_content.path.open( - mode="r", encoding="utf-8" - ) as ctmf: - trs__meta = json.load(ctmf) - except Exception as e: - raise WFException( - f"trs_endpoint could not be guessed from {trs_tool_url} (came from {parsedRepoURL.path}, raised exception)" - ) from e - - trs__meta__id = trs__meta.get("id") - if trs__meta__id is None: - raise WFException( - f"trs_endpoint could not be guessed from {trs_tool_url} (came from {parsedRepoURL.path}, not returning id)" - ) - - trs__meta__url = trs__meta.get("url") - if trs__meta__url is None: - raise WFException( - f"trs_endpoint could not be guessed from {trs_tool_url} (came from {parsedRepoURL.path}, not returning url)" - ) - else: - trs_tool_url = parsedRepoURL.path - version_id = None - - if "toolclass" in trs__meta: - workflow_id = trs__meta__id - - # Now we need to backtrack in the url to get the workflow id - tool_url_suffix = 
"/tools/" + urllib.parse.quote( - workflow_id, safe="" - ) - if not trs__meta__url.endswith(tool_url_suffix): - raise WFException( - f"trs_endpoint could not be guessed from {trs_tool_url} and {trs__meta__url} (mismatched API route)" - ) - - trs_endpoint = trs__meta__url[0 : -len(tool_url_suffix)] - else: - raise WFException( - f"trs_endpoint could not be guessed from {parsedRepoURL.path} (no clues)" - ) - - if (trs_endpoint is not None) and len(trs_endpoint) > 0: - i_workflow, repoDir = self.getWorkflowRepoFromTRS( - trs_endpoint, - workflow_id, - version_id, - descriptor_type, - ignoreCache=ignoreCache, - offline=offline, - meta_dir=meta_dir, + if parsedRepoURL.scheme == "": + raise WFException("trs_endpoint was not provided") + + # Trying to be smarter + guessed = self.guess_repo_params(parsedRepoURL, fail_ok=True) + if guessed is not None: + guessedRepo = guessed[0] + if guessedRepo.tag is None and version_id is not None: + guessedRepo = RemoteRepo( + repo_url=guessedRepo.repo_url, + tag=cast("RepoTag", str(version_id)), + rel_path=guessedRepo.rel_path, + repo_type=guessedRepo.repo_type, + web_url=guessedRepo.web_url, ) - # For the cases of pure TRS repos, like Dockstore - # repoDir contains the cached path - else: - raise WFException("trs_endpoint was not provided") else: - # Trying to be smarter - guessedRepo = self.guess_repo_params(parsedRepoURL, fail_ok=True) - - if guessedRepo is not None: - if guessedRepo.tag is None and version_id is not None: - guessedRepo = RemoteRepo( - repo_url=guessedRepo.repo_url, - tag=cast("RepoTag", version_id), - rel_path=guessedRepo.rel_path, - repo_type=guessedRepo.repo_type, - web_url=guessedRepo.web_url, - ) - else: - repoRelPath: "Optional[str]" = None - ( - i_workflow, - cached_putative_path, - metadata_array, - repoRelPath, - ) = self.getWorkflowBundleFromURI( - cast("URIType", workflow_id), - offline=offline, - ignoreCache=ignoreCache, - ) + repoRelPath: "Optional[str]" = None + ( + i_workflow, + cached_putative_path, + metadata_array, + repoRelPath, + ) = self.getWorkflowBundleFromURI( + putative_repo_url, + offline=offline, + ignoreCache=ignoreCache, + ) - if i_workflow is None: - repoDir = cached_putative_path - if not repoRelPath: - if repoDir.is_dir(): - if len(parsedRepoURL.fragment) > 0: - frag_qs = urllib.parse.parse_qs(parsedRepoURL.fragment) - subDirArr = frag_qs.get("subdirectory", []) - if len(subDirArr) > 0: - repoRelPath = subDirArr[0] - elif len(metadata_array) > 0: - # Let's try getting a pretty filename - # when the workflow is a single file - repoRelPath = metadata_array[0].preferredName - - # It can be either a relative path to a directory or to a file - # It could be even empty! - if repoRelPath == "": - repoRelPath = None - # raise WFException('Unable to guess repository from RO-Crate manifest') - guessedRepo = RemoteRepo( - repo_url=cast("RepoURL", workflow_id), - tag=cast("RepoTag", version_id), - rel_path=cast("Optional[RelPath]", repoRelPath), - ) - putative = True + if i_workflow is None: + repoDir = cached_putative_path + if not repoRelPath: + if repoDir.is_dir(): + if len(parsedRepoURL.fragment) > 0: + frag_qs = urllib.parse.parse_qs(parsedRepoURL.fragment) + subDirArr = frag_qs.get("subdirectory", []) + if len(subDirArr) > 0: + repoRelPath = subDirArr[0] + elif len(metadata_array) > 0: + # Let's try getting a pretty filename + # when the workflow is a single file + repoRelPath = metadata_array[0].preferredName + + # It can be either a relative path to a directory or to a file + # It could be even empty! 
+ if repoRelPath == "": + repoRelPath = None + # raise WFException('Unable to guess repository from RO-Crate manifest') + guessedRepo = RemoteRepo( + repo_url=cast("RepoURL", workflow_id), + tag=cast("RepoTag", version_id), + rel_path=cast("Optional[RelPath]", repoRelPath), + ) + putative = True # This can be incorrect, but let it be for now if i_workflow is not None: @@ -2178,6 +2043,7 @@ def cacheWorkflow( assert guessedRepo is not None assert guessedRepo.repo_url is not None + repo: "RemoteRepo" = guessedRepo repoEffectiveCheckout: "Optional[RepoTag]" = None # A putative workflow is one which is already materialized @@ -2188,422 +2054,216 @@ def cacheWorkflow( len(parsedRepoURL.scheme) > 0 ), f"Repository id {guessedRepo.repo_url} should be a parsable URI" - repoDir, repoEffectiveCheckout = self.doMaterializeRepo( + repoDir, materialized_repo, downstream_repos = self.doMaterializeRepo( guessedRepo, + fetcher=guessed[1] if guessed is not None else None, doUpdate=ignoreCache, registerInCache=registerInCache, ) + assert len(downstream_repos) > 0 + repo = materialized_repo.repo + repoEffectiveCheckout = repo.get_checkout() + # TODO: should we preserve the chain of repos? - return repoDir, guessedRepo, engineDesc, repoEffectiveCheckout + return repoDir, repo, engineDesc, repoEffectiveCheckout TRS_METADATA_FILE: "Final[RelPath]" = cast("RelPath", "trs_metadata.json") TRS_QUERY_CACHE_FILE: "Final[RelPath]" = cast("RelPath", "trs_result.json") - def getWorkflowRepoFromTRS( + def doMaterializeRepo( self, - trs_endpoint: "str", - workflow_id: "WorkflowId", - version_id: "Optional[WFVersionId]", - descriptor_type: "Optional[TRS_Workflow_Descriptor]", - offline: "bool" = False, - ignoreCache: "bool" = False, - meta_dir: "Optional[pathlib.Path]" = None, - ) -> "Tuple[IdentifiedWorkflow, Optional[pathlib.Path]]": + repo: "RemoteRepo", + fetcher: "Optional[AbstractSchemeRepoFetcher]" = None, + doUpdate: "bool" = True, + registerInCache: "bool" = True, + ) -> "Tuple[pathlib.Path, MaterializedRepo, Sequence[RemoteRepo]]": """ - - :return: + This method is used to materialize repos described using instances + of RemoteRepo. It starts asking all the known repo fetchers whether + they recognize the URI as consumable by them. + + Later, they fulfil the materialization task, answering the local + path where the repo was cloned, an updated instance of RemoteRepo, + the metadata array of all the requests, and whether their copy + came from another upstream repo (and whether it is recommended). + + If the upstream repo is recommended, then doMaterializeRepo calls + itself using it in order to fetch the contents of the upstream repo. + + If no repo fetcher is able to materialize the repo, then it is + considered a "raw" one, so it is fetched using standard fetchers. + With the fetched content, it is detected whether it is an RO-Crate. + If it is so, and the associated upstream repo is obtained, then + doMaterializeRepo calls itself in order to materialize it. + + At the end of the process the path to the repo, the identified + tag, a MaterializedRepo instance and the list of repos which brought + to this one is returned. 
""" - # If nothing is set, just create a temporary directory - if meta_dir is None: - meta_dir = pathlib.Path( - tempfile.mkdtemp(prefix="WfExS", suffix="TRSFetched") - ) - # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, meta_dir, True) - else: - # Assuring the destination directory does exist - meta_dir.mkdir(parents=True, exist_ok=True) - - if isinstance(workflow_id, int): - workflow_id_str = str(workflow_id) - else: - workflow_id_str = workflow_id - - # The base URL must end with a slash - if trs_endpoint[-1] != "/": - trs_endpoint += "/" - # Now, time to check whether it is a TRSv2 - trs_endpoint_v2_meta_url = cast("URIType", trs_endpoint + "service-info") - trs_endpoint_v2_beta2_meta_url = cast("URIType", trs_endpoint + "metadata") - trs_endpoint_meta_url = None - - # Needed to store this metadata - trsMetadataCache = meta_dir / self.TRS_METADATA_FILE + # This is needed in case a proposed fetcher is already set + # by the caller of this method (discouraged) + if fetcher is None: + for fetcher in self.repo_fetchers: + if fetcher.build_pid_from_repo(repo) is not None: + break + else: + fetcher = None - try: - trs_cached_content = self.cacheHandler.fetch( - trs_endpoint_v2_meta_url, - destdir=meta_dir, - offline=offline, - ignoreCache=ignoreCache, - ) - trs_endpoint_meta_url = trs_endpoint_v2_meta_url - except WFException as wfe: - try: - trs_cached_content = self.cacheHandler.fetch( - trs_endpoint_v2_beta2_meta_url, - destdir=meta_dir, - offline=offline, - ignoreCache=ignoreCache, - ) - trs_endpoint_meta_url = trs_endpoint_v2_beta2_meta_url - except WFException as wfebeta: - raise WFException( - "Unable to fetch metadata from {} in order to identify whether it is a working GA4GH TRSv2 endpoint. Exceptions:\n{}\n{}".format( - trs_endpoint, wfe, wfebeta - ) + if fetcher is None and repo.repo_type not in (RepoType.Raw, None): + raise WfExSBackendException( + f"Don't know how to materialize {repo.repo_url} (of type {repo.repo_type}) as a repository" ) - # Giving a friendly name - if not trsMetadataCache.exists(): - os.symlink(trs_cached_content.path.name, trsMetadataCache) - - with trsMetadataCache.open(mode="r", encoding="utf-8") as ctmf: - trs_endpoint_meta = json.load(ctmf) - - # Minimal checks - trs_version_str: "Optional[str]" = None - trs_artifact: "Optional[str]" = None - trs_group: "Optional[str]" = None - trs_endpoint_meta_type: "Optional[Mapping[str, str]]" = trs_endpoint_meta.get( - "type" - ) - if trs_endpoint_meta_type is not None: - trs_version_str = trs_endpoint_meta_type.get("version") - trs_artifact = trs_endpoint_meta_type.get("artifact") - trs_group = trs_endpoint_meta_type.get("group") - else: - # Supporting 2.0beta2 - trs_version_str = trs_endpoint_meta.get("api_version") - - if trs_version_str is None: - errstr = f"Unable to identify TRS version from {trs_endpoint_meta_url}. Is this a TRS endpoint?" 
- self.logger.error(errstr) - raise WFException(errstr) - - # Avoiding querying a GA4GH DRS service, for instance - if trs_artifact is not None and trs_artifact.lower() not in ("trs", "yevis"): - errstr = f"Unsupported GA4GH service {trs_artifact} (group {trs_group}) from {trs_endpoint_meta_url}" - self.logger.error(errstr) - raise WFException(errstr) - - # Warning about potentially unsupported versions - trs_version_tuple = tuple(map(int, trs_version_str.split("."))) - if trs_version_tuple < (2, 0, 1): - self.logger.warning( - f"{trs_endpoint_meta_url} is offering old TRS version {trs_version_str}, which diverges from what this implementation supports" - ) - elif trs_version_tuple > (3, 0): - self.logger.warning( - f"{trs_endpoint_meta_url} is offering TRS version {trs_version_str}, which might diverge from what this implementation supports" + # An specialized fetcher is used + downstream_repos: "MutableSequence[RemoteRepo]" + if fetcher is not None: + materialized_repo = fetcher.materialize_repo_from_repo( + repo, + doUpdate=doUpdate, + base_repo_destdir=self.cacheWorkflowDir, ) - # Now, check the tool does exist in the TRS, and the version - trs_tools_url = cast( - "URIType", - urllib.parse.urljoin( - trs_endpoint, - WF.TRS_TOOLS_PATH + urllib.parse.quote(workflow_id_str, safe=""), - ), - ) - - trsQueryCache = meta_dir / self.TRS_QUERY_CACHE_FILE - trs_cached_tool = self.cacheHandler.fetch( - trs_tools_url, destdir=meta_dir, offline=offline, ignoreCache=ignoreCache - ) - # Giving a friendly name - if not trsQueryCache.exists(): - os.symlink(trs_cached_tool.path.name, trsQueryCache) - - with trsQueryCache.open(mode="r", encoding="utf-8") as tQ: - rawToolDesc = tQ.read() - - # If the tool does not exist, an exception will be thrown before - jd = json.JSONDecoder() - toolDesc = jd.decode(rawToolDesc) - - # If the tool is not a workflow, complain - if toolDesc.get("toolclass", {}).get("name", "") != "Workflow": - raise WFException( - "Tool {} from {} is not labelled as a workflow. Raw answer:\n{}".format( - workflow_id_str, trs_endpoint, rawToolDesc - ) - ) + downstream_repos = [repo] + repo_path = materialized_repo.local + materialized_repo_repo = materialized_repo.repo + metadata_array = materialized_repo.metadata_array - possibleToolVersions = toolDesc.get("versions", []) - if len(possibleToolVersions) == 0: - raise WFException( - "Version {} not found in workflow {} from {} . Raw answer:\n{}".format( - version_id, workflow_id_str, trs_endpoint, rawToolDesc - ) - ) + # Now, let's register the checkout with cache structures + # using its public URI + remote_url: "str" = repo.repo_url + if fetcher.__class__ == GitFetcher: + if not repo.repo_url.startswith("git"): + remote_url = "git+" + repo.repo_url - toolVersion = None - toolVersionId = str(version_id) if isinstance(version_id, int) else version_id - if (toolVersionId is not None) and len(toolVersionId) > 0: - for possibleToolVersion in possibleToolVersions: - if isinstance(possibleToolVersion, dict): - possibleId = str(possibleToolVersion.get("id", "")) - possibleName = str(possibleToolVersion.get("name", "")) - if version_id in (possibleId, possibleName): - toolVersion = possibleToolVersion - break - else: - raise WFException( - "Version {} not found in workflow {} from {} . 
Raw answer:\n{}".format( - version_id, workflow_id_str, trs_endpoint, rawToolDesc - ) - ) - else: - toolVersionId = "" - for possibleToolVersion in possibleToolVersions: - possibleToolVersionId = str(possibleToolVersion.get("id", "")) - if ( - len(possibleToolVersionId) > 0 - and toolVersionId < possibleToolVersionId - ): - toolVersion = possibleToolVersion - toolVersionId = possibleToolVersionId + if repo.tag is not None: + remote_url += "@" + repo.tag - if toolVersion is None: - raise WFException( - "No valid version was found in workflow {} from {} . Raw answer:\n{}".format( - workflow_id_str, trs_endpoint, rawToolDesc - ) + repo_desc: "Optional[Mapping[str, Any]]" = ( + materialized_repo_repo.gen_repo_desc() ) - - # The version has been found - toolDescriptorTypes = toolVersion.get("descriptor_type", []) - if not isinstance(toolDescriptorTypes, list): - raise WFException( - 'Version {} of workflow {} from {} has no valid "descriptor_type" (should be a list). Raw answer:\n{}'.format( - version_id, workflow_id_str, trs_endpoint, rawToolDesc + if repo_desc is None: + repo_desc = {} + augmented_metadata_array = [ + URIWithMetadata( + uri=cast("URIType", remote_url), + metadata=repo_desc, + ), + *metadata_array, + ] + + # Give the chance to register the current fetched repo in the corresponding cache + if registerInCache: + kind = ContentKind.Directory if repo_path.is_dir() else ContentKind.File + self.cacheHandler.inject( + cast("URIType", remote_url), + destdir=self.cacheWorkflowDir, + fetched_metadata_array=augmented_metadata_array, + finalCachedFilename=repo_path, + inputKind=kind, ) - ) - # Now, realize whether it matches - chosenDescriptorType = descriptor_type - if chosenDescriptorType is None: - for candidateDescriptorType in self.RECOGNIZED_TRS_DESCRIPTORS.keys(): - if candidateDescriptorType in toolDescriptorTypes: - chosenDescriptorType = candidateDescriptorType - break - else: - raise WFException( - 'Version {} of workflow {} from {} has no acknowledged "descriptor_type". Raw answer:\n{}'.format( - version_id, workflow_id_str, trs_endpoint, rawToolDesc + # Go to the next repo only if it is recommended + if ( + materialized_repo.recommends_upstream + and materialized_repo.upstream_repo is not None + ): + try: + ( + upstream_repo_path, + upstream_materialized_repo, + upstream_downstream_repos, + ) = self.doMaterializeRepo( + materialized_repo.upstream_repo, + doUpdate=doUpdate, + registerInCache=registerInCache, ) - ) - elif chosenDescriptorType not in toolVersion["descriptor_type"]: - raise WFException( - "Descriptor type {} not available for version {} of workflow {} from {} . Raw answer:\n{}".format( - descriptor_type, - version_id, - workflow_id_str, - trs_endpoint, - rawToolDesc, - ) - ) - elif chosenDescriptorType not in self.RECOGNIZED_TRS_DESCRIPTORS: - raise WFException( - "Descriptor type {} is not among the acknowledged ones by this backend. Version {} of workflow {} from {} . Raw answer:\n{}".format( - descriptor_type, - version_id, - workflow_id_str, - trs_endpoint, - rawToolDesc, - ) - ) - - toolFilesURL = ( - trs_tools_url - + "/versions/" - + urllib.parse.quote(toolVersionId, safe="") - + "/" - + urllib.parse.quote(chosenDescriptorType, safe="") - + "/files" - ) - - # Detecting whether RO-Crate trick will work - if trs_endpoint_meta.get("organization", {}).get("name") == "WorkflowHub": - self.logger.debug("WorkflowHub workflow") - # And this is the moment where the RO-Crate must be fetched - roCrateURL = cast( - "URIType", - toolFilesURL + "?" 
+ urllib.parse.urlencode({"format": "zip"}), - ) - + downstream_repos.extend(upstream_downstream_repos) + return ( + upstream_repo_path, + upstream_materialized_repo, + downstream_repos, + ) + except Exception as e: + self.logger.warning( + f"Recommended upstream repo {materialized_repo.upstream_repo} from repo {repo} could not be fetched, skipping. Exception: {e}" + ) + else: + downstream_repos = [] + # Let's try guessing whether it is an RO-Crate ( i_workflow, - self.cacheROCrateFilename, + cached_putative_path, metadata_array, - _, + repo_rel_path, ) = self.getWorkflowBundleFromURI( - roCrateURL, - expectedEngineDesc=self.RECOGNIZED_TRS_DESCRIPTORS[ - chosenDescriptorType - ], - offline=offline, - ignoreCache=ignoreCache, - ) - assert i_workflow is not None - return i_workflow, None - else: - self.logger.debug("TRS workflow") - # Learning the available files and maybe - # which is the entrypoint to the workflow - cached_trs_files = self.cacheFetch( - cast( - "URIType", - GA4GHTRSFetcher.INTERNAL_TRS_SCHEME_PREFIX + ":" + toolFilesURL, - ), - CacheType.TRS, - offline=offline, - ignoreCache=ignoreCache, - ) - - expectedEngineDesc = self.RECOGNIZED_TRS_DESCRIPTORS[chosenDescriptorType] - trs_meta = cached_trs_files.metadata_array[0] - remote_workflow_entrypoint = trs_meta.metadata.get( - "remote_workflow_entrypoint" + repo.repo_url, + ignoreCache=doUpdate, + registerInCache=registerInCache, ) - trs_files_path: "Optional[pathlib.Path]" = None - if remote_workflow_entrypoint is not None: - # Give it a chance to identify the original repo of the workflow - repo = self.guess_repo_params(remote_workflow_entrypoint, fail_ok=True) - - self.logger.error( - f"Now guessing from {remote_workflow_entrypoint} {repo}" - ) - if repo is not None: - try: - # This is really, really needed to recognize - # when to fall back to the safe path of what - # we already have - repoDir, repoEffectiveCheckout = self.doMaterializeRepo( - repo, - doUpdate=ignoreCache, - ) - except FetcherException as fe: - self.logger.warning( - f"Repo for {remote_workflow_entrypoint} was guessed, but some element was unreachable. 
Falling back to GA4GH TRS contents from {toolFilesURL}" - ) - self.logger.warning(f"(nested exception was {fe})") - repo = RemoteRepo(repo_url=cast("RepoURL", toolFilesURL)) - if repo.repo_type is None: - workflow_entrypoint = trs_meta.metadata.get( - "workflow_entrypoint" - ) - if workflow_entrypoint is not None: - repo = RemoteRepo( - repo_url=cast("RepoURL", toolFilesURL), - rel_path=workflow_entrypoint, - repo_type=RepoType.TRS, - ) - trs_files_path = cached_trs_files.path + if i_workflow is not None: + # It is an RO-Crate + downstream_repos.append(repo) + i_workflow_repo = i_workflow.remote_repo + if repo_rel_path is not None: + i_workflow_repo = i_workflow_repo._replace(rel_path=repo_rel_path) + downstream_repos.append(i_workflow_repo) - self.logger.debug( - "Derived repository {} ({} , rel {}) from {}".format( - repo.repo_url, repo.tag, repo.rel_path, trs_tools_url - ) + # We are assuming it is always recommended + try: + ( + upstream_repo_path, + upstream_materialized_repo, + upstream_downstream_repos, + ) = self.doMaterializeRepo( + i_workflow_repo, + doUpdate=doUpdate, + registerInCache=registerInCache, ) + downstream_repos.extend(upstream_downstream_repos) return ( - IdentifiedWorkflow( - workflow_type=expectedEngineDesc, remote_repo=repo - ), - trs_files_path, + upstream_repo_path, + upstream_materialized_repo, + downstream_repos, ) - - workflow_entrypoint = trs_meta.metadata.get("workflow_entrypoint") - if workflow_entrypoint is not None: - self.logger.debug( - "Using raw files from TRS tool {}".format(trs_tools_url) + except Exception as e: + raise + # TODO: extract and use payload workflow from RO-Crate as a fallback + else: + # It was not an RO-Crate, so it is a raw workflow + repo_path = cached_putative_path + parsed_repo_url = urllib.parse.urlparse(repo.repo_url) + if not repo_rel_path: + if repo_path.is_dir(): + if len(parsed_repo_url.fragment) > 0: + frag_qs = urllib.parse.parse_qs(parsed_repo_url.fragment) + subDirArr = frag_qs.get("subdirectory", []) + if len(subDirArr) > 0: + repo_rel_path = cast("RelPath", subDirArr[0]) + elif len(metadata_array) > 0: + # Let's try getting a pretty filename + # when the workflow is a single file + repo_rel_path = metadata_array[0].preferredName + + # It can be either a relative path to a directory or to a file + # It could be even empty! 
+ if repo_rel_path == "": + repo_rel_path = None + # raise WFException('Unable to guess repository from RO-Crate manifest') + guessed_repo = RemoteRepo( + repo_url=repo.repo_url, + rel_path=repo_rel_path, + repo_type=RepoType.Raw, ) - return ( - IdentifiedWorkflow( - workflow_type=expectedEngineDesc, - remote_repo=RemoteRepo( - repo_url=cast("RepoURL", toolFilesURL), - rel_path=workflow_entrypoint, - repo_type=RepoType.TRS, - ), - ), - cached_trs_files.path, + downstream_repos.append(guessed_repo) + materialized_repo = MaterializedRepo( + local=repo_path, + repo=guessed_repo, + metadata_array=metadata_array, ) - raise WFException("Unable to find a workflow in {}".format(trs_tools_url)) - - def doMaterializeRepo( - self, - repo: "RemoteRepo", - doUpdate: "bool" = True, - registerInCache: "bool" = True, - ) -> "Tuple[pathlib.Path, RepoTag]": - fetcher_clazz: "Optional[Type[AbstractSchemeRepoFetcher]]" = None - if repo.repo_type not in (RepoType.Other, RepoType.SoftwareHeritage): - fetcher_clazz = GitFetcher - elif repo.repo_type == RepoType.SoftwareHeritage: - fetcher_clazz = SoftwareHeritageFetcher - - if fetcher_clazz is None: - raise WfExSBackendException( - f"Don't know how to materialize {repo.repo_url} as a repository" - ) - - fetcher = self.instantiateRepoFetcher(fetcher_clazz) - materialized_repo_return = fetcher.materialize_repo_from_repo( - repo, - doUpdate=doUpdate, - base_repo_destdir=self.cacheWorkflowDir, - ) - repo_path = materialized_repo_return.local - materialized_repo = materialized_repo_return.repo - metadata_array = materialized_repo_return.metadata_array - - # Now, let's register the checkout with cache structures - # using its public URI - remote_url: "str" = repo.repo_url - if fetcher_clazz == GitFetcher: - if not repo.repo_url.startswith("git"): - remote_url = "git+" + repo.repo_url - - if repo.tag is not None: - remote_url += "@" + repo.tag - - repo_desc: "Optional[Mapping[str, Any]]" = materialized_repo.gen_repo_desc() - if repo_desc is None: - repo_desc = {} - augmented_metadata_array = [ - URIWithMetadata( - uri=cast("URIType", remote_url), - metadata=repo_desc, - ), - *metadata_array, - ] - - if registerInCache: - kind = ContentKind.Directory if repo_path.is_dir() else ContentKind.File - self.cacheHandler.inject( - cast("URIType", remote_url), - destdir=self.cacheWorkflowDir, - fetched_metadata_array=augmented_metadata_array, - finalCachedFilename=repo_path, - inputKind=kind, - ) - - return repo_path, materialized_repo.get_checkout() + return repo_path, materialized_repo, downstream_repos def getWorkflowBundleFromURI( self, @@ -2633,15 +2293,29 @@ def getWorkflowBundleFromURI( if cached_content.path.is_file(): # Now, let's guess whether it is a possible RO-Crate or a bare file - encoding = magic.from_file(cached_content.path.as_posix(), mime=True) + metadata_file = cached_content.path + encoding = magic.from_file(metadata_file.as_posix(), mime=True) + elif cached_content.path.is_dir(): + metadata_file = cached_content.path / ROCRATE_JSONLD_FILENAME + if metadata_file.is_file(): + encoding = magic.from_file(metadata_file.as_posix(), mime=True) + else: + # A directory does not have mime type + encoding = "" else: - # A directory does not have mime type - encoding = "" - if encoding == "application/zip": - self.logger.info( - "putative workflow {} seems to be a packed RO-Crate".format(remote_url) + raise WfExSBackendException( + f"Unexpected cached path {cached_content.path}, which is neither file nor directory" ) + if encoding in ("application/zip", 
"application/json"): + if encoding == "application/zip": + info_message = ( + f"putative workflow {remote_url} seems to be a packed RO-Crate" + ) + else: + info_message = f"putative workflow from {remote_url} seems to be an unpacked RO-Crate" + self.logger.info(info_message) + crate_hashed_id = hashlib.sha1(remote_url.encode("utf-8")).hexdigest() roCrateFile = pathlib.Path(self.cacheROCrateDir) / ( crate_hashed_id + self.DEFAULT_RO_EXTENSION @@ -2654,22 +2328,29 @@ def getWorkflowBundleFromURI( roCrateFile, ) - identified_workflow = self.getWorkflowRepoFromROCrateFile( - roCrateFile, expectedEngineDesc - ) - return ( - identified_workflow, - roCrateFile, - cached_content.metadata_array, - identified_workflow.remote_repo.rel_path, - ) - else: - return ( - None, - cached_content.path, - cached_content.metadata_array, - None, - ) + try: + identified_workflow = self.getWorkflowRepoFromROCrateFile( + roCrateFile, expectedEngineDesc + ) + return ( + identified_workflow, + roCrateFile, + cached_content.metadata_array, + identified_workflow.remote_repo.rel_path, + ) + except Exception as e: + self.logger.info( + f"Putative workflow from {remote_url} is considered a raw one." + ) + self.logger.debug(f"Rejection traces {e}") + + # Default return + return ( + None, + cached_content.path, + cached_content.metadata_array, + None, + ) def getWorkflowRepoFromROCrateFile( self, @@ -2726,11 +2407,12 @@ def getWorkflowRepoFromROCrateFile( ) # We need this additional step to guess the repo type - guessedRepo = self.guess_repo_params(repo.repo_url, fail_ok=True) - if guessedRepo is None or guessedRepo.repo_type is None: + guessed = self.guess_repo_params(repo.repo_url, fail_ok=True) + if guessed is None or guessed[0].repo_type is None: raise WfExSBackendException( f"Unable to guess repository from RO-Crate manifest obtained from {public_name}" ) + guessedRepo = guessed[0] # Rescuing some values if repo.tag is not None and guessedRepo.tag is None: diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 624a75c5..eace1131 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -449,7 +449,6 @@ class WF: DEFAULT_TRS_ENDPOINT: "Final[str]" = ( "https://dev.workflowhub.eu/ga4gh/trs/v2/" # root of GA4GH TRS API ) - TRS_TOOLS_PATH: "Final[str]" = "tools/" def __init__( self, @@ -654,15 +653,12 @@ def __init__( [] if default_actions is None else default_actions ) + # We are assuming here the provided TRS endpoint is right # The endpoint should always end with a slash if isinstance(trs_endpoint, str): if trs_endpoint[-1] != "/": trs_endpoint += "/" - # Removing the tools suffix, which appeared in first WfExS iterations - if trs_endpoint.endswith("/" + self.TRS_TOOLS_PATH): - trs_endpoint = trs_endpoint[0 : -len(self.TRS_TOOLS_PATH)] - self.trs_endpoint = trs_endpoint else: self.trs_endpoint = None @@ -1944,6 +1940,7 @@ def fetchWorkflow( repoDir: "Optional[pathlib.Path]" = None injected_workflow: "Optional[LocalWorkflow]" = None rel_path_files: "Optional[Sequence[Union[RelPath, URIType]]]" = None + # Materialize the workflow, even if it was already materialized if self.remote_repo is None or ignoreCache: repoEffectiveCheckout: "Optional[RepoTag]" # Injectable repo info is a precondition for injectable local workflow @@ -1996,11 +1993,18 @@ def fetchWorkflow( f"Injected workflow has a different relPath from the injected repo" ) else: - repoDir, repoEffectiveCheckout = self.wfexs.doMaterializeRepo( + ( + repoDir, + materialized_repo, + downstream_repos, + ) = self.wfexs.doMaterializeRepo( repo, doUpdate=ignoreCache, # registerInCache=True, ) + assert len(downstream_repos) > 0 + repo = materialized_repo.repo + repoEffectiveCheckout = repo.get_checkout() else: ( repoDir, From 394886e2100039d836e95838359cab61f21b4449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Feb 2025 20:21:19 +0100 Subject: [PATCH 33/60] Fixed FASP and Wiktionary fetchers initialisation --- wfexs_backend/fetchers/fasp.py | 15 ++++++++++++--- wfexs_backend/fetchers/wiktionary.py | 7 +------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/wfexs_backend/fetchers/fasp.py b/wfexs_backend/fetchers/fasp.py index 6e99156a..4014f985 100644 --- a/wfexs_backend/fetchers/fasp.py +++ b/wfexs_backend/fetchers/fasp.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -45,6 +45,10 @@ URIType, ) + from ..scheme_catalog import ( + SchemeCatalog, + ) + from . 
import ( AbstractStatefulFetcher, DocumentedStatefulProtocolFetcher, @@ -64,9 +68,14 @@ class FASPFetcher(AbstractStatefulFetcher): DEFAULT_ASPERA_CMD: "Final[SymbolicName]" = cast("SymbolicName", "ascp") def __init__( - self, progs: "ProgsMapping", setup_block: "Optional[Mapping[str, Any]]" = None + self, + progs: "ProgsMapping", + setup_block: "Optional[Mapping[str, Any]]" = None, + scheme_catalog: "Optional[SchemeCatalog]" = None, ): - super().__init__(progs=progs, setup_block=setup_block) + super().__init__( + progs=progs, setup_block=setup_block, scheme_catalog=scheme_catalog + ) self.ascp_cmd = self.progs.get( self.DEFAULT_ASPERA_CMD, cast("RelPath", self.DEFAULT_ASPERA_CMD) diff --git a/wfexs_backend/fetchers/wiktionary.py b/wfexs_backend/fetchers/wiktionary.py index ea434974..7921bbbf 100644 --- a/wfexs_backend/fetchers/wiktionary.py +++ b/wfexs_backend/fetchers/wiktionary.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -64,11 +64,6 @@ class WiktionaryFetcher(AbstractStatefulFetcher): WIKTIONARY_PROTO: "Final[str]" = "wfexs.wiktionary" - def __init__( - self, progs: "ProgsMapping", setup_block: "Optional[Mapping[str, Any]]" = None - ): - super().__init__(progs=progs, setup_block=setup_block) - @classmethod def GetSchemeHandlers(cls) -> "Mapping[str, DocumentedStatefulProtocolFetcher]": # These are de-facto schemes supported by pip and git client From 76136bd159f5dd9770a0e3eeac737d29a3e61869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Feb 2025 20:21:45 +0100 Subject: [PATCH 34/60] Added better error handling on fetcher initialisation --- wfexs_backend/scheme_catalog.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/wfexs_backend/scheme_catalog.py b/wfexs_backend/scheme_catalog.py index 0691abba..83e93681 100644 --- a/wfexs_backend/scheme_catalog.py +++ b/wfexs_backend/scheme_catalog.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: Apache-2.0 -# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# Copyright 2020-2025 Barcelona Supercomputing Center (BSC), Spain # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
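The hunk below wraps the per-scheme instantiation in a try/except, so one faulty fetcher class can no longer abort the registration of the remaining schemes. Roughly, and only as a sketch with invented names (the real logic lives in addSchemeHandlers), the intended behaviour is:

    import logging

    logger = logging.getLogger(__name__)

    def instantiate_handlers(scheme_to_class, setup_blocks):
        # Keep registering the remaining schemes even if one
        # constructor raises; just log the failure and move on.
        handlers = {}
        for scheme, clazz in scheme_to_class.items():
            lscheme = scheme.lower()  # schemes are case insensitive
            try:
                handlers[lscheme] = clazz(setup_block=setup_blocks.get(lscheme, {}))
            except Exception:
                logger.exception(
                    f"Error while instantiating handler {clazz} for scheme {lscheme}"
                )
        return handlers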
@@ -452,16 +452,21 @@ def addSchemeHandlers( instSchemeHandler = None if isinstance(schemeHandler, DocumentedStatefulProtocolFetcher): - instSchemeInstance = self.instantiateStatefulFetcher( - schemeHandler.fetcher_class, setup_block=setup_block - ) - if instSchemeInstance is not None: - instSchemeHandler = DocumentedProtocolFetcher( - fetcher=instSchemeInstance.fetch, - description=instSchemeInstance.description - if schemeHandler.description is None - else schemeHandler.description, - priority=schemeHandler.priority, + try: + instSchemeInstance = self.instantiateStatefulFetcher( + schemeHandler.fetcher_class, setup_block=setup_block + ) + if instSchemeInstance is not None: + instSchemeHandler = DocumentedProtocolFetcher( + fetcher=instSchemeInstance.fetch, + description=instSchemeInstance.description + if schemeHandler.description is None + else schemeHandler.description, + priority=schemeHandler.priority, + ) + except Exception as e: + self.logger.exception( + f"Error while instantiating handler implemented at {schemeHandler.fetcher_class} for scheme {lScheme}" ) elif isinstance(schemeHandler, DocumentedProtocolFetcher) and callable( schemeHandler.fetcher @@ -473,5 +478,9 @@ def addSchemeHandlers( # Schemes are case insensitive, so register only # the lowercase version instSchemeHandlers[lScheme] = instSchemeHandler + else: + self.logger.warning( + f"Scheme {lScheme} could not be properly instantiated" + ) self.addRawSchemeHandlers(instSchemeHandlers) From 57ddd5ef1c2e8281755800114e355d5793409f78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 8 Feb 2025 13:50:38 +0100 Subject: [PATCH 35/60] Fixed corner case where some variables were uninitialized on TRS parameter guessing --- wfexs_backend/fetchers/trs_files.py | 47 ++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 547526d5..5cca4a65 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -183,6 +183,9 @@ def GuessTRSParams( putative_tool_uri: "Optional[URIType]" = None descriptor: "Optional[str]" = None service_info_metadata: "Optional[MutableMapping[str, Any]]" = None + trs_tool_uri: "URIType" + trs_tool_meta: "Optional[Mapping[str, Any]]" = None + version_id: "Optional[WFVersionId]" = None if parsed_wf_url.scheme == cls.TRS_SCHEME_PREFIX: # Duplication of code path_steps: "Sequence[str]" = parsed_wf_url.path.split("/") @@ -262,12 +265,13 @@ def GuessTRSParams( if not override_version_id else override_version_id ) - trs_tool_uri = ( + trs_tool_uri = cast( + "URIType", trs_endpoint + cls.TRS_TOOLS_SUFFIX + path_steps[-2] + "/versions/" - + urllib.parse.quote(cast("str", version_id), safe="") + + urllib.parse.quote(cast("str", version_id), safe=""), ) workflow_id = urllib.parse.unquote(path_steps[-2]) descriptor = None @@ -358,6 +362,8 @@ def GuessTRSParams( # Reuse the last version trs_tool_meta = versions[-1] + assert trs_tool_meta is not None + trs_endpoint = urllib.parse.urlunparse( urllib.parse.ParseResult( scheme=parsed_putative_tool_uri.scheme, @@ -373,7 +379,7 @@ def GuessTRSParams( parsed_putative_tool_uri.path.split("/")[-1] ) trs_tool_prefix = putative_tool_uri - version_id = trs_tool_meta.get("id") + version_id = cast("Optional[WFVersionId]", trs_tool_meta.get("id")) name = trs_tool_meta.get("name") if version_id is not None: # Dockstore misbehaves @@ -383,10 +389,11 @@ def GuessTRSParams( and 
parsed_putative_tool_uri.netloc.endswith("dockstore.org") ): version_id = name - trs_tool_uri = ( + trs_tool_uri = cast( + "URIType", trs_tool_prefix + "/versions/" - + urllib.parse.quote(str(version_id), safe="") + + urllib.parse.quote(str(version_id), safe=""), ) elif fail_ok: return None @@ -494,7 +501,37 @@ def GuessTRSParams( ) # This is needed to guarantee it is always declared + assert version_id is not None assert trs_tool_uri is not None + if trs_tool_meta is None: + trs_tool_wfexs_meta: "MutableMapping[str, Union[URIType, Optional[Mapping[str, Any]]]]" = { + "fetched": trs_tool_uri, + "payload": None, + } + metadata_array.append(URIWithMetadata(wf_url, trs_tool_wfexs_meta)) + try: + resio = io.BytesIO() + _, metaresio, _ = scheme_catalog.streamfetch( + trs_tool_uri, + resio, + sec_context={ + "headers": { + "Accept": "application/json", + # Added to avoid Cloudflare anti-bot policy + "User-Agent": get_WfExS_version_str(), + }, + }, + ) + trs_tool_meta = json.loads(resio.getvalue().decode("utf-8")) + trs_tool_wfexs_meta["payload"] = trs_tool_meta + metadata_array.extend(metaresio) + except Exception as e: + if fail_ok: + return None + raise FetcherException( + f"trs_endpoint could not be guessed from {putative_tool_uri} (forced version {override_version_id}, raised exception {e})" + ) from e + assert trs_tool_meta is not None if not isinstance(trs_tool_meta.get("descriptor_type"), list): From 13dceea5c48bcee5ef5342c0363ed84e94f5ebd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 8 Feb 2025 23:07:21 +0100 Subject: [PATCH 36/60] Fixed corner case of non-standard GA4GH TRS identification --- wfexs_backend/fetchers/trs_files.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 5cca4a65..0b20d7c2 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -251,7 +251,9 @@ def GuessTRSParams( ) service_info_wfexs_meta["payload"] = service_info_metadata metadata_array.extend(metametaio) - trs_endpoint = trs_service_info[0 : -len("service-info")] + trs_endpoint = non_standard_trs_service_info[ + 0 : -len("service-info") + ] except Exception as e2: if fail_ok: return None From 1792f4e8a1ed564e831bbf300f37cdb1a6951af9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 11 Feb 2025 02:31:38 +0100 Subject: [PATCH 37/60] Pass down the list of programs to the stateful fetchers to be initialized. * Also, obtain the list of repo fetchers from the fetchers initialization procedure. * Cleaned up dead code. * Propagate workflow type for scenarios where it could be obtained. 
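In practice, the registration entry point now hands back the repo fetchers it instantiated, and callers walk them in descending PRIORITY order when guessing repositories. A toy sketch of that pattern, with invented fetcher classes standing in for the real AbstractSchemeRepoFetcher subclasses:

    from typing import Sequence

    class BaseRepoFetcher:
        PRIORITY = 0

    class GitLikeFetcher(BaseRepoFetcher):
        PRIORITY = 20

    class ArchiveLikeFetcher(BaseRepoFetcher):
        PRIORITY = 10

    def register_fetchers() -> Sequence[BaseRepoFetcher]:
        # Stand-in for findAndAddSchemeHandlersFromModuleName(), which
        # now also returns the repo fetcher instances it created.
        return [ArchiveLikeFetcher(), GitLikeFetcher()]

    repo_fetchers = sorted(
        register_fetchers(), key=lambda f: f.PRIORITY, reverse=True
    )
    assert isinstance(repo_fetchers[0], GitLikeFetcher)  # highest priority wins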
--- wfexs_backend/scheme_catalog.py | 126 ++++++++++++++++++++------------ wfexs_backend/wfexs_backend.py | 67 ++++++----------- wfexs_backend/workflow.py | 1 + 3 files changed, 103 insertions(+), 91 deletions(-) diff --git a/wfexs_backend/scheme_catalog.py b/wfexs_backend/scheme_catalog.py index 83e93681..109e5b9b 100644 --- a/wfexs_backend/scheme_catalog.py +++ b/wfexs_backend/scheme_catalog.py @@ -256,10 +256,14 @@ def instantiateStatefulFetcher( instStatefulFetcher: "Optional[AbstractStatefulFetcher]" = None if inspect.isclass(statefulFetcher): if issubclass(statefulFetcher, AbstractStatefulFetcher): + # Setting the default list of programs + mutable_progs = copy.copy(progs) + for prog in statefulFetcher.GetNeededPrograms(): + mutable_progs.setdefault(prog, cast("RelPath", prog)) try: if issubclass(statefulFetcher, AbstractSchemeRepoFetcher): instStatefulFetcher = statefulFetcher( - self, progs=progs, setup_block=setup_block + self, progs=mutable_progs, setup_block=setup_block ) else: instStatefulFetcher = statefulFetcher( @@ -289,12 +293,14 @@ def findAndAddSchemeHandlersFromModuleName( self, the_module_name: "str" = "wfexs_backend.fetchers", fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: + progs: "ProgsMapping" = dict(), + ) -> "Sequence[AbstractSchemeRepoFetcher]": try: the_module = importlib.import_module(the_module_name) - self.findAndAddSchemeHandlersFromModule( + return self.findAndAddSchemeHandlersFromModule( the_module, fetchers_setup_block=fetchers_setup_block, + progs=progs, ) except Exception as e: errmsg = f"Unable to import module {the_module_name} in order to gather scheme handlers, due errors:" @@ -305,7 +311,10 @@ def findAndAddSchemeHandlersFromModule( self, the_module: "ModuleType", fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: + progs: "ProgsMapping" = dict(), + ) -> "Sequence[AbstractSchemeRepoFetcher]": + repo_fetchers: "MutableSequence[AbstractSchemeRepoFetcher]" = [] + for finder, module_name, ispkg in iter_namespace(the_module): try: named_module = importlib.import_module(module_name) @@ -334,9 +343,12 @@ def findAndAddSchemeHandlersFromModule( ): # Now, let's learn whether the class is enabled if getattr(obj, "ENABLED", False): - self.addStatefulSchemeHandlers( - obj, - fetchers_setup_block=fetchers_setup_block, + repo_fetchers.extend( + self.addStatefulSchemeHandlers( + obj, + fetchers_setup_block=fetchers_setup_block, + progs=progs, + ) ) skipit = False @@ -345,11 +357,14 @@ def findAndAddSchemeHandlersFromModule( f"Fetch module {named_module} was not eligible (no SCHEME_HANDLERS dictionary or subclass of {AbstractStatefulFetcher.__name__})" ) + return repo_fetchers + def addStatefulSchemeHandlers( self, statefulSchemeHandler: "Type[AbstractStatefulFetcher]", fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: + progs: "ProgsMapping" = dict(), + ) -> "Sequence[AbstractSchemeRepoFetcher]": """ This method adds scheme handlers (aka "fetchers") from a given stateful fetcher, also adding the needed programs @@ -358,8 +373,10 @@ def addStatefulSchemeHandlers( # Get the scheme handlers from this fetcher schemeHandlers = statefulSchemeHandler.GetSchemeHandlers() - self.addSchemeHandlers( - schemeHandlers, fetchers_setup_block=fetchers_setup_block + return self.addSchemeHandlers( + schemeHandlers, + fetchers_setup_block=fetchers_setup_block, + progs=progs, ) def get(self, scheme: "str") -> "Optional[DocumentedProtocolFetcher]": @@ -429,32 +446,40 @@ 
def addSchemeHandlers( self, schemeHandlers: "Mapping[str, Union[DocumentedProtocolFetcher, DocumentedStatefulProtocolFetcher]]", fetchers_setup_block: "Optional[Mapping[str, Mapping[str, Any]]]" = None, - ) -> None: + progs: "ProgsMapping" = dict(), + ) -> "Sequence[AbstractSchemeRepoFetcher]": """ This method adds scheme handlers (aka "fetchers") or instantiates stateful scheme handlers (aka "stateful fetchers") """ - if isinstance(schemeHandlers, dict): - instSchemeHandlers = dict() - if fetchers_setup_block is None: - fetchers_setup_block = dict() - for scheme, schemeHandler in schemeHandlers.items(): - if self.SCHEME_PAT.search(scheme) is None: - self.logger.warning( - f"Fetcher associated to scheme {scheme} has been skipped, as the scheme does not comply with RFC3986" - ) - continue - - lScheme = scheme.lower() - # When no setup block is available for the scheme fetcher, - # provide an empty one - setup_block = fetchers_setup_block.get(lScheme, dict()) - - instSchemeHandler = None - if isinstance(schemeHandler, DocumentedStatefulProtocolFetcher): + instSchemeHandlers = dict() + fetchers_mapping: "MutableMapping[Type[AbstractStatefulFetcher], DocumentedProtocolFetcher]" = ( + dict() + ) + repo_fetchers: "MutableSequence[AbstractSchemeRepoFetcher]" = [] + if fetchers_setup_block is None: + fetchers_setup_block = dict() + for scheme, schemeHandler in schemeHandlers.items(): + if self.SCHEME_PAT.search(scheme) is None: + self.logger.warning( + f"Fetcher associated to scheme {scheme} has been skipped, as the scheme does not comply with RFC3986" + ) + continue + + lScheme = scheme.lower() + # When no setup block is available for the scheme fetcher, + # provide an empty one + setup_block = fetchers_setup_block.get(lScheme, dict()) + + instSchemeHandler: "Optional[DocumentedProtocolFetcher]" = None + if isinstance(schemeHandler, DocumentedStatefulProtocolFetcher): + instSchemeHandler = fetchers_mapping.get(schemeHandler.fetcher_class) + if instSchemeHandler is None: try: instSchemeInstance = self.instantiateStatefulFetcher( - schemeHandler.fetcher_class, setup_block=setup_block + schemeHandler.fetcher_class, + setup_block=setup_block, + progs=progs, ) if instSchemeInstance is not None: instSchemeHandler = DocumentedProtocolFetcher( @@ -464,23 +489,32 @@ def addSchemeHandlers( else schemeHandler.description, priority=schemeHandler.priority, ) + fetchers_mapping[ + schemeHandler.fetcher_class + ] = instSchemeHandler + if isinstance( + instSchemeInstance, AbstractSchemeRepoFetcher + ): + repo_fetchers.append(instSchemeInstance) except Exception as e: self.logger.exception( f"Error while instantiating handler implemented at {schemeHandler.fetcher_class} for scheme {lScheme}" ) - elif isinstance(schemeHandler, DocumentedProtocolFetcher) and callable( - schemeHandler.fetcher - ): - instSchemeHandler = schemeHandler - - # Only the ones which have overcome the sanity checks - if instSchemeHandler is not None: - # Schemes are case insensitive, so register only - # the lowercase version - instSchemeHandlers[lScheme] = instSchemeHandler - else: - self.logger.warning( - f"Scheme {lScheme} could not be properly instantiated" - ) - - self.addRawSchemeHandlers(instSchemeHandlers) + elif isinstance(schemeHandler, DocumentedProtocolFetcher) and callable( + schemeHandler.fetcher + ): + instSchemeHandler = schemeHandler + + # Only the ones which have overcome the sanity checks + if instSchemeHandler is not None: + # Schemes are case insensitive, so register only + # the lowercase version + 
instSchemeHandlers[lScheme] = instSchemeHandler + else: + self.logger.warning( + f"Scheme {lScheme} could not be properly instantiated" + ) + + self.addRawSchemeHandlers(instSchemeHandlers) + + return repo_fetchers diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index db22ae79..c2b985dd 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -686,7 +686,6 @@ def __init__( self._sngltn_fetcher: "MutableMapping[Type[AbstractStatefulFetcher], AbstractStatefulFetcher]" = ( dict() ) - self._repo_fetchers: "MutableSequence[AbstractSchemeRepoFetcher]" = list() # scheme_catalog is created on first use self.scheme_catalog = SchemeCatalog() # cacheHandler is created on first use @@ -697,8 +696,11 @@ def __init__( fetchers_setup_block = local_config.get("fetchers-setup") # All the scheme handlers should be added here - self.scheme_catalog.findAndAddSchemeHandlersFromModuleName( - fetchers_setup_block=fetchers_setup_block + self._repo_fetchers = ( + self.scheme_catalog.findAndAddSchemeHandlersFromModuleName( + fetchers_setup_block=fetchers_setup_block, + progs=self.progs, + ) ) # Registry of export plugins is created here @@ -764,42 +766,6 @@ def getCacheHandler( ) -> "Tuple[CacheHandler, Optional[pathlib.Path]]": return self.cacheHandler, self.cachePathMap.get(cache_type) - def instantiateStatefulFetcher( - self, - statefulFetcher: "Type[StatefulFetcher]", - setup_block: "Optional[Mapping[str, Any]]" = None, - ) -> "StatefulFetcher": - """ - Method to instantiate stateful fetchers once - """ - instStatefulFetcher = self._sngltn_fetcher.get(statefulFetcher) - if instStatefulFetcher is None: - # Setting the default list of programs - for prog in statefulFetcher.GetNeededPrograms(): - self.progs.setdefault(prog, cast("RelPath", prog)) - # Let's augment the list of needed progs by this - # stateful fetcher - instStatefulFetcher = self.scheme_catalog.instantiateStatefulFetcher( - statefulFetcher, progs=self.progs, setup_block=setup_block - ) - self._sngltn_fetcher[statefulFetcher] = instStatefulFetcher - - # Also, if it is a repository fetcher, record it separately - if isinstance(instStatefulFetcher, AbstractSchemeRepoFetcher): - self._repo_fetchers.append(instStatefulFetcher) - - return cast("StatefulFetcher", instStatefulFetcher) - - def instantiateRepoFetcher( - self, - repoFetcher: "Type[SchemeRepoFetcher]", - setup_block: "Optional[Mapping[str, Any]]" = None, - ) -> "SchemeRepoFetcher": - """ - Method to instantiate repo fetchers once - """ - return self.instantiateStatefulFetcher(repoFetcher, setup_block=setup_block) - def findAndAddWorkflowEnginesFromModuleName( self, the_module_name: "str" = "wfexs_backend.workflow_engines", @@ -1967,7 +1933,7 @@ def cacheWorkflow( # It is not an absolute URL, so it is being an identifier in the workflow i_workflow: "Optional[IdentifiedWorkflow]" = None - engineDesc: "Optional[WorkflowType]" = None + workflow_type: "Optional[WorkflowType]" = None guessedRepo: "Optional[RemoteRepo]" = None repoDir: "Optional[pathlib.Path]" = None putative: "bool" = False @@ -2037,7 +2003,7 @@ def cacheWorkflow( raise WfExSBackendException(message) guessedRepo = i_workflow.remote_repo - engineDesc = i_workflow.workflow_type + workflow_type = i_workflow.workflow_type if cached_putative_path is not None: self.cacheROCrateFilename = cached_putative_path @@ -2054,7 +2020,12 @@ def cacheWorkflow( len(parsedRepoURL.scheme) > 0 ), f"Repository id {guessedRepo.repo_url} should be a parsable URI" - repoDir, materialized_repo, 
downstream_repos = self.doMaterializeRepo( + ( + repoDir, + materialized_repo, + workflow_type, + downstream_repos, + ) = self.doMaterializeRepo( guessedRepo, fetcher=guessed[1] if guessed is not None else None, doUpdate=ignoreCache, @@ -2065,7 +2036,7 @@ def cacheWorkflow( repoEffectiveCheckout = repo.get_checkout() # TODO: should we preserve the chain of repos? - return repoDir, repo, engineDesc, repoEffectiveCheckout + return repoDir, repo, workflow_type, repoEffectiveCheckout TRS_METADATA_FILE: "Final[RelPath]" = cast("RelPath", "trs_metadata.json") TRS_QUERY_CACHE_FILE: "Final[RelPath]" = cast("RelPath", "trs_result.json") @@ -2076,7 +2047,7 @@ def doMaterializeRepo( fetcher: "Optional[AbstractSchemeRepoFetcher]" = None, doUpdate: "bool" = True, registerInCache: "bool" = True, - ) -> "Tuple[pathlib.Path, MaterializedRepo, Sequence[RemoteRepo]]": + ) -> "Tuple[pathlib.Path, MaterializedRepo, Optional[WorkflowType], Sequence[RemoteRepo]]": """ This method is used to materialize repos described using instances of RemoteRepo. It starts asking all the known repo fetchers whether @@ -2115,6 +2086,7 @@ def doMaterializeRepo( f"Don't know how to materialize {repo.repo_url} (of type {repo.repo_type}) as a repository" ) + workflow_type: "Optional[WorkflowType]" = None # An specialized fetcher is used downstream_repos: "MutableSequence[RemoteRepo]" if fetcher is not None: @@ -2172,6 +2144,7 @@ def doMaterializeRepo( ( upstream_repo_path, upstream_materialized_repo, + upstream_workflow_type, upstream_downstream_repos, ) = self.doMaterializeRepo( materialized_repo.upstream_repo, @@ -2182,6 +2155,7 @@ def doMaterializeRepo( return ( upstream_repo_path, upstream_materialized_repo, + upstream_workflow_type, downstream_repos, ) except Exception as e: @@ -2206,6 +2180,7 @@ def doMaterializeRepo( # It is an RO-Crate downstream_repos.append(repo) i_workflow_repo = i_workflow.remote_repo + workflow_type = i_workflow.workflow_type if repo_rel_path is not None: i_workflow_repo = i_workflow_repo._replace(rel_path=repo_rel_path) downstream_repos.append(i_workflow_repo) @@ -2215,6 +2190,7 @@ def doMaterializeRepo( ( upstream_repo_path, upstream_materialized_repo, + upstream_workflow_type, upstream_downstream_repos, ) = self.doMaterializeRepo( i_workflow_repo, @@ -2225,6 +2201,7 @@ def doMaterializeRepo( return ( upstream_repo_path, upstream_materialized_repo, + upstream_workflow_type, downstream_repos, ) except Exception as e: @@ -2263,7 +2240,7 @@ def doMaterializeRepo( metadata_array=metadata_array, ) - return repo_path, materialized_repo, downstream_repos + return repo_path, materialized_repo, workflow_type, downstream_repos def getWorkflowBundleFromURI( self, diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index eace1131..874a9652 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1996,6 +1996,7 @@ def fetchWorkflow( ( repoDir, materialized_repo, + workflow_type, downstream_repos, ) = self.wfexs.doMaterializeRepo( repo, From cd8e541aefad374aa776076a5ccfa3feb464330e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 11 Feb 2025 02:42:35 +0100 Subject: [PATCH 38/60] Avoid long timeout repository guessing cases at git fetcher when a uri is from a non-supported scheme. 
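The guard is purely syntactic, so URIs with unsupported schemes are rejected before any network round trip is attempted. A minimal self-contained sketch of the idea follows; the helper name and the exact scheme list are illustrative (GitFetcher itself simply returns None from GuessRepoParams):

```python
from urllib.parse import urlparse

# Schemes that dulwich/git can actually probe. A URI with any other
# scheme (e.g. "s3://" or "docker://") previously fell through to the
# remote probe and only failed after a long connection timeout.
PROBEABLE_SCHEMES = frozenset({"http", "https", "file", "ssh"})

def should_probe_as_git(uri: str) -> bool:
    """Cheap pre-check to run before any remote git probe (ls-remote)."""
    return urlparse(uri).scheme.lower() in PROBEABLE_SCHEMES

assert should_probe_as_git("https://github.com/inab/WfExS-backend.git")
assert not should_probe_as_git("s3://some-bucket/some-object")
```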
--- wfexs_backend/fetchers/git.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index 7828969f..febec4ee 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -451,8 +451,10 @@ def GuessRepoParams( found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) guessedRepoFlavor = found_params[0].guess_flavor # TODO handling other popular cases, like bitbucket - else: + elif parsed_wf_url.scheme in ("http", "https", "file", "ssh"): found_params = cls._find_git_repo_in_uri(parsed_wf_url) + else: + return None except RepoGuessException as gge: if not fail_ok: From 38fde1e9e37e7b2db388c8cad66a614d547b2d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 11 Feb 2025 02:52:04 +0100 Subject: [PATCH 39/60] Removed debug trace --- wfexs_backend/fetchers/trs_files.py | 1 - 1 file changed, 1 deletion(-) diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index 0b20d7c2..c275ddf5 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -822,7 +822,6 @@ def materialize_repo_from_repo( if is_abs_url: # An absolute URL, like in the case of DDBJ TRS implementation file_url = cast("URIType", file_rel_path) - self.logger.warning(file_rel_2_url) else: file_url = cast( "URIType", From 2f9e759c222198be7b42bf6166c6d1dfe6572958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 11 Feb 2025 14:16:16 +0100 Subject: [PATCH 40/60] Added scaffolding work to propagate and test cache offline capabilities --- wfexs_backend/__main__.py | 9 ++++++++- wfexs_backend/wfexs_backend.py | 5 +++++ wfexs_backend/workflow.py | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index ef5d9fa6..51fa3e83 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -749,7 +749,7 @@ def processCacheCommand( cached_content = wfBackend.cacheFetch( cast("URIType", uri_to_fetch), args.cache_type, - offline=False, + offline=args.doCacheOffline, vault=vault, sec_context_name=secContextName, default_clonable=default_clonable, @@ -1256,6 +1256,13 @@ def _get_wfexs_argparse_internal( action="store_true", default=False, ) + ap_c.add_argument( + "--offline", + dest="doCacheOffline", + help="Try checking the offline behaviour of cache management", + action="store_true", + default=False, + ) ap_c.add_argument( "-g", "--glob", diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index c2b985dd..ef77ea93 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -2030,6 +2030,7 @@ def cacheWorkflow( fetcher=guessed[1] if guessed is not None else None, doUpdate=ignoreCache, registerInCache=registerInCache, + offline=offline, ) assert len(downstream_repos) > 0 repo = materialized_repo.repo @@ -2047,6 +2048,7 @@ def doMaterializeRepo( fetcher: "Optional[AbstractSchemeRepoFetcher]" = None, doUpdate: "bool" = True, registerInCache: "bool" = True, + offline: "bool" = False, ) -> "Tuple[pathlib.Path, MaterializedRepo, Optional[WorkflowType], Sequence[RemoteRepo]]": """ This method is used to materialize repos described using instances @@ -2150,6 +2152,7 @@ def doMaterializeRepo( materialized_repo.upstream_repo, doUpdate=doUpdate, registerInCache=registerInCache, + offline=offline, ) downstream_repos.extend(upstream_downstream_repos) return ( @@ -2174,6 +2177,7 @@ 
def doMaterializeRepo( repo.repo_url, ignoreCache=doUpdate, registerInCache=registerInCache, + offline=offline, ) if i_workflow is not None: @@ -2196,6 +2200,7 @@ def doMaterializeRepo( i_workflow_repo, doUpdate=doUpdate, registerInCache=registerInCache, + offline=offline, ) downstream_repos.extend(upstream_downstream_repos) return ( diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 874a9652..67f45356 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -2002,6 +2002,7 @@ def fetchWorkflow( repo, doUpdate=ignoreCache, # registerInCache=True, + offline=offline, ) assert len(downstream_repos) > 0 repo = materialized_repo.repo From 4f732baed90518578af7186e224e992ebe252742 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 11 Feb 2025 14:54:44 +0100 Subject: [PATCH 41/60] The different GuessRepoParams implementations should honor offline mode. Added needed code to honor offline mode when GuessRepoParams is called from a WfExSBackend instance. Also, added needed changes to the different repo fetchers, so they are honoring the offline mode, raising an OfflineRepoGuessException, even when fail_ok is true. --- wfexs_backend/fetchers/__init__.py | 5 ++++ wfexs_backend/fetchers/git.py | 33 ++++++++++++++++++----- wfexs_backend/fetchers/swh.py | 11 +++++++- wfexs_backend/fetchers/trs_files.py | 42 +++++++++++++++++++---------- wfexs_backend/wfexs_backend.py | 15 ++++++++--- 5 files changed, 80 insertions(+), 26 deletions(-) diff --git a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index c5ed9385..6590df38 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -250,6 +250,10 @@ class RepoGuessException(FetcherException): pass +class OfflineRepoGuessException(RepoGuessException): + pass + + class RepoType(enum.Enum): Git = "git" Raw = "raw" @@ -387,6 +391,7 @@ def GuessRepoParams( orig_wf_url: "Union[URIType, parse.ParseResult]", logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, + offline: "bool" = False, ) -> "Optional[RemoteRepo]": pass diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index febec4ee..ca48eb6b 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -79,6 +79,7 @@ DocumentedStatefulProtocolFetcher, FetcherException, MaterializedRepo, + OfflineRepoGuessException, ProtocolFetcherReturn, RemoteRepo, RepoGuessException, @@ -149,6 +150,7 @@ def GetNeededPrograms(cls) -> "Sequence[SymbolicName]": def _find_git_repo_in_uri( cls, remote_file: "Union[URIType, parse.ParseResult]", + offline: "bool" = False, ) -> "Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]": if isinstance(remote_file, parse.ParseResult): parsedInputURL = remote_file @@ -170,6 +172,10 @@ def _find_git_repo_in_uri( remote_uri_anc = parse.urlunparse(parsedInputURL._replace(path=pre_path)) remote_refs_dict: "Mapping[bytes, bytes]" + if offline: + raise OfflineRepoGuessException( + f"Query to {remote_uri_anc} is not allowed in offline mode" + ) try: # Dulwich works both with file, ssh, git and http(s) protocols remote_refs_dict = dulwich.porcelain.ls_remote(remote_uri_anc) @@ -257,7 +263,12 @@ def GuessRepoParams( wf_url: "Union[URIType, parse.ParseResult]", logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, + offline: "bool" = False, ) -> "Optional[RemoteRepo]": + """ + When it is in offline mode, GuessRepoParams can raise an OfflineRepoGuessException + """ + repoURL = None repoTag = None 
repoRelPath = None @@ -299,7 +310,9 @@ def GuessRepoParams( fragment="", ) ) - found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) + found_params = cls._find_git_repo_in_uri( + cast("URIType", repoURL), offline=offline + ) elif ( parsed_wf_url.scheme in ("http", "https") @@ -307,7 +320,7 @@ def GuessRepoParams( and "@" not in parsed_wf_url.path and parsed_wf_url.fragment == "" ): - found_params = cls._find_git_repo_in_uri(parsed_wf_url) + found_params = cls._find_git_repo_in_uri(parsed_wf_url, offline=offline) repoURL = found_params[0].repo_url repoType = RepoType.Git guessedRepoFlavor = RepoGuessFlavor.GitHub @@ -354,7 +367,9 @@ def GuessRepoParams( # And now, guessing the tag/checkout and the relative path # WARNING! This code can have problems with tags which contain slashes - found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) + found_params = cls._find_git_repo_in_uri( + cast("URIType", repoURL), offline=offline + ) if len(wf_path) >= 4: repo_branches_tags = found_params[2] # Validate against existing branch and tag names @@ -448,20 +463,24 @@ def GuessRepoParams( repoURL = parse.urlunparse( (gitScheme, parsed_wf_url.netloc, gitPath, "", "", "") ) - found_params = cls._find_git_repo_in_uri(cast("URIType", repoURL)) + found_params = cls._find_git_repo_in_uri( + cast("URIType", repoURL), offline=offline + ) guessedRepoFlavor = found_params[0].guess_flavor # TODO handling other popular cases, like bitbucket elif parsed_wf_url.scheme in ("http", "https", "file", "ssh"): - found_params = cls._find_git_repo_in_uri(parsed_wf_url) + found_params = cls._find_git_repo_in_uri(parsed_wf_url, offline=offline) else: return None + except OfflineRepoGuessException as ogge: + raise except RepoGuessException as gge: if not fail_ok: import traceback traceback.print_exc() - raise FetcherException( + raise RepoGuessException( f"FIXME: Unsupported http(s) git repository {wf_url} (see cascade exception)" ) from gge @@ -472,7 +491,7 @@ def GuessRepoParams( if guessedRepoFlavor is None: guessedRepoFlavor = found_params[0].guess_flavor elif not fail_ok: - raise FetcherException( + raise RepoGuessException( f"FIXME: Unsupported git repository {wf_url}. 
(Is it really a git repo???)" ) diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index e3fbeeb4..ad2a0352 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -70,6 +70,7 @@ DocumentedStatefulProtocolFetcher, FetcherException, MaterializedRepo, + OfflineRepoGuessException, ProtocolFetcherReturn, RemoteRepo, RepoGuessException, @@ -176,6 +177,7 @@ def GuessRepoParams( orig_wf_url: "Union[URIType, parse.ParseResult]", logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, + offline: "bool" = False, ) -> "Optional[RemoteRepo]": # Deciding which is the input wf_url: "RepoURL" @@ -187,6 +189,11 @@ def GuessRepoParams( wf_url = cast("RepoURL", orig_wf_url) parsed_wf_url = parse.urlparse(orig_wf_url) + if offline: + raise OfflineRepoGuessException( + f"Queries related to {wf_url} are not allowed in offline mode" + ) + if parsed_wf_url.scheme not in cls.GetSchemeHandlers(): return None @@ -211,7 +218,9 @@ def GuessRepoParams( except Exception as e: if fail_ok: return None - raise + raise RepoGuessException( + f"Errors while querying {wf_url} for guessing purposes" + ) from e # It could be a valid swh identifier, but it is not registered if not isinstance(val_doc, dict) or not val_doc.get( diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index c275ddf5..a432fe5e 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -50,8 +50,10 @@ DocumentedStatefulProtocolFetcher, FetcherException, MaterializedRepo, + OfflineRepoGuessException, ProtocolFetcherReturn, RemoteRepo, + RepoGuessException, RepoType, ) @@ -154,6 +156,7 @@ def GuessTRSParams( logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, scheme_catalog: "Optional[SchemeCatalog]" = None, + offline: "bool" = False, ) -> "Optional[Tuple[RepoURL, str, Sequence[str], WorkflowId, WFVersionId, str, Sequence[URIWithMetadata], Optional[Mapping[str, Any]]]]": if scheme_catalog is None: scheme_catalog = SchemeCatalog( @@ -187,12 +190,16 @@ def GuessTRSParams( trs_tool_meta: "Optional[Mapping[str, Any]]" = None version_id: "Optional[WFVersionId]" = None if parsed_wf_url.scheme == cls.TRS_SCHEME_PREFIX: + if offline: + raise OfflineRepoGuessException( + f"Queries related to {wf_url} are not allowed in offline mode" + ) # Duplication of code path_steps: "Sequence[str]" = parsed_wf_url.path.split("/") if len(path_steps) < 3 or path_steps[0] != "": if fail_ok: return None - raise FetcherException( + raise RepoGuessException( f"Ill-formed TRS CURIE {wf_url}. 
It should be in the format of {cls.TRS_SCHEME_PREFIX}://server/id/version or {cls.TRS_SCHEME_PREFIX}://server-plus-prefix-with-slashes/id/version" ) @@ -278,6 +285,10 @@ def GuessTRSParams( workflow_id = urllib.parse.unquote(path_steps[-2]) descriptor = None elif parsed_wf_url.scheme == cls.INTERNAL_TRS_SCHEME_PREFIX: + if offline: + raise OfflineRepoGuessException( + f"Queries related to {wf_url} are not allowed in offline mode" + ) putative_tool_uri = cast( "URIType", parsed_wf_url.path[0:-1] @@ -315,14 +326,14 @@ def GuessTRSParams( except Exception as e: if fail_ok: return None - raise FetcherException( + raise RepoGuessException( f"trs_endpoint could not be guessed from {putative_tool_uri} (raised exception {e})" ) from e if not isinstance(trs__meta, dict): if fail_ok: return None - raise FetcherException( + raise RepoGuessException( f"trs_endpoint could not be guessed from {putative_tool_uri} (not returning JSON object)" ) @@ -332,7 +343,7 @@ def GuessTRSParams( if len(versions) == 0: if fail_ok: return None - raise FetcherException( + raise RepoGuessException( f"No versions found associated to TRS tool reachable through {putative_tool_uri}" ) @@ -356,7 +367,7 @@ def GuessTRSParams( else: if fail_ok: return None - raise FetcherException( + raise RepoGuessException( f"Forced version {override_version_id} not found associated to TRS tool reachable through {putative_tool_uri}" ) @@ -400,7 +411,7 @@ def GuessTRSParams( elif fail_ok: return None else: - raise FetcherException( + raise RepoGuessException( f"No version id found associated to specific version of TRS tool reachable through {putative_tool_uri}" ) # ... or a concrete one? @@ -438,14 +449,14 @@ def GuessTRSParams( except Exception as e: if fail_ok: return None - raise FetcherException( + raise RepoGuessException( f"trs_endpoint could not be guessed from {putative_tool_uri} (forced version {override_version_id}, raised exception {e})" ) from e if "descriptor_type" not in trs__meta: if fail_ok: return None - raise FetcherException( + raise RepoGuessException( f"trs_endpoint at {putative_tool_uri} (forced version {override_version_id}) is not answering what it is expected" ) @@ -487,7 +498,7 @@ def GuessTRSParams( elif fail_ok: return None else: - raise FetcherException( + raise RepoGuessException( f"trs_endpoint at {putative_tool_uri} is not answering what it is expected" ) @@ -498,7 +509,7 @@ def GuessTRSParams( elif fail_ok: return None else: - raise FetcherException( + raise RepoGuessException( f"trs_endpoint could not be guessed from {orig_wf_url} (no clues)" ) @@ -530,20 +541,20 @@ def GuessTRSParams( except Exception as e: if fail_ok: return None - raise FetcherException( + raise RepoGuessException( f"trs_endpoint could not be guessed from {putative_tool_uri} (forced version {override_version_id}, raised exception {e})" ) from e assert trs_tool_meta is not None if not isinstance(trs_tool_meta.get("descriptor_type"), list): - raise FetcherException( + raise RepoGuessException( f"Unable to obtain descriptor_type from tool descriptor obtained from {putative_tool_uri}" ) descriptor_types = trs_tool_meta["descriptor_type"] if len(descriptor_types) == 0: - raise FetcherException( + raise RepoGuessException( f"Empty list of descriptor_type from tool descriptor obtained from {putative_tool_uri}" ) @@ -571,8 +582,11 @@ def GuessRepoParams( orig_wf_url: "Union[URIType, parse.ParseResult]", logger: "Optional[logging.Logger]" = None, fail_ok: "bool" = False, + offline: "bool" = False, ) -> "Optional[RemoteRepo]": - trs_params = 
cls.GuessTRSParams(orig_wf_url, logger=logger, fail_ok=fail_ok) + trs_params = cls.GuessTRSParams( + orig_wf_url, logger=logger, fail_ok=fail_ok, offline=offline + ) return ( None diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index ef77ea93..cfd6a923 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -1868,6 +1868,7 @@ def guess_repo_params( self, wf_url: "Union[URIType, parse.ParseResult]", fail_ok: "bool" = False, + offline: "bool" = False, ) -> "Optional[Tuple[RemoteRepo, AbstractSchemeRepoFetcher]]": if isinstance(wf_url, parse.ParseResult): parsedRepoURL = wf_url @@ -1878,7 +1879,10 @@ def guess_repo_params( fetcher: "Optional[AbstractSchemeRepoFetcher]" = None for fetcher in self.repo_fetchers: remote_repo = fetcher.GuessRepoParams( - parsedRepoURL, logger=self.logger, fail_ok=fail_ok + parsedRepoURL, + logger=self.logger, + fail_ok=fail_ok, + offline=offline, ) if remote_repo is not None: return remote_repo, fetcher @@ -1942,7 +1946,7 @@ def cacheWorkflow( raise WFException("trs_endpoint was not provided") # Trying to be smarter - guessed = self.guess_repo_params(parsedRepoURL, fail_ok=True) + guessed = self.guess_repo_params(parsedRepoURL, offline=offline, fail_ok=True) if guessed is not None: guessedRepo = guessed[0] if guessedRepo.tag is None and version_id is not None: @@ -2312,7 +2316,9 @@ def getWorkflowBundleFromURI( try: identified_workflow = self.getWorkflowRepoFromROCrateFile( - roCrateFile, expectedEngineDesc + roCrateFile, + expectedEngineDesc=expectedEngineDesc, + offline=offline, ) return ( identified_workflow, @@ -2338,6 +2344,7 @@ def getWorkflowRepoFromROCrateFile( self, roCrateFile: "pathlib.Path", expectedEngineDesc: "Optional[WorkflowType]" = None, + offline: "bool" = False, ) -> "IdentifiedWorkflow": """ @@ -2389,7 +2396,7 @@ def getWorkflowRepoFromROCrateFile( ) # We need this additional step to guess the repo type - guessed = self.guess_repo_params(repo.repo_url, fail_ok=True) + guessed = self.guess_repo_params(repo.repo_url, offline=offline, fail_ok=True) if guessed is None or guessed[0].repo_type is None: raise WfExSBackendException( f"Unable to guess repository from RO-Crate manifest obtained from {public_name}" From b4383d49a8ac11bd95d8b898c1576a185ff5ffce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 11 Feb 2025 15:18:09 +0100 Subject: [PATCH 42/60] Fixed wrongly performed check in SoftwareHeritageFetcher.build_pid_from_repo --- wfexs_backend/fetchers/swh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index ad2a0352..b931658b 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -251,7 +251,7 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": parsed_wf_url = parse.urlparse(remote_repo.repo_url) if ( parsed_wf_url.scheme not in self.GetSchemeHandlers() - or remote_repo.repo_type not in (RepoType.TRS, None) + or remote_repo.repo_type not in (RepoType.SoftwareHeritage, None) ): return None From f73618064b50d33ddc5dcf03ed95e775e9116e19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 11 Feb 2025 15:19:25 +0100 Subject: [PATCH 43/60] Assure calling build_pid_from_repo is always safe in offline mode --- wfexs_backend/fetchers/__init__.py | 2 ++ wfexs_backend/fetchers/trs_files.py | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git 
a/wfexs_backend/fetchers/__init__.py b/wfexs_backend/fetchers/__init__.py index 6590df38..bc7640b9 100644 --- a/wfexs_backend/fetchers/__init__.py +++ b/wfexs_backend/fetchers/__init__.py @@ -381,6 +381,8 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": If the fetcher does not recognize the type of repo, either using repo_url content or the repo type in the worst case, it should return None + + Calling this method in offline mode should be safe """ pass diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py index a432fe5e..a837962d 100644 --- a/wfexs_backend/fetchers/trs_files.py +++ b/wfexs_backend/fetchers/trs_files.py @@ -1077,12 +1077,20 @@ def build_pid_from_repo(self, remote_repo: "RemoteRepo") -> "Optional[str]": ): return remote_repo.repo_url elif remote_repo.repo_type == RepoType.TRS: - guessed_trs_params = self.GuessTRSParams( - parsedInputURL, - override_version_id=remote_repo.tag, - logger=self.logger, - fail_ok=True, - ) + try: + guessed_trs_params = self.GuessTRSParams( + parsedInputURL, + override_version_id=remote_repo.tag, + logger=self.logger, + fail_ok=True, + offline=True, + ) + except OfflineRepoGuessException as orge: + self.logger.error( + f"While building pid for {remote_repo.repo_url} called code which should be safe offline" + ) + guessed_trs_params = None + if guessed_trs_params is not None: ( trs_tool_url, From 92afe624296697be4ab96568ee8b299b11a62e56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 11 Feb 2025 15:30:23 +0100 Subject: [PATCH 44/60] Added better filtering out capabilities to SoftwareHeritageFetcher.GuessRepoParams when the input uri belongs to a scheme not understood by it. --- wfexs_backend/fetchers/swh.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index b931658b..3a689179 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -189,6 +189,9 @@ def GuessRepoParams( wf_url = cast("RepoURL", orig_wf_url) parsed_wf_url = parse.urlparse(orig_wf_url) + if fail_ok and parsed_wf_url.scheme not in cls.GetSchemeHandlers(): + return None + if offline: raise OfflineRepoGuessException( f"Queries related to {wf_url} are not allowed in offline mode" From 0d3a28484225e331261140a3f261d4942a60b6c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 12 Feb 2025 13:35:30 +0100 Subject: [PATCH 45/60] Added caching capabilities on WfExSBackend.guess_repo_params, so using it in offline mode is possible when repository parameters guessing was previously cached. 
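In outline, the guess cache is a small JSON round-trip keyed by the workflow URI. The sketch below is a simplification under stated assumptions: the real code delegates storage and key management to CacheHandler.fetch/inject and serialises through marshall_namedtuple/unmarshall_namedtuple, while here plain JSON payloads and a URL-derived file name stand in for them:

```python
import json
import uuid
from pathlib import Path
from typing import Any, Optional, Tuple


class GuessCacheSketch:
    """Stand-in for the 'guess-cache' directory used by guess_repo_params."""

    def __init__(self, cache_dir: Path) -> None:
        self.cache_dir = cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _entry(self, wf_url: str) -> Path:
        # The real implementation lets CacheHandler derive the cache key.
        return self.cache_dir / (str(uuid.uuid5(uuid.NAMESPACE_URL, wf_url)) + ".json")

    def lookup(self, wf_url: str) -> Optional[Tuple[Any, str]]:
        entry = self._entry(wf_url)
        if not entry.exists():
            # Offline callers must stop here instead of probing the network.
            return None
        with entry.open(encoding="utf-8") as fh:
            payload = json.load(fh)
        # payload mirrors (marshalled RemoteRepo, fetcher class name);
        # matchRepoFetcherByClassname resolves the second element back
        # to a live fetcher instance.
        return payload[0], payload[1]

    def store(self, wf_url: str, marshalled_repo: Any, fetcher_class_name: str) -> None:
        # Write under a temporary "caching-<uuid4>" name first, as the
        # patch does, so an interrupted run never leaves a half-written
        # entry that a later offline lookup would trust.
        tmp = self.cache_dir / ("caching-" + str(uuid.uuid4()))
        with tmp.open("w", encoding="utf-8") as fh:
            json.dump([marshalled_repo, fetcher_class_name], fh)
        tmp.replace(self._entry(wf_url))
```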
--- wfexs_backend/wfexs_backend.py | 110 ++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 9 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index cfd6a923..db8371c8 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -111,6 +111,7 @@ ) from .utils.marshalling_handling import ( + marshall_namedtuple, unmarshall_namedtuple, ) @@ -1864,19 +1865,69 @@ def instantiateEngine( config_directory=self.config_directory, ) + def matchRepoFetcherByClassname( + self, clazzname: "str" + ) -> "Optional[AbstractSchemeRepoFetcher]": + for fetcher in self._repo_fetchers: + if fetcher.__class__.__name__ == clazzname: + return fetcher + + return None + def guess_repo_params( self, wf_url: "Union[URIType, parse.ParseResult]", fail_ok: "bool" = False, offline: "bool" = False, + ignoreCache: "bool" = False, + registerInCache: "bool" = True, ) -> "Optional[Tuple[RemoteRepo, AbstractSchemeRepoFetcher]]": + remote_repo: "Optional[RemoteRepo]" = None + fetcher: "Optional[AbstractSchemeRepoFetcher]" = None + guess_cache = self.cacheWorkflowDir / "guess-cache" + + if not ignoreCache: + try: + # Let's check whether the workflow was registered + # kind: "ContentKind" + # path: "pathlib.Path" + # metadata_array: "Sequence[URIWithMetadata]" + # licences: "Tuple[URIType, ...]" + # fingerprint: "Optional[Fingerprint]" = None + # clonable: "bool" = True + cached_content = self.cacheHandler.fetch( + cast("URIType", wf_url), + offline=True, + destdir=guess_cache, + ) + # Always a cached metadata file + assert cached_content.kind == ContentKind.File + with cached_content.path.open(mode="r", encoding="utf-8") as ccH: + guessed_repo_payload = json.load(ccH) + + if isinstance(guessed_repo_payload, (tuple, list)): + remote_repo, fetcher_class_name = unmarshall_namedtuple( + guessed_repo_payload + ) + # Now, time to find the fetcher itself + if remote_repo is not None: + fetcher = self.matchRepoFetcherByClassname(fetcher_class_name) + if fetcher is not None: + return remote_repo, fetcher + self.logger.debug( + f"Cached empty guessing elements associated to {wf_url}. 
Ignoring" + ) + elif offline: + # Do not try again if it is in offline mode + return None + except Exception as e: + self.logger.debug(f"Guessed {wf_url} not cached (exception {e})") + if isinstance(wf_url, parse.ParseResult): parsedRepoURL = wf_url else: parsedRepoURL = urllib.parse.urlparse(wf_url) - remote_repo: "Optional[RemoteRepo]" = None - fetcher: "Optional[AbstractSchemeRepoFetcher]" = None for fetcher in self.repo_fetchers: remote_repo = fetcher.GuessRepoParams( parsedRepoURL, @@ -1885,6 +1936,31 @@ def guess_repo_params( offline=offline, ) if remote_repo is not None: + if registerInCache: + temp_cached = guess_cache / ("caching-" + str(uuid.uuid4())) + try: + with temp_cached.open(mode="w", encoding="utf-8") as tC: + json.dump( + marshall_namedtuple( + (remote_repo, fetcher.__class__.__name__) + ), + tC, + ) + self.cacheHandler.inject( + cast("URIType", wf_url), + destdir=guess_cache, + tempCachedFilename=temp_cached, + inputKind=ContentKind.File, + ) + except Exception as e: + self.logger.exception( + f"Unable to register guess cache for {wf_url} (see exception trace)" + ) + finally: + # Removing the leftovers, whether they worked or not + if temp_cached.exists(): + temp_cached.unlink() + return remote_repo, fetcher return None @@ -1946,7 +2022,13 @@ def cacheWorkflow( raise WFException("trs_endpoint was not provided") # Trying to be smarter - guessed = self.guess_repo_params(parsedRepoURL, offline=offline, fail_ok=True) + guessed = self.guess_repo_params( + parsedRepoURL, + offline=offline, + ignoreCache=ignoreCache, + registerInCache=registerInCache, + fail_ok=True, + ) if guessed is not None: guessedRepo = guessed[0] if guessedRepo.tag is None and version_id is not None: @@ -1968,6 +2050,7 @@ def cacheWorkflow( putative_repo_url, offline=offline, ignoreCache=ignoreCache, + registerInCache=registerInCache, ) if i_workflow is None: @@ -2087,11 +2170,6 @@ def doMaterializeRepo( else: fetcher = None - if fetcher is None and repo.repo_type not in (RepoType.Raw, None): - raise WfExSBackendException( - f"Don't know how to materialize {repo.repo_url} (of type {repo.repo_type}) as a repository" - ) - workflow_type: "Optional[WorkflowType]" = None # An specialized fetcher is used downstream_repos: "MutableSequence[RemoteRepo]" @@ -2169,6 +2247,10 @@ def doMaterializeRepo( self.logger.warning( f"Recommended upstream repo {materialized_repo.upstream_repo} from repo {repo} could not be fetched, skipping. 
Exception: {e}" ) + elif repo.repo_type not in (RepoType.Raw, None): + raise WfExSBackendException( + f"Don't know how to materialize {repo.repo_url} (of type {repo.repo_type}) as a repository" + ) else: downstream_repos = [] # Let's try guessing whether it is an RO-Crate @@ -2319,6 +2401,8 @@ def getWorkflowBundleFromURI( roCrateFile, expectedEngineDesc=expectedEngineDesc, offline=offline, + ignoreCache=ignoreCache, + registerInCache=registerInCache, ) return ( identified_workflow, @@ -2345,6 +2429,8 @@ def getWorkflowRepoFromROCrateFile( roCrateFile: "pathlib.Path", expectedEngineDesc: "Optional[WorkflowType]" = None, offline: "bool" = False, + ignoreCache: "bool" = False, + registerInCache: "bool" = True, ) -> "IdentifiedWorkflow": """ @@ -2396,7 +2482,13 @@ def getWorkflowRepoFromROCrateFile( ) # We need this additional step to guess the repo type - guessed = self.guess_repo_params(repo.repo_url, offline=offline, fail_ok=True) + guessed = self.guess_repo_params( + repo.repo_url, + offline=offline, + ignoreCache=ignoreCache, + registerInCache=registerInCache, + fail_ok=True, + ) if guessed is None or guessed[0].repo_type is None: raise WfExSBackendException( f"Unable to guess repository from RO-Crate manifest obtained from {public_name}" From 9143525ad125b91efbd299559d089424b96ac9b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 12 Feb 2025 13:38:22 +0100 Subject: [PATCH 46/60] Minor cleanup of unused variable --- wfexs_backend/wfexs_backend.py | 2 -- wfexs_backend/workflow.py | 1 - 2 files changed, 3 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index db8371c8..0f24ff13 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -2091,8 +2091,6 @@ def cacheWorkflow( guessedRepo = i_workflow.remote_repo workflow_type = i_workflow.workflow_type - if cached_putative_path is not None: - self.cacheROCrateFilename = cached_putative_path assert guessedRepo is not None assert guessedRepo.repo_url is not None diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 67f45356..b98cbbc7 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -893,7 +893,6 @@ def __init__( self.arch: "Optional[ProcessorArchitecture]" = None self.stagedExecutions: "Optional[MutableSequence[StagedExecution]]" = None - self.cacheROCrateFilename: "Optional[pathlib.Path]" = None self.runExportActions: "Optional[MutableSequence[MaterializedExportAction]]" = ( None From 134e30035a602a10b166a866b6a17afe5139536e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 13 Feb 2025 11:40:46 +0100 Subject: [PATCH 47/60] Very minor code refactoring, towards identifying dead code --- wfexs_backend/wfexs_backend.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 0f24ff13..524a2f16 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -2078,19 +2078,18 @@ def cacheWorkflow( rel_path=cast("Optional[RelPath]", repoRelPath), ) putative = True + else: + # This can be incorrect, but let it be for now + if ( + requested_workflow_type is not None + and requested_workflow_type != i_workflow.workflow_type + ): + message = f"Fetched workflow is of type {i_workflow.workflow_type.shortname} , but it was explicitly requested to be of type {requested_workflow_type.shortname}" + self.logger.error(message) + raise 
WfExSBackendException(message)
 
-        # This can be incorrect, but let it be for now
-        if i_workflow is not None:
-            if (
-                requested_workflow_type is not None
-                and requested_workflow_type != i_workflow.workflow_type
-            ):
-                message = f"Fetched workflow is of type {i_workflow.workflow_type.shortname} , but it was explicitly requested to be of type {requested_workflow_type.shortname}"
-                self.logger.error(message)
-                raise WfExSBackendException(message)
-
-            guessedRepo = i_workflow.remote_repo
-            workflow_type = i_workflow.workflow_type
+            guessedRepo = i_workflow.remote_repo
+            workflow_type = i_workflow.workflow_type
 
         assert guessedRepo is not None
         assert guessedRepo.repo_url is not None

From 83f9f3ed3ce92861f7b93408c47c78c0900d71f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Fri, 14 Feb 2025 14:03:36 +0100
Subject: [PATCH 48/60] Fixed unmarshalling issue on RemoteRepo when the
 instance has a RepoGuessFlavor

---
 wfexs_backend/wfexs_backend.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py
index 524a2f16..9de6cd46 100644
--- a/wfexs_backend/wfexs_backend.py
+++ b/wfexs_backend/wfexs_backend.py
@@ -141,6 +141,7 @@
     FetcherException,
     MaterializedRepo,
     RemoteRepo,
+    RepoGuessFlavor,  # This is needed for proper unmarshalling of cached repository guesses
     RepoType,
 )

From a07abbc1644c328475a2109f4cb8c8c6312e5a36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Fri, 14 Feb 2025 17:37:51 +0100
Subject: [PATCH 49/60] After a long refactoring, partially fix a case related
 to issue #139

---
 wfexs_backend/wfexs_backend.py | 11 +++++++++--
 wfexs_backend/workflow.py      | 20 ++++++++++++++++----
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py
index 9de6cd46..b4741ede 100644
--- a/wfexs_backend/wfexs_backend.py
+++ b/wfexs_backend/wfexs_backend.py
@@ -487,11 +487,18 @@ def FromDescription(
             # It should not happen
             enabled_profiles = [str(profiles)]
 
+        parsed_workflow_id = urllib.parse.urlparse(workflow_meta["workflow_id"])
+        trs_endpoint: "Optional[str]"
+        if parsed_workflow_id.scheme != "":
+            trs_endpoint = workflow_meta.get("trs_endpoint")
+        else:
+            trs_endpoint = workflow_meta.get("trs_endpoint", WF.DEFAULT_TRS_ENDPOINT)
+
         return cls(updated_local_config, config_directory=config_directory).newSetup(
             workflow_meta["workflow_id"],
             workflow_meta.get("version"),
             descriptor_type=workflow_meta.get("workflow_type"),
-            trs_endpoint=workflow_meta.get("trs_endpoint", WF.DEFAULT_TRS_ENDPOINT),
+            trs_endpoint=trs_endpoint,
             params=workflow_meta.get("params", {}),
             enabled_profiles=enabled_profiles,
             environment=workflow_meta.get("environment", {}),
@@ -1004,7 +1011,7 @@ def newSetup(
         workflow_id: "WorkflowId",
         version_id: "Optional[WFVersionId]" = None,
         descriptor_type: "Optional[TRS_Workflow_Descriptor]" = None,
-        trs_endpoint: "str" = WF.DEFAULT_TRS_ENDPOINT,
+        trs_endpoint: "Optional[str]" = None,
         params: "Optional[ParamsBlock]" = None,
         enabled_profiles: "Optional[Sequence[str]]" = None,
         environment: "Optional[EnvironmentBlock]" = None,
diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py
index b98cbbc7..ca3a4509 100644
--- a/wfexs_backend/workflow.py
+++ b/wfexs_backend/workflow.py
@@ -456,7 +456,7 @@ def __init__(
         workflow_id: "Optional[WorkflowId]" = None,
         version_id: "Optional[WFVersionId]" = None,
         descriptor_type: "Optional[TRS_Workflow_Descriptor]" = None,
-        trs_endpoint: "str" =
DEFAULT_TRS_ENDPOINT, + trs_endpoint: "Optional[str]" = None, params: "Optional[ParamsBlock]" = None, enabled_profiles: "Optional[Sequence[str]]" = None, environment: "Optional[EnvironmentBlock]" = None, @@ -1828,12 +1828,18 @@ def FromDescription( # It should not happen enabled_profiles = [str(profiles)] + parsed_workflow_id = urllib.parse.urlparse(workflow_meta["workflow_id"]) + if parsed_workflow_id.scheme != "": + trs_endpoint = workflow_meta.get("trs_endpoint") + else: + trs_endpoint = workflow_meta.get("trs_endpoint", cls.DEFAULT_TRS_ENDPOINT) + return cls( wfexs, workflow_meta["workflow_id"], workflow_meta.get("version"), descriptor_type=workflow_meta.get("workflow_type"), - trs_endpoint=workflow_meta.get("trs_endpoint", cls.DEFAULT_TRS_ENDPOINT), + trs_endpoint=trs_endpoint, params=workflow_meta.get("params", dict()), enabled_profiles=enabled_profiles, environment=workflow_meta.get("environment", dict()), @@ -1892,12 +1898,18 @@ def FromForm( # It should not happen enabled_profiles = [str(profiles)] + parsed_workflow_id = urllib.parse.urlparse(workflow_meta["workflow_id"]) + if parsed_workflow_id.scheme != "": + trs_endpoint = workflow_meta.get("trs_endpoint") + else: + trs_endpoint = workflow_meta.get("trs_endpoint", cls.DEFAULT_TRS_ENDPOINT) + return cls( wfexs, workflow_meta["workflow_id"], workflow_meta.get("version"), descriptor_type=workflow_meta.get("workflow_type"), - trs_endpoint=workflow_meta.get("trs_endpoint", cls.DEFAULT_TRS_ENDPOINT), + trs_endpoint=trs_endpoint, params=workflow_meta.get("params", dict()), enabled_profiles=enabled_profiles, environment=workflow_meta.get("environment", dict()), @@ -2125,7 +2137,7 @@ def fetchWorkflow( ) else: raise WFException( - "No engine recognized a valid workflow at {}".format(self.repoURL) + f"No engine recognized a valid workflow at {self.repoURL} ({localWorkflow})" ) else: self.logger.debug("Fixed engine " + self.engineDesc.trs_descriptor) From fe1b7ba8a7da5b02c72e4fd34b29494c19e69c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 15 Feb 2025 03:32:58 +0100 Subject: [PATCH 50/60] Avoid losing input rel_path in GitFetcher.materialize_repo_from_repo . Fixed bug where materializing a git repo was losing the rel_path (and other properties) in the generated RemoteRepo instance. These data come from the input RemoteRepo instance. 
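The one-liner works because RemoteRepo is a NamedTuple, and `_replace` copies every field that is not explicitly overridden. A reduced illustration, where the field subset and the values are made up for the example:

```python
from typing import NamedTuple, Optional

class MiniRemoteRepo(NamedTuple):
    repo_url: str
    tag: Optional[str] = None
    rel_path: Optional[str] = None  # the field the old code silently dropped
    checkout: Optional[str] = None

repo = MiniRemoteRepo(
    repo_url="https://example.org/some/repo.git",
    tag="main",
    rel_path="nested/dir/workflow.cwl",
)

# Old behaviour: rebuild the instance from scratch, losing rel_path
lossy = MiniRemoteRepo(repo_url=repo.repo_url, tag=repo.tag, checkout="abc123")
assert lossy.rel_path is None

# New behaviour: _replace preserves rel_path (and any other input property)
kept = repo._replace(checkout="abc123")
assert kept.rel_path == "nested/dir/workflow.cwl"
```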
--- wfexs_backend/fetchers/git.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index ca48eb6b..2f6fd6fe 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -865,12 +865,7 @@ def materialize_repo_from_repo( "RepoTag", revproc.stdout.read().rstrip() ) - remote_repo = RemoteRepo( - repo_url=repoURL, - tag=repoTag, - repo_type=RepoType.Git, - checkout=repo_effective_checkout, - ) + remote_repo = repo._replace(checkout=repo_effective_checkout) return MaterializedRepo( local=repo_tag_destpath, From 55fc56b22ab73d203062afd8a8f2c124e1e64c35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 15 Feb 2025 17:28:04 +0100 Subject: [PATCH 51/60] Assure guessed and fetched repos described by RemoteRepo instances are of the proper types --- wfexs_backend/fetchers/git.py | 5 ++++- wfexs_backend/fetchers/swh.py | 4 +--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index 2f6fd6fe..66f0f78c 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -865,7 +865,10 @@ def materialize_repo_from_repo( "RepoTag", revproc.stdout.read().rstrip() ) - remote_repo = repo._replace(checkout=repo_effective_checkout) + remote_repo = repo._replace( + repo_type=RepoType.Git, + checkout=repo_effective_checkout, + ) return MaterializedRepo( local=repo_tag_destpath, diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index 3a689179..a26203c3 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -653,9 +653,7 @@ def materialize_repo_from_repo( f"Unexpected Software Heritage object type {object_type} for {repoURL}" ) - remote_repo = RemoteRepo( - repo_url=repoURL, - tag=repoTag, + remote_repo = repo._replace( repo_type=RepoType.SoftwareHeritage, checkout=cast("RepoTag", repo_effective_checkout), ) From de984cca38e5bd16c53efbfd65a94a038f3a10af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Feb 2025 02:29:08 +0100 Subject: [PATCH 52/60] Implemented `prefer_upstream_source` to control whether upstream, trustable sources should be followed when a repo is going to be materialized --- wfexs_backend/schemas/stage-definition.json | 6 ++++++ wfexs_backend/wfexs_backend.py | 19 ++++++++++++++++++- wfexs_backend/workflow.py | 17 +++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/wfexs_backend/schemas/stage-definition.json b/wfexs_backend/schemas/stage-definition.json index b391cfa3..60b9a148 100644 --- a/wfexs_backend/schemas/stage-definition.json +++ b/wfexs_backend/schemas/stage-definition.json @@ -575,6 +575,12 @@ } ] }, + "prefer_upstream_source": { + "title": "Prefer upstream source for the workflow, if available", + "description": "Prefer discovered and recommended upstream source for the workflow (if available) instead of the initially requested one, which can happen in cascade. 
This is needed for cases where the workflow is incomplete in the initially proposed source, and it is a somewhat known fact", + "type": "boolean", + "default": true + }, "nickname": { "title": "A friendly nickname (prefix) for the instances", "type": "string" diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index b4741ede..81c67593 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -499,6 +499,7 @@ def FromDescription( workflow_meta.get("version"), descriptor_type=workflow_meta.get("workflow_type"), trs_endpoint=trs_endpoint, + prefer_upstream_source=workflow_meta.get("prefer_upstream_source"), params=workflow_meta.get("params", {}), enabled_profiles=enabled_profiles, environment=workflow_meta.get("environment", {}), @@ -1012,6 +1013,7 @@ def newSetup( version_id: "Optional[WFVersionId]" = None, descriptor_type: "Optional[TRS_Workflow_Descriptor]" = None, trs_endpoint: "Optional[str]" = None, + prefer_upstream_source: "Optional[bool]" = None, params: "Optional[ParamsBlock]" = None, enabled_profiles: "Optional[Sequence[str]]" = None, environment: "Optional[EnvironmentBlock]" = None, @@ -1032,6 +1034,7 @@ def newSetup( version_id=version_id, descriptor_type=descriptor_type, trs_endpoint=trs_endpoint, + prefer_upstream_source=prefer_upstream_source, params=params, enabled_profiles=enabled_profiles, environment=environment, @@ -1979,6 +1982,7 @@ def cacheWorkflow( version_id: "Optional[WFVersionId]" = None, trs_endpoint: "Optional[str]" = None, descriptor_type: "Optional[TRS_Workflow_Descriptor]" = None, + prefer_upstream_source: "bool" = True, ignoreCache: "bool" = False, registerInCache: "bool" = True, offline: "bool" = False, @@ -2056,6 +2060,7 @@ def cacheWorkflow( repoRelPath, ) = self.getWorkflowBundleFromURI( putative_repo_url, + prefer_upstream_source=prefer_upstream_source, offline=offline, ignoreCache=ignoreCache, registerInCache=registerInCache, @@ -2120,6 +2125,7 @@ def cacheWorkflow( ) = self.doMaterializeRepo( guessedRepo, fetcher=guessed[1] if guessed is not None else None, + prefer_upstream_source=prefer_upstream_source, doUpdate=ignoreCache, registerInCache=registerInCache, offline=offline, @@ -2138,6 +2144,7 @@ def doMaterializeRepo( self, repo: "RemoteRepo", fetcher: "Optional[AbstractSchemeRepoFetcher]" = None, + prefer_upstream_source: "bool" = True, doUpdate: "bool" = True, registerInCache: "bool" = True, offline: "bool" = False, @@ -2226,7 +2233,8 @@ def doMaterializeRepo( # Go to the next repo only if it is recommended if ( - materialized_repo.recommends_upstream + prefer_upstream_source + and materialized_repo.recommends_upstream and materialized_repo.upstream_repo is not None ): try: @@ -2237,6 +2245,7 @@ def doMaterializeRepo( upstream_downstream_repos, ) = self.doMaterializeRepo( materialized_repo.upstream_repo, + prefer_upstream_source=prefer_upstream_source, doUpdate=doUpdate, registerInCache=registerInCache, offline=offline, @@ -2266,6 +2275,7 @@ def doMaterializeRepo( repo_rel_path, ) = self.getWorkflowBundleFromURI( repo.repo_url, + prefer_upstream_source=prefer_upstream_source, ignoreCache=doUpdate, registerInCache=registerInCache, offline=offline, @@ -2289,6 +2299,7 @@ def doMaterializeRepo( upstream_downstream_repos, ) = self.doMaterializeRepo( i_workflow_repo, + prefer_upstream_source=prefer_upstream_source, doUpdate=doUpdate, registerInCache=registerInCache, offline=offline, @@ -2342,6 +2353,7 @@ def getWorkflowBundleFromURI( self, remote_url: "URIType", expectedEngineDesc: 
"Optional[WorkflowType]" = None, + prefer_upstream_source: "bool" = True, offline: "bool" = False, ignoreCache: "bool" = False, registerInCache: "bool" = True, @@ -2405,6 +2417,7 @@ def getWorkflowBundleFromURI( identified_workflow = self.getWorkflowRepoFromROCrateFile( roCrateFile, expectedEngineDesc=expectedEngineDesc, + prefer_upstream_source=prefer_upstream_source, offline=offline, ignoreCache=ignoreCache, registerInCache=registerInCache, @@ -2433,6 +2446,7 @@ def getWorkflowRepoFromROCrateFile( self, roCrateFile: "pathlib.Path", expectedEngineDesc: "Optional[WorkflowType]" = None, + prefer_upstream_source: "bool" = True, offline: "bool" = False, ignoreCache: "bool" = False, registerInCache: "bool" = True, @@ -2471,6 +2485,9 @@ def getWorkflowRepoFromROCrateFile( # the branch/tag/checkout , and the relative directory in the # fetched content (needed by Nextflow) + # TODO: honour prefer_upstream_source parameter when it is false + # and the payload of the RO-Crate contains a copy of the workflow + # Some RO-Crates might have this value missing or ill-built repo, workflow_type, _ = self.rocrate_toolbox.extractWorkflowMetadata( g, diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index ca3a4509..9e82c298 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -457,6 +457,7 @@ def __init__( version_id: "Optional[WFVersionId]" = None, descriptor_type: "Optional[TRS_Workflow_Descriptor]" = None, trs_endpoint: "Optional[str]" = None, + prefer_upstream_source: "Optional[bool]" = None, params: "Optional[ParamsBlock]" = None, enabled_profiles: "Optional[Sequence[str]]" = None, environment: "Optional[EnvironmentBlock]" = None, @@ -603,6 +604,8 @@ def __init__( workflow_meta["workflow_type"] = descriptor_type if trs_endpoint is not None: workflow_meta["trs_endpoint"] = trs_endpoint + if prefer_upstream_source is not None: + workflow_meta["prefer_upstream_source"] = prefer_upstream_source if workflow_config is not None: workflow_meta["workflow_config"] = workflow_config if params is not None: @@ -641,6 +644,9 @@ def __init__( self.id = str(workflow_id) if workflow_id is not None else None self.version_id = str(version_id) if version_id is not None else None self.descriptor_type = descriptor_type + self.prefer_upstream_source = ( + prefer_upstream_source if prefer_upstream_source is not None else True + ) self.params = params self.enabled_profiles = enabled_profiles self.environment = environment @@ -665,6 +671,7 @@ def __init__( self.id = None self.version_id = None self.descriptor_type = None + self.prefer_upstream_source = True if instanceId is not None: self.instanceId = instanceId @@ -1840,6 +1847,7 @@ def FromDescription( workflow_meta.get("version"), descriptor_type=workflow_meta.get("workflow_type"), trs_endpoint=trs_endpoint, + prefer_upstream_source=workflow_meta.get("prefer_upstream_source"), params=workflow_meta.get("params", dict()), enabled_profiles=enabled_profiles, environment=workflow_meta.get("environment", dict()), @@ -1910,6 +1918,7 @@ def FromForm( workflow_meta.get("version"), descriptor_type=workflow_meta.get("workflow_type"), trs_endpoint=trs_endpoint, + prefer_upstream_source=workflow_meta.get("prefer_upstream_source"), params=workflow_meta.get("params", dict()), enabled_profiles=enabled_profiles, environment=workflow_meta.get("environment", dict()), @@ -1930,6 +1939,7 @@ def fetchWorkflow( version_id: "Optional[WFVersionId]", trs_endpoint: "Optional[str]", descriptor_type: "Optional[TRS_Workflow_Descriptor]", + prefer_upstream_source: 
"bool" = True, offline: "bool" = False, ignoreCache: "bool" = False, injectable_repo: "Optional[Tuple[RemoteRepo, WorkflowType]]" = None, @@ -2011,6 +2021,7 @@ def fetchWorkflow( downstream_repos, ) = self.wfexs.doMaterializeRepo( repo, + prefer_upstream_source=prefer_upstream_source, doUpdate=ignoreCache, # registerInCache=True, offline=offline, @@ -2028,6 +2039,7 @@ def fetchWorkflow( workflow_id=workflow_id, version_id=version_id, trs_endpoint=trs_endpoint, + prefer_upstream_source=prefer_upstream_source, descriptor_type=descriptor_type, ignoreCache=ignoreCache, offline=offline, @@ -2195,6 +2207,7 @@ def setupEngine( self.version_id, self.trs_endpoint, self.descriptor_type, + prefer_upstream_source=self.prefer_upstream_source, offline=offline, ignoreCache=ignoreCache, injectable_repo=injectable_repo, @@ -4302,6 +4315,7 @@ def exportResults( def staging_recipe(self) -> "WritableWorkflowMetaConfigBlock": workflow_meta: "WritableWorkflowMetaConfigBlock" = { "workflow_id": self.id, + "prefer_upstream_source": self.prefer_upstream_source, "paranoid_mode": self.paranoidMode, } if self.nickname is not None: @@ -4424,6 +4438,9 @@ def unmarshallConfig( self.version_id = workflow_meta.get("version") self.descriptor_type = workflow_meta.get("workflow_type") self.trs_endpoint = workflow_meta.get("trs_endpoint") + self.prefer_upstream_source = workflow_meta.get( + "prefer_upstream_source", True + ) self.workflow_config = workflow_meta.get("workflow_config") self.params = workflow_meta.get("params") profiles: "Optional[Union[str, Sequence[str]]]" = workflow_meta.get( From 8114bed309b8b3ed539caae5e130eeb5bb056c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Feb 2025 03:14:19 +0100 Subject: [PATCH 53/60] Detected some git related pytests were failing. Fixed behaviour both on GitFetcher and its associated tests --- tests/fetchers/test_git.py | 65 +++++++---- wfexs_backend/fetchers/git.py | 214 ++++++++++++++++++++++------------ 2 files changed, 181 insertions(+), 98 deletions(-) diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py index eaaaa8d5..27e2d142 100644 --- a/tests/fetchers/test_git.py +++ b/tests/fetchers/test_git.py @@ -29,6 +29,8 @@ if TYPE_CHECKING: from typing import ( Optional, + Type, + Union, ) from wfexs_backend.common import ( @@ -44,6 +46,7 @@ from wfexs_backend.fetchers import ( RemoteRepo, + RepoGuessException, RepoGuessFlavor, RepoType, ) @@ -52,7 +55,9 @@ from wfexs_backend.fetchers.git import GitFetcher -WfExS_basedir = Path(__file__).parent.parent +import wfexs_backend + +WfExS_basedir = Path(wfexs_backend.__file__).parent.parent WfExS_basedir_file_uri = WfExS_basedir.as_uri() WfExS_git_basedir = WfExS_basedir / ".git" WfExS_git_basedir_file_uri = WfExS_git_basedir.as_uri() @@ -61,7 +66,7 @@ logger.setLevel(logging.INFO) GIT_TESTBED = pytest.mark.parametrize( - ["url", "remote_repo", "repo_pid"], + ["url", "remote_repo_or_exception_class", "repo_pid"], [ ( "https://github.com/inab/WfExS-backend.git", @@ -123,12 +128,9 @@ "git+ssh://git@github.com:inab/WfExS-backend.git@main", ), ( + # This tag does not exists! "ssh://git@github.com:inab/WfExS-backend.git@0.1.2", - RemoteRepo( - repo_url=cast("RepoURL", "ssh://git@github.com/inab/WfExS-backend.git"), - repo_type=RepoType.Git, - tag=cast("RepoTag", "0.1.2"), - ), + RepoGuessException, "git+ssh://git@github.com:inab/WfExS-backend.git@0.1.2", ), ( @@ -160,12 +162,9 @@ "git+" + WfExS_git_basedir_file_uri, ), ( + # This tag does not exists! 
WfExS_git_basedir_file_uri + "@0.1.2", - RemoteRepo( - repo_url=cast("RepoURL", WfExS_git_basedir_file_uri), - repo_type=RepoType.Git, - tag=cast("RepoTag", "0.1.2"), - ), + RepoGuessException, "git+" + WfExS_git_basedir_file_uri + "@0.1.2", ), ( @@ -240,25 +239,41 @@ @GIT_TESTBED def test_guess_git_repo_params( - url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]" + url: "str", + remote_repo_or_exception_class: "Optional[Union[RemoteRepo, Type[Exception]]]", + repo_pid: "Optional[str]", ) -> "None": - output = GitFetcher.GuessRepoParams(cast("URIType", url), logger=logger) + if ( + isinstance(remote_repo_or_exception_class, RemoteRepo) + or remote_repo_or_exception_class is None + ): + output = GitFetcher.GuessRepoParams(cast("URIType", url), logger=logger) - # When no tag is given, ignore what it was discovered - if output is not None and remote_repo is not None: - if remote_repo.tag is None: - output = output._replace(tag=None) - # For now, patch this - if remote_repo.checkout is not None: - output = output._replace(checkout=remote_repo.checkout) - assert output == remote_repo + # When no tag is given, ignore what it was discovered + if output is not None and remote_repo_or_exception_class is not None: + if remote_repo_or_exception_class.tag is None: + output = output._replace(tag=None) + # For now, patch this + if remote_repo_or_exception_class.checkout is not None: + output = output._replace( + checkout=remote_repo_or_exception_class.checkout + ) + assert output == remote_repo_or_exception_class + + else: + with pytest.raises(remote_repo_or_exception_class): + output = GitFetcher.GuessRepoParams(cast("URIType", url), logger=logger) @GIT_TESTBED def test_build_git_pid_from_repo( - url: "str", remote_repo: "Optional[RemoteRepo]", repo_pid: "Optional[str]" + url: "str", + remote_repo_or_exception_class: "Optional[Union[RemoteRepo, Type[Exception]]]", + repo_pid: "Optional[str]", ) -> "None": - if remote_repo is None: + if remote_repo_or_exception_class is None or not isinstance( + remote_repo_or_exception_class, RemoteRepo + ): pytest.skip("Skipped test because no remote repo was provided") else: scheme_catalog = SchemeCatalog( @@ -266,6 +281,6 @@ def test_build_git_pid_from_repo( ) fetcher = GitFetcher(scheme_catalog, progs={}) - output = fetcher.build_pid_from_repo(remote_repo) + output = fetcher.build_pid_from_repo(remote_repo_or_exception_class) assert output == repo_pid diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index 66f0f78c..6830b7e2 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -20,9 +20,12 @@ import hashlib import os import pathlib +import re import shutil import subprocess import tempfile +import warnings + from typing import ( cast, TYPE_CHECKING, @@ -72,7 +75,10 @@ from urllib import parse, request +from dulwich.client import get_transport_and_path + import dulwich.porcelain +import dulwich.repo from . 
import ( AbstractSchemeRepoFetcher, @@ -156,7 +162,17 @@ def _find_git_repo_in_uri( parsedInputURL = remote_file else: parsedInputURL = parse.urlparse(remote_file) - sp_path = parsedInputURL.path.split("/") + + # Getting the tag or branch in cases like https://github.com/inab/WfExS-backend.git@0.2.0 + repoTag: "Optional[str]" = None + if "@" in parsedInputURL.path: + gitPath, repoTag = parsedInputURL.path.rsplit("@", 1) + reparsedInputURL = parsedInputURL._replace(path=gitPath) + else: + gitPath = parsedInputURL.path + reparsedInputURL = parsedInputURL + + sp_path = reparsedInputURL.path.split("/") shortest_pre_path: "Optional[URIType]" = None longest_post_path: "Optional[Sequence[str]]" = None @@ -169,7 +185,10 @@ def _find_git_repo_in_uri( pre_path = "/".join(sp_path[:pos]) if pre_path == "": pre_path = "/" - remote_uri_anc = parse.urlunparse(parsedInputURL._replace(path=pre_path)) + # Remove fragments + remote_uri_anc = parse.urlunparse( + reparsedInputURL._replace(path=pre_path, fragment="") + ) remote_refs_dict: "Mapping[bytes, bytes]" if offline: @@ -180,63 +199,112 @@ def _find_git_repo_in_uri( # Dulwich works both with file, ssh, git and http(s) protocols remote_refs_dict = dulwich.porcelain.ls_remote(remote_uri_anc) repo_type = RepoType.Git + + break except ( dulwich.errors.NotGitRepository, dulwich.errors.GitProtocolError, ) as ngr: # Skip and continue continue + else: + # Metadata is all we really need + remote_uri_anc = parse.urlunparse(reparsedInputURL._replace(fragment="")) + req = request.Request(remote_uri_anc, method="HEAD") + try: + with request.urlopen(req) as resp: + # Is it gitlab? + if list( + filter( + lambda c: "gitlab" in c, + resp.headers.get_all("Set-Cookie"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.GitLab + elif list( + filter( + lambda c: GITHUB_NETLOC in c, + resp.headers.get_all("Set-Cookie"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.GitHub + elif list( + filter( + lambda c: "bitbucket" in c, + resp.headers.get_all("X-View-Name"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.BitBucket + except Exception as e: + pass - the_remote_uri = remote_uri_anc + if repo_type != RepoType.Git: + raise RepoGuessException(f"Unable to identify {remote_file} as a git repo") - head_remote_ref = remote_refs_dict[cls.HEAD_LABEL] - repo_branches = [] - b_default_repo_tag = None - for remote_label, remote_ref in remote_refs_dict.items(): - if remote_label.startswith(cls.REFS_HEADS_PREFIX): - b_repo_tag = remote_label[len(cls.REFS_HEADS_PREFIX) :].decode( - "utf-8", errors="continue" - ) - repo_branches.append(cast("RepoTag", b_repo_tag)) - if b_default_repo_tag is None and remote_ref == head_remote_ref: + the_remote_uri = remote_uri_anc + + # Now, try matching either a branch or a tag + head_remote_ref = remote_refs_dict[cls.HEAD_LABEL] + repo_branches = [] + b_default_repo_tag = None + b_checkout: "Optional[RepoTag]" = None + for remote_label, remote_ref in remote_refs_dict.items(): + b_repo_tag: "Optional[str]" = None + if remote_label.startswith(cls.REFS_HEADS_PREFIX): + b_repo_tag = remote_label[len(cls.REFS_HEADS_PREFIX) :].decode( + "utf-8", errors="continue" + ) + elif remote_label.startswith(cls.REFS_TAGS_PREFIX): + b_repo_tag = remote_label[len(cls.REFS_TAGS_PREFIX) :].decode( + "utf-8", errors="continue" + ) + + if b_repo_tag is not None: + repo_branches.append(cast("RepoTag", b_repo_tag)) + if b_default_repo_tag is None: + b_remote_ref = remote_ref.decode("utf-8", 
@@ -169,7 +185,10 @@ def _find_git_repo_in_uri(
             pre_path = "/".join(sp_path[:pos])
             if pre_path == "":
                 pre_path = "/"
-            remote_uri_anc = parse.urlunparse(parsedInputURL._replace(path=pre_path))
+            # Remove fragments
+            remote_uri_anc = parse.urlunparse(
+                reparsedInputURL._replace(path=pre_path, fragment="")
+            )
 
             remote_refs_dict: "Mapping[bytes, bytes]"
             if offline:
@@ -180,63 +199,112 @@ def _find_git_repo_in_uri(
                 # Dulwich works both with file, ssh, git and http(s) protocols
                 remote_refs_dict = dulwich.porcelain.ls_remote(remote_uri_anc)
                 repo_type = RepoType.Git
+
+                break
             except (
                 dulwich.errors.NotGitRepository,
                 dulwich.errors.GitProtocolError,
             ) as ngr:
                 # Skip and continue
                 continue
+        else:
+            # Metadata is all we really need
+            remote_uri_anc = parse.urlunparse(reparsedInputURL._replace(fragment=""))
+            req = request.Request(remote_uri_anc, method="HEAD")
+            try:
+                with request.urlopen(req) as resp:
+                    # Is it gitlab?
+                    if list(
+                        filter(
+                            lambda c: "gitlab" in c,
+                            resp.headers.get_all("Set-Cookie"),
+                        )
+                    ):
+                        repo_type = RepoType.Git
+                        guessed_repo_flavor = RepoGuessFlavor.GitLab
+                    elif list(
+                        filter(
+                            lambda c: GITHUB_NETLOC in c,
+                            resp.headers.get_all("Set-Cookie"),
+                        )
+                    ):
+                        repo_type = RepoType.Git
+                        guessed_repo_flavor = RepoGuessFlavor.GitHub
+                    elif list(
+                        filter(
+                            lambda c: "bitbucket" in c,
+                            resp.headers.get_all("X-View-Name"),
+                        )
+                    ):
+                        repo_type = RepoType.Git
+                        guessed_repo_flavor = RepoGuessFlavor.BitBucket
+            except Exception as e:
+                pass
 
-            the_remote_uri = remote_uri_anc
+        if repo_type != RepoType.Git:
+            raise RepoGuessException(f"Unable to identify {remote_file} as a git repo")
 
-            head_remote_ref = remote_refs_dict[cls.HEAD_LABEL]
-            repo_branches = []
-            b_default_repo_tag = None
-            for remote_label, remote_ref in remote_refs_dict.items():
-                if remote_label.startswith(cls.REFS_HEADS_PREFIX):
-                    b_repo_tag = remote_label[len(cls.REFS_HEADS_PREFIX) :].decode(
-                        "utf-8", errors="continue"
-                    )
-                    repo_branches.append(cast("RepoTag", b_repo_tag))
-                    if b_default_repo_tag is None and remote_ref == head_remote_ref:
+        the_remote_uri = remote_uri_anc
+
+        # Now, try matching either a branch or a tag
+        head_remote_ref = remote_refs_dict[cls.HEAD_LABEL]
+        repo_branches = []
+        b_default_repo_tag = None
+        b_checkout: "Optional[RepoTag]" = None
+        for remote_label, remote_ref in remote_refs_dict.items():
+            b_repo_tag: "Optional[str]" = None
+            if remote_label.startswith(cls.REFS_HEADS_PREFIX):
+                b_repo_tag = remote_label[len(cls.REFS_HEADS_PREFIX) :].decode(
+                    "utf-8", errors="continue"
+                )
+            elif remote_label.startswith(cls.REFS_TAGS_PREFIX):
+                b_repo_tag = remote_label[len(cls.REFS_TAGS_PREFIX) :].decode(
+                    "utf-8", errors="continue"
+                )
+
+            if b_repo_tag is not None:
+                repo_branches.append(cast("RepoTag", b_repo_tag))
+                if b_default_repo_tag is None:
+                    b_remote_ref = remote_ref.decode("utf-8", errors="continue")
+                    if repoTag is None and remote_ref == head_remote_ref:
                         b_default_repo_tag = b_repo_tag
+                        b_checkout = cast("RepoTag", b_remote_ref)
+                    elif repoTag in (b_repo_tag, b_remote_ref):
+                        b_default_repo_tag = repoTag
+                        b_checkout = cast("RepoTag", b_remote_ref)
 
-            # It is considered a git repo!
-            shortest_pre_path = cast("URIType", pre_path)
-            longest_post_path = sp_path[pos:]
-            if repo_type is None:
-                # Metadata is all we really need
-                repo_type = RepoType.Raw
-                req = request.Request(remote_uri_anc, method="HEAD")
-                try:
-                    with request.urlopen(req) as resp:
-                        # Is it gitlab?
-                        if list(
-                            filter(
-                                lambda c: "gitlab" in c,
-                                resp.headers.get_all("Set-Cookie"),
-                            )
-                        ):
-                            repo_type = RepoType.Git
-                            guessed_repo_flavor = RepoGuessFlavor.GitLab
-                        elif list(
-                            filter(
-                                lambda c: GITHUB_NETLOC in c,
-                                resp.headers.get_all("Set-Cookie"),
-                            )
-                        ):
-                            repo_type = RepoType.Git
-                            guessed_repo_flavor = RepoGuessFlavor.GitHub
-                        elif list(
-                            filter(
-                                lambda c: "bitbucket" in c,
-                                resp.headers.get_all("X-View-Name"),
-                            )
-                        ):
-                            repo_type = RepoType.Git
-                            guessed_repo_flavor = RepoGuessFlavor.BitBucket
-                except Exception as e:
-                    pass
+        if b_default_repo_tag is not None:
+            break
+
+        if b_default_repo_tag is None:
+            if repoTag is None:
+                raise RepoGuessException(
+                    f"No tag was obtained while getting default branch name from {remote_file}"
+                )
+
+            if len(repoTag) != 40 or re.search(r"[^a-f0-9]", repoTag):
+                raise RepoGuessException(
+                    f"Tried to use ill formed {repoTag} SHA to query {remote_file} repo. It should be a 40 characters alphanumeric code."
+                )
+
+            # Possible sha in repoTag
+            memory_repo = dulwich.repo.MemoryRepo()
+            transport, path = get_transport_and_path(remote_uri_anc)
+            fetch_pack_result = transport.fetch(
+                path, cast("dulwich.repo.Repo", memory_repo)
+            )
+            try:
+                memory_repo.get_object(repoTag.encode("utf-8"))
+                b_default_repo_tag = repoTag
+                b_checkout = cast("RepoTag", repoTag)
+            except (Exception, ValueError) as e:
+                raise RepoGuessException(
+                    f"Git repo {remote_uri_anc} does not have either a reference or a commit identified as {repoTag}"
+                ) from e
+
+        # It is considered a git repo!
+        shortest_pre_path = cast("URIType", pre_path)
+        longest_post_path = sp_path[pos:]
 
         if repo_type is None:
             raise RepoGuessException(f"Unable to identify {remote_file} as a git repo")
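
Two details of the hunk above are worth spelling out. First, dulwich.porcelain.ls_remote returns the remote's refs as a bytes-to-bytes mapping, which is what the branch/tag matching loop walks; second, a bare commit hash is pre-validated as exactly 40 lowercase hex characters before the far more expensive MemoryRepo fetch is attempted. (In passing: "continue" is not a registered codec error handler in CPython — "ignore" or "replace" would be the safe spellings — although the argument is only consulted when a ref name actually fails to decode.) A sketch of those two steps outside the fetcher class; the ls_remote call needs network access:

import re

import dulwich.porcelain


def looks_like_sha1(tag: str) -> bool:
    # Same shape as the check in the hunk: 40 lowercase hex characters
    return len(tag) == 40 and re.search(r"[^a-f0-9]", tag) is None


assert looks_like_sha1("0123456789abcdef0123456789abcdef01234567")
assert not looks_like_sha1("0.2.0")

# Refs of a public repo, keyed by labels such as b"HEAD", b"refs/heads/..."
# and b"refs/tags/...":
refs = dulwich.porcelain.ls_remote("https://github.com/inab/WfExS-backend.git")
tags = sorted(
    label[len(b"refs/tags/") :].decode("utf-8")
    for label in refs
    if label.startswith(b"refs/tags/")
)
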
@@ -244,16 +312,32 @@ def _find_git_repo_in_uri(
         if b_default_repo_tag is None:
             raise RepoGuessException(
                 f"No tag was obtained while getting default branch name from {remote_file}"
+                if repoTag is None
+                else f"No tag matched {repoTag} from {remote_file}"
             )
 
         assert longest_post_path is not None
         assert repo_branches is not None
 
+        # Getting the repoRelPath (if available)
+        the_rel_path: "Optional[RelPath]" = None
+        if len(reparsedInputURL.fragment) > 0:
+            frag_qs = parse.parse_qs(reparsedInputURL.fragment)
+            subDirArr = frag_qs.get("subdirectory", [])
+            if len(subDirArr) > 0:
+                the_rel_path = cast(
+                    "RelPath", "/".join([*longest_post_path, subDirArr[0]])
+                )
+        elif len(longest_post_path) > 0:
+            the_rel_path = cast("RelPath", "/".join(longest_post_path))
+
         repo = RemoteRepo(
             repo_url=cast("RepoURL", the_remote_uri),
+            rel_path=the_rel_path,
             tag=cast("RepoTag", b_default_repo_tag),
             repo_type=repo_type,
             guess_flavor=guessed_repo_flavor,
+            checkout=b_checkout,
         )
         return repo, longest_post_path, repo_branches
@@ -392,10 +476,12 @@ def GuessRepoParams(
                     repoTag = wf_path[3]
                 if len(wf_path) > 4:
                     repoRelPath = "/".join(wf_path[4:])
+        # TODO handling other popular cases, like bitbucket
         elif (
             parsed_wf_url.scheme == ""
             or (parsed_wf_url.scheme in cls.GetSchemeHandlers())
             or (parsed_wf_url.scheme in cls.GIT_SCHEMES)
+            or parsed_wf_url.scheme == "http"
         ):
             if parsed_wf_url.scheme == "":
                 # It could be a checkout uri in the form of 'git@github.com:inab/WfExS-backend.git'
@@ -446,30 +532,12 @@ def GuessRepoParams(
                     parse.urlunparse(denorm_parsed_wf_url)
                 )
 
-            # Getting the tag or branch
-            if "@" in parsed_wf_url.path:
-                gitPath, repoTag = parsed_wf_url.path.split("@", 1)
-            else:
-                gitPath = parsed_wf_url.path
-
-            # Getting the repoRelPath (if available)
-            if len(parsed_wf_url.fragment) > 0:
-                frag_qs = parse.parse_qs(parsed_wf_url.fragment)
-                subDirArr = frag_qs.get("subdirectory", [])
-                if len(subDirArr) > 0:
-                    repoRelPath = subDirArr[0]
-
-            # Now, reassemble the repoURL
-            repoURL = parse.urlunparse(
-                (gitScheme, parsed_wf_url.netloc, gitPath, "", "", "")
-            )
-            found_params = cls._find_git_repo_in_uri(
-                cast("URIType", repoURL), offline=offline
-            )
-            guessedRepoFlavor = found_params[0].guess_flavor
-        # TODO handling other popular cases, like bitbucket
-        elif parsed_wf_url.scheme in ("http", "https", "file", "ssh"):
             found_params = cls._find_git_repo_in_uri(parsed_wf_url, offline=offline)
+            if found_params is not None:
+                repoURL = found_params[0].repo_url
+                repoRelPath = found_params[0].rel_path
+                guessedRepoFlavor = found_params[0].guess_flavor
+
         else:
             return None
@@ -927,7 +995,7 @@ def fetch(
         repoTag: "Optional[RepoTag]"
         if "@" in parsedInputURL.path:
             gitPath, repoTag = cast(
-                "Tuple[str, RepoTag]", tuple(parsedInputURL.path.split("@", 1))
+                "Tuple[str, RepoTag]", tuple(parsedInputURL.path.rsplit("@", 1))
             )
         else:
             gitPath = parsedInputURL.path
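
Closing out this patch, the "#subdirectory=" convention handled above (familiar from pip's VCS URLs) rides on the URL fragment, so the standard library can parse it as if it were a query string. A small, self-contained sketch — the example URL and subdirectory name are illustrative only:

from urllib import parse

url = "https://github.com/inab/WfExS-backend.git@0.2.0#subdirectory=workflow_examples"
parsed = parse.urlparse(url)

# The fragment has query-string syntax, so parse_qs applies directly:
frag_qs = parse.parse_qs(parsed.fragment)
sub_dirs = frag_qs.get("subdirectory", [])
assert sub_dirs == ["workflow_examples"]

# And the tag still comes off the path with rsplit, as in the fetch() hunk:
git_path, repo_tag = parsed.path.rsplit("@", 1)
assert (git_path, repo_tag) == ("/inab/WfExS-backend.git", "0.2.0")
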
From 4279059634437d5d96f5cbfbb04e7773a5c0420f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Tue, 18 Feb 2025 03:27:41 +0100
Subject: [PATCH 54/60] Raw repositories can have an empty tag

---
 wfexs_backend/workflow.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py
index 9e82c298..7564b0b9 100644
--- a/wfexs_backend/workflow.py
+++ b/wfexs_backend/workflow.py
@@ -5477,7 +5477,10 @@ def createStageResearchObject(
         assert self.localWorkflow is not None
         assert self.materializedEngine is not None
         assert self.remote_repo is not None
-        assert self.remote_repo.tag is not None
+        assert self.remote_repo.tag is not None or self.remote_repo.repo_type in (
+            RepoType.Raw,
+            None,
+        )
         assert self.materializedParams is not None
         assert self.materializedEnvironment is not None
         assert self.staged_setup.work_dir is not None
@@ -5551,7 +5554,10 @@ def createResultsResearchObject(
         assert self.localWorkflow is not None
         assert self.materializedEngine is not None
         assert self.remote_repo is not None
-        assert self.remote_repo.tag is not None
+        assert self.remote_repo.tag is not None or self.remote_repo.repo_type in (
+            RepoType.Raw,
+            None,
+        )
         assert self.staged_setup.work_dir is not None
         assert (
             isinstance(self.stagedExecutions, list) and len(self.stagedExecutions) > 0

From b757d6c719c23db823efbf4447937a07389688c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Tue, 18 Feb 2025 03:35:09 +0100
Subject: [PATCH 55/60] Removed the exception which was blocking WRROC generation for workflows fetched using the file protocol

---
 wfexs_backend/ro_crate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py
index c41ea6a1..807f99ba 100644
--- a/wfexs_backend/ro_crate.py
+++ b/wfexs_backend/ro_crate.py
@@ -2161,7 +2161,7 @@ def _add_workflow_to_crate(
             )
 
         else:
-            raise ROCrateGenerationException(
+            self.logger.warning(
                 "FIXME: Unsupported http(s) git repository {}".format(
                     remote_repo.repo_url
                 )
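
Patch 54 rewrites a hard assertion into a disjunction and patch 55 turns a fatal branch into a logged warning; both relax failure modes so that raw and file:// workflows can still produce research objects. The new assertion reads as a predicate: a tag is mandatory unless the repository is raw (or untyped). A standalone sketch, with a stand-in enum instead of the wfexs_backend RepoType:

import enum
from typing import Optional


class RepoType(enum.Enum):
    # Stand-in for wfexs_backend's RepoType; only the members relevant
    # to the assertion are modelled here.
    Git = "git"
    Raw = "raw"


def tag_is_acceptable(tag: "Optional[str]", repo_type: "Optional[RepoType]") -> bool:
    # Mirrors: assert tag is not None or repo_type in (RepoType.Raw, None)
    return tag is not None or repo_type in (RepoType.Raw, None)


assert tag_is_acceptable("0.2.0", RepoType.Git)
assert tag_is_acceptable(None, RepoType.Raw)  # raw repos may omit the tag
assert tag_is_acceptable(None, None)  # so may untyped ones
assert not tag_is_acceptable(None, RepoType.Git)  # git repos may not
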
From 0858ac519d5063923b870673536e8649c9592692 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Tue, 18 Feb 2025 04:00:22 +0100
Subject: [PATCH 56/60] Updated pre-commit GitHub CI workflow components

---
 .github/workflows/pre-commit.yml | 48 ++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 25f2253d..7d75ab4c 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -18,7 +18,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 100
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
@@ -73,8 +73,9 @@ jobs:
       - name: Print licences report
         if: ${{ always() }}
         run: echo "${{ steps.license_check_report.outputs.report }}"
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
+          name: pre-commit-${{ matrix.python-version }}
           retention-days: 2
           path: constraints-${{ matrix.python-version }}.txt
@@ -88,7 +89,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 100
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'
@@ -143,8 +144,9 @@ jobs:
       - name: Print licences report
         if: ${{ always() }}
         run: echo "${{ steps.license_check_report.outputs.report }}"
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
+          name: pre-commit-${{ matrix.python-version }}
           retention-days: 2
           path: constraints-${{ matrix.python-version }}.txt
@@ -157,24 +159,46 @@ jobs:
       - pre-commit
       - pre-commit-22_04
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/download-artifact@v3
+      - name: Get analysis timestamp
+        id: timestamp
+        run: echo "timestamp=$(date -Is)" >> "$GITHUB_OUTPUT"
+      - uses: actions/checkout@v4
+      - uses: actions/download-artifact@v4
+        id: download
         with:
+          pattern: pre-commit-*
+          merge-multiple: true
           path: changes-dir
       - name: Move artifacts to their right place
+        id: move
         run: |
-          cp -dpr changes-dir/artifact/* .
-          rm -r changes-dir/artifact
+          skip=true
+          if [ -d "${{steps.download.outputs.download-path}}" ] ; then
+            for con in "${{steps.download.outputs.download-path}}"/constraints-*.txt ; do
+              case "$con" in
+                */constraints-\*.txt)
+                  break
+                  ;;
+                *)
+                  cp -p "$con" .
+                  skip=false
+                  ;;
+              esac
+            done
+          fi
+          echo "skip=$skip" >> "$GITHUB_OUTPUT"
       - name: Create Pull Request
         id: cpr
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v7
+        if: steps.move.outputs.skip == 'false'
         with:
-          title: Updated constraints (triggered by ${{ github.sha }})
+          title: Updated constraints (triggered on ${{ steps.timestamp.outputs.timestamp }} by ${{ github.sha }})
           branch: create-pull-request/patch-constraints
+          add-paths: constraints-*.txt
           delete-branch: true
           commit-message: "[create-pull-request] Automatically commit updated contents (constraints)"
       - name: Check outputs
         if: ${{ steps.cpr.outputs.pull-request-number }}
         run: |
-          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
-          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
+          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" >> "$GITHUB_STEP_SUMMARY"
+          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" >> "$GITHUB_STEP_SUMMARY"

From 237bf6eacb30c5184daeea2f157d6066ebe84fb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Tue, 18 Feb 2025 04:13:27 +0100
Subject: [PATCH 57/60] Fixed parameter naming issue with data-url 1.1.1

---
 requirements.txt                | 2 +-
 wfexs_backend/utils/contents.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a7f58bca..f2f31fdf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,7 @@ wiktionary-term-fetcher >= 0.1.1
 funny-passphrase >= 0.2.3
 pyxdg
 groovy-parser == 0.1.1
-data-url
+data-url >= 1.1.1
 pgzip
 defusedxml
 # This is needed for exception groups
diff --git a/wfexs_backend/utils/contents.py b/wfexs_backend/utils/contents.py
index e81a78fc..ac8b7676 100644
--- a/wfexs_backend/utils/contents.py
+++ b/wfexs_backend/utils/contents.py
@@ -561,7 +561,6 @@ def bin2dataurl(content: "bytes") -> "URIType":
 
     return cast(
         "URIType",
-        data_url.construct_data_url(
-            mime_type=mime_type, base64_encode=True, data=content
-        ),
+        # mime_type=mime_type, base64_encoded=True, data=content
+        data_url.construct_data_url(mime_type, True, content),
     )
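
The data-url fix sidesteps a keyword rename between releases (the inline comment in the hunk records the new names) by passing the arguments positionally. What construct_data_url is expected to emit can be reproduced with the standard library alone; a sketch of an equivalent helper, not of the library call itself:

import base64


def bin2dataurl_sketch(
    content: bytes, mime_type: str = "application/octet-stream"
) -> str:
    # RFC 2397 shape: data:<mime-type>;base64,<payload>
    return "data:{};base64,{}".format(
        mime_type, base64.b64encode(content).decode("ascii")
    )


assert (
    bin2dataurl_sketch(b"WfExS")
    == "data:application/octet-stream;base64,V2ZFeFM="
)
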
From 16e2adcdbb29803164800cd8afc920ef22dc8cd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Tue, 18 Feb 2025 04:18:52 +0100
Subject: [PATCH 58/60] Force the installation of setuptools in the GitHub CI workflow, so it does not randomly fail

---
 .github/workflows/pre-commit.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 7d75ab4c..e6cb0dd2 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -35,7 +35,7 @@ jobs:
       - name: 'Install requirements (standard or constraints ${{ matrix.python-version }})'
         run: |
-          pip install --upgrade pip wheel
+          pip install --upgrade pip wheel setuptools
           if [ ${{ steps.changed-requirements-txt.outputs.any_changed }} != 'true' ] && [ -f constraints-${{ matrix.python-version }}.txt ] ; then
             pip install -r requirements.txt -c constraints-${{ matrix.python-version }}.txt
           else
@@ -106,7 +106,7 @@ jobs:
       - name: 'Install requirements (standard or constraints ${{ matrix.python-version }})'
         run: |
-          pip install --upgrade pip wheel
+          pip install --upgrade pip wheel setuptools
           if [ ${{ steps.changed-requirements-txt.outputs.any_changed }} != 'true' ] && [ -f constraints-${{ matrix.python-version }}.txt ] ; then
             pip install -r requirements.txt -c constraints-${{ matrix.python-version }}.txt
           else

From 61d7ddb3e9276171ef26818d551e0ff413db24bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Tue, 18 Feb 2025 04:23:17 +0100
Subject: [PATCH 59/60] Fixed issue with pylint and Python 3.10 when using the substitute ExceptionGroup

---
 wfexs_backend/fetchers/trs_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wfexs_backend/fetchers/trs_files.py b/wfexs_backend/fetchers/trs_files.py
index a837962d..8f135495 100644
--- a/wfexs_backend/fetchers/trs_files.py
+++ b/wfexs_backend/fetchers/trs_files.py
@@ -264,7 +264,7 @@ def GuessTRSParams(
             except Exception as e2:
                 if fail_ok:
                     return None
-                raise ExceptionGroup(
+                raise ExceptionGroup(  # pylint: disable=possibly-used-before-assignment
                     f"Error fetching or processing TRS service info metadata for {wf_url} (tried both {trs_service_info} and {non_standard_trs_service_info})",
                     [e1, e2],
                 )

From 115f9a52356ec6efb1e79bde0cdcf1794a14278d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?=
Date: Tue, 18 Feb 2025 04:39:56 +0100
Subject: [PATCH 60/60] A newer mypy version surfaced a condition where calling hexDigest might not be safe

---
 wfexs_backend/ro_crate.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py
index 807f99ba..e5f9971f 100644
--- a/wfexs_backend/ro_crate.py
+++ b/wfexs_backend/ro_crate.py
@@ -1302,7 +1302,7 @@ def _add_containers(
                     the_size = os.stat(container.localPath).st_size
                     if container.image_signature is not None:
                         digest, algo = extract_digest(container.image_signature)
-                        if digest is None:
+                        if digest is None or digest == False:
                             digest, algo = unstringifyDigest(container.image_signature)
                         assert algo is not None
                         the_signature = hexDigest(algo, digest)
@@ -1561,7 +1561,7 @@ def addWorkflowInputs(
                 the_signature: "Optional[Fingerprint]" = None
                 if itemInValues.fingerprint is not None:
                     digest, algo = extract_digest(itemInValues.fingerprint)
-                    if digest is not None:
+                    if digest is not None and digest != False:
                         assert algo is not None
                         the_signature = hexDigest(algo, digest)
 
@@ -1796,7 +1796,7 @@ def addWorkflowInputs(
                             sec_digest, sec_algo = extract_digest(
                                 secInput.fingerprint
                             )
-                            if sec_digest is not None:
+                            if sec_digest is not None and sec_digest != False:
                                 assert sec_algo is not None
                                 the_sec_signature = hexDigest(
                                     sec_algo, sec_digest
                                 )
@@ -2977,7 +2977,7 @@ def _add_GeneratedContent_to_crate(
         assert the_content.signature is not None
         digest, algo = extract_digest(the_content.signature)
-        if digest is None:
+        if digest is None or digest == False:
             digest, algo = unstringifyDigest(the_content.signature)
         assert algo is not None
         dest_path = os.path.relpath(the_content.local, self.work_dir)
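
The extra "digest == False" guards in patch 60 exist because, as the surrounding hunks suggest, extract_digest can signal two distinct failures: None (no digest present at all) and False (a digest-like string that could not be parsed), and only rejecting both makes the later hexDigest call safe under stricter mypy narrowing. A self-contained sketch of that sentinel convention — extract_digest_sketch is a made-up stand-in, not the wfexs_backend function:

from typing import Optional, Tuple, Union


def extract_digest_sketch(
    signature: str,
) -> "Tuple[Union[str, bool, None], Optional[str]]":
    # None: no digest at all; False: recognised shape but unusable payload
    if "=" not in signature:
        return None, None
    algo, _, digest = signature.partition("=")
    if not digest:
        return False, algo
    return digest, algo


for sig in ("sha256=deadbeef", "sha256=", "garbage"):
    digest, algo = extract_digest_sketch(sig)
    if digest is None or digest is False:
        # Both sentinels must be rejected before computing a hex digest
        continue
    assert algo is not None and isinstance(digest, str)
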