From 287a1cb90e2c9cc56dea6fc2ce84f8ab6743c860 Mon Sep 17 00:00:00 2001 From: Daniel Bojar Date: Sun, 3 Mar 2024 06:26:42 +0100 Subject: [PATCH] Added filter_dealbreakers to regex for strong dealbreakers --- glycowork/motif/regex.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/glycowork/motif/regex.py b/glycowork/motif/regex.py index 5431b50f..6c598722 100644 --- a/glycowork/motif/regex.py +++ b/glycowork/motif/regex.py @@ -213,6 +213,8 @@ def filter_matches_by_location(matches, ggraph, match_location): | :- | Returns a filtered list of matches """ + if matches and matches[0] and matches[0][0] and isinstance(matches[0][0], list): + matches = unwrap(matches) if match_location == 'start': degrees = {node: ggraph.degree[node] for node in ggraph} return [m for m in matches if degrees[m[0]] == 1] @@ -558,6 +560,38 @@ def format_retrieved_matches(lists, ggraph): return sorted([graph_to_string(ggraph.subgraph(trace)) for trace in lists if nx.is_connected(ggraph.subgraph(trace))], key = len, reverse = True) +def filter_dealbreakers(lists, ggraph, pattern): + """performs some checks to see whether traces come from sequences breaking the pattern negations\n + | Arguments: + | :- + | lists (list of list of int): A list of traces containing sublists of node indices + | ggraph (networkx): glycan graph as a networkx object + | pattern (string): glyco-regular expression in the form of "Hex-HexNAc-([Hex|Fuc]){1,2}-HexNAc"\n + | Returns: + | :- + | Returns a list of list of int; basically traces that survive the filtering + """ + if '!' not in pattern: + return lists + else: + lists2 = [] + node_dict = nx.get_node_attributes(ggraph, "string_labels") + for listy in lists: + last = pattern.split('-')[-1] + if '!' in last and node_dict.get(listy[-1]+2, 'default') != re.findall(r'[a-zA-Z0-9!]+', last)[0][1:]: + lists2.append(listy) + continue + first = pattern.split('-')[0] + if '!' in first and node_dict.get(listy[0]-2, 'default') != re.findall(r'[a-zA-Z0-9!]+', first)[0][1:]: + lists2.append(listy) + continue + second_to_last = pattern.split('-')[-2] + if '!' in second_to_last and node_dict.get(listy[-1]-2, 'default') != re.findall(r'[a-zA-Z0-9!]+', second_to_last)[0][1:]: + lists2.append(listy) + continue + return lists2 + + def compile(pattern): """pre-compiles glyco-regular expression for faster processing\n | Arguments: @@ -593,6 +627,7 @@ def get_match(pattern, glycan, return_matches = True): if pattern_matches: traces, used_patterns = trace_path(pattern_matches, ggraph) traces = fill_missing_in_list(traces) + traces = filter_dealbreakers(traces, ggraph, pattern) if traces: return True if not return_matches else format_retrieved_matches(traces, ggraph) else: