Skip to content

Commit

Permalink
minor graph operation optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Mar 8, 2024
1 parent 3d64c6c commit 8c14cc3
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 154 deletions.
144 changes: 67 additions & 77 deletions build/lib/glycowork/motif/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,10 @@ def glycan_to_graph(glycan):
# Integers that are in place of glycoletters go up from 1 character (0-9) to 3 characters (>99)
adjustment = 2 if k >= 100 else 1 if k >= 10 else 0
adjustment2 = 2+adjustment
cache_first_part = glycan_indexes[str(k)]+1
for j in range(k+1, n):
# Subset the part of the glycan that is bookended by k and j
glycan_part = glycan[glycan_indexes[str(k)]+1:glycan_indexes[str(j)]]
glycan_part = glycan[cache_first_part:glycan_indexes[str(j)]]
# Immediately adjacent residues
if evaluate_adjacency(glycan_part, adjustment):
adj_matrix[k, j] = 1
Expand All @@ -73,7 +74,7 @@ def glycan_to_nxGraph_int(glycan, libr = None,
| glycan (string): glycan in IUPAC-condensed format
| libr (dict): dictionary of form glycoletter:index
| termini (string): whether to encode terminal/internal position of monosaccharides, 'ignore' for skipping, 'calc' for automatic annotation, or 'provided' if this information is provided in termini_list; default:'ignore'
| termini_list (list): list of monosaccharide/linkage positions (from 'terminal','internal', and 'flexible')\n
| termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')\n
| Returns:
| :-
| Returns networkx graph object of glycan
Expand Down Expand Up @@ -120,7 +121,7 @@ def glycan_to_nxGraph(glycan, libr = None,
| glycan (string): glycan in IUPAC-condensed format
| libr (dict): dictionary of form glycoletter:index
| termini (string): whether to encode terminal/internal position of monosaccharides, 'ignore' for skipping, 'calc' for automatic annotation, or 'provided' if this information is provided in termini_list; default:'ignore'
| termini_list (list): list of monosaccharide positions (from 'terminal','internal', and 'flexible')\n
| termini_list (list): list of monosaccharide positions (from 'terminal', 'internal', and 'flexible')\n
| Returns:
| :-
| Returns networkx graph object of glycan
Expand Down Expand Up @@ -160,30 +161,28 @@ def categorical_node_match_wildcard(attr, default, narrow_wildcard_list, attr2,
if isinstance(attr, str):

def check_termini(termini1, termini2):
return termini1 == termini2 or 'flexible' in [termini1, termini2]
return termini1 == termini2 or 'flexible' in {termini1, termini2}

def match(data1, data2):
data1_labels2, data2_labels2 = data1.get(attr2, default2), data2.get(attr2, default2)
termini_check = check_termini(data1_labels2, data2_labels2)
if not termini_check:
return False
data1_labels, data2_labels = data1.get(attr, default), data2.get(attr, default)
if "Monosaccharide" in [data1_labels, data2_labels] and not any(['-' in lab for lab in [data1_labels, data2_labels]]):
if "Monosaccharide" in {data1_labels, data2_labels} and not any('-' in lab for lab in {data1_labels, data2_labels}):
return True
if "?1-?" in [data1_labels, data2_labels] and all(['-' in lab for lab in [data1_labels, data2_labels]]):
if "?1-?" in {data1_labels, data2_labels} and all('-' in lab for lab in {data1_labels, data2_labels}):
return True
if data2_labels.startswith('!') and data1_labels != data2_labels[1:] and '-' not in data1_labels:
return True
if data1_labels in narrow_wildcard_list and data2_labels in narrow_wildcard_list[data1_labels]:
return True
elif data2_labels in narrow_wildcard_list and data1_labels in narrow_wildcard_list[data2_labels]:
return True
else:
return data1_labels == data2_labels
return data1_labels == data2_labels
else:
attrs = list(zip(attr, default))
def match(data1, data2):
return all(data1.get(attr, d) == data2.get(attr, d) for attr, d in attrs)
return all(data1.get(a, d) == data2.get(a, d) for a, d in zip(attr, default))
return match


Expand All @@ -198,37 +197,32 @@ def compare_glycans(glycan_a, glycan_b, wildcards_ptm = False):
| :-
| Returns True if two glycans are the same and False if not
"""
if isinstance(glycan_a, str):
if isinstance(glycan_a, str) and isinstance(glycan_b, str):
proc = min_process_glycans([glycan_a, glycan_b])
# Check whether glycan_a and glycan_b have the same length; in theory, the "Monosaccharide" wildcard can interfere with this check
if len(set([len(k) for k in proc])) == 1:
if wildcards_ptm:
glycan_a = re.sub(r"(?<=[a-zA-Z])\d+(?=[a-zA-Z])", 'O', glycan_a).replace('NeuOAc', 'Neu5Ac').replace('NeuOGc', 'Neu5Gc')
glycan_b = re.sub(r"(?<=[a-zA-Z])\d+(?=[a-zA-Z])", 'O', glycan_b).replace('NeuOAc', 'Neu5Ac').replace('NeuOGc', 'Neu5Gc')
glycan_a, glycan_b = [re.sub(r"(?<=[a-zA-Z])\d+(?=[a-zA-Z])", 'O', glycan).replace('NeuOAc', 'Neu5Ac').replace('NeuOGc', 'Neu5Gc') for glycan in [glycan_a, glycan_b]]
proc = set(unwrap(proc))
g1 = glycan_to_nxGraph(glycan_a)
g2 = glycan_to_nxGraph(glycan_b)
g1, g2 = glycan_to_nxGraph(glycan_a), glycan_to_nxGraph(glycan_b)
else:
return False
else:
proc = set(list(nx.get_node_attributes(glycan_a, "string_labels").values()) + list(nx.get_node_attributes(glycan_b, "string_labels").values()))
g1 = glycan_a
g2 = glycan_b
if len(g1.nodes) == len(g2.nodes):
narrow_wildcard_list = {k:[j for j in get_possible_linkages(k)] for k in proc if '?' in k}
narrow_wildcard_list2 = {k:[j for j in get_possible_monosaccharides(k)] for k in proc if k in ['Hex', 'HexNAc', 'dHex', 'Sia', 'HexA', 'Pen', 'Monosaccharide'] or '!' in k}
narrow_wildcard_list = {**narrow_wildcard_list, **narrow_wildcard_list2}
if narrow_wildcard_list:
return nx.is_isomorphic(g1, g2, node_match = categorical_node_match_wildcard('string_labels', 'unknown', narrow_wildcard_list,
'termini', 'flexible'))
else:
# First check whether components of both glycan graphs are identical, then check graph isomorphism (costly)
if sorted(''.join(nx.get_node_attributes(g1, "string_labels").values())) == sorted(''.join(nx.get_node_attributes(g2, "string_labels").values())):
return nx.is_isomorphic(g1, g2, node_match = nx.algorithms.isomorphism.categorical_node_match('string_labels', 'unknown'))
else:
return False
else:
g1, g2 = glycan_a, glycan_b
if len(g1.nodes) != len(g2.nodes):
return False
narrow_wildcard_list = {k:[j for j in get_possible_linkages(k)] for k in proc if '?' in k}
narrow_wildcard_list2 = {k:[j for j in get_possible_monosaccharides(k)] for k in proc if k in {'Hex', 'HexNAc', 'dHex', 'Sia', 'HexA', 'Pen', 'Monosaccharide'} or '!' in k}
narrow_wildcard_list = {**narrow_wildcard_list, **narrow_wildcard_list2}
if narrow_wildcard_list:
return nx.is_isomorphic(g1, g2, node_match = categorical_node_match_wildcard('string_labels', 'unknown', narrow_wildcard_list, 'termini', 'flexible'))
else:
# First check whether components of both glycan graphs are identical, then check graph isomorphism (costly)
if sorted(nx.get_node_attributes(g1, "string_labels").values()) == sorted(nx.get_node_attributes(g2, "string_labels").values()):
return nx.is_isomorphic(g1, g2, node_match = nx.algorithms.isomorphism.categorical_node_match('string_labels', 'unknown'))
else:
return False


def expand_termini_list(motif, termini_list):
Expand All @@ -253,8 +247,7 @@ def expand_termini_list(motif, termini_list):
return t_list


def subgraph_isomorphism(glycan, motif, termini_list = [],
count = False, wildcards_ptm = False,
def subgraph_isomorphism(glycan, motif, termini_list = [], count = False, wildcards_ptm = False,
return_matches = False):
"""returns True if motif is in glycan and False if not\n
| Arguments:
Expand All @@ -269,64 +262,61 @@ def subgraph_isomorphism(glycan, motif, termini_list = [],
| :-
| Returns True if motif is in glycan and False if not
"""
if isinstance(glycan, str):
if isinstance(glycan, str) and isinstance(motif, str):
motif_comp = min_process_glycans([motif, glycan])
if wildcards_ptm:
glycan = re.sub(r"(?<=[a-zA-Z])\d+(?=[a-zA-Z])", 'O', glycan).replace('NeuOAc', 'Neu5Ac').replace('NeuOGc', 'Neu5Gc')
motif = re.sub(r"(?<=[a-zA-Z])\d+(?=[a-zA-Z])", 'O', motif).replace('NeuOAc', 'Neu5Ac').replace('NeuOGc', 'Neu5Gc')
glycan, motif = [re.sub(r"(?<=[a-zA-Z])\d+(?=[a-zA-Z])", 'O', glycan).replace('NeuOAc', 'Neu5Ac').replace('NeuOGc', 'Neu5Gc') for glycan in [glycan, motif]]
g1 = glycan_to_nxGraph(glycan, termini = 'calc') if termini_list else glycan_to_nxGraph(glycan)
g2 = glycan_to_nxGraph(motif, termini = 'provided', termini_list = termini_list) if termini_list else glycan_to_nxGraph(motif)
else:
motif_comp = [nx.get_node_attributes(motif, "string_labels").values(), nx.get_node_attributes(glycan, "string_labels").values()]
g1 = copy.deepcopy(glycan)
g2 = motif
g1, g2 = copy.deepcopy(glycan), motif
narrow_wildcard_list = {k:[j for j in get_possible_linkages(k)] for k in set(unwrap(motif_comp)) if '?' in k}
narrow_wildcard_list2 = {k:[j for j in get_possible_monosaccharides(k)] for k in set(unwrap(motif_comp)) if k in ['Hex', 'HexNAc', 'dHex', 'Sia', 'HexA', 'Pen', 'Monosaccharide'] or '!' in k}
narrow_wildcard_list2 = {k:[j for j in get_possible_monosaccharides(k)] for k in set(unwrap(motif_comp)) if k in {'Hex', 'HexNAc', 'dHex', 'Sia', 'HexA', 'Pen', 'Monosaccharide'} or '!' in k}
narrow_wildcard_list = {**narrow_wildcard_list, **narrow_wildcard_list2}

# Check whether the glycan has at least as many nodes as the motif
if len(g1.nodes) >= len(g2.nodes):
if termini_list or narrow_wildcard_list:
graph_pair = nx.algorithms.isomorphism.GraphMatcher(g1, g2, node_match = categorical_node_match_wildcard('string_labels', 'unknown', narrow_wildcard_list,
'termini', 'flexible'))
if len(g1.nodes) < len(g2.nodes):
return (0, []) if return_matches else 0 if count else False
if termini_list or narrow_wildcard_list:
graph_pair = nx.algorithms.isomorphism.GraphMatcher(g1, g2, node_match = categorical_node_match_wildcard('string_labels', 'unknown', narrow_wildcard_list,
'termini', 'flexible'))
else:
g1_node_attr = set(nx.get_node_attributes(g1, "string_labels").values())
if all(k in g1_node_attr for k in motif_comp[0]):
graph_pair = nx.algorithms.isomorphism.GraphMatcher(g1, g2, node_match = nx.algorithms.isomorphism.categorical_node_match('string_labels', 'unknown'))
else:
g1_node_attr = set(nx.get_node_attributes(g1, "string_labels").values())
if all(k in g1_node_attr for k in motif_comp[0]):
graph_pair = nx.algorithms.isomorphism.GraphMatcher(g1, g2, node_match = nx.algorithms.isomorphism.categorical_node_match('string_labels', 'unknown'))
return 0 if count else False

if return_matches:
mappings = [list(isos.keys()) for isos in graph_pair.subgraph_isomorphisms_iter()]

# Count motif occurrence
if count:
counts = 0
while graph_pair.subgraph_is_isomorphic():
mapping = graph_pair.mapping
mapping = {v: k for k, v in mapping.items()}
if all(mapping[node] < mapping[neighbor] for node, neighbor in g2.edges()):
counts += 1
g1.remove_nodes_from(graph_pair.mapping.keys())
if termini_list or narrow_wildcard_list:
graph_pair = nx.algorithms.isomorphism.GraphMatcher(g1, g2, node_match = categorical_node_match_wildcard('string_labels', 'unknown', narrow_wildcard_list,
'termini', 'flexible'))
else:
return 0 if count else False

if return_matches:
mappings = [list(isos.keys()) for isos in graph_pair.subgraph_isomorphisms_iter()]

# Count motif occurrence
if count:
counts = 0
while graph_pair.subgraph_is_isomorphic():
mapping = graph_pair.mapping
mapping = {v: k for k, v in mapping.items()}
if all(mapping[node] < mapping[neighbor] for node, neighbor in g2.edges()):
counts += 1
g1.remove_nodes_from(graph_pair.mapping.keys())
if termini_list or narrow_wildcard_list:
graph_pair = nx.algorithms.isomorphism.GraphMatcher(g1, g2, node_match = categorical_node_match_wildcard('string_labels', 'unknown', narrow_wildcard_list,
'termini', 'flexible'))
g1_node_attr = set(nx.get_node_attributes(g1, "string_labels").values())
if all(k in g1_node_attr for k in motif_comp[0]):
graph_pair = nx.algorithms.isomorphism.GraphMatcher(g1, g2, node_match = nx.algorithms.isomorphism.categorical_node_match('string_labels', 'unknown'))
else:
g1_node_attr = set(nx.get_node_attributes(g1, "string_labels").values())
if all(k in g1_node_attr for k in motif_comp[0]):
graph_pair = nx.algorithms.isomorphism.GraphMatcher(g1, g2, node_match = nx.algorithms.isomorphism.categorical_node_match('string_labels', 'unknown'))
else:
return counts if not return_matches else (counts, mappings)
return counts if not return_matches else (counts, mappings)
else:
if graph_pair.subgraph_is_isomorphic():
mapping = graph_pair.mapping
mapping = {v: k for k, v in mapping.items()}
res = all(mapping[node] < mapping[neighbor] for node, neighbor in g2.edges())
return res if not return_matches else (int(res), mappings)
return False if not return_matches else (0, [])
return counts if not return_matches else (counts, mappings)
return counts if not return_matches else (counts, mappings)
else:
return (0, []) if return_matches else 0 if count else False
if graph_pair.subgraph_is_isomorphic():
mapping = graph_pair.mapping
mapping = {v: k for k, v in mapping.items()}
res = all(mapping[node] < mapping[neighbor] for node, neighbor in g2.edges())
return res if not return_matches else (int(res), mappings)
return False if not return_matches else (0, [])


def generate_graph_features(glycan, glycan_graph = True, label = 'network'):
Expand Down
Loading

0 comments on commit 8c14cc3

Please sign in to comment.