Skip to content

Commit

Permalink
more dogfeeding in GlycoDraw
Browse files Browse the repository at this point in the history
- added count_nested_brackets
- Deprecated multiple_branches, multiple_branch_branches, reorder_for_drawing, branch_order
- switched get_coordinates_and_labels to using choose_correct_isoform
- made find_isomorphs and choose_correct_isoform more general
  • Loading branch information
Bribak committed Nov 26, 2024
1 parent 4f1ccfa commit 41bb1a1
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 154 deletions.
12 changes: 12 additions & 0 deletions glycowork/glycan_data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,3 +335,15 @@ def deserialize(cls, path: str # file path to load serialized data
)

serializer = DataFrameSerializer()

def count_nested_brackets(s):
count = 0
depth = 0
for c in s:
if c == '[':
if depth > 0: # Only count if we're already inside brackets
count += 1
depth += 1
elif c == ']':
depth -= 1
return count
23 changes: 6 additions & 17 deletions glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,26 +122,21 @@ def get_molecular_properties(
import pubchempy as pcp
except ImportError:
raise ImportError("You must install the 'chem' dependencies to use this feature. Try 'pip install glycowork[chem]'.")
smiles_list = IUPAC_to_SMILES(glycan_list)
if placeholder:
dummy = IUPAC_to_SMILES(['Glc'])[0]
compounds_list, failed_requests = [], []
for s in smiles_list:
for s in IUPAC_to_SMILES(glycan_list):
try:
c = pcp.get_compounds(s, 'smiles')[0]
if c.cid is None:
if placeholder:
compounds_list.append(pcp.get_compounds(dummy, 'smiles')[0])
else:
failed_requests.append(s)
compounds_list.append(pcp.get_compounds(dummy, 'smiles')[0]) if placeholder else failed_requests.append(s)
else:
compounds_list.append(c)
except:
failed_requests.append(s)
if verbose and len(failed_requests) >= 1:
print('The following SMILES were not found on PubChem:')
for failed in failed_requests:
print(failed)
print('The following SMILES were not found on PubChem:\n')
print(failed_requests)
df = pcp.compounds_to_frame(compounds_list, properties = ['molecular_weight', 'xlogp',
'charge', 'exact_mass', 'monoisotopic_mass', 'tpsa', 'complexity',
'h_bond_donor_count', 'h_bond_acceptor_count',
Expand Down Expand Up @@ -362,15 +357,9 @@ def get_k_saccharides(
out_matrix = out_matrix.T.groupby(by = out_matrix.columns).sum().T
if up_to:
combined_df= pd.concat([wga_letter, out_matrix], axis = 1).fillna(0).astype(int)
if just_motifs:
return combined_df.apply(lambda x: list(combined_df.columns[x > 0]), axis = 1).tolist()
else:
return combined_df
return combined_df.apply(lambda x: list(combined_df.columns[x > 0]), axis = 1).tolist() if just_motifs else combined_df
else:
if just_motifs:
return out_matrix.apply(lambda x: list(out_matrix.columns[x > 0]), axis = 1).tolist()
else:
return out_matrix.fillna(0).astype(int)
return out_matrix.apply(lambda x: list(out_matrix.columns[x > 0]), axis = 1).tolist() if just_motifs else out_matrix.fillna(0).astype(int)


def get_terminal_structures(
Expand Down
104 changes: 1 addition & 103 deletions glycowork/motif/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,104 +523,6 @@ def add_sugar(
drawing.append(p)


def multiple_branches(
glycan: str # IUPAC-condensed glycan sequence
) -> str: # Reordered glycan sequence
"Reorders multiple branches in glycan by linkage positions"
if ']' not in glycan:
return glycan
tmp = multireplace(glycan, {'(': '*', ')': '*', '[': '(', ']': ')'})
open_close = [(openpos, closepos) for openpos, closepos, level in matches(tmp) if level == 0 and not re.search(r"^Fuc\S{6}$", tmp[openpos:closepos])]
for k in range(len(open_close)-1):
if open_close[k + 1][0] - open_close[k][1] != 2:
continue
branch1, branch2 = glycan[open_close[k][0]:open_close[k][1]], glycan[open_close[k + 1][0]:open_close[k + 1][1]]
tmp1, tmp2 = branch1[-2], branch2[-2]
if tmp1 == tmp2 == '?':
if len(branch1) > len(branch2):
tmp1, tmp2 = [1, 2]
else:
tmp1, tmp2 = [2, 1]
if tmp1 > tmp2:
glycan = f"{glycan[:open_close[k][0]]}{branch2}][{branch1}{glycan[open_close[k + 1][1]:]}"
return glycan


def multiple_branch_branches(
glycan: str # IUPAC-condensed glycan sequence
) -> str: # Reordered glycan sequence
"Reorders nested multiple branches in glycan by linkage positions"
if ']' not in glycan:
return glycan
tmp = multireplace(glycan, {'(': '*', ')': '*', '[': '(', ']': ')'})
for openpos, closepos, level in matches(tmp):
if level == 0 and not re.search(r"^Fuc\S{6}$", tmp[openpos:closepos]):
glycan = glycan[:openpos] + multiple_branches(glycan[openpos:closepos]) + glycan[closepos:]
return glycan


def reorder_for_drawing(
glycan: str, # IUPAC-condensed glycan sequence
by: str = 'linkage' # Ordering criterion: 'linkage' or 'length'
) -> str: # Reordered glycan sequence
"Orders level 0 branches ascending by linkage position or length"
if ']' not in glycan:
return glycan
tmp = multireplace(glycan, {'(': '*', ')': '*', '[': '(', ']': ')'})
for openpos, closepos, level in matches(tmp):
if level == 0 and not re.search(r"^Fuc\S{6}$|^Xyl\S{6}$", tmp[openpos:closepos]):
# Nested branches
glycan = glycan[:openpos] + branch_order(glycan[openpos:closepos]) + glycan[closepos:]
# Branches
group1 = glycan[:openpos-1]
group2 = glycan[openpos:closepos]
branch_end = [j[-2] for j in [group1, group2]]
if by == 'length':
branch_len = [len(k) for k in min_process_glycans([group1, group2])]
if branch_len[0] == branch_len[1]:
if branch_end[0] == branch_end[1]:
glycan = group1 + '[' + group2 + ']' + glycan[closepos+1:]
else:
glycan = [group1, group2][np.argmin(branch_end)] + '[' + [group1, group2][np.argmax(branch_end)] + ']' + glycan[closepos+1:]
else:
glycan = [group1, group2][np.argmax(branch_len)] + '[' + [group1, group2][np.argmin(branch_len)] + ']' + glycan[closepos+1:]
elif by == 'linkage':
if branch_end[0] == branch_end[1]:
glycan = group1 + '[' + group2 + ']' + glycan[closepos+1:]
else:
glycan = [group1, group2][np.argmin(branch_end)] + '[' + [group1, group2][np.argmax(branch_end)] + ']' + glycan[closepos+1:]
return glycan


def branch_order(
glycan: str, # IUPAC-condensed glycan sequence
by: str = 'linkage' # Ordering criterion: 'linkage' or 'length'
) -> str: # Reordered glycan sequence
"Orders nested branches ascending by linkage position or length"
tmp = multireplace(glycan, {'(': '*', ')': '*', '[': '(', ']': ')'})
for openpos, closepos, level in matches(tmp):
if level == 0 and not re.search(r"^Fuc\S{6}$|^Xyl\S{6}$", tmp[openpos:closepos]):
group1 = glycan[:openpos-1]
group2 = glycan[openpos:closepos]
branch_end = [j[-2] for j in [group1, group2]]
branch_end = [k.replace('z', '9') for k in branch_end]
if by == 'length':
branch_len = [len(k) for k in min_process_glycans([group1, group2])]
if branch_len[0] == branch_len[1]:
if branch_end[0] == branch_end[1]:
glycan = group1 + '[' + group2 + ']' + glycan[closepos+1:]
else:
glycan = [group1, group2][np.argmin(branch_end)] + '[' + [group1, group2][np.argmax(branch_end)] + ']' + glycan[closepos+1:]
else:
glycan = [group1, group2][np.argmax(branch_len)] + '[' + [group1, group2][np.argmin(branch_len)] + ']' + glycan[closepos+1:]
elif by == 'linkage':
if branch_end[0] == branch_end[1]:
glycan = group1 + '[' + group2 + ']' + glycan[closepos+1:]
else:
glycan = [group1, group2][np.argmin(branch_end)] + '[' + [group1, group2][np.argmax(branch_end)] + ']' + glycan[closepos+1:]
return glycan


def split_node(
G: nx.Graph, # NetworkX glycan graph
node: int # Node to split
Expand Down Expand Up @@ -744,11 +646,7 @@ def get_coordinates_and_labels(
) -> List[List]: # Drawing coordinates and labels (monosaccharide label, x position, y position, modification, bond, conformation)
"Calculates drawing coordinates and formats labels for glycan visualization"
if not draw_this.startswith('['):
#draw_this = choose_correct_isoform(draw_this, order_by = "linkage") # one day
draw_this = multiple_branches(multiple_branch_branches(draw_this))
if not ('[GlcNAc(b1-4)]' in draw_this and not 'Man(a1-3)' in draw_this):
draw_this = reorder_for_drawing(draw_this)
draw_this = multiple_branch_branches(multiple_branches(draw_this))
draw_this = choose_correct_isoform(draw_this, order_by = "linkage")

graph = glycan_to_nxGraph(draw_this, termini = 'calc') if termini_list else glycan_to_nxGraph(draw_this)
graph = get_highlight_attribute(graph, highlight_motif, termini_list = termini_list)
Expand Down
76 changes: 57 additions & 19 deletions glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from collections import defaultdict
from itertools import permutations
from typing import Dict, List, Set, Union, Optional, Callable, Tuple
from glycowork.glycan_data.loader import (unwrap, multireplace,
from glycowork.glycan_data.loader import (unwrap, multireplace, count_nested_brackets,
find_nth, find_nth_reverse, lib, HexOS, HexNAcOS,
linkages, Hex, HexNAc, dHex, Sia, HexA, Pen)

Expand Down Expand Up @@ -122,7 +122,8 @@ def find_isomorphs(glycan: str # Glycan in IUPAC-condensed format
elif not bool(re.search(r'\[[^\]]+\[', glycan[find_nth(glycan, ']', 2):])) and bool(re.search(r'\[[^\]]+\[', glycan[:find_nth(glycan, '[', 3)])):
out_list.add(re.sub(r'^(.*?)\[(.*?)(\])((?:[^\[\]]|\[[^\[\]]*\])*)\]', r'\2\3\4[\1]', glycan, 1))
# Double branch swap
temp = {re.sub(r'\[([^[\]]+)\]\[([^[\]]+)\]', r'[\2][\1]', k) for k in out_list if '][' in k}
temp = {re.sub(r'(?<!\[)\[((?:[^[\]]|\[[^[\]]*\])*?)\]\[((?:[^[\]]|\[[^[\]]*\])*?)\]', r'[\2][\1]', k) for k in out_list if '][' in k}
#temp = {re.sub(r'\[([^[\]]+)\]\[([^[\]]+)\]', r'[\2][\1]', k) for k in out_list if '][' in k}
out_list.update(temp)
# Triple branch handling
temp = set()
Expand Down Expand Up @@ -211,16 +212,40 @@ def choose_correct_isoform(glycans: Union[List[str], str], # Glycans in IUPAC-co
glycans2 = [g for k, g in enumerate(glycans) if mains[k] == max_mains]
else: # order_by == 'linkage'
glycans2 = glycans.copy()

def is_special_single_branch(branch: str) -> bool:
"""Check if a single-monosaccharide branch contains GlcNAc(b1-4) or Xyl(b1-2)"""
return branch.count('(') == 1 and ('GlcNAc(b1-4)' in branch or 'Xyl(b1-2)' in branch)

def compare_branches(branch1: str, branch2: str, use_linkage: bool = False) -> bool:
"""Compare two branches and return True if they're in wrong order.
use_linkage=True forces linkage-based comparison regardless of branch types."""
count1, count2 = branch1.count('('), branch2.count('(')
pos1 = int(branch1[-2] if branch1[-2].isdigit() else '9')
pos2 = int(branch2[-2] if branch2[-2].isdigit() else '9')
if use_linkage:
return pos1 > pos2
# If either branch is a single monosaccharide
if count1 == 1 or count2 == 1:
# If either is a special single branch, use linkage ordering
if is_special_single_branch(branch1) or is_special_single_branch(branch2):
return pos1 > pos2
# Non-special single branches: use length ordering
return (count1 < count2) or (count1 == count2 and pos1 > pos2)
# Neither is single monosaccharide: use linkage ordering
return pos1 > pos2

# Handle neighboring branches
kill_list = set()
for g in glycans2:
if '][' in g:
# First try triple branches
triple_match = re.search(r'\[([^[\]]+)\]\[([^[\]]+)\]\[([^[\]]+)\]', g)
if triple_match:
branches = list(triple_match.groups())
if order_by == 'length':
counts = [m.count('(') for m in triple_match.groups()]
positions = [int(m[-2]) if m[-2].isdigit() else 9 for m in triple_match.groups()]
counts = [m.count('(') for m in branches]
positions = [int(m[-2]) if m[-2].isdigit() else 9 for m in branches]
# Check if branches are not in descending order by length
for i in range(len(counts)-1):
if counts[i] < counts[i+1]:
Expand All @@ -231,41 +256,52 @@ def choose_correct_isoform(glycans: Union[List[str], str], # Glycans in IUPAC-co
kill_list.add(g)
break
else: # order_by == 'linkage'
positions = [int(m[-2]) if m[-2].isdigit() else 9 for m in triple_match.groups()]
if not all(positions[i] <= positions[i+1] for i in range(len(positions)-1)):
kill_list.add(g)
for i in range(len(branches)-1):
if compare_branches(branches[i], branches[i+1]):
kill_list.add(g)
break
else:
# Fall back to pair checking
pair_match = re.search(r'\[([^[\]]+)\]\[([^[\]]+)\]', g)
if pair_match:
branch1, branch2 = pair_match.group(1), pair_match.group(2)
if order_by == 'length':
count1, count2 = pair_match.group(1).count('('), pair_match.group(2).count('(')
count1, count2 = branch1.count('('), branch2.count('(')
pos1 = int(branch1[-2] if branch1[-2].isdigit() else '9')
pos2 = int(branch2[-2] if branch2[-2].isdigit() else '9')
if count1 < count2:
kill_list.add(g)
elif count1 == count2 and int(pair_match.group(1)[-2] if pair_match.group(1)[-2].isdigit() else '9') > int(pair_match.group(2)[-2] if pair_match.group(2)[-2].isdigit() else '9'):
elif count1 == count2 and pos1 > pos2:
kill_list.add(g)
else: # order_by == 'linkage'
pos1 = int(pair_match.group(1)[-2] if pair_match.group(1)[-2].isdigit() else '9')
pos2 = int(pair_match.group(2)[-2] if pair_match.group(2)[-2].isdigit() else '9')
if pos1 > pos2:
if compare_branches(branch1, branch2):
kill_list.add(g)
if order_by == "linkage":
for g in glycans2:
if g[:g.index('[')].count('(') == 1 and g[g.index('['):g.index(']')].count('(') > 1:
kill_list.add(g)
glycans2 = [k for k in glycans2 if k not in kill_list]
# Choose the isoform with the longest main chain before the branch & or the branch ending in the smallest number if all lengths are equal
if len(glycans2) > 1:
candidates = {k: find_matching_brackets_indices(k) for k in glycans2}
if order_by == 'length':
prefix = [min_process_glycans([k[j[0]+1:j[1]] for j in candidates[k]]) for k in candidates.keys()]
prefix = [np.argmax([len(j) for j in k]) for k in prefix]
prefix = min_process_glycans([k[:candidates[k][prefix[i]][0]] for i, k in enumerate(candidates.keys())])
else: # order_by == 'linkage'
prefix = min_process_glycans([k[:candidates[k][0][0]] for k in candidates.keys()])
prefix = [min_process_glycans([k[j[0]+1:j[1]] for j in v]) for k, v in candidates.items()]
prefix = [np.argmax([len(j) for j in k]) for k in prefix]
prefix = [k[:candidates[k][prefix[i]][0]] for i, k in enumerate(candidates.keys())]
for i, p in enumerate(prefix):
if p.endswith(']'):
prefix[i] = p[:p.rfind('[')]
prefix = min_process_glycans(prefix)
branch_endings = [k[-1][-1] if k[-1][-1].isdigit() else 10 for k in prefix]
if len(set(branch_endings)) == 1:
branch_endings = [ord(k[0][0]) for k in prefix]
branch_endings = [int(k) for k in branch_endings]
min_ending = min(branch_endings)
glycans2 = [g for k, g in enumerate(glycans2) if branch_endings[k] == min_ending]
if len(glycans2) > 1:
complexity = [count_nested_brackets(g) for g in glycans2]
min_complexity = min(complexity)
glycans2 = [g for k, g in enumerate(glycans2) if complexity[k] == min_complexity]
if len(glycans2) > 1:
preprefix = min_process_glycans([glyc[:glyc.index('[')] for glyc in glycans2])
branch_endings = [k[-1][-1] if k[-1][-1].isdigit() else 10 for k in preprefix]
branch_endings = [int(k) for k in branch_endings]
Expand All @@ -275,8 +311,10 @@ def choose_correct_isoform(glycans: Union[List[str], str], # Glycans in IUPAC-co
correct_isoform = sorted(glycans2)[0]
else:
correct_isoform = glycans2[0]
else:
else:
correct_isoform = glycans2[0]
else:
correct_isoform = glycans2[0]
else:
correct_isoform = glycans2[0]
if floaty:
Expand Down
46 changes: 31 additions & 15 deletions tests/test_core_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
)
from glycowork.glycan_data.loader import (
unwrap, find_nth, find_nth_reverse, remove_unmatched_brackets,
reindex, stringify_dict, replace_every_second, multireplace,
reindex, stringify_dict, replace_every_second, multireplace, count_nested_brackets,
strip_suffixes, build_custom_df, DataFrameSerializer, Hex, linkages
)
from glycowork.glycan_data.stats import (
Expand Down Expand Up @@ -79,8 +79,7 @@
from glycowork.motif.draw import (matches, process_bonds, split_monosaccharide_linkage, draw_hex, split_node,
scale_in_range, glycan_to_skeleton, process_per_residue, col_dict_base,
get_hit_atoms_and_bonds, add_colours_to_map, unique, is_jupyter, process_repeat, draw_bracket,
display_svg_with_matplotlib, multiple_branch_branches,
get_coordinates_and_labels, get_highlight_attribute, add_sugar, add_bond, draw_shape,
display_svg_with_matplotlib, get_coordinates_and_labels, get_highlight_attribute, add_sugar, add_bond, draw_shape,
process_bonds, draw_chem2d, draw_chem3d, GlycoDraw, plot_glycans_excel, annotate_figure, get_indices
)
from glycowork.motif.analysis import (preprocess_data, get_pvals_motifs, select_grouping, get_glycanova, get_differential_expression,
Expand Down Expand Up @@ -1009,6 +1008,35 @@ def test_dataframe_serializer():
assert result == ['1', '2', '3']


def test_count_nested_brackets():
# Basic cases
assert count_nested_brackets('') == 0
assert count_nested_brackets('[]') == 0
assert count_nested_brackets('[a]') == 0
assert count_nested_brackets('[[]]') == 1
assert count_nested_brackets('[[][]]') == 2
assert count_nested_brackets('[a][b]') == 0 # Parallel brackets don't count
# Complex nested cases
assert count_nested_brackets('[a[b][c]]') == 2
assert count_nested_brackets('[a[b[c]]]') == 2
assert count_nested_brackets('[a][b[c][d]]') == 2
# Real glycan examples
glycan1 = '[Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc]'
assert count_nested_brackets(glycan1) == 1
glycan2 = '[a[b[c]][d[e]]]'
assert count_nested_brackets(glycan2) == 4
# Edge cases
assert count_nested_brackets('[][][]') == 0
assert count_nested_brackets('[[[][]]]') == 3


def test_real_glycan_structures():
s1 = 'Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-4)[Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)]Man(a1-3)[Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)][GlcNAc(b1-4)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc'
s2 = "Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)][GlcNAc(b1-4)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc"
# These should have different nested bracket counts
assert count_nested_brackets(s1) != count_nested_brackets(s2)


def test_fast_two_sum():
# Test basic addition
assert fast_two_sum(5, 3) == [8]
Expand Down Expand Up @@ -2494,18 +2522,6 @@ def show():
plt = old_plt


def test_multiple_branch_branches():
# Test nested branch reordering
input_str = "GlcNAc(b1-4)[Fuc(a1-2)[Fuc(a1-3)]Fuc(a1-6)]GlcNAc"
result = multiple_branch_branches(input_str)
assert "[Fuc(a1-3)]" in result
# Test multiple nested branches
input_str = "GlcNAc(b1-4)[Fuc(a1-2)[Fuc(a1-3)][Fuc(a1-6)]Fuc(a1-6)]GlcNAc"
result = multiple_branch_branches(input_str)
assert "[Fuc(a1-6)]" in result
assert "[Fuc(a1-3)]" in result


def test_draw_shape_hex():
# Test basic hexose drawing
mock_drawing = get_clean_drawing()
Expand Down

0 comments on commit 41bb1a1

Please sign in to comment.