Skip to content

Commit

Permalink
last tweaking of branch canonicalization
Browse files: browse the repository at this point in the history
  • Loading branch information
Bribak committed Feb 18, 2025
1 parent 9ba7f60 commit 8f80e89
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 11 deletions.
27 changes: 16 additions & 11 deletions glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
replace_dic = {'Nac': 'NAc', 'AC': 'Ac', 'Nc': 'NAc', 'Nue': 'Neu', 'NeuAc': 'Neu5Ac', 'NeuNAc': 'Neu5Ac', 'NeuGc': 'Neu5Gc',
'α': 'a', 'β': 'b', 'N(Gc)': 'NGc', 'GL': 'Gl', 'GaN': 'GalN', '(9Ac)': '9Ac', '5,9Ac2': '5Ac9Ac', '4,5Ac2': '4Ac5Ac',
'KDN': 'Kdn', 'OSO3': 'S', '-O-Su-': 'S', '(S)': 'S', 'SO3-': 'S', 'SO3(-)': 'S', 'H2PO3': 'P', '(P)': 'P', 'L-6dGal': 'Fuc',
'–': '-', ' ': '', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-', '*': '', 'Ga(': 'Gal(', 'aa': 'a', 'bb': 'b',
'–': '-', ' ': '', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-', '*': '', 'Ga(': 'Gal(', 'aa': 'a', 'bb': 'b', 'Pc': 'PCho',
'Glcp': 'Glc', 'Galp': 'Gal', 'Manp': 'Man', 'Fucp': 'Fuc', 'Neup': 'Neu', 'a?': 'a1',
'5Ac4Ac': '4Ac5Ac', '(-)': '(?1-?)', '(?-?)': '(?1-?)', '?-?)': '1-?)', '5ac': '5Ac', '-_': '-?'}
CANONICALIZE = re.compile('|'.join(map(re.escape, list(replace_dic.keys()))))
Expand Down Expand Up @@ -186,9 +186,10 @@ def find_isomorphs(glycan: str # Glycan in IUPAC-condensed format
temp = set()
# Starting branch swapped with next side branch again to also include double branch swapped isomorphs
for k in out_list:
temp.add(re.sub(r'^(.*?)\[((?:[^\[\]]|\[[^\[\]]*\])*?)\]((?:\[[^\[\]]*\])*?)([A-Z][a-z0-9]*[A-Z]?[a-z0-9]*\([ab?][1-2]-[1-9?]\))', r'\2[\1]\3\4', k, 1))
if k.count('[') > 1 and k.index('[') > 0 and find_nth(k, '[', 2) > k.index(']') and (find_nth(k, ']', 2) < find_nth(k, '[', 3) or k.count('[') == 2):
temp.add(re.sub(r'^(.*?)\[(.*?)\](.*?)\[(.*?)\]', r'\4[\1[\2]\3]', k, 1))
if k.index(']') > k.index('['):
temp.add(re.sub(r'^(.*?)\[((?:[^\[\]]|\[[^\[\]]*\])*?)\]((?:\[[^\[\]]*\])*?)([A-Z][a-z0-9]*[A-Z]?[a-z0-9]*\([ab?][1-2]-[1-9?]\))', r'\2[\1]\3\4', k, 1))
if k.count('[') > 1 and k.index('[') > 0 and find_nth(k, '[', 2) > k.index(']') and (find_nth(k, ']', 2) < find_nth(k, '[', 3) or k.count('[') == 2):
temp.add(re.sub(r'^(.*?)\[(.*?)\](.*?)\[(.*?)\]', r'\4[\1[\2]\3]', k, 1))
out_list.update(temp)
out_list = {k for k in out_list if not any([j in k for j in ['[[', ']]']]) and k.index(']') > k.index('[')}
if floaty:
Expand Down Expand Up @@ -237,7 +238,7 @@ def choose_correct_isoform(glycans: Union[List[str], str], # Glycans in IUPAC-co
"Given a list of glycan branch isomers, returns the correct isomer"
if isinstance(glycans, str):
glycans = find_isomorphs(glycans)
glycans = list(set(glycans))
glycans = sorted(list(set(glycans)))
if '?' in ''.join(glycans) and not reverse:
min_questions = min(glycan.count('?') for glycan in glycans)
glycans = [glycan for glycan in glycans if glycan.count('?') == min_questions]
Expand Down Expand Up @@ -294,6 +295,9 @@ def kill_noncanonical(glycans):
kill_list.add(g)
current_count = paren_counts[-2] # Return to parent branch's count
paren_counts.pop() # Remove current branch level
if '][' in g:
if any(sum(1 for c in left if c == '(') < sum(1 for c in right if c == '(') for match in re.finditer(r'\[([^\[\]]*)\]\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]', g) if (left := match.group(1)) and (right := match.group(2))):
kill_list.add(g)
return [g for g in glycans if g not in kill_list]

# Handle neighboring branches
Expand Down Expand Up @@ -350,13 +354,14 @@ def kill_noncanonical(glycans):
prefix = [min_process_glycans([k[j[0]:j[1]] for j in v]) for k, v in candidates.items()]
prefix = [np.argmax([len(j) for j in k]) for k in prefix]
prefix = [k[:candidates[k][prefix[i]][0]] for i, k in enumerate(candidates.keys())]
prefix = [k[:k.rfind('[')] if k.endswith('][') else k for k in prefix]
for i, p in enumerate(prefix):
if p.endswith(']'):
prefix[i] = p[:p.rfind('[')]
prefix = min_process_glycans(prefix)
branch_endings = [k[-1][-1] if k[-1][-1].isdigit() else 10 for k in prefix]
prefix_proc = min_process_glycans(prefix)
branch_endings = [k[-1][-1] if k[-1][-1].isdigit() else 10 for k in prefix_proc]
if len(set(branch_endings)) == 1:
branch_endings = [ord(k[0][0]) for k in prefix]
branch_endings = [k[k.index('[')-2] if k[k.index('[')-2].isdigit() else 10 for k in prefix] if ']' in prefix[0] else [ord(k[0][0]) for k in prefix_proc]
branch_endings = [int(k) for k in branch_endings]
min_ending = min(branch_endings)
glycans2 = [g for k, g in enumerate(glycans2) if branch_endings[k] == min_ending]
Expand Down Expand Up @@ -896,8 +901,8 @@ def glycoworkbench_to_iupac(glycan: str # Glycan in GlycoWorkBench nomenclature
converted_glycan = branch_str[:second_brack_end] + ']' + branch_str[second_brack_end:-1] + converted_glycan[double_brack_idx+2:]
if floaty_parts: # Add floating parts to final structure
converted_glycan = ''.join(f"{{{part}}}" for part in floaty_parts) + converted_glycan
converted_glycan = re.sub(r'S[\)\(]*\?1-\?\)\[(.*?)\]([^(]+)', r'\1\2OS', converted_glycan) # O-sulfate case
converted_glycan = re.sub(r'S[\)\(]*\?1-(\d)\)\[(.*?)\]([^(]+)', r'\2\3\1S', converted_glycan) # numbered sulfate
converted_glycan = re.sub(r'([SP])[\)\(]*\?1-\?\)\[(.*?)\]([^(]+)', r'\2\3O\1', converted_glycan) # O-sulfate/phosphate case
converted_glycan = re.sub(r'([SP])[\)\(]*\?1-(\d)\)\[(.*?)\]([^(]+)', r'\3\4\2\1', converted_glycan) # numbered sulfate/phosphate
return f"{converted_glycan[:-6]}-ol" if 'freeEnd' in glycan else converted_glycan[:-6]


Expand Down Expand Up @@ -974,7 +979,7 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
if bool(re.search(r"^[^2-6]*1?[^2-6]*$", glycan)):
glycan = re.sub(r'(a|b)(\d)(?!\-)', r'\g<1>\g<2>-?', glycan)
else:
glycan = re.sub(r'(a|b)(\d)(?!\-)', r'\g<1>1-\g<2>', glycan)
glycan = re.sub(r'(?<!h)(a|b)(\d)(?!\-)', r'\g<1>1-\g<2>', glycan)
# Introduce parentheses for linkages
if '(' not in glycan and len(glycan) > 6:
for k in range(1, glycan.count('-')+1):
Expand Down
4 changes: 4 additions & 0 deletions tests/test_core_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,7 @@ def test_canonicalize_iupac():
assert canonicalize_iupac("Fucα2Galβ1-4GlcNAcβ1-3(NeuAcα2-3Galβ1-4GlcNAcβ1-6)Galβ1-4GlcNAcol") == "Fuc(a1-2)Gal(b1-4)GlcNAc(b1-3)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)]Gal(b1-4)GlcNAc"
assert canonicalize_iupac("Fucα1-2Galβ1-4GlcNAcβ1-3(Fucα1-2Galβ1-4GlcNAcβ1-6)Galβ1-4GlcNAcβ1-3Galβ1-4Glcβ-") == "Fuc(a1-2)Gal(b1-4)GlcNAc(b1-3)[Fuc(a1-2)Gal(b1-4)GlcNAc(b1-6)]Gal(b1-4)GlcNAc(b1-3)Gal(b1-4)Glc"
assert canonicalize_iupac("Neu5Aca2-3Galb1-3{Neu5Aca2-6}GalNAc") == "Neu5Ac(a2-3)Gal(b1-3)[Neu5Ac(a2-6)]GalNAc"
assert canonicalize_iupac("Gal(a1-2)[Man(a1-3)D-Rha(a1-3)][Rha2Me3Me(a1-2)D-Ara(b1-3)Rha(b1-4)Xyl(b1-4)]Fuc(a1-3)[Xyl(b1-4)]Glc") == "Rha2Me3Me(a1-2)D-Ara(b1-3)Rha(b1-4)Xyl(b1-4)[Man(a1-3)D-Rha(a1-3)][Gal(a1-2)]Fuc(a1-3)[Xyl(b1-4)]Glc"
# Test linkage uncertainty
assert canonicalize_iupac("Gal-GlcNAc") == "Gal(?1-?)GlcNAc"
assert canonicalize_iupac("Gal(b1-3/4)Gal(b1-4)GlcNAc") == "Gal(b1-3/4)Gal(b1-4)GlcNAc"
Expand All @@ -742,6 +743,9 @@ def test_canonicalize_iupac():
assert canonicalize_iupac("Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)[Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc") == "Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc"
assert canonicalize_iupac("Galβ1–4GlcNAcβ1–2(Fucα1–3(Galβ1–4)GlcNAcβ1–4)Manα1–3(Galβ1–4GlcNAcβ1–2(Galβ1–4GlcNAcβ1–6)Manα1–6)Manβ1–4GlcNAcβ1–4GlcNAc") == "Gal(b1-4)GlcNAc(b1-2)[Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-4)]Man(a1-3)[Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
assert canonicalize_iupac("Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-2)[GlcNAc(b1-4)]Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc") == "Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-2)[GlcNAc(b1-4)]Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
assert canonicalize_iupac("Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc") == "Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
assert canonicalize_iupac("GalNAc(b1-4)[Neu5Ac(a2-3)]Gal(b1-4)GlcNAc(b1-6)[Fuc(a1-2)[GalNAc(a1-3)]Gal(b1-3)]GalNAc") == "Neu5Ac(a2-3)[GalNAc(b1-4)]Gal(b1-4)GlcNAc(b1-6)[Fuc(a1-2)[GalNAc(a1-3)]Gal(b1-3)]GalNAc"
assert canonicalize_iupac("Gal(b1-3)[Fuc(a1-4)]GlcNAc(b1-2)Man(a1-3)[Xyl(b1-2)][Gal(b1-3)[Fuc(a1-4)]GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc") == "Gal(b1-3)[Fuc(a1-4)]GlcNAc(b1-2)Man(a1-3)[Gal(b1-3)[Fuc(a1-4)]GlcNAc(b1-2)Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc"
# Test other nomenclatures
assert canonicalize_iupac("DManpa1-6DManpb1-4DGlcpNAcb1-4[LFucpa1-6]DGlcpNAcb1-OH") == "Man(a1-6)Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc"
assert canonicalize_iupac("Neup5Aca2-3DGalpb1-4DGlcpNAcb1-3DGalpb1-3DGalpb1-4DGlcpb1-OH") == "Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-3)Gal(b1-3)Gal(b1-4)Glc"
Expand Down

0 comments on commit 8f80e89

Please sign in to comment.