expand get_terminal_structures to size2 if desired

- also add some linkage variations to canonicalize_iupac
BojarLab · Mar 4, 2024 · 94f1dff · 94f1dff
1 parent 287a1cb
commit 94f1dff
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 16 deletions.
diff --git a/build/lib/glycowork/motif/annotate.py b/build/lib/glycowork/motif/annotate.py
@@ -140,8 +140,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
   | glycans (list): list of IUPAC-condensed glycan sequences as strings
   | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
-  |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-  |   'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs of size 1), \
+  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
   | termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
   | condense (bool): if True, throws away columns with only zeroes; default:False
   | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n
@@ -180,8 +180,13 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
     shopping_cart.append(temp)
   if 'chemical' in feature_set:
     shopping_cart.append(get_molecular_properties(glycans, placeholder = True))
-  if 'terminal' in feature_set:
-    bag = [get_terminal_structures(glycan) for glycan in glycans]
+  if 'terminal' or 'terminal2' in feature_set:
+    bag1, bag2 = [], []
+    if 'terminal' in feature_set:
+      bag1 = [get_terminal_structures(glycan) for glycan in glycans]
+    if 'terminal2' in feature_set:
+      bag2 = [get_terminal_structures(glycan, size = 2) for glycan in glycans]
+    bag = bag1 + bag2
     repertoire = set(unwrap(bag))
     repertoire2 = [re.sub(r"\(([ab])(\d)-(\d)\)", r"(\1\2-?)", g) for g in repertoire]
     repertoire2 = set([k for k in repertoire2 if repertoire2.count(k) > 1 and k not in repertoire])
@@ -319,18 +324,23 @@ def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False):
       return out_matrix.fillna(0).astype(int)
 
 
-def get_terminal_structures(glycan):
+def get_terminal_structures(glycan, size = 1):
   """returns terminal structures from all non-reducing ends (monosaccharide+linkage)\n
   | Arguments:
   | :-
-  | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph\n
+  | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph
+  | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported); default:1\n
   | Returns:
   | :-
   | Returns a list of terminal structures (strings)
   """
   ggraph = ensure_graph(glycan)
   nodeDict = dict(ggraph.nodes(data = True))
-  return [nodeDict[k]['string_labels']+'('+nodeDict[k+1]['string_labels']+')' for k in list(ggraph.nodes())[:-1] if ggraph.degree[k] == 1 and k+1 in nodeDict.keys() and nodeDict[k]['string_labels'] not in linkages]
+  temp =  [nodeDict[k]['string_labels']+'('+nodeDict[k+1]['string_labels']+')' + \
+   ''.join([nodeDict.get(k+1+j+i, {'string_labels': ''})['string_labels']+'('+nodeDict.get(k+2+j+i, {'string_labels': ''})['string_labels']+')' \
+            for i, j in enumerate(range(1, size))]) for k in list(ggraph.nodes())[:-1] if \
+          ggraph.degree[k] == 1 and k+1 in nodeDict.keys() and nodeDict[k]['string_labels'] not in linkages]
+  return [g.replace('()', '') for g in temp]
 
 
 def create_correlation_network(df, correlation_threshold):

diff --git a/build/lib/glycowork/motif/processing.py b/build/lib/glycowork/motif/processing.py
@@ -828,8 +828,10 @@ def canonicalize_iupac(glycan):
                  'KDN': 'Kdn', 'OSO3': 'S', '-O-Su-': 'S', '(S)': 'S', 'H2PO3': 'P', '(P)': 'P',
                  '–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-',
                  'Glcp': 'Glc', 'Galp': 'Gal', 'Manp': 'Man', 'Fucp': 'Fuc', 'Neup': 'Neu', 'a?': 'a1',
-                 '5Ac4Ac': '4Ac5Ac'}
+                 '5Ac4Ac': '4Ac5Ac', '(-)': '(?1-?)'}
   glycan = multireplace(glycan, replace_dic)
+  if '{' in glycan and '(' not in glycan:
+    glycan = glycan.replace('{', '(').replace('}', ')')
   # Trim linkers
   if '-' in glycan:
     if bool(re.search(r'[a-z]\-[a-zA-Z]', glycan[glycan.rindex('-')-1:])) and 'ol' not in glycan:

diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py
@@ -140,8 +140,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
   | glycans (list): list of IUPAC-condensed glycan sequences as strings
   | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
-  |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-  |   'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs of size 1), \
+  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
   | termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
   | condense (bool): if True, throws away columns with only zeroes; default:False
   | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n
@@ -180,8 +180,13 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
     shopping_cart.append(temp)
   if 'chemical' in feature_set:
     shopping_cart.append(get_molecular_properties(glycans, placeholder = True))
-  if 'terminal' in feature_set:
-    bag = [get_terminal_structures(glycan) for glycan in glycans]
+  if 'terminal' or 'terminal2' in feature_set:
+    bag1, bag2 = [], []
+    if 'terminal' in feature_set:
+      bag1 = [get_terminal_structures(glycan) for glycan in glycans]
+    if 'terminal2' in feature_set:
+      bag2 = [get_terminal_structures(glycan, size = 2) for glycan in glycans]
+    bag = bag1 + bag2
     repertoire = set(unwrap(bag))
     repertoire2 = [re.sub(r"\(([ab])(\d)-(\d)\)", r"(\1\2-?)", g) for g in repertoire]
     repertoire2 = set([k for k in repertoire2 if repertoire2.count(k) > 1 and k not in repertoire])
@@ -319,18 +324,23 @@ def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False):
       return out_matrix.fillna(0).astype(int)
 
 
-def get_terminal_structures(glycan):
+def get_terminal_structures(glycan, size = 1):
   """returns terminal structures from all non-reducing ends (monosaccharide+linkage)\n
   | Arguments:
   | :-
-  | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph\n
+  | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph
+  | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported); default:1\n
   | Returns:
   | :-
   | Returns a list of terminal structures (strings)
   """
   ggraph = ensure_graph(glycan)
   nodeDict = dict(ggraph.nodes(data = True))
-  return [nodeDict[k]['string_labels']+'('+nodeDict[k+1]['string_labels']+')' for k in list(ggraph.nodes())[:-1] if ggraph.degree[k] == 1 and k+1 in nodeDict.keys() and nodeDict[k]['string_labels'] not in linkages]
+  temp =  [nodeDict[k]['string_labels']+'('+nodeDict[k+1]['string_labels']+')' + \
+   ''.join([nodeDict.get(k+1+j+i, {'string_labels': ''})['string_labels']+'('+nodeDict.get(k+2+j+i, {'string_labels': ''})['string_labels']+')' \
+            for i, j in enumerate(range(1, size))]) for k in list(ggraph.nodes())[:-1] if \
+          ggraph.degree[k] == 1 and k+1 in nodeDict.keys() and nodeDict[k]['string_labels'] not in linkages]
+  return [g.replace('()', '') for g in temp]
 
 
 def create_correlation_network(df, correlation_threshold):

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -828,8 +828,10 @@ def canonicalize_iupac(glycan):
                  'KDN': 'Kdn', 'OSO3': 'S', '-O-Su-': 'S', '(S)': 'S', 'H2PO3': 'P', '(P)': 'P',
                  '–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-',
                  'Glcp': 'Glc', 'Galp': 'Gal', 'Manp': 'Man', 'Fucp': 'Fuc', 'Neup': 'Neu', 'a?': 'a1',
-                 '5Ac4Ac': '4Ac5Ac'}
+                 '5Ac4Ac': '4Ac5Ac', '(-)': '(?1-?)'}
   glycan = multireplace(glycan, replace_dic)
+  if '{' in glycan and '(' not in glycan:
+    glycan = glycan.replace('{', '(').replace('}', ')')
   # Trim linkers
   if '-' in glycan:
     if bool(re.search(r'[a-z]\-[a-zA-Z]', glycan[glycan.rindex('-')-1:])) and 'ol' not in glycan: