Skip to content

Commit

Permalink
expand get_terminal_structures to size2 if desired
Browse files Browse the repository at this point in the history
- also add some linkage variations to canonicalize_iupac
  • Loading branch information
Bribak committed Mar 4, 2024
1 parent 287a1cb commit 94f1dff
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 16 deletions.
24 changes: 17 additions & 7 deletions build/lib/glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
| glycans (list): list of IUPAC-condensed glycan sequences as strings
| motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
| 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs of size 1), \
| 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
| termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
| condense (bool): if True, throws away columns with only zeroes; default:False
| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n
Expand Down Expand Up @@ -180,8 +180,13 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
shopping_cart.append(temp)
if 'chemical' in feature_set:
shopping_cart.append(get_molecular_properties(glycans, placeholder = True))
if 'terminal' in feature_set:
bag = [get_terminal_structures(glycan) for glycan in glycans]
# Test each feature name for membership separately: the previous form,
# `'terminal' or 'terminal2' in feature_set`, evaluates the non-empty string
# 'terminal' first and is therefore always truthy, so this branch ran for
# every feature_set.
if 'terminal' in feature_set or 'terminal2' in feature_set:
    bag1, bag2 = [], []
    if 'terminal' in feature_set:
        # motifs of size 1 (default size of get_terminal_structures)
        bag1 = [get_terminal_structures(glycan) for glycan in glycans]
    if 'terminal2' in feature_set:
        # motifs of size 2
        bag2 = [get_terminal_structures(glycan, size = 2) for glycan in glycans]
    # NOTE(review): this appends the size-2 per-glycan lists after the size-1
    # ones (two entries per glycan when both features are requested) --
    # confirm the downstream code expects that layout.
    bag = bag1 + bag2
repertoire = set(unwrap(bag))
repertoire2 = [re.sub(r"\(([ab])(\d)-(\d)\)", r"(\1\2-?)", g) for g in repertoire]
repertoire2 = set([k for k in repertoire2 if repertoire2.count(k) > 1 and k not in repertoire])
Expand Down Expand Up @@ -319,18 +324,23 @@ def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False):
return out_matrix.fillna(0).astype(int)


def get_terminal_structures(glycan, size = 1):
    """returns terminal structures from all non-reducing ends (monosaccharide+linkage)\n
    | Arguments:
    | :-
    | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph
    | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported); default:1\n
    | Returns:
    | :-
    | Returns a list of terminal structures (strings)
    """
    ggraph = ensure_graph(glycan)
    nodeDict = dict(ggraph.nodes(data = True))

    def label(idx):
        # Missing node indices yield an empty label instead of raising, so a
        # motif that would run past the end of the graph is simply shorter.
        return nodeDict.get(idx, {'string_labels': ''})['string_labels']

    structures = []
    # The last node is never used as a starting point.
    for k in list(ggraph.nodes())[:-1]:
        # Keep only degree-1 nodes that have a following node index and whose
        # own label is not a linkage.
        if ggraph.degree[k] != 1 or k+1 not in nodeDict or nodeDict[k]['string_labels'] in linkages:
            continue
        # Size-1 motif: label of node k followed by the label of node k+1 in
        # parentheses.
        motif = nodeDict[k]['string_labels'] + '(' + nodeDict[k+1]['string_labels'] + ')'
        # Each further unit reads the next two consecutive node indices
        # (k+2/k+3 for size 2, k+4/k+5 for size 3, ...).
        # NOTE(review): this walks node indices, not graph edges -- presumably
        # relies on indices alternating monosaccharide/linkage; confirm for
        # branched glycans.
        for step in range(1, size):
            motif += label(k + 2*step) + '(' + label(k + 2*step + 1) + ')'
        # Drop empty parentheses left behind by lookups that found no node.
        structures.append(motif.replace('()', ''))
    return structures


def create_correlation_network(df, correlation_threshold):
Expand Down
4 changes: 3 additions & 1 deletion build/lib/glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,8 +828,10 @@ def canonicalize_iupac(glycan):
'KDN': 'Kdn', 'OSO3': 'S', '-O-Su-': 'S', '(S)': 'S', 'H2PO3': 'P', '(P)': 'P',
'–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-',
'Glcp': 'Glc', 'Galp': 'Gal', 'Manp': 'Man', 'Fucp': 'Fuc', 'Neup': 'Neu', 'a?': 'a1',
'5Ac4Ac': '4Ac5Ac'}
'5Ac4Ac': '4Ac5Ac', '(-)': '(?1-?)'}
glycan = multireplace(glycan, replace_dic)
if '{' in glycan and '(' not in glycan:
glycan = glycan.replace('{', '(').replace('}', ')')
# Trim linkers
if '-' in glycan:
if bool(re.search(r'[a-z]\-[a-zA-Z]', glycan[glycan.rindex('-')-1:])) and 'ol' not in glycan:
Expand Down
24 changes: 17 additions & 7 deletions glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
| glycans (list): list of IUPAC-condensed glycan sequences as strings
| motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
| 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs of size 1), \
| 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
| termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
| condense (bool): if True, throws away columns with only zeroes; default:False
| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n
Expand Down Expand Up @@ -180,8 +180,13 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
shopping_cart.append(temp)
if 'chemical' in feature_set:
shopping_cart.append(get_molecular_properties(glycans, placeholder = True))
if 'terminal' in feature_set:
bag = [get_terminal_structures(glycan) for glycan in glycans]
# Test each feature name for membership separately: the previous form,
# `'terminal' or 'terminal2' in feature_set`, evaluates the non-empty string
# 'terminal' first and is therefore always truthy, so this branch ran for
# every feature_set.
if 'terminal' in feature_set or 'terminal2' in feature_set:
    bag1, bag2 = [], []
    if 'terminal' in feature_set:
        # motifs of size 1 (default size of get_terminal_structures)
        bag1 = [get_terminal_structures(glycan) for glycan in glycans]
    if 'terminal2' in feature_set:
        # motifs of size 2
        bag2 = [get_terminal_structures(glycan, size = 2) for glycan in glycans]
    # NOTE(review): this appends the size-2 per-glycan lists after the size-1
    # ones (two entries per glycan when both features are requested) --
    # confirm the downstream code expects that layout.
    bag = bag1 + bag2
repertoire = set(unwrap(bag))
repertoire2 = [re.sub(r"\(([ab])(\d)-(\d)\)", r"(\1\2-?)", g) for g in repertoire]
repertoire2 = set([k for k in repertoire2 if repertoire2.count(k) > 1 and k not in repertoire])
Expand Down Expand Up @@ -319,18 +324,23 @@ def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False):
return out_matrix.fillna(0).astype(int)


def get_terminal_structures(glycan, size = 1):
    """returns terminal structures from all non-reducing ends (monosaccharide+linkage)\n
    | Arguments:
    | :-
    | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph
    | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported); default:1\n
    | Returns:
    | :-
    | Returns a list of terminal structures (strings)
    """
    ggraph = ensure_graph(glycan)
    nodeDict = dict(ggraph.nodes(data = True))

    def label(idx):
        # Missing node indices yield an empty label instead of raising, so a
        # motif that would run past the end of the graph is simply shorter.
        return nodeDict.get(idx, {'string_labels': ''})['string_labels']

    structures = []
    # The last node is never used as a starting point.
    for k in list(ggraph.nodes())[:-1]:
        # Keep only degree-1 nodes that have a following node index and whose
        # own label is not a linkage.
        if ggraph.degree[k] != 1 or k+1 not in nodeDict or nodeDict[k]['string_labels'] in linkages:
            continue
        # Size-1 motif: label of node k followed by the label of node k+1 in
        # parentheses.
        motif = nodeDict[k]['string_labels'] + '(' + nodeDict[k+1]['string_labels'] + ')'
        # Each further unit reads the next two consecutive node indices
        # (k+2/k+3 for size 2, k+4/k+5 for size 3, ...).
        # NOTE(review): this walks node indices, not graph edges -- presumably
        # relies on indices alternating monosaccharide/linkage; confirm for
        # branched glycans.
        for step in range(1, size):
            motif += label(k + 2*step) + '(' + label(k + 2*step + 1) + ')'
        # Drop empty parentheses left behind by lookups that found no node.
        structures.append(motif.replace('()', ''))
    return structures


def create_correlation_network(df, correlation_threshold):
Expand Down
4 changes: 3 additions & 1 deletion glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,8 +828,10 @@ def canonicalize_iupac(glycan):
'KDN': 'Kdn', 'OSO3': 'S', '-O-Su-': 'S', '(S)': 'S', 'H2PO3': 'P', '(P)': 'P',
'–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-',
'Glcp': 'Glc', 'Galp': 'Gal', 'Manp': 'Man', 'Fucp': 'Fuc', 'Neup': 'Neu', 'a?': 'a1',
'5Ac4Ac': '4Ac5Ac'}
'5Ac4Ac': '4Ac5Ac', '(-)': '(?1-?)'}
glycan = multireplace(glycan, replace_dic)
if '{' in glycan and '(' not in glycan:
glycan = glycan.replace('{', '(').replace('}', ')')
# Trim linkers
if '-' in glycan:
if bool(re.search(r'[a-z]\-[a-zA-Z]', glycan[glycan.rindex('-')-1:])) and 'ol' not in glycan:
Expand Down

0 comments on commit 94f1dff

Please sign in to comment.