Skip to content

Commit

Permalink
Merge branch 'dev' of github.com:BojarLab/glycowork into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
Old-Shatterhand committed Feb 20, 2025
2 parents be49f25 + 407cd6f commit 17b8ae8
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 268 deletions.
231 changes: 8 additions & 223 deletions CHANGELOG.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion glycowork/glycan_data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,8 @@ def download_model(file_id: str, # Google Drive file ID
"Download the model weights file from Google Drive"
file_id = file_id.split('/d/')[1].split('/view')[0]
url = f'https://drive.google.com/uc?id={file_id}'
response = requests.get(url, stream = True, timeout = 30)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
response = requests.get(url, stream = True, timeout = 30, headers = headers)
if response.status_code == 200:
with open(local_path, 'wb') as f:
f.write(response.content)
Expand Down
4 changes: 2 additions & 2 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from glycowork.motif.processing import enforce_class, process_for_glycoshift
from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type, load_lectin_lib,
create_lectin_and_motif_mappings, lectin_motif_scoring, clean_up_heatmap)
create_lectin_and_motif_mappings, lectin_motif_scoring, deduplicate_motifs)
from glycowork.motif.graph import subgraph_isomorphism


Expand Down Expand Up @@ -231,7 +231,7 @@ def get_heatmap(
df_motif = df_motif.replace(0, np.nan).dropna(thresh = np.max([np.round(rarity_filter * df_motif.shape[0]), 1]), axis = 1)
df = df_motif.T.fillna(0) @ df
df = df.apply(lambda col: col / col.sum()).T
df = clean_up_heatmap(df.T)
df = deduplicate_motifs(df.T)
df = df.dropna(axis = 1)
if not (df < 0).any().any():
df /= df.sum()
Expand Down
6 changes: 3 additions & 3 deletions glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def annotate_switchboard(glycan):
return pd.concat(shopping_cart, axis = 1)


def clean_up_heatmap(
def deduplicate_motifs(
df: pd.DataFrame # DataFrame with glycan motifs as rows, samples as columns
) -> pd.DataFrame: # DataFrame with redundant motifs removed
"Removes redundant motif entries from glycan abundance data while preserving the most informative labels"
Expand All @@ -298,7 +298,7 @@ def quantify_motifs(
glycans: List[str], # List of IUPAC-condensed glycan sequences
feature_set: List[str], # Feature types to analyze: known, graph, exhaustive, terminal(1-3), custom, chemical, size_branch
custom_motifs: List = [], # Custom motifs when using 'custom' feature set
remove_redundant: bool = True # Remove redundant motifs via clean_up_heatmap
remove_redundant: bool = True # Remove redundant motifs via deduplicate_motifs
) -> pd.DataFrame: # DataFrame with motif abundances (motifs as columns, samples as rows)
"Extracts and quantifies motif abundances from glycan abundance data by weighting motif occurrences"
if isinstance(df, str):
Expand All @@ -321,7 +321,7 @@ def quantify_motifs(
else:
collect_dic[col] = (temp * df_motif.iloc[indices, c].reset_index(drop = True)).sum(axis = 1)
df = pd.DataFrame(collect_dic)
return df if not remove_redundant else clean_up_heatmap(df.T)
return df if not remove_redundant else deduplicate_motifs(df.T)


def count_unique_subgraphs_of_size_k(
Expand Down
44 changes: 41 additions & 3 deletions glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from collections import defaultdict
from itertools import permutations, combinations
from typing import Dict, List, Set, Union, Optional, Callable, Tuple, Generator
from glycowork.glycan_data.loader import (unwrap, multireplace,
from glycowork.glycan_data.loader import (unwrap, multireplace, df_glycan,
find_nth, find_nth_reverse, lib, HexOS, HexNAcOS,
linkages, Hex, HexNAc, dHex, Sia, HexA, Pen)

Expand Down Expand Up @@ -906,6 +906,28 @@ def glycoworkbench_to_iupac(glycan: str # Glycan in GlycoWorkBench nomenclature
return f"{converted_glycan[:-6]}-ol" if 'freeEnd' in glycan else converted_glycan[:-6]


def glytoucan_to_glycan(ids: List[str], # List of GlyTouCan IDs or glycans
                        revert: bool = False # Whether to map glycans to IDs; default:False
                       ) -> List[str]: # List of glycans or IDs
    "Translate GlyTouCan IDs into IUPAC-condensed glycans, or glycans into IDs when revert is True"
    # Both lookup tables are built from df_glycan on first use and cached on the function object
    if not hasattr(glytoucan_to_glycan, 'glycan_dict'):
        glytoucan_to_glycan.glycan_dict = dict(zip(df_glycan.glytoucan_id, df_glycan.glycan))
        glytoucan_to_glycan.id_dict = dict(zip(df_glycan.glycan, df_glycan.glytoucan_id))
    if revert:
        mapping, msg = glytoucan_to_glycan.id_dict, 'glycans'
    else:
        mapping, msg = glytoucan_to_glycan.glycan_dict, 'IDs'
    result, not_found = [], []
    for entry in ids:
        try:
            converted = mapping[entry]
        except KeyError:
            # Unknown entries are passed through unchanged and remembered for the report below
            converted = entry
            not_found.append(entry)
        result.append(converted)
    # Tell the user which inputs could not be translated
    if not_found:
        print(f'These {msg} are not in our database: {not_found}')
    return result


def check_nomenclature(glycan: str # Glycan string to check
) -> None: # Prints reason if not convertible
"Check whether glycan has correct nomenclature for glycowork"
Expand All @@ -915,11 +937,24 @@ def check_nomenclature(glycan: str # Glycan string to check
raise ValueError("Seems like you're using SMILES. We currently can only convert IUPAC-->SMILES; not the other way around.")


def sanitize_iupac(glycan: str # Glycan string to check
                  ) -> str: # Sanitized glycan string
    """Replace chemically impossible linkage positions in an IUPAC glycan sequence with the '?' wildcard."""
    # A linkage into position 2 of an N-acetylated sugar is rewritten to an unknown position
    nac_pattern = r'([A-Za-z]+)\(([ab?][1-2])-2\)([A-Za-z]+NAc)'
    glycan = re.sub(nac_pattern, r'\1(\2-?)\3', glycan)
    # A linkage into a position that carries a modification on the next sugar is rewritten likewise
    modification_pattern = r'\(([ab?][1-2])-(\d)\)([A-Za-z]+\2[A-Z])'
    glycan = re.sub(modification_pattern, r'(\1-?)\3', glycan)
    # A sugar and its directly following single-sugar branch cannot both use the same position
    branch_pattern = r'([A-Za-z]+)\(([ab?][1-2])-(\d)\)\[([A-Za-z]+)\(([ab?][1-2])-(\3)\)\]'
    clashes = list(re.finditer(branch_pattern, glycan))
    for clash in clashes:
        outer_sugar, outer_anomer, _, inner_sugar, inner_anomer, _ = clash.groups()
        replacement = f'{outer_sugar}({outer_anomer}-?)[{inner_sugar}({inner_anomer}-?)]'
        glycan = glycan.replace(clash.group(0), replacement)
    return glycan


def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
) -> str: # Standardized IUPAC-condensed format
"Convert glycan from IUPAC-extended, LinearCode, GlycoCT, WURCS, Oxford, GLYCAM, and GlycoWorkBench to standardized IUPAC-condensed format"
"Convert glycan from IUPAC-extended, LinearCode, GlycoCT, WURCS, Oxford, GLYCAM, GlycoWorkBench, and GlyTouCanIDs to standardized IUPAC-condensed format"
glycan = glycan.strip()
# Check for different nomenclatures: LinearCode, IUPAC-extended, GlycoCT, WURCS, Oxford, GLYCAM, GlycoWorkBench
# Check for different nomenclatures: LinearCode, IUPAC-extended, GlycoCT, WURCS, Oxford, GLYCAM, GlycoWorkBench, GlyTouCanIDs
if ';' in glycan:
glycan = linearcode_to_iupac(glycan)
elif '-D-' in glycan:
Expand All @@ -932,6 +967,8 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
glycan = glycam_to_iupac(glycan)
elif '$MONO' in glycan:
glycan = glycoworkbench_to_iupac(glycan)
elif bool(re.match(r'^G\d+', glycan)):
glycan = glytoucan_to_glycan([glycan])[0]
elif not isinstance(glycan, str) or '@' in glycan:
check_nomenclature(glycan)
return
Expand Down Expand Up @@ -1025,6 +1062,7 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
'Kdn(a1': 'Kdn(a2'}
glycan = multireplace(glycan, post_process)
glycan = re.sub(r'[ab]-$', '', glycan) # Remove endings like Glcb-
glycan = sanitize_iupac(glycan)
# Canonicalize branch ordering
if '[' in glycan and not glycan.startswith('['):
glycan = choose_correct_isoform(glycan)
Expand Down
18 changes: 1 addition & 17 deletions glycowork/motif/query.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
import pandas as pd
from typing import List, Optional
from typing import Optional

from glycowork.glycan_data.loader import motif_list, df_glycan
from glycowork.motif.graph import compare_glycans
Expand Down Expand Up @@ -45,19 +45,3 @@ def get_insight(glycan: str, # Glycan in IUPAC-condensed format
disease_sample[k]) for k in range(len(disease))])
)
print("\nThat's all we can do for you at this point!")


def glytoucan_to_glycan(ids: List[str], # List of GlyTouCan IDs or glycans
                        revert: bool = False # Whether to map glycans to IDs; default:False
                       ) -> List[str]: # List of glycans or IDs
    "Convert between GlyTouCan IDs and IUPAC-condensed glycans"
    # Pick the source/target columns of df_glycan according to the conversion direction
    if revert:
        keys, values, label = df_glycan.glycan.values, df_glycan.glytoucan_id.values, 'glycans'
    else:
        keys, values, label = df_glycan.glytoucan_id.values, df_glycan.glycan.values, 'IDs'
    # Build the lookup once per call; setdefault keeps the first row for duplicated keys,
    # matching the first-hit behaviour of the previous np.where(...)[0][0] lookup
    lookup = {}
    for key, value in zip(keys.tolist(), values.tolist()):
        lookup.setdefault(key, value)
    # Entries that are not in the database are passed through unchanged
    converted = [lookup.get(k, k) for k in ids]
    # Membership is tested against the column values of the ORIGINAL inputs; `in` on a pandas
    # Series checks the index, and the inputs must not be overwritten before this check
    missing = [k for k in ids if k not in lookup]
    if missing:
        print(f'These {label} are not in our database: ' + str(missing))
    return converted
41 changes: 22 additions & 19 deletions tests/test_core_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from collections import Counter
from contextlib import contextmanager
from glycowork.glycan_data.data_entry import check_presence
from glycowork.motif.query import get_insight, glytoucan_to_glycan
from glycowork.motif.query import get_insight
from glycowork.motif.tokenization import (
constrain_prot, prot_to_coded, string_to_labels, pad_sequence, mz_to_composition,
get_core, get_modification, get_stem_lib, stemify_glycan, mask_rare_glycoletters,
Expand All @@ -34,7 +34,7 @@
from glycowork.motif.processing import (
min_process_glycans, get_lib, expand_lib, get_possible_linkages,
get_possible_monosaccharides, de_wildcard_glycoletter, canonicalize_iupac,
glycoct_to_iupac, wurcs_to_iupac, oxford_to_iupac,
glycoct_to_iupac, wurcs_to_iupac, oxford_to_iupac, glytoucan_to_glycan,
canonicalize_composition, parse_glycoform, find_isomorphs,
presence_to_matrix, process_for_glycoshift, linearcode_to_iupac, iupac_extended_to_condensed,
in_lib, get_class, enforce_class, equal_repeats, get_matching_indices,
Expand Down Expand Up @@ -69,7 +69,7 @@
get_k_saccharides, get_terminal_structures, create_correlation_network,
group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type,
Lectin, load_lectin_lib, create_lectin_and_motif_mappings,
lectin_motif_scoring, clean_up_heatmap, quantify_motifs, get_size_branching_features,
lectin_motif_scoring, deduplicate_motifs, quantify_motifs, get_size_branching_features,
count_unique_subgraphs_of_size_k, annotate_glycan_topology_uncertainty
)
from glycowork.motif.regex import (preprocess_pattern, specify_linkages, process_occurrence,
Expand Down Expand Up @@ -736,6 +736,10 @@ def test_canonicalize_iupac():
assert canonicalize_iupac("Gal(b1-4)Glc-olS") == "Gal(b1-4)GlcOS-ol"
assert canonicalize_iupac("SGalNAc(b1-4)GlcNAc") == "GalNAcOS(b1-4)GlcNAc"
assert canonicalize_iupac("S-Gal(b1-4)Glc-ol") == "GalOS(b1-4)Glc-ol"
# Test sanitization
assert canonicalize_iupac("GlcNAc(b1-2)[GlcNAc(b1-2)]Man") == "GlcNAc(b1-?)[GlcNAc(b1-?)]Man"
assert canonicalize_iupac("Gal(b1-2)GlcNAc") == "Gal(b1-?)GlcNAc"
assert canonicalize_iupac("GlcNAc(b1-3)Gal3S") == "GlcNAc(b1-?)Gal3S"
# Test branch ordering
assert canonicalize_iupac("GalNAcβ1-4(NeuAcα2-3)GlcNAcβ1-3(NeuAcα2-3Galβ1-4GlcNAcβ1-6)Galβ1-4Glcol") == "Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)[Neu5Ac(a2-3)[GalNAc(b1-4)]GlcNAc(b1-3)]Gal(b1-4)Glc-ol"
assert canonicalize_iupac("Fucα1-2Galβ1-4GlcNAcβ1-3[NeuAcα2-3Galβ1-4(Fucα1-3)GlcNAcβ1-6]Galβ1-4GlcNAcβ1-3(Fucα1-2Galβ1-4GlcNAcβ1-6)Galβ1-4Glcol") == "Fuc(a1-2)Gal(b1-4)GlcNAc(b1-3)[Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-6)]Gal(b1-4)GlcNAc(b1-3)[Fuc(a1-2)Gal(b1-4)GlcNAc(b1-6)]Gal(b1-4)Glc-ol"
Expand Down Expand Up @@ -777,6 +781,9 @@ def test_canonicalize_iupac():
assert canonicalize_iupac("WURCS=2.0/4,8,7/[u2122h_2*NCC/3=O][a2121A-1a_1-5][a2122h-1a_1-5_2*NCC/3=O][a2122A-1b_1-5]/1-2-3-4-3-2-3-4/a4-b1_b4-c1_c4-d1_d4-e1_e4-f1_f4-g1_g4-h1") == "GlcA(b1-4)GlcNAc(a1-4)IdoA(a1-4)GlcNAc(a1-4)GlcA(b1-4)GlcNAc(a1-4)IdoA(a1-4)GlcNAc"
assert canonicalize_iupac("WURCS=2.0/6,9,8/[a2122h-1a_1-5_2*NCC/3=O][a1221m-1a_1-5][a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a212h-1b_1-5][a1122h-1a_1-5]/1-2-3-4-5-6-3-6-3/a3-b1_a4-c1_c4-d1_d2-e1_d3-f1_d6-h1_f2-g1_h2-i1") == "GlcNAc(b1-2)Man(a1-3)[GlcNAc(b1-2)Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc"
assert canonicalize_iupac("WURCS=2.0/7,19,18/[u2122h_2*NCC/3=O][a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5][a1221m-1a_1-5][a2112h-1b_1-5][Aad21122h-2a_2-6_5*NCCO/3=O]/1-2-3-4-2-5-6-7-2-6-2-4-2-5-6-2-5-6-7/a4-b1_b4-c1_c3-d1_c4-k1_c6-l1_d2-e1_d4-i1_e3-f1_e4-g1_i4-j1_l2-m1_l6-p1_m3-n1_m4-o1_p3-q1_p4-r1_h2-g3|g6_s2-r3|r6 ") == "Neu5Gc(a2-3/6)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-2)[Fuc(a1-3)[Neu5Gc(a2-3/6)Gal(b1-4)]GlcNAc(b1-6)]Man(a1-6)][GlcNAc(b1-4)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
assert canonicalize_iupac("G07426YY") == "Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[Gal(b1-4)GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc"
assert canonicalize_iupac("G96417BZ") == "Man(a1-2)Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
assert canonicalize_iupac("G26039ES") == "Gal(b1-4)Glc-ol"
assert canonicalize_iupac("""RES
1b:b-dglc-HEX-1:5
2s:n-acetyl
Expand Down Expand Up @@ -2224,12 +2231,12 @@ def test_lectin_motif_scoring():
assert "score" in result.columns


def test_clean_up_heatmap():
def test_deduplicate_motifs():
data = pd.DataFrame({
'sample1': [1, 1],
'sample2': [2, 2]
}, index=['motif1', 'motif2'])
result = clean_up_heatmap(data)
result = deduplicate_motifs(data)
assert isinstance(result, pd.DataFrame)
assert len(result) <= len(data)

Expand Down Expand Up @@ -5401,32 +5408,28 @@ def test_get_insight_with_disease(sample_glycan_df, sample_motif_df, monkeypatch
assert "tumor" in captured.out


def test_glytoucan_to_glycan_forward(sample_glycan_df, monkeypatch):
monkeypatch.setattr('glycowork.motif.query.df_glycan', sample_glycan_df)
result = glytoucan_to_glycan(['G00001', 'G00002'])
def test_glytoucan_to_glycan_forward():
result = glytoucan_to_glycan(['G26039ES', 'G65562ZE'])
assert len(result) == 2
assert 'Gal(b1-4)Glc-ol' in result
assert 'Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-3)Gal(b1-4)Glc-ol' in result
assert 'Neu5Ac(a2-3)Gal(b1-3)GalNAc' in result


def test_glytoucan_to_glycan_reverse(sample_glycan_df, monkeypatch):
monkeypatch.setattr('glycowork.motif.query.df_glycan', sample_glycan_df)
result = glytoucan_to_glycan(['Gal(b1-4)Glc-ol', 'Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-3)Gal(b1-4)Glc-ol'], revert=True)
def test_glytoucan_to_glycan_reverse():
result = glytoucan_to_glycan(['Gal(b1-4)Glc-ol', 'Neu5Ac(a2-3)Gal(b1-3)GalNAc'], revert=True)
assert len(result) == 2
assert 'G00001' in result
assert 'G00002' in result
assert 'G26039ES' in result
assert 'G65562ZE' in result


def test_glytoucan_to_glycan_missing_id(sample_glycan_df, monkeypatch, capsys):
monkeypatch.setattr('glycowork.motif.query.df_glycan', sample_glycan_df)
glytoucan_to_glycan(['G00001', 'MISSING'])
def test_glytoucan_to_glycan_missing_id(capsys):
glytoucan_to_glycan(['G26039ES', 'MISSING'])
captured = capsys.readouterr()
assert 'These IDs are not in our database: ' in captured.out
assert 'MISSING' in captured.out


def test_glytoucan_to_glycan_missing_glycan(sample_glycan_df, monkeypatch, capsys):
monkeypatch.setattr('glycowork.motif.query.df_glycan', sample_glycan_df)
def test_glytoucan_to_glycan_missing_glycan(capsys):
glytoucan_to_glycan(['Gal(b1-4)Glc-ol', 'MISSING'], revert=True)
captured = capsys.readouterr()
assert 'These glycans are not in our database: ' in captured.out
Expand Down

0 comments on commit 17b8ae8

Please sign in to comment.