Merge branch 'dev' of github.com:BojarLab/glycowork into dev

BojarLab · Feb 20, 2025 · 17b8ae8 · 17b8ae8
2 parents be49f25 + 407cd6f
commit 17b8ae8
Show file tree

Hide file tree

Showing 7 changed files with 79 additions and 268 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/glycowork/glycan_data/loader.py b/glycowork/glycan_data/loader.py
@@ -252,7 +252,8 @@ def download_model(file_id: str, # Google Drive file ID
   "Download the model weights file from Google Drive"
   file_id = file_id.split('/d/')[1].split('/view')[0]
   url = f'https://drive.google.com/uc?id={file_id}'
-  response = requests.get(url, stream = True, timeout = 30)
+  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+  response = requests.get(url, stream = True, timeout = 30, headers = headers)
   if response.status_code == 200:
     with open(local_path, 'wb') as f:
       f.write(response.content)

diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -35,7 +35,7 @@
 from glycowork.motif.processing import enforce_class, process_for_glycoshift
 from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
                                       group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type, load_lectin_lib,
-                                      create_lectin_and_motif_mappings, lectin_motif_scoring, clean_up_heatmap)
+                                      create_lectin_and_motif_mappings, lectin_motif_scoring, deduplicate_motifs)
 from glycowork.motif.graph import subgraph_isomorphism
 
 
@@ -231,7 +231,7 @@ def get_heatmap(
       df_motif = df_motif.replace(0, np.nan).dropna(thresh = np.max([np.round(rarity_filter * df_motif.shape[0]), 1]), axis = 1)
       df = df_motif.T.fillna(0) @ df
       df = df.apply(lambda col: col / col.sum()).T
-      df = clean_up_heatmap(df.T)
+      df = deduplicate_motifs(df.T)
   df = df.dropna(axis = 1)
   if not (df < 0).any().any():
     df /= df.sum()

diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py
@@ -272,7 +272,7 @@ def annotate_switchboard(glycan):
     return pd.concat(shopping_cart, axis = 1)
 
 
-def clean_up_heatmap(
+def deduplicate_motifs(
     df: pd.DataFrame # DataFrame with glycan motifs as rows, samples as columns
     ) -> pd.DataFrame: # DataFrame with redundant motifs removed
   "Removes redundant motif entries from glycan abundance data while preserving the most informative labels"
@@ -298,7 +298,7 @@ def quantify_motifs(
    glycans: List[str], # List of IUPAC-condensed glycan sequences
    feature_set: List[str], # Feature types to analyze: known, graph, exhaustive, terminal(1-3), custom, chemical, size_branch
    custom_motifs: List = [], # Custom motifs when using 'custom' feature set
-   remove_redundant: bool = True # Remove redundant motifs via clean_up_heatmap
+   remove_redundant: bool = True # Remove redundant motifs via deduplicate_motifs
    ) -> pd.DataFrame: # DataFrame with motif abundances (motifs as columns, samples as rows)
   "Extracts and quantifies motif abundances from glycan abundance data by weighting motif occurrences"
   if isinstance(df, str):
@@ -321,7 +321,7 @@ def quantify_motifs(
     else:
       collect_dic[col] = (temp * df_motif.iloc[indices, c].reset_index(drop = True)).sum(axis = 1)
   df = pd.DataFrame(collect_dic)
-  return df if not remove_redundant else clean_up_heatmap(df.T)
+  return df if not remove_redundant else deduplicate_motifs(df.T)
 
 
 def count_unique_subgraphs_of_size_k(

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -7,7 +7,7 @@
 from collections import defaultdict
 from itertools import permutations, combinations
 from typing import Dict, List, Set, Union, Optional, Callable, Tuple, Generator
-from glycowork.glycan_data.loader import (unwrap, multireplace,
+from glycowork.glycan_data.loader import (unwrap, multireplace, df_glycan,
                                           find_nth, find_nth_reverse, lib, HexOS, HexNAcOS,
                                           linkages, Hex, HexNAc, dHex, Sia, HexA, Pen)
 
@@ -906,6 +906,28 @@ def glycoworkbench_to_iupac(glycan: str # Glycan in GlycoWorkBench nomenclature
   return f"{converted_glycan[:-6]}-ol" if 'freeEnd' in glycan else converted_glycan[:-6]
 
 
+def glytoucan_to_glycan(ids: List[str], # List of GlyTouCan IDs (or, with revert=True, glycans) to translate
+                       revert: bool = False # Whether to map glycans to IDs instead of IDs to glycans; default:False
+                      ) -> List[str]: # Translated entries in input order; entries without a match are returned unchanged
+    "Convert between GlyTouCan IDs and IUPAC-condensed glycans"
+    if not hasattr(glytoucan_to_glycan, 'glycan_dict'): # First call only: build both lookup tables from df_glycan and cache them on the function object (later changes to df_glycan are not picked up)
+      glytoucan_to_glycan.glycan_dict = dict(zip(df_glycan.glytoucan_id, df_glycan.glycan)) # ID -> glycan
+      glytoucan_to_glycan.id_dict = dict(zip(df_glycan.glycan, df_glycan.glytoucan_id)) # glycan -> ID
+    lookup = glytoucan_to_glycan.id_dict if revert else glytoucan_to_glycan.glycan_dict # Pick the table matching the requested direction
+    result , not_found = [], []
+    for item in ids:
+      if item in lookup:
+        result.append(lookup[item])
+      else:
+        result.append(item) # Keep the unmatched entry as-is so the output mirrors the input's length and order
+        not_found.append(item)
+    # Report every entry that had no match, labelled by lookup direction
+    if not_found:
+      msg = 'glycans' if revert else 'IDs'
+      print(f'These {msg} are not in our database: {not_found}')
+    return result
+
+
 def check_nomenclature(glycan: str # Glycan string to check
                      ) -> None: # Prints reason if not convertible
   "Check whether glycan has correct nomenclature for glycowork"
@@ -915,11 +937,24 @@ def check_nomenclature(glycan: str # Glycan string to check
     raise ValueError("Seems like you're using SMILES. We currently can only convert IUPAC-->SMILES; not the other way around.")
 
 
+def sanitize_iupac(glycan: str # IUPAC glycan string to check
+                   ) -> str: # Glycan string with impossible linkage positions replaced by '?'
+  """Sanitize IUPAC glycan sequence by identifying and correcting chemical impossibilities."""
+  # Handle NAc special case (any sugar with NAc can't have linkage at position 2), e.g. Gal(b1-2)GlcNAc -> Gal(b1-?)GlcNAc
+  glycan = re.sub(r'([A-Za-z]+)\(([ab?][1-2])-2\)([A-Za-z]+NAc)', r'\1(\2-?)\3', glycan)
+  # Handle modifications (can't have a linkage to a position that's modified), e.g. GlcNAc(b1-3)Gal3S -> GlcNAc(b1-?)Gal3S; NOTE(review): the pattern also matches names that contain the linkage digit themselves, e.g. (a2-5)Neu5Ac - confirm this is intended
+  glycan = re.sub(r'\(([ab?][1-2])-(\d)\)([A-Za-z]+\2[A-Z])', r'(\1-?)\3', glycan)
+  # Handle branched cases with same linkage position: a residue and the single-residue bracketed branch right after it claim the same position, so both become '?', e.g. GlcNAc(b1-2)[GlcNAc(b1-2)]Man -> GlcNAc(b1-?)[GlcNAc(b1-?)]Man
+  for match in re.finditer(r'([A-Za-z]+)\(([ab?][1-2])-(\d)\)\[([A-Za-z]+)\(([ab?][1-2])-(\3)\)\]', glycan): # Matches are found on the string as it was when the loop started
+    glycan = glycan.replace(match.group(0), f'{match.group(1)}({match.group(2)}-?)[{match.group(4)}({match.group(5)}-?)]') # str.replace rewrites every occurrence of the matched text
+  return glycan
+
+
 def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
                      ) -> str: # Standardized IUPAC-condensed format
-  "Convert glycan from IUPAC-extended, LinearCode, GlycoCT, WURCS, Oxford, GLYCAM, and GlycoWorkBench to standardized IUPAC-condensed format"
+  "Convert glycan from IUPAC-extended, LinearCode, GlycoCT, WURCS, Oxford, GLYCAM, GlycoWorkBench, and GlyTouCanIDs to standardized IUPAC-condensed format"
   glycan = glycan.strip()
-  # Check for different nomenclatures: LinearCode, IUPAC-extended, GlycoCT, WURCS, Oxford, GLYCAM, GlycoWorkBench
+  # Check for different nomenclatures: LinearCode, IUPAC-extended, GlycoCT, WURCS, Oxford, GLYCAM, GlycoWorkBench, GlyTouCanIDs
   if ';' in glycan:
     glycan = linearcode_to_iupac(glycan)
   elif '-D-' in glycan:
@@ -932,6 +967,8 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
     glycan = glycam_to_iupac(glycan)
   elif '$MONO' in glycan:
     glycan = glycoworkbench_to_iupac(glycan)
+  elif bool(re.match(r'^G\d+', glycan)):
+    glycan = glytoucan_to_glycan([glycan])[0]
   elif not isinstance(glycan, str) or '@' in glycan:
     check_nomenclature(glycan)
     return
@@ -1025,6 +1062,7 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
                   'Kdn(a1': 'Kdn(a2'}
   glycan = multireplace(glycan, post_process)
   glycan = re.sub(r'[ab]-$', '', glycan)  # Remove endings like Glcb-
+  glycan = sanitize_iupac(glycan)
   # Canonicalize branch ordering
   if '[' in glycan and not glycan.startswith('['):
     glycan = choose_correct_isoform(glycan)

diff --git a/glycowork/motif/query.py b/glycowork/motif/query.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-from typing import List, Optional
+from typing import Optional
 
 from glycowork.glycan_data.loader import motif_list, df_glycan
 from glycowork.motif.graph import compare_glycans
@@ -45,19 +45,3 @@ def get_insight(glycan: str, # Glycan in IUPAC-condensed format
                     disease_sample[k]) for k in range(len(disease))])
             )
     print("\nThat's all we can do for you at this point!")
-
-
-def glytoucan_to_glycan(ids: List[str], # List of GlyTouCan IDs or glycans
-                       revert: bool = False # Whether to map glycans to IDs; default:False
-                      ) -> List[str]: # List of glycans or IDs
-    "Convert between GlyTouCan IDs and IUPAC-condensed glycans"
-    if revert:
-        ids = [df_glycan.glytoucan_id.values.tolist()[np.where(df_glycan.glycan.values == k)[0][0]] if k in df_glycan.glycan.values else k for k in ids]
-        if any([k not in df_glycan.glycan.values.tolist() for k in ids]):
-            print('These glycans are not in our database: ' + str([k for k in ids if k not in df_glycan.glycan]))
-        return ids
-    else:
-        glycans = [df_glycan.glycan.values.tolist()[np.where(df_glycan.glytoucan_id.values == k)[0][0]] if k in df_glycan.glytoucan_id.values else k for k in ids]
-        if any([k not in df_glycan.glytoucan_id.values for k in ids]):
-            print('These IDs are not in our database: ' + str([k for k in ids if k not in df_glycan.glytoucan_id]))
-        return glycans
diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py
@@ -22,7 +22,7 @@
 from collections import Counter
 from contextlib import contextmanager
 from glycowork.glycan_data.data_entry import check_presence
-from glycowork.motif.query import get_insight, glytoucan_to_glycan
+from glycowork.motif.query import get_insight
 from glycowork.motif.tokenization import (
     constrain_prot, prot_to_coded, string_to_labels, pad_sequence, mz_to_composition,
     get_core, get_modification, get_stem_lib, stemify_glycan, mask_rare_glycoletters,
@@ -34,7 +34,7 @@
 from glycowork.motif.processing import (
     min_process_glycans, get_lib, expand_lib, get_possible_linkages,
     get_possible_monosaccharides, de_wildcard_glycoletter, canonicalize_iupac,
-    glycoct_to_iupac, wurcs_to_iupac, oxford_to_iupac,
+    glycoct_to_iupac, wurcs_to_iupac, oxford_to_iupac, glytoucan_to_glycan,
     canonicalize_composition, parse_glycoform, find_isomorphs,
     presence_to_matrix, process_for_glycoshift, linearcode_to_iupac, iupac_extended_to_condensed,
     in_lib, get_class, enforce_class, equal_repeats, get_matching_indices,
@@ -69,7 +69,7 @@
     get_k_saccharides, get_terminal_structures, create_correlation_network,
     group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type,
     Lectin, load_lectin_lib, create_lectin_and_motif_mappings,
-    lectin_motif_scoring, clean_up_heatmap, quantify_motifs, get_size_branching_features,
+    lectin_motif_scoring, deduplicate_motifs, quantify_motifs, get_size_branching_features,
     count_unique_subgraphs_of_size_k, annotate_glycan_topology_uncertainty
 )
 from glycowork.motif.regex import (preprocess_pattern, specify_linkages, process_occurrence,
@@ -736,6 +736,10 @@ def test_canonicalize_iupac():
     assert canonicalize_iupac("Gal(b1-4)Glc-olS") == "Gal(b1-4)GlcOS-ol"
     assert canonicalize_iupac("SGalNAc(b1-4)GlcNAc") == "GalNAcOS(b1-4)GlcNAc"
     assert canonicalize_iupac("S-Gal(b1-4)Glc-ol") == "GalOS(b1-4)Glc-ol"
+    # Test sanitization
+    assert canonicalize_iupac("GlcNAc(b1-2)[GlcNAc(b1-2)]Man") == "GlcNAc(b1-?)[GlcNAc(b1-?)]Man"
+    assert canonicalize_iupac("Gal(b1-2)GlcNAc") == "Gal(b1-?)GlcNAc"
+    assert canonicalize_iupac("GlcNAc(b1-3)Gal3S") == "GlcNAc(b1-?)Gal3S"
     # Test branch ordering
     assert canonicalize_iupac("GalNAcβ1-4(NeuAcα2-3)GlcNAcβ1-3(NeuAcα2-3Galβ1-4GlcNAcβ1-6)Galβ1-4Glcol") == "Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)[Neu5Ac(a2-3)[GalNAc(b1-4)]GlcNAc(b1-3)]Gal(b1-4)Glc-ol"
     assert canonicalize_iupac("Fucα1-2Galβ1-4GlcNAcβ1-3[NeuAcα2-3Galβ1-4(Fucα1-3)GlcNAcβ1-6]Galβ1-4GlcNAcβ1-3(Fucα1-2Galβ1-4GlcNAcβ1-6)Galβ1-4Glcol") == "Fuc(a1-2)Gal(b1-4)GlcNAc(b1-3)[Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-6)]Gal(b1-4)GlcNAc(b1-3)[Fuc(a1-2)Gal(b1-4)GlcNAc(b1-6)]Gal(b1-4)Glc-ol"
@@ -777,6 +781,9 @@ def test_canonicalize_iupac():
     assert canonicalize_iupac("WURCS=2.0/4,8,7/[u2122h_2*NCC/3=O][a2121A-1a_1-5][a2122h-1a_1-5_2*NCC/3=O][a2122A-1b_1-5]/1-2-3-4-3-2-3-4/a4-b1_b4-c1_c4-d1_d4-e1_e4-f1_f4-g1_g4-h1") == "GlcA(b1-4)GlcNAc(a1-4)IdoA(a1-4)GlcNAc(a1-4)GlcA(b1-4)GlcNAc(a1-4)IdoA(a1-4)GlcNAc"
     assert canonicalize_iupac("WURCS=2.0/6,9,8/[a2122h-1a_1-5_2*NCC/3=O][a1221m-1a_1-5][a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a212h-1b_1-5][a1122h-1a_1-5]/1-2-3-4-5-6-3-6-3/a3-b1_a4-c1_c4-d1_d2-e1_d3-f1_d6-h1_f2-g1_h2-i1") == "GlcNAc(b1-2)Man(a1-3)[GlcNAc(b1-2)Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc"
     assert canonicalize_iupac("WURCS=2.0/7,19,18/[u2122h_2*NCC/3=O][a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5][a1221m-1a_1-5][a2112h-1b_1-5][Aad21122h-2a_2-6_5*NCCO/3=O]/1-2-3-4-2-5-6-7-2-6-2-4-2-5-6-2-5-6-7/a4-b1_b4-c1_c3-d1_c4-k1_c6-l1_d2-e1_d4-i1_e3-f1_e4-g1_i4-j1_l2-m1_l6-p1_m3-n1_m4-o1_p3-q1_p4-r1_h2-g3|g6_s2-r3|r6 ") == "Neu5Gc(a2-3/6)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-2)[Fuc(a1-3)[Neu5Gc(a2-3/6)Gal(b1-4)]GlcNAc(b1-6)]Man(a1-6)][GlcNAc(b1-4)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
+    assert canonicalize_iupac("G07426YY") == "Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[Gal(b1-4)GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc"
+    assert canonicalize_iupac("G96417BZ") == "Man(a1-2)Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
+    assert canonicalize_iupac("G26039ES") == "Gal(b1-4)Glc-ol"
     assert canonicalize_iupac("""RES
 1b:b-dglc-HEX-1:5
 2s:n-acetyl
@@ -2224,12 +2231,12 @@ def test_lectin_motif_scoring():
     assert "score" in result.columns
 
 
-def test_clean_up_heatmap():
+def test_deduplicate_motifs():
     data = pd.DataFrame({
         'sample1': [1, 1],
         'sample2': [2, 2]
     }, index=['motif1', 'motif2'])
-    result = clean_up_heatmap(data)
+    result = deduplicate_motifs(data)
     assert isinstance(result, pd.DataFrame)
     assert len(result) <= len(data)
 
@@ -5401,32 +5408,28 @@ def test_get_insight_with_disease(sample_glycan_df, sample_motif_df, monkeypatch
     assert "tumor" in captured.out
 
 
-def test_glytoucan_to_glycan_forward(sample_glycan_df, monkeypatch):
-    monkeypatch.setattr('glycowork.motif.query.df_glycan', sample_glycan_df)
-    result = glytoucan_to_glycan(['G00001', 'G00002'])
+def test_glytoucan_to_glycan_forward():
+    result = glytoucan_to_glycan(['G26039ES', 'G65562ZE'])
     assert len(result) == 2
     assert 'Gal(b1-4)Glc-ol' in result
-    assert 'Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-3)Gal(b1-4)Glc-ol' in result
+    assert 'Neu5Ac(a2-3)Gal(b1-3)GalNAc' in result
 
 
-def test_glytoucan_to_glycan_reverse(sample_glycan_df, monkeypatch):
-    monkeypatch.setattr('glycowork.motif.query.df_glycan', sample_glycan_df)
-    result = glytoucan_to_glycan(['Gal(b1-4)Glc-ol', 'Fuc(a1-3)[Gal(b1-4)]GlcNAc(b1-3)Gal(b1-4)Glc-ol'], revert=True)
+def test_glytoucan_to_glycan_reverse():
+    result = glytoucan_to_glycan(['Gal(b1-4)Glc-ol', 'Neu5Ac(a2-3)Gal(b1-3)GalNAc'], revert=True)
     assert len(result) == 2
-    assert 'G00001' in result
-    assert 'G00002' in result
+    assert 'G26039ES' in result
+    assert 'G65562ZE' in result
 
 
-def test_glytoucan_to_glycan_missing_id(sample_glycan_df, monkeypatch, capsys):
-    monkeypatch.setattr('glycowork.motif.query.df_glycan', sample_glycan_df)
-    glytoucan_to_glycan(['G00001', 'MISSING'])
+def test_glytoucan_to_glycan_missing_id(capsys):
+    glytoucan_to_glycan(['G26039ES', 'MISSING'])
     captured = capsys.readouterr()
     assert 'These IDs are not in our database: ' in captured.out
     assert 'MISSING' in captured.out
 
 
-def test_glytoucan_to_glycan_missing_glycan(sample_glycan_df, monkeypatch, capsys):
-    monkeypatch.setattr('glycowork.motif.query.df_glycan', sample_glycan_df)
+def test_glytoucan_to_glycan_missing_glycan(capsys):
     glytoucan_to_glycan(['Gal(b1-4)Glc-ol', 'MISSING'], revert=True)
     captured = capsys.readouterr()
     assert 'These glycans are not in our database: ' in captured.out