add basic GLYCAM parser for Universal Input

BojarLab · Feb 5, 2025 · 2fb5dc6 · 2fb5dc6
1 parent 35ed71a
commit 2fb5dc6
Show file tree

Hide file tree

Showing 5 changed files with 23 additions and 8 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -1,4 +1,4 @@
 [run]
 omit = 
     setup.py
-    glycowork/glycowork/*.py
+    glycowork/*.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,7 +13,7 @@
 ### Changed 🔄
 - Bumped minimum supported Python version to 3.9 (3.8 is no longer supported, see https://devguide.python.org/versions/) (4960c5c)
 - Switched docstring style to docments (<https://nbdev.fast.ai/tutorials/best_practices.html#document-parameters-with-docments>) (e6721a1)
-- Removed `gdown` dependency; Will be handled by the standard library module `urllib` for better retrieval of externally stored models/files (319981e)
+- Removed `gdown` dependency; Will be handled by the standard library module `urllib` for better retrieval of externally stored models/files (319981e, 35ed71a)
 - Switched pathing from `os` to `pathlib` (319981e)
 
 ### glycan_data
@@ -88,9 +88,10 @@
 #### processing
 ##### Added ✨
 - Added "antennary_Fuc" as another inferred feature to `infer_features_from_composition` (a64f694)
-- Added "IdoA", "GalA", "Araf", "D-Fuc", "AllNAc", "Par", "Kdo", "GlcN", "Ido", "Col", "Tyv", "GalN", "QuiNAc", "Gul", and "Gal6S" to recognized WURCS2 tokens (52fc16e, f3cd8f0, 7551805)
+- Added "IdoA", "GalA", "Araf", "D-Fuc", "AllNAc", "Par", "Kdo", "GlcN", "Ido", "Col", "Tyv", "GalN", "QuiNAc", "Gul", and "Gal6S" to recognized WURCS2 tokens (52fc16e, f3cd8f0, 7551805, 35ed71a)
 - Added the new "order_by" keyword argument to `choose_correct_isoform` to enforce strictly sorting branches by branch endings / linkages, if desired (918d18f)
-- Added "Col", "Ido", "Kdo", and "Gul" to supported GlycoCT monosaccharides (7551805)
+- Added "Col", "Ido", "Kdo", and "Gul" to supported GlycoCT monosaccharides (7551805, 35ed71a)
+- GLYCAM is now another supported nomenclature in the Universal Input framework, enabled by the added `glycam_to_iupac` function, which is also integrated into `canonicalize_iupac`
 
 ##### Changed 🔄
 - `check_nomenclature` will now actually raise appropriate Exceptions, in case nomenclature is incompatible with glycowork, instead of print warnings (23d6456)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -63,15 +63,15 @@ pytest
 
 * Docs are automatically created from the notebooks in the nbs folder.
 
-## Wishlist for future glycowork updates (last update: 2025-02-03)
+## Wishlist for future glycowork updates (last update: 2025-02-05)
 
 ### Urgent
 
 * more, and more informative, error messages
 
 ### At some point
 
-* less commonly used nomenclaturs for universal input: GLYCAM and GlycoWorkBench
+* less commonly used nomenclatures for universal input: GlycoWorkBench
 * any further expansion of our universal input pipeline, to cover more usecases etc.
 * split motif_list into ‘core’ motifs (occurring frequently) and ‘extended’ motifs (that are rare or niche) for performance reasons
 * characterize_monosaccharide only factors in subsequent sequence context; make it possible (as an option) to also consider upstream sequence context

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -840,6 +840,15 @@ def oxford_to_iupac(oxford: str # Glycan in Oxford format
   return floaty + iupac.strip('[]')
 
 
+def glycam_to_iupac(glycan: str # Glycan in GLYCAM nomenclature
+                    ) -> str: # Basic IUPAC-condensed format
+  "Convert glycan from GLYCAM to IUPAC-condensed format"
+  pattern = r'(?:[DL])|(?:\[(\d+[SP]+)\])'  # Either a single 'D'/'L' character, or a bracketed digits+S/P group such as '[6S]' (inner text captured); NOTE(review): matches every 'D'/'L' in the string, not only residue prefixes
+  glycan = '-'.join(glycan.split('-')[:-1])[:-2]  # Drop the last '-'-separated segment (e.g. 'OH'), then the final two characters (e.g. the trailing 'b1')
+  glycan = re.sub(pattern, lambda m: m.group(1) if m.group(1) else '', glycan)  # 'D'/'L' matches are deleted; bracketed groups are kept without their brackets
+  return glycan.replace('[', '(').replace(']', ')')  # Any square brackets still present are turned into parentheses
+
+
 def check_nomenclature(glycan: str # Glycan string to check
                      ) -> None: # Prints reason if not convertible
   "Check whether glycan has correct nomenclature for glycowork"
@@ -851,9 +860,9 @@ def check_nomenclature(glycan: str # Glycan string to check
 
 def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
                      ) -> str: # Standardized IUPAC-condensed format
-  "Convert glycan from IUPAC-extended, LinearCode, GlycoCT, and WURCS to standardized IUPAC-condensed format"
+  "Convert glycan from IUPAC-extended, LinearCode, GlycoCT, WURCS, Oxford, and GLYCAM to standardized IUPAC-condensed format"
   glycan = glycan.strip()
-  # Check for different nomenclatures: LinearCode, IUPAC-extended, GlycoCT, WURCS, Oxford
+  # Check for different nomenclatures: LinearCode, IUPAC-extended, GlycoCT, WURCS, Oxford, GLYCAM
   if ';' in glycan:
     glycan = linearcode_to_iupac(glycan)
   elif '-D-' in glycan:
@@ -862,6 +871,8 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
     glycan = glycoct_to_iupac(glycan)
   elif '=' in glycan:
     glycan = wurcs_to_iupac(glycan)
+  elif glycan.endswith('-OH'):
+    glycan = glycam_to_iupac(glycan)
   elif not isinstance(glycan, str) or any([k in glycan for k in ['@']]):
     check_nomenclature(glycan)
     return

diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py
@@ -739,6 +739,9 @@ def test_canonicalize_iupac():
     assert canonicalize_iupac("GalNAcβ1-4(NeuAcα2-3)GlcNAcβ1-3(NeuAcα2-3Galβ1-4GlcNAcβ1-6)Galβ1-4Glcol") == "Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)[Neu5Ac(a2-3)[GalNAc(b1-4)]GlcNAc(b1-3)]Gal(b1-4)Glc-ol"
     assert canonicalize_iupac("Fucα1-2Galβ1-4GlcNAcβ1-3[NeuAcα2-3Galβ1-4(Fucα1-3)GlcNAcβ1-6]Galβ1-4GlcNAcβ1-3(Fucα1-2Galβ1-4GlcNAcβ1-6)Galβ1-4Glcol") == "Fuc(a1-2)Gal(b1-4)GlcNAc(b1-3)[Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-6)]Gal(b1-4)GlcNAc(b1-3)[Fuc(a1-2)Gal(b1-4)GlcNAc(b1-6)]Gal(b1-4)Glc-ol"
     # Test other nomenclatures
+    assert canonicalize_iupac("DManpa1-6DManpb1-4DGlcpNAcb1-4[LFucpa1-6]DGlcpNAcb1-OH") == "Man(a1-6)Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc"
+    assert canonicalize_iupac("Neup5Aca2-3DGalpb1-4DGlcpNAcb1-3DGalpb1-3DGalpb1-4DGlcpb1-OH") == "Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-3)Gal(b1-3)Gal(b1-4)Glc"
+    assert canonicalize_iupac("DGalp[6S]b1-3DGalpNAca1-OH") == "Gal6S(b1-3)GalNAc"
     assert canonicalize_iupac("Ma3(Ma6)Mb4GNb4GN;") == "Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
     assert canonicalize_iupac("β-D-Galp-(1→4)-β-D-GlcpNAc-(1→") == "Gal(b1-4)GlcNAc"
     assert canonicalize_iupac("α-D-Neup5Ac-(2→3)-β-D-Galp-(1→4)-β-D-GlcpNAc-(1→") == "Neu5Ac(a2-3)Gal(b1-4)GlcNAc"