Skip to content

Commit

Permalink
better canonicalization and cleaner GUI script
Browse files Browse the repository at this point in the history
- better isomorph construction in find_isomorphs
- better canonical choice in choose_correct_isoform
- anomer fine-tuning in canonicalize_iupac
- allow LectinOracle customization in prep_model
- refactor GUI script to reduce redundancy
  • Loading branch information
Bribak committed Feb 14, 2025
1 parent 0a1babd commit d1ff321
Show file tree
Hide file tree
Showing 7 changed files with 335 additions and 458 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
- Added the new "order_by" keyword argument to `choose_correct_isoform` to enforce strictly sorting branches by branch endings / linkages, if desired (918d18f)
- Added "Col", "Ido", "Kdo", and "Gul" to supported GlycoCT monosaccharides (7551805, 35ed71a)
- GLYCAM is now another supported nomenclature in the Universal Input framework, enabled by the added `glycam_to_iupac` function, which is also integrated into `canonicalize_iupac` (2fb5dc6)
- GlycoWorkBench (GlycanBuilder) is now anoter supported nomenclature in the Universal Input framework, enabled by the added `glycoworkbench_to_iupac` function, which is also integrated into `canonicalize_iupac` (ea1fdfc)
- GlycoWorkBench (GlycanBuilder) is now another supported nomenclature in the Universal Input framework, enabled by the added `glycoworkbench_to_iupac` function, which is also integrated into `canonicalize_iupac` (ea1fdfc)

##### Changed 🔄
- `check_nomenclature` will now actually raise appropriate Exceptions, in case nomenclature is incompatible with glycowork, instead of printing warnings (23d6456)
Expand Down Expand Up @@ -218,6 +218,10 @@
##### Changed 🔄
- Changed `resources.open_text` to `resources.files` to prevent `DeprecationWarning` from `importlib` (d1a8c6d)

#### models
##### Changed 🔄
- In `prep_model`, the `hidden_dim` argument can now also be used to modify the protein embedding size of a newly defined LectinOracle model

### network
#### evolution
##### Fixed 🐛
Expand Down
3 changes: 1 addition & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,14 @@ pytest

* Docs are automatically created from the notebooks in the nbs folder.

## Wishlist for future glycowork updates (last update: 2025-02-05)
## Wishlist for future glycowork updates (last update: 2025-02-13)

### Urgent

* more, and more informative, error messages

### At some point

* less commonly used nomenclatures for universal input: GlycoWorkBench
* any further expansion of our universal input pipeline, to cover more use cases
* split motif_list into ‘core’ motifs (occurring frequently) and ‘extended’ motifs (that are rare or niche) for performance reasons
* characterize_monosaccharide only factors in subsequent sequence context; make it possible (as an option) to also consider upstream sequence context
Expand Down
702 changes: 282 additions & 420 deletions bin/glycoworkGUI.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions glycowork/ml/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def prep_model(model_type: Literal["SweetNet", "LectinOracle", "LectinOracle_fle
num_classes: int, # number of unique classes for classification
libr: Optional[Dict[str, int]] = None, # dictionary of form glycoletter:index
trained: bool = False, # whether to use pretrained model
hidden_dim: int = 128 # hidden dimension for the model (SweetNet only)
hidden_dim: int = 128 # hidden dimension for the model (SweetNet/LectinOracle only)
) -> torch.nn.Module: # initialized PyTorch model
"wrapper to instantiate model, initialize it, and put it on the GPU"
if libr is None:
Expand All @@ -311,7 +311,7 @@ def prep_model(model_type: Literal["SweetNet", "LectinOracle", "LectinOracle_fle
model.load_state_dict(torch.load("SweetNet_v1_4.pt", map_location = device, weights_only = True))
model = model.to(device)
elif model_type == 'LectinOracle':
model = LectinOracle(len(libr), num_classes = num_classes)
model = LectinOracle(len(libr), num_classes = num_classes, input_size_prot = 10*hidden_dim)
model = model.apply(lambda module: init_weights(module, mode = 'xavier'))
if trained:
if not Path("LectinOracle_v1_4.pt").exists():
Expand Down
2 changes: 1 addition & 1 deletion glycowork/motif/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -1270,7 +1270,7 @@ def draw_chem2d(
elif filepath.suffix.lower() == '.pdf':
try:
from cairosvg import svg2pdf
svg2pdf(bytestring = svg_data, write_to = filepath)
svg2pdf(bytestring = svg_data, write_to = str(filepath))
except ImportError:
raise ImportError("You're missing some draw dependencies. Either use .svg or head to https://bojarlab.github.io/glycowork/examples.html#glycodraw-code-snippets to learn more.")
return SVG(svg_data) if is_jupyter() else display_svg_with_matplotlib(svg_data)
Expand Down
55 changes: 32 additions & 23 deletions glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,15 @@ def find_isomorphs(glycan: str # Glycan in IUPAC-condensed format
if re.search(main_vs_branch, k):
temp.add(re.sub(main_vs_branch, r'\2[\1]\3', k))
out_list.update(temp)
temp = {re.sub(r'(.*Man\(a1-[36]\))\[(.*Man\(a1-[36]\))\](Man\(b1-4\).*)', r'\2[\1]\3', k) for k in out_list}
out_list.update(temp)
# Double branch swap
temp = {re.sub(r'(?<!\[)\[((?:[^[\]]|\[[^[\]]*\])*?)\]\[((?:[^[\]]|\[[^[\]]*\])*?)\]', r'[\2][\1]', k) for k in out_list if '][' in k}
out_list.update(temp)
temp = {re.sub(r'\[((?:[^[\]]|\[(?:[^[\]]|\[[^[\]]*\])*\])*)\]\[((?:[^[\]]|\[(?:[^[\]]|\[[^[\]]*\])*\])*)\]', r'[\2][\1]', k) for k in out_list if '][' in k}
out_list.update(temp)
temp = {re.sub(r'(.*)\[((?:[^[\]]|\[[^[\]]*\])*)\]\[((?:[^[\]]|\[[^[\]]*\])*)\]([A-Z][^[\]]*?)$', r'\1[\3][\2]\4', k) for k in out_list if '][' in k}
out_list.update(temp)
# Triple branch handling
temp = set()
for k in out_list:
Expand Down Expand Up @@ -273,6 +277,25 @@ def compare_branches(branch1: str, branch2: str, use_linkage: bool = False) -> b
# Neither is single monosaccharide: use linkage ordering
return pos1 > pos2

def kill_noncanonical(glycans: list) -> list:
    """Filter out glycan strings in which a bracketed branch outweighs its parent chain.

    Each glycan string is scanned character by character. The number of '('
    characters seen so far is tracked per bracket nesting level. When a branch
    closes with ']', the number of '(' inside that branch is compared with the
    number of '(' that the enclosing level had accumulated before the branch
    opened; if the branch has strictly more, the glycan is rejected.

    Args:
        glycans: glycan strings to filter (any iterable order is preserved).

    Returns:
        A new list with the rejected strings removed; surviving strings keep
        their original order and duplicates.

    Note:
        A ']' without a matching '[' is ignored rather than raising
        IndexError, so a malformed string is never rejected (or crashed on)
        because of the stray bracket alone.
    """
    kill_list = set()
    for g in glycans:
        paren_counts = [0]  # Stack of '(' counts, one entry per open nesting level
        current_count = 0  # '(' count of the level currently being scanned
        for c in g:
            if c == '(':
                current_count += 1
                paren_counts[-1] = current_count  # Update current level's count
            elif c == '[':
                paren_counts.append(0)  # Start new branch level
                current_count = 0  # Reset for new branch
            elif c == ']':
                if len(paren_counts) < 2:
                    # Unmatched closing bracket: there is no parent level to compare with
                    continue
                if current_count > paren_counts[-2]:  # Compare with parent branch
                    kill_list.add(g)
                    break  # Verdict is final; no need to scan the rest of this string
                current_count = paren_counts[-2]  # Return to parent branch's count
                paren_counts.pop()  # Remove current branch level
    return [g for g in glycans if g not in kill_list]

# Handle neighboring branches
kill_list = set()
for g in glycans2:
Expand Down Expand Up @@ -338,21 +361,9 @@ def compare_branches(branch1: str, branch2: str, use_linkage: bool = False) -> b
min_ending = min(branch_endings)
glycans2 = [g for k, g in enumerate(glycans2) if branch_endings[k] == min_ending]
if len(glycans2) > 1:
complexity = [count_nested_brackets(g, length = True) for g in glycans2]
min_complexity = min(complexity)
glycans2 = [g for k, g in enumerate(glycans2) if complexity[k] == min_complexity]
if len(glycans2) > 1:
preprefix = min_process_glycans([glyc[:glyc.index('[')] for glyc in glycans2])
branch_endings = [k[-1][-1] if k[-1][-1].isdigit() else 10 for k in preprefix]
branch_endings = [int(k) for k in branch_endings]
min_ending = min(branch_endings)
glycans2 = [g for k, g in enumerate(glycans2) if branch_endings[k] == min_ending]
if len(glycans2) > 1:
correct_isoform = sorted(glycans2, key = lambda x: sum(int(num) for num in re.findall(r'(\d+)\)\[', x)))[0] # take the one with lowest sum of num)[
else:
correct_isoform = glycans2[0]
else:
correct_isoform = glycans2[0]
glycans2 = kill_noncanonical(glycans2) or glycans2 if order_by == "length" else glycans2
correct_isoform = sorted(glycans2, key = lambda x: sum((10 if num == '?' else int(num)) * (len(re.findall(r'(\d+|\?)\)[\[\]]', x))-i) \
for i, num in enumerate(re.findall(r'(\d+|\?)\)[\[\]]', x))))[0] # take the one with lowest sum of num)[ and num)]
else:
correct_isoform = glycans2[0]
else:
Expand Down Expand Up @@ -887,9 +898,7 @@ def glycoworkbench_to_iupac(glycan: str # Glycan in GlycoWorkBench nomenclature
converted_glycan = ''.join(f"{{{part}}}" for part in floaty_parts) + converted_glycan
converted_glycan = re.sub(r'S[\)\(]*\?1-\?\)\[(.*?)\]([^(]+)', r'\1\2OS', converted_glycan) # O-sulfate case
converted_glycan = re.sub(r'S[\)\(]*\?1-(\d)\)\[(.*?)\]([^(]+)', r'\2\3\1S', converted_glycan) # numbered sulfate
if 'freeEnd' in glycan:
return converted_glycan[:-6]+'-ol'
return converted_glycan[:-6]
return f"{converted_glycan[:-6]}-ol" if 'freeEnd' in glycan else converted_glycan[:-6]


def check_nomenclature(glycan: str # Glycan string to check
Expand All @@ -903,9 +912,9 @@ def check_nomenclature(glycan: str # Glycan string to check

def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
) -> str: # Standardized IUPAC-condensed format
"Convert glycan from IUPAC-extended, LinearCode, GlycoCT, WURCS, Oxford, and GLYCAM to standardized IUPAC-condensed format"
"Convert glycan from IUPAC-extended, LinearCode, GlycoCT, WURCS, Oxford, GLYCAM, and GlycoWorkBench to standardized IUPAC-condensed format"
glycan = glycan.strip()
# Check for different nomenclatures: LinearCode, IUPAC-extended, GlycoCT, WURCS, Oxford, GLYCAM
# Check for different nomenclatures: LinearCode, IUPAC-extended, GlycoCT, WURCS, Oxford, GLYCAM, GlycoWorkBench
if ';' in glycan:
glycan = linearcode_to_iupac(glycan)
elif '-D-' in glycan:
Expand All @@ -918,7 +927,7 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
glycan = glycam_to_iupac(glycan)
elif '$MONO' in glycan:
glycan = glycoworkbench_to_iupac(glycan)
elif not isinstance(glycan, str) or any([k in glycan for k in ['@']]):
elif not isinstance(glycan, str) or '@' in glycan:
check_nomenclature(glycan)
return
elif ((glycan[-1].isdigit() and bool(re.search("[A-Z]", glycan))) or (glycan[-2].isdigit() and glycan[-1] == ']') or glycan.endswith('B') or glycan.endswith("LacDiNAc")) and 'e' not in glycan and '-' not in glycan:
Expand Down Expand Up @@ -1006,8 +1015,8 @@ def canonicalize_iupac(glycan: str # Glycan sequence in any supported format
elif '-' not in prefix:
glycan = glycan.replace('+', '(?1-?)+')
glycan = '{'+glycan.replace('+', '}')
post_process = {'5Ac(?1': '5Ac(a2', '5Gc(?1': '5Gc(a2', '5Ac(a1': '5Ac(a2', '5Gc(a1': '5Gc(a2', 'Fuc(?': 'Fuc(a',
'GalS': 'GalOS', 'GlcS': 'GlcOS', 'GlcNAcS': 'GlcNAcOS', 'GalNAcS': 'GalNAcOS', 'SGal': 'GalOS', 'Kdn(?1': 'Kdn(a2',
post_process = {'5Ac(?': '5Ac(a', '5Gc(?': '5Gc(a', '5Ac(a1': '5Ac(a2', '5Gc(a1': '5Gc(a2', 'Fuc(?': 'Fuc(a',
'GalS': 'GalOS', 'GlcS': 'GlcOS', 'GlcNAcS': 'GlcNAcOS', 'GalNAcS': 'GalNAcOS', 'SGal': 'GalOS', 'Kdn(?': 'Kdn(a',
'Kdn(a1': 'Kdn(a2'}
glycan = multireplace(glycan, post_process)
glycan = re.sub(r'[ab]-$', '', glycan) # Remove endings like Glcb-
Expand Down
Loading

0 comments on commit d1ff321

Please sign in to comment.