Skip to content

Commit

Permalink
forbid too aggressive branch swapping in find_isomorphs
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Feb 12, 2025
1 parent 91331e7 commit b60fcea
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 23 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@
##### Added ✨
- Natively support narrow linkage ambiguity in `categorical_node_match_wildcard`; that means you can use things like "Gal(b1-3/4)GlcNAc" with `subgraph_isomorphism` or `compare_glycans` (as well as all functions using these core functions) and it will only return True for "Gal(b1-3)GlcNAc", "Gal(b1-4)GlcNAc", and "Gal(b1-?)GlcNAc" (b94744e)
- Added `build_wildcard_cache` for a central handling of wildcard mapping that can also be cached (a98461f)
- `compare_glycans` now also has the `return_matches` keyword argument that allows for a retrieval of the node mapping if the glycans are isomorphic
- `compare_glycans` now also has the `return_matches` keyword argument that allows for a retrieval of the node mapping if the glycans are isomorphic (7c510c9)

##### Changed 🔄
- Ensured that `compare_glycans` is 100% order-specific, never matching something like ("Gal(b1-4)GlcNAc", "GlcNAc(b1-4)Gal") (5a99d6b)
Expand All @@ -148,7 +148,7 @@
- `get_coordinates_and_labels` now internally uses `motif.processing.choose_correct_isoform` to reorder the glycan for drawing (41bb1a1)
- Improved console drawing quality controlled by `display_svg_with_matplotlib` and image quality in Excel cells using `plot_glycans_excel` (a64f694)
- `draw_chem2d` and `draw_chem3d` will now detect whether the user is in a Jupyter environment and, if not, plot to the Matplotlib console (c3a7f64)
- `process_per_residue` will now re-order the `per_residue` list in the same way as the glycan is re-ordered for drawing with `GlycoDraw`
- `process_per_residue` will now re-order the `per_residue` list in the same way as the glycan is re-ordered for drawing with `GlycoDraw` (7c510c9)

##### Deprecated ⚠️
- Deprecated `hex_circumference`, the functionality is now available within `draw_hex` with the new keyword argument "outline_only" (4f1ccfa)
Expand Down
7 changes: 3 additions & 4 deletions glycowork/glycan_data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,10 +252,10 @@ def download_model(file_id: str, # Google Drive file ID
"Download the model weights file from Google Drive"
file_id = file_id.split('/d/')[1].split('/view')[0]
url = f'https://drive.google.com/uc?id={file_id}'
response = requests.get(url, stream=True, timeout=10)
response = requests.get(url, stream = True, timeout = 10)
if response.status_code == 200:
with open(local_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
for chunk in response.iter_content(chunk_size = 8192):
f.write(chunk)
print("Download completed.")
else:
Expand Down Expand Up @@ -360,8 +360,7 @@ def deserialize(cls, path: str # file path to load serialized data
serializer = DataFrameSerializer()


def count_nested_brackets(
s: str,
def count_nested_brackets(s: str,
length: bool = False
) -> int:
count = 0
Expand Down
2 changes: 1 addition & 1 deletion glycowork/motif/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -1306,7 +1306,7 @@ def draw_chem3d(
atom_colors = {k: ['#ECECEC'] if len(v) > 1 else v for k, v in atom_colors.items()}

if pdb_file:
mol = MolFromPDBFile(pdb_file)
mol = MolFromPDBFile(str(pdb_file))
else:
mol = AddHs(mol)
EmbedMolecule(mol)
Expand Down
36 changes: 20 additions & 16 deletions glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ def get_possible_linkages(wildcard: str, # Pattern to match, ? can be wildcard
numbers = re.search(r'-(\d+(?:/\d+)*)', wildcard).group(1).split('/')
base_pattern = f"{prefix}-({('|'.join(numbers))}|\\?)"
return {l for l in linkage_list if re.compile(f'^{base_pattern}$').fullmatch(l)} | \
({f"{wildcard[:wildcard.index('-')]}-{'/'.join(sorted(combo))}"
for combo in combinations(numbers, r = 2)} if len(numbers) > 2 else set())
({f"{wildcard[:wildcard.index('-')]}-{'/'.join(sorted(combo))}"
for combo in combinations(numbers, r = 2)} if len(numbers) > 2 else set())
pattern = f"^{wildcard.replace('?', '[ab1-9?]')}$"
return {l for l in linkage_list if re.compile(pattern).fullmatch(l)}

Expand Down Expand Up @@ -128,7 +128,7 @@ def bracket_removal(glycan_part: str # Residual part of glycan from glycan_to_gr
def find_isomorphs(glycan: str # Glycan in IUPAC-condensed format
) -> List[str]: # List of isomorphic glycan notations
"Returns a set of isomorphic glycans by swapping branches"
if '[' not in glycan or glycan.index('[') == 0:
if '[' not in glycan or ']' not in glycan or glycan.index('[') == 0:
return [glycan]
floaty = False
if '{' in glycan:
Expand Down Expand Up @@ -162,18 +162,22 @@ def find_isomorphs(glycan: str # Glycan in IUPAC-condensed format
temp = set()
for k in out_list:
if k.count('[') >= 3 and k.count('][') >= 2:
m = re.search(r'^(.*?)\[(.*?)\]\[(.*?)\]\[(.*?)\](.*?)$', k)
if m and not bool(re.search(r'\[[^\]]+\[', k)):
main, b1, b2, b3, rest = m.groups()
branches = [main, b1, b2, b3]
# Generate all 24 permutations
for p in permutations(range(4)):
# First element is main chain (no brackets), rest get brackets
result = branches[p[0]]
for idx in p[1:]:
result += f"[{branches[idx]}]"
result += rest
temp.add(result)
groups = re.finditer(r'((?:^[^[]+|^)\[[^[\]]*\]\[[^[\]]*\]\[[^[\]]*\](?=[A-Z]|$))', k)
for g in groups:
branch_section = g.group(1)
if not bool(re.search(r'\[[^\]]+\[', branch_section)):
m = re.search(r'^(.*?)\[(.*?)\]\[(.*?)\]\[(.*?)\]', branch_section)
if m:
main, b1, b2, b3 = m.groups()
branches = [main, b1, b2, b3]
# Generate all 24 permutations
for p in permutations(range(4)):
# First element is main chain (no brackets), rest get brackets
result = k[:g.start(1)] + branches[p[0]]
for idx in p[1:]:
result += f"[{branches[idx]}]"
result += k[g.end(1):]
temp.add(result)
out_list.update(temp)
temp = set()
# Starting branch swapped with next side branch again to also include double branch swapped isomorphs
Expand All @@ -182,7 +186,7 @@ def find_isomorphs(glycan: str # Glycan in IUPAC-condensed format
if k.count('[') > 1 and k.index('[') > 0 and find_nth(k, '[', 2) > k.index(']') and (find_nth(k, ']', 2) < find_nth(k, '[', 3) or k.count('[') == 2):
temp.add(re.sub(r'^(.*?)\[(.*?)\](.*?)\[(.*?)\]', r'\4[\1[\2]\3]', k, 1))
out_list.update(temp)
out_list = {k for k in out_list if not any([j in k for j in ['[[', ']]']])}
out_list = {k for k in out_list if not any([j in k for j in ['[[', ']]']]) and k.index(']') > k.index('[')}
if floaty:
out_list = {floaty+k for k in out_list}
return list(out_list)
Expand Down
1 change: 1 addition & 0 deletions tests/test_core_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,7 @@ def test_choose_correct_isoform():
result = choose_correct_isoform("Gal(a1-3)[Fuc(a1-2)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-6)]Gal(b1-4)GlcNAc(b1-6)[Gal(a1-3)[Fuc(a1-2)]Gal(b1-4)GlcNAc(b1-3)]GalNAc")
assert result == "Fuc(a1-2)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-6)[Gal(a1-3)]Gal(b1-4)GlcNAc(b1-6)[Fuc(a1-2)[Gal(a1-3)]Gal(b1-4)GlcNAc(b1-3)]GalNAc"
assert choose_correct_isoform("Xyl(b1-2)[Man(a1-3)][Man(a1-6)][GlcNAc(b1-4)]Man(b1-4)GlcNAc(b1-4)GlcNAc") == "Xyl(b1-2)[Man(a1-3)][GlcNAc(b1-4)][Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
assert choose_correct_isoform('Man(a1-3)[Xyl(b1-2)][Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)][Fuc(a1-6)]GlcNAc') == "Xyl(b1-2)[Man(a1-3)][Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)][Fuc(a1-6)]GlcNAc"
# Mode for GlycoDraw
result = choose_correct_isoform("Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)][GlcNAc(b1-4)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc", order_by="linkage")
assert result == "Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[GlcNAc(b1-4)][Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc"
Expand Down

0 comments on commit b60fcea

Please sign in to comment.