From ef776b2454fd523104afa34badad586202601697 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 20 Nov 2024 18:37:08 +0100 Subject: [PATCH 01/10] allow bonding after branch --- cgsmiles/read_fragments.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index f065317..5c144d9 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -126,6 +126,7 @@ def strip_bonding_descriptors(fragment_string): node_count = 0 prev_node = 0 current_order = None + anchor = [] for token in smile_iter: if token == '[': peek = next(smile_iter) @@ -162,10 +163,10 @@ def strip_bonding_descriptors(fragment_string): prev_node = node_count node_count += 1 elif token == '(': - anchor = prev_node + anchor.append(prev_node) smile += token elif token == ')': - prev_node = anchor + prev_node = anchor.pop() smile += token elif token in bond_to_order: current_order = bond_to_order[token] From 04c01b5b58a5d3c5875db92e988eab28606693e5 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 20 Nov 2024 18:39:32 +0100 Subject: [PATCH 02/10] add tests --- cgsmiles/tests/test_cgsmile_parsing.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 812817f..3f2814f 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -269,6 +269,18 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None), + # smiple symmetric bonding after branch + ("[$]CC(CC)[$]", + "CC(CC)", + {0: ["$1"], 1: ["$1"]}, + None, + None), + # smiple symmetric bonding after ring + ("[$]CC1[$]CCC1", + "CC1CCC1", + {0: ["$1"], 1: ["$1"]}, + None, + None), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", From ff51950af85b40969a79ed7a738f88ca438d3f48 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 20 Nov 2024 
18:48:48 +0100 Subject: [PATCH 03/10] clear bond orders correctly --- cgsmiles/read_fragments.py | 2 +- cgsmiles/tests/test_cgsmile_parsing.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 5c144d9..7fb49a1 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -158,10 +158,10 @@ def strip_bonding_descriptors(fragment_string): else: atom += peek peek = next(smile_iter) - smile = smile + atom + "]" prev_node = node_count node_count += 1 + current_order = None elif token == '(': anchor.append(prev_node) smile += token diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 3f2814f..269ff89 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -281,6 +281,12 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 1: ["$1"]}, None, None), + # clear order symbol + ("[CH][$a]=[CH][$c]", + "[CH]=[CH]", + {0: ["$a1"], 1: ["$c1"]}, + None, + None), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", From a083eb603256df236bb02493332ad3932255ed11 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 20 Nov 2024 18:53:05 +0100 Subject: [PATCH 04/10] add tests for multiple bonding descriptors each having a bond order different from 1 --- cgsmiles/tests/test_cgsmile_parsing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 269ff89..9f67d2c 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -287,6 +287,12 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$a1"], 1: ["$c1"]}, None, None), + # multiple bonding descriptors with bond order other than one + ("CC=[$a]=[$b]CC", + "CCCC", + {1: ["$a2", "$b2"]}, + None, + None), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", From
354aba2be1a07f801bd4520f36e930da686aa6e5 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 21 Nov 2024 10:33:22 +0100 Subject: [PATCH 05/10] adjust hcount by bonding op number if required --- cgsmiles/pysmiles_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index e8d6213..d6cb21c 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -76,9 +76,21 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): raise SyntaxError(msg) nx.set_node_attributes(mol_graph, 0, 'hcount') + # first we need to figure out the correct hcounts on each node + # this also corrects for simple aromatic problems like in thiophene pysmiles.smiles_helper.fill_valence(mol_graph, respect_hcount=False) + + # optionally we adjust the hcount by the number of bonding operators + if keep_bonding: + bonding_nodes = nx.get_node_attributes(mol_graph, 'bonding') + for node, bond_ops in bonding_nodes.items(): + mol_graph.nodes[node]['hcount'] -= len(bond_ops) + + # now we add the hydrogen atoms pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) + # if we are having single hydrogen fragments we need to + # make sure the fragid and fragname are kept for node in mol_graph.nodes: if mol_graph.nodes[node].get("element", "*") == "H" and\ not mol_graph.nodes[node].get("single_h_frag", False): From 4624f521da43f08c4330f23dc5b343dd5a5754bc Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 21 Nov 2024 10:41:57 +0100 Subject: [PATCH 06/10] set atomnames also for meta-graph if one is provided --- cgsmiles/graph_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index 64c384f..ab750fd 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -175,7 +175,9 @@ def set_atom_names_atomistic(molecule, meta_graph=None): assert len(fragids) == 1 fraglist[fragids[0]].append(node) - for fragnodes in
fraglist.values(): + for meta_node, fragnodes in fraglist.items(): for idx, node in enumerate(fragnodes): atomname = molecule.nodes[node]['element'] + str(idx) molecule.nodes[node]['atomname'] = atomname + if meta_graph: + meta_graph.nodes[meta_node]['graph'].nodes[node]['atomname'] = atomname From 715e1bc2ad42fe0a47d9c45f3b82e3935aa8a6c9 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 21 Nov 2024 10:42:24 +0100 Subject: [PATCH 07/10] remove double atomname assignment in resolve --- cgsmiles/resolve.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index d079e05..dae2eb5 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -382,18 +382,16 @@ def resolve(self): mark_chiral_atoms(self.molecule) # assign rs isomerism annotate_ez_isomers(self.molecule) - # in all-atom MD there are common naming conventions - # that might be expected and hence we set them here - set_atom_names_atomistic(self.molecule, self.meta_graph) # and redo the meta molecule self.meta_graph = annotate_fragments(self.meta_graph, self.molecule) - # in all-atom MD there are common naming conventions - # that might be expected and hence we set them here if all_atom: - set_atom_names_atomistic(self.molecule, self.meta_graph) + # in all-atom MD there are common naming conventions + # that might be expected and hence we set them here + set_atom_names_atomistic(self.molecule, + self.meta_graph) # increment the resolution counter self.resolution_counter += 1 From b36f0fc2d4031ae1c0559052bd01bb43b5e12e9f Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 21 Nov 2024 12:34:38 +0100 Subject: [PATCH 08/10] tests and account for bond order --- cgsmiles/pysmiles_utils.py | 16 +++++++++------- cgsmiles/tests/test_utils.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 7 deletions(-) create mode 100644 cgsmiles/tests/test_utils.py diff --git a/cgsmiles/pysmiles_utils.py 
b/cgsmiles/pysmiles_utils.py index d6cb21c..42de836 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -67,12 +67,12 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): pysmiles.smiles_helper.correct_aromatic_rings(mol_graph, strict=True) except SyntaxError as pysmiles_err: print(pysmiles_err) - msg = ("Likely you are writing an aromatic molecule that does not " - "show delocalization-induced molecular equivalency and thus " - "is not considered aromatic. For example, 4-methyl imidazole " - "is often written as [nH]1cc(nc1)C, but should be written as " - "[NH]1C=C(N=C1)C. A corresponding CGSmiles string would be " - "{[#A]1[#B][#C]1}.{#A=[>][<]N,#B=[$]N=C[>],#C=[$]C(C)=C[<]}") + msg = (r"Likely you are writing an aromatic molecule that does not " + r"show delocalization-induced molecular equivalency and thus " + r"is not considered aromatic. For example, 4-methyl imidazole " + r"is often written as [nH]1cc(nc1)C, but should be written as " + r"[NH]1C=C(N=C1)C. 
A corresponding CGSmiles string would be " + r"{[#A]1[#B][#C]1}.{#A=[>][<]N,#B=[$]N=C[>],#C=[$]C(C)=C[<]}") raise SyntaxError(msg) nx.set_node_attributes(mol_graph, 0, 'hcount') @@ -84,7 +84,9 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): if keep_bonding: bonding_nodes = nx.get_node_attributes(mol_graph, 'bonding') for node, bond_ops in bonding_nodes.items(): - mol_graph.nodes[node]['hcount'] -= len(bond_ops) + print(bond_ops) + print(sum([int(bond[-1]) for bond in bond_ops])) + mol_graph.nodes[node]['hcount'] -= sum([int(bond[-1]) for bond in bond_ops]) # now we add the hydrogen atoms pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) diff --git a/cgsmiles/tests/test_utils.py b/cgsmiles/tests/test_utils.py new file mode 100644 index 0000000..fa0c730 --- /dev/null +++ b/cgsmiles/tests/test_utils.py @@ -0,0 +1,32 @@ +import re +import pytest +import cgsmiles + +err_msg_rebuild_h = ("Likely you are writing an aromatic molecule that does not " + "show delocalization-induced molecular equivalency and thus " + "is not considered aromatic. For example, 4-methyl imidazole " + "is often written as [nH]1cc(nc1)C, but should be written as " + "[NH]1C=C(N=C1)C. 
A corresponding CGSmiles string would be " + "{[#A]1[#B][#C]1}.{#A=[>][<]N,#B=[$]N=C[>],#C=[$]C(C)=C[<]}") + +@pytest.mark.parametrize('frag_str, hatoms_ref, error_type, err_msg', ( + ('{#A=[$]CCC[$]}', 6, None, None), + ('{#A=CCC}', 8, None, None), + ('{#A=C[!]CC}', 7, None, None), + ('{#A=[$]=CCC=[$]}', 4, None, None), + ('{#A=[$]cccc}',5, None, None), + ('{#A=[$]ccc}', 0, SyntaxError, err_msg_rebuild_h), +)) +def test_rebuild_hatoms(frag_str, hatoms_ref, error_type, err_msg): + frag_dict = cgsmiles.read_fragments(frag_str) + frag_graph = frag_dict['A'] + if error_type: + with pytest.raises(error_type, match=re.escape(err_msg)): + cgsmiles.pysmiles_utils.rebuild_h_atoms(frag_graph, keep_bonding=True) + else: + cgsmiles.pysmiles_utils.rebuild_h_atoms(frag_graph, keep_bonding=True) + hatoms = 0 + for node, ele in frag_graph.nodes(data='element'): + if ele == 'H': + hatoms += 1 + assert hatoms == hatoms_ref From 48051a16cf19d4dbdaf00dbe0c9c14a95894d5a0 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 21 Nov 2024 16:34:25 +0100 Subject: [PATCH 09/10] remove leftovers --- cgsmiles/pysmiles_utils.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 42de836..952a353 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -67,12 +67,12 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): pysmiles.smiles_helper.correct_aromatic_rings(mol_graph, strict=True) except SyntaxError as pysmiles_err: print(pysmiles_err) - msg = (r"Likely you are writing an aromatic molecule that does not " - r"show delocalization-induced molecular equivalency and thus " - r"is not considered aromatic. For example, 4-methyl imidazole " - r"is often written as [nH]1cc(nc1)C, but should be written as " - r"[NH]1C=C(N=C1)C. 
A corresponding CGSmiles string would be " - r"{[#A]1[#B][#C]1}.{#A=[>][<]N,#B=[$]N=C[>],#C=[$]C(C)=C[<]}") + msg = ("Likely you are writing an aromatic molecule that does not " + "show delocalization-induced molecular equivalency and thus " + "is not considered aromatic. For example, 4-methyl imidazole " + "is often written as [nH]1cc(nc1)C, but should be written as " + "[NH]1C=C(N=C1)C. A corresponding CGSmiles string would be " + "{[#A]1[#B][#C]1}.{#A=[>][<]N,#B=[$]N=C[>],#C=[$]C(C)=C[<]}") raise SyntaxError(msg) nx.set_node_attributes(mol_graph, 0, 'hcount') @@ -84,8 +84,6 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): if keep_bonding: bonding_nodes = nx.get_node_attributes(mol_graph, 'bonding') for node, bond_ops in bonding_nodes.items(): - print(bond_ops) - print(sum([int(bond[-1]) for bond in bond_ops])) mol_graph.nodes[node]['hcount'] -= sum([int(bond[-1]) for bond in bond_ops]) # now we add the hydrogen atoms From 6a11252a1907f856e90a61905af738ae84766cbd Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 21 Nov 2024 16:36:14 +0100 Subject: [PATCH 10/10] add test --- cgsmiles/tests/test_cgsmile_parsing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 9f67d2c..9a59a7b 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -293,6 +293,12 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {1: ["$a2", "$b2"]}, None, None), + # multiple bonding descriptors with different bond orders + ("CC[$a]=[$b]CC", + "CCCC", + {1: ["$a1", "$b2"]}, + None, + None), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC",