From 5a99d6b0bc978222b274fa4d7744878e9a241637 Mon Sep 17 00:00:00 2001
From: Daniel Bojar <daniel@bojar.net>
Date: Fri, 15 Nov 2024 14:30:39 +0100
Subject: [PATCH] more tests, make compare_glycans direction-sensitive,
 fix DeprecationWarnings

---
 glycowork/glycan_data/loader.py |   4 +-
 glycowork/motif/annotate.py     |   4 +-
 glycowork/motif/graph.py        |  16 +-
 tests/test_core_functions.py    | 390 +++++++++++++++++++++++++++++++-
 4 files changed, 397 insertions(+), 17 deletions(-)

diff --git a/glycowork/glycan_data/loader.py b/glycowork/glycan_data/loader.py
index a8424ab5..dd09134f 100644
--- a/glycowork/glycan_data/loader.py
+++ b/glycowork/glycan_data/loader.py
@@ -17,12 +17,12 @@
 
 def __getattr__(name):
   if name == "glycan_binding":
-    with resources.open_text("glycowork.glycan_data", "glycan_binding.csv") as f:
+    with resources.files("glycowork.glycan_data").joinpath("glycan_binding.csv").open(encoding = 'utf-8-sig') as f:
       glycan_binding = pd.read_csv(f)
     globals()[name] = glycan_binding  # Cache it to avoid reloading
     return glycan_binding
   elif name == "df_species":
-    with resources.open_text("glycowork.glycan_data", "v11_df_species.csv") as f:
+    with resources.files("glycowork.glycan_data").joinpath("v11_df_species.csv").open(encoding = 'utf-8-sig') as f:
       df_species = pd.read_csv(f)
     globals()[name] = df_species  # Cache it to avoid reloading
     return df_species
diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py
index 9a1c5c62..1652a208 100644
--- a/glycowork/motif/annotate.py
+++ b/glycowork/motif/annotate.py
@@ -6,7 +6,7 @@
 from functools import partial
 from typing import Dict, List, Optional, Set, Tuple, Union
 
-from glycowork.glycan_data.loader import linkages, motif_list, find_nth, unwrap, replace_every_second, remove_unmatched_brackets
+from glycowork.glycan_data.loader import linkages, motif_list, find_nth, unwrap, replace_every_second, remove_unmatched_brackets, df_species
 from glycowork.motif.graph import subgraph_isomorphism, generate_graph_features, glycan_to_nxGraph, graph_to_string, ensure_graph, possible_topology_check
 from glycowork.motif.processing import IUPAC_to_SMILES, get_lib, find_isomorphs, rescue_glycans
 from glycowork.motif.regex import get_match
@@ -248,7 +248,7 @@ def clean_up_heatmap(
   # Group the DataFrame by identical rows
   grouped = df.groupby(list(df.columns))
   # Find the row with the longest string index within each group and return a new DataFrame
-  max_idx_series = grouped.apply(lambda group: group.index.to_series().str.len().idxmax())
+  max_idx_series = grouped.apply(lambda group: group.index.to_series().str.len().idxmax(), include_groups = False)
   result = df.loc[max_idx_series].drop_duplicates()
   result.index = result.index.str.strip()
   motif_dic = {value: key for key, value in motif_dic.items()}
diff --git a/glycowork/motif/graph.py b/glycowork/motif/graph.py
index 5de7e60a..9452cf25 100644
--- a/glycowork/motif/graph.py
+++ b/glycowork/motif/graph.py
@@ -190,13 +190,21 @@ def compare_glycans(glycan_a: Union[str, nx.Graph], # First glycan to compare
   narrow_wildcard_list = {k: get_possible_linkages(k) if '?' in k else get_possible_monosaccharides(k) for k in proc
                           if '?' in k or k in {'Hex', 'HexOS', 'HexNAc', 'HexNAcOS', 'dHex', 'Sia', 'HexA', 'Pen', 'Monosaccharide'} or '!' in k}
   if narrow_wildcard_list:
-    return nx.is_isomorphic(g1, g2, node_match = categorical_node_match_wildcard('string_labels', 'unknown', narrow_wildcard_list, 'termini', 'flexible'))
+    matcher = nx.isomorphism.GraphMatcher(g1, g2, categorical_node_match_wildcard('string_labels', 'unknown', narrow_wildcard_list, 'termini', 'flexible'))
+    for m in matcher.isomorphisms_iter():
+      inverse_mapping = {v: k for k, v in m.items()}
+      if all(inverse_mapping[node] < inverse_mapping[neighbor] for node, neighbor in g2.edges()):
+        return True
+    return False
   else:
     # First check whether components of both glycan graphs are identical, then check graph isomorphism (costly)
     if sorted(nx.get_node_attributes(g1, "string_labels").values()) == sorted(nx.get_node_attributes(g2, "string_labels").values()):
-      return nx.is_isomorphic(g1, g2, node_match = nx.algorithms.isomorphism.categorical_node_match('string_labels', 'unknown'))
-    else:
-      return False
+      matcher = nx.isomorphism.GraphMatcher(g1, g2, nx.algorithms.isomorphism.categorical_node_match('string_labels', 'unknown'))
+      for m in matcher.isomorphisms_iter():
+        inverse_mapping = {v: k for k, v in m.items()}
+        if all(inverse_mapping[node] < inverse_mapping[neighbor] for node, neighbor in g2.edges()):
+          return True
+    return False
 
 
 def expand_termini_list(motif: Union[str, nx.Graph], # Glycan motif sequence or graph
diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py
index 44fd2ae3..b314091b 100644
--- a/tests/test_core_functions.py
+++ b/tests/test_core_functions.py
@@ -3,7 +3,6 @@
 import networkx.algorithms.isomorphism as iso
 import pandas as pd
 import numpy as np
-from scipy import stats
 from glycowork.motif.tokenization import (
     constrain_prot, prot_to_coded, string_to_labels, pad_sequence, mz_to_composition,
     get_core, get_modification, get_stem_lib, stemify_glycan, mask_rare_glycoletters,
@@ -33,12 +32,26 @@
     clr_transformation, alr_transformation, get_procrustes_scores,
     get_additive_logratio_transformation, get_BF, get_alphaN,
     pi0_tst, TST_grouped_benjamini_hochberg, compare_inter_vs_intra_group,
-    correct_multiple_testing, process_glm_results, partial_corr,
-    estimate_technical_variance, jtkdist, jtkinit, jtkstat, jtkx, MissForest, impute_and_normalize,
+    correct_multiple_testing, partial_corr, estimate_technical_variance, jtkdist, jtkinit, jtkstat, jtkx, MissForest, impute_and_normalize,
     variance_based_filtering, get_glycoform_diff, get_glm, process_glm_results, replace_outliers_with_IQR_bounds,
     replace_outliers_winsorization, perform_tests_monte_carlo
 )
-from glycowork.motif.graph import glycan_to_nxGraph, graph_to_string
+from glycowork.motif.graph import (
+    glycan_to_graph, glycan_to_nxGraph, evaluate_adjacency,
+    compare_glycans, subgraph_isomorphism, generate_graph_features,
+    graph_to_string, largest_subgraph, get_possible_topologies,
+    deduplicate_glycans, neighbor_is_branchpoint, graph_to_string_int, try_string_conversion,
+    subgraph_isomorphism_with_negation, categorical_node_match_wildcard,
+    expand_termini_list, ensure_graph, possible_topology_check
+)
+from glycowork.motif.annotate import (
+    link_find, annotate_glycan, annotate_dataset, get_molecular_properties,
+    get_k_saccharides, get_terminal_structures, create_correlation_network,
+    group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type,
+    Lectin, load_lectin_lib, create_lectin_and_motif_mappings,
+    lectin_motif_scoring, clean_up_heatmap, quantify_motifs,
+    count_unique_subgraphs_of_size_k, annotate_glycan_topology_uncertainty
+)
 
 
 @pytest.mark.parametrize("glycan", [
@@ -955,13 +968,13 @@ def test_cohen_d():
     # Test with clearly different groups
     group1 = [1, 2, 3, 4, 5]
     group2 = [6, 7, 8, 9, 10]
-    d, var = cohen_d(group1, group2)
+    d, _ = cohen_d(group1, group2)
     assert d < 0  # Effect size should be negative (group1 < group2)
     # Test with paired samples
-    d, var = cohen_d(group1, group2, paired=True)
+    d, _ = cohen_d(group1, group2, paired=True)
     assert d < 0  # Should still be negative
     # Test with identical groups
-    d, var = cohen_d(group1, group1)
+    d, _ = cohen_d(group1, group1)
     assert abs(d) < 1e-10  # Effect size should be approximately 0
 
 
@@ -1376,11 +1389,11 @@ def test_get_glm():
         'H': [1, 2, 1, 2, 1, 2, 1, 2] * 2,
         'N': [2, 3, 2, 3, 2, 3, 2, 3] * 2 
     })
-    model, vars = get_glm(data)
+    model, variables = get_glm(data)
     if not isinstance(model, str):  # If model fitting succeeded
         assert hasattr(model, 'params')
         assert hasattr(model, 'pvalues')
-        assert len(vars) > 0
+        assert len(variables) > 0
 
 
 def test_process_glm_results():
@@ -1426,3 +1439,362 @@ def test_perform_tests_monte_carlo():
     assert len(raw_p) == len(adj_p) == len(effect) == 5
     assert all(0 <= p <= 1 for p in raw_p)
     assert all(0 <= p <= 1 for p in adj_p)
+
+
+def test_evaluate_adjacency():
+    """With second argument 0, "(" and ")" are accepted while "(1)[" and ")[" are not."""
+    assert all(evaluate_adjacency(part, 0) for part in ("(", ")"))
+    # Inputs that additionally contain "[" must be rejected
+    assert not any(evaluate_adjacency(part, 0) for part in ("(1)[", ")["))
+
+
+def test_glycan_to_graph():
+    """A disaccharide parses into three nodes and a 3x3 adjacency matrix linking them in a chain."""
+    nodes, adjacency = glycan_to_graph("Gal(b1-4)GlcNAc")
+    assert len(nodes) == 3  # Gal, b1-4, GlcNAc
+    assert adjacency.shape == (3, 3)
+    assert adjacency[0, 1] == 1  # Gal connected to linkage
+    assert adjacency[1, 2] == 1  # Linkage connected to GlcNAc
+
+
+def test_glycan_to_nxGraph():
+    """A disaccharide graph has 3 nodes, 2 edges and a string_labels attribute on every node."""
+    g = glycan_to_nxGraph("Gal(b1-4)GlcNAc")
+    assert len(g.nodes) == 3
+    assert len(g.edges) == 2
+    assert all('string_labels' in attrs for _, attrs in g.nodes(data=True))
+
+
+def test_compare_glycans():
+    """compare_glycans accepts identical and PTM-wildcard pairs and rejects the other pairs."""
+    # Identical glycans match
+    assert compare_glycans("Gal(b1-4)GlcNAc", "Gal(b1-4)GlcNAc")
+    # Neither the reversed sequence (order-specificity) nor a different glycan may match
+    assert not compare_glycans("Gal(b1-4)GlcNAc", "GlcNAc(b1-4)Gal")
+    assert not compare_glycans("Gal(b1-4)GlcNAc", "Man(a1-3)GlcNAc")
+    # PTM wildcard: GalOS is expected to match Gal6S
+    assert compare_glycans("Gal6S(b1-4)GlcNAc", "GalOS(b1-4)GlcNAc")
+
+
+def test_subgraph_isomorphism():
+    """Exercise presence, counting, negation and termini_list options of subgraph_isomorphism."""
+    # Test motif presence, then motif count (GlcNAc occurs twice)
+    assert subgraph_isomorphism("Gal(b1-4)GlcNAc", "GlcNAc")
+    assert subgraph_isomorphism("Gal(b1-4)GlcNAc(b1-4)GlcNAc", "GlcNAc", count=True) == 2
+    # Test with negation
+    assert subgraph_isomorphism("Gal(b1-4)GlcNAc", "!Man(a1-3)GlcNAc")
+    # Test with termini constraints
+    assert subgraph_isomorphism("Gal(b1-4)GlcNAc", "GlcNAc", termini_list=['terminal'])
+
+
+def test_generate_graph_features():
+    """One glycan yields a one-row DataFrame containing diameter, branching and avgDeg."""
+    feature_table = generate_graph_features("Gal(b1-4)GlcNAc")
+    assert isinstance(feature_table, pd.DataFrame)
+    assert len(feature_table) == 1
+    # Spot-check a few of the generated columns
+    for column in ('diameter', 'branching', 'avgDeg'):
+        assert column in feature_table.columns
+
+
+def test_largest_subgraph():
+    """The GlcNAc(b1-4)GlcNAc tail present in both inputs appears in the result."""
+    first = "Gal(b1-4)GlcNAc(b1-4)GlcNAc"
+    second = "Man(a1-3)GlcNAc(b1-4)GlcNAc"
+    assert "GlcNAc(b1-4)GlcNAc" in largest_subgraph(first, second)
+
+
+def test_get_possible_topologies():
+    """A glycan with a braced {Neu5Ac(a2-?)} part yields a non-empty collection of nx.Graph."""
+    candidates = get_possible_topologies("{Neu5Ac(a2-?)}Gal(b1-3)GalNAc")
+    assert len(candidates) > 0
+    assert all(isinstance(cand, nx.Graph) for cand in candidates)
+
+
+def test_deduplicate_glycans():
+    """Three inputs, two of them differing only in branch order, reduce to two entries."""
+    candidates = [
+        "Gal(b1-4)[Fuc(a1-3)]GlcNAc",
+        "Fuc(a1-3)[Gal(b1-4)]GlcNAc",  # same branches as above, written in swapped order
+        "Man(a1-3)GlcNAc",
+    ]
+    assert len(deduplicate_glycans(candidates)) == 2
+
+
+def test_neighbor_is_branchpoint():
+    """Node 1 borders the degree-4 node 2 and is flagged; node 3 is not flagged."""
+    graph = nx.Graph()
+    graph.add_nodes_from(range(6))
+    graph.add_edges_from([(0, 1), (1, 2), (2, 3), (2, 4), (2, 5)])
+    assert neighbor_is_branchpoint(graph, 1)  # Node 1 connected to branch point 2
+    assert not neighbor_is_branchpoint(graph, 3)  # Node 3 is leaf
+
+
+def test_subgraph_isomorphism_with_negation():
+    """A negated motif matches a glycan lacking it; count + return_matches gives (int, list)."""
+    glycan, motif = "Gal(b1-4)GlcNAc", "!Man(a1-3)GlcNAc"
+    # Should match since glycan doesn't contain Man(a1-3)GlcNAc
+    assert subgraph_isomorphism_with_negation(glycan, motif)
+    # With count=True and return_matches=True the call returns a pair
+    n_hits, hits = subgraph_isomorphism_with_negation(glycan, motif, count=True, return_matches=True)
+    assert isinstance(n_hits, int)
+    assert isinstance(hits, list)
+
+
+def test_categorical_node_match_wildcard():
+    """The node matcher accepts wildcard, exact and flexible-termini attribute pairs."""
+    node_match = categorical_node_match_wildcard(
+        'string_labels', 'unknown', {"Hex": ["Gal", "Glc", "Man"]}, 'termini', 'flexible'
+    )
+    # Wildcard: Hex stands in for Gal
+    assert node_match(
+        {'string_labels': 'Hex', 'termini': 'terminal'},
+        {'string_labels': 'Gal', 'termini': 'terminal'}
+    )
+    # Exact: identical labels and termini
+    assert node_match(
+        {'string_labels': 'GlcNAc', 'termini': 'terminal'},
+        {'string_labels': 'GlcNAc', 'termini': 'terminal'}
+    )
+    # Flexible position on one side, terminal on the other
+    assert node_match(
+        {'string_labels': 'GlcNAc', 'termini': 'flexible'},
+        {'string_labels': 'GlcNAc', 'termini': 'terminal'}
+    )
+
+
+def test_expand_termini_list():
+    """['t', 'i'] expands to three entries containing terminal, internal and flexible."""
+    motif = "Gal(b1-4)GlcNAc"
+    expanded = expand_termini_list(motif, ['t', 'i'])  # terminal, internal
+    assert len(expanded) == 3  # Two monosaccharides + one linkage
+    assert 'terminal' in expanded
+    assert 'internal' in expanded
+    assert 'flexible' in expanded
+
+
+def test_ensure_graph():
+    """A string becomes an nx.Graph; a graph argument is returned as the same object."""
+    # String input is converted
+    converted = ensure_graph("Gal(b1-4)GlcNAc")
+    assert isinstance(converted, nx.Graph)
+    # Graph input comes back as the identical object
+    prebuilt = nx.Graph()
+    prebuilt.add_node(0, string_labels="Gal")
+    passthrough = ensure_graph(prebuilt)
+    assert passthrough is prebuilt
+
+
+def test_possible_topology_check():
+    """Expect two matches for the braced-Neu5Ac glycan, excluding the Neu5Ac-free candidate."""
+    candidates = [
+        "Neu5Ac(a2-3)Gal(b1-4)GlcNAc",
+        "Neu5Ac(a2-6)Gal(b1-4)GlcNAc",
+        "Gal(b1-4)GlcNAc",
+    ]
+    hits = possible_topology_check("{Neu5Ac(a2-?)}Gal(b1-4)GlcNAc", candidates)
+    assert len(hits) == 2  # Should match first two glycans
+    assert "Gal(b1-4)GlcNAc" not in hits
+
+
+def test_link_find():
+    """link_find lists the expected disaccharides for a linear and a branched glycan."""
+    # Simple glycan
+    linear = link_find("Gal(b1-4)Gal(b1-4)GlcNAc")
+    assert "Gal(b1-4)GlcNAc" in linear
+    # Branched glycan
+    branched_glycan = "Gal(b1-4)[Fuc(a1-3)]GlcNAc"
+    branched = link_find(branched_glycan)
+    assert "Gal(b1-4)GlcNAc" in branched
+    assert "Fuc(a1-3)GlcNAc" in branched
+
+
+def test_annotate_glycan():
+    """Annotating one glycan gives a one-row DataFrame indexed by that glycan."""
+    glycan = "Gal(b1-4)GlcNAc"
+    annotated = annotate_glycan(glycan)
+    assert isinstance(annotated, pd.DataFrame)
+    assert list(annotated.index) == [glycan]
+
+
+def test_annotate_dataset():
+    """annotate_dataset returns one row per glycan and graph columns when asked for them."""
+    glycans = ["Gal(b1-4)GlcNAc", "Man(a1-3)GlcNAc"]
+    # Test known motifs
+    known = annotate_dataset(glycans, feature_set=['known'])
+    assert isinstance(known, pd.DataFrame)
+    assert len(known) == 2
+    # Test graph features
+    graph_feats = annotate_dataset(glycans, feature_set=['graph'])
+    assert {'diameter', 'branching'} <= set(graph_feats.columns)
+
+
+def test_get_molecular_properties():
+    """Check the returned property columns; skip when the call raises ImportError."""
+    try:  # only the call itself is guarded, so the assertions stay outside the handler
+        props = get_molecular_properties(["Gal(b1-4)GlcNAc"], verbose=True, placeholder=True)
+    except ImportError:
+        pytest.skip("Skipping test due to missing dependencies")
+    assert isinstance(props, pd.DataFrame)
+    assert 'molecular_weight' in props.columns
+    assert 'xlogp' in props.columns
+
+
+def test_get_k_saccharides():
+    """get_k_saccharides returns a DataFrame for size=2, with and without up_to=True."""
+    glycans = ["Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc"]
+    plain = get_k_saccharides(glycans, size=2)
+    assert isinstance(plain, pd.DataFrame)
+    # Test with up_to=True
+    with_up_to = get_k_saccharides(glycans, size=2, up_to=True)
+    assert isinstance(with_up_to, pd.DataFrame)
+    assert len(with_up_to.columns) > 0
+
+
+def test_get_terminal_structures():
+    """Terminal structures come back as lists for size=1 and size=2."""
+    glycan = "Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc"
+    size_one = get_terminal_structures(glycan, size=1)
+    assert isinstance(size_one, list)
+    assert "Man(b1-4)" in size_one[0]
+    # Test size=2
+    size_two = get_terminal_structures(glycan, size=2)
+    assert isinstance(size_two, list)
+    assert "Man(b1-4)GlcNAc(b1-4)" in size_two
+
+
+def test_create_correlation_network():
+    """With threshold 0.9 the clusters come back as a non-empty list of sets."""
+    data = pd.DataFrame({
+        'glycan1': [1, 2, 3],
+        'glycan2': [1, 2, 3],
+        'glycan3': [-1, -2, -3]
+    })
+    clusters = create_correlation_network(data, 0.9)
+    assert isinstance(clusters, list) and len(clusters) > 0  # check non-empty before indexing
+    assert isinstance(clusters[0], set)
+
+
+def test_group_glycans_core():
+    """The core1 and core2 groups are present and each holds exactly one glycan."""
+    glycans = ["GlcNAc(b1-6)GalNAc", "Gal(b1-3)GalNAc"]
+    glycan_groups, _ = group_glycans_core(glycans, [0.01, 0.02])  # p-value groups not checked here
+    assert "core2" in glycan_groups
+    assert "core1" in glycan_groups
+    assert len(glycan_groups["core2"]) == 1
+    assert len(glycan_groups["core1"]) == 1
+
+
+def test_group_glycans_sia_fuc():
+    """The Sia, Fuc and rest groups are all present in the grouping result."""
+    glycans = ["Neu5Ac(a2-3)Gal", "Fuc(a1-3)GlcNAc", "Gal(b1-4)GlcNAc"]
+    glycan_groups, _ = group_glycans_sia_fuc(glycans, [0.01, 0.02, 0.03])  # p-value groups not checked
+    assert "Sia" in glycan_groups
+    assert "Fuc" in glycan_groups
+    assert "rest" in glycan_groups
+
+
+def test_group_glycans_N_glycan_type():
+    """The complex, high_man and rest groups are all present in the grouping result."""
+    glycans = [
+        "Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc",  # complex
+        "Man(a1-2)Man(a1-2)Man(a1-3)[Man(a1-2)Man(a1-3)[Man(a1-2)Man(a1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc",  # high mannose
+        "Man(b1-4)GlcNAc(b1-4)GlcNAc"  # other
+    ]
+    glycan_groups, _ = group_glycans_N_glycan_type(glycans, [0.01, 0.02, 0.03])  # p-value groups not checked
+    assert "complex" in glycan_groups
+    assert "high_man" in glycan_groups
+    assert "rest" in glycan_groups
+
+
+def test_Lectin():
+    """Build a Lectin directly; check its attributes and that check_binding yields 0, 1 or 2."""
+    lectin = Lectin(
+        abbr=["WGA"],
+        name=["Wheat Germ Agglutinin"],
+        specificity={
+            "primary": {"GlcNAc": []},
+            "secondary": None,
+            "negative": None
+        }
+    )
+    # Test basic attributes
+    assert lectin.abbr == ["WGA"]
+    assert lectin.name == ["Wheat Germ Agglutinin"]
+    # Test binding check
+    result = lectin.check_binding("GlcNAc(b1-4)GlcNAc")
+    assert result in [0, 1, 2]
+
+
+def test_load_lectin_lib():
+    """The lectin library is a dict whose values are all Lectin instances."""
+    library = load_lectin_lib()
+    assert isinstance(library, dict) and all(isinstance(entry, Lectin) for entry in library.values())
+
+
+def test_create_lectin_and_motif_mappings():
+    """Both mappings built for WGA and ConA from the loaded library are dicts."""
+    lectin_mapping, motif_mapping = create_lectin_and_motif_mappings(
+        ["WGA", "ConA"], load_lectin_lib()
+    )
+    assert isinstance(lectin_mapping, dict)
+    assert isinstance(motif_mapping, dict)
+
+
+def test_lectin_motif_scoring():
+    """Scoring hand-made mappings returns a DataFrame with motif and score columns."""
+    lectin_lib = load_lectin_lib()
+    lectin_mapping = {"WGA": 0, "ConA": 1}
+    motif_mapping = {"GlcNAc": {"WGA": 0, "ConA": 1}}
+    lectin_scores = {"WGA": 1.0, "ConA": -0.5}
+    idf = {"WGA": 1.0, "ConA": 1.0}
+    scored = lectin_motif_scoring(
+        lectin_mapping, motif_mapping, lectin_scores, lectin_lib, idf
+    )
+    assert isinstance(scored, pd.DataFrame)
+    for column in ("motif", "score"):
+        assert column in scored.columns
+
+
+def test_clean_up_heatmap():
+    """Cleaning two identical rows returns a DataFrame no longer than the input."""
+    # Both rows carry the same values
+    data = pd.DataFrame({'sample1': [1, 1], 'sample2': [2, 2]},
+                        index=['motif1', 'motif2'])
+    cleaned = clean_up_heatmap(data)
+    assert isinstance(cleaned, pd.DataFrame)
+    assert len(cleaned) <= len(data)
+
+
+def test_quantify_motifs():
+    """Exhaustive motif quantification of two glycans yields a DataFrame with columns."""
+    abundances = pd.DataFrame({
+        'sample1': [1, 2],
+        'sample2': [2, 3]
+    })
+    glycans = ["Gal(b1-4)GlcNAc", "Man(a1-3)GlcNAc"]
+    quantified = quantify_motifs(
+        abundances, glycans, feature_set=['exhaustive'], remove_redundant=True
+    )
+    assert isinstance(quantified, pd.DataFrame)
+    assert len(quantified.columns) > 0
+
+
+def test_count_unique_subgraphs_of_size_k():
+    """Size-2 subgraph counts come back as a dict, with and without terminal=True."""
+    glycan = "Man(a1-2)Man(a1-2)Man(a1-3)[Man(a1-2)Man(a1-3)[Man(a1-2)Man(a1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc"
+    ggraph = glycan_to_nxGraph(glycan)
+    all_counts = count_unique_subgraphs_of_size_k(ggraph, size=2)
+    assert isinstance(all_counts, dict)
+    assert len(all_counts) > 0
+    # Test terminal only
+    terminal_counts = count_unique_subgraphs_of_size_k(ggraph, size=2, terminal=True)
+    assert isinstance(terminal_counts, dict)
+
+
+def test_annotate_glycan_topology_uncertainty():
+    """The annotation is a one-row DataFrame indexed by the input glycan."""
+    glycan = "{Neu5Ac(a2-?)}Fuc(a1-3)[Gal(b1-4)]GlcNAc"
+    annotated = annotate_glycan_topology_uncertainty(glycan)
+    assert isinstance(annotated, pd.DataFrame)
+    assert list(annotated.index) == [glycan]