last minor fixes

BojarLab · Nov 9, 2024 · d5780bd · d5780bd
1 parent a6905dc
commit d5780bd
Show file tree

Hide file tree

Showing 8 changed files with 43 additions and 41 deletions.
diff --git a/glycowork/glycan_data/loader.py b/glycowork/glycan_data/loader.py
@@ -6,7 +6,7 @@
 from os import path
 from itertools import chain
 from importlib import resources
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, Union
 
 with resources.open_text("glycowork.glycan_data", "glycan_motifs.csv") as f:
   motif_list = pd.read_csv(f)
@@ -212,7 +212,7 @@ def replace_every_second(string, old_char, new_char):
   | old_char (string): a string character to be replaced (every second occurrence)
   | new_char (string): the string character to replace old_char with\n
   | Returns:
-  | :-                           
+  | :-
   | Returns string with replaced characters
   """
   count = 0
@@ -285,7 +285,7 @@ def download_model(file_id, local_path = 'model_weights.pt'):
 class DataFrameSerializer:
   """A utility class for serializing and deserializing pandas DataFrames with complex data types
   in a version-independent manner."""
-  
+
   @staticmethod
   def _serialize_cell(value: Any) -> Dict[str, Any]:
     """Convert a cell value to a serializable format with type information."""
@@ -334,7 +334,7 @@ def _deserialize_cell(cell_data: Dict[str, Any]) -> Any:
   @classmethod
   def serialize(cls, df: pd.DataFrame, path: str) -> None:
     """Serialize a DataFrame to JSON with type information.
-    
+
     Args:
       df: pandas DataFrame to serialize
       path: file path to save the serialized data"""
@@ -343,31 +343,31 @@ def serialize(cls, df: pd.DataFrame, path: str) -> None:
       'index': list(df.index),
       'data': []
     }
-    
+
     for _, row in df.iterrows():
       serialized_row = [cls._serialize_cell(val) for val in row]
       data['data'].append(serialized_row)
-    
+
     with open(path, 'w') as f:
       json.dump(data, f)
 
   @classmethod
   def deserialize(cls, path: str) -> pd.DataFrame:
     """Deserialize a DataFrame from JSON.
-    
+
     Args:
       path: file path to load the serialized data from
-    
+
     Returns:
       pandas DataFrame with restored data types"""
     with open(path, 'r') as f:
       data = json.load(f)
-    
+
     deserialized_data = []
     for row in data['data']:
       deserialized_row = [cls._deserialize_cell(cell) for cell in row]
       deserialized_data.append(deserialized_row)
-    
+
     return pd.DataFrame(
       data = deserialized_data,
       columns = data['columns'],

diff --git a/glycowork/glycan_data/stats.py b/glycowork/glycan_data/stats.py
@@ -6,7 +6,7 @@
 from sklearn.linear_model import Ridge
 from sklearn.ensemble import RandomForestRegressor
 from scipy.special import gammaln
-from scipy.stats import wilcoxon, rankdata, norm, chi2, t, f, entropy, gmean, f_oneway, combine_pvalues, dirichlet
+from scipy.stats import wilcoxon, rankdata, norm, chi2, t, f, entropy, gmean, f_oneway, combine_pvalues, dirichlet, spearmanr, ttest_rel
 from scipy.stats.mstats import winsorize
 from scipy.spatial import procrustes
 from scipy.spatial.distance import squareform
@@ -1190,7 +1190,7 @@ def perform_tests_monte_carlo(group_a, group_b, num_instances = 128, paired = Fa
   | (i) list of uncorrected p-values
   | (ii) list of corrected p-values (two-stage Benjamini-Hochberg)
   | (ii) list of effect sizes (Cohen's d)"""
-  num_features, num_columns = group_a.shape
+  num_features, _ = group_a.shape
   avg_uncorrected_p_values, avg_corrected_p_values, avg_effect_sizes = np.zeros(num_features), np.zeros(num_features), np.zeros(num_features)
   for instance in range(num_instances):
     instance_p_values = []
@@ -1199,7 +1199,7 @@ def perform_tests_monte_carlo(group_a, group_b, num_instances = 128, paired = Fa
       sample_a = group_a.iloc[feature, instance::num_instances].values
       sample_b = group_b.iloc[feature, instance::num_instances].values
       p_value = ttest_rel(sample_b, sample_a)[1] if paired else ttest_ind(sample_b, sample_a, equal_var = False)[1]
-      effect_size, effect_size_variance = cohen_d(sample_b, sample_a, paired = paired)
+      effect_size, _ = cohen_d(sample_b, sample_a, paired = paired)
       instance_p_values.append(p_value)
       instance_effect_sizes.append(effect_size)
     # Apply Benjamini-Hochberg correction for multiple testing within the instance

diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -1488,7 +1488,7 @@ def get_glycoshift_per_site(df, group1, group2, paired = False, impute = True,
   | (i) Regression coefficient from the GLM (indicating direction of change in the treatment condition)
   | (ii) Corrected p-values (two-tailed t-test with two-stage Benjamini-Hochberg correction) for testing the coefficient against zero
   | (iii) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold"""
-  df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = False, impute = impute,
+  df, _, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = False, impute = impute,
                                                min_samples = min_samples, transform = "Nothing", paired = paired)
   alpha = get_alphaN(len(group1 + group2))
   df, glycan_features = process_for_glycoshift(df)

diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py
@@ -214,6 +214,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
       from glycowork.glycan_data.loader import df_species
       feasibles = set(df_species[df_species.Class == "Mammalia"].glycan.values.tolist())
       partial_annotate_topology_uncertainty = partial(annotate_glycan_topology_uncertainty, feasibles = feasibles, motifs = motifs, termini_list = termini_list, gmotifs = gmotifs)
+    else:
+      partial_annotate_topology_uncertainty = partial(annotate_glycan, motifs = motifs, termini_list = termini_list, gmotifs = gmotifs)
     def annotate_switchboard(glycan):
       return partial_annotate_topology_uncertainty(glycan) if glycan.count('{') == 1 else partial_annotate(glycan)
     shopping_cart.append(pd.concat(list(map(annotate_switchboard, glycans)), axis = 0))

diff --git a/glycowork/motif/graph.py b/glycowork/motif/graph.py
@@ -1,7 +1,7 @@
 import re
 from copy import deepcopy
 from glycowork.glycan_data.loader import unwrap, modification_map
-from glycowork.motif.processing import min_process_glycans, canonicalize_iupac, bracket_removal, get_possible_linkages, get_possible_monosaccharides, rescue_glycans, choose_correct_isoform
+from glycowork.motif.processing import min_process_glycans, bracket_removal, get_possible_linkages, get_possible_monosaccharides, rescue_glycans, choose_correct_isoform
 import numpy as np
 import pandas as pd
 import networkx as nx
@@ -598,7 +598,7 @@ def get_possible_topologies(glycan, exhaustive = False, allowed_disaccharides =
     dangling_carbon = ggraph.nodes[dangling_linkage]['string_labels'][-1]
     floating_monosaccharide = dangling_linkage - 1
   topologies = []
-  candidate_nodes = [k for k in list(main_part.nodes())[::2] 
+  candidate_nodes = [k for k in list(main_part.nodes())[::2]
                      if exhaustive or (main_part.degree[k] == 1 and k != max(main_part.nodes()))]
   for k in candidate_nodes:
     neighbor_carbons = [ggraph.nodes[n]['string_labels'][-1] for n in ggraph.neighbors(k) if n < k]

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -299,7 +299,7 @@ def choose_correct_isoform(glycans, reverse = False):
     correct_isoform = glycans
   return correct_isoform
 
-  
+
 def enforce_class(glycan, glycan_class, conf = None, extra_thresh = 0.3):
   """given a glycan and glycan class, determines whether glycan is from this class\n
   | Arguments:

diff --git a/glycowork/motif/tokenization.py b/glycowork/motif/tokenization.py
@@ -272,7 +272,7 @@ def mz_to_composition(mz_value, mode = 'negative', mass_value = 'monoisotopic',
   if filter_out is None:
     filter_out = set()
   if adduct:
-    mz -= calculate_adduct_mass(adduct, mass_value)
+    mz_value -= calculate_adduct_mass(adduct, mass_value)
   adduct_mass = mass_dict['Acetate'] if mode == 'negative' else mass_dict['Na+']
   if reduced:
     mz_value -= 1.0078

diff --git a/glycowork/network/biosynthesis.py b/glycowork/network/biosynthesis.py
@@ -706,7 +706,7 @@ def plot_network(network, plot_format = 'pydot2', edge_label_draw = True,
     scatter = nx.draw_networkx_nodes(network, pos, node_size = node_sizes, alpha = 0.7,
                                      node_color = color_map, ax = ax)
   edge_attributes = nx.get_edge_attributes(network, 'diffs')
-  if edge_label_draw: 
+  if edge_label_draw:
     if lfc_dict:
       # Map log-fold changes of responsible enzymes onto biosynthetic steps
       c_list = ['red' if lfc_dict.get(attr, 0) < 0 else 'green' if lfc_dict.get(attr, 0) > 0 else 'cornflowerblue' for attr in edge_attributes.values()]
@@ -1001,7 +1001,7 @@ def prune_network(network, node_attr = 'abundance', threshold = 0.):
   network_out.remove_nodes_from(to_cut)
   # Remove virtual nodes with total degree of max 1 and an in-degree of at least 1
   nodes_to_remove = [node for node, attr in network_out.nodes(data = True)
-                       if (network_out.degree[node] <= 1) and (attr.get('virtual') == 1) 
+                       if (network_out.degree[node] <= 1) and (attr.get('virtual') == 1)
                        and (network_out.in_degree[node] > 0)]
   network_out.remove_nodes_from(nodes_to_remove)
   return network_out
@@ -1244,7 +1244,7 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
   | :-
   | For binary comparison: A dataframe with differential flow features and statistics
   | For longitudinal analysis: A dataframe with reaction changes over time"""
-  
+
   if longitudinal:
     assert id_column is not None, "id_column must be specified for longitudinal analysis"
     assert group2 is None, "group2 should not be specified for longitudinal analysis"
@@ -1254,18 +1254,18 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
     assert group2 is not None, "group2 must be specified for binary comparison"
     if paired:
       assert len(group1) == len(group2), "For paired samples, the size of group1 and group2 should be the same"
-  
+
   # Handle input data
   if isinstance(df, str):
     df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
-  
+
   if not longitudinal and not isinstance(group1[0], str):
     columns_list = df.columns.tolist()
     group1 = [columns_list[k] for k in group1]
     group2 = [columns_list[k] for k in group2]
-  
+
   all_groups = group1 + (group2 or [])
-  
+
   # Prepare data for analysis
   if longitudinal:
     df['participant'] = df[id_column].apply(lambda x: x.split('_')[0])
@@ -1276,11 +1276,11 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
     df_analysis = df[df['time_point'].isin(time_points)].copy()
   else:
     glycan_columns = all_groups
-  
+
   df_analysis = df_analysis if longitudinal else df.set_index(df.columns.tolist()[0])
   df_analysis = df_analysis.loc[:, glycan_columns].fillna(0)
   df_analysis = (df_analysis / df_analysis.sum(axis = 1).values[:, None]) * 100
-  
+
   if not longitudinal:
     df_analysis = df_analysis[df_analysis.any(axis = 1)]
   else:
@@ -1291,17 +1291,17 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
   root = max(root, key = len) if '-ol' not in root[0] else min(root, key = len)
   min_default = 0.1 if root.endswith('GlcNAc') else 0.001
   core_net = construct_network(df_analysis.index.tolist() if not longitudinal else glycan_columns)
-  
-  nets = {}
+
+  nets, features = {}, []
   for col in df_analysis.columns:
     temp = deepcopy(core_net)
     abundance_mapping = dict(zip(df_analysis.index.tolist() if not longitudinal else glycan_columns,
                                  df_analysis[col].values.tolist()))
     nx.set_node_attributes(temp, {g: {'abundance': abundance_mapping.get(g, 0.0)} for g in temp.nodes()})
     nets[col] = estimate_weights(temp, root = root, min_default = min_default)
-  
+
   res = {col: get_maximum_flow(nets[col], source = root) for col in nets}
-  
+
   # Perform reaction or flow analysis
   if analysis == "reaction":
     res2 = {col: get_reaction_flow(nets[col], res[col], aggregate = "sum") for col in nets}
@@ -1316,20 +1316,20 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
     raise ValueError("Only 'reaction' and 'flow' are currently supported analysis modes.")
 
   res2 = pd.DataFrame(res2).T
-  
+
   if analysis == "reaction" and not longitudinal:
     res2 = res2.loc[:, res2.var(axis = 0) > 0.01]
   elif analysis == "flow" and not longitudinal:
     res2.columns = features
-  
+
   features = res2.columns.tolist()
-  
+
   # Perform statistical analysis
   if longitudinal:
     res_df = res2.reset_index()
     res_df = res_df.rename(columns = {'index': id_column})
     res_df = pd.merge(res_df, df[['participant', 'time_point']], on = id_column)
-    
+
     results = []
     for reaction in features:
       reaction_data = res_df[[id_column, 'participant', 'time_point', reaction]]
@@ -1339,21 +1339,21 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
       slopes = reaction_data.groupby('participant').apply(lambda x: np.polyfit(x['time_numeric'], x[reaction], 1)[0], include_groups = False)
       average_slope = slopes.mean()
       direction = "Increase" if average_slope > 0 else "Decrease"
-      
+
       model = ols('Q("{0}") ~ C(time_point) + C(participant)'.format(reaction), data = reaction_data).fit()
       anova_table = sm.stats.anova_lm(model, typ = 2)
-      
+
       f_value = anova_table.loc['C(time_point)', 'F']
       p_value = anova_table.loc['C(time_point)', 'PR(>F)']
-      
+
       results.append({
           'Feature': reaction,
           'F-statistic': f_value,
           'p-val': p_value,
           'Direction': direction,
           'Average Slope': average_slope
         })
-    
+
     out = pd.DataFrame(results)
     out['corr p-val'] = multipletests(out['p-val'], method = 'fdr_bh')[1]
     out['significant'] = out['corr p-val'] < 0.05
@@ -1367,10 +1367,10 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
     alpha = get_alphaN(len(all_groups))
     significance = [p < alpha for p in corrpvals] if pvals else []
     effect_sizes, _ = zip(*[cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)])
-    
+
     out = pd.DataFrame({'Feature': features, 'Mean abundance': mean_abundance, 'Log2FC': log2fc, 'p-val': pvals,
                         'corr p-val': corrpvals, 'significant': significance, 'Effect size': effect_sizes})
-  
+
   out = out.set_index('Feature')
   return out.dropna().sort_values(by = 'p-val')