diff --git a/glycowork/glycan_data/loader.py b/glycowork/glycan_data/loader.py index a8d0ea1d..63919a03 100644 --- a/glycowork/glycan_data/loader.py +++ b/glycowork/glycan_data/loader.py @@ -6,7 +6,7 @@ from os import path from itertools import chain from importlib import resources -from typing import Any, Dict, List, Union +from typing import Any, Dict, Union with resources.open_text("glycowork.glycan_data", "glycan_motifs.csv") as f: motif_list = pd.read_csv(f) @@ -212,7 +212,7 @@ def replace_every_second(string, old_char, new_char): | old_char (string): a string character to be replaced (every second occurrence) | new_char (string): the string character to replace old_char with\n | Returns: - | :- + | :- | Returns string with replaced characters """ count = 0 @@ -285,7 +285,7 @@ def download_model(file_id, local_path = 'model_weights.pt'): class DataFrameSerializer: """A utility class for serializing and deserializing pandas DataFrames with complex data types in a version-independent manner.""" - + @staticmethod def _serialize_cell(value: Any) -> Dict[str, Any]: """Convert a cell value to a serializable format with type information.""" @@ -334,7 +334,7 @@ def _deserialize_cell(cell_data: Dict[str, Any]) -> Any: @classmethod def serialize(cls, df: pd.DataFrame, path: str) -> None: """Serialize a DataFrame to JSON with type information. - + Args: df: pandas DataFrame to serialize path: file path to save the serialized data""" @@ -343,31 +343,31 @@ def serialize(cls, df: pd.DataFrame, path: str) -> None: 'index': list(df.index), 'data': [] } - + for _, row in df.iterrows(): serialized_row = [cls._serialize_cell(val) for val in row] data['data'].append(serialized_row) - + with open(path, 'w') as f: json.dump(data, f) @classmethod def deserialize(cls, path: str) -> pd.DataFrame: """Deserialize a DataFrame from JSON. - + Args: path: file path to load the serialized data from - + Returns: pandas DataFrame with restored data types""" with open(path, 'r') as f: data = json.load(f) - + deserialized_data = [] for row in data['data']: deserialized_row = [cls._deserialize_cell(cell) for cell in row] deserialized_data.append(deserialized_row) - + return pd.DataFrame( data = deserialized_data, columns = data['columns'], diff --git a/glycowork/glycan_data/stats.py b/glycowork/glycan_data/stats.py index 0276fceb..4d509b2f 100644 --- a/glycowork/glycan_data/stats.py +++ b/glycowork/glycan_data/stats.py @@ -6,7 +6,7 @@ from sklearn.linear_model import Ridge from sklearn.ensemble import RandomForestRegressor from scipy.special import gammaln -from scipy.stats import wilcoxon, rankdata, norm, chi2, t, f, entropy, gmean, f_oneway, combine_pvalues, dirichlet +from scipy.stats import wilcoxon, rankdata, norm, chi2, t, f, entropy, gmean, f_oneway, combine_pvalues, dirichlet, spearmanr, ttest_rel from scipy.stats.mstats import winsorize from scipy.spatial import procrustes from scipy.spatial.distance import squareform @@ -1190,7 +1190,7 @@ def perform_tests_monte_carlo(group_a, group_b, num_instances = 128, paired = Fa | (i) list of uncorrected p-values | (ii) list of corrected p-values (two-stage Benjamini-Hochberg) | (ii) list of effect sizes (Cohen's d)""" - num_features, num_columns = group_a.shape + num_features, _ = group_a.shape avg_uncorrected_p_values, avg_corrected_p_values, avg_effect_sizes = np.zeros(num_features), np.zeros(num_features), np.zeros(num_features) for instance in range(num_instances): instance_p_values = [] @@ -1199,7 +1199,7 @@ def perform_tests_monte_carlo(group_a, group_b, num_instances = 128, paired = Fa sample_a = group_a.iloc[feature, instance::num_instances].values sample_b = group_b.iloc[feature, instance::num_instances].values p_value = ttest_rel(sample_b, sample_a)[1] if paired else ttest_ind(sample_b, sample_a, equal_var = False)[1] - effect_size, effect_size_variance = cohen_d(sample_b, sample_a, paired = paired) + effect_size, _ = cohen_d(sample_b, sample_a, paired = paired) instance_p_values.append(p_value) instance_effect_sizes.append(effect_size) # Apply Benjamini-Hochberg correction for multiple testing within the instance diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py index 62c6f1a7..ee9c4930 100644 --- a/glycowork/motif/analysis.py +++ b/glycowork/motif/analysis.py @@ -1488,7 +1488,7 @@ def get_glycoshift_per_site(df, group1, group2, paired = False, impute = True, | (i) Regression coefficient from the GLM (indicating direction of change in the treatment condition) | (ii) Corrected p-values (two-tailed t-test with two-stage Benjamini-Hochberg correction) for testing the coefficient against zero | (iii) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold""" - df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = False, impute = impute, + df, _, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = False, impute = impute, min_samples = min_samples, transform = "Nothing", paired = paired) alpha = get_alphaN(len(group1 + group2)) df, glycan_features = process_for_glycoshift(df) diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py index 6997a6f8..f7c7197c 100644 --- a/glycowork/motif/annotate.py +++ b/glycowork/motif/annotate.py @@ -214,6 +214,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'], from glycowork.glycan_data.loader import df_species feasibles = set(df_species[df_species.Class == "Mammalia"].glycan.values.tolist()) partial_annotate_topology_uncertainty = partial(annotate_glycan_topology_uncertainty, feasibles = feasibles, motifs = motifs, termini_list = termini_list, gmotifs = gmotifs) + else: + partial_annotate_topology_uncertainty = partial(annotate_glycan, motifs = motifs, termini_list = termini_list, gmotifs = gmotifs) def annotate_switchboard(glycan): return partial_annotate_topology_uncertainty(glycan) if glycan.count('{') == 1 else partial_annotate(glycan) shopping_cart.append(pd.concat(list(map(annotate_switchboard, glycans)), axis = 0)) diff --git a/glycowork/motif/graph.py b/glycowork/motif/graph.py index c71c1093..5d5da98a 100644 --- a/glycowork/motif/graph.py +++ b/glycowork/motif/graph.py @@ -1,7 +1,7 @@ import re from copy import deepcopy from glycowork.glycan_data.loader import unwrap, modification_map -from glycowork.motif.processing import min_process_glycans, canonicalize_iupac, bracket_removal, get_possible_linkages, get_possible_monosaccharides, rescue_glycans, choose_correct_isoform +from glycowork.motif.processing import min_process_glycans, bracket_removal, get_possible_linkages, get_possible_monosaccharides, rescue_glycans, choose_correct_isoform import numpy as np import pandas as pd import networkx as nx @@ -598,7 +598,7 @@ def get_possible_topologies(glycan, exhaustive = False, allowed_disaccharides = dangling_carbon = ggraph.nodes[dangling_linkage]['string_labels'][-1] floating_monosaccharide = dangling_linkage - 1 topologies = [] - candidate_nodes = [k for k in list(main_part.nodes())[::2] + candidate_nodes = [k for k in list(main_part.nodes())[::2] if exhaustive or (main_part.degree[k] == 1 and k != max(main_part.nodes()))] for k in candidate_nodes: neighbor_carbons = [ggraph.nodes[n]['string_labels'][-1] for n in ggraph.neighbors(k) if n < k] diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py index 436632db..b1b3a2fe 100644 --- a/glycowork/motif/processing.py +++ b/glycowork/motif/processing.py @@ -299,7 +299,7 @@ def choose_correct_isoform(glycans, reverse = False): correct_isoform = glycans return correct_isoform - + def enforce_class(glycan, glycan_class, conf = None, extra_thresh = 0.3): """given a glycan and glycan class, determines whether glycan is from this class\n | Arguments: diff --git a/glycowork/motif/tokenization.py b/glycowork/motif/tokenization.py index 92afd768..f5afd716 100644 --- a/glycowork/motif/tokenization.py +++ b/glycowork/motif/tokenization.py @@ -272,7 +272,7 @@ def mz_to_composition(mz_value, mode = 'negative', mass_value = 'monoisotopic', if filter_out is None: filter_out = set() if adduct: - mz -= calculate_adduct_mass(adduct, mass_value) + mz_value -= calculate_adduct_mass(adduct, mass_value) adduct_mass = mass_dict['Acetate'] if mode == 'negative' else mass_dict['Na+'] if reduced: mz_value -= 1.0078 diff --git a/glycowork/network/biosynthesis.py b/glycowork/network/biosynthesis.py index 0632d8ad..714a2753 100644 --- a/glycowork/network/biosynthesis.py +++ b/glycowork/network/biosynthesis.py @@ -706,7 +706,7 @@ def plot_network(network, plot_format = 'pydot2', edge_label_draw = True, scatter = nx.draw_networkx_nodes(network, pos, node_size = node_sizes, alpha = 0.7, node_color = color_map, ax = ax) edge_attributes = nx.get_edge_attributes(network, 'diffs') - if edge_label_draw: + if edge_label_draw: if lfc_dict: # Map log-fold changes of responsible enzymes onto biosynthetic steps c_list = ['red' if lfc_dict.get(attr, 0) < 0 else 'green' if lfc_dict.get(attr, 0) > 0 else 'cornflowerblue' for attr in edge_attributes.values()] @@ -1001,7 +1001,7 @@ def prune_network(network, node_attr = 'abundance', threshold = 0.): network_out.remove_nodes_from(to_cut) # Remove virtual nodes with total degree of max 1 and an in-degree of at least 1 nodes_to_remove = [node for node, attr in network_out.nodes(data = True) - if (network_out.degree[node] <= 1) and (attr.get('virtual') == 1) + if (network_out.degree[node] <= 1) and (attr.get('virtual') == 1) and (network_out.in_degree[node] > 0)] network_out.remove_nodes_from(nodes_to_remove) return network_out @@ -1244,7 +1244,7 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio | :- | For binary comparison: A dataframe with differential flow features and statistics | For longitudinal analysis: A dataframe with reaction changes over time""" - + if longitudinal: assert id_column is not None, "id_column must be specified for longitudinal analysis" assert group2 is None, "group2 should not be specified for longitudinal analysis" @@ -1254,18 +1254,18 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio assert group2 is not None, "group2 must be specified for binary comparison" if paired: assert len(group1) == len(group2), "For paired samples, the size of group1 and group2 should be the same" - + # Handle input data if isinstance(df, str): df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df) - + if not longitudinal and not isinstance(group1[0], str): columns_list = df.columns.tolist() group1 = [columns_list[k] for k in group1] group2 = [columns_list[k] for k in group2] - + all_groups = group1 + (group2 or []) - + # Prepare data for analysis if longitudinal: df['participant'] = df[id_column].apply(lambda x: x.split('_')[0]) @@ -1276,11 +1276,11 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio df_analysis = df[df['time_point'].isin(time_points)].copy() else: glycan_columns = all_groups - + df_analysis = df_analysis if longitudinal else df.set_index(df.columns.tolist()[0]) df_analysis = df_analysis.loc[:, glycan_columns].fillna(0) df_analysis = (df_analysis / df_analysis.sum(axis = 1).values[:, None]) * 100 - + if not longitudinal: df_analysis = df_analysis[df_analysis.any(axis = 1)] else: @@ -1291,17 +1291,17 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio root = max(root, key = len) if '-ol' not in root[0] else min(root, key = len) min_default = 0.1 if root.endswith('GlcNAc') else 0.001 core_net = construct_network(df_analysis.index.tolist() if not longitudinal else glycan_columns) - - nets = {} + + nets, features = {}, [] for col in df_analysis.columns: temp = deepcopy(core_net) abundance_mapping = dict(zip(df_analysis.index.tolist() if not longitudinal else glycan_columns, df_analysis[col].values.tolist())) nx.set_node_attributes(temp, {g: {'abundance': abundance_mapping.get(g, 0.0)} for g in temp.nodes()}) nets[col] = estimate_weights(temp, root = root, min_default = min_default) - + res = {col: get_maximum_flow(nets[col], source = root) for col in nets} - + # Perform reaction or flow analysis if analysis == "reaction": res2 = {col: get_reaction_flow(nets[col], res[col], aggregate = "sum") for col in nets} @@ -1316,20 +1316,20 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio raise ValueError("Only 'reaction' and 'flow' are currently supported analysis modes.") res2 = pd.DataFrame(res2).T - + if analysis == "reaction" and not longitudinal: res2 = res2.loc[:, res2.var(axis = 0) > 0.01] elif analysis == "flow" and not longitudinal: res2.columns = features - + features = res2.columns.tolist() - + # Perform statistical analysis if longitudinal: res_df = res2.reset_index() res_df = res_df.rename(columns = {'index': id_column}) res_df = pd.merge(res_df, df[['participant', 'time_point']], on = id_column) - + results = [] for reaction in features: reaction_data = res_df[[id_column, 'participant', 'time_point', reaction]] @@ -1339,13 +1339,13 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio slopes = reaction_data.groupby('participant').apply(lambda x: np.polyfit(x['time_numeric'], x[reaction], 1)[0], include_groups = False) average_slope = slopes.mean() direction = "Increase" if average_slope > 0 else "Decrease" - + model = ols('Q("{0}") ~ C(time_point) + C(participant)'.format(reaction), data = reaction_data).fit() anova_table = sm.stats.anova_lm(model, typ = 2) - + f_value = anova_table.loc['C(time_point)', 'F'] p_value = anova_table.loc['C(time_point)', 'PR(>F)'] - + results.append({ 'Feature': reaction, 'F-statistic': f_value, @@ -1353,7 +1353,7 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio 'Direction': direction, 'Average Slope': average_slope }) - + out = pd.DataFrame(results) out['corr p-val'] = multipletests(out['p-val'], method = 'fdr_bh')[1] out['significant'] = out['corr p-val'] < 0.05 @@ -1367,10 +1367,10 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio alpha = get_alphaN(len(all_groups)) significance = [p < alpha for p in corrpvals] if pvals else [] effect_sizes, _ = zip(*[cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)]) - + out = pd.DataFrame({'Feature': features, 'Mean abundance': mean_abundance, 'Log2FC': log2fc, 'p-val': pvals, 'corr p-val': corrpvals, 'significant': significance, 'Effect size': effect_sizes}) - + out = out.set_index('Feature') return out.dropna().sort_values(by = 'p-val')