Skip to content

Commit

Permalink
last minor fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Nov 9, 2024
1 parent a6905dc commit d5780bd
Show file tree
Hide file tree
Showing 8 changed files with 43 additions and 41 deletions.
20 changes: 10 additions & 10 deletions glycowork/glycan_data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from os import path
from itertools import chain
from importlib import resources
from typing import Any, Dict, List, Union
from typing import Any, Dict, Union

Check notice on line 9 in glycowork/glycan_data/loader.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

glycowork/glycan_data/loader.py#L9

'typing.Union' imported but unused (F401)

Check warning on line 9 in glycowork/glycan_data/loader.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

glycowork/glycan_data/loader.py#L9

Unused Union imported from typing

with resources.open_text("glycowork.glycan_data", "glycan_motifs.csv") as f:
motif_list = pd.read_csv(f)
Expand Down Expand Up @@ -212,7 +212,7 @@ def replace_every_second(string, old_char, new_char):
| old_char (string): a string character to be replaced (every second occurrence)
| new_char (string): the string character to replace old_char with\n
| Returns:
| :-
| :-
| Returns string with replaced characters
"""
count = 0
Expand Down Expand Up @@ -285,7 +285,7 @@ def download_model(file_id, local_path = 'model_weights.pt'):
class DataFrameSerializer:
"""A utility class for serializing and deserializing pandas DataFrames with complex data types
in a version-independent manner."""

@staticmethod
def _serialize_cell(value: Any) -> Dict[str, Any]:
"""Convert a cell value to a serializable format with type information."""
Expand Down Expand Up @@ -334,7 +334,7 @@ def _deserialize_cell(cell_data: Dict[str, Any]) -> Any:
@classmethod
def serialize(cls, df: pd.DataFrame, path: str) -> None:
"""Serialize a DataFrame to JSON with type information.
Args:
df: pandas DataFrame to serialize
path: file path to save the serialized data"""
Expand All @@ -343,31 +343,31 @@ def serialize(cls, df: pd.DataFrame, path: str) -> None:
'index': list(df.index),
'data': []
}

for _, row in df.iterrows():
serialized_row = [cls._serialize_cell(val) for val in row]
data['data'].append(serialized_row)

with open(path, 'w') as f:
json.dump(data, f)

@classmethod
def deserialize(cls, path: str) -> pd.DataFrame:
"""Deserialize a DataFrame from JSON.
Args:
path: file path to load the serialized data from
Returns:
pandas DataFrame with restored data types"""
with open(path, 'r') as f:
data = json.load(f)

deserialized_data = []
for row in data['data']:
deserialized_row = [cls._deserialize_cell(cell) for cell in row]
deserialized_data.append(deserialized_row)

return pd.DataFrame(
data = deserialized_data,
columns = data['columns'],
Expand Down
6 changes: 3 additions & 3 deletions glycowork/glycan_data/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from scipy.special import gammaln
from scipy.stats import wilcoxon, rankdata, norm, chi2, t, f, entropy, gmean, f_oneway, combine_pvalues, dirichlet
from scipy.stats import wilcoxon, rankdata, norm, chi2, t, f, entropy, gmean, f_oneway, combine_pvalues, dirichlet, spearmanr, ttest_rel
from scipy.stats.mstats import winsorize
from scipy.spatial import procrustes
from scipy.spatial.distance import squareform
Expand Down Expand Up @@ -1190,7 +1190,7 @@ def perform_tests_monte_carlo(group_a, group_b, num_instances = 128, paired = Fa
| (i) list of uncorrected p-values
| (ii) list of corrected p-values (two-stage Benjamini-Hochberg)
| (ii) list of effect sizes (Cohen's d)"""
num_features, num_columns = group_a.shape
num_features, _ = group_a.shape
avg_uncorrected_p_values, avg_corrected_p_values, avg_effect_sizes = np.zeros(num_features), np.zeros(num_features), np.zeros(num_features)
for instance in range(num_instances):
instance_p_values = []
Expand All @@ -1199,7 +1199,7 @@ def perform_tests_monte_carlo(group_a, group_b, num_instances = 128, paired = Fa
sample_a = group_a.iloc[feature, instance::num_instances].values
sample_b = group_b.iloc[feature, instance::num_instances].values
p_value = ttest_rel(sample_b, sample_a)[1] if paired else ttest_ind(sample_b, sample_a, equal_var = False)[1]

Check notice on line 1201 in glycowork/glycan_data/stats.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

glycowork/glycan_data/stats.py#L1201

undefined name 'ttest_rel' (F821)
effect_size, effect_size_variance = cohen_d(sample_b, sample_a, paired = paired)
effect_size, _ = cohen_d(sample_b, sample_a, paired = paired)
instance_p_values.append(p_value)
instance_effect_sizes.append(effect_size)
# Apply Benjamini-Hochberg correction for multiple testing within the instance
Expand Down
2 changes: 1 addition & 1 deletion glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1488,7 +1488,7 @@ def get_glycoshift_per_site(df, group1, group2, paired = False, impute = True,
| (i) Regression coefficient from the GLM (indicating direction of change in the treatment condition)
| (ii) Corrected p-values (two-tailed t-test with two-stage Benjamini-Hochberg correction) for testing the coefficient against zero
| (iii) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold"""
df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = False, impute = impute,
df, _, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = False, impute = impute,
min_samples = min_samples, transform = "Nothing", paired = paired)
alpha = get_alphaN(len(group1 + group2))
df, glycan_features = process_for_glycoshift(df)
Expand Down
2 changes: 2 additions & 0 deletions glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
from glycowork.glycan_data.loader import df_species
feasibles = set(df_species[df_species.Class == "Mammalia"].glycan.values.tolist())
partial_annotate_topology_uncertainty = partial(annotate_glycan_topology_uncertainty, feasibles = feasibles, motifs = motifs, termini_list = termini_list, gmotifs = gmotifs)
else:
partial_annotate_topology_uncertainty = partial(annotate_glycan, motifs = motifs, termini_list = termini_list, gmotifs = gmotifs)
def annotate_switchboard(glycan):
return partial_annotate_topology_uncertainty(glycan) if glycan.count('{') == 1 else partial_annotate(glycan)
shopping_cart.append(pd.concat(list(map(annotate_switchboard, glycans)), axis = 0))
Expand Down
4 changes: 2 additions & 2 deletions glycowork/motif/graph.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
from copy import deepcopy
from glycowork.glycan_data.loader import unwrap, modification_map
from glycowork.motif.processing import min_process_glycans, canonicalize_iupac, bracket_removal, get_possible_linkages, get_possible_monosaccharides, rescue_glycans, choose_correct_isoform
from glycowork.motif.processing import min_process_glycans, bracket_removal, get_possible_linkages, get_possible_monosaccharides, rescue_glycans, choose_correct_isoform
import numpy as np
import pandas as pd
import networkx as nx
Expand Down Expand Up @@ -598,7 +598,7 @@ def get_possible_topologies(glycan, exhaustive = False, allowed_disaccharides =
dangling_carbon = ggraph.nodes[dangling_linkage]['string_labels'][-1]
floating_monosaccharide = dangling_linkage - 1
topologies = []
candidate_nodes = [k for k in list(main_part.nodes())[::2]
candidate_nodes = [k for k in list(main_part.nodes())[::2]
if exhaustive or (main_part.degree[k] == 1 and k != max(main_part.nodes()))]
for k in candidate_nodes:
neighbor_carbons = [ggraph.nodes[n]['string_labels'][-1] for n in ggraph.neighbors(k) if n < k]
Expand Down
2 changes: 1 addition & 1 deletion glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def choose_correct_isoform(glycans, reverse = False):
correct_isoform = glycans
return correct_isoform


def enforce_class(glycan, glycan_class, conf = None, extra_thresh = 0.3):
"""given a glycan and glycan class, determines whether glycan is from this class\n
| Arguments:
Expand Down
2 changes: 1 addition & 1 deletion glycowork/motif/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def mz_to_composition(mz_value, mode = 'negative', mass_value = 'monoisotopic',
if filter_out is None:
filter_out = set()
if adduct:
mz -= calculate_adduct_mass(adduct, mass_value)
mz_value -= calculate_adduct_mass(adduct, mass_value)
adduct_mass = mass_dict['Acetate'] if mode == 'negative' else mass_dict['Na+']
if reduced:
mz_value -= 1.0078
Expand Down
46 changes: 23 additions & 23 deletions glycowork/network/biosynthesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@ def plot_network(network, plot_format = 'pydot2', edge_label_draw = True,
scatter = nx.draw_networkx_nodes(network, pos, node_size = node_sizes, alpha = 0.7,
node_color = color_map, ax = ax)
edge_attributes = nx.get_edge_attributes(network, 'diffs')
if edge_label_draw:
if edge_label_draw:
if lfc_dict:
# Map log-fold changes of responsible enzymes onto biosynthetic steps
c_list = ['red' if lfc_dict.get(attr, 0) < 0 else 'green' if lfc_dict.get(attr, 0) > 0 else 'cornflowerblue' for attr in edge_attributes.values()]
Expand Down Expand Up @@ -1001,7 +1001,7 @@ def prune_network(network, node_attr = 'abundance', threshold = 0.):
network_out.remove_nodes_from(to_cut)
# Remove virtual nodes with total degree of max 1 and an in-degree of at least 1
nodes_to_remove = [node for node, attr in network_out.nodes(data = True)
if (network_out.degree[node] <= 1) and (attr.get('virtual') == 1)
if (network_out.degree[node] <= 1) and (attr.get('virtual') == 1)
and (network_out.in_degree[node] > 0)]
network_out.remove_nodes_from(nodes_to_remove)
return network_out
Expand Down Expand Up @@ -1244,7 +1244,7 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
| :-
| For binary comparison: A dataframe with differential flow features and statistics
| For longitudinal analysis: A dataframe with reaction changes over time"""

if longitudinal:
assert id_column is not None, "id_column must be specified for longitudinal analysis"
assert group2 is None, "group2 should not be specified for longitudinal analysis"
Expand All @@ -1254,18 +1254,18 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
assert group2 is not None, "group2 must be specified for binary comparison"
if paired:
assert len(group1) == len(group2), "For paired samples, the size of group1 and group2 should be the same"

# Handle input data
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)

if not longitudinal and not isinstance(group1[0], str):
columns_list = df.columns.tolist()
group1 = [columns_list[k] for k in group1]
group2 = [columns_list[k] for k in group2]

all_groups = group1 + (group2 or [])

# Prepare data for analysis
if longitudinal:
df['participant'] = df[id_column].apply(lambda x: x.split('_')[0])
Expand All @@ -1276,11 +1276,11 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
df_analysis = df[df['time_point'].isin(time_points)].copy()
else:
glycan_columns = all_groups

df_analysis = df_analysis if longitudinal else df.set_index(df.columns.tolist()[0])
df_analysis = df_analysis.loc[:, glycan_columns].fillna(0)
df_analysis = (df_analysis / df_analysis.sum(axis = 1).values[:, None]) * 100

if not longitudinal:
df_analysis = df_analysis[df_analysis.any(axis = 1)]
else:
Expand All @@ -1291,17 +1291,17 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
root = max(root, key = len) if '-ol' not in root[0] else min(root, key = len)
min_default = 0.1 if root.endswith('GlcNAc') else 0.001
core_net = construct_network(df_analysis.index.tolist() if not longitudinal else glycan_columns)
nets = {}

nets, features = {}, []
for col in df_analysis.columns:
temp = deepcopy(core_net)
abundance_mapping = dict(zip(df_analysis.index.tolist() if not longitudinal else glycan_columns,
df_analysis[col].values.tolist()))
nx.set_node_attributes(temp, {g: {'abundance': abundance_mapping.get(g, 0.0)} for g in temp.nodes()})
nets[col] = estimate_weights(temp, root = root, min_default = min_default)

res = {col: get_maximum_flow(nets[col], source = root) for col in nets}

# Perform reaction or flow analysis
if analysis == "reaction":
res2 = {col: get_reaction_flow(nets[col], res[col], aggregate = "sum") for col in nets}
Expand All @@ -1316,20 +1316,20 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
raise ValueError("Only 'reaction' and 'flow' are currently supported analysis modes.")

res2 = pd.DataFrame(res2).T

if analysis == "reaction" and not longitudinal:
res2 = res2.loc[:, res2.var(axis = 0) > 0.01]
elif analysis == "flow" and not longitudinal:
res2.columns = features

features = res2.columns.tolist()

# Perform statistical analysis
if longitudinal:
res_df = res2.reset_index()
res_df = res_df.rename(columns = {'index': id_column})
res_df = pd.merge(res_df, df[['participant', 'time_point']], on = id_column)

results = []
for reaction in features:
reaction_data = res_df[[id_column, 'participant', 'time_point', reaction]]
Expand All @@ -1339,21 +1339,21 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
slopes = reaction_data.groupby('participant').apply(lambda x: np.polyfit(x['time_numeric'], x[reaction], 1)[0], include_groups = False)
average_slope = slopes.mean()
direction = "Increase" if average_slope > 0 else "Decrease"

model = ols('Q("{0}") ~ C(time_point) + C(participant)'.format(reaction), data = reaction_data).fit()
anova_table = sm.stats.anova_lm(model, typ = 2)

f_value = anova_table.loc['C(time_point)', 'F']
p_value = anova_table.loc['C(time_point)', 'PR(>F)']

results.append({
'Feature': reaction,
'F-statistic': f_value,
'p-val': p_value,
'Direction': direction,
'Average Slope': average_slope
})

out = pd.DataFrame(results)
out['corr p-val'] = multipletests(out['p-val'], method = 'fdr_bh')[1]
out['significant'] = out['corr p-val'] < 0.05
Expand All @@ -1367,10 +1367,10 @@ def get_differential_biosynthesis(df, group1, group2 = None, analysis = "reactio
alpha = get_alphaN(len(all_groups))
significance = [p < alpha for p in corrpvals] if pvals else []
effect_sizes, _ = zip(*[cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)])

out = pd.DataFrame({'Feature': features, 'Mean abundance': mean_abundance, 'Log2FC': log2fc, 'p-val': pvals,
'corr p-val': corrpvals, 'significant': significance, 'Effect size': effect_sizes})

out = out.set_index('Feature')
return out.dropna().sort_values(by = 'p-val')

Expand Down

0 comments on commit d5780bd

Please sign in to comment.