From 3d64c6c63824dc3a90410224e0e6c77c354a339c Mon Sep 17 00:00:00 2001 From: Daniel Bojar Date: Wed, 6 Mar 2024 09:39:54 +0100 Subject: [PATCH] spread outlier smoothing and add terminal3 --- build/lib/glycowork/motif/analysis.py | 73 +++++++++------------------ build/lib/glycowork/motif/annotate.py | 29 ++++++++--- glycowork/glycan_data/stats.py | 53 +++++++++++++++---- glycowork/motif/analysis.py | 73 +++++++++------------------ glycowork/motif/annotate.py | 29 ++++++++--- 5 files changed, 134 insertions(+), 123 deletions(-) diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py index 8d051614..2c696b86 100644 --- a/build/lib/glycowork/motif/analysis.py +++ b/build/lib/glycowork/motif/analysis.py @@ -8,7 +8,7 @@ import matplotlib.pyplot as plt plt.style.use('default') from collections import Counter -from scipy.stats import ttest_ind, ttest_rel, f, norm, levene, f_oneway +from scipy.stats import ttest_ind, ttest_rel, norm, levene, f_oneway from statsmodels.formula.api import ols from statsmodels.stats.multitest import multipletests from statsmodels.stats.multicomp import pairwise_tukeyhsd @@ -20,7 +20,7 @@ from glycowork.glycan_data.stats import (cohen_d, mahalanobis_distance, mahalanobis_variance, variance_stabilization, impute_and_normalize, variance_based_filtering, jtkdist, jtkinit, MissForest, jtkx, get_alphaN, TST_grouped_benjamini_hochberg, - test_inter_vs_intra_group, replace_outliers_with_median) + test_inter_vs_intra_group, replace_outliers_with_IQR_bounds, hotellings_t2) from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network, group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type) from glycowork.motif.graph import subgraph_isomorphism @@ -40,7 +40,8 @@ def get_pvals_motifs(df, glycan_col_name = 'glycan', label_col_name = 'target', | sorting (bool): whether p-value dataframe should be sorted ascendingly; default: True | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | multiple_samples (bool): set to True if you have multiple samples (rows) with glycan information (columns); default:False | motifs (dataframe): can be used to pass a modified motif_list to the function; default:None\n | Returns: @@ -53,6 +54,7 @@ def get_pvals_motifs(df, glycan_col_name = 'glycan', label_col_name = 'target', if multiple_samples: df = df.drop('target', axis = 1, errors = 'ignore').T.reset_index() df.columns = [glycan_col_name] + [label_col_name] * (len(df.columns) - 1) + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) if not zscores: means = df.iloc[:, 1:].mean() std_devs = df.iloc[:, 1:].std() @@ -156,7 +158,8 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'], | mode (string): whether glycan 'sequence' or 'motif' should be used for clustering; default:sequence | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | datatype (string): whether df comes from a dataset with quantitative variable ('response') or from presence_to_matrix ('presence') | rarity_filter (float): proportion of samples that need to have a non-zero value for a variable to be included; default:0.05 | filepath (string): absolute path including full filename allows for saving the plot @@ -363,42 +366,6 @@ def characterize_monosaccharide(sugar, df = None, mode = 'sugar', glycan_col_nam plt.show() -def hotellings_t2(group1, group2, paired = False): - """Hotelling's T^2 test (the t-test for multivariate comparisons)\n - """ - if paired: - assert group1.shape == group2.shape, "For paired samples, the size of group1 and group2 should be the same" - group1 -= group2 - group2 = None - # Calculate the means and covariances of each group - n1, p = group1.shape - mean1 = np.mean(group1, axis = 0) - cov1 = np.cov(group1, rowvar = False) - if group2 is not None: - n2, _ = group2.shape - mean2 = np.mean(group2, axis = 0) - cov2 = np.cov(group2, rowvar = False) - else: - n2 = 0 - mean2 = np.zeros_like(mean1) - cov2 = np.zeros_like(cov1) - # Calculate the difference between the means - diff = mean1 - mean2 - # Calculate the pooled covariance matrix - denom = (n1 + n2 - 2) - pooled_cov = cov1 if denom < 1 else ((n1 - 1) * cov1 + (n2 - 1) * cov2) / denom - pooled_cov += np.eye(p) * 1e-6 - # Calculate the Hotelling's T^2 statistic - T2 = (n1 * n2) / (n1 + n2) * diff @ np.linalg.pinv(pooled_cov) @ diff.T - # Convert the T^2 statistic to an F statistic - F = 0 if denom < 1 else T2 * (denom + 1 - p) / (denom * p) - if F == 0: - return F, 1.0 - # Calculate the p-value of the F statistic - p_value = f.sf(F, p, n1 + n2 - p - 1) - return F, p_value - - def get_coverage(df, filepath = ''): """ Plot glycan coverage across samples, ordered by average intensity\n | Arguments: @@ -435,7 +402,8 @@ def get_pca(df, groups = None, motifs = False, feature_set = ['known', 'exhausti | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | pc_x (int): principal component to plot on x axis; default:1 | pc_y (int): principal component to plot on y axis; default:2 | color (string): if design dataframe is provided: column name for color grouping; default:None @@ -529,9 +497,10 @@ def get_differential_expression(df, group1, group2, | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False - | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True + | impute (bool): replaces zeroes with a Random Forest based model; default:True | sets (bool): whether to identify clusters of highly correlated glycans/motifs to test for differential expression; default:False | set_thresh (float): correlation value used as a threshold for clusters; only used when sets=True; default:0.9 | effect_size_variance (bool): whether effect size variance should also be calculated/estimated; default:False @@ -559,7 +528,7 @@ def get_differential_expression(df, group1, group2, df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0) # Drop rows with all zero, followed by outlier removal and imputation & normalization df = df.loc[~(df == 0).all(axis = 1)] - df = df.apply(replace_outliers_with_median, axis = 1) + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) df = impute_and_normalize(df, [group1, group2], impute = impute, min_samples = min_samples) # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment alpha = get_alphaN(df.shape[1] - 1) @@ -732,11 +701,12 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh | :- | df (dataframe): dataframe containing glycan sequences in first column and relative abundances in subsequent columns [alternative: filepath to .csv or .xlsx] | group_sizes (list): a list of group identifiers for each sample (e.g., [1,1,1,2,2,2,3,3,3]) - | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True + | impute (bool): replaces zeroes with with a Random Forest based model; default:True | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group | posthoc (bool): whether to do Tukey's HSD test post-hoc to find out which differences were significant; default:True\n | Returns: @@ -748,6 +718,7 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df) results, posthoc_results = [], {} df.fillna(0, inplace = True) + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) groups_unq = sorted(set(groups)) df = impute_and_normalize(df, [[df.columns[i+1] for i, x in enumerate(groups) if x == g] for g in groups_unq], impute = impute, min_samples = min_samples) @@ -894,11 +865,12 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', ' | Arguments: | :- | df (dataframe): dataframe containing sample IDs of style sampleID_UnitTimepoint_replicate (e.g., T1_h5_r1) in first column and glycan relative abundances in subsequent columns [alternative: filepath to .csv or .xlsx] - | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True + | impute (bool): replaces zeroes with a Random Forest based model; default:True | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | degree (int): degree of the polynomial for regression, default:1 for linear regression | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group\n | Returns: @@ -914,6 +886,7 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', ' df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df) df.fillna(0, inplace = True) df = df.set_index(df.columns[0]).T + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) df = impute_and_normalize(df, [df.columns], impute = impute, min_samples = min_samples).reset_index() # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment alpha = get_alphaN(df.shape[1] - 1) @@ -951,7 +924,8 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set = | motifs (bool): a flag for running structural of motif-based analysis (True = run motif analysis); default:False. | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)\n + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan)\n | Returns: | :- | Returns a pandas dataframe containing the adjusted p-values, and most important waveform parameters for each @@ -967,6 +941,7 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set = "VAR": [], "EXV": [], "SDV": [], "CGOOSV": []} param_dic = jtkdist(timepoints, param_dic, replicates) param_dic = jtkinit(periods, param_dic, interval, replicates) + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) mf = MissForest() df.replace(0, np.nan, inplace = True) annot = df.pop(df.columns.tolist()[0]) diff --git a/build/lib/glycowork/motif/annotate.py b/build/lib/glycowork/motif/annotate.py index 5423467a..05231fb5 100644 --- a/build/lib/glycowork/motif/annotate.py +++ b/build/lib/glycowork/motif/annotate.py @@ -141,7 +141,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'], | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs of size 1), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible') | condense (bool): if True, throws away columns with only zeroes; default:False | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n @@ -197,6 +198,10 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'], bag_out = pd.concat([bag_out, shadow_bag], axis = 1).reset_index(drop = True) bag_out.index = glycans shopping_cart.append(bag_out) + if 'terminal3' in feature_set: + temp = get_k_saccharides(glycans, size = 3, terminal = True) + temp.index = glycans + shopping_cart.append(temp) if condense: # Remove motifs that never occur temp = pd.concat(shopping_cart, axis = 1) @@ -236,12 +241,13 @@ def quantify_motifs(df, glycans, feature_set): return df -def count_unique_subgraphs_of_size_k(graph, size = 2): +def count_unique_subgraphs_of_size_k(graph, size = 2, terminal = False): """function to count unique, connected subgraphs of size k (default:disaccharides) occurring in a glycan graph\n | Arguments: | :- | graph (networkx): glycan graph as returned by glycan_to_nxGraph - | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides)\n + | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides) + | terminal (bool): whether to only count terminal subgraphs; default:False\n | Returns: | :- | Returns dictionary of type k-saccharide:count @@ -253,6 +259,9 @@ def count_unique_subgraphs_of_size_k(graph, size = 2): for _, path_dict in paths: for path in path_dict.values(): if len(path) == size: + if terminal: + if graph.degree[min(path)] != 1: + continue if labels[path[0]][0] not in {'a', 'b', '?', '('}: path_sorted = sorted(path) path_str = "" @@ -274,27 +283,30 @@ def count_unique_subgraphs_of_size_k(graph, size = 2): @rescue_glycans -def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False): +def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False, terminal = False): """function to retrieve k-saccharides (default:disaccharides) occurring in a list of glycans\n | Arguments: | :- | glycans (list): list of glycans in IUPAC-condensed nomenclature | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides) | up_to (bool): in theory: include -saccharides up to size k; in practice: include monosaccharides; default:False - | just_motifs (bool): if you only want the motifs as a nested list, no dataframe with counts; default:False\n + | just_motifs (bool): if you only want the motifs as a nested list, no dataframe with counts; default:False + | terminal (bool): whether to only count terminal subgraphs; default:False\n | Returns: | :- | Returns dataframe with k-saccharide counts (columns) for each glycan (rows) """ + if not isinstance(glycans, list): + raise TypeError("The input has to be a list of glycans") if any([k in glycans[0] for k in [';', '-D-', 'RES', '=']]): raise Exception if up_to: wga_letter = pd.DataFrame([{i: len(re.findall(rf'{re.escape(i)}(?=\(|$)', g)) for i in get_lib(glycans) if i not in linkages} for g in glycans]) regex = re.compile(r"\(([ab])(\d)-(\d)\)") shadow_glycans = [regex.sub(r"(\1\2-?)", g) for g in glycans] - ggraphs = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size) for g in glycans]) + ggraphs = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size, terminal = terminal) for g in glycans]) ggraphs = ggraphs.drop(columns = [col for col in ggraphs.columns if '?' in col]) - shadow_glycans = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size) for g in shadow_glycans]) + shadow_glycans = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size, terminal = terminal) for g in shadow_glycans]) org_len = ggraphs.shape[1] out_matrix = pd.concat([ggraphs, shadow_glycans], axis = 1).reset_index(drop = True) out_matrix = out_matrix.loc[:, ~out_matrix.columns.duplicated()].copy() @@ -331,7 +343,8 @@ def get_terminal_structures(glycan, size = 1): | Arguments: | :- | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph - | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported); default:1\n + | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported; \ + | if you want to go higher use get_k_saccharides with terminal = True); default:1\n | Returns: | :- | Returns a list of terminal structures (strings) diff --git a/glycowork/glycan_data/stats.py b/glycowork/glycan_data/stats.py index 345cfe3a..6886eed7 100644 --- a/glycowork/glycan_data/stats.py +++ b/glycowork/glycan_data/stats.py @@ -6,7 +6,7 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.base import BaseEstimator from scipy.special import gammaln -from scipy.stats import wilcoxon, rankdata, norm, chi2, t +from scipy.stats import wilcoxon, rankdata, norm, chi2, t, f import scipy.integrate as integrate from statsmodels.stats.multitest import multipletests from statsmodels.tools.sm_exceptions import ConvergenceWarning @@ -645,7 +645,7 @@ def test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, pair return icc, inter_group_corr -def replace_outliers_with_median(full_row): +def replace_outliers_with_IQR_bounds(full_row): """replaces outlier values with row median\n | Arguments: | :- @@ -659,14 +659,49 @@ def replace_outliers_with_median(full_row): Q1 = row.quantile(0.25) Q3 = row.quantile(0.75) IQR = Q3 - Q1 + lower_bound = Q1 - 1.5*IQR + upper_bound = Q3 + 1.5*IQR # Define outliers as values outside of Q1 - 1.5*IQR and Q3 + 1.5*IQR - outlier_condition = ~row.between(Q1 - 1.5*IQR, Q3 + 1.5*IQR) - # Calculate row median - row_median = row.median() + capped_values = row.apply(lambda x: lower_bound if (x < lower_bound and x != 0) else (upper_bound if (x > upper_bound and x != 0) else x)) # Replace outliers with row median if isinstance(full_row.iloc[0], str): - full_row.iloc[1:][outlier_condition] = row_median - return full_row + full_row.iloc[1:] = capped_values else: - row[outlier_condition] = row_median - return row + full_row = capped_values + return full_row + + +def hotellings_t2(group1, group2, paired = False): + """Hotelling's T^2 test (the t-test for multivariate comparisons)\n + """ + if paired: + assert group1.shape == group2.shape, "For paired samples, the size of group1 and group2 should be the same" + group1 -= group2 + group2 = None + # Calculate the means and covariances of each group + n1, p = group1.shape + mean1 = np.mean(group1, axis = 0) + cov1 = np.cov(group1, rowvar = False) + if group2 is not None: + n2, _ = group2.shape + mean2 = np.mean(group2, axis = 0) + cov2 = np.cov(group2, rowvar = False) + else: + n2 = 0 + mean2 = np.zeros_like(mean1) + cov2 = np.zeros_like(cov1) + # Calculate the difference between the means + diff = mean1 - mean2 + # Calculate the pooled covariance matrix + denom = (n1 + n2 - 2) + pooled_cov = cov1 if denom < 1 else ((n1 - 1) * cov1 + (n2 - 1) * cov2) / denom + pooled_cov += np.eye(p) * 1e-6 + # Calculate the Hotelling's T^2 statistic + T2 = (n1 * n2) / (n1 + n2) * diff @ np.linalg.pinv(pooled_cov) @ diff.T + # Convert the T^2 statistic to an F statistic + F = 0 if denom < 1 else T2 * (denom + 1 - p) / (denom * p) + if F == 0: + return F, 1.0 + # Calculate the p-value of the F statistic + p_value = f.sf(F, p, n1 + n2 - p - 1) + return F, p_value diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py index 8d051614..2c696b86 100644 --- a/glycowork/motif/analysis.py +++ b/glycowork/motif/analysis.py @@ -8,7 +8,7 @@ import matplotlib.pyplot as plt plt.style.use('default') from collections import Counter -from scipy.stats import ttest_ind, ttest_rel, f, norm, levene, f_oneway +from scipy.stats import ttest_ind, ttest_rel, norm, levene, f_oneway from statsmodels.formula.api import ols from statsmodels.stats.multitest import multipletests from statsmodels.stats.multicomp import pairwise_tukeyhsd @@ -20,7 +20,7 @@ from glycowork.glycan_data.stats import (cohen_d, mahalanobis_distance, mahalanobis_variance, variance_stabilization, impute_and_normalize, variance_based_filtering, jtkdist, jtkinit, MissForest, jtkx, get_alphaN, TST_grouped_benjamini_hochberg, - test_inter_vs_intra_group, replace_outliers_with_median) + test_inter_vs_intra_group, replace_outliers_with_IQR_bounds, hotellings_t2) from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network, group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type) from glycowork.motif.graph import subgraph_isomorphism @@ -40,7 +40,8 @@ def get_pvals_motifs(df, glycan_col_name = 'glycan', label_col_name = 'target', | sorting (bool): whether p-value dataframe should be sorted ascendingly; default: True | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | multiple_samples (bool): set to True if you have multiple samples (rows) with glycan information (columns); default:False | motifs (dataframe): can be used to pass a modified motif_list to the function; default:None\n | Returns: @@ -53,6 +54,7 @@ def get_pvals_motifs(df, glycan_col_name = 'glycan', label_col_name = 'target', if multiple_samples: df = df.drop('target', axis = 1, errors = 'ignore').T.reset_index() df.columns = [glycan_col_name] + [label_col_name] * (len(df.columns) - 1) + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) if not zscores: means = df.iloc[:, 1:].mean() std_devs = df.iloc[:, 1:].std() @@ -156,7 +158,8 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'], | mode (string): whether glycan 'sequence' or 'motif' should be used for clustering; default:sequence | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | datatype (string): whether df comes from a dataset with quantitative variable ('response') or from presence_to_matrix ('presence') | rarity_filter (float): proportion of samples that need to have a non-zero value for a variable to be included; default:0.05 | filepath (string): absolute path including full filename allows for saving the plot @@ -363,42 +366,6 @@ def characterize_monosaccharide(sugar, df = None, mode = 'sugar', glycan_col_nam plt.show() -def hotellings_t2(group1, group2, paired = False): - """Hotelling's T^2 test (the t-test for multivariate comparisons)\n - """ - if paired: - assert group1.shape == group2.shape, "For paired samples, the size of group1 and group2 should be the same" - group1 -= group2 - group2 = None - # Calculate the means and covariances of each group - n1, p = group1.shape - mean1 = np.mean(group1, axis = 0) - cov1 = np.cov(group1, rowvar = False) - if group2 is not None: - n2, _ = group2.shape - mean2 = np.mean(group2, axis = 0) - cov2 = np.cov(group2, rowvar = False) - else: - n2 = 0 - mean2 = np.zeros_like(mean1) - cov2 = np.zeros_like(cov1) - # Calculate the difference between the means - diff = mean1 - mean2 - # Calculate the pooled covariance matrix - denom = (n1 + n2 - 2) - pooled_cov = cov1 if denom < 1 else ((n1 - 1) * cov1 + (n2 - 1) * cov2) / denom - pooled_cov += np.eye(p) * 1e-6 - # Calculate the Hotelling's T^2 statistic - T2 = (n1 * n2) / (n1 + n2) * diff @ np.linalg.pinv(pooled_cov) @ diff.T - # Convert the T^2 statistic to an F statistic - F = 0 if denom < 1 else T2 * (denom + 1 - p) / (denom * p) - if F == 0: - return F, 1.0 - # Calculate the p-value of the F statistic - p_value = f.sf(F, p, n1 + n2 - p - 1) - return F, p_value - - def get_coverage(df, filepath = ''): """ Plot glycan coverage across samples, ordered by average intensity\n | Arguments: @@ -435,7 +402,8 @@ def get_pca(df, groups = None, motifs = False, feature_set = ['known', 'exhausti | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | pc_x (int): principal component to plot on x axis; default:1 | pc_y (int): principal component to plot on y axis; default:2 | color (string): if design dataframe is provided: column name for color grouping; default:None @@ -529,9 +497,10 @@ def get_differential_expression(df, group1, group2, | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False - | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True + | impute (bool): replaces zeroes with a Random Forest based model; default:True | sets (bool): whether to identify clusters of highly correlated glycans/motifs to test for differential expression; default:False | set_thresh (float): correlation value used as a threshold for clusters; only used when sets=True; default:0.9 | effect_size_variance (bool): whether effect size variance should also be calculated/estimated; default:False @@ -559,7 +528,7 @@ def get_differential_expression(df, group1, group2, df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0) # Drop rows with all zero, followed by outlier removal and imputation & normalization df = df.loc[~(df == 0).all(axis = 1)] - df = df.apply(replace_outliers_with_median, axis = 1) + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) df = impute_and_normalize(df, [group1, group2], impute = impute, min_samples = min_samples) # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment alpha = get_alphaN(df.shape[1] - 1) @@ -732,11 +701,12 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh | :- | df (dataframe): dataframe containing glycan sequences in first column and relative abundances in subsequent columns [alternative: filepath to .csv or .xlsx] | group_sizes (list): a list of group identifiers for each sample (e.g., [1,1,1,2,2,2,3,3,3]) - | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True + | impute (bool): replaces zeroes with with a Random Forest based model; default:True | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group | posthoc (bool): whether to do Tukey's HSD test post-hoc to find out which differences were significant; default:True\n | Returns: @@ -748,6 +718,7 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df) results, posthoc_results = [], {} df.fillna(0, inplace = True) + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) groups_unq = sorted(set(groups)) df = impute_and_normalize(df, [[df.columns[i+1] for i, x in enumerate(groups) if x == g] for g in groups_unq], impute = impute, min_samples = min_samples) @@ -894,11 +865,12 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', ' | Arguments: | :- | df (dataframe): dataframe containing sample IDs of style sampleID_UnitTimepoint_replicate (e.g., T1_h5_r1) in first column and glycan relative abundances in subsequent columns [alternative: filepath to .csv or .xlsx] - | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True + | impute (bool): replaces zeroes with a Random Forest based model; default:True | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | degree (int): degree of the polynomial for regression, default:1 for linear regression | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group\n | Returns: @@ -914,6 +886,7 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', ' df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df) df.fillna(0, inplace = True) df = df.set_index(df.columns[0]).T + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) df = impute_and_normalize(df, [df.columns], impute = impute, min_samples = min_samples).reset_index() # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment alpha = get_alphaN(df.shape[1] - 1) @@ -951,7 +924,8 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set = | motifs (bool): a flag for running structural of motif-based analysis (True = run motif analysis); default:False. | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)\n + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan)\n | Returns: | :- | Returns a pandas dataframe containing the adjusted p-values, and most important waveform parameters for each @@ -967,6 +941,7 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set = "VAR": [], "EXV": [], "SDV": [], "CGOOSV": []} param_dic = jtkdist(timepoints, param_dic, replicates) param_dic = jtkinit(periods, param_dic, interval, replicates) + df = df.apply(replace_outliers_with_IQR_bounds, axis = 1) mf = MissForest() df.replace(0, np.nan, inplace = True) annot = df.pop(df.columns.tolist()[0]) diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py index 5423467a..05231fb5 100644 --- a/glycowork/motif/annotate.py +++ b/glycowork/motif/annotate.py @@ -141,7 +141,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'], | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \ | 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs of size 1), \ - | 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan) + | 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \ + | and 'chemical' (molecular properties of glycan) | termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible') | condense (bool): if True, throws away columns with only zeroes; default:False | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n @@ -197,6 +198,10 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'], bag_out = pd.concat([bag_out, shadow_bag], axis = 1).reset_index(drop = True) bag_out.index = glycans shopping_cart.append(bag_out) + if 'terminal3' in feature_set: + temp = get_k_saccharides(glycans, size = 3, terminal = True) + temp.index = glycans + shopping_cart.append(temp) if condense: # Remove motifs that never occur temp = pd.concat(shopping_cart, axis = 1) @@ -236,12 +241,13 @@ def quantify_motifs(df, glycans, feature_set): return df -def count_unique_subgraphs_of_size_k(graph, size = 2): +def count_unique_subgraphs_of_size_k(graph, size = 2, terminal = False): """function to count unique, connected subgraphs of size k (default:disaccharides) occurring in a glycan graph\n | Arguments: | :- | graph (networkx): glycan graph as returned by glycan_to_nxGraph - | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides)\n + | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides) + | terminal (bool): whether to only count terminal subgraphs; default:False\n | Returns: | :- | Returns dictionary of type k-saccharide:count @@ -253,6 +259,9 @@ def count_unique_subgraphs_of_size_k(graph, size = 2): for _, path_dict in paths: for path in path_dict.values(): if len(path) == size: + if terminal: + if graph.degree[min(path)] != 1: + continue if labels[path[0]][0] not in {'a', 'b', '?', '('}: path_sorted = sorted(path) path_str = "" @@ -274,27 +283,30 @@ def count_unique_subgraphs_of_size_k(graph, size = 2): @rescue_glycans -def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False): +def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False, terminal = False): """function to retrieve k-saccharides (default:disaccharides) occurring in a list of glycans\n | Arguments: | :- | glycans (list): list of glycans in IUPAC-condensed nomenclature | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides) | up_to (bool): in theory: include -saccharides up to size k; in practice: include monosaccharides; default:False - | just_motifs (bool): if you only want the motifs as a nested list, no dataframe with counts; default:False\n + | just_motifs (bool): if you only want the motifs as a nested list, no dataframe with counts; default:False + | terminal (bool): whether to only count terminal subgraphs; default:False\n | Returns: | :- | Returns dataframe with k-saccharide counts (columns) for each glycan (rows) """ + if not isinstance(glycans, list): + raise TypeError("The input has to be a list of glycans") if any([k in glycans[0] for k in [';', '-D-', 'RES', '=']]): raise Exception if up_to: wga_letter = pd.DataFrame([{i: len(re.findall(rf'{re.escape(i)}(?=\(|$)', g)) for i in get_lib(glycans) if i not in linkages} for g in glycans]) regex = re.compile(r"\(([ab])(\d)-(\d)\)") shadow_glycans = [regex.sub(r"(\1\2-?)", g) for g in glycans] - ggraphs = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size) for g in glycans]) + ggraphs = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size, terminal = terminal) for g in glycans]) ggraphs = ggraphs.drop(columns = [col for col in ggraphs.columns if '?' in col]) - shadow_glycans = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size) for g in shadow_glycans]) + shadow_glycans = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size, terminal = terminal) for g in shadow_glycans]) org_len = ggraphs.shape[1] out_matrix = pd.concat([ggraphs, shadow_glycans], axis = 1).reset_index(drop = True) out_matrix = out_matrix.loc[:, ~out_matrix.columns.duplicated()].copy() @@ -331,7 +343,8 @@ def get_terminal_structures(glycan, size = 1): | Arguments: | :- | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph - | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported); default:1\n + | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported; \ + | if you want to go higher use get_k_saccharides with terminal = True); default:1\n | Returns: | :- | Returns a list of terminal structures (strings)