From 3d64c6c63824dc3a90410224e0e6c77c354a339c Mon Sep 17 00:00:00 2001
From: Daniel Bojar <daniel@bojar.net>
Date: Wed, 6 Mar 2024 09:39:54 +0100
Subject: [PATCH] spread outlier smoothing and add terminal3

---
 build/lib/glycowork/motif/analysis.py | 73 +++++++++------------------
 build/lib/glycowork/motif/annotate.py | 29 ++++++++---
 glycowork/glycan_data/stats.py        | 53 +++++++++++++++----
 glycowork/motif/analysis.py           | 73 +++++++++------------------
 glycowork/motif/annotate.py           | 29 ++++++++---
 5 files changed, 134 insertions(+), 123 deletions(-)

diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py
index 8d051614..2c696b86 100644
--- a/build/lib/glycowork/motif/analysis.py
+++ b/build/lib/glycowork/motif/analysis.py
@@ -8,7 +8,7 @@
 import matplotlib.pyplot as plt
 plt.style.use('default')
 from collections import Counter
-from scipy.stats import ttest_ind, ttest_rel, f, norm, levene, f_oneway
+from scipy.stats import ttest_ind, ttest_rel, norm, levene, f_oneway
 from statsmodels.formula.api import ols
 from statsmodels.stats.multitest import multipletests
 from statsmodels.stats.multicomp import pairwise_tukeyhsd
@@ -20,7 +20,7 @@
 from glycowork.glycan_data.stats import (cohen_d, mahalanobis_distance, mahalanobis_variance,
                                          variance_stabilization, impute_and_normalize, variance_based_filtering,
                                          jtkdist, jtkinit, MissForest, jtkx, get_alphaN, TST_grouped_benjamini_hochberg,
-                                         test_inter_vs_intra_group, replace_outliers_with_median)
+                                         test_inter_vs_intra_group, replace_outliers_with_IQR_bounds, hotellings_t2)
 from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
                                       group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type)
 from glycowork.motif.graph import subgraph_isomorphism
@@ -40,7 +40,8 @@ def get_pvals_motifs(df, glycan_col_name = 'glycan', label_col_name = 'target',
     | sorting (bool): whether p-value dataframe should be sorted ascendingly; default: True
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
     |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+    |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+    |   and 'chemical' (molecular properties of glycan)
     | multiple_samples (bool): set to True if you have multiple samples (rows) with glycan information (columns); default:False
     | motifs (dataframe): can be used to pass a modified motif_list to the function; default:None\n
     | Returns:
@@ -53,6 +54,7 @@ def get_pvals_motifs(df, glycan_col_name = 'glycan', label_col_name = 'target',
     if multiple_samples:
         df = df.drop('target', axis = 1, errors = 'ignore').T.reset_index()
         df.columns = [glycan_col_name] + [label_col_name] * (len(df.columns) - 1)
+        df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
     if not zscores:
         means = df.iloc[:, 1:].mean()
         std_devs = df.iloc[:, 1:].std()
@@ -156,7 +158,8 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
   | mode (string): whether glycan 'sequence' or 'motif' should be used for clustering; default:sequence
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+  |   and 'chemical' (molecular properties of glycan)
   | datatype (string): whether df comes from a dataset with quantitative variable ('response') or from presence_to_matrix ('presence')
   | rarity_filter (float): proportion of samples that need to have a non-zero value for a variable to be included; default:0.05
   | filepath (string): absolute path including full filename allows for saving the plot
@@ -363,42 +366,6 @@ def characterize_monosaccharide(sugar, df = None, mode = 'sugar', glycan_col_nam
   plt.show()
 
 
-def hotellings_t2(group1, group2, paired = False):
-  """Hotelling's T^2 test (the t-test for multivariate comparisons)\n
-  """
-  if paired:
-    assert group1.shape == group2.shape, "For paired samples, the size of group1 and group2 should be the same"
-    group1 -= group2
-    group2 = None
-  # Calculate the means and covariances of each group
-  n1, p = group1.shape
-  mean1 = np.mean(group1, axis = 0)
-  cov1 = np.cov(group1, rowvar = False)
-  if group2 is not None:
-    n2, _ = group2.shape
-    mean2 = np.mean(group2, axis = 0)
-    cov2 = np.cov(group2, rowvar = False)
-  else:
-    n2 = 0
-    mean2 = np.zeros_like(mean1)
-    cov2 = np.zeros_like(cov1)
-  # Calculate the difference between the means
-  diff = mean1 - mean2
-  # Calculate the pooled covariance matrix
-  denom = (n1 + n2 - 2)
-  pooled_cov = cov1 if denom < 1 else ((n1 - 1) * cov1 + (n2 - 1) * cov2) / denom
-  pooled_cov += np.eye(p) * 1e-6
-  # Calculate the Hotelling's T^2 statistic
-  T2 = (n1 * n2) / (n1 + n2) * diff @ np.linalg.pinv(pooled_cov) @ diff.T
-  # Convert the T^2 statistic to an F statistic
-  F = 0 if denom < 1 else T2 * (denom + 1 - p) / (denom * p)
-  if F == 0:
-    return F, 1.0
-  # Calculate the p-value of the F statistic
-  p_value = f.sf(F, p, n1 + n2 - p - 1)
-  return F, p_value
-
-
 def get_coverage(df, filepath = ''):
   """ Plot glycan coverage across samples, ordered by average intensity\n
   | Arguments:
@@ -435,7 +402,8 @@ def get_pca(df, groups = None, motifs = False, feature_set = ['known', 'exhausti
   | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+  |   and 'chemical' (molecular properties of glycan)
   | pc_x (int): principal component to plot on x axis; default:1
   | pc_y (int): principal component to plot on y axis; default:2
   | color (string): if design dataframe is provided: column name for color grouping; default:None
@@ -529,9 +497,10 @@ def get_differential_expression(df, group1, group2,
   | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+  |   and 'chemical' (molecular properties of glycan)
   | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
-  | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
+  | impute (bool): replaces zeroes with a Random Forest based model; default:True
   | sets (bool): whether to identify clusters of highly correlated glycans/motifs to test for differential expression; default:False
   | set_thresh (float): correlation value used as a threshold for clusters; only used when sets=True; default:0.9
   | effect_size_variance (bool): whether effect size variance should also be calculated/estimated; default:False
@@ -559,7 +528,7 @@ def get_differential_expression(df, group1, group2,
   df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0)
   # Drop rows with all zero, followed by outlier removal and imputation & normalization
   df = df.loc[~(df == 0).all(axis = 1)]
-  df = df.apply(replace_outliers_with_median, axis = 1)
+  df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
   df = impute_and_normalize(df, [group1, group2], impute = impute, min_samples = min_samples)
   # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
   alpha = get_alphaN(df.shape[1] - 1)
@@ -732,11 +701,12 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh
     | :-
     | df (dataframe): dataframe containing glycan sequences in first column and relative abundances in subsequent columns [alternative: filepath to .csv or .xlsx]
     | group_sizes (list): a list of group identifiers for each sample (e.g., [1,1,1,2,2,2,3,3,3])
-    | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
+    | impute (bool): replaces zeroes with with a Random Forest based model; default:True
     | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
     |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+    |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+    |   and 'chemical' (molecular properties of glycan)
     | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group
     | posthoc (bool): whether to do Tukey's HSD test post-hoc to find out which differences were significant; default:True\n
     | Returns:
@@ -748,6 +718,7 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh
         df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
     results, posthoc_results = [], {}
     df.fillna(0, inplace = True)
+    df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
     groups_unq = sorted(set(groups))
     df = impute_and_normalize(df, [[df.columns[i+1] for i, x in enumerate(groups) if x == g] for g in groups_unq], impute = impute,
                               min_samples = min_samples)
@@ -894,11 +865,12 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
     | Arguments:
     | :-
     | df (dataframe): dataframe containing sample IDs of style sampleID_UnitTimepoint_replicate (e.g., T1_h5_r1) in first column and glycan relative abundances in subsequent columns [alternative: filepath to .csv or .xlsx]
-    | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
+    | impute (bool): replaces zeroes with a Random Forest based model; default:True
     | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
     |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+    |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+    |   and 'chemical' (molecular properties of glycan)
     | degree (int): degree of the polynomial for regression, default:1 for linear regression
     | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group\n
     | Returns:
@@ -914,6 +886,7 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
         df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
     df.fillna(0, inplace = True)
     df = df.set_index(df.columns[0]).T
+    df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
     df = impute_and_normalize(df, [df.columns], impute = impute, min_samples = min_samples).reset_index()
     # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
     alpha = get_alphaN(df.shape[1] - 1)
@@ -951,7 +924,8 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
     | motifs (bool): a flag for running structural of motif-based analysis (True = run motif analysis); default:False.
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
     |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)\n
+    |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+    |   and 'chemical' (molecular properties of glycan)\n
     | Returns:
     | :-
     | Returns a pandas dataframe containing the adjusted p-values, and most important waveform parameters for each
@@ -967,6 +941,7 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
                  "VAR": [], "EXV": [], "SDV": [], "CGOOSV": []}
     param_dic = jtkdist(timepoints, param_dic, replicates)
     param_dic = jtkinit(periods, param_dic, interval, replicates)
+    df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
     mf = MissForest()
     df.replace(0, np.nan, inplace = True)
     annot = df.pop(df.columns.tolist()[0])
diff --git a/build/lib/glycowork/motif/annotate.py b/build/lib/glycowork/motif/annotate.py
index 5423467a..05231fb5 100644
--- a/build/lib/glycowork/motif/annotate.py
+++ b/build/lib/glycowork/motif/annotate.py
@@ -141,7 +141,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
   | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs of size 1), \
-  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+  |   and 'chemical' (molecular properties of glycan)
   | termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
   | condense (bool): if True, throws away columns with only zeroes; default:False
   | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n
@@ -197,6 +198,10 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
     bag_out = pd.concat([bag_out, shadow_bag], axis = 1).reset_index(drop = True)
     bag_out.index = glycans
     shopping_cart.append(bag_out)
+  if 'terminal3' in feature_set:
+    temp = get_k_saccharides(glycans, size = 3, terminal = True)
+    temp.index = glycans
+    shopping_cart.append(temp)
   if condense:
     # Remove motifs that never occur
     temp = pd.concat(shopping_cart, axis = 1)
@@ -236,12 +241,13 @@ def quantify_motifs(df, glycans, feature_set):
     return df
 
 
-def count_unique_subgraphs_of_size_k(graph, size = 2):
+def count_unique_subgraphs_of_size_k(graph, size = 2, terminal = False):
   """function to count unique, connected subgraphs of size k (default:disaccharides) occurring in a glycan graph\n
   | Arguments:
   | :-
   | graph (networkx): glycan graph as returned by glycan_to_nxGraph
-  | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides)\n
+  | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides)
+  | terminal (bool): whether to only count terminal subgraphs; default:False\n
   | Returns:
   | :-                   
   | Returns dictionary of type k-saccharide:count
@@ -253,6 +259,9 @@ def count_unique_subgraphs_of_size_k(graph, size = 2):
   for _, path_dict in paths:
     for path in path_dict.values():
       if len(path) == size:
+        if terminal:
+          if graph.degree[min(path)] != 1:
+            continue
         if labels[path[0]][0] not in {'a', 'b', '?', '('}:
           path_sorted = sorted(path)
           path_str = ""
@@ -274,27 +283,30 @@ def count_unique_subgraphs_of_size_k(graph, size = 2):
 
 
 @rescue_glycans
-def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False):
+def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False, terminal = False):
   """function to retrieve k-saccharides (default:disaccharides) occurring in a list of glycans\n
   | Arguments:
   | :-
   | glycans (list): list of glycans in IUPAC-condensed nomenclature
   | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides)
   | up_to (bool): in theory: include -saccharides up to size k; in practice: include monosaccharides; default:False
-  | just_motifs (bool): if you only want the motifs as a nested list, no dataframe with counts; default:False\n
+  | just_motifs (bool): if you only want the motifs as a nested list, no dataframe with counts; default:False
+  | terminal (bool): whether to only count terminal subgraphs; default:False\n
   | Returns:
   | :-                 
   | Returns dataframe with k-saccharide counts (columns) for each glycan (rows)
   """
+  if not isinstance(glycans, list):
+    raise TypeError("The input has to be a list of glycans")
   if any([k in glycans[0] for k in [';', '-D-', 'RES', '=']]):
     raise Exception
   if up_to:
     wga_letter = pd.DataFrame([{i: len(re.findall(rf'{re.escape(i)}(?=\(|$)', g)) for i in get_lib(glycans) if i not in linkages} for g in glycans])
   regex = re.compile(r"\(([ab])(\d)-(\d)\)")
   shadow_glycans = [regex.sub(r"(\1\2-?)", g) for g in glycans]
-  ggraphs = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size) for g in glycans])
+  ggraphs = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size, terminal = terminal) for g in glycans])
   ggraphs = ggraphs.drop(columns = [col for col in ggraphs.columns if '?' in col])
-  shadow_glycans = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size) for g in shadow_glycans])
+  shadow_glycans = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size, terminal = terminal) for g in shadow_glycans])
   org_len = ggraphs.shape[1]
   out_matrix = pd.concat([ggraphs, shadow_glycans], axis = 1).reset_index(drop = True)
   out_matrix = out_matrix.loc[:, ~out_matrix.columns.duplicated()].copy()
@@ -331,7 +343,8 @@ def get_terminal_structures(glycan, size = 1):
   | Arguments:
   | :-
   | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph
-  | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported); default:1\n
+  | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported; \
+  |   if you want to go higher use get_k_saccharides with terminal = True); default:1\n
   | Returns:
   | :-
   | Returns a list of terminal structures (strings)
diff --git a/glycowork/glycan_data/stats.py b/glycowork/glycan_data/stats.py
index 345cfe3a..6886eed7 100644
--- a/glycowork/glycan_data/stats.py
+++ b/glycowork/glycan_data/stats.py
@@ -6,7 +6,7 @@
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.base import BaseEstimator
 from scipy.special import gammaln
-from scipy.stats import wilcoxon, rankdata, norm, chi2, t
+from scipy.stats import wilcoxon, rankdata, norm, chi2, t, f
 import scipy.integrate as integrate
 from statsmodels.stats.multitest import multipletests
 from statsmodels.tools.sm_exceptions import ConvergenceWarning
@@ -645,7 +645,7 @@ def test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, pair
   return icc, inter_group_corr
 
 
-def replace_outliers_with_median(full_row):
+def replace_outliers_with_IQR_bounds(full_row):
   """replaces outlier values with row median\n
   | Arguments:
   | :-
@@ -659,14 +659,49 @@ def replace_outliers_with_median(full_row):
   Q1 = row.quantile(0.25)
   Q3 = row.quantile(0.75)
   IQR = Q3 - Q1
+  lower_bound = Q1 - 1.5*IQR
+  upper_bound = Q3 + 1.5*IQR
   # Define outliers as values outside of Q1 - 1.5*IQR and Q3 + 1.5*IQR
-  outlier_condition = ~row.between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)
-  # Calculate row median
-  row_median = row.median()
+  capped_values = row.apply(lambda x: lower_bound if (x < lower_bound and x != 0) else (upper_bound if (x > upper_bound and x != 0) else x))
   # Replace outliers with row median
   if isinstance(full_row.iloc[0], str):
-    full_row.iloc[1:][outlier_condition] = row_median
-    return full_row
+    full_row.iloc[1:] = capped_values
   else:
-    row[outlier_condition] = row_median
-    return row
+    full_row = capped_values
+  return full_row
+
+
+def hotellings_t2(group1, group2, paired = False):
+  """Hotelling's T^2 test (the t-test for multivariate comparisons)\n
+  """
+  if paired:
+    assert group1.shape == group2.shape, "For paired samples, the size of group1 and group2 should be the same"
+    group1 -= group2
+    group2 = None
+  # Calculate the means and covariances of each group
+  n1, p = group1.shape
+  mean1 = np.mean(group1, axis = 0)
+  cov1 = np.cov(group1, rowvar = False)
+  if group2 is not None:
+    n2, _ = group2.shape
+    mean2 = np.mean(group2, axis = 0)
+    cov2 = np.cov(group2, rowvar = False)
+  else:
+    n2 = 0
+    mean2 = np.zeros_like(mean1)
+    cov2 = np.zeros_like(cov1)
+  # Calculate the difference between the means
+  diff = mean1 - mean2
+  # Calculate the pooled covariance matrix
+  denom = (n1 + n2 - 2)
+  pooled_cov = cov1 if denom < 1 else ((n1 - 1) * cov1 + (n2 - 1) * cov2) / denom
+  pooled_cov += np.eye(p) * 1e-6
+  # Calculate the Hotelling's T^2 statistic
+  T2 = (n1 * n2) / (n1 + n2) * diff @ np.linalg.pinv(pooled_cov) @ diff.T
+  # Convert the T^2 statistic to an F statistic
+  F = 0 if denom < 1 else T2 * (denom + 1 - p) / (denom * p)
+  if F == 0:
+    return F, 1.0
+  # Calculate the p-value of the F statistic
+  p_value = f.sf(F, p, n1 + n2 - p - 1)
+  return F, p_value
diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
index 8d051614..2c696b86 100644
--- a/glycowork/motif/analysis.py
+++ b/glycowork/motif/analysis.py
@@ -8,7 +8,7 @@
 import matplotlib.pyplot as plt
 plt.style.use('default')
 from collections import Counter
-from scipy.stats import ttest_ind, ttest_rel, f, norm, levene, f_oneway
+from scipy.stats import ttest_ind, ttest_rel, norm, levene, f_oneway
 from statsmodels.formula.api import ols
 from statsmodels.stats.multitest import multipletests
 from statsmodels.stats.multicomp import pairwise_tukeyhsd
@@ -20,7 +20,7 @@
 from glycowork.glycan_data.stats import (cohen_d, mahalanobis_distance, mahalanobis_variance,
                                          variance_stabilization, impute_and_normalize, variance_based_filtering,
                                          jtkdist, jtkinit, MissForest, jtkx, get_alphaN, TST_grouped_benjamini_hochberg,
-                                         test_inter_vs_intra_group, replace_outliers_with_median)
+                                         test_inter_vs_intra_group, replace_outliers_with_IQR_bounds, hotellings_t2)
 from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
                                       group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type)
 from glycowork.motif.graph import subgraph_isomorphism
@@ -40,7 +40,8 @@ def get_pvals_motifs(df, glycan_col_name = 'glycan', label_col_name = 'target',
     | sorting (bool): whether p-value dataframe should be sorted ascendingly; default: True
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
     |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+    |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+    |   and 'chemical' (molecular properties of glycan)
     | multiple_samples (bool): set to True if you have multiple samples (rows) with glycan information (columns); default:False
     | motifs (dataframe): can be used to pass a modified motif_list to the function; default:None\n
     | Returns:
@@ -53,6 +54,7 @@ def get_pvals_motifs(df, glycan_col_name = 'glycan', label_col_name = 'target',
     if multiple_samples:
         df = df.drop('target', axis = 1, errors = 'ignore').T.reset_index()
         df.columns = [glycan_col_name] + [label_col_name] * (len(df.columns) - 1)
+        df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
     if not zscores:
         means = df.iloc[:, 1:].mean()
         std_devs = df.iloc[:, 1:].std()
@@ -156,7 +158,8 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
   | mode (string): whether glycan 'sequence' or 'motif' should be used for clustering; default:sequence
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+  |   and 'chemical' (molecular properties of glycan)
   | datatype (string): whether df comes from a dataset with quantitative variable ('response') or from presence_to_matrix ('presence')
   | rarity_filter (float): proportion of samples that need to have a non-zero value for a variable to be included; default:0.05
   | filepath (string): absolute path including full filename allows for saving the plot
@@ -363,42 +366,6 @@ def characterize_monosaccharide(sugar, df = None, mode = 'sugar', glycan_col_nam
   plt.show()
 
 
-def hotellings_t2(group1, group2, paired = False):
-  """Hotelling's T^2 test (the t-test for multivariate comparisons)\n
-  """
-  if paired:
-    assert group1.shape == group2.shape, "For paired samples, the size of group1 and group2 should be the same"
-    group1 -= group2
-    group2 = None
-  # Calculate the means and covariances of each group
-  n1, p = group1.shape
-  mean1 = np.mean(group1, axis = 0)
-  cov1 = np.cov(group1, rowvar = False)
-  if group2 is not None:
-    n2, _ = group2.shape
-    mean2 = np.mean(group2, axis = 0)
-    cov2 = np.cov(group2, rowvar = False)
-  else:
-    n2 = 0
-    mean2 = np.zeros_like(mean1)
-    cov2 = np.zeros_like(cov1)
-  # Calculate the difference between the means
-  diff = mean1 - mean2
-  # Calculate the pooled covariance matrix
-  denom = (n1 + n2 - 2)
-  pooled_cov = cov1 if denom < 1 else ((n1 - 1) * cov1 + (n2 - 1) * cov2) / denom
-  pooled_cov += np.eye(p) * 1e-6
-  # Calculate the Hotelling's T^2 statistic
-  T2 = (n1 * n2) / (n1 + n2) * diff @ np.linalg.pinv(pooled_cov) @ diff.T
-  # Convert the T^2 statistic to an F statistic
-  F = 0 if denom < 1 else T2 * (denom + 1 - p) / (denom * p)
-  if F == 0:
-    return F, 1.0
-  # Calculate the p-value of the F statistic
-  p_value = f.sf(F, p, n1 + n2 - p - 1)
-  return F, p_value
-
-
 def get_coverage(df, filepath = ''):
   """ Plot glycan coverage across samples, ordered by average intensity\n
   | Arguments:
@@ -435,7 +402,8 @@ def get_pca(df, groups = None, motifs = False, feature_set = ['known', 'exhausti
   | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+  |   and 'chemical' (molecular properties of glycan)
   | pc_x (int): principal component to plot on x axis; default:1
   | pc_y (int): principal component to plot on y axis; default:2
   | color (string): if design dataframe is provided: column name for color grouping; default:None
@@ -529,9 +497,10 @@ def get_differential_expression(df, group1, group2,
   | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+  |   and 'chemical' (molecular properties of glycan)
   | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
-  | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
+  | impute (bool): replaces zeroes with a Random Forest based model; default:True
   | sets (bool): whether to identify clusters of highly correlated glycans/motifs to test for differential expression; default:False
   | set_thresh (float): correlation value used as a threshold for clusters; only used when sets=True; default:0.9
   | effect_size_variance (bool): whether effect size variance should also be calculated/estimated; default:False
@@ -559,7 +528,7 @@ def get_differential_expression(df, group1, group2,
   df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0)
   # Drop rows with all zero, followed by outlier removal and imputation & normalization
   df = df.loc[~(df == 0).all(axis = 1)]
-  df = df.apply(replace_outliers_with_median, axis = 1)
+  df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
   df = impute_and_normalize(df, [group1, group2], impute = impute, min_samples = min_samples)
   # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
   alpha = get_alphaN(df.shape[1] - 1)
@@ -732,11 +701,12 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh
     | :-
     | df (dataframe): dataframe containing glycan sequences in first column and relative abundances in subsequent columns [alternative: filepath to .csv or .xlsx]
     | group_sizes (list): a list of group identifiers for each sample (e.g., [1,1,1,2,2,2,3,3,3])
-    | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
+    | impute (bool): replaces zeroes with with a Random Forest based model; default:True
     | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
     |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+    |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+    |   and 'chemical' (molecular properties of glycan)
     | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group
     | posthoc (bool): whether to do Tukey's HSD test post-hoc to find out which differences were significant; default:True\n
     | Returns:
@@ -748,6 +718,7 @@ def get_glycanova(df, groups, impute = True, motifs = False, feature_set = ['exh
         df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
     results, posthoc_results = [], {}
     df.fillna(0, inplace = True)
+    df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
     groups_unq = sorted(set(groups))
     df = impute_and_normalize(df, [[df.columns[i+1] for i, x in enumerate(groups) if x == g] for g in groups_unq], impute = impute,
                               min_samples = min_samples)
@@ -894,11 +865,12 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
     | Arguments:
     | :-
     | df (dataframe): dataframe containing sample IDs of style sampleID_UnitTimepoint_replicate (e.g., T1_h5_r1) in first column and glycan relative abundances in subsequent columns [alternative: filepath to .csv or .xlsx]
-    | impute (bool): replaces zeroes with draws from left-shifted distribution or KNN-Imputer; default:True
+    | impute (bool): replaces zeroes with a Random Forest based model; default:True
     | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
     |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+    |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+    |   and 'chemical' (molecular properties of glycan)
     | degree (int): degree of the polynomial for regression, default:1 for linear regression
     | min_samples (int): How many samples per group need to have non-zero values for glycan to be kept; default: at least half per group\n
     | Returns:
@@ -914,6 +886,7 @@ def get_time_series(df, impute = True, motifs = False, feature_set = ['known', '
         df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
     df.fillna(0, inplace = True)
     df = df.set_index(df.columns[0]).T
+    df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
     df = impute_and_normalize(df, [df.columns], impute = impute, min_samples = min_samples).reset_index()
     # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
     alpha = get_alphaN(df.shape[1] - 1)
@@ -951,7 +924,8 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
     | motifs (bool): a flag for running structural of motif-based analysis (True = run motif analysis); default:False.
     | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
     |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
-    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)\n
+    |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+    |   and 'chemical' (molecular properties of glycan)\n
     | Returns:
     | :-
     | Returns a pandas dataframe containing the adjusted p-values, and most important waveform parameters for each
@@ -967,6 +941,7 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
                  "VAR": [], "EXV": [], "SDV": [], "CGOOSV": []}
     param_dic = jtkdist(timepoints, param_dic, replicates)
     param_dic = jtkinit(periods, param_dic, interval, replicates)
+    df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
     mf = MissForest()
     df.replace(0, np.nan, inplace = True)
     annot = df.pop(df.columns.tolist()[0])
diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py
index 5423467a..05231fb5 100644
--- a/glycowork/motif/annotate.py
+++ b/glycowork/motif/annotate.py
@@ -141,7 +141,8 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
   | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs of size 1), \
-  |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
+  |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
+  |   and 'chemical' (molecular properties of glycan)
   | termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
   | condense (bool): if True, throws away columns with only zeroes; default:False
   | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n
@@ -197,6 +198,10 @@ def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
     bag_out = pd.concat([bag_out, shadow_bag], axis = 1).reset_index(drop = True)
     bag_out.index = glycans
     shopping_cart.append(bag_out)
+  if 'terminal3' in feature_set:
+    temp = get_k_saccharides(glycans, size = 3, terminal = True)
+    temp.index = glycans
+    shopping_cart.append(temp)
   if condense:
     # Remove motifs that never occur
     temp = pd.concat(shopping_cart, axis = 1)
@@ -236,12 +241,13 @@ def quantify_motifs(df, glycans, feature_set):
     return df
 
 
-def count_unique_subgraphs_of_size_k(graph, size = 2):
+def count_unique_subgraphs_of_size_k(graph, size = 2, terminal = False):
   """function to count unique, connected subgraphs of size k (default:disaccharides) occurring in a glycan graph\n
   | Arguments:
   | :-
   | graph (networkx): glycan graph as returned by glycan_to_nxGraph
-  | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides)\n
+  | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides)
+  | terminal (bool): whether to only count terminal subgraphs; default:False\n
   | Returns:
   | :-                   
   | Returns dictionary of type k-saccharide:count
@@ -253,6 +259,9 @@ def count_unique_subgraphs_of_size_k(graph, size = 2):
   for _, path_dict in paths:
     for path in path_dict.values():
       if len(path) == size:
+        if terminal:
+          if graph.degree[min(path)] != 1:
+            continue
         if labels[path[0]][0] not in {'a', 'b', '?', '('}:
           path_sorted = sorted(path)
           path_str = ""
@@ -274,27 +283,30 @@ def count_unique_subgraphs_of_size_k(graph, size = 2):
 
 
 @rescue_glycans
-def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False):
+def get_k_saccharides(glycans, size = 2, up_to = False, just_motifs = False, terminal = False):
   """function to retrieve k-saccharides (default:disaccharides) occurring in a list of glycans\n
   | Arguments:
   | :-
   | glycans (list): list of glycans in IUPAC-condensed nomenclature
   | size (int): number of monosaccharides per -saccharide, default:2 (for disaccharides)
   | up_to (bool): in theory: include -saccharides up to size k; in practice: include monosaccharides; default:False
-  | just_motifs (bool): if you only want the motifs as a nested list, no dataframe with counts; default:False\n
+  | just_motifs (bool): if you only want the motifs as a nested list, no dataframe with counts; default:False
+  | terminal (bool): whether to only count terminal subgraphs; default:False\n
   | Returns:
   | :-                 
   | Returns dataframe with k-saccharide counts (columns) for each glycan (rows)
   """
+  if not isinstance(glycans, list):
+    raise TypeError("The input has to be a list of glycans")
   if any([k in glycans[0] for k in [';', '-D-', 'RES', '=']]):
     raise Exception
   if up_to:
     wga_letter = pd.DataFrame([{i: len(re.findall(rf'{re.escape(i)}(?=\(|$)', g)) for i in get_lib(glycans) if i not in linkages} for g in glycans])
   regex = re.compile(r"\(([ab])(\d)-(\d)\)")
   shadow_glycans = [regex.sub(r"(\1\2-?)", g) for g in glycans]
-  ggraphs = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size) for g in glycans])
+  ggraphs = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size, terminal = terminal) for g in glycans])
   ggraphs = ggraphs.drop(columns = [col for col in ggraphs.columns if '?' in col])
-  shadow_glycans = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size) for g in shadow_glycans])
+  shadow_glycans = pd.DataFrame([count_unique_subgraphs_of_size_k(glycan_to_nxGraph(g), size = size, terminal = terminal) for g in shadow_glycans])
   org_len = ggraphs.shape[1]
   out_matrix = pd.concat([ggraphs, shadow_glycans], axis = 1).reset_index(drop = True)
   out_matrix = out_matrix.loc[:, ~out_matrix.columns.duplicated()].copy()
@@ -331,7 +343,8 @@ def get_terminal_structures(glycan, size = 1):
   | Arguments:
   | :-
   | glycan (string or networkx): glycan in IUPAC-condensed nomenclature or as networkx graph
-  | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported); default:1\n
+  | size (int): how large the extracted motif should be in terms of monosaccharides (for now 1 or 2 are supported; \
+  |   if you want to go higher use get_k_saccharides with terminal = True); default:1\n
   | Returns:
   | :-
   | Returns a list of terminal structures (strings)