add replace_outliers_with_median and start adding outlier handling in…

… .analysis
BojarLab · Mar 4, 2024 · 49e9909 · 49e9909
1 parent 1a95a27
commit 49e9909
Show file tree

Hide file tree

Showing 8 changed files with 48 additions and 17 deletions.
diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py
@@ -20,7 +20,7 @@
 from glycowork.glycan_data.stats import (cohen_d, mahalanobis_distance, mahalanobis_variance,
                                          variance_stabilization, impute_and_normalize, variance_based_filtering,
                                          jtkdist, jtkinit, MissForest, jtkx, get_alphaN, TST_grouped_benjamini_hochberg,
-                                         test_inter_vs_intra_group)
+                                         test_inter_vs_intra_group, replace_outliers_with_median)
 from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
                                       group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type)
 from glycowork.motif.graph import subgraph_isomorphism
@@ -557,8 +557,9 @@ def get_differential_expression(df, group1, group2,
       group1 = [columns_list[k] for k in group1]
       group2 = [columns_list[k] for k in group2]
   df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0)
-  # Drop rows with all zero, followed by imputation & normalization
+  # Drop rows with all zero, followed by outlier removal and imputation & normalization
   df = df.loc[~(df == 0).all(axis = 1)]
+  df = df.apply(replace_outliers_with_median, axis = 1)
   df = impute_and_normalize(df, [group1, group2], impute = impute, min_samples = min_samples)
   # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
   alpha = get_alphaN(df.shape[1] - 1)

diff --git a/build/lib/glycowork/motif/annotate.py b/build/lib/glycowork/motif/annotate.py
@@ -209,15 +209,17 @@ def quantify_motifs(df, glycans, feature_set):
     """Extracts and quantifies motifs for a dataset\n
     | Arguments:
     | :-
-    | df (dataframe): dataframe containing relative abundances (each sample one column) [alternative: filepath to .csv]
+    | df (dataframe): dataframe containing relative abundances (each sample one column) [alternative: filepath to .csv or .xlsx]
     | glycans(list): glycans as IUPAC-condensed strings
-    | feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
+    | feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), \
+    |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
+    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
     | Returns:
     | :-
     | Returns a pandas DataFrame with motifs as columns and samples as rows
     """
     if isinstance(df, str):
-      df = pd.read_csv(df)
+      df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
     # Motif extraction
     df_motif = annotate_dataset(glycans,
                                 feature_set = feature_set,

diff --git a/build/lib/glycowork/motif/draw.py b/build/lib/glycowork/motif/draw.py
@@ -1927,16 +1927,15 @@ def GlycoDraw(draw_this, vertical = False, compact = False, show_linkage = True,
       floaty_bits.append(draw_this[openpos:closepos]+'blank')
       draw_this = draw_this[:openpos-1] + len(draw_this[openpos-1:closepos+1])*'*' + draw_this[closepos+1:]
   draw_this = draw_this.replace('*', '')
-
 
   if draw_this in motif_list.motif_name.values.tolist():
     draw_this = motif_list.loc[motif_list.motif_name == draw_this].motif.values.tolist()[0]
 
   try:
     data = get_coordinates_and_labels(draw_this, show_linkage = show_linkage, highlight_motif = highlight_motif, termini_list = highlight_termini_list)
   except:
-    return print('Error: did you enter a real glycan or motif?')
-    ys.exit(1)
+    print('Warning: did you enter a real glycan or motif?')
+    raise Exception
 
   main_sugar, main_sugar_x_pos, main_sugar_y_pos, main_sugar_modification, main_bond, main_conf, main_sugar_label, main_bond_label = data[0]
   branch_sugar, branch_x_pos, branch_y_pos, branch_sugar_modification, branch_bond, branch_connection, b_conf, branch_sugar_label, branch_bond_label = data[1]

diff --git a/glycowork/glycan_data/stats.py b/glycowork/glycan_data/stats.py
@@ -643,3 +643,30 @@ def test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, pair
   # Calculate Inter-group Correlation
   inter_group_corr = var_samples / total_var
   return icc, inter_group_corr
+
+
def replace_outliers_with_median(full_row):
  """replaces outlier values (outside of Q1 - 1.5*IQR and Q3 + 1.5*IQR) with the row median\n
  | Arguments:
  | :-
  | full_row (pd.DataFrame row): row from a pandas dataframe, with all but possibly the first value being numerical\n
  | Returns:
  | :-
  | Returns a new row with replaced outliers; the input row is not modified and missing values stay missing
  """
  # A leading string entry is carried along unchanged and excluded from the statistics
  offset = 1 if isinstance(full_row.iloc[0], str) else 0
  # Rows that mix a string with numbers are object dtype, so cast the numerical part before computing quantiles
  values = full_row.iloc[offset:].astype(float)
  # Calculate Q1, Q3, and IQR for the row
  Q1 = values.quantile(0.25)
  Q3 = values.quantile(0.75)
  IQR = Q3 - Q1
  # Define outliers as values outside of Q1 - 1.5*IQR and Q3 + 1.5*IQR
  # (NaN fails between() but is a missing value, not an outlier, so it is excluded explicitly)
  outlier_condition = values.notna() & ~values.between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)
  # Calculate row median
  row_median = values.median()
  # Positional mask over the whole row; the leading string position (if any) is never replaced
  replace_mask = [False]*offset + outlier_condition.tolist()
  # mask() builds a new Series: no chained assignment (which is silently lost under pandas copy-on-write)
  # and no in-place mutation of the row handed in by DataFrame.apply
  return full_row.mask(replace_mask, row_median)
diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -20,7 +20,7 @@
 from glycowork.glycan_data.stats import (cohen_d, mahalanobis_distance, mahalanobis_variance,
                                          variance_stabilization, impute_and_normalize, variance_based_filtering,
                                          jtkdist, jtkinit, MissForest, jtkx, get_alphaN, TST_grouped_benjamini_hochberg,
-                                         test_inter_vs_intra_group)
+                                         test_inter_vs_intra_group, replace_outliers_with_median)
 from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
                                       group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type)
 from glycowork.motif.graph import subgraph_isomorphism
@@ -557,8 +557,9 @@ def get_differential_expression(df, group1, group2,
       group1 = [columns_list[k] for k in group1]
       group2 = [columns_list[k] for k in group2]
   df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0)
-  # Drop rows with all zero, followed by imputation & normalization
+  # Drop rows with all zero, followed by outlier removal and imputation & normalization
   df = df.loc[~(df == 0).all(axis = 1)]
+  df = df.apply(replace_outliers_with_median, axis = 1)
   df = impute_and_normalize(df, [group1, group2], impute = impute, min_samples = min_samples)
   # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
   alpha = get_alphaN(df.shape[1] - 1)

diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py
@@ -209,15 +209,17 @@ def quantify_motifs(df, glycans, feature_set):
     """Extracts and quantifies motifs for a dataset\n
     | Arguments:
     | :-
-    | df (dataframe): dataframe containing relative abundances (each sample one column) [alternative: filepath to .csv]
+    | df (dataframe): dataframe containing relative abundances (each sample one column) [alternative: filepath to .csv or .xlsx]
     | glycans(list): glycans as IUPAC-condensed strings
-    | feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
+    | feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), \
+    |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
+    |   'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
     | Returns:
     | :-
     | Returns a pandas DataFrame with motifs as columns and samples as rows
     """
     if isinstance(df, str):
-      df = pd.read_csv(df)
+      df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
     # Motif extraction
     df_motif = annotate_dataset(glycans,
                                 feature_set = feature_set,

diff --git a/glycowork/motif/draw.py b/glycowork/motif/draw.py
@@ -1927,16 +1927,15 @@ def GlycoDraw(draw_this, vertical = False, compact = False, show_linkage = True,
       floaty_bits.append(draw_this[openpos:closepos]+'blank')
       draw_this = draw_this[:openpos-1] + len(draw_this[openpos-1:closepos+1])*'*' + draw_this[closepos+1:]
   draw_this = draw_this.replace('*', '')
-
 
   if draw_this in motif_list.motif_name.values.tolist():
     draw_this = motif_list.loc[motif_list.motif_name == draw_this].motif.values.tolist()[0]
 
   try:
     data = get_coordinates_and_labels(draw_this, show_linkage = show_linkage, highlight_motif = highlight_motif, termini_list = highlight_termini_list)
   except:
-    return print('Error: did you enter a real glycan or motif?')
-    ys.exit(1)
+    print('Warning: did you enter a real glycan or motif?')
+    raise Exception
 
   main_sugar, main_sugar_x_pos, main_sugar_y_pos, main_sugar_modification, main_bond, main_conf, main_sugar_label, main_bond_label = data[0]
   branch_sugar, branch_x_pos, branch_y_pos, branch_sugar_modification, branch_bond, branch_connection, b_conf, branch_sugar_label, branch_bond_label = data[1]

diff --git a/glycowork/motif/mz_to_composition.csv b/glycowork/motif/mz_to_composition.csv
@@ -8,7 +8,7 @@ Neu5Gc,307.0903,307.2573,391.1842,391.4186,475.1326,475.4064
 Kdn,250.0689,250.2053,320.1472,320.3397,376.1006,376.3171
 HexA,176.03209,176.1259,218.079,218.2066,260.0532,260.2005
 HexN,161.068816,161.16044,,,,
-Acetonitrile,42.0106,42.0373,42.0106,42.0373,42.0106,42.0373
+Acetonitrile,41.026549,41.05195164,41.026549,41.05195164,41.026549,41.05195164
 Acetate,59.013851,59.044,59.013851,59.044,59.013851,59.044
 Formate,45.01703884,45.021,45.01703884,45.021,45.01703884,45.021
 Cl-,34.96885271,35.453,34.96885271,35.453,34.96885271,35.453