Skip to content

Commit

Permalink
add replace_outliers_with_median and start adding outlier handling in…
Browse files Browse the repository at this point in the history
… .analysis
  • Loading branch information
Bribak committed Mar 4, 2024
1 parent 1a95a27 commit 49e9909
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 17 deletions.
5 changes: 3 additions & 2 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from glycowork.glycan_data.stats import (cohen_d, mahalanobis_distance, mahalanobis_variance,
variance_stabilization, impute_and_normalize, variance_based_filtering,
jtkdist, jtkinit, MissForest, jtkx, get_alphaN, TST_grouped_benjamini_hochberg,
test_inter_vs_intra_group)
test_inter_vs_intra_group, replace_outliers_with_median)
from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type)
from glycowork.motif.graph import subgraph_isomorphism
Expand Down Expand Up @@ -557,8 +557,9 @@ def get_differential_expression(df, group1, group2,
group1 = [columns_list[k] for k in group1]
group2 = [columns_list[k] for k in group2]
df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0)
# Drop rows with all zero, followed by imputation & normalization
# Drop rows with all zero, followed by outlier removal and imputation & normalization
df = df.loc[~(df == 0).all(axis = 1)]
df = df.apply(replace_outliers_with_median, axis = 1)
df = impute_and_normalize(df, [group1, group2], impute = impute, min_samples = min_samples)
# Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
alpha = get_alphaN(df.shape[1] - 1)
Expand Down
8 changes: 5 additions & 3 deletions build/lib/glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,15 +209,17 @@ def quantify_motifs(df, glycans, feature_set):
"""Extracts and quantifies motifs for a dataset\n
| Arguments:
| :-
| df (dataframe): dataframe containing relative abundances (each sample one column) [alternative: filepath to .csv]
| df (dataframe): dataframe containing relative abundances (each sample one column) [alternative: filepath to .csv or .xlsx]
| glycans(list): glycans as IUPAC-condensed strings
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), \
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
| 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
| Returns:
| :-
| Returns a pandas DataFrame with motifs as columns and samples as rows
"""
if isinstance(df, str):
df = pd.read_csv(df)
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
# Motif extraction
df_motif = annotate_dataset(glycans,
feature_set = feature_set,
Expand Down
5 changes: 2 additions & 3 deletions build/lib/glycowork/motif/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -1927,16 +1927,15 @@ def GlycoDraw(draw_this, vertical = False, compact = False, show_linkage = True,
floaty_bits.append(draw_this[openpos:closepos]+'blank')
draw_this = draw_this[:openpos-1] + len(draw_this[openpos-1:closepos+1])*'*' + draw_this[closepos+1:]
draw_this = draw_this.replace('*', '')


if draw_this in motif_list.motif_name.values.tolist():
draw_this = motif_list.loc[motif_list.motif_name == draw_this].motif.values.tolist()[0]

try:
data = get_coordinates_and_labels(draw_this, show_linkage = show_linkage, highlight_motif = highlight_motif, termini_list = highlight_termini_list)
except:
return print('Error: did you enter a real glycan or motif?')
ys.exit(1)
print('Warning: did you enter a real glycan or motif?')
raise Exception

main_sugar, main_sugar_x_pos, main_sugar_y_pos, main_sugar_modification, main_bond, main_conf, main_sugar_label, main_bond_label = data[0]
branch_sugar, branch_x_pos, branch_y_pos, branch_sugar_modification, branch_bond, branch_connection, b_conf, branch_sugar_label, branch_bond_label = data[1]
Expand Down
27 changes: 27 additions & 0 deletions glycowork/glycan_data/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,3 +643,30 @@ def test_inter_vs_intra_group(cohort_b, cohort_a, glycans, grouped_glycans, pair
# Calculate Inter-group Correlation
inter_group_corr = var_samples / total_var
return icc, inter_group_corr


def replace_outliers_with_median(full_row):
    """replaces outlier values with row median\n
    | Arguments:
    | :-
    | full_row (pd.DataFrame row): row from a pandas dataframe, with all but possibly the first value being numerical\n
    | Returns:
    | :-
    | Returns a new row in which values outside of [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are replaced by the row median; the input row is not modified
    """
    # A leading string is treated as a label and is excluded from all statistics
    has_label = isinstance(full_row.iloc[0], str)
    # The numerical part of a labelled row is object-typed; cast it so quantiles/median operate on floats
    values = full_row.iloc[1:].astype(float) if has_label else full_row
    # Calculate Q1, Q3, and IQR for the row
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    # Values within Q1 - 1.5*IQR and Q3 + 1.5*IQR are kept (missing values count as outliers)
    within_bounds = values.between(q1 - 1.5*iqr, q3 + 1.5*iqr)
    # Replace outliers with the row median; .where returns a new Series and upcasts if needed,
    # avoiding chained assignment (a silent no-op under pandas Copy-on-Write)
    cleaned = values.where(within_bounds, values.median())
    if not has_label:
        return cleaned
    # Re-attach the label by writing the cleaned values positionally into a copy of the input
    result = full_row.copy()
    result.iloc[1:] = cleaned.to_numpy()
    return result
5 changes: 3 additions & 2 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from glycowork.glycan_data.stats import (cohen_d, mahalanobis_distance, mahalanobis_variance,
variance_stabilization, impute_and_normalize, variance_based_filtering,
jtkdist, jtkinit, MissForest, jtkx, get_alphaN, TST_grouped_benjamini_hochberg,
test_inter_vs_intra_group)
test_inter_vs_intra_group, replace_outliers_with_median)
from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type)
from glycowork.motif.graph import subgraph_isomorphism
Expand Down Expand Up @@ -557,8 +557,9 @@ def get_differential_expression(df, group1, group2,
group1 = [columns_list[k] for k in group1]
group2 = [columns_list[k] for k in group2]
df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0)
# Drop rows with all zero, followed by imputation & normalization
# Drop rows with all zero, followed by outlier removal and imputation & normalization
df = df.loc[~(df == 0).all(axis = 1)]
df = df.apply(replace_outliers_with_median, axis = 1)
df = impute_and_normalize(df, [group1, group2], impute = impute, min_samples = min_samples)
# Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
alpha = get_alphaN(df.shape[1] - 1)
Expand Down
8 changes: 5 additions & 3 deletions glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,15 +209,17 @@ def quantify_motifs(df, glycans, feature_set):
"""Extracts and quantifies motifs for a dataset\n
| Arguments:
| :-
| df (dataframe): dataframe containing relative abundances (each sample one column) [alternative: filepath to .csv]
| df (dataframe): dataframe containing relative abundances (each sample one column) [alternative: filepath to .csv or .xlsx]
| glycans(list): glycans as IUPAC-condensed strings
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is ['exhaustive','known']; options are: 'known' (hand-crafted glycan features), \
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
| 'terminal2' (non-reducing end motifs of size 2), 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
| Returns:
| :-
| Returns a pandas DataFrame with motifs as columns and samples as rows
"""
if isinstance(df, str):
df = pd.read_csv(df)
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
# Motif extraction
df_motif = annotate_dataset(glycans,
feature_set = feature_set,
Expand Down
5 changes: 2 additions & 3 deletions glycowork/motif/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -1927,16 +1927,15 @@ def GlycoDraw(draw_this, vertical = False, compact = False, show_linkage = True,
floaty_bits.append(draw_this[openpos:closepos]+'blank')
draw_this = draw_this[:openpos-1] + len(draw_this[openpos-1:closepos+1])*'*' + draw_this[closepos+1:]
draw_this = draw_this.replace('*', '')


if draw_this in motif_list.motif_name.values.tolist():
draw_this = motif_list.loc[motif_list.motif_name == draw_this].motif.values.tolist()[0]

try:
data = get_coordinates_and_labels(draw_this, show_linkage = show_linkage, highlight_motif = highlight_motif, termini_list = highlight_termini_list)
except:
return print('Error: did you enter a real glycan or motif?')
ys.exit(1)
print('Warning: did you enter a real glycan or motif?')
raise Exception

main_sugar, main_sugar_x_pos, main_sugar_y_pos, main_sugar_modification, main_bond, main_conf, main_sugar_label, main_bond_label = data[0]
branch_sugar, branch_x_pos, branch_y_pos, branch_sugar_modification, branch_bond, branch_connection, b_conf, branch_sugar_label, branch_bond_label = data[1]
Expand Down
2 changes: 1 addition & 1 deletion glycowork/motif/mz_to_composition.csv
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Neu5Gc,307.0903,307.2573,391.1842,391.4186,475.1326,475.4064
Kdn,250.0689,250.2053,320.1472,320.3397,376.1006,376.3171
HexA,176.03209,176.1259,218.079,218.2066,260.0532,260.2005
HexN,161.068816,161.16044,,,,
Acetonitrile,42.0106,42.0373,42.0106,42.0373,42.0106,42.0373
Acetonitrile,41.026549,41.05195164,41.026549,41.05195164,41.026549,41.05195164
Acetate,59.013851,59.044,59.013851,59.044,59.013851,59.044
Formate,45.01703884,45.021,45.01703884,45.021,45.01703884,45.021
Cl-,34.96885271,35.453,34.96885271,35.453,34.96885271,35.453
Expand Down

0 comments on commit 49e9909

Please sign in to comment.