Skip to content

Commit

Permalink
add means and effect size to get_biodiversity + version bump
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Mar 14, 2024
1 parent f653387 commit 0434a71
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 22 deletions.
23 changes: 14 additions & 9 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,8 +583,8 @@ def get_differential_expression(df, group1, group2,
df_a, df_b = variance_stabilization(df_a), variance_stabilization(df_b)
if paired:
assert len(group1) == len(group2), "For paired samples, the size of group1 and group2 should be the same"
pvals = [ttest_rel(row_a, row_b)[1] if paired else ttest_ind(row_a, row_b, equal_var = False)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
levene_pvals = [levene(row_a, row_b)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
pvals = [ttest_rel(row_b, row_a)[1] if paired else ttest_ind(row_b, row_a, equal_var = False)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
levene_pvals = [levene(row_b, row_a)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
effect_sizes, variances = zip(*[cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)])
# Multiple testing correction
if pvals:
Expand Down Expand Up @@ -992,9 +992,12 @@ def get_biodiversity(df, group1, group2, motifs = False, feature_set = ['exhaust
| :-
| Returns a dataframe with:
| (i) Diversity indices/metrics
| (ii) Uncorrected p-values (Welch's t-test) for difference in mean
| (iii) Corrected p-values (Welch's t-test with Benjamini-Hochberg correction) for difference in mean
| (iv) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold
| (ii) Mean value of diversity metrics in group 1
| (iii) Mean value of diversity metrics in group 2
| (iv) Uncorrected p-values (Welch's t-test) for difference in mean
| (v) Corrected p-values (Welch's t-test with Benjamini-Hochberg correction) for difference in mean
| (vi) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold
| (vii) Effect size as Cohen's d
"""
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
Expand All @@ -1003,7 +1006,7 @@ def get_biodiversity(df, group1, group2, motifs = False, feature_set = ['exhaust
group1 = [columns_list[k] for k in group1]
group2 = [columns_list[k] for k in group2]
df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0)
# Drop rows with all zero, followed by outlier removal and imputation & normalization
# Drop rows with all zero, followed by outlier removal
df = df.loc[~(df == 0).all(axis = 1)]
df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
# Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
Expand All @@ -1018,15 +1021,17 @@ def get_biodiversity(df, group1, group2, motifs = False, feature_set = ['exhaust
unique_counts = df.iloc[:, 1:].apply(sequence_richness)
shannon_diversity = df.iloc[:, 1:].apply(shannon_diversity_index)
simpson_diversity = df.iloc[:, 1:].apply(simpson_diversity_index)
df_out = pd.DataFrame({'alpha_diversity': unique_counts,
df_out = pd.DataFrame({'richness': unique_counts,
'shannon_diversity': shannon_diversity, 'simpson_diversity': simpson_diversity}).T
df_a, df_b = df_out[group1], df_out[group2]
mean_a, mean_b = [np.mean(row_a) for row_a in df_a.values], [np.mean(row_b) for row_b in df_b.values]
if paired:
assert len(group1) == len(group2), "For paired samples, the size of group1 and group2 should be the same"
pvals = [ttest_rel(row_a, row_b)[1] if paired else ttest_ind(row_a, row_b, equal_var = False)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
pvals = [p if p > 0 and p < 1 else 1.0 for p in pvals]
corrpvals = multipletests(pvals, method = 'fdr_bh')[1]
significance = [p < alpha for p in corrpvals]
out = pd.DataFrame(list(zip(df_out.index.tolist(), pvals, corrpvals, significance)),
columns = ["Metric", "p-val", "corr p-val", "significant"])
effect_sizes, variances = zip(*[cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)])
out = pd.DataFrame(list(zip(df_out.index.tolist(), mean_a, mean_b, pvals, corrpvals, significance, effect_sizes)),
columns = ["Metric", "Group1 mean", "Group2 mean", "p-val", "corr p-val", "significant", "Effect size"])
return out.sort_values(by = 'p-val').sort_values(by = 'corr p-val')
2 changes: 1 addition & 1 deletion glycowork.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: glycowork
Version: 1.1.0
Version: 1.2.0
Summary: Package for processing and analyzing glycans
Home-page: https://github.com/BojarLab/glycowork
Author: Daniel Bojar
Expand Down
2 changes: 1 addition & 1 deletion glycowork/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.0.1"
__version__ = "1.2.0"
#from .glycowork import *

__all__ = ['ml', 'motif', 'glycan_data', 'network']
23 changes: 14 additions & 9 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,8 +583,8 @@ def get_differential_expression(df, group1, group2,
df_a, df_b = variance_stabilization(df_a), variance_stabilization(df_b)
if paired:
assert len(group1) == len(group2), "For paired samples, the size of group1 and group2 should be the same"
pvals = [ttest_rel(row_a, row_b)[1] if paired else ttest_ind(row_a, row_b, equal_var = False)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
levene_pvals = [levene(row_a, row_b)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
pvals = [ttest_rel(row_b, row_a)[1] if paired else ttest_ind(row_b, row_a, equal_var = False)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
levene_pvals = [levene(row_b, row_a)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
effect_sizes, variances = zip(*[cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)])
# Multiple testing correction
if pvals:
Expand Down Expand Up @@ -992,9 +992,12 @@ def get_biodiversity(df, group1, group2, motifs = False, feature_set = ['exhaust
| :-
| Returns a dataframe with:
| (i) Diversity indices/metrics
| (ii) Uncorrected p-values (Welch's t-test) for difference in mean
| (iii) Corrected p-values (Welch's t-test with Benjamini-Hochberg correction) for difference in mean
| (iv) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold
| (ii) Mean value of diversity metrics in group 1
| (iii) Mean value of diversity metrics in group 2
| (iv) Uncorrected p-values (Welch's t-test) for difference in mean
| (v) Corrected p-values (Welch's t-test with Benjamini-Hochberg correction) for difference in mean
| (vi) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold
| (vii) Effect size as Cohen's d
"""
if isinstance(df, str):
df = pd.read_csv(df) if df.endswith(".csv") else pd.read_excel(df)
Expand All @@ -1003,7 +1006,7 @@ def get_biodiversity(df, group1, group2, motifs = False, feature_set = ['exhaust
group1 = [columns_list[k] for k in group1]
group2 = [columns_list[k] for k in group2]
df = df.loc[:, [df.columns.tolist()[0]]+group1+group2].fillna(0)
# Drop rows with all zero, followed by outlier removal and imputation & normalization
# Drop rows with all zero, followed by outlier removal
df = df.loc[~(df == 0).all(axis = 1)]
df = df.apply(replace_outliers_with_IQR_bounds, axis = 1)
# Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment
Expand All @@ -1018,15 +1021,17 @@ def get_biodiversity(df, group1, group2, motifs = False, feature_set = ['exhaust
unique_counts = df.iloc[:, 1:].apply(sequence_richness)
shannon_diversity = df.iloc[:, 1:].apply(shannon_diversity_index)
simpson_diversity = df.iloc[:, 1:].apply(simpson_diversity_index)
df_out = pd.DataFrame({'alpha_diversity': unique_counts,
df_out = pd.DataFrame({'richness': unique_counts,
'shannon_diversity': shannon_diversity, 'simpson_diversity': simpson_diversity}).T
df_a, df_b = df_out[group1], df_out[group2]
mean_a, mean_b = [np.mean(row_a) for row_a in df_a.values], [np.mean(row_b) for row_b in df_b.values]
if paired:
assert len(group1) == len(group2), "For paired samples, the size of group1 and group2 should be the same"
pvals = [ttest_rel(row_a, row_b)[1] if paired else ttest_ind(row_a, row_b, equal_var = False)[1] for row_a, row_b in zip(df_a.values, df_b.values)]
pvals = [p if p > 0 and p < 1 else 1.0 for p in pvals]
corrpvals = multipletests(pvals, method = 'fdr_bh')[1]
significance = [p < alpha for p in corrpvals]
out = pd.DataFrame(list(zip(df_out.index.tolist(), pvals, corrpvals, significance)),
columns = ["Metric", "p-val", "corr p-val", "significant"])
effect_sizes, variances = zip(*[cohen_d(row_b, row_a, paired = paired) for row_a, row_b in zip(df_a.values, df_b.values)])
out = pd.DataFrame(list(zip(df_out.index.tolist(), mean_a, mean_b, pvals, corrpvals, significance, effect_sizes)),
columns = ["Metric", "Group1 mean", "Group2 mean", "p-val", "corr p-val", "significant", "Effect size"])
return out.sort_values(by = 'p-val').sort_values(by = 'corr p-val')
2 changes: 1 addition & 1 deletion settings.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ lib_name = glycowork
repo_name = glycowork
host = github
user = BojarLab
version = 1.0.1
version = 1.2.0
min_python = 3.8
audience = Developers
custom_sidebar = False
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="glycowork",
version="1.1.0",
version="1.2.0",
author="Daniel Bojar",
author_email="daniel.bojar@gu.se",
description="Package for processing and analyzing glycans",
Expand Down

0 comments on commit 0434a71

Please sign in to comment.