Skip to content

Commit

Permalink
rename columns post abc2, tests
Browse files Browse the repository at this point in the history
  • Loading branch information
colinvwood committed Jan 8, 2025
1 parent a6e8ea4 commit 4b888aa
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 9 deletions.
43 changes: 43 additions & 0 deletions q2_composition/_ancombc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ def ancombc2(

slices['structural_zeros'] = structural_zeros_df

# rename columns to original names
slices = _rename_columns(slices, metadata)

# split categorical variables from levels and append reference where needed

return transform(data=slices, to_type=ANCOMBC2OutputDirFmt)


Expand Down Expand Up @@ -499,3 +504,41 @@ def _split_into_slices(model_statistics: pd.DataFrame) -> ANCOMBC2SliceMapping:
slices[slice_name] = slice_df

return slices


def _rename_columns(
    slices: ANCOMBC2SliceMapping, metadata: qiime2.Metadata
) -> ANCOMBC2SliceMapping:
    '''
    Renames any variables in the ANCOMBC2 output that were renamed to their
    equivalent R-style identifiers back to their original names.

    A slice column is treated as belonging to a metadata variable when it
    starts with that variable's R-style identifier; only that prefix is
    replaced and the remainder of the column name is kept verbatim. When
    several identifiers are prefixes of the same column, the longest one
    wins.

    Parameters
    ----------
    slices : ANCOMBC2SliceMapping
        The raw slices as transformed from ANCOMBC2's output. The contained
        dataframes are renamed in place.
    metadata : qiime2.Metadata
        The per-sample metadata containing the original variable names.

    Returns
    -------
    ANCOMBC2SliceMapping
        The same mapping object, with any R-style identifiers renamed.
    '''
    # create a mapping from r-style identifiers to original identifiers;
    # unchanged names are kept too so that they take part in the
    # longest-prefix matching below
    r_names = {}
    for column in metadata.columns:
        r_name = r_base.make_names(column)[0]
        r_names[r_name] = column

    # longest identifiers first, so that an identifier which is a prefix of
    # another one (e.g. `year` vs. `year.of.birth`) cannot claim its columns
    prefixes = sorted(r_names, key=len, reverse=True)

    for slice_df in slices.values():
        renames = {}
        for slice_column in slice_df.columns:
            for r_name in prefixes:
                if not slice_column.startswith(r_name):
                    continue

                # slice the prefix off by length: `str.lstrip` would treat
                # `r_name` as a set of characters and could also eat the
                # leading characters of the remainder
                renamed = r_names[r_name] + slice_column[len(r_name):]
                if renamed != slice_column:
                    renames[slice_column] = renamed
                break

        if renames:
            slice_df.rename(columns=renames, inplace=True)

    return slices
71 changes: 62 additions & 9 deletions q2_composition/tests/test_ancombc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from qiime2.plugin.util import transform

from q2_composition._ancombc2 import (
r_base, ancombc2, _process_formula, _convert_metadata, _split_into_slices
r_base, ancombc2, _process_formula, _convert_metadata, _split_into_slices,
_rename_columns,
)
from q2_composition._format import ANCOMBC2SliceMapping

Expand Down Expand Up @@ -61,19 +62,26 @@ def test_wrapped_ancombc2(self):
as when it is called in R. The `r-model-statistics.tsv` and
`r-structural-zeros.tsv` files were obtained by running ANCOMBC2 in R
using the moving pictures tutorial data.
Note: the `_rename_columns` function is patched so that column names
are shared between the R output and the wrapper's output.
'''
model_stats_fp = self.test_data_fp / 'r-model-statistics.tsv'
ground_truth_model_stats = pd.read_csv(model_stats_fp, sep='\t')
structural_zeros_fp = self.test_data_fp / 'r-structural-zeros.tsv'
ground_truth_struc_zeros = pd.read_csv(structural_zeros_fp, sep='\t')

output_format = ancombc2(
table=self.biom_table,
metadata=self.metadata,
fixed_effects_formula='body-site + year',
group='body-site',
structural_zeros=True
)
with unittest.mock.patch(
'q2_composition._ancombc2._rename_columns',
side_effect=lambda slices, metadata: slices
):
output_format = ancombc2(
table=self.biom_table,
metadata=self.metadata,
fixed_effects_formula='body-site + year',
group='body-site',
structural_zeros=True
)

slices = transform(data=output_format, to_type=ANCOMBC2SliceMapping)
model_stats = self._slices_to_single_df(slices)
Expand Down Expand Up @@ -261,6 +269,11 @@ def test_variable_type_conversion(self):

class TestANCOMBC2Helpers(TestANCOMBC2Base):
def test_split_into_slices(self):
'''
Tests that a single model statistics table as returned by ANCOMBC2 is
properly converted to per-statistics slices.
'''
# non-overlapping prefixes
df = pd.DataFrame({
'taxon': ['feature1', 'feature2', 'feature3'],
'lfc_variable.1': [0.2, 0.9, 0.1],
Expand All @@ -287,7 +300,7 @@ def test_split_into_slices(self):
assert_frame_equal(exp['lfc'], obs['lfc'])
assert_frame_equal(exp['se'], obs['se'])

def test_split_into_slices_overlapping_prefixes(self):
# overlapping prefixes
df = pd.DataFrame({
'taxon': ['feature1', 'feature2', 'feature3'],
'p_variable.1': [0.2, 0.9, 0.1],
Expand All @@ -313,3 +326,43 @@ def test_split_into_slices_overlapping_prefixes(self):

assert_frame_equal(exp['p'], obs['p'])
assert_frame_equal(exp['passed_ss'], obs['passed_ss'])

def test_rename_columns(self):
    '''
    Checks that slice columns carrying the R-style form of a metadata
    variable name are given the original identifier back, and that columns
    whose names need no change are left alone.
    '''
    taxa = ['feature1', 'feature2', 'feature3']
    lfc_values = (taxa, [0.2, 0.9, 0.1], [-0.4, 0.0, -0.3], [0.33, 0.2, 0.8])
    se_values = (taxa, [0.4, 0.02, 0.1], [0.04, 0.0, 0.3], [0.33, 0.2, 0.8])

    # column names as they appear in the R output vs. after renaming
    r_style_names = (
        'taxon', 'body.sitevariable.1', 'body.sitevariable.2', 'year'
    )
    original_names = (
        'taxon', 'body-sitevariable.1', 'body-sitevariable.2', 'year'
    )

    def build_frame(names, values):
        # pair each column name with its values, preserving column order
        return pd.DataFrame(dict(zip(names, values)))

    slices = ANCOMBC2SliceMapping(
        lfc=build_frame(r_style_names, lfc_values),
        se=build_frame(r_style_names, se_values),
    )
    exp = ANCOMBC2SliceMapping(
        lfc=build_frame(original_names, lfc_values),
        se=build_frame(original_names, se_values),
    )

    obs = _rename_columns(slices, self.metadata)

    assert_frame_equal(exp['lfc'], obs['lfc'])
    assert_frame_equal(exp['se'], obs['se'])

0 comments on commit 4b888aa

Please sign in to comment.