Skip to content

Commit

Permalink
Resolve ambiguities (#90)
Browse files Browse the repository at this point in the history
* TST: tests for resolving ambiguities

* BUG: ambiguity map did not correspond to doc expectations

* ENH: Methods to resolve ambiguity

* API: allow resolution of ambiguities

* API: allow reduction of ambiguities in output metadata

* Revert an accidental change
  • Loading branch information
wasade authored and antgonza committed Nov 5, 2019
1 parent 5acae76 commit f12133c
Show file tree
Hide file tree
Showing 3 changed files with 238 additions and 9 deletions.
62 changes: 57 additions & 5 deletions redbiom/commands/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,28 @@ def fetch_features_contained(context):
@click.option('--tagged', is_flag=True, default=False,
help=("Obtain the tag specific metadata (e.g., preparation "
"information)."))
@click.option('--resolve-ambiguities', is_flag=True, default=False,
help=("Output unambiguous identifiers only. This option is "
"incompatible with --tagged."))
@click.option('--force-category', type=str, required=False, multiple=True,
help=("Force the output to include specific metadata variables "
"if the metadata variable was observed in any of the "
"samples. This can be specified mulitple times for "
"multiple categories."))
@click.argument('samples', nargs=-1)
def fetch_sample_metadata(from_, samples, all_columns, context, output,
tagged, force_category):
tagged, force_category, resolve_ambiguities):
"""Retreive sample metadata."""
import redbiom.util
iterator = redbiom.util.from_or_nargs(from_, samples)
if resolve_ambiguities and tagged:
    # --resolve-ambiguities is documented as incompatible with --tagged,
    # so report the conflict on stderr and stop with a failing status.
    click.echo("Cannot resolve ambiguities and fetch tagged metadata",
               err=True)
    # click exposes no module-level ``exit``; exiting goes through the
    # active command context.
    click.get_current_context().exit(1)

import redbiom.util
import redbiom.fetch
import pandas as pd

iterator = redbiom.util.from_or_nargs(from_, samples)

if not force_category:
force_category = None
Expand All @@ -84,6 +93,28 @@ def fetch_sample_metadata(from_, samples, all_columns, context, output,
restrict_to=force_category,
tagged=tagged)

if resolve_ambiguities:
md.set_index('#SampleID', inplace=True)

# a temporary key to use when resolving ambiguities
# that will be removed before writing the metadata
key = "__@@AMBIGUITY@@__"

# add ambiguity information into the frame
ambigs = pd.Series(map_)
ambigs = ambigs.loc[md.index]
md[key] = ambigs

# remove duplicated unambiguous identifiers
md = md[~md[key].duplicated()]

# remove our index, and replace the entries with the ambiguous names
md.reset_index(inplace=True)
md['#SampleID'] = md[key]

# cleanup
md.drop(columns=key, inplace=True)

md.to_csv(output, sep='\t', header=True, index=False, encoding='utf-8')

_write_ambig(map_, output)
Expand All @@ -103,9 +134,14 @@ def fetch_sample_metadata(from_, samples, all_columns, context, output,
help="Calculate and use MD5 for the features. This will also "
"save a tsv file with the original feature name and the md5",
default=False)
@click.option('--resolve-ambiguities', required=False,
type=click.Choice(['merge', 'most-reads']), default=None,
help=("Resolve ambiguities that may be present in the samples "
"which can arise from, for example, technical "
"replicates."))
@click.argument('features', nargs=-1)
def fetch_samples_from_obserations(features, exact, from_, output,
context, md5):
context, md5, resolve_ambiguities):
"""Fetch sample data containing features."""
import redbiom.util
iterable = redbiom.util.from_or_nargs(from_, features)
Expand All @@ -118,6 +154,11 @@ def fetch_samples_from_obserations(features, exact, from_, output,
with open(output + '.tsv', 'w') as f:
f.write('\n'.join(['\t'.join(x) for x in new_ids.items()]))

# Dispatch each --resolve-ambiguities choice to the helper of the same
# meaning: 'merge' combines the ambiguous samples, 'most-reads' keeps only
# the ambiguous sample with the most reads.
if resolve_ambiguities == 'merge':
    tab = redbiom.fetch._ambiguity_merge(tab, map_)
elif resolve_ambiguities == 'most-reads':
    tab = redbiom.fetch._ambiguity_keep_most_reads(tab, map_)

import h5py
with h5py.File(output, 'w') as fp:
tab.to_hdf5(fp, 'redbiom')
Expand All @@ -137,8 +178,14 @@ def fetch_samples_from_obserations(features, exact, from_, output,
help="Calculate and use MD5 for the features. This will also "
"save a tsv file with the original feature name and the md5",
default=False)
@click.option('--resolve-ambiguities', required=False,
type=click.Choice(['merge', 'most-reads']), default=None,
help=("Resolve ambiguities that may be present in the samples "
"which can arise from, for example, technical "
"replicates."))
@click.argument('samples', nargs=-1)
def fetch_samples_from_samples(samples, from_, output, context, md5):
def fetch_samples_from_samples(samples, from_, output, context, md5,
resolve_ambiguities):
"""Fetch sample data."""
import redbiom.util
iterable = redbiom.util.from_or_nargs(from_, samples)
Expand All @@ -151,6 +198,11 @@ def fetch_samples_from_samples(samples, from_, output, context, md5):
with open(output + '.tsv', 'w') as f:
f.write('\n'.join(['\t'.join(x) for x in new_ids.items()]))

# Dispatch each --resolve-ambiguities choice to the helper of the same
# meaning: 'merge' combines the ambiguous samples, 'most-reads' keeps only
# the ambiguous sample with the most reads.
if resolve_ambiguities == 'merge':
    table = redbiom.fetch._ambiguity_merge(table, ambig)
elif resolve_ambiguities == 'most-reads':
    table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig)

import h5py
with h5py.File(output, 'w') as fp:
table.to_hdf5(fp, 'redbiom')
Expand Down
101 changes: 99 additions & 2 deletions redbiom/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,16 @@ def sample_metadata(samples, common=True, context=None, restrict_to=None,
samples = untagged + tagged_clean

# resolve ambiguities
ambig_map = {}
if context is not None:
_, _, ambig_assoc, rbid_map = \
redbiom.util.resolve_ambiguities(context, samples, get)

if tagged:
ambig_assoc = {rbid: [rbid] for rbid in rbid_map}
ambig_map = {rbid: rbid for rbid in rbid_map}
else:
ambig_map = {v: k.split('_', 1)[1] for k, v in rbid_map.items()}
else:
ambig_assoc = {k: [k] for k in samples}

Expand Down Expand Up @@ -235,7 +239,7 @@ def sample_metadata(samples, common=True, context=None, restrict_to=None,
new_ids.append("%s.%s" % (id_, tag))
md['#SampleID'] = new_ids

return md, ambig_assoc
return md, ambig_map


def data_from_features(context, features, exact):
Expand Down Expand Up @@ -381,7 +385,12 @@ def _biom_from_samples(context, samples, get=None, normalize_taxonomy=None):
table = biom.Table(mat, obs_ids, sample_ids, obs_md)
table.update_ids(rimap)

return table, ambig_assoc
ambiguity_map = {}
for k, v in rimap.items():
tag, id_ = k.split('_', 1)
ambiguity_map[v] = id_

return table, ambiguity_map


def taxon_ancestors(context, ids, get=None, normalize=None):
Expand Down Expand Up @@ -733,3 +742,91 @@ def get_sample_values(samples, category, get=None):
multikey=key)

return [item for chunk in getter for item in zip(*chunk)]


def _ambiguity_merge(table, collapse_map):
    """Merge ambiguous samples

    Parameters
    ----------
    table : biom.Table
        The table obtained from redbiom
    collapse_map : dict
        A mapping of a sample ID in the table to its collapse
        target name.

    Raises
    ------
    ValueError
        If the IDs present in the table are not a perfect match to the keys
        of the collapse map.

    Returns
    -------
    biom.Table
        A table of the merged data with updated sample identifiers
    """
    if set(collapse_map) != set(table.ids()):
        raise ValueError("IDs are inconsistent")

    def collapser(id_, md):
        # samples are grouped purely by ID; per-sample metadata is ignored
        return collapse_map[id_]

    # norm=False is passed through unchanged from the original call;
    # samples sharing a collapse target end up as a single sample.
    return table.collapse(collapser, axis='sample', norm=False)


def _ambiguity_keep_most_reads(table, ambig_map):
    """Keep the ambiguous sample with the most reads

    Parameters
    ----------
    table : biom.Table
        The table obtained from redbiom
    ambig_map : dict
        A mapping of a sample ID in the table to its ambiguous form.

    Raises
    ------
    ValueError
        If the IDs present in the table are not a perfect match to the keys
        of the ambiguity map.

    Returns
    -------
    biom.Table
        A table of the most voluminous data with updated sample identifiers
    """
    from collections import defaultdict

    import pandas as pd

    if set(ambig_map) != set(table.ids()):
        raise ValueError("IDs are inconsistent")

    sample_counts = pd.Series(table.sum('sample'), index=table.ids()).to_dict()

    # group the table's sample IDs by the ambiguous name they map to
    ambigs = defaultdict(list)
    for sample_id, ambig_name in ambig_map.items():
        ambigs[ambig_name].append(sample_id)

    # max() returns the first maximal element, so on a tie the ID that
    # appears first in ambig_map is the one retained
    to_keep = {max(sample_ids, key=sample_counts.__getitem__)
               for sample_ids in ambigs.values()}

    subset_table = table.filter(to_keep, inplace=False).remove_empty()
    # ambig_map still holds the discarded IDs; only the retained ones are
    # present in subset_table to be renamed
    subset_table.update_ids(ambig_map, inplace=True)

    return subset_table
84 changes: 82 additions & 2 deletions redbiom/tests/test_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import requests
from future.moves.itertools import zip_longest

import numpy as np
import biom
import pandas as pd
import pandas.util.testing as pdt
Expand All @@ -10,7 +11,9 @@
import redbiom.fetch
from redbiom.fetch import (_biom_from_samples, sample_metadata,
samples_in_context, features_in_context,
sample_counts_per_category, get_sample_values)
sample_counts_per_category, get_sample_values,
_ambiguity_keep_most_reads, _ambiguity_merge)

from redbiom.tests import assert_test_env

assert_test_env()
Expand Down Expand Up @@ -110,7 +113,7 @@ def test_biom_from_samples(self):

fetch = exp.ids()[:]

exp_map = {k: ["UNTAGGED_%s" % k] for k in exp.ids()}
exp_map = {"%s.UNTAGGED" % k: k for k in exp.ids()}
exp.update_ids({k: "%s.UNTAGGED" % k for k in exp.ids()})

obs, obs_map = _biom_from_samples('test', fetch,
Expand Down Expand Up @@ -252,6 +255,7 @@ def test_sample_metadata_all_cols(self):
exp = metadata.copy()
exp.set_index('#SampleID', inplace=True)
obs, ambig = sample_metadata(table.ids(), common=False)

obs.set_index('#SampleID', inplace=True)
self.assertEqual(sorted(exp.index), sorted(obs.index))
self.assertTrue(set(obs.columns).issubset(exp.columns))
Expand Down Expand Up @@ -355,6 +359,82 @@ def test_sample_counts_per_category_specific(self):
self.assertEqual(obs['LATITUDE'], 10)
self.assertEqual(obs['LONGITUDE'], 10)

def test_ambiguity_merge(self):
    """Samples sharing a collapse target are summed into one sample."""
    sample_ids = ['10317.1234.foo',
                  '10317.1234.bar',
                  '10317.4321.foo',
                  '10317.1234.baz']
    obs_ids = ['O1', 'O2', 'O3']

    # every ID collapses to itself minus the trailing ".<suffix>"
    ambig_map = {sid: sid.rsplit('.', 1)[0] for sid in sample_ids}

    counts = np.array([[0, 1, 2, 3],
                       [4, 5, 6, 7],
                       [8, 9, 10, 11]])
    table = biom.Table(counts, obs_ids, sample_ids)

    # columns foo + bar + baz are summed for 10317.1234
    expected = biom.Table(np.array([[4, 2], [16, 6], [28, 10]]),
                          obs_ids,
                          ['10317.1234', '10317.4321'])

    observed = _ambiguity_merge(table, ambig_map)
    observed.del_metadata()
    observed = observed.sort_order(expected.ids())
    self.assertEqual(observed, expected)

def test_ambiguity_merge_mismatch(self):
    """A map that lacks one of the table's IDs is rejected."""
    sample_ids = ['10317.1234.foo',
                  '10317.1234.bar',
                  '10317.4321.foo',
                  '10317.1234.baz']

    # '10317.1234.bar' is deliberately absent from the map
    ambig_map = {sid: sid.rsplit('.', 1)[0]
                 for sid in sample_ids if sid != '10317.1234.bar'}

    counts = np.array([[0, 1, 2, 3],
                       [4, 5, 6, 7],
                       [8, 9, 10, 11]])
    table = biom.Table(counts, ['O1', 'O2', 'O3'], sample_ids)

    with self.assertRaisesRegex(ValueError, "IDs are inconsistent"):
        _ambiguity_merge(table, ambig_map)

def test_ambiguity_keep_most_reads(self):
    """Only the highest-count sample of each ambiguous group survives."""
    sample_ids = ['10317.1234.foo',
                  '10317.1234.bar',
                  '10317.4321.foo',
                  '10317.1234.baz']
    obs_ids = ['O1', 'O2', 'O3']

    # every ID maps to itself minus the trailing ".<suffix>"
    ambig_map = {sid: sid.rsplit('.', 1)[0] for sid in sample_ids}

    counts = np.array([[0, 3, 2, 1],
                       [4, 7, 6, 5],
                       [8, 11, 10, 9]])
    table = biom.Table(counts, obs_ids, sample_ids)

    # '10317.1234.bar' has the largest column total of its group
    expected = biom.Table(np.array([[3, 2], [7, 6], [11, 10]]),
                          obs_ids,
                          ['10317.1234', '10317.4321'])

    observed = _ambiguity_keep_most_reads(table, ambig_map)
    self.assertEqual(observed, expected)

def test_ambiguity_keep_most_reads_mismatch(self):
    """A map that lacks one of the table's IDs is rejected."""
    sample_ids = ['10317.1234.foo',
                  '10317.1234.bar',
                  '10317.4321.foo',
                  '10317.1234.baz']

    # '10317.1234.bar' is deliberately absent from the map
    ambig_map = {sid: sid.rsplit('.', 1)[0]
                 for sid in sample_ids if sid != '10317.1234.bar'}

    counts = np.array([[0, 3, 2, 1],
                       [4, 7, 6, 5],
                       [8, 11, 10, 9]])
    table = biom.Table(counts, ['O1', 'O2', 'O3'], sample_ids)

    with self.assertRaisesRegex(ValueError, "IDs are inconsistent"):
        _ambiguity_keep_most_reads(table, ambig_map)


if __name__ == '__main__':
unittest.main()

0 comments on commit f12133c

Please sign in to comment.