Resolve ambiguities (#90)

* TST: tests for resolving ambiguities * BUG: ambiguity map did not correspond to doc expectations * ENH: Methods to resolve ambiguity * API: allow resolution of ambiguities * API: allow reduction of ambiguities in output metadata * Revert an accidental change
biocore · Nov 5, 2019 · f12133c · f12133c
1 parent 5acae76
commit f12133c
Show file tree

Hide file tree

Showing 3 changed files with 238 additions and 9 deletions.
diff --git a/redbiom/commands/fetch.py b/redbiom/commands/fetch.py
@@ -62,19 +62,28 @@ def fetch_features_contained(context):
 @click.option('--tagged', is_flag=True, default=False,
               help=("Obtain the tag specific metadata (e.g., preparation "
                     "information)."))
+@click.option('--resolve-ambiguities', is_flag=True, default=False,
+              help=("Output unambiguous identifiers only. This option is "
+                    "incompatible with --tagged."))
 @click.option('--force-category', type=str, required=False, multiple=True,
               help=("Force the output to include specific metadata variables "
                     "if the metadata variable was observed in any of the "
                     "samples. This can be specified mulitple times for "
                     "multiple categories."))
 @click.argument('samples', nargs=-1)
 def fetch_sample_metadata(from_, samples, all_columns, context, output,
-                          tagged, force_category):
+                          tagged, force_category, resolve_ambiguities):
     """Retreive sample metadata."""
-    import redbiom.util
-    iterator = redbiom.util.from_or_nargs(from_, samples)
+    # --resolve-ambiguities is declared incompatible with --tagged in its
+    # option help, so refuse the combination before doing any work
+    if resolve_ambiguities and tagged:
+        import sys
+
+        click.echo("Cannot resolve ambiguities and fetch tagged metadata",
+                   err=True)
+        # the click module has no top-level exit(); calling it would raise
+        # AttributeError rather than terminate with a non-zero status
+        sys.exit(1)
 
+    import redbiom.util
     import redbiom.fetch
+    import pandas as pd
+
+    iterator = redbiom.util.from_or_nargs(from_, samples)
 
     if not force_category:
         force_category = None
@@ -84,6 +93,28 @@ def fetch_sample_metadata(from_, samples, all_columns, context, output,
                                              restrict_to=force_category,
                                              tagged=tagged)
 
+    if resolve_ambiguities:
+        md.set_index('#SampleID', inplace=True)
+
+        # a temporary key to use when resolving ambiguities
+        # that will be removed before writing the metadata
+        key = "__@@AMBIGUITY@@__"
+
+        # add ambiguity information into the frame
+        ambigs = pd.Series(map_)
+        ambigs = ambigs.loc[md.index]
+        md[key] = ambigs
+
+        # remove duplicated unambiguous identifiers
+        md = md[~md[key].duplicated()]
+
+        # remove our index, and replace the entries with the ambiguous names
+        md.reset_index(inplace=True)
+        md['#SampleID'] = md[key]
+
+        # cleanup
+        md.drop(columns=key, inplace=True)
+
     md.to_csv(output, sep='\t', header=True, index=False, encoding='utf-8')
 
     _write_ambig(map_, output)
@@ -103,9 +134,14 @@ def fetch_sample_metadata(from_, samples, all_columns, context, output,
               help="Calculate and use MD5 for the features. This will also "
               "save a tsv file with the original feature name and the md5",
               default=False)
+@click.option('--resolve-ambiguities', required=False,
+              type=click.Choice(['merge', 'most-reads']), default=None,
+              help=("Resolve ambiguities that may be present in the samples "
+                    "which can arise from, for example, technical "
+                    "replicates."))
 @click.argument('features', nargs=-1)
 def fetch_samples_from_obserations(features, exact, from_, output,
-                                   context, md5):
+                                   context, md5, resolve_ambiguities):
     """Fetch sample data containing features."""
     import redbiom.util
     iterable = redbiom.util.from_or_nargs(from_, features)
@@ -118,6 +154,11 @@ def fetch_samples_from_obserations(features, exact, from_, output,
         with open(output + '.tsv', 'w') as f:
             f.write('\n'.join(['\t'.join(x) for x in new_ids.items()]))
 
+    # dispatch on the requested strategy; when the option is not given
+    # (default None) the table is written with its ambiguities intact
+    if resolve_ambiguities == 'merge':
+        tab = redbiom.fetch._ambiguity_merge(tab, map_)
+    elif resolve_ambiguities == 'most-reads':
+        tab = redbiom.fetch._ambiguity_keep_most_reads(tab, map_)
+
     import h5py
     with h5py.File(output, 'w') as fp:
         tab.to_hdf5(fp, 'redbiom')
@@ -137,8 +178,14 @@ def fetch_samples_from_obserations(features, exact, from_, output,
               help="Calculate and use MD5 for the features. This will also "
               "save a tsv file with the original feature name and the md5",
               default=False)
+@click.option('--resolve-ambiguities', required=False,
+              type=click.Choice(['merge', 'most-reads']), default=None,
+              help=("Resolve ambiguities that may be present in the samples "
+                    "which can arise from, for example, technical "
+                    "replicates."))
 @click.argument('samples', nargs=-1)
-def fetch_samples_from_samples(samples, from_, output, context, md5):
+def fetch_samples_from_samples(samples, from_, output, context, md5,
+                               resolve_ambiguities):
     """Fetch sample data."""
     import redbiom.util
     iterable = redbiom.util.from_or_nargs(from_, samples)
@@ -151,6 +198,11 @@ def fetch_samples_from_samples(samples, from_, output, context, md5):
         with open(output + '.tsv', 'w') as f:
             f.write('\n'.join(['\t'.join(x) for x in new_ids.items()]))
 
+    # dispatch on the requested strategy; when the option is not given
+    # (default None) the table is written with its ambiguities intact
+    if resolve_ambiguities == 'merge':
+        table = redbiom.fetch._ambiguity_merge(table, ambig)
+    elif resolve_ambiguities == 'most-reads':
+        table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig)
+
     import h5py
     with h5py.File(output, 'w') as fp:
         table.to_hdf5(fp, 'redbiom')

diff --git a/redbiom/fetch.py b/redbiom/fetch.py
@@ -175,12 +175,16 @@ def sample_metadata(samples, common=True, context=None, restrict_to=None,
     samples = untagged + tagged_clean
 
     # resolve ambiguities
+    ambig_map = {}
     if context is not None:
         _, _, ambig_assoc, rbid_map = \
             redbiom.util.resolve_ambiguities(context, samples, get)
 
         if tagged:
             ambig_assoc = {rbid: [rbid] for rbid in rbid_map}
+            ambig_map = {rbid: rbid for rbid in rbid_map}
+        else:
+            ambig_map = {v: k.split('_', 1)[1] for k, v in rbid_map.items()}
     else:
         ambig_assoc = {k: [k] for k in samples}
 
@@ -235,7 +239,7 @@ def sample_metadata(samples, common=True, context=None, restrict_to=None,
             new_ids.append("%s.%s" % (id_, tag))
         md['#SampleID'] = new_ids
 
-    return md, ambig_assoc
+    return md, ambig_map
 
 
 def data_from_features(context, features, exact):
@@ -381,7 +385,12 @@ def _biom_from_samples(context, samples, get=None, normalize_taxonomy=None):
     table = biom.Table(mat, obs_ids, sample_ids, obs_md)
     table.update_ids(rimap)
 
-    return table, ambig_assoc
+    ambiguity_map = {}
+    for k, v in rimap.items():
+        tag, id_ = k.split('_', 1)
+        ambiguity_map[v] = id_
+
+    return table, ambiguity_map
 
 
 def taxon_ancestors(context, ids, get=None, normalize=None):
@@ -733,3 +742,91 @@ def get_sample_values(samples, category, get=None):
                                         multikey=key)
 
     return [item for chunk in getter for item in zip(*chunk)]
+
+
+def _ambiguity_merge(table, collapse_map):
+    """Merge ambiguous samples
+
+    Samples which share a collapse target are combined into a single
+    sample by collapsing the table over the sample axis.
+
+    Parameters
+    ----------
+    table : biom.Table
+        The table obtained from redbiom
+    collapse_map : dict
+        A mapping of a sample ID in the table to its collapse
+        target name.
+
+    Returns
+    -------
+    biom.Table
+        A table of the merged data with updated sample identifiers
+
+    Raises
+    ------
+    ValueError
+        If the IDs present in the table are not a perfect match to the keys
+        of the collapse map.
+    """
+    if set(collapse_map) != set(table.ids()):
+        raise ValueError("IDs are inconsistent")
+
+    def collapser(id_, md):
+        # the partition function receives a sample ID and its metadata;
+        # only the ID is needed to look up the collapse target
+        return collapse_map[id_]
+
+    # norm=False so that the members of a group are summed, not averaged
+    return table.collapse(collapser, axis='sample', norm=False)
+
+
+def _ambiguity_keep_most_reads(table, ambig_map):
+    """Keep the ambiguous sample with the most reads
+
+    For every group of table samples that map to the same ambiguous name,
+    only the sample with the largest total count is retained. The retained
+    samples are then renamed to their ambiguous form.
+
+    Parameters
+    ----------
+    table : biom.Table
+        The table obtained from redbiom
+    ambig_map : dict
+        A mapping of a sample ID in the table to its ambiguous form.
+
+    Returns
+    -------
+    biom.Table
+        A table of the most voluminous data with updated sample identifiers
+
+    Raises
+    ------
+    ValueError
+        If the IDs present in the table are not a perfect match to the keys
+        of the ambiguity map.
+    """
+    from collections import defaultdict
+
+    if set(ambig_map) != set(table.ids()):
+        raise ValueError("IDs are inconsistent")
+
+    # total count per sample, keyed by the ID used in the table
+    sample_counts = dict(zip(table.ids(), table.sum('sample')))
+
+    # group the table's sample IDs by the ambiguous name they resolve to
+    ambigs = defaultdict(list)
+    for sample_id, sample_name in ambig_map.items():
+        ambigs[sample_name].append(sample_id)
+
+    # max() returns the first maximal element, so ties resolve to the ID
+    # encountered first in ambig_map
+    to_keep = {max(sample_ids, key=sample_counts.__getitem__)
+               for sample_ids in ambigs.values()}
+
+    subset_table = table.filter(to_keep, inplace=False).remove_empty()
+    subset_table.update_ids(ambig_map, inplace=True)
+
+    return subset_table
diff --git a/redbiom/tests/test_fetch.py b/redbiom/tests/test_fetch.py
@@ -2,6 +2,7 @@
 import requests
 from future.moves.itertools import zip_longest
 
+import numpy as np
 import biom
 import pandas as pd
 import pandas.util.testing as pdt
@@ -10,7 +11,9 @@
 import redbiom.fetch
 from redbiom.fetch import (_biom_from_samples, sample_metadata,
                            samples_in_context, features_in_context,
-                           sample_counts_per_category, get_sample_values)
+                           sample_counts_per_category, get_sample_values,
+                           _ambiguity_keep_most_reads, _ambiguity_merge)
+
 from redbiom.tests import assert_test_env
 
 assert_test_env()
@@ -110,7 +113,7 @@ def test_biom_from_samples(self):
 
         fetch = exp.ids()[:]
 
-        exp_map = {k: ["UNTAGGED_%s" % k] for k in exp.ids()}
+        exp_map = {"%s.UNTAGGED" % k: k for k in exp.ids()}
         exp.update_ids({k: "%s.UNTAGGED" % k for k in exp.ids()})
 
         obs, obs_map = _biom_from_samples('test', fetch,
@@ -252,6 +255,7 @@ def test_sample_metadata_all_cols(self):
         exp = metadata.copy()
         exp.set_index('#SampleID', inplace=True)
         obs, ambig = sample_metadata(table.ids(), common=False)
+
         obs.set_index('#SampleID', inplace=True)
         self.assertEqual(sorted(exp.index), sorted(obs.index))
         self.assertTrue(set(obs.columns).issubset(exp.columns))
@@ -355,6 +359,82 @@ def test_sample_counts_per_category_specific(self):
         self.assertEqual(obs['LATITUDE'], 10)
         self.assertEqual(obs['LONGITUDE'], 10)
 
+    def test_ambiguity_merge(self):
+        """Samples sharing a collapse target are summed into one sample."""
+        features = ['O1', 'O2', 'O3']
+        replicates = ['10317.1234.foo',
+                      '10317.1234.bar',
+                      '10317.4321.foo',
+                      '10317.1234.baz']
+        targets = ['10317.1234',
+                   '10317.1234',
+                   '10317.4321',
+                   '10317.1234']
+        ambig_map = dict(zip(replicates, targets))
+
+        # rows are 0..3, 4..7 and 8..11
+        table = biom.Table(np.arange(12).reshape(3, 4), features, replicates)
+
+        # the three 10317.1234 replicates add up; 10317.4321 is untouched
+        expected = biom.Table(np.array([[4, 2], [16, 6], [28, 10]]),
+                              features,
+                              ['10317.1234', '10317.4321'])
+
+        observed = _ambiguity_merge(table, ambig_map)
+
+        # the expected table carries no metadata, so drop any from the
+        # result and align the sample order before comparing
+        observed.del_metadata()
+        observed = observed.sort_order(expected.ids())
+        self.assertEqual(observed, expected)
+
+    def test_ambiguity_merge_mismatch(self):
+        """A map that does not cover every table sample is rejected."""
+        features = ['O1', 'O2', 'O3']
+        table_ids = ['10317.1234.foo',
+                     '10317.1234.bar',
+                     '10317.4321.foo',
+                     '10317.1234.baz']
+        counts = np.array([[0, 1, 2, 3],
+                           [4, 5, 6, 7],
+                           [8, 9, 10, 11]])
+        table = biom.Table(counts, features, table_ids)
+
+        # '10317.1234.bar' is in the table but missing from the map
+        ambig_map = {'10317.1234.foo': '10317.1234',
+                     '10317.4321.foo': '10317.4321',
+                     '10317.1234.baz': '10317.1234'}
+
+        with self.assertRaisesRegex(ValueError, "IDs are inconsistent"):
+            _ambiguity_merge(table, ambig_map)
+
+    def test_ambiguity_keep_most_reads(self):
+        """Only the replicate with the largest total count is kept."""
+        features = ['O1', 'O2', 'O3']
+        replicates = ['10317.1234.foo',
+                      '10317.1234.bar',
+                      '10317.4321.foo',
+                      '10317.1234.baz']
+        targets = ['10317.1234',
+                   '10317.1234',
+                   '10317.4321',
+                   '10317.1234']
+        ambig_map = dict(zip(replicates, targets))
+
+        counts = np.array([[0, 3, 2, 1],
+                           [4, 7, 6, 5],
+                           [8, 11, 10, 9]])
+        table = biom.Table(counts, features, replicates)
+
+        # '10317.1234.bar' (second column) has the highest total of the
+        # three 10317.1234 replicates, so its column is the one retained
+        expected = biom.Table(np.array([[3, 2], [7, 6], [11, 10]]),
+                              features,
+                              ['10317.1234', '10317.4321'])
+
+        observed = _ambiguity_keep_most_reads(table, ambig_map)
+        self.assertEqual(observed, expected)
+
+    def test_ambiguity_keep_most_reads_mismatch(self):
+        """A map that does not cover every table sample is rejected."""
+        features = ['O1', 'O2', 'O3']
+        table_ids = ['10317.1234.foo',
+                     '10317.1234.bar',
+                     '10317.4321.foo',
+                     '10317.1234.baz']
+        counts = np.array([[0, 3, 2, 1],
+                           [4, 7, 6, 5],
+                           [8, 11, 10, 9]])
+        table = biom.Table(counts, features, table_ids)
+
+        # '10317.1234.bar' is in the table but missing from the map
+        ambig_map = {'10317.1234.foo': '10317.1234',
+                     '10317.4321.foo': '10317.4321',
+                     '10317.1234.baz': '10317.1234'}
+
+        with self.assertRaisesRegex(ValueError, "IDs are inconsistent"):
+            _ambiguity_keep_most_reads(table, ambig_map)
+
 
 if __name__ == '__main__':
     unittest.main()