From f3ed69b52e1e9808731e7521cd97a31de87ddd3e Mon Sep 17 00:00:00 2001
From: Daniel McDonald <mcdonadt@colorado.edu>
Date: Tue, 11 Dec 2018 14:06:30 -0800
Subject: [PATCH] Remap IDs to simplify storage for taxonomy, and simplify
 structures used for taxonomy (#68)

* Remap IDs to simplify storage for taxonomy

Use lighter weight structures for taxonomy

* Addressing @antgonza's comments

* flake8

* @eldeveloper's comments
---
 redbiom/admin.py            | 19 +++++++++++-
 redbiom/fetch.py            | 58 ++++++++++++++++++++++++++++---------
 redbiom/tests/test_admin.py | 14 +++++++--
 redbiom/util.py             |  4 +--
 test.sh                     |  2 +-
 5 files changed, 78 insertions(+), 19 deletions(-)

diff --git a/redbiom/admin.py b/redbiom/admin.py
index 90a603b..350b8b9 100644
--- a/redbiom/admin.py
+++ b/redbiom/admin.py
@@ -268,14 +268,26 @@ def load_sample_data(table, context, tag=None, redis_protocol=False):
                                           table.metadata(axis='observation'))
     if taxonomy is not None:
         post(context, 'HSET', "state/has-taxonomy/1")
+        hmgetter = redbiom._requests.buffered
+
+        tip_names = {n.name: n for n in taxonomy.tips()}
+        ids_ = hmgetter(tip_names, None, 'HMGET', context,
+                        get=get, buffer_size=100,
+                        multikey='feature-index')
+
+        for blk in ids_:
+            for entity, idx in zip(*blk):
+                tip_names[entity].name = idx
 
         for node in taxonomy.postorder(include_self=False):
             if not node.is_tip():
                 # define node -> children relationships
                 pack = []
+                terminal_pack = []
                 for c in node.children:
                     if c.is_tip():
-                        pack.append('terminal:%s' % c.name)
+                        pack.append('has-terminal')
+                        terminal_pack.append(c.name)
                     else:
                         pack.append(c.name)
 
@@ -288,6 +300,11 @@ def load_sample_data(table, context, tag=None, redis_protocol=False):
                         for c in node.children]
                 post(context, 'HMSET', 'taxonomy-parents/%s' % '/'.join(pack))
 
+                if terminal_pack:
+                    id_pack = '/'.join(terminal_pack)
+                    post(context, 'SADD', 'terminal-of:%s/%s' % (node.name,
+                                                                 id_pack))
+
     return len(samples)
 
 
diff --git a/redbiom/fetch.py b/redbiom/fetch.py
index e375312..a1e4d0c 100644
--- a/redbiom/fetch.py
+++ b/redbiom/fetch.py
@@ -382,14 +382,29 @@ def taxon_ancestors(context, ids, get=None, normalize=None):
         import redbiom
         config = redbiom.get_config()
         get = redbiom._requests.make_get(config)
+
+    hmgetter = redbiom._requests.buffered
+    remapped_bulk = hmgetter(iter(ids), None, 'HMGET', context,
+                             get=get, buffer_size=100,
+                             multikey='feature-index')
+
+    # Map each feature identifier to its internal ID. If no internal ID
+    # exists, keep the provided ID as-is; this covers the case where a
+    # taxon name such as p__Firmicutes (rather than a feature ID) is
+    # provided.
+    remapped = {name: id_ if id_ is not None else name
+                for names, idx in remapped_bulk
+                for name, id_ in zip(names, idx)}
+
     # bulk gather the taxonomy information for all the tips and their parents
-    to_get = ids
+    to_get = list(remapped.values())
     child_parent = {}
+
     while to_get:
         key = 'taxonomy-parents'
-        getter = redbiom._requests.buffered(iter(to_get), None, 'HMGET',
-                                            context, get=get,
-                                            buffer_size=100, multikey=key)
+        getter = hmgetter(iter(to_get), None, 'HMGET',
+                          context, get=get,
+                          buffer_size=100, multikey=key)
 
         new_to_get = set()
         for block in getter:
@@ -408,7 +423,7 @@ def taxon_ancestors(context, ids, get=None, normalize=None):
     lineages = []
     for id_ in ids:
         lineage = []
-        current = id_
+        current = remapped[id_]
         while current is not None:
             current = child_parent.get(current)
             if current is not None:
@@ -446,24 +461,41 @@ def taxon_descendents(context, taxon, get=None):
     ---------------------
     SMEMBERS <context>:taxonomy-children:<taxon>
     """
+    import redbiom._requests
+
     if get is None:
         import redbiom
-        import redbiom._requests
         config = redbiom.get_config()
         get = redbiom._requests.make_get(config)
+    hmgetter = redbiom._requests.buffered
 
-    to_get = [taxon]
+    to_get = [(None, taxon), ]
     to_keep = set()
     while to_get:
         new_to_get = []
-        for t in to_get:
-            if t.startswith('terminal:'):
-                to_keep.add(t.split(':', 1)[1])
+        for parent, taxon in to_get:
+            if taxon == 'has-terminal':
+                tips = get(context, 'SMEMBERS', 'terminal-of:%s' % parent)
+                to_keep.update(set(tips))
             else:
-                gotten = get(context, 'SMEMBERS', 'taxonomy-children:%s' % t)
-                new_to_get.extend(gotten)
+                gotten = get(context, 'SMEMBERS',
+                             'taxonomy-children:%s' % taxon)
+                new_to_get.extend([(taxon, child) for child in gotten])
         to_get = new_to_get
-    return to_keep
+
+    remapped_bulk = hmgetter(to_keep, None, 'HMGET', context,
+                             get=get, buffer_size=100,
+                             multikey='feature-index-inverted')
+
+    remapped = {name
+                for idx, names in remapped_bulk
+                for id_, name in zip(idx, names)}
+
+    if None in remapped:
+        # consistency check: an index lacking a feature should never occur
+        raise ValueError("An unassociated index has been found")
+
+    return remapped
 
 
 def category_sample_values(category, samples=None):
diff --git a/redbiom/tests/test_admin.py b/redbiom/tests/test_admin.py
index 0ea975c..6ede5d9 100644
--- a/redbiom/tests/test_admin.py
+++ b/redbiom/tests/test_admin.py
@@ -195,7 +195,8 @@ def test_load_sample_data_taxonomy(self):
         # has an unclassified genus, so it should have tips directly descending
         f__Actinomycetaceae = {'g__Varibaculum',
                                'g__Actinomyces',
-                               'terminal:TACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGCTTGTCGCGTCTGCTGTGAAAATGCGGGGCTTAACTCCGTACGTG'}  # noqa
+                               'has-terminal'}
+        f__Actinomycetaceae_terminal = 'TACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGCTTGTCGCGTCTGCTGTGAAAATGCGGGGCTTAACTCCGTACGTG'  # noqa
 
         obs_bacteria = self.get(context, 'SMEMBERS',
                                 ':'.join(['taxonomy-children',
@@ -206,10 +207,19 @@ def test_load_sample_data_taxonomy(self):
                                                   'f__Actinomycetaceae']))
         self.assertEqual(set(obs_Actinomycetaceae), f__Actinomycetaceae)
 
+        key = 'terminal-of:f__Actinomycetaceae'
+        obs_Actinomycetaceae_terminal = self.get(context, 'SMEMBERS', key)
+        self.assertEqual(len(obs_Actinomycetaceae_terminal), 1)
+        id_ = obs_Actinomycetaceae_terminal[0]
+        key = 'feature-index-inverted/%d' % int(id_)
+        obs_Actinomycetaceae_terminal = self.get(context, 'HGET', key)
+        self.assertEqual(obs_Actinomycetaceae_terminal,
+                         f__Actinomycetaceae_terminal)
+
         exp_parents = [('p__Firmicutes', 'k__Bacteria'),
                        ('p__Fusobacteria', 'k__Bacteria'),
                        ('g__Actinomyces', 'f__Actinomycetaceae'),
-                       ('TACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGCTTGTCGCGTCTGCTGTGAAAATGCGGGGCTTAACTCCGTACGTG', 'f__Actinomycetaceae')]  # noqa
+                       (id_, 'f__Actinomycetaceae')]
         for name, exp in exp_parents:
             obs = self.get(context, 'HGET', 'taxonomy-parents/%s' % name)
             self.assertEqual(obs, exp)
diff --git a/redbiom/util.py b/redbiom/util.py
index b1112a4..26fa598 100644
--- a/redbiom/util.py
+++ b/redbiom/util.py
@@ -326,11 +326,11 @@ def stems(stops, stemmer, string):
     to_skip.update(NULL_VALUES)
 
     # match numbers (doesn't catch sci notation...)
-    numeric_regex = re.compile('(^-?\d+\.\d+$)|(^-?\d+$)')
+    numeric_regex = re.compile(r'(^-?\d+\.\d+$)|(^-?\d+$)')
 
     # time like. we don't actually care if this doesn't match time
     # as things like 1234:23123 are probably not useful for *general* search
-    time_regex = re.compile("^\d+:\d+(am|AM|pm|PM)?$")
+    time_regex = re.compile(r"^\d+:\d+(am|AM|pm|PM)?$")
 
     if string in to_skip:
         raise StopIteration
diff --git a/test.sh b/test.sh
index b535513..b999ea7 100644
--- a/test.sh
+++ b/test.sh
@@ -150,7 +150,7 @@ md5test obs_metadata_counts.txt exp_metadata_counts.txt
 # load table with some duplicate sample IDs
 head -n 1 test.txt > test.with_dups.txt
 tail -n 2 test.txt >> test.with_dups.txt
-tail -n 1 test.txt | sed -s 's/^10317\.[0-9]*/anewID/' >> test.with_dups.txt
+tail -n 1 test.txt | sed 's/^10317\.[0-9]*/anewID/' >> test.with_dups.txt
 echo "Loaded 1 samples" > exp_load_count.txt
 redbiom admin load-sample-metadata --metadata test.with_dups.txt > obs_load_count.txt
 md5test obs_load_count.txt exp_load_count.txt