From f3ed69b52e1e9808731e7521cd97a31de87ddd3e Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 11 Dec 2018 14:06:30 -0800 Subject: [PATCH] Remap IDs to simplify storage for taxonomy, and simplify structures used for taxonomy (#68) * Remap IDs to simplify storage for taxonomy Use lighter weight structures for taxonomy * Addressing @antgonza's comments * flake8 * @eldeveloper's comments --- redbiom/admin.py | 19 +++++++++++- redbiom/fetch.py | 58 ++++++++++++++++++++++++++++--------- redbiom/tests/test_admin.py | 14 +++++++-- redbiom/util.py | 4 +-- test.sh | 2 +- 5 files changed, 78 insertions(+), 19 deletions(-) diff --git a/redbiom/admin.py b/redbiom/admin.py index 90a603b..350b8b9 100644 --- a/redbiom/admin.py +++ b/redbiom/admin.py @@ -268,14 +268,26 @@ def load_sample_data(table, context, tag=None, redis_protocol=False): table.metadata(axis='observation')) if taxonomy is not None: post(context, 'HSET', "state/has-taxonomy/1") + hmgetter = redbiom._requests.buffered + + tip_names = {n.name: n for n in taxonomy.tips()} + ids_ = hmgetter(tip_names, None, 'HMGET', context, + get=get, buffer_size=100, + multikey='feature-index') + + for blk in ids_: + for entity, idx in zip(*blk): + tip_names[entity].name = idx for node in taxonomy.postorder(include_self=False): if not node.is_tip(): # define node -> children relationships pack = [] + terminal_pack = [] for c in node.children: if c.is_tip(): - pack.append('terminal:%s' % c.name) + pack.append('has-terminal') + terminal_pack.append(c.name) else: pack.append(c.name) @@ -288,6 +300,11 @@ def load_sample_data(table, context, tag=None, redis_protocol=False): for c in node.children] post(context, 'HMSET', 'taxonomy-parents/%s' % '/'.join(pack)) + if terminal_pack: + id_pack = '/'.join(terminal_pack) + post(context, 'SADD', 'terminal-of:%s/%s' % (node.name, + id_pack)) + return len(samples) diff --git a/redbiom/fetch.py b/redbiom/fetch.py index e375312..a1e4d0c 100644 --- a/redbiom/fetch.py +++ b/redbiom/fetch.py @@ -382,14 +382,29 @@ def taxon_ancestors(context, ids, get=None, normalize=None): import redbiom config = redbiom.get_config() get = redbiom._requests.make_get(config) + + hmgetter = redbiom._requests.buffered + remapped_bulk = hmgetter(iter(ids), None, 'HMGET', context, + get=get, buffer_size=100, + multikey='feature-index') + + # map the feature identifier to an internal ID + # if an internal ID does not exist, keep the provided ID + # the provided ID is kept in the event a taxon name such as + # p__Firmicutes is provided + remapped = {name: id_ if id_ is not None else name + for names, idx in remapped_bulk + for name, id_ in zip(names, idx)} + # bulk gather the taxonomy information for all the tips and their parents - to_get = ids + to_get = list(remapped.values()) child_parent = {} + while to_get: key = 'taxonomy-parents' - getter = redbiom._requests.buffered(iter(to_get), None, 'HMGET', - context, get=get, - buffer_size=100, multikey=key) + getter = hmgetter(iter(to_get), None, 'HMGET', + context, get=get, + buffer_size=100, multikey=key) new_to_get = set() for block in getter: @@ -408,7 +423,7 @@ def taxon_ancestors(context, ids, get=None, normalize=None): lineages = [] for id_ in ids: lineage = [] - current = id_ + current = remapped[id_] while current is not None: current = child_parent.get(current) if current is not None: @@ -446,24 +461,41 @@ def taxon_descendents(context, taxon, get=None): --------------------- SMEMBERS :taxonomy-children: """ + import redbiom._requests + if get is None: import redbiom - import redbiom._requests config = redbiom.get_config() get = redbiom._requests.make_get(config) + hmgetter = redbiom._requests.buffered - to_get = [taxon] + to_get = [(None, taxon), ] to_keep = set() while to_get: new_to_get = [] - for t in to_get: - if t.startswith('terminal:'): - to_keep.add(t.split(':', 1)[1]) + for parent, taxon in to_get: + if taxon == 'has-terminal': + tips = get(context, 'SMEMBERS', 'terminal-of:%s' % parent) + to_keep.update(set(tips)) else: - gotten = get(context, 'SMEMBERS', 'taxonomy-children:%s' % t) - new_to_get.extend(gotten) + gotten = get(context, 'SMEMBERS', + 'taxonomy-children:%s' % taxon) + new_to_get.extend([(taxon, child) for child in gotten]) to_get = new_to_get - return to_keep + + remapped_bulk = hmgetter(to_keep, None, 'HMGET', context, + get=get, buffer_size=100, + multikey='feature-index-inverted') + + remapped = {name + for idx, names in remapped_bulk + for id_, name in zip(idx, names)} + + if None in remapped: + # this should not happen and is a consistency check + raise ValueError("An unassociated index has been found") + + return remapped def category_sample_values(category, samples=None): diff --git a/redbiom/tests/test_admin.py b/redbiom/tests/test_admin.py index 0ea975c..6ede5d9 100644 --- a/redbiom/tests/test_admin.py +++ b/redbiom/tests/test_admin.py @@ -195,7 +195,8 @@ def test_load_sample_data_taxonomy(self): # has an unclassified genus, so it should have tips directly descending f__Actinomycetaceae = {'g__Varibaculum', 'g__Actinomyces', - 'terminal:TACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGCTTGTCGCGTCTGCTGTGAAAATGCGGGGCTTAACTCCGTACGTG'} # noqa + 'has-terminal'} + f__Actinomycetaceae_terminal = 'TACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGCTTGTCGCGTCTGCTGTGAAAATGCGGGGCTTAACTCCGTACGTG' # noqa obs_bacteria = self.get(context, 'SMEMBERS', ':'.join(['taxonomy-children', @@ -206,10 +207,19 @@ def test_load_sample_data_taxonomy(self): 'f__Actinomycetaceae'])) self.assertEqual(set(obs_Actinomycetaceae), f__Actinomycetaceae) + key = 'terminal-of:f__Actinomycetaceae' + obs_Actinomycetaceae_terminal = self.get(context, 'SMEMBERS', key) + self.assertEqual(len(obs_Actinomycetaceae_terminal), 1) + id_ = obs_Actinomycetaceae_terminal[0] + key = 'feature-index-inverted/%d' % int(id_) + obs_Actinomycetaceae_terminal = self.get(context, 'HGET', key) + self.assertEqual(obs_Actinomycetaceae_terminal, + f__Actinomycetaceae_terminal) + exp_parents = [('p__Firmicutes', 'k__Bacteria'), ('p__Fusobacteria', 'k__Bacteria'), ('g__Actinomyces', 'f__Actinomycetaceae'), - ('TACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGCTTGTCGCGTCTGCTGTGAAAATGCGGGGCTTAACTCCGTACGTG', 'f__Actinomycetaceae')] # noqa + (id_, 'f__Actinomycetaceae')] for name, exp in exp_parents: obs = self.get(context, 'HGET', 'taxonomy-parents/%s' % name) self.assertEqual(obs, exp) diff --git a/redbiom/util.py b/redbiom/util.py index b1112a4..26fa598 100644 --- a/redbiom/util.py +++ b/redbiom/util.py @@ -326,11 +326,11 @@ def stems(stops, stemmer, string): to_skip.update(NULL_VALUES) # match numbers (doesn't catch sci notation...) - numeric_regex = re.compile('(^-?\d+\.\d+$)|(^-?\d+$)') + numeric_regex = re.compile(r'(^-?\d+\.\d+$)|(^-?\d+$)') # time like. we don't actually care if this doesn't match time # as things like 1234:23123 are probably not useful for *general* search - time_regex = re.compile("^\d+:\d+(am|AM|pm|PM)?$") + time_regex = re.compile(r"^\d+:\d+(am|AM|pm|PM)?$") if string in to_skip: raise StopIteration diff --git a/test.sh b/test.sh index b535513..b999ea7 100644 --- a/test.sh +++ b/test.sh @@ -150,7 +150,7 @@ md5test obs_metadata_counts.txt exp_metadata_counts.txt # load table with some duplicate sample IDs head -n 1 test.txt > test.with_dups.txt tail -n 2 test.txt >> test.with_dups.txt -tail -n 1 test.txt | sed -s 's/^10317\.[0-9]*/anewID/' >> test.with_dups.txt +tail -n 1 test.txt | sed 's/^10317\.[0-9]*/anewID/' >> test.with_dups.txt echo "Loaded 1 samples" > exp_load_count.txt redbiom admin load-sample-metadata --metadata test.with_dups.txt > obs_load_count.txt md5test obs_load_count.txt exp_load_count.txt