Skip to content

Commit

Permalink
Remap IDs to simplify storage for taxonomy, and simplify structures used for taxonomy (#68)
Browse files Browse the repository at this point in the history

* Remap IDs to simplify storage for taxonomy

Use lighter weight structures for taxonomy

* Addressing @antgonza's comments

* flake8

* @ElDeveloper's comments
  • Loading branch information
wasade authored and antgonza committed Dec 11, 2018
1 parent f328b55 commit f3ed69b
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 19 deletions.
19 changes: 18 additions & 1 deletion redbiom/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,14 +268,26 @@ def load_sample_data(table, context, tag=None, redis_protocol=False):
table.metadata(axis='observation'))
if taxonomy is not None:
post(context, 'HSET', "state/has-taxonomy/1")
hmgetter = redbiom._requests.buffered

tip_names = {n.name: n for n in taxonomy.tips()}
ids_ = hmgetter(tip_names, None, 'HMGET', context,
get=get, buffer_size=100,
multikey='feature-index')

for blk in ids_:
for entity, idx in zip(*blk):
tip_names[entity].name = idx

for node in taxonomy.postorder(include_self=False):
if not node.is_tip():
# define node -> children relationships
pack = []
terminal_pack = []
for c in node.children:
if c.is_tip():
pack.append('terminal:%s' % c.name)
pack.append('has-terminal')
terminal_pack.append(c.name)
else:
pack.append(c.name)

Expand All @@ -288,6 +300,11 @@ def load_sample_data(table, context, tag=None, redis_protocol=False):
for c in node.children]
post(context, 'HMSET', 'taxonomy-parents/%s' % '/'.join(pack))

if terminal_pack:
id_pack = '/'.join(terminal_pack)
post(context, 'SADD', 'terminal-of:%s/%s' % (node.name,
id_pack))

return len(samples)


Expand Down
58 changes: 45 additions & 13 deletions redbiom/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,14 +382,29 @@ def taxon_ancestors(context, ids, get=None, normalize=None):
import redbiom
config = redbiom.get_config()
get = redbiom._requests.make_get(config)

hmgetter = redbiom._requests.buffered
remapped_bulk = hmgetter(iter(ids), None, 'HMGET', context,
get=get, buffer_size=100,
multikey='feature-index')

# map the feature identifier to an internal ID
# if an internal ID does not exist, keep the provided ID
# the provided ID is kept in the event a taxon name such as
# p__Firmicutes is provided
remapped = {name: id_ if id_ is not None else name
for names, idx in remapped_bulk
for name, id_ in zip(names, idx)}

# bulk gather the taxonomy information for all the tips and their parents
to_get = ids
to_get = list(remapped.values())
child_parent = {}

while to_get:
key = 'taxonomy-parents'
getter = redbiom._requests.buffered(iter(to_get), None, 'HMGET',
context, get=get,
buffer_size=100, multikey=key)
getter = hmgetter(iter(to_get), None, 'HMGET',
context, get=get,
buffer_size=100, multikey=key)

new_to_get = set()
for block in getter:
Expand All @@ -408,7 +423,7 @@ def taxon_ancestors(context, ids, get=None, normalize=None):
lineages = []
for id_ in ids:
lineage = []
current = id_
current = remapped[id_]
while current is not None:
current = child_parent.get(current)
if current is not None:
Expand Down Expand Up @@ -446,24 +461,41 @@ def taxon_descendents(context, taxon, get=None):
---------------------
SMEMBERS <context>:taxonomy-children:<taxon>
"""
import redbiom._requests

if get is None:
import redbiom
import redbiom._requests
config = redbiom.get_config()
get = redbiom._requests.make_get(config)
hmgetter = redbiom._requests.buffered

to_get = [taxon]
to_get = [(None, taxon), ]
to_keep = set()
while to_get:
new_to_get = []
for t in to_get:
if t.startswith('terminal:'):
to_keep.add(t.split(':', 1)[1])
for parent, taxon in to_get:
if taxon == 'has-terminal':
tips = get(context, 'SMEMBERS', 'terminal-of:%s' % parent)
to_keep.update(set(tips))
else:
gotten = get(context, 'SMEMBERS', 'taxonomy-children:%s' % t)
new_to_get.extend(gotten)
gotten = get(context, 'SMEMBERS',
'taxonomy-children:%s' % taxon)
new_to_get.extend([(taxon, child) for child in gotten])
to_get = new_to_get
return to_keep

remapped_bulk = hmgetter(to_keep, None, 'HMGET', context,
get=get, buffer_size=100,
multikey='feature-index-inverted')

remapped = {name
for idx, names in remapped_bulk
for id_, name in zip(idx, names)}

if None in remapped:
# this should not happen and is a consistency check
raise ValueError("An unassociated index has been found")

return remapped


def category_sample_values(category, samples=None):
Expand Down
14 changes: 12 additions & 2 deletions redbiom/tests/test_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,8 @@ def test_load_sample_data_taxonomy(self):
# has an unclassified genus, so it should have tips directly descending
f__Actinomycetaceae = {'g__Varibaculum',
'g__Actinomyces',
'terminal:TACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGCTTGTCGCGTCTGCTGTGAAAATGCGGGGCTTAACTCCGTACGTG'} # noqa
'has-terminal'}
f__Actinomycetaceae_terminal = 'TACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGCTTGTCGCGTCTGCTGTGAAAATGCGGGGCTTAACTCCGTACGTG' # noqa

obs_bacteria = self.get(context, 'SMEMBERS',
':'.join(['taxonomy-children',
Expand All @@ -206,10 +207,19 @@ def test_load_sample_data_taxonomy(self):
'f__Actinomycetaceae']))
self.assertEqual(set(obs_Actinomycetaceae), f__Actinomycetaceae)

key = 'terminal-of:f__Actinomycetaceae'
obs_Actinomycetaceae_terminal = self.get(context, 'SMEMBERS', key)
self.assertEqual(len(obs_Actinomycetaceae_terminal), 1)
id_ = obs_Actinomycetaceae_terminal[0]
key = 'feature-index-inverted/%d' % int(id_)
obs_Actinomycetaceae_terminal = self.get(context, 'HGET', key)
self.assertEqual(obs_Actinomycetaceae_terminal,
f__Actinomycetaceae_terminal)

exp_parents = [('p__Firmicutes', 'k__Bacteria'),
('p__Fusobacteria', 'k__Bacteria'),
('g__Actinomyces', 'f__Actinomycetaceae'),
('TACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGCTTGTCGCGTCTGCTGTGAAAATGCGGGGCTTAACTCCGTACGTG', 'f__Actinomycetaceae')] # noqa
(id_, 'f__Actinomycetaceae')]
for name, exp in exp_parents:
obs = self.get(context, 'HGET', 'taxonomy-parents/%s' % name)
self.assertEqual(obs, exp)
Expand Down
4 changes: 2 additions & 2 deletions redbiom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,11 +326,11 @@ def stems(stops, stemmer, string):
to_skip.update(NULL_VALUES)

# match numbers (doesn't catch sci notation...)
numeric_regex = re.compile('(^-?\d+\.\d+$)|(^-?\d+$)')
numeric_regex = re.compile(r'(^-?\d+\.\d+$)|(^-?\d+$)')

# time like. we don't actually care if this doesn't match time
# as things like 1234:23123 are probably not useful for *general* search
time_regex = re.compile("^\d+:\d+(am|AM|pm|PM)?$")
time_regex = re.compile(r"^\d+:\d+(am|AM|pm|PM)?$")

if string in to_skip:
raise StopIteration
Expand Down
2 changes: 1 addition & 1 deletion test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ md5test obs_metadata_counts.txt exp_metadata_counts.txt
# load table with some duplicate sample IDs
head -n 1 test.txt > test.with_dups.txt
tail -n 2 test.txt >> test.with_dups.txt
tail -n 1 test.txt | sed -s 's/^10317\.[0-9]*/anewID/' >> test.with_dups.txt
tail -n 1 test.txt | sed 's/^10317\.[0-9]*/anewID/' >> test.with_dups.txt
echo "Loaded 1 samples" > exp_load_count.txt
redbiom admin load-sample-metadata --metadata test.with_dups.txt > obs_load_count.txt
md5test obs_load_count.txt exp_load_count.txt
Expand Down

0 comments on commit f3ed69b

Please sign in to comment.