Skip to content

Commit

Permalink
feat(reader): merge_on parameter (#8)
Browse files Browse the repository at this point in the history
Merge units on a single feature, respecting unit type and parent-child
relations. Note that this assumes input types are mapped appropriately.
  • Loading branch information
mr-martian committed Oct 1, 2024
1 parent 7cf2c29 commit b787123
Show file tree
Hide file tree
Showing 4 changed files with 252 additions and 31 deletions.
115 changes: 101 additions & 14 deletions rebabel_format/reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#!/usr/bin/env python3

from rebabel_format.db import RBBLFile
from rebabel_format.db import RBBLFile, WhereClause
from rebabel_format.parameters import Parameter, process_parameters
from rebabel_format.query import ResultTable
import logging
from collections import defaultdict

Expand All @@ -14,6 +15,8 @@ class Reader:
identifier = None
parameters = {}

merge_on = Parameter(required=False, type=dict)

def __init__(self, db, user, conf, kwargs):
self.db = db
self.user = user
Expand Down Expand Up @@ -95,12 +98,87 @@ def set_feature(self, unit_name, feature: str, ftype: str, value,
self.db.check_type(ftype, value)
self.features[unit_name][(feature, ftype)] = (value, confidence)

def _remap_features(self):
    """Rename the buffered features of every unit via ``self.feature_map``.

    ``self.features`` maps a unit name to a dict keyed by
    ``(feature, ftype)``.  Each feature name is looked up in
    ``self.feature_map`` first under ``(feature, unit_type)`` and then
    under ``(feature, None)``; names with no mapping are kept as they are.
    The result replaces ``self.features``; values and ``ftype`` are
    carried over untouched.
    """
    renamed = defaultdict(dict)
    for unit_name, unit_feats in self.features.items():
        # Units without a recorded type only match the (feature, None) keys.
        unit_type = self.types.get(unit_name)
        for (feat_name, feat_type), payload in unit_feats.items():
            # Type-specific mapping wins over the type-agnostic one.
            fallback = self.feature_map.get((feat_name, None), feat_name)
            target = self.feature_map.get((feat_name, unit_type), fallback)
            renamed[unit_name][(target, feat_type)] = payload
    self.features = renamed

def finish_block(self, parent_if_missing=None, keep_uids=False):
parent_type_if_missing = None
if parent_if_missing is not None:
parent_type_if_missing = self.db.get_unit_type(parent_if_missing)

self._remap_features()
uids = self.uids.copy()

is_merged = set()
if self.merge_on:
merge_values = {k: defaultdict(list) for k in self.merge_on}
for name in self.id_seq:
typ = self.types.get(name)
if typ in merge_values:
for feat, val in self.features[name].items():
if feat[0] == self.merge_on[typ]:
merge_values[typ][val[0]].append(name)
merge_possible = defaultdict(list)
for typ, val_map in merge_values.items():
table = ResultTable(self.db,
{'N': {
'type': typ,
'features': [{
'feature': self.merge_on[typ],
'value': sorted(val_map.keys()),
}],
}})
table.add_features('N', [self.merge_on[typ]])
for nodes, features in table.results():
val = features[nodes['N']][self.merge_on[typ]]
for name in val_map[val]:
merge_possible[name].append(nodes['N'])
child_names = defaultdict(list)
all_merge = []
for name in self.id_seq:
if name not in merge_possible:
continue
all_merge += merge_possible[name]
if name not in self.parents:
continue
if self.parents[name] not in merge_possible:
# TODO: what if parent is in self.uids?
del merge_possible[name]
child_names[self.parents[name]].append(name)
self.db.execute_clauses('SELECT parent, child FROM relations',
WhereClause('child', all_merge),
WhereClause('parent', all_merge),
WhereClause('active', True),
WhereClause('isprimary', True))
child_ids = defaultdict(list)
for p, c in self.db.cur.fetchall():
child_ids[p].append(c)
todo = sorted(merge_possible.keys())
while todo:
next_todo = []
for name in todo:
pc = set()
for n in merge_possible[name]:
pc.update(child_ids[n])
for ch in child_names[name]:
update = set(merge_possible[ch]) & pc
if len(update) < len(merge_possible[ch]):
merge_possible[ch] = sorted(update)
next_todo.append(ch)
todo = next_todo
for name, ids in merge_possible.items():
if ids:
uids[name] = ids[0]
is_merged.add(name)

for name in self.id_seq:
if name in uids:
continue
Expand Down Expand Up @@ -129,22 +207,17 @@ def finish_block(self, parent_if_missing=None, keep_uids=False):
'isprimary': False, 'active': True, 'date': self.db.now()}
)
self.db.cur.executemany(
'INSERT INTO relations(parent, parent_type, child, child_type, isprimary, active, date) VALUES(:parent, :parent_type, :child, :child_type, :isprimary, :active, :date)',
'INSERT OR IGNORE INTO relations(parent, parent_type, child, child_type, isprimary, active, date) VALUES(:parent, :parent_type, :child, :child_type, :isprimary, :active, :date)',
parents,
)
self.parents = {}
self.relations = defaultdict(set)

feature_ids = {}
features = []
merge_features = []
for name in self.id_seq:
for (feature, ftype), (value, conf) in self.features[name].items():
m_key = (feature, self.types[name])
n_key = (feature, None)
if m_key in self.feature_map:
feature = self.feature_map[m_key]
elif n_key in self.feature_map:
feature = self.feature_map[n_key]
key = (feature, ftype, self.types[name])
if key in feature_ids:
fid = feature_ids[key]
Expand All @@ -153,15 +226,29 @@ def finish_block(self, parent_if_missing=None, keep_uids=False):
feature_ids[key] = fid
if ftype == 'ref':
value = uids[value]
features.append({
dct = {
'unit': uids[name], 'feature': fid, 'value': value,
'user': self.user, 'date': self.db.now(),
'confidence': conf,
})
self.db.cur.executemany(
'INSERT INTO features(unit, feature, value, user, date, confidence) VALUES(:unit, :feature, :value, :user, :date, :confidence)',
features,
)
}
if name in is_merged:
merge_features.append(dct)
else:
features.append(dct)
if features:
self.db.cur.executemany(
'INSERT INTO features(unit, feature, value, user, date, confidence) VALUES(:unit, :feature, :value, :user, :date, :confidence)',
features,
)
if merge_features:
self.db.cur.executemany(
'UPDATE features SET value = :value, user = :user, confidence = :confidence, date = :date WHERE unit = :unit AND feature = :feature',
merge_features,
)
self.db.cur.executemany(
'INSERT OR IGNORE INTO features(unit, feature, value, user, confidence, date) VALUES(:unit, :feature, :value, :user, :confidence, :date)',
merge_features,
)
self.features = defaultdict(dict)

self.id_seq = []
Expand Down
78 changes: 61 additions & 17 deletions rebabel_format/test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,26 @@
import io
import os
import tempfile
from rebabel_format import load_processes, load_readers, load_writers
from rebabel_format.process import ALL_PROCESSES
from rebabel_format import load_processes, load_readers, load_writers, run_command
from rebabel_format.config import read_config

load_processes(False)
load_readers(False)
load_writers(False)

@contextlib.contextmanager
def data_dir(name: str):
    """Run the ``with`` body with the working directory set to ``name``.

    ``name`` is resolved relative to the directory containing this module.
    The previous working directory is restored when the block exits,
    including when it exits through an exception.
    """
    original_cwd = os.getcwd()
    here = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(here, name))
    try:
        yield
    finally:
        os.chdir(original_cwd)

class StaticTests(unittest.TestCase):
'''Run commands and compare the outputs to the files in static/
'''
Run commands and compare the outputs to the files in static/
To add a new test to this runner:
- Create static/NAME.toml with desired parameters
Expand All @@ -23,20 +33,18 @@ class StaticTests(unittest.TestCase):
- files will be sorted lexicographically, so zero-pad ORDER
- outputs will be compared with leading and trailing whitespace trimmed
'''

def single_command(self, db, config, fname):
command = fname.split('.')[-2]
with self.subTest(fname):
if command == 'export':
name = fname.split('.')[0]
proc = ALL_PROCESSES[command](config, db=db, outfile=fname+'.out')
proc.run()
run_command(command, config, db=db, outfile=fname+'.out')
with open(fname+'.out') as fin:
text = fin.read()
else:
stream = io.StringIO()
with contextlib.redirect_stdout(stream):
proc = ALL_PROCESSES[command](config, db=db)
proc.run()
run_command(command, config, db=db)
text = stream.getvalue()
with open(fname) as fin:
expected = fin.read()
Expand All @@ -49,16 +57,52 @@ def single_test(self, name):
if os.path.isfile(db):
os.remove(db)
with self.subTest('import'):
proc = ALL_PROCESSES['import'](config, db=db)
proc.run()
run_command('import', config, db=db)
for path in sorted(glob.glob(name+'.[0123456789]*.txt')):
self.single_command(db, config, path)

def runTest(self):
cwd_was = os.getcwd()
dir_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'static')
os.chdir(dir_name)
for fname in glob.glob('*.toml'):
self.single_test(fname[:-5])
os.chdir(cwd_was)
with data_dir('static'):
for fname in glob.glob('*.toml'):
self.single_test(fname[:-5])

class MergeTest(unittest.TestCase):
    """Check that importing with ``merge_on`` merges into existing units.

    ``data/merge_text.flextext`` is imported into a fresh ``merge.db``;
    ``data/merge_pos.flextext`` is then imported into the same database
    with ``merge_on`` set to ``meta:index`` for every unit type.  The test
    then queries phrase/word pairs and expects each word to carry both
    the ``txt`` feature from the first file and the ``pos`` feature from
    the second.
    """

    def runTest(self):
        with data_dir(''):
            # Start from a clean database so earlier runs cannot leak in.
            if os.path.isfile('merge.db'):
                os.remove('merge.db')
            run_command('import', {}, infiles=['data/merge_text.flextext'],
                        mode='flextext', db='merge.db')
            run_command('import', {}, infiles=['data/merge_pos.flextext'],
                        mode='flextext', db='merge.db',
                        merge_on={
                            'interlinear-text': 'meta:index',
                            'paragraph': 'meta:index',
                            'phrase': 'meta:index',
                            'word': 'meta:index',
                        })
            from rebabel_format.db import RBBLFile
            from rebabel_format.query import ResultTable
            db = RBBLFile('merge.db')
            table = ResultTable(
                db,
                {
                    'phrase': {'type': 'phrase'},
                    'word': {'type': 'word', 'parent': 'phrase'},
                },
                order=['phrase', 'word'],
            )
            table.add_features('phrase', ['FlexText:en:segnum', 'meta:index'])
            table.add_features('word', ['FlexText:en:txt', 'FlexText:en:pos',
                                        'meta:index'])
            results = list(table.results())
            # The fixtures contain two phrases of four words each.
            self.assertEqual(8, len(results))
            first = {nodes['phrase'] for nodes, _ in results[:4]}
            self.assertEqual(1, len(first))
            # The second half must be inspected separately; re-checking
            # results[:4] would leave the second phrase unverified.
            second = {nodes['phrase'] for nodes, _ in results[4:]}
            self.assertEqual(1, len(second))
            # The two groups of words must hang off different phrases.
            self.assertTrue(first.isdisjoint(second))
            expected = [
                ('The', 'DET'), ('man', 'NOUN'),
                ('snores', 'VERB'), ('.', 'PUNCT'),
                ('The', 'DET'), ('woman', 'NOUN'),
                ('sings', 'VERB'), ('.', 'PUNCT'),
            ]
            for (txt, pos), (nodes, features) in zip(expected, results):
                word_feats = features[nodes['word']]
                self.assertEqual(txt, word_feats['FlexText:en:txt'])
                self.assertEqual(pos, word_feats['FlexText:en:pos'])
45 changes: 45 additions & 0 deletions rebabel_format/test/data/merge_pos.flextext
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
<?xml version='1.0' encoding='UTF-8'?>
<document version="2">
<interlinear-text>
<paragraphs>
<paragraph>
<phrases>
<phrase>
<words>
<word>
<item lang="en" type="pos">DET</item>
</word>
<word>
<item lang="en" type="pos">NOUN</item>
</word>
<word>
<item lang="en" type="pos">VERB</item>
</word>
<word>
<item lang="en" type="pos">PUNCT</item>
</word>
</words>
<item lang="en" type="segnum">1</item>
</phrase>
<phrase>
<words>
<word>
<item lang="en" type="pos">DET</item>
</word>
<word>
<item lang="en" type="pos">NOUN</item>
</word>
<word>
<item lang="en" type="pos">VERB</item>
</word>
<word>
<item lang="en" type="pos">PUNCT</item>
</word>
</words>
<item lang="en" type="segnum">2</item>
</phrase>
</phrases>
</paragraph>
</paragraphs>
</interlinear-text>
</document>
45 changes: 45 additions & 0 deletions rebabel_format/test/data/merge_text.flextext
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
<?xml version='1.0' encoding='UTF-8'?>
<document version="2">
<interlinear-text>
<paragraphs>
<paragraph>
<phrases>
<phrase>
<words>
<word>
<item lang="en" type="txt">The</item>
</word>
<word>
<item lang="en" type="txt">man</item>
</word>
<word>
<item lang="en" type="txt">snores</item>
</word>
<word>
<item lang="en" type="txt">.</item>
</word>
</words>
<item lang="en" type="segnum">1</item>
</phrase>
<phrase>
<words>
<word>
<item lang="en" type="txt">The</item>
</word>
<word>
<item lang="en" type="txt">woman</item>
</word>
<word>
<item lang="en" type="txt">sings</item>
</word>
<word>
<item lang="en" type="txt">.</item>
</word>
</words>
<item lang="en" type="segnum">2</item>
</phrase>
</phrases>
</paragraph>
</paragraphs>
</interlinear-text>
</document>

0 comments on commit b787123

Please sign in to comment.