Skip to content

Commit

Permalink
fix(reader): when merging, don't assume keys are all the same type
Browse files Browse the repository at this point in the history
also add another test
  • Loading branch information
mr-martian committed Oct 15, 2024
1 parent 741675f commit 9051c92
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 38 deletions.
2 changes: 1 addition & 1 deletion rebabel_format/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def finish_block(self, parent_if_missing=None, keep_uids=False):

# 4. Remove correspondences which don't match existing
# parent-child links
todo = sorted(merge_possible.keys())
todo = sorted(merge_possible.keys(), key=repr)
while todo:
next_todo = []
for name in todo:
Expand Down
127 changes: 90 additions & 37 deletions rebabel_format/test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,43 +66,96 @@ def runTest(self):
for fname in glob.glob('*.toml'):
self.single_test(fname[:-5])

class MergeTest(unittest.TestCase):
class SimpleTest:
def runTest(self):
db_name = self.__class__.__name__ + '.db'
with data_dir(''):
if os.path.isfile('merge.db'):
os.remove('merge.db')
run_command('import', {}, infiles=['data/merge_text.flextext'],
mode='flextext', db='merge.db')
run_command('import', {}, infiles=['data/merge_pos.flextext'],
mode='flextext', db='merge.db',
merge_on={
'interlinear-text': 'meta:index',
'paragraph': 'meta:index',
'phrase': 'meta:index',
'word': 'meta:index',
})
if os.path.isfile(db_name):
os.remove(db_name)
self.commands(db_name)
from rebabel_format.db import RBBLFile
from rebabel_format.query import ResultTable
db = RBBLFile('merge.db')
table = ResultTable(db,
{
'phrase': {'type': 'phrase'},
'word': {'type': 'word', 'parent': 'phrase'},
},
order=['phrase', 'word'])
table.add_features('phrase', ['FlexText:en:segnum', 'meta:index'])
table.add_features('word', ['FlexText:en:txt', 'FlexText:en:pos',
'meta:index'])
results = list(table.results())
self.assertEqual(8, len(results))
first = set(x[0]['phrase'] for x in results[:4])
self.assertEqual(1, len(first))
second = set(x[0]['phrase'] for x in results[:4])
self.assertEqual(1, len(second))
expected = [
('The', 'DET'), ('man', 'NOUN'), ('snores', 'VERB'), ('.', 'PUNCT'),
('The', 'DET'), ('woman', 'NOUN'), ('sings', 'VERB'), ('.', 'PUNCT'),
]
for exp, (nodes, features) in zip(expected, results):
self.assertEqual(exp[0], features[nodes['word']]['FlexText:en:txt'])
self.assertEqual(exp[1], features[nodes['word']]['FlexText:en:pos'])
db = RBBLFile(db_name)
self.checks(db)

class FlexTextMergeTest(SimpleTest, unittest.TestCase):
    """Merge two FlexText imports into one database and verify the result.

    ``commands`` performs the two imports; ``checks`` queries the resulting
    database for phrases and their words and compares them against the
    expected text/part-of-speech pairs.
    """

    def commands(self, db_name):
        """Import ``merge_text.flextext``, then merge ``merge_pos.flextext``.

        The second import matches units to existing ones at every level
        (interlinear-text, paragraph, phrase, word) via ``meta:index``.
        """
        run_command('import', {}, infiles=['data/merge_text.flextext'],
                    mode='flextext', db=db_name)
        run_command('import', {}, infiles=['data/merge_pos.flextext'],
                    mode='flextext', db=db_name,
                    merge_on={
                        'interlinear-text': 'meta:index',
                        'paragraph': 'meta:index',
                        'phrase': 'meta:index',
                        'word': 'meta:index',
                    })

    def checks(self, db):
        """Assert that the merged database holds 2 phrases of 4 words each."""
        from rebabel_format.query import ResultTable
        table = ResultTable(db,
                            {
                                'phrase': {'type': 'phrase'},
                                'word': {'type': 'word', 'parent': 'phrase'},
                            },
                            order=['phrase', 'word'])
        table.add_features('phrase', ['FlexText:en:segnum', 'meta:index'])
        table.add_features('word', ['FlexText:en:txt', 'FlexText:en:pos',
                                    'meta:index'])
        results = list(table.results())
        self.assertEqual(8, len(results))

        # Each result is a (nodes, features) pair.  The first four words must
        # all belong to one phrase and the last four to one phrase; a merge
        # that duplicated phrases instead of matching them would break this.
        first = {nodes['phrase'] for nodes, _ in results[:4]}
        self.assertEqual(1, len(first))
        # Previously this re-checked results[:4], leaving the second phrase
        # unverified.
        second = {nodes['phrase'] for nodes, _ in results[4:]}
        self.assertEqual(1, len(second))

        expected = [
            ('The', 'DET'), ('man', 'NOUN'), ('snores', 'VERB'), ('.', 'PUNCT'),
            ('The', 'DET'), ('woman', 'NOUN'), ('sings', 'VERB'), ('.', 'PUNCT'),
        ]
        for exp, (nodes, features) in zip(expected, results):
            word_features = features[nodes['word']]
            self.assertEqual(exp[0], word_features['FlexText:en:txt'])
            self.assertEqual(exp[1], word_features['FlexText:en:pos'])

class ConlluNLPMergeTest(SimpleTest, unittest.TestCase):
    """Merge an ``nlp_pos`` import into a CoNLL-U import and verify it.

    ``commands`` runs the two imports against the same database; ``checks``
    queries sentences with their words and compares every word's features
    to the expected values.
    """

    def commands(self, db_name):
        """Import ``basic.conllu``, then merge the tagged text file into it.

        Sentences and words from the second import are matched to existing
        units via ``meta:index``.
        """
        merge_keys = {
            'sentence': 'meta:index',
            'word': 'meta:index',
        }
        run_command('import', {}, infiles=['data/basic.conllu'],
                    mode='conllu', db=db_name)
        run_command('import', {},
                    infiles=['data/basic.conllu.nlp_apertium.txt'],
                    mode='nlp_pos', db=db_name, merge_on=merge_keys)

    def checks(self, db):
        """Assert 2 sentences of 4 words, each word carrying both tag sets."""
        from rebabel_format.query import ResultTable

        query = {
            'sentence': {'type': 'sentence'},
            'word': {'type': 'word', 'parent': 'sentence'},
        }
        table = ResultTable(db, query, order=['sentence', 'word'])
        table.add_features('sentence', ['UD:sent_id', 'meta:index'])
        table.add_features('word', ['UD:form', 'UD:upos', 'nlp:form',
                                    'nlp:pos', 'meta:index'])

        rows = list(table.results())
        self.assertEqual(8, len(rows))
        # Words 1-4 must share one sentence, and so must words 5-8.
        for half in (rows[:4], rows[4:]):
            sentence_ids = {row[0]['sentence'] for row in half}
            self.assertEqual(1, len(sentence_ids))

        # (meta:index, form, UD:upos, nlp:pos); the form is expected to be
        # identical under both the UD and nlp tiers.
        tokens = [
            (1, 'The', 'DET', 'det'),
            (2, 'man', 'NOUN', 'n'),
            (3, 'snores', 'VERB', 'vblex'),
            (4, '.', 'PUNCT', 'sent'),
            (1, 'The', 'DET', 'det'),
            (2, 'woman', 'NOUN', 'n'),
            (3, 'sings', 'VERB', 'vblex'),
            (4, '.', 'PUNCT', 'sent'),
        ]
        expected = [
            {
                'meta:index': index,
                'UD:form': form,
                'UD:upos': upos,
                'nlp:form': form,
                'nlp:pos': nlp_pos,
            }
            for index, form, upos, nlp_pos in tokens
        ]

        for (units, features), wanted in zip(rows, expected):
            actual = features[units['word']]
            # Same size plus per-key checks: no missing and no extra features.
            self.assertEqual(len(wanted), len(actual))
            for key, value in wanted.items():
                self.assertIn(key, actual)
                self.assertEqual(value, actual[key])
2 changes: 2 additions & 0 deletions rebabel_format/test/data/basic.conllu.nlp_apertium.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
The/det man/n snores/vblex ./sent
The/det woman/n sings/vblex ./sent

0 comments on commit 9051c92

Please sign in to comment.