This repository has been archived by the owner on Jul 13, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sensevalapi.py
82 lines (76 loc) · 3.73 KB
/
sensevalapi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
try:
    from xml.etree import cElementTree as ElementTree
except ImportError:  # xml.etree.cElementTree was removed in Python 3.9
    from xml.etree import ElementTree
# XML file parsed by senseval_data(). A bare file name, so it is resolved
# against the current working directory.
SENSEVAL3_TEST_DATA_FILE = "english-all-words.xml"
# Answer key read by senseval_answers(). Also a bare file name.
SENSEVAL3_TEST_ANSWERS_FILE = "EnglishAW.test.key"
def _sentence_record(sentence, test_words, test_phrases):
    """Bundle the three per-sentence buffers into one output record."""
    return {"sentence": sentence, "test_words": test_words, "test_phrases": test_phrases}


def senseval_data(data_file=None):
    """Split the test corpus into sentence records.

    Every ``text`` element directly under the XML root is walked in document
    order.  The text of ``head`` and ``sat`` elements and the free text
    between elements are lowercased and collected into the current sentence.

    * A ``head`` element without a ``sats`` attribute is recorded in
      ``test_words`` as ``{id: word}``.
    * A ``head`` element with a ``sats`` attribute is recorded in
      ``test_phrases`` as
      ``{"headword": (id, word), "sats": [sat ids...]}``.
    * The tokens ``.``, ``!``, ``?``, ``--`` and ``:`` close the current
      sentence.
    * ``-LRB-`` suspends the current sentence and starts a new one for the
      bracketed clause; ``-RRB-`` emits that clause as its own record and
      resumes the suspended sentence.  Brackets may nest.
    * Purely numeric tokens, tokens starting with ``*``, commas and
      double-quote tokens are dropped.

    Args:
        data_file: Path (or file object) of the XML corpus.  Defaults to
            ``SENSEVAL3_TEST_DATA_FILE``.

    Returns:
        A list of dicts with the keys ``"sentence"`` (list of lowercase
        tokens), ``"test_words"`` (dict) and ``"test_phrases"`` (list), in
        the order the sentences were closed.

    Raises:
        ValueError: If an element other than ``text``, ``head`` or ``sat``
            is found inside a ``text`` element.
        AttributeError: If a ``head`` or ``sat`` element has no text.
    """
    # TODO: Add part of speech of each word using WordNet
    if data_file is None:
        data_file = SENSEVAL3_TEST_DATA_FILE

    # Tokens dropped outright.  Both spellings of the double quote are listed
    # because the parser decodes the entity, but pre-escaped data may still
    # carry the literal "&quot;" text.
    ignored_tokens = (",", '"', "&quot;")
    sentence_enders = (".", "!", "?", "--", ":")

    all_sentences = []
    sentence, test_words, test_phrases = [], {}, []
    # Stack of outer (sentence, test_words, test_phrases) buffers that were
    # suspended by a left bracket and are waiting for the matching right one.
    suspended = []

    root = ElementTree.parse(data_file).getroot()
    for text in root.findall("text"):
        # iter() yields the <text> element itself first, then its descendants.
        for elem in text.iter():
            if elem.tag == "text":
                trailing = elem.text
            elif elem.tag == "sat":
                sentence.append(elem.text.lower())
                trailing = elem.tail
            elif elem.tag == "head":
                word = elem.text.lower()
                if "sats" in elem.attrib:
                    test_phrases.append({"headword": (elem.attrib["id"], word),
                                         "sats": elem.attrib["sats"].split()})
                else:
                    test_words[elem.attrib["id"]] = word
                sentence.append(word)
                trailing = elem.tail
            else:
                raise ValueError("tag of unidentified kind: " + elem.tag)

            # text/tail is None when nothing follows the tag, so treat that
            # as empty.  Tokens are lowercased here, which is why the bracket
            # markers below are compared in lowercase.
            for token in (trailing or "").lower().split():
                if token.isdigit() or token[0] == "*" or token in ignored_tokens:
                    continue
                if token in sentence_enders:
                    all_sentences.append(_sentence_record(sentence, test_words, test_phrases))
                    sentence, test_words, test_phrases = [], {}, []
                elif token == "-lrb-":
                    suspended.append((sentence, test_words, test_phrases))
                    sentence, test_words, test_phrases = [], {}, []
                elif token == "-rrb-":
                    all_sentences.append(_sentence_record(sentence, test_words, test_phrases))
                    if suspended:
                        sentence, test_words, test_phrases = suspended.pop()
                    else:
                        # Unmatched right bracket: nothing to resume.
                        sentence, test_words, test_phrases = [], {}, []
                else:
                    sentence.append(token)

    # Flush whatever was still open at the end of the input, then any outer
    # sentences left suspended by an unclosed bracket (innermost first).
    if sentence or test_words:
        all_sentences.append(_sentence_record(sentence, test_words, test_phrases))
    while suspended:
        sentence, test_words, test_phrases = suspended.pop()
        if sentence or test_words:
            all_sentences.append(_sentence_record(sentence, test_words, test_phrases))
    return all_sentences
def senseval_answers(answers_file=None):
    """Read the answer key and count the answerable instances.

    Each non-blank line is split on whitespace.  The second field is taken
    as the instance id and all remaining fields as the accepted answers.
    The first field is ignored.

    Args:
        answers_file: Path of the answer key.  Defaults to
            ``SENSEVAL3_TEST_ANSWERS_FILE``.

    Returns:
        A tuple ``(answer_dicts, total_answers)``.  ``answer_dicts`` is a
        list of ``{"id": ..., "lemmas": [...]}`` dicts in file order.
        ``total_answers`` is the number of entries whose first answer is
        not ``"U"``.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    if answers_file is None:
        answers_file = SENSEVAL3_TEST_ANSWERS_FILE

    answer_dicts = []
    total_answers = 0
    with open(answers_file) as answer_file:
        # Iterating by line keeps the last entry even when the file has no
        # trailing newline.
        for line in answer_file:
            fields = line.split()
            if not fields:
                continue  # blank line
            lemmas = fields[2:]
            answer_dicts.append({"id": fields[1], "lemmas": lemmas})
            # "U" catches the case where WordNet doesn't provide the proper
            # sense; an entry with no answers at all is not counted either.
            if lemmas and lemmas[0] != "U":
                total_answers += 1
    return answer_dicts, total_answers