-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentity_span_agreement.pl
executable file
·123 lines (104 loc) · 4.05 KB
/
entity_span_agreement.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env perl
use File::Basename;
use Data::Dumper;
BEGIN {
my $dirname = dirname(__FILE__);
push(@INC, "$dirname/scorer/v4/lib");
}
use CorScorer;
# Parse a file containing source file names and the offset spans for
# each mention of an entity, formatted one-entity-per-line as:
# source_name<tab>mention1begin-mention1end<tab>mention2begin-mention2end<tab>...
# source_name<tab>mention1begin-mention1end<tab>mention2begin-mention2end<tab>...
# ...
# The result is a reference to a hash mapping each source file name to an
# array reference of entities, where each entity is an array of mentions
# and each mention is a length-two array of the mention's begin and end
# offsets (inclusive). For example:
#
# $result = {
# "doc1.txt" => [
# [ [1,3], [45,45], [57,62] ],
# [ [5,5], [25,27] ],
# ...
# ],
# "doc2.txt" => [
# ...
# ],
# ...
# };
#
# In doc1.txt, there are two entities. The first is composed of 3
# mentions spanning offsets 1 to 3, 45 to 45 and 57 to 62. The second
# is composed of 2 mentions spanning offsets from 5 to 5 and 25 to 27.
sub GetSourceToEntitiesHash
{
    # Purpose: read an entity-span file and group its entities by source name.
    #
    # Parameters:
    #   $file - path of a tab-separated file; each line holds a source name
    #           followed by one "begin-end" span per mention of one entity.
    #
    # Returns: a reference to a hash mapping each source name to an array
    #          reference of entities (in file order); each entity is an array
    #          reference of [begin, end] pairs.
    #
    # Dies with "Can not open ..." if the file cannot be opened.
    my ($file) = @_;
    my %coref;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a pipe, and the handle cannot collide with a
    # global bareword handle used elsewhere in the program.
    open(my $fh, '<', $file) or die "Can not open $file: $!";
    while (my $line = <$fh>) {
        chomp($line);
        # Tolerate CRLF input: a stray carriage return would otherwise stay
        # glued to the last offset on the line.
        $line =~ s/\r\z//;
        # Skip blank lines (e.g. a trailing empty line); otherwise they would
        # register a bogus empty entity under an undefined source name.
        next unless $line =~ /\S/;

        my @columns = split(/\t/, $line);
        my $source_name = shift(@columns);
        my @entity;
        for my $span_string (@columns) {
            # "begin-end" -> [begin, end]; anything after a second "-" is
            # ignored, and a missing part is left undefined.
            my @begin_end = split(/-/, $span_string);
            push(@entity, [$begin_end[0], $begin_end[1]]);
        }
        push(@{$coref{$source_name}}, \@entity);
    }
    close($fh);
    return \%coref;
}
# ======================
# Main evaluation script
# ======================
# Reads a key file and a response file (both in the format accepted by
# GetSourceToEntitiesHash), runs every scorer over each source file, sums
# the per-file precision/recall numerators and denominators, and finally
# prints the aggregated result of each scorer via CorScorer::ShowRPF.

# parse the key and response files to get entities
# ($0 keeps the usage line in sync with whatever name the script is run under)
die "usage: $0 keys-file response-file\n" unless @ARGV == 2;
my %keyFileEntities = %{GetSourceToEntitiesHash(shift(@ARGV))};
my %responseFileEntities = %{GetSourceToEntitiesHash(shift(@ARGV))};

# define the set of scorers that will be applied; @scorerNames fixes the
# order in which they run and are reported
my @scorerNames = ("MUC", "B^3", "CEAFm", "CEAFe");
my %scorers = (
    "MUC" => \&CorScorer::MUCScorer,
    "B^3" => \&CorScorer::BCUBED,
    # Both CEAF variants share one routine; the trailing flag selects the
    # variant (presumably 1 = mention-based, 0 = entity-based -- confirm
    # against CorScorer::CEAF).
    "CEAFm" => (sub {CorScorer::CEAF(@_, 1)}),
    "CEAFe" => (sub {CorScorer::CEAF(@_, 0)}),
);
my %scorerResults;

# Score every source file that appears in the key file, the response file, or
# both (the union, not the intersection). The set does not depend on the
# scorer, so it is computed once up front.
my %sourceFileSet = map { $_ => 1 } (keys %keyFileEntities, keys %responseFileEntities);
my @sourceNames = sort(keys %sourceFileSet);

my $separator = "--------------------------------------------------------------------------\n";

# invoke each scorer to compare the key and response entities
print($separator);
foreach my $scorerName (@scorerNames) {
    print("** $scorerName **\n");
    print($separator);

    # precision and recall numerators and denominators
    my %idenTotals = (recallDen => 0, recallNum => 0, precisionDen => 0, precisionNum => 0);
    my ($recallNum, $recallDen, $precisionNum, $precisionDen) = (0,0,0,0);

    foreach my $source_name (@sourceNames) {
        print("Source file: $source_name\n");

        # transform entity lists into input for scorers; a source file that is
        # missing from one side contributes an empty entity list rather than
        # an undefined value
        my ($keyChains, $responseChains) = CorScorer::IdentifMentions(
            $keyFileEntities{$source_name} || [],
            $responseFileEntities{$source_name} || [],
            \%idenTotals);

        # invoke scorer to get precision and recall numerators and denominators for this file
        my ($nr, $dr, $np, $dp) = $scorers{$scorerName}->($keyChains, $responseChains);

        # update global precision and recall numerators and denominators
        $recallNum += $nr;
        $recallDen += $dr;
        $precisionNum += $np;
        $precisionDen += $dp;
    }

    # store the arguments to CorScorer::ShowRPF so that results can be printed together at the end
    $scorerResults{$scorerName} = [$recallNum, $recallDen, $precisionNum, $precisionDen];
}

# print out scores
foreach my $scorerName (@scorerNames) {
    print("** $scorerName **\n");
    CorScorer::ShowRPF(@{$scorerResults{$scorerName}});
}