-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathcollect_propn_sequences.pl
executable file
·109 lines (107 loc) · 3.18 KB
/
collect_propn_sequences.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env perl
# Reads a CoNLL-U file and watches for sequences of words tagged PROPN. In certain
# treebanks, especially the UD treebanks donated by Google, multi-word named entities
# were all tagged PROPN, regardless of the fact that some words were not proper
# nouns and not even nouns. This approach does not comply with the UD guidelines
# and it should be corrected; however, the information that there is a multi-word
# named entity is interesting and we don't want to lose it, so we will keep it
# in the MISC field.
# Copyright © 2020 Dan Zeman <[email protected]>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
my %ne;
my $current_ne = '';
my $mwtto = 0;
my $mwt_form = '';
my $mwt_no_space_after = 0;
my $mwt_is_propn;
while(<>)
{
s/\r?\n$//;
if(m/^\d+-(\d+)\t/)
{
$mwtto = $1;
my @f = split(/\t/, $_);
$mwt_form = $f[1];
$mwt_no_space_after = scalar(grep {m/^SpaceAfter=No$/} (split(/\|/, $f[9])));
$mwt_is_propn = 1; # any non-PROPN word within the MWT will clear this flag
}
elsif(m/^(\d+)\t/)
{
my @f = split(/\t/, $_);
if($mwtto > 0 && $f[0] <= $mwtto)
{
# We are in a non-final word of a multi-word token and it is not a PROPN.
# If the buffer contains a named entity, it will not include any part of
# the MWT, even if its initial words were PROPN.
if($f[3] ne 'PROPN')
{
if($current_ne ne '')
{
$current_ne =~ s/\s+$//;
$ne{$current_ne}++;
}
$current_ne = '';
$mwt_is_propn = 0;
}
if($f[0] == $mwtto)
{
if($mwt_is_propn)
{
$current_ne .= $mwt_form;
unless($mwt_no_space_after)
{
$current_ne .= ' ';
}
}
$mwtto = 0;
$mwt_form = '';
$mwt_no_space_after = 0;
$mwt_is_propn = undef;
}
}
# Normal word/token that is not part of a multi-word token.
else
{
if($f[3] eq 'PROPN')
{
$current_ne .= $f[1];
unless(grep {m/^SpaceAfter=No$/} (split(/\|/, $f[9])))
{
$current_ne .= ' ';
}
}
else
{
if($current_ne ne '')
{
$current_ne =~ s/\s+$//;
$ne{$current_ne}++;
}
$current_ne = '';
}
}
}
elsif(m/^\s*$/)
{
if($current_ne ne '')
{
$current_ne =~ s/\s+$//;
$ne{$current_ne}++;
}
$current_ne = '';
$mwtto = 0;
$mwt_form = '';
$mwt_no_space_after = 0;
$mwt_is_propn = undef;
}
}
my @ne = sort {my $r = $ne{$b} <=> $ne{$a}; unless($r) {$r = $a cmp $b} $r} (keys(%ne));
foreach my $ne (@ne)
{
print("$ne\t$ne{$ne}\n");
}