forked from sebastianruder/emotion_proposition_store
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUtils.java
423 lines (367 loc) · 18.2 KB
/
Utils.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
import com.sun.javaws.exceptions.InvalidArgumentException;
import edu.jhu.agiga.*;
import edu.stanford.nlp.trees.Tree;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.StringWriter;
import java.util.*;
/**
* Utility functions for extracting tokens or trees.
*
* Created by sebastian on 12/04/15.
*/
public class Utils {
// Personal pronouns; tokens equal to one of these (case-sensitive) are candidates for coreference replacement.
// Final and unmodifiable so that this shared, publicly visible list cannot be reassigned or edited at runtime.
public static final List<String> pronouns = Collections.unmodifiableList(
        Arrays.asList("I", "you", "he", "she", "it", "him", "her", "we", "they", "me", "us", "them"));
// Possessive pronouns; these keep their surface form even when lemmas are requested.
private static final List<String> possessivePronouns = Collections.unmodifiableList(
        Arrays.asList("my", "your", "his", "her", "its", "our", "their"));
// Named entity tags that are recognised when named entities are to be marked in the output.
private static final List<String> NEtags = Collections.unmodifiableList(
        Arrays.asList("NUMBER", "PERSON", "LOCATION", "ORGANIZATION"));
/**
 * Groups the mentions of all coreference chains by the sentence they occur in, pairing every mention with the
 * representative mention of its chain.
 * @param corefs the coreference chains (AgigaCoref) to index
 * @return a map from sentence index to the (mention, representative) pairs of that sentence; the representative
 *         is {@code null} for chains in which no mention is flagged as representative
 */
public static Map<Integer, List<Map.Entry<AgigaMention, AgigaMention>>> createMentionMap(List<AgigaCoref> corefs) {
    Map<Integer, List<Map.Entry<AgigaMention, AgigaMention>>> bySentence =
            new HashMap<Integer, List<Map.Entry<AgigaMention, AgigaMention>>>();
    for (AgigaCoref chain : corefs) {
        List<AgigaMention> chainMentions = chain.getMentions();
        AgigaMention representative = firstRepresentative(chainMentions);
        for (AgigaMention current : chainMentions) {
            Integer sentenceKey = current.getSentenceIdx();
            List<Map.Entry<AgigaMention, AgigaMention>> pairs = bySentence.get(sentenceKey);
            if (pairs == null) {
                // first mention seen for this sentence
                pairs = new ArrayList<Map.Entry<AgigaMention, AgigaMention>>();
                bySentence.put(sentenceKey, pairs);
            }
            pairs.add(new AbstractMap.SimpleEntry<AgigaMention, AgigaMention>(current, representative));
        }
    }
    return bySentence;
}

/**
 * Finds the first mention that is flagged as representative.
 * @param candidates the mentions of one coreference chain
 * @return the first representative mention, or {@code null} if none is flagged
 */
private static AgigaMention firstRepresentative(List<AgigaMention> candidates) {
    for (AgigaMention candidate : candidates) {
        if (candidate.isRepresentative()) {
            return candidate;
        }
    }
    return null;
}
/**
 * Returns the lemmatised, part-of-speech-tagged words of the outermost NP or S(BAR) around the cause that is
 * found within a few levels above the leaf.
 * <p>
 * Walking upwards from the leaf, the first ancestor whose label equals {@code label} or {@code label + "BAR"}
 * while its own parent carries neither label is selected; its span determines the tokens that are returned.
 * @param root the root of the tree
 * @param leaf the leaf node (cause)
 * @param label the label of the cause (either NP or S)
 * @param tokens a list of Agiga tokens of the sentence
 * @return the tokens of the selected constituent as "lemma/POS" separated by whitespace (punctuation excluded),
 *         or the empty string if no suitable ancestor is found
 */
public static String getBagOfWords(Tree root, Tree leaf, String label, List<AgigaToken> tokens) {
    final String barLabel = label + "BAR";
    // height 1 above the leaf is the preterminal, so the first real ancestor sits at height 2
    final int lowestHeight = 2;
    final int highestHeight = 6;
    for (int level = lowestHeight; level <= highestHeight; level++) {
        try {
            Tree candidate = leaf.ancestor(level, root);
            Tree candidateParent = leaf.ancestor(level + 1, root);
            String candidateLabel = candidate.label().toString();
            String parentLabel = candidateParent.label().toString();
            boolean candidateMatches = candidateLabel.equals(label) || candidateLabel.equals(barLabel);
            boolean parentMatches = parentLabel.equals(label) || parentLabel.equals(barLabel);
            if (candidateMatches && !parentMatches) {
                int firstToken = candidate.getSpan().getSource();
                int lastToken = candidate.getSpan().getTarget();
                // the span end is inclusive, subList's upper bound is exclusive
                return createStringFromTokens(tokens.subList(firstToken, lastToken + 1), true, true, false, true);
            }
        } catch (ArrayIndexOutOfBoundsException exception) {
            // is thrown when node doesn't have a child; try the next level
        } catch (NullPointerException exception) {
            // is thrown when ancestor is null as tree ends; try the next level
        }
    }
    return "";
}
/**
 * Renders a list of tokens as a single whitespace-separated string. Part-of-speech tag and token index, when
 * requested, are attached to each token with a slash (in that order).
 * @param tokens a list of Agiga tokens
 * @param lemma {@code true} to output the lemma of each token, {@code false} to output the word itself
 * @param pos {@code true} to append "/" and the part-of-speech tag
 * @param idx {@code true} to append "/" and the token index
 * @param excludePunctuation {@code true} to skip the tokens ":", ",", "''", "_" and "``"
 * @return the concatenated token string without leading or trailing whitespace
 */
public static String createStringFromTokens(List<AgigaToken> tokens, boolean lemma, boolean pos, boolean idx,
                                            boolean excludePunctuation) {
    StringBuilder out = new StringBuilder();
    for (AgigaToken token : tokens) {
        if (excludePunctuation && isExcludedPunctuation(token.getWord())) {
            continue;
        }
        String surface = lemma ? token.getLemma() : token.getWord();
        out.append(surface);
        if (pos) {
            out.append("/").append(token.getPosTag());
        }
        if (idx) {
            out.append("/").append(token.getTokIdx());
        }
        out.append(" ");
    }
    return out.toString().trim();
}

/**
 * Tells whether a word is one of the punctuation tokens that are dropped from token strings.
 * @param word the surface form of a token
 * @return {@code true} for ":", ",", "''", "_" and "``"
 */
private static boolean isExcludedPunctuation(String word) {
    return word.equals(":") || word.equals(",") || word.equals("''") || word.equals("_") || word.equals("``");
}
/**
 * Renders a comp as a tab-separated record: subject, predicate lemma, direct object and, optionally, a bracketed
 * list of the predicate's prepositional objects ("[prep:obj, prep:obj]"). The predicate is always written as a
 * lemma; subject and objects can optionally be lemmatised and have coreferents and named entities replaced.
 * @param gov the index of the comp predicate (dependent of the comp, governor of nsubj and dobj relationships in comp)
 * @param colDeps the list of collapsed AgigaTypedDependencies of the sentence
 * @param tokens the list of tokens of the sentence
 * @param sentences the list of sentences in the document
 * @param mentionPairs a list of pairs of mentions and their representative mention
 * @param NEtokens the tokens of the sentence tagged with named entities
 * @param addPObj if prepositional objects should be added
 * @param replaceCoref if coreferents should be replaced
 * @param asLemma if the string should be returned in lemma form
 * @param addNER if named entities should be replaced with their named entity tags
 * @return the string of the comp; subject and object fields are empty when the predicate has none
 */
public static String compToString(int gov, List<AgigaTypedDependency> colDeps, List<AgigaToken> tokens,
                                  List<AgigaSentence> sentences, List<Map.Entry<AgigaMention,
                                  AgigaMention>> mentionPairs, String[] NEtokens,
                                  boolean addPObj, boolean replaceCoref, boolean asLemma, boolean addNER) {
    // collect subject and direct object of the predicate; a later match replaces an earlier one
    String subject = "";
    String directObject = "";
    for (AgigaTypedDependency relation : colDeps) {
        String relationType = relation.getType();
        boolean subjectRelation = relationType.equals("nsubj") || relationType.equals("nsubjpass");
        boolean objectRelation = !subjectRelation && relationType.equals("dobj");
        if ((!subjectRelation && !objectRelation) || relation.getGovIdx() != gov) {
            continue;
        }
        String rendered = depToString(relation.getDepIdx(), colDeps, tokens, sentences, mentionPairs, NEtokens,
                addPObj, replaceCoref, asLemma, addNER);
        if (subjectRelation) {
            subject = rendered;
        } else {
            directObject = rendered;
        }
    }

    // subject <TAB> predicate <TAB> object <TAB>
    StringBuilder line = new StringBuilder();
    line.append(subject).append("\t");
    line.append(tokens.get(gov).getLemma()).append("\t");
    line.append(directObject).append("\t");

    // bracketed, comma-separated prepositional objects; nothing at all is written when there are none
    if (addPObj) {
        boolean opened = false;
        for (Map.Entry<String, List<Integer>> pObj : extractPObjs(gov, colDeps).entrySet()) {
            line.append(opened ? ", " : "[");
            opened = true;
            line.append(pObj.getKey());
            line.append(":");
            line.append(buildString(pObj.getValue(), tokens, sentences, mentionPairs, NEtokens, replaceCoref,
                    asLemma, addNER, "_"));
        }
        if (opened) {
            line.append("]");
        }
    }
    return line.toString();
}
/**
 * Renders a subject or object as a string: the noun together with its modifiers, followed -- if requested -- by
 * its prepositional objects in the form " prep:obj". Lemmatisation and the replacement of coreferents and named
 * entities are optional.
 * @param gov the index of the subject / object (dependent of the nsubj / dobj, etc. dependency)
 * @param colDeps a list of collapsed AgigaTypedDependencies
 * @param tokens the tokens in the sentence
 * @param sentences the list of sentences in the document
 * @param mentionPairs a list of pairs of mentions and their representative mention
 * @param NEtokens the list of tokens and their named entity tags
 * @param addPObj if prepositional objects should be added
 * @param replaceCoref if coreferents should be replaced
 * @param asLemma if the string should be returned in lemma form
 * @param addNER if named entities should be replaced with their named entity tags
 * @return the subject or object in string form, without leading or trailing whitespace
 */
public static String depToString(int gov, List<AgigaTypedDependency> colDeps, List<AgigaToken> tokens,
                                 List<AgigaSentence> sentences, List<Map.Entry<AgigaMention,
                                 AgigaMention>> mentionPairs, String[] NEtokens,
                                 boolean addPObj, boolean replaceCoref, boolean asLemma, boolean addNER) {
    StringBuilder text = new StringBuilder();
    // the noun and its modifiers, separated by blanks
    List<Integer> nounPhrase = addModifiers(gov, colDeps);
    text.append(buildString(nounPhrase, tokens, sentences, mentionPairs, NEtokens, replaceCoref, asLemma, addNER,
            " "));
    if (addPObj) {
        // each prepositional object is joined with underscores and prefixed by its preposition
        for (Map.Entry<String, List<Integer>> pObj : extractPObjs(gov, colDeps).entrySet()) {
            String rendered = buildString(pObj.getValue(), tokens, sentences, mentionPairs, NEtokens, replaceCoref,
                    asLemma, addNER, "_");
            text.append(" ").append(pObj.getKey()).append(":").append(rendered);
        }
    }
    return text.toString().trim();
}
/**
 * Build a string from token indexes. Optionally in lemma form, replacing coreferents and named entities.
 * <p>
 * For every index exactly one of the following applies, in this order of precedence: the token is replaced by
 * "NUMBER" (if {@code addNER} and it is tagged NUMBER); a pronoun is replaced by its representative mention
 * (if {@code replaceCoref}); the token is replaced by its lemma (if {@code asLemma} and it is not a possessive
 * pronoun); otherwise the word itself is used. With {@code addNER}, tokens carrying one of the other known
 * named entity tags get "/TAG" appended.
 * @param idxs the indexes of the tokens that the string should be built from
 * @param tokens the tokens in the sentence
 * @param sentences the list of sentences in the document
 * @param mentionPairs a list of pairs of mentions and their representative mention
 * @param NEtokens the list of tokens and their named entity tags, each in the form "token/TAG"
 * @param replaceCoref if coreferents should be replaced
 * @param asLemma if the string should be returned in lemma form
 * @param addNER if named entities should be replaced with their named entity tags
 * @param sep separator (whitespace for regular strings, underscore for prepositional objects)
 * @return the string built from the specified tokens; empty if {@code idxs} is empty
 */
private static String buildString(List<Integer> idxs, List<AgigaToken> tokens, List<AgigaSentence> sentences,
                                  List<Map.Entry<AgigaMention, AgigaMention>> mentionPairs, String[] NEtokens,
                                  boolean replaceCoref, boolean asLemma, boolean addNER, String sep) {
    StringBuilder sb = new StringBuilder();
    boolean first = true;
    for (int i : idxs) {
        AgigaToken token = tokens.get(i);
        String word = token.getWord();
        String str = word;
        // The tag is whatever follows the LAST slash: the token itself may contain slashes (e.g. "1/2/NUMBER"),
        // in which case taking the second slash-separated piece would yield part of the token instead of the tag.
        // Surrounding whitespace (such as a line break after the final token) is not part of the tag.
        String tagged = NEtokens[i];
        int slash = tagged.lastIndexOf('/');
        String NEtoken = slash < 0 ? "" : tagged.substring(slash + 1).trim();
        // replace numbers
        if (addNER && NEtoken.equals("NUMBER")) {
            str = "NUMBER";
        }
        // if mention is pronoun, replace with representative mention if exists
        else if (replaceCoref && pronouns.contains(word)) {
            try {
                str = replaceCoref(str, i, sentences, mentionPairs, asLemma, addNER);
            }
            catch (IOException ex) {
                // the pronoun is kept as is when the representative cannot be rendered
                System.out.println("NE tags couldn't be assigned.");
            }
        }
        else if (asLemma && !possessivePronouns.contains(word)) {
            str = token.getLemma();
        }
        // join with the separator between items rather than stripping a leading character afterwards, which
        // failed for an empty index list and silently assumed a one-character separator
        if (!first) {
            sb.append(sep);
        }
        first = false;
        sb.append(str);
        if (addNER && !NEtoken.equals("NUMBER") && NEtags.contains(NEtoken)) {
            sb.append("/");
            sb.append(NEtoken);
        }
    }
    return sb.toString().trim();
}
/**
 * Collects the index of a noun together with the indexes of its direct modifiers, i.e. the dependents attached
 * to it through an "nn" (compound noun), "amod" (adjective) or "num" (number) dependency.
 * @param gov the index of the noun
 * @param colDeps the list of collapsed AgigaTypedDependencies of the sentence
 * @return the indexes in ascending order; always contains at least {@code gov}
 */
private static List<Integer> addModifiers(int gov, List<AgigaTypedDependency> colDeps) {
    List<Integer> indexes = new ArrayList<Integer>();
    indexes.add(gov);
    for (AgigaTypedDependency candidate : colDeps) {
        String relation = candidate.getType();
        boolean modifierRelation = relation.equals("nn") || relation.equals("amod") || relation.equals("num");
        if (!modifierRelation || candidate.getGovIdx() != gov) {
            continue;
        }
        indexes.add(candidate.getDepIdx());
    }
    // restore sentence order
    Collections.sort(indexes);
    return indexes;
}
/**
 * Finds the prepositional objects governed by a token and maps each preposition to the token indexes of its
 * object (the object noun plus its modifiers). The preposition is the dependency type without its "prep_"
 * prefix. If the same preposition occurs more than once, the object found last replaces the earlier one.
 * @param gov the index of the governing token
 * @param colDeps the list of collapsed AgigaTypedDependencies of the sentence
 * @return a map containing the preposition type and a list of indexes of the prepositional object
 */
private static Map<String, List<Integer>> extractPObjs(int gov, List<AgigaTypedDependency> colDeps) {
    final String prefix = "prep_";
    Map<String, List<Integer>> byPreposition = new HashMap<String, List<Integer>>();
    for (AgigaTypedDependency candidate : colDeps) {
        String relation = candidate.getType();
        if (!relation.startsWith(prefix) || candidate.getGovIdx() != gov) {
            continue;
        }
        String preposition = relation.substring(prefix.length());
        byPreposition.put(preposition, addModifiers(candidate.getDepIdx(), colDeps));
    }
    return byPreposition;
}
/**
 * Replace a mention with its representative mention if one exists.
 * <p>
 * The first mention pair whose mention starts at {@code idx} and that has a representative is used. The
 * representative is rendered from its head token (with modifiers and prepositional objects, without further
 * coreference replacement) in the sentence it occurs in.
 * @param input the mention that should be replaced
 * @param idx the index of the first token of the mention
 * @param sentences a list of sentences of the document
 * @param mentionPairs a list of pairs of mentions and their representative mention; may be {@code null}
 * @param asLemma if the representative should be returned in lemma form
 * @param addNER if named entities in the representative should be replaced with their named entity tags
 * @return the representative mention; {@code input} if no mention with a representative starts at {@code idx}
 * @throws IOException if the named entity tags of the representative's sentence cannot be written
 */
private static String replaceCoref(String input, int idx, List<AgigaSentence> sentences,
                                   List<Map.Entry<AgigaMention, AgigaMention>> mentionPairs,
                                   boolean asLemma, boolean addNER) throws IOException {
    if (mentionPairs == null) {
        // no mention pairs were supplied for this sentence, so there is nothing to replace
        return input;
    }
    for (Map.Entry<AgigaMention, AgigaMention> pair : mentionPairs) {
        AgigaMention mention = pair.getKey();
        AgigaMention rep = pair.getValue();
        if (mention.getStartTokenIdx() != idx) {
            continue;
        }
        if (rep == null) {
            // a pair may carry no representative; dereferencing it would fail, so look for another pair
            continue;
        }
        AgigaSentence repSent = sentences.get(rep.getSentenceIdx());
        List<AgigaToken> repSentTokens = repSent.getTokens();
        List<AgigaTypedDependency> colDeps = repSent.getColDeps();
        // get NE tags; trimmed first so that any whitespace around the written tags (NOTE(review): presumably a
        // trailing line break -- confirm against writeNerTags) neither shifts the indexes nor sticks to a tag
        StringWriter stringWriter = new StringWriter();
        repSent.writeNerTags(stringWriter);
        String[] repNEtokens = stringWriter.toString().trim().split(" ");
        return depToString(rep.getHeadTokenIdx(), colDeps, repSentTokens, sentences, null, repNEtokens,
                true, false, asLemma, addNER);
    }
    return input;
}
/**
 * Joins path segments into one path using the platform's separator. The first segment is the base; every
 * following segment is resolved as a child of the path built so far.
 * @param paths the path segments, at least one
 * @return the combined path
 */
public static String combine(String... paths)
{
    File combined = new File(paths[0]);
    int next = 1;
    while (next < paths.length) {
        combined = new File(combined, paths[next]);
        next++;
    }
    return combined.getPath();
}
/**
 * Deletes all entries directly inside a directory whose names end with the specified extension. Deletion is
 * best effort: entries that cannot be deleted are skipped silently.
 * @param dir the directory
 * @param extension the file name ending that should be deleted; "*" selects every entry
 * @throws IOException if {@code dir} is not a directory or its contents cannot be listed
 */
public static void cleanDirectory(String dir, final String extension) throws IOException {
    File fileDir = new File(dir);
    if (!fileDir.isDirectory()) {
        throw new IOException(String.format("%s is not a valid directory.", dir));
    }
    final boolean matchAll = "*".equals(extension);
    String[] fileNames = fileDir.list(new FilenameFilter() {
        @Override
        public boolean accept(File parent, String name) {
            return matchAll || name.endsWith(extension);
        }
    });
    if (fileNames == null) {
        // File.list returns null on an I/O error (or if the directory vanished after the check above);
        // report that instead of failing with a NullPointerException in the loop below
        throw new IOException(String.format("%s could not be listed.", dir));
    }
    for (String fileName : fileNames) {
        // resolve against the directory itself; the result of delete() is deliberately not checked
        new File(fileDir, fileName).delete();
    }
}
/**
 * Retrieves the names of the entries in a directory that match all of the specified patterns. Each pattern is a
 * regular expression that has to occur somewhere in the name (the name is matched against ".*pattern.*");
 * several patterns are combined with AND. Without any pattern every entry is returned.
 * @param dir the directory from which the files should be retrieved
 * @param patterns the regular expressions that the file name should match
 * @return a list of file names; empty if {@code dir} is not a directory or cannot be listed
 * @throws java.util.regex.PatternSyntaxException if one of the patterns is not a valid regular expression
 */
public static List<String> getFileNames(String dir, final String... patterns) {
    // compile every pattern once up front instead of once per file name
    final List<java.util.regex.Pattern> compiled = new ArrayList<java.util.regex.Pattern>(patterns.length);
    for (String pattern : patterns) {
        compiled.add(java.util.regex.Pattern.compile(String.format(".*%s.*", pattern)));
    }
    String[] fileNames = new File(dir).list(new FilenameFilter() {
        @Override
        public boolean accept(File parent, String name) {
            for (java.util.regex.Pattern pattern : compiled) {
                if (!pattern.matcher(name).matches()) {
                    return false;
                }
            }
            return true;
        }
    });
    if (fileNames == null) {
        // File.list returns null when dir is not a directory or an I/O error occurs
        return Collections.emptyList();
    }
    return Arrays.asList(fileNames);
}
}