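"""Train and evaluate a word-presence and count-feature scoring model.

For each input item, this script builds a pipeline that strips punctuation,
normalizes whitespace, and lower-cases the text, extracts keyword-presence
and word/character-count features, trains a random forest on the ``score1``
target, and writes per-item quadratic weighted kappa (QWK) scores to a
markdown table.
"""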
import argparse
import os
from asap.core import load_train_instances, load_test_instances, gather_input_files, Pipeline
from asap.core.features import (
    Tokenizer, ContainsWords, UniqueWordCount, WordCount, CharacterCount,
    NonWhitespaceCharacterCount,
)
from asap.core.preprocess import PunctuationStripper, WhitespaceNormalizer, LowerCaser
from asap.core.ml import RandomForest
from asap.core.runners import PipelineRunner
from asap.metrics import write_qwk_markdown_table


def parse_args():
    """Parse the input/output paths and random-forest hyperparameters."""
    ap = argparse.ArgumentParser()
    ap.add_argument("input_path")
    ap.add_argument("output_path")
    ap.add_argument("num_trees", type=int)
    ap.add_argument("max_depth", type=int)
    return ap.parse_args()
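
# Example invocation (the paths and hyperparameter values are illustrative):
#   python 02.1-word-presence-and-counts.py data/input results 100 10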


def make_pipeline(words_path, trees, depth):
    """Build the preprocessing, feature-extraction, and modeling pipeline."""
    pipe = Pipeline()
    # Preprocessing: strip punctuation, collapse whitespace, lower-case
    pipe.add_phase(PunctuationStripper())
    pipe.add_phase(WhitespaceNormalizer())
    pipe.add_phase(LowerCaser())
    # Features: tokens, keyword presence, and word/character counts
    pipe.add_phase(Tokenizer())
    pipe.add_phase(ContainsWords(words_path))
    pipe.add_phase(UniqueWordCount())
    pipe.add_phase(WordCount())
    pipe.add_phase(CharacterCount())
    pipe.add_phase(NonWhitespaceCharacterCount())
    # ML: random forest over the extracted features, predicting score1
    pipe.add_phase(RandomForest(num_trees=trees, max_depth=depth, target="score1",
                                features=["word-presence", "unique-word-count",
                                          "word-count", "char-count",
                                          "!white-char-count"]))
    return pipe
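
# The feature names passed to RandomForest above are assumed to match the
# identifiers emitted by the corresponding feature phases (e.g. ContainsWords
# -> "word-presence", NonWhitespaceCharacterCount -> "!white-char-count");
# see asap.core.features if the mapping differs.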


if __name__ == "__main__":
    args = parse_args()

    # Gather the (item id, train path, test path) triples to process
    inputs = gather_input_files(args.input_path)

    qwk_scores = {}
    for num_id, train_path, test_path in inputs:
        print(f"Working on [{num_id}]")

        # Make the per-item output directory (tolerate reruns)
        output_dir = os.path.join(args.output_path, num_id)
        os.makedirs(output_dir, exist_ok=True)

        print("Loading data...", end='')
        # Load instance data
        train = load_train_instances(train_path)
        test = load_test_instances(test_path)
        print("Done")

        print("Creating pipeline...", end='')
        # Create the pipeline with the keyword list for this item
        pipe = make_pipeline(f"data/keywords/keywords-{num_id}.txt",
                             args.num_trees, args.max_depth)
        print("Done")

        # Train, predict on the test set, evaluate, and save the results
        runner = PipelineRunner(pipe, test, 'score1', 'prediction', train,
                                output_dir, evaluate=True, save=True)
        runner.run()
        qwk_scores[num_id] = runner.qwk_score

    # Write the per-item QWK scores, plus their mean, as a markdown table
    qwk_score_path = os.path.join(args.output_path, 'qwk_scores.md')
    scores = [qwk_scores[num_id] for num_id, _, _ in inputs]
    scores.append(sum(qwk_scores.values()) / len(qwk_scores))
    write_qwk_markdown_table(scores, qwk_score_path)
    print(f"Wrote QWK scores to {qwk_score_path}")
    print("Done")