-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMakefile
98 lines (76 loc) · 3.59 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
## If you want to generate your own rule set on the basis of a number
## of tokenized documents, then you need to redefine the novels and
## novelsdir variables.
##
## The tok.gz files in the novels directory should be in the tokenized
## Alpino format: every line is a sentence prefixed by a key, for example:
##
##
## 1|The first sentence ...
## 2|The second one !
## 3|Etcetera .
# --- Inputs and tunables ---------------------------------------------------
# Alternative definition of `novels`, left disabled:
#novels = $(wildcard dbnl_without/*.sents.gz)

# Directory holding the tokenized corpus (*.tok.gz files).
novelsdir = /mnt/local/tmp/andreas/DBNL-20230214/output/tokenized
# Every tokenized corpus file found in $(novelsdir).
novels    = $(wildcard $(novelsdir)/*.tok.gz)
# Test inputs that have not been annotated yet.
tests     = $(wildcard TestWithout/*.tok)
# Frequency cut-off: words.freq keeps a token only if its count exceeds this.
threshold = 5
# Top-level entry points. Neither names a real file, so both are declared
# phony; otherwise a stray file called `build` or `all` would silently turn
# the target into a no-op.
.PHONY: build all

# build: derive the rule-set files (nouns, adj_pair, spelling).
build: nouns adj_pair spelling

# all: annotate btest, collect the annotation statistics and annotate
# every corpus and test file.
all: btest.sents all.alts apply-all
# words.freq: token frequency list for the whole corpus, most frequent first.
# The sentence key (everything up to the first '|') is stripped, tokens are
# split on spaces, and only tokens whose count exceeds $(threshold) are kept.
# For now, words including '|' are removed because Alpino -lex_all will do
# funny things with them.
# NOTE(review): the original comment also listed '[' and ']', but the final
# grep only drops lines containing '|' - confirm which one is intended.
words.freq: $(novels)
	find $(novelsdir) -name '*.tok.gz' | xargs zcat |\
	  sed -e 's/^[^|]*[|]//' |\
	  tr ' ' '\n' |\
	  sort | uniq -c | sort -nr |\
	  awk '{ if ($$1 > $(threshold)) print $$0 }' |\
	  grep -v '[|]' > $@
# Pre-filter applied to the word list before it is handed to Alpino.
# `cat` passes every word through unchanged.
filter = cat
## not much worth the trouble
## filter=grep -v -e '^[0-9_,+./!?;:(){}\\]' -e '^[†$$=±§€£-][0-9_,+./!?;:(){}\\]' -e '[.]$$'

# words.unknowns: the second column of words.freq (the token), run through
# $(filter) and then through Alpino with the report_missing_lex script.
words.unknowns: words.freq report_missing_lex.pl
	awk '{ print $$2 }' $< | $(filter) |\
	  Alpino -notk -l report_missing_lex batch_command=go > $@
# auto: output of Alpino's generate_alt_spelling script.
# Input is the second field of every words.unknowns line containing "unknown:".
auto: words.unknowns generate_alt_spelling.pl
	grep unknown: $< | awk '{print $$2 }' |\
	  Alpino -notk -l generate_alt_spelling batch_command=go > $@
# handc: output of add_cap.py run over the hand-written list `hand`.
# It has its own rule so that the file is a real target and is regenerated
# when either `hand` or the script changes.
# NOTE(review): presumably add_cap.py adds capitalised variants - confirm.
handc: hand add_cap.py
	python3 add_cap.py < $< > $@

# spelling: sorted, de-duplicated union of the automatically generated
# rules (auto), the hand-written ones (hand) and handc.
spelling: auto hand handc
	sort -u $^ > $@
# check_spelling: print every first-column entry that occurs more than once
# in `spelling`. Produces no file, hence phony.
.PHONY: check_spelling
check_spelling: spelling
	awk '{ print $$1 }' $< | sort | uniq -d
# pipe: the annotation pipeline shared by all "apply" rules. It reads
# sentences on stdin and writes annotated sentences to stdout:
#   triples.py -> meta.py (spelling) -> sed (map.sed) -> meta.py (hand2)
pipe = python3 triples.py | python3 meta.py spelling | sed -f map.sed | python3 meta.py hand2

# %.sents: annotate an arbitrary file FOO into FOO.sents by sending it
# through $(pipe).
%.sents: % meta.py triples.py nouns adj_pair det_pair map.sed hand2 spelling
	cat $< | $(pipe) > $@
# all.alts: frequency table (most frequent first) of every " [...]"
# bracketed span found in the annotated corpus files.
all.alts: dbnl_with/*.sents.gz
	zcat dbnl_with/*.sents.gz |\
	  grep -o ' [[][^]]*[]]' |\
	  sort | uniq -c | sort -nr > $@
# adjn: candidate words taken from the corpus.
# Every " den <word>en <word> " match is found (the first word ends in a
# consonant followed by "en"); the word directly after "den" is extracted
# and kept when it occurs more than 10 times.
# The corpus files are listed as prerequisites, as words.freq does, so the
# list is refreshed when the corpus changes.
adjn: $(novels)
	find $(novelsdir) -name '*.tok.gz' | xargs zcat |\
	  grep -o ' den [a-z]*[abcdfghjklmnpqrstvwxyz]en [a-z]* ' |\
	  awk '{print $$2 }' | sort | uniq -c | sort -nr |\
	  awk '{ if ($$1>10) print $$2}' > $@
# adj_pair: output of Alpino's generate_adj_pair script (batch command
# go_adjs), fed with the sorted adjn list; adjacent duplicate output lines
# are collapsed.
adj_pair: adjn generate_adj_pair.pl generate_alt_spelling.pl
	sort $< |\
	  Alpino -notk -l generate_adj_pair batch_command=go_adjs |\
	  uniq > $@
# qnouns: candidate nouns taken from the corpus.
# Matches "<determiner> <word ending in en> <word of 3+ characters> " after a
# space or a '|' and keeps the third word of each match, sorted and unique.
# Uses `grep -E` because `egrep` is deprecated, and lists each determiner
# once (the original repeated "mijnen"). The corpus files are prerequisites,
# as for words.freq, so the list follows corpus changes.
qnouns: $(novels)
	find $(novelsdir) -name '*.tok.gz' | xargs zcat |\
	  grep -E -o '[ |](mijnen|dezen|den|zulken|een|eenen|hunnen|menigen|haren|zijnen) [^ ][^ ]*en [^ ][^ ][^ ]* ' |\
	  awk '{ print $$3 }' | sort -u > $@
# nouns: output of Alpino's generate_nouns script (batch command go_nouns,
# run with unknowns=off) over the qnouns candidates; adjacent duplicate
# output lines are collapsed.
nouns: qnouns generate_nouns.pl generate_alt_spelling.pl
	cat $< |\
	  Alpino -notk unknowns=off -l generate_nouns batch_command=go_nouns |\
	  uniq > $@
# dbnl_with/X.sents.gz: annotated version of dbnl_without/X.sents.gz,
# produced by decompressing, running $(pipe) and re-compressing.
# meta.py is listed as a prerequisite because $(pipe) runs it (the %.sents
# rule already lists it); the output directory is created on demand.
dbnl_with/%.sents.gz: dbnl_without/%.sents.gz meta.py triples.py spelling map.sed hand2 nouns adj_pair det_pair
	@mkdir -p $(@D)
	zcat $< | $(pipe) | gzip > $@
# TestWith/X.tok: annotated version of TestWithout/X.tok, produced by
# running the file through $(pipe).
# meta.py is listed as a prerequisite because $(pipe) runs it (the %.sents
# rule already lists it); the output directory is created on demand.
TestWith/%.tok: TestWithout/%.tok meta.py triples.py spelling map.sed hand2 nouns adj_pair det_pair
	@mkdir -p $(@D)
	cat $< | $(pipe) > $@
# apply-all: annotate every dbnl_without/*.sents.gz file into dbnl_with/ and
# every TestWithout/*.tok file into TestWith/.
# The corpus half used to be derived from $(novels), but that variable lists
# $(novelsdir)/*.tok.gz files, which never match the dbnl_without/%.sents.gz
# pattern, so no dbnl_with file was requested. The inputs are now globbed
# directly. The target names no file and is therefore phony.
.PHONY: apply-all
apply-all: $(patsubst dbnl_without/%.sents.gz,dbnl_with/%.sents.gz,$(wildcard dbnl_without/*.sents.gz)) \
           $(tests:TestWithout/%.tok=TestWith/%.tok)
## check: list lines in *.sents and *.sents.gz where a '[' is followed by
## another '[' with no ']' in between, ignoring lines that contain a
## backslash directly followed by '['.
## Each pipeline ends in cat so make does not complain that there is an
## error if there is no output (which is what we want!).
## Declared phony so that a file named `check` cannot disable the target.
.PHONY: check
check:
	grep '[[][^]]*[[]' *.sents |grep -v '[\][[]' |cat
	zgrep '[[][^]]*[[]' *.sents.gz |grep -v '[\][[]' |cat
### undo: "apply" the meta-annotation, so that only the "modern" text remains.
### Usage: make undo file=SOMEFILE.gz   (the result is written to stdout)
### Three substitutions are performed on every line:
###   [ @alt ~X~Y ... ]                  -> X
###   [ @alt|@mwu_alt|@phantom W ... ]   -> W
###   [ @skip ... ] and trailing spaces  -> removed
### Declared phony because it never creates a file called `undo`.
.PHONY: undo
undo:
	@zcat $(file) | perl -p -e 's!\[ \@alt ~(\w+?)~\w+ .*?\]!$$1!g; s!\[ \@(alt|mwu_alt|phantom) (\w+) .*?\]!$$2!g; s!\[ \@skip .*? \] *!!g;'