# Both paths are pre-set for the cleaned Gutenberg dataset;
# feel free to change them to point at your own data.
GUTENBERG_ZIP=data/Gutenberg.zip
DATA_FILES_DIR=data/Gutenberg/txt/

default: gutenberg_unzip build_trainingset train_test

dirs:
	mkdir -p data
	mkdir -p models
	mkdir -p graph

# Unzip the Gutenberg corpus; warn if the archive has not been downloaded yet.
gutenberg_unzip: dirs
	if ! [ -f ${GUTENBERG_ZIP} ]; then \
		echo "You need to download the Gutenberg.zip file first"; \
		echo "see this URL for more information & download link"; \
		echo "http://web.eecs.umich.edu/~lahiri/gutenberg_dataset.html"; \
	fi
	unzip -d data ${GUTENBERG_ZIP}

# Full training-set pipeline: build the sentence file, then downsample it.
trainingset: build_trainingset downsample_trainingset

# Clean the raw corpus and split it into one sentence per line; the
# bin/sentence_tokenize.py helper is sketched after this listing.
build_trainingset:
	find ${DATA_FILES_DIR} -type f -exec cat {} \; \
		| grep -v '^\s*$$' \
		| sed 's/\[.*\]//g' \
		| tr '[:upper:]' '[:lower:]' \
		| tr '-' ' ' \
		| sed "s/[^a-z0-9\s'\.\?\!]/ /g" \
		| sed 's/\s\+/ /g' \
		| ./bin/sentence_tokenize.py \
		| tr -d '.!?' \
		> data/dataset.sentences

# Downsample the sentence file with bin/downsample.py (also sketched after this listing).
downsample_trainingset:
	cat data/dataset.sentences \
		| ./bin/downsample.py \
		> data/dataset.downsampled

# Apply the same cleaning pipeline to a raw validation file, then downsample it.
build_validation_set:
	cat data/validation.raw \
		| grep -v '^\s*$$' \
		| sed 's/\[.*\]//g' \
		| tr '[:upper:]' '[:lower:]' \
		| tr '-' ' ' \
		| sed "s/[^a-z0-9\s'\.\?\!]/ /g" \
		| sed 's/\s\+/ /g' \
		| ./bin/sentence_tokenize.py \
		| tr -d '.!?' \
		> data/validation.sentences
	cat data/validation.sentences \
		| ./bin/downsample.py \
		> data/validation.downsampled

# Train and evaluate the classifier on the full sentence file.
train_test:
	./classifier.py data/dataset.sentences

tensorboard:
	tensorboard --logdir ./graph

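The cleaning pipeline above pipes its output through two helper scripts, ./bin/sentence_tokenize.py and ./bin/downsample.py, which are not included in this listing. Below is a minimal sketch of what sentence_tokenize.py might look like, assuming it reads cleaned text on stdin and writes one sentence per line (the Makefile strips the remaining '.', '!' and '?' terminators in a later tr step); this is an illustration, not the repository's actual script.

#!/usr/bin/env python
# Hypothetical stand-in for bin/sentence_tokenize.py (the real script is not shown
# here): read cleaned text from stdin and emit one sentence per line.
import re
import sys

def main():
    text = sys.stdin.read()
    # Keep the terminator attached; the Makefile removes '.!?' afterwards with tr.
    for sentence in re.findall(r"[^.!?]+[.!?]", text):
        sentence = " ".join(sentence.split())  # collapse runs of whitespace
        if sentence:
            sys.stdout.write(sentence + "\n")

if __name__ == "__main__":
    main()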
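Similarly, a minimal sketch of what downsample.py might do, assuming it simply keeps a random fraction of its input lines to shrink the training set; the keep rate below is illustrative and not taken from the repository.

#!/usr/bin/env python
# Hypothetical stand-in for bin/downsample.py (the real script is not shown here):
# keep a random fraction of the input sentences.
import random
import sys

KEEP_FRACTION = 0.1  # illustrative value only, not taken from the repository

def main():
    for line in sys.stdin:
        if random.random() < KEEP_FRACTION:
            sys.stdout.write(line)

if __name__ == "__main__":
    main()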