This repository has been archived by the owner on Aug 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaive_bayes_classifier.py
92 lines (66 loc) · 2.69 KB
/
naive_bayes_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import numpy as np
import tensorflow as tf
import joblib
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
import skopt
from skopt.space import Categorical, Real
from preprocessing import ds_to_ndarray, load_env_vars, load_vec_ds, normalize_ds
def train_nb_classifier(vec_ds: tf.data.Dataset, **params):
    """Cross-validate a multinomial naive Bayes pipeline and return it.

    The dataset is converted to arrays with ``ds_to_ndarray`` and split with
    ``StratifiedKFold``. For every fold the pipeline is refit on the training
    indices and scored (precision, recall, F1) on the held-out indices. The
    per-fold scores and their averages are printed to stdout.

    Args:
        vec_ds: Dataset handed to ``ds_to_ndarray``, which is expected to
            return a feature array and a label array that both support
            indexing with the index arrays produced by ``StratifiedKFold``.
        **params: Settings; ``params['NUM_FOLDS']`` is used as the number of
            cross-validation folds.

    Returns:
        The pipeline as left by the last ``fit`` call, i.e. fitted on the
        training part of the final fold only (not on the whole dataset).

    Raises:
        KeyError: If ``NUM_FOLDS`` is missing from ``params``.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.metrics`` has been loaded.
    from sklearn import metrics

    x, y = ds_to_ndarray(vec_ds)

    clf = make_pipeline(
        # Alternative alpha values kept from earlier runs:
        # MultinomialNB(alpha=0.17601196310151052)
        # MultinomialNB(alpha=0.9314472681366592)
        MultinomialNB(alpha=0.34518859009142816)
    )

    skf = StratifiedKFold(n_splits=params['NUM_FOLDS'])
    precision, recall, f1 = [], [], []

    for i_train, i_test in skf.split(x, y):
        print(' - Training naive Bayes classifier on training fold')
        # NOTE(review): kept from the original; it is unclear whether the
        # threading backend has any effect on a naive Bayes fit - confirm.
        with joblib.parallel_backend('threading', n_jobs=-1):
            clf.fit(x[i_train], y[i_train])

        print(' - Classifying test fold')
        y_true = y[i_test]
        pred = clf.predict(x[i_test])

        precision.append(metrics.precision_score(y_true, pred))
        recall.append(metrics.recall_score(y_true, pred))
        f1.append(metrics.f1_score(y_true, pred))

        # Sum of true labels vs. sum of predicted labels for this fold
        # (presumably positive counts, assuming 0/1 labels - TODO confirm).
        print(sum(y_true), sum(pred))
        print('')

    print(f'{precision} \nAverage precision: {sum(precision) / len(precision)} \n')
    print(f'{recall} \nAverage recall: {sum(recall) / len(recall)} \n')
    print(f'{f1} \nAverage F1 score: {sum(f1) / len(f1)}')

    return clf
def optimize_hyperparameters(vec_ds: tf.data.Dataset, **params):
    """Search naive Bayes variants and smoothing values with BayesSearchCV.

    A one-step pipeline named ``'model'`` is tuned over two dimensions: which
    estimator fills the step (``MultinomialNB`` or ``ComplementNB``) and that
    estimator's ``alpha``. The search runs 500 iterations, scores with
    ``'f1'`` and cross-validates with ``params['NUM_FOLDS']`` folds. The best
    score and best parameters are printed.

    Args:
        vec_ds: Dataset handed to ``ds_to_ndarray`` to obtain the feature and
            label arrays the search is fitted on.
        **params: Settings; ``params['NUM_FOLDS']`` is passed as ``cv``.

    Returns:
        The fitted ``skopt.BayesSearchCV`` object.
    """
    search_spaces = {
        'model': Categorical([MultinomialNB(), ComplementNB()]),
        # NOTE(review): the lower bound is exactly 0; confirm that alpha == 0
        # is acceptable for the installed scikit-learn version.
        'model__alpha': Real(0, 1, prior='uniform'),
    }

    # NOTE(review): GaussianNB looks like a placeholder for the 'model' step,
    # since the search space above supplies the estimator - confirm.
    pipeline = Pipeline([
        ('model', sklearn.naive_bayes.GaussianNB()),
    ])

    searcher = skopt.BayesSearchCV(
        estimator=pipeline,
        search_spaces=search_spaces,
        n_iter=500,
        scoring='f1',
        n_jobs=6,
        cv=params['NUM_FOLDS'],
        verbose=3,
    )

    features, labels = ds_to_ndarray(vec_ds)
    searcher.fit(features, labels)

    print("val. score: %s" % searcher.best_score_)
    print("best params: %s" % str(searcher.best_params_))

    return searcher
if __name__ == '__main__':
    # Script entry point: load settings, load the vectorized datasets and
    # run the naive Bayes training on the first one.
    settings, params = load_env_vars()

    data_root = settings['BASE_DIR'] / settings['DATA_DIR']
    ds_list = load_vec_ds(
        data_root,
        is_xlsx=False,
        prefix='sk_',
        ds_types=('train', 'test'),
        **params,
    )
    # First loaded dataset (presumably 'train', given the ds_types order -
    # TODO confirm against load_vec_ds).
    train_ds = ds_list[0]

    # NOTE(review): the normalized dataset is computed but never used below;
    # training runs on the un-normalized dataset. Confirm this is intended.
    norm_vec_ds = normalize_ds(train_ds)

    # NOTE(review): the message says "Optimizing" although the training path
    # is the one that is active.
    print('Optimizing naive Bayes classifier')
    clf = train_nb_classifier(train_ds, **params)
    # Alternative entry point, currently disabled:
    # optimize_hyperparameters(train_ds, **params)