-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathxgboost_.py
36 lines (30 loc) · 1.34 KB
/
xgboost_.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
def evaluate(X, y, classifer):
    """Print and return the accuracy of a fitted classifier on ``(X, y)``.

    Args:
        X: Feature matrix handed to ``classifer.predict``.
        y: True labels the predictions are compared against.
        classifer: Fitted model exposing ``predict(X)``. The misspelled
            parameter name is kept so keyword callers keep working.

    Returns:
        The value of ``metrics.accuracy_score(y, pred)``. The score is
        returned in addition to being printed so callers can log or
        compare it instead of parsing stdout.
    """
    pred = classifer.predict(X)
    accuracy = metrics.accuracy_score(y, pred)
    print("Accuracy: ", accuracy)
    return accuracy
def train_xgboost(X, y):
    """Fit an ``XGBClassifier`` built with default arguments on ``(X, y)``.

    Args:
        X: Training feature matrix passed to ``fit``.
        y: Training labels passed to ``fit``.

    Returns:
        The fitted ``XGBClassifier`` instance.
    """
    model = XGBClassifier()
    model.fit(X, y)
    return model
def split_validation_set(X, y, valid_size):
    """Split ``X`` and ``y`` into training and validation parts.

    ``valid_size`` is forwarded to ``train_test_split`` as ``test_size`` and
    ``random_state`` is pinned to 1.

    Returns:
        The tuple ``(X_train, X_valid, y_train, y_valid)``.
    """
    train_X, valid_X, train_y, valid_y = train_test_split(
        X,
        y,
        test_size=valid_size,
        random_state=1,
    )
    return train_X, valid_X, train_y, valid_y
def tfidf_ngrams(X, X_valid, n):
    """Turn two text collections into word-level TF-IDF n-gram matrices.

    The vectorizer is fitted on ``X`` only; ``X_valid`` is transformed with
    the vocabulary and weights learned from ``X``.

    Args:
        X: Training documents used to fit and transform.
        X_valid: Validation documents, transformed only.
        n: Upper bound of the n-gram range; the lower bound is 1.

    Returns:
        A pair ``(train_matrix, valid_matrix)`` produced with ``float32``
        as the requested dtype.
    """
    vectorizer = TfidfVectorizer(
        lowercase=True,
        analyzer='word',
        ngram_range=(1, n),
        dtype=np.float32,
    )
    train_matrix = vectorizer.fit_transform(X)
    valid_matrix = vectorizer.transform(X_valid)
    return train_matrix, valid_matrix
def run_xgboost(X, y, valid_size=0.3, n_gram=2):
    """Run the full pipeline: split, vectorize, train, and report accuracy.

    Args:
        X: Input documents passed to ``split_validation_set``.
        y: Labels for ``X``.
        valid_size: Forwarded to ``split_validation_set``.
        n_gram: Forwarded to ``tfidf_ngrams`` as the n-gram upper bound.

    Accuracy is printed for the training split first, then for the
    validation split. Nothing is returned.
    """
    print("Run Xgboost model...")
    train_docs, valid_docs, train_labels, valid_labels = split_validation_set(
        X, y, valid_size
    )
    train_features, valid_features = tfidf_ngrams(train_docs, valid_docs, n_gram)
    model = train_xgboost(train_features, train_labels)

    # Report on the training split first, then the validation split.
    reports = (
        ("Evaluate on the training set", train_features, train_labels),
        ("Evaluate on the valid set", valid_features, valid_labels),
    )
    for banner, features, labels in reports:
        print(banner)
        evaluate(features, labels, model)