-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathxgb-try_v3.R
141 lines (115 loc) · 4.52 KB
/
xgb-try_v3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Based on Ben Hamner script from Springleaf
# https://www.kaggle.com/benhamner/springleaf-marketing-response/random-forest-example
library(readr)
library(xgboost)
#my favorite seed^^
set.seed(8)
cat("reading the train and test data\n")
train <- read_csv("data/train_states_R.csv")
test <- read_csv("data/test_states_R.csv")
store <- read_csv("data/store.csv")
# removing the date column (since elements are extracted) and also StateHoliday which has a lot of NAs (may add it back in later)
train <- merge(train,store)
test <- merge(test,store)
# There are some NAs in the integer columns so conversion to zero
train[is.na(train)] <- 0
test[is.na(test)] <- 0
cat("train data column names and details\n")
names(train)
str(train)
summary(train)
cat("test data column names and details\n")
names(test)
str(test)
summary(test)
# looking at only stores that were open in the train set
# may change this later
train <- train[ which(train$Open=='1'),]
train <- train[ which(train$Sales!='0'),]
# seperating out the elements of the date column for the train set
# train$month <- as.integer(format(train$Date, "%m"))
# train$year <- as.integer(format(train$Date, "%y"))
# train$day <- as.integer(format(train$Date, "%d"))
# removing the date column (since elements are extracted) and also StateHoliday which has a lot of NAs (may add it back in later)
train <- train[,-c(2,4,9)]
# seperating out the elements of the date column for the test set
# test$month <- as.integer(format(test$Date, "%m"))
# test$year <- as.integer(format(test$Date, "%y"))
# test$day <- as.integer(format(test$Date, "%d"))
# removing the date column (since elements are extracted) and also StateHoliday which has a lot of NAs (may add it back in later)
test <- test[,-c(2,5,8)]
feature.names <- names(train)[c(1,2,6:11,17:28)]
cat("Feature Names\n")
feature.names
cat("assuming text variables are categorical & replacing them with numeric ids\n")
for (f in feature.names) {
if (class(train[[f]])=="character") {
levels <- unique(c(train[[f]], test[[f]]))
train[[f]] <- as.integer(factor(train[[f]], levels=levels))
test[[f]] <- as.integer(factor(test[[f]], levels=levels))
}
}
cat("train data column names after slight feature engineering\n")
names(train)
cat("test data column names after slight feature engineering\n")
names(test)
tra<-train[,feature.names]
RMPSE<- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
elab<-exp(as.numeric(labels))-1
epreds<-exp(as.numeric(preds))-1
err <- sqrt(mean((epreds/elab-1)^2))
return(list(metric = "RMPSE", value = err))
}
nrow(train)
dim(tra)
dim(train)
cat("start model training\n")
myRseed=8
set.seed(myRseed)
h<-sample(nrow(train),10000)
dval<-xgb.DMatrix(data=data.matrix(tra[h,]),label=log(train$Sales+1)[h])
dtrain<-xgb.DMatrix(data=data.matrix(tra[-h,]),label=log(train$Sales+1)[-h])
watchlist<-list(val=dval,train=dtrain)
param <- list( objective = "reg:linear",
booster = "gbtree",
eta = 0.5,
max_depth = 10,
subsample = 0.9,
colsample_bytree = 0.7
#num_parallel_tree = 2
# alpha = 0.0001,
# lambda = 1
)
clf <- xgb.train( params = param,
data = dtrain,
nrounds = 60,
verbose = 0,
early.stop.round = 100,
watchlist = watchlist,
maximize = FALSE,
nthread=16,
seed=myRseed,
feval=RMPSE
)
#clf.bestInd
pred1 <- exp(predict(clf, data.matrix(test[,feature.names]))) -1
clf <- xgb.train( params = param,
data = dtrain,
nrounds = 60,
verbose = 0,
early.stop.round = 100,
watchlist = watchlist,
maximize = FALSE,
nthread=64,
seed=myRseed+2,
feval=RMPSE
)
pred2 <- exp(predict(clf, data.matrix(test[,feature.names]))) -1
submission <- data.frame(Id=test$Id, Sales=pred1)
submission2 <- data.frame(Id=test$Id, Sales=pred2)
submission3 <- data.frame(Id=test$Id, Sales=(pred1+pred2)/2)
cat("saving the submission file\n")
write_csv(submission, "xgb_DMC_5.csv")
write_csv(submission2, "xgb2_DMC_6.csv")
write_csv(submission3, "xgb2_DMC_7.csv")