JF_H20_Random_Forest_3.3.Rmd



```{r}
library(data.table)  
library(h2o)
library(dplyr)

# Start (or attach to) a local H2O instance. Per the h2o documentation,
# nthreads = -1 asks for all available CPUs; the heap is capped at 4 GB.
# localH2O <- h2o.init()
localH2O <- h2o.init(nthreads = -1, max_mem_size = "4g", assertion = FALSE)

# Read the raw tables with data.table::fread, turning character columns into
# factors.
cat("reading the train and test data (with data.table) \n")
train <- fread("../data/train.csv", stringsAsFactors = TRUE)
# str(train)

test <- fread("../data/test.csv", stringsAsFactors = TRUE)
# str(test)

# NOTE(review): this path is relative to the current directory while the two
# reads above go through "../data/" -- confirm which location is intended.
store <- fread("data/store.csv", stringsAsFactors = TRUE)
train <- train[Sales > 0, ]  ## We are not judged on 0 sales records in test set
## See Scripts discussion from 10/8 for more explanation.
```

**Merge Train and Test with Store**

```{r}
# Join the per-store attributes onto every train and test row, keyed on the
# shared "Store" column, and show the resulting structure of each table.
train <- merge(x = train, y = store, by = "Store")
str(train)
test <- merge(x = test, y = store, by = "Store")
str(test)
```

Summarize the Data:

```{r}
# Column-by-column summaries of the merged training and test tables.
cat("train data column names and details\n")
summary(object = train)
cat("test data column names and details\n")
summary(object = test)
```

* More care should be taken to ensure the dates of test can be projected from train
* Decision trees do not project well, so you will want to have some strategy here, if using the dates

```{r}
# Replace the Date column with a proper Date vector, updating each data.table
# by reference.
train[, Date := as.Date(Date)]
test[, Date := as.Date(Date)]
```

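* Converting the `Store` identifier to a factor in both train and test, so that it is treated as a categorical feature rather than a number (date-derived columns such as `month` and `day_of_year` are assumed to be present in the input data already)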

```{r}
# Recode the store identifier as a factor (via numeric) in both tables,
# updating each data.table by reference.
train[, Store := as.factor(as.numeric(Store))]
test[, Store := as.factor(as.numeric(Store))]
```

* Apply a log transformation so the model is less sensitive to very high sales values
* A decent rule of thumb: if the data spans an order of magnitude or more, consider a log transform
```{r}
# Modelling target: log(1 + Sales), added to train by reference.
train[, logSales := log1p(Sales)]
```

## Load data into cluster from R

```{r}
# Upload the prepared training table to the H2O cluster.
trainHex <- as.h2o(train)
```

## Set up variable to use all features other than those specified here

```{r}
# Predictors are every training column except the ones listed here: the
# target in both scales (Sales, logSales) plus identifier, date and other
# columns the author chose to leave out.
excluded_cols <- c("V1", "Id", "Date", "Sales", "logSales",
                   "Customers", "Open", "type", "SH_IMM_MEAS")
train_cols <- colnames(train)
features <- train_cols[!(train_cols %in% excluded_cols)]
```

## Train a random forest with custom `ntrees`, `max_depth` and `nbins_cats` (all other parameters left at their defaults)
```{r}
# Number of trees and maximum tree depth for this run.
trees <- 100
depth <- 40

# Fit an H2O random forest predicting logSales from the selected features.
rfHex <- h2o.randomForest(
  x = features,
  y = "logSales",
  ntrees = trees,
  max_depth = depth,
  nbins_cats = 1115, ## allow it to fit store ID
  training_frame = trainHex
)

# Parameter reference: run ?h2o.randomForest in an interactive session.
# (Kept as a comment so that running or knitting this chunk does not trigger
# a help lookup.)
```

```{r}
# Show the fitted model's summary, then keep its variable-importance table as
# a plain data frame and display it.
summary(rfHex)
varimps <- data.frame(h2o.varimp(rfHex))
varimps
cat("Predicting Sales\n")
```

## Saving our model:
```{r}
# Snapshot log: each pair below binds a descriptive name to whatever model
# `rfHex` currently holds and writes that model under the given directory.
# NOTE(review): presumably each pair was run by hand right after a different
# training run. If the chunk is executed top to bottom, every name refers to
# the same model -- confirm how h2o.saveModel behaves with force = FALSE when
# its target file already exists.
# NOTE(review): the target directories are absolute paths on one user's
# machine; adjust before running elsewhere.
rfHex_30_60 = rfHex
h2o.saveModel(rfHex_30_60, path = '/Users/jfdarre/Documents/NYCDS/Project4/H2O_models', force = FALSE)

rfHex_50_65 = rfHex
h2o.saveModel(rfHex_50_65, path = '/Users/jfdarre/Documents/NYCDS/Project4/H2O_models', force = FALSE)

rfHex_50_55_1 = rfHex
h2o.saveModel(rfHex_50_55_1, path = '/Users/jfdarre/Documents/NYCDS/Project4/H2O_models', force = FALSE)

rfHex_50_55_2 = rfHex
h2o.saveModel(rfHex_50_55_2, path = '/Users/jfdarre/Documents/NYCDS/Project4/H2O_models', force = FALSE)

rfHex_50_55_3 = rfHex
h2o.saveModel(rfHex_50_55_3, path = '/Users/jfdarre/Documents/NYCDS/Project4/H2O_models', force = FALSE)

# The remaining snapshots go to a separate "H2O_models_v2" directory.
rfHex_30_50_3 = rfHex
h2o.saveModel(rfHex_30_50_3, path = '/Users/jfdarre/Documents/NYCDS/Project4/H2O_models_v2', force = FALSE)

rfHex_50_60_v2_data = rfHex
h2o.saveModel(rfHex_50_60_v2_data, path = '/Users/jfdarre/Documents/NYCDS/Project4/H2O_models_v2', force = FALSE)

```

## Error metric used for the training score (per-row squared relative error):
```{r}
#' Squared relative error of each prediction.
#'
#' Despite its name, this helper does not aggregate: it returns one value per
#' element, `((prediction - target) / target)^2`. Callers take the mean and
#' the square root themselves.
#'
#' @param predictions Numeric vector of predicted values.
#' @param targets Numeric vector of observed values; a zero target yields a
#'   non-finite result because of the division.
#' @return Numeric vector of squared relative errors.
rmse <- function(predictions, targets) {
  relative_error <- (predictions - targets) / targets
  relative_error^2
}
```

## Running training errors
```{r}
# Score the training frame in H2O, pull the predictions into R and undo the
# log1p transform so they are on the original Sales scale.
train_pred <- as.data.frame(h2o.predict(rfHex, trainHex))
train_pred <- expm1(train_pred[, 1])

# Keep the prediction and its squared relative error next to each row.
train$pred <- train_pred
train$rmse <- rmse(train_pred, train$Sales)

# Subset of rows whose month is 8 or 9.
train2 <- filter(train, month %in% c(8, 9))

# Root of the mean squared relative error, overall and for the subset; the
# surrounding parentheses print each value as it is assigned.
(total_rmse <- sqrt(sum(train$rmse) / nrow(train)))
(partial_rmse <- sqrt(sum(train2$rmse) / nrow(train2)))

# Side-by-side distribution summaries: predictions vs. actual sales, for all
# rows and for the month subset, with a rounded standard deviation column.
sumup <- as.data.frame(rbind(
  summary(train_pred),
  summary(train$Sales),
  summary(train2$pred),
  summary(train2$Sales)
))
sumup$sd <- c(
  round(sd(train_pred)),
  round(sd(train$Sales)),
  round(sd(train2$pred)),
  round(sd(train2$Sales))
)
sumup
```

## Load test data into cluster from R
```{r}
# Upload the prepared test table to the H2O cluster.
testHex <- as.h2o(test)
```

## Get predictions out; predicts in H2O, as.data.frame gets them into R
```{r}
# Score the test frame inside H2O, then copy the result into an R data frame.
predictions <- h2o.predict(rfHex, testHex) %>%
  as.data.frame()
```

## Return the predictions to the original scale of the Sales data
```{r}
# The model predicts log1p(Sales); expm1 maps the first prediction column back
# to the original Sales scale.
pred <- expm1(predictions[, 1])

# Pair each prediction with the Id of the test row it was scored from and
# write the submission file.
submission <- data.frame(Id = test$Id, Sales = pred)
cat("saving the submission file\n")
write.csv(submission, "./H2O_submits/h2o_50_60_v2_data.csv", row.names = FALSE)
```


## Doing predictions one at a time and updating the test table with sales:
```{r}
# Day-by-day forecasting over day_of_year 213..260. Each day is scored on its
# own; its predicted sales then become the Sales1 column of the following day,
# and the previous day's Sales1..Sales20 shift into Sales2..Sales21.
#
# Iteration i scores the rows prepared for day i - 1 and then prepares day i.
# NOTE(review): the column shift copies values positionally, so it assumes
# every day holds the same rows in the same order -- confirm.
loop_days <- 214:261
daily_pred <- vector("list", length(loop_days))  # one numeric vector per day

temp <- filter(test, day_of_year == 213)
test2 <- temp  # accumulates the scored rows, in prediction order
for (i in loop_days) {
  temp_pred <- as.data.frame(h2o.predict(rfHex, as.h2o(temp)))
  daily_pred[[i - 213]] <- temp_pred[, 1]

  # The final iteration only scores day 260; there is no next day to prepare.
  if (i == 261) {
    break
  }

  temp <- filter(test, day_of_year == i)
  temptest <- filter(test2, day_of_year == i - 1)
  # Guard the positional copy below against days of different sizes.
  stopifnot(nrow(temp) == nrow(temptest), nrow(temp) == nrow(temp_pred))

  # Store the prediction as a plain numeric vector on the Sales scale
  # (previously a one-column data frame was assigned here).
  temp$Sales1 <- expm1(temp_pred[, 1])
  for (j in 2:21) {
    temp[[paste0("Sales", j)]] <- temptest[[paste0("Sales", j - 1)]]
  }
  test2 <- rbind(test2, temp)
}

# Flatten the per-day predictions and return them to the Sales scale.
new_pred <- expm1(unlist(daily_pred))

# Predictions are ordered day by day, exactly like the rows of test2, so the
# Ids must come from test2; `test` keeps a different row order.
new_submission <- data.frame(Id = test2$Id, Sales = new_pred)
cat("saving the submission file\n")
write.csv(new_submission, "./H2O_submits/h2o_30_50_v4.csv", row.names = FALSE)
```


## Scratch code to test things
```{r}
# Ad-hoc inspection of the two prediction runs; nothing below feeds the
# submission files.

# Spot-check a slice of rows: first-day Sales1 values vs. both prediction sets.
aaa <- filter(test, day_of_year == 213)
aaa$Sales1[846:856]
round(pred[846:856])
round(new_pred[846:856])
unique(test$day_of_year)

# Sizes and types of the two prediction objects.
length(pred)
nrow(new_pred)

class(pred)
class(new_pred)

# Sales-related columns before and after the day-by-day update.
select(test, contains("Sales"))
select(test2, contains("Sales"))

# Missing-value counts in the leading Sales columns.
sum(is.na(test$Sales1))
sum(is.na(test2$Sales1))
sum(is.na(test2$Sales2))

# For store 10, plot the training sales followed by the Sales1 values of test2.
unique(train$Open)
aa <- filter(train, Store == 10)$Sales
bb <- filter(test2, Store == 10)$Sales1
cc <- c(aa, bb)
plot(cc)
```