-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathWeek06.01Ensembles.03StackedEnsemblesInH2O.R
More file actions
78 lines (60 loc) · 2.33 KB
/
Week06.01Ensembles.03StackedEnsemblesInH2O.R
File metadata and controls
78 lines (60 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Start (or connect to) an H2O cluster and import the airlines sample data
# directly from the public H2O test-data bucket.
library(h2o)
h2o.init()
data <- h2o.importFile(
  path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/airlines/allyears2k_headers.zip"
)

# Three-way split: ratios of 0.8 and 0.1 are given explicitly and the third
# frame receives whatever is left. The fixed seed keeps the split repeatable.
parts <- h2o.splitFrame(data, ratios = c(0.8, 0.1), seed = 69)

# Row counts in the trailing comments are the values recorded by the author.
train <- parts[[1]]
nrow(train)  ## 35255

valid <- parts[[2]]
nrow(valid)  ## 4272

test <- parts[[3]]
nrow(test)  ## 4451
# Response column to predict.
y <- "IsArrDelayed"

# Predictors: every column of `data` except the ones listed here.
# NOTE(review): the exclusions look like fields only known after the flight
# (actual delays and times) plus the response itself -- confirm the intent.
x <- setdiff(
  colnames(data),
  c(
    "ArrDelay", "DepDelay",
    "CarrierDelay", "WeatherDelay",
    "NASDelay", "SecurityDelay",
    "LateAircraftDelay",
    "IsDepDelayed", "IsArrDelayed",
    "ActualElapsedTime",  # But CRSElapsedTime is fine
    "ArrTime",            # But CRSArrTime is fine
    "TailNum"
  )
)

# Number of cross-validation folds used by each base model below.
nfolds <- 5

# The models are fitted on train and valid stacked together; `test` stays
# separate for the final comparison.
train2 <- h2o.rbind(train, valid)
# Base learner: binomial GLM, timed with system.time().
# It is cross-validated with nfolds folds, "Modulo" fold assignment, and its
# cross-validation predictions are kept; the other base learners use the same
# three settings so the stacked ensemble can combine them.
system.time({
  m_glm <- h2o.glm(
    x = x,
    y = y,
    training_frame = train2,
    model_id = "glm_def",
    family = "binomial",
    nfolds = nfolds,
    fold_assignment = "Modulo",
    keep_cross_validation_predictions = TRUE
  )
})
# Base learner: GBM with default hyper-parameters, timed with system.time().
# It is cross-validated with nfolds folds, "Modulo" fold assignment, and its
# cross-validation predictions are kept, matching the other base learners.
#
# The model_id must be unique to this model. It was previously "rf_def", the
# same id the random forest uses; H2O stores models by id, so the forest
# trained afterwards would take over that key and the ensemble's base-model
# list would name "rf_def" twice instead of containing a GBM.
system.time({
  m_gbm <- h2o.gbm(
    x = x,
    y = y,
    training_frame = train2,
    model_id = "gbm_def",
    nfolds = nfolds,
    fold_assignment = "Modulo",
    keep_cross_validation_predictions = TRUE
  )
})
# Base learner: random forest with default hyper-parameters, timed with
# system.time(). It is cross-validated with nfolds folds, "Modulo" fold
# assignment, and its cross-validation predictions are kept, matching the
# other base learners.
system.time({
  m_rf <- h2o.randomForest(
    x = x,
    y = y,
    training_frame = train2,
    model_id = "rf_def",
    nfolds = nfolds,
    fold_assignment = "Modulo",
    keep_cross_validation_predictions = TRUE
  )
})
# Gather the ids of the three base models as a list of strings.
model_ids <- lapply(
  list(m_glm, m_gbm, m_rf),
  function(m) m@model_id
)

# Stacked ensemble built on top of those base models, fitted on the same
# frame and columns they were trained with; timed with system.time().
system.time({
  m_SE <- h2o.stackedEnsemble(
    x = x,
    y = y,
    training_frame = train2,
    model_id = "SE_glm_gbm_rf",
    base_models = model_ids
  )
})
# Compare the three base models and the ensemble side by side.
models <- list(m_glm, m_gbm, m_rf, m_SE)

# Log loss: default (training) metrics first, then cross-validation metrics.
sapply(models, h2o.logloss)  ## Oooh!
sapply(models, h2o.logloss, xval = TRUE)  ## Hhhmm...

# AUC: same two views.
sapply(models, h2o.auc)
sapply(models, h2o.auc, xval = TRUE)

# Score every model on the held-out test frame and compare again.
perfs <- lapply(models, h2o.performance, newdata = test)
sapply(perfs, h2o.logloss)  # Aha!
sapply(perfs, h2o.auc)