-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathWeek02.02ImportingGeneratingAndOverfitting.06LetsOverfitaGBM.R
More file actions
82 lines (64 loc) · 1.97 KB
/
Week02.02ImportingGeneratingAndOverfitting.06LetsOverfitaGBM.R
File metadata and controls
82 lines (64 loc) · 1.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
## Load the H2O R client package; every h2o.* call below comes from it.
library(h2o)
## Initialise the H2O session with default settings.
## NOTE(review): per the H2O docs this connects to a running local H2O
## instance, or starts one if none is found - confirm for your setup.
h2o.init()
## ---- Synthetic "people" data set -------------------------------------
## Builds the data frame `d` (N rows) one column at a time. The seed is
## fixed so that every run draws the same random numbers.
set.seed(123)
N <- 1000

## Blood type, first attempt: cycle through the four types by id, so all
## four are equally common.
bloodTypes <- c("A", "O", "AB", "B")
d <- data.frame(id = seq_len(N))
d$bloodType <- bloodTypes[(d$id %% length(bloodTypes)) + 1]  # +1: R indexes from 1
head(d)

## Blood type, second attempt: repeating entries in the lookup vector
## makes A and O three times as common as AB and B. Stored as a factor.
bloodTypes <- c("A", "A", "A", "O", "O", "O", "AB", "B")
d$bloodType <- factor(bloodTypes[(d$id %% length(bloodTypes)) + 1])

## Age: uniform between 18 and 65.
d$age <- runif(N, min = 18, max = 65)

## Healthy-eating score: rounded normal draw (mean 5, sd 2, so about 68%
## land on 3..7), clamped to the 0..9 scale.
v <- round(rnorm(N, mean = 5, sd = 2))
v <- pmin(pmax(v, 0), 9)
table(v)
d$healthyEating <- v

## Active-lifestyle score: same recipe, plus one point for the under-30s
## ("the kids are more active (?)"), then clamped to 0..9.
v <- round(rnorm(N, mean = 5, sd = 2)) + ifelse(d$age < 30, 1, 0)
v <- pmin(pmax(v, 0), 9)
table(v)
d$activeLifestyle <- v
d

## Income: a base salary that grows with the square of age; for ages
## 18..65 the base is bounded by $22,916 and $58,025.
v <- 20000 + (d$age * 3)^2
range(v)
## +$500 per healthy-eating point, -$300 per active-lifestyle point, plus
## uniform noise of up to $5000 (summed left to right).
v <- v + d$healthyEating * 500 - d$activeLifestyle * 300 + runif(N, 0, 5000)
d$income <- round(v, -2)  # round to the nearest $100
## Upload the R data frame to H2O, stored under the key "people".
as.h2o(x = d, destination_frame = "people")

## The next videos pick the data up again by that key, like this:
people <- h2o.getFrame(id = "people")
summary(people)
## BUT!! Don't shut down your client, or H2O will shut down too,
## and your data is lost.

## Split into train / valid / test. Two ratios are given for three
## destination frames; the third frame receives the remaining rows.
parts <- h2o.splitFrame(
  data = people,                      # nrow(people) == 1000
  ratios = c(0.8, 0.1),               # nominally 800 / 100 / 100 rows
  destination_frames = c("people_train", "people_valid", "people_test"),
  seed = 123
)

## Fetch each part by its key. The row counts noted by the original
## author show the split is only approximate.
train <- h2o.getFrame(id = "people_train")  # 788 rows
valid <- h2o.getFrame(id = "people_valid")  # 118 rows
test <- h2o.getFrame(id = "people_test")    # 94 rows
## Response is income; predictors are every other column except the
## row id.
y <- "income"
x <- setdiff(names(train), c("id", y))

## Model 1: GBM with no tuning parameters set, scored against the
## validation frame during training.
m1 <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train,
  validation_frame = valid,
  model_id = "default_r"
)
## Compare metrics on the training, validation and held-out test data.
h2o.performance(model = m1, train = TRUE)
h2o.performance(model = m1, valid = TRUE)
h2o.performance(model = m1, newdata = test)

## Model 2: same data, but with ntrees = 1000 and max_depth = 10 - the
## model this script sets out to overfit (hence its id).
m2 <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train,
  validation_frame = valid,
  model_id = "overfit_r",
  ntrees = 1000,
  max_depth = 10
)
## Same three reports for the second model.
h2o.performance(model = m2, train = TRUE)
h2o.performance(model = m2, valid = TRUE)
h2o.performance(model = m2, newdata = test)