forked from fslaborg/FSharp.Stats
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCrossValidation.fsx
More file actions
432 lines (342 loc) · 15.2 KB
/
CrossValidation.fsx
File metadata and controls
432 lines (342 loc) · 15.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
(**
---
title: Cross validation
index: 10
category: Documentation
categoryindex: 0
---
*)
(*** hide ***)
(*** condition: prepare ***)
#r "nuget: FSharpAux.Core, 2.0.0"
#r "nuget: FSharpAux, 2.0.0"
#r "nuget: FSharpAux.IO, 2.0.0"
#r "nuget: OptimizedPriorityQueue, 5.1.0"
#r "nuget: FsMath, 0.0.2"
#I "../src/FSharp.Stats/bin/Release/.net8.0/"
#r "FSharp.Stats.dll"
#r "nuget: Plotly.NET, 4.0.0"
open FsMath
Plotly.NET.Defaults.DefaultDisplayOptions <-
Plotly.NET.DisplayOptions.init (PlotlyJSReference = Plotly.NET.PlotlyJSReference.NoReference)
(*** condition: ipynb ***)
#if IPYNB
#r "nuget: Plotly.NET, 4.0.0"
#r "nuget: Plotly.NET.Interactive, 4.0.0"
#r "nuget: FSharp.Stats"
#endif // IPYNB
(**
# Cross validation
[](https://mybinder.org/v2/gh/fslaborg/FSharp.Stats/gh-pages?urlpath=/tree/home/jovyan/CrossValidation.ipynb)
[]({{root}}{{fsdocs-source-basename}}.ipynb)
_Summary:_ this tutorial demonstrates how to perform several types of cross validation with FSharp.Stats.
## Leave-one-out cross validation (LOOCV)
When fitting a data set it often comes down to the selection of the optimal fitting parameter(s).
A method to determine these is given by the leave-one-out cross validation (LOOCV). Thereby, the data set is fitted with a
given parameter range (smoothing strength, polynomial order etc.) in order to select the best.
### Procedure
In each iteration, one data point is excluded from the fitting procedure. The coefficients are determined
based on the remaining (n-1) data points. The difference of the excluded point with its corresponding fitted point is measured.
In a two-dimensional problem it is the y-intercept of f(xi) and the y_orig at xi.
After every data point was excluded once, the average (squared) distance is calculated and assigned to the corresponding fitting parameter (polynomial order or smoothing strength).
The parameter of the model that shows the minimal average error is the best under the given assumptions. It shows the best compromise between over- and underfitting respectively.
### Polynomial loocv
let's first create some polynomial fits to cross validate:
*)
open Plotly.NET
open FSharp.Stats
open FSharp.Stats.Fitting
open LinearRegression.OLS
let xV = vector [1. .. 10.]
let yV = vector [1.;20.;51.;40.;37.;6.;-10.;-5.;0.;10.]
// the fitting function fits a polynomial of order 'order' to the training data set (xTrain and yTrain) and applies it to xTest
let getFitFuncPolynomial (xTrain:Matrix<float>) yTrain (xTest:Vector<float>) order =
let xDat = xTrain.Data // |> Matrix.toVector
let coeffs = Polynomial.fit order xDat yTrain
let predictFunction = Polynomial.predict coeffs (xTest.[0])
predictFunction
open Plotly.NET
let rawchart() =
Chart.Point (xV,yV)
|> Chart.withTraceInfo "raw data"
let chartOrderOpt =
[1 .. 2 .. 10]
|> List.map (fun order ->
let coeffs = Polynomial.fit order xV yV
let predictFunction = Polynomial.predict coeffs
[1. .. 0.2 .. 10.]
|> List.map (fun x -> x,predictFunction x)
|> Chart.Line
|> Chart.withTraceInfo (sprintf "order=%i" order)
)
|> fun x -> Chart.combine (rawchart()::x)
|> Chart.withTitle "polynomial fits"
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withXAxisStyle "x"
|> Chart.withYAxisStyle "y"
(*** condition: ipynb ***)
#if IPYNB
chartOrderOpt
#endif // IPYNB
(***hide***)
chartOrderOpt |> GenericChart.toChartHTML
(***include-it-raw***)
(**
And then crossvalidate across the polynomial orders:
*)
// the error is calculated as the squared difference of fitted and original y value
let error (f1:float) f2 = pown (f1 - f2) 2
/// Leave-one-out cross validation. Returns the mean squared error of each leave-out at the
/// specific polynomial order. Minimize for model selection.
let loocvPolynomial (xData:Vector<float>) (yData:Vector<float>) order =
let xDataMat = Matrix.ofCols [|xData|]
let getFitFuncPol xTrain yTrain (xTest:Vector<float>) =
getFitFuncPolynomial xTrain yTrain xTest order
let meanSquaredError = CrossValidation.loocv xDataMat yData getFitFuncPol error
meanSquaredError
// polynomial orders that should be checked
let ordersToCheck = [|1 .. 10|]
let errorPol =
ordersToCheck
|> Array.map (fun order ->
let error = loocvPolynomial xV yV order
order,error)
let chartPol =
errorPol
|> Chart.Line
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withXAxisStyle "polynomial order"
|> Chart.withYAxisStyle "mean error"
|> Chart.withTitle "leave one out cross validation (polynomial)"
let result = sprintf "The minimal error is obtained by order=%i" (errorPol |> Seq.minBy snd |> fst)
(*** condition: ipynb ***)
#if IPYNB
chartPol
#endif // IPYNB
(***hide***)
chartPol |> GenericChart.toChartHTML
(***include-it-raw***)
(*** include-value:result ***)
(**
### Smoothing spline loocv
A smoothing spline is a non-parametric fitting procedure, fitting cubic polynomials in each interval given by the basis points.
let's first create some smoothing splines to cross validate:
*)
// the fitting function fits a smoothing spline with smoothing factor lambda to the training data set (xTrain and yTrain) and applies it to xTest
let getFitFuncSpline (xDat:Matrix<float>) (yDat:Vector<float>) (xDatTrain: Vector<float>) lambda =
let xDatVec = xDat.Data // |> Matrix.toVector
let zippedData = Seq.zip xDatVec yDat |> Array.ofSeq
let xValTest = xDatTrain.[0]
Spline.smoothingSpline zippedData (xDat.Data) lambda xValTest
/// in loocv the border points are chosen so that the support range of the training data set does not cover the test point.
/// if splines are used, that are not defined outside the border points use the following:
//let xDatSupport = Intervals.create (xDatVec |> Seq.min) (xDatVec |> Seq.max)
//if Intervals.liesInInterval xValTest xDatSupport then
//Spline.smoothingSpline zippedData (xDat |> Array.ofSeq) lambda xValTest
//else nan
let chartSpline =
[0.0002;0.002;0.0216;0.2;2.;20.]
|> List.map (fun lambda ->
let fit = Spline.smoothingSpline (Seq.zip xV yV |> Array.ofSeq) (Array.ofSeq xV) lambda
[1. .. 0.2 .. 10.]
|> List.map (fun x -> x,fit x)
|> Chart.Line
|> Chart.withTraceInfo (sprintf "l=%.4f" lambda)
)
|> fun x ->
Chart.combine (rawchart()::x)
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withXAxisStyle "x"
|> Chart.withYAxisStyle "y"
|> Chart.withTitle "smoothing splines"
(*** condition: ipynb ***)
#if IPYNB
chartSpline
#endif // IPYNB
(***hide***)
chartSpline |> GenericChart.toChartHTML
(***include-it-raw***)
(**
And then crossvalidate across different lambda values:
*)
// the error is calculated as the squared difference of fitted and original y value
let errorSpl (f1:float) f2 =
// if xValue is outside of support area of the fitted model (some smoothing spline algorithms), the error should report 0.
//if nan.Equals f1 then 0.
//else pown (f1 - f2) 2
pown (f1 - f2) 2
/// Leave-one-out cross validation. Returns the mean squared error of each leave-out at the
/// specific regularization parameter (lambda). Minimize the (MSE) for model selection.
let loocvSmoothingSpline (xData:Vector<float>) (yData:Vector<float>) lambda =
let xDataMat = Matrix.ofCols [|xData|]
let getFitFuncSpl xDat yDat (xDatTrain: Vector<float>) =
getFitFuncSpline xDat yDat xDatTrain lambda
CrossValidation.loocv xDataMat yData getFitFuncSpl errorSpl
// smoothing parameter = lambda = regularization parameter
let lambdasToCheck = [|1. .. 15.|] |> Array.map (fun i -> 0.0001 * i**3.)
let errorSpline =
lambdasToCheck
|> Array.map (fun lambda ->
//basisPoints define, where the knots of the spline are located
let error = loocvSmoothingSpline xV yV lambda
lambda,error)
let chartSplineError =
errorSpline
|> Chart.Line
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withXAxisStyle "lambda"
|> Chart.withYAxisStyle "mean error"
|> Chart.withTitle "leave one out cross validation (smoothing spline)"
let resultSpline = sprintf "The minimal error is obtained by lambda=%f" (errorSpline |> Seq.minBy snd |> fst)
(*** condition: ipynb ***)
#if IPYNB
chartSplineError
#endif // IPYNB
(***hide***)
chartSplineError |> GenericChart.toChartHTML
(***include-it-raw***)
(*** include-value:resultSpline ***)
(**
## k fold cross validation
The k fold cross validation (kfcv) is a generalized form of the loocv. Rather than excluding every data point separately, kfcv
allows the exclusion of data chunks with a defined fraction of the data points. When using k=10, the data is split up into 10 chunks of sub data sets each
containing 10% of the data set.
In each loop one chunk is excluded (test data), while the other 9 chunks serve as training data. After 10 (k) loops every single point was evaluated as test data set once
and k-1 times as training data. The selection of the subset chunks is random and can be repeated in several iterations.
The output contains the average error together with the standardDeviation computed by the given function.
*)
//repeated k fold cross validation for polynomials
let repeatedKFoldPolynomial k (xData: Vector<float>) (yData: Vector<float>) order =
let xDataMat = [|xData|] |> Matrix.ofCols
let getFitFuncPol xTrain yTrain (xTest:Vector<float>) =
getFitFuncPolynomial xTrain yTrain xTest order
CrossValidation.repeatedKFold k 10 xDataMat yData getFitFuncPol error Seq.stDev
//creates an output for 10 iterations where defined 20 % of the data set are taken as testing data set
let kfPolynomial order = repeatedKFoldPolynomial 5 xV yV order
//repeated k fold cross validation for smoothing splines
let repeatedKFoldSpline k (xData: Vector<float>) (yData: Vector<float>) lambda =
let xDataMat = [|xData|] |> Matrix.ofCols
let getFitFuncSpl xDat yDat (xDatTrain: Vector<float>) =
getFitFuncSpline xDat yDat xDatTrain lambda
CrossValidation.repeatedKFold k 10 xDataMat yData getFitFuncSpl errorSpl Seq.stDev
//creates an output for 10 iterations where defined 20 % of the data set are taken as testing data set
let kfSpline lambda = repeatedKFoldSpline 5 xV yV lambda
(**
The given data set is small and therefore the mean errors show a high variability
*)
let kfp =
let errorSplinekf =
ordersToCheck
|> Array.map (fun order ->
//basisPoints define, where the knots of the spline are located
let error = kfPolynomial order
(order,error.Error),error.ErrorStDev)
|> Array.unzip
fst errorSplinekf
|> Chart.Line
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withXAxisStyle "order"
|> Chart.withYAxisStyle "mean error"
|> Chart.withYAxis(LayoutObjects.LinearAxis.init(AxisType=StyleParam.AxisType.Log))
|> Chart.withYErrorStyle (Array= snd errorSplinekf)
|> Chart.withTitle "kfoldPolynomial error"
(*** condition: ipynb ***)
#if IPYNB
kfp
#endif // IPYNB
(***hide***)
kfp |> GenericChart.toChartHTML
(***include-it-raw***)
let kfs =
let errorSplinekf =
lambdasToCheck
|> Array.map (fun lambda ->
//basisPoints define, where the knots of the spline are located
let error = kfSpline lambda
(lambda,error.Error),error.ErrorStDev)
|> Array.unzip
fst errorSplinekf
|> Chart.Line
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withXAxisStyle "lambda"
|> Chart.withYAxisStyle "mean error"
|> Chart.withYErrorStyle (Array= snd errorSplinekf)
|> Chart.withTitle "kfoldSpline error"
(*** condition: ipynb ***)
#if IPYNB
kfs
#endif // IPYNB
(***hide***)
kfs |> GenericChart.toChartHTML
(***include-it-raw***)
(**
## Shuffle and split cross validation
The shuffle and split cross validation (sap) is a modified kfcv version. As in kfcv, sap
allows the exclusion of data chunks with a defined fraction of the data points. When using p=0.3, 30% of the data are taken as testing data set
while 70% serve as training data set. In sap by default only one testing set is evaluated (unlike to kfcv
where every data point is once part of a training data set and k-1 times part of testing data set).
Sap can be performed multiple times. Each time the training data fraction is taken randomly from the original data set. Unlike in kfcv overlaps may occur.
The output contains the average error together with the standardDeviation computed by the given function.
*)
let shuffleAndSplitPolynomial p iterations (xData: Vector<float>) (yData: Vector<float>) order =
let xDataMat = [|xData|] |> Matrix.ofCols
let getFitFuncPol xTrain yTrain (xTest:Vector<float>) =
getFitFuncPolynomial xTrain yTrain xTest order
CrossValidation.shuffleAndSplit p iterations xDataMat yData getFitFuncPol error Seq.stDev
//creates an output for 5 iterations where random 20 % of the data set are taken as testing data set
let sasPolynomial order = shuffleAndSplitPolynomial 0.2 5 xV yV order
let shuffleAndSplitSpline p iterations (xData: Vector<float>) (yData: Vector<float>) lambda =
let xDataMat = [|xData|] |> Matrix.ofCols
let getFitFuncSpl xDat yDat (xDatTrain: Vector<float>) =
getFitFuncSpline xDat yDat xDatTrain lambda
CrossValidation.shuffleAndSplit p iterations xDataMat yData getFitFuncSpl errorSpl Seq.stDev
//creates an output for 5 iterations where random 20 % of the data set are taken as testing data set
let sasSpline lambda = shuffleAndSplitSpline 0.2 5 xV yV lambda
(**
The given data set is small and therefore the mean errors show a high variability.
*)
let sasp =
let errorSplinekf =
ordersToCheck
|> Array.map (fun order ->
//basisPoints define, where the knots of the spline are located
let error = sasPolynomial order
(order,error.Error),error.ErrorStDev)
|> Array.unzip
fst errorSplinekf
|> Chart.Line
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withXAxisStyle "order"
|> Chart.withYAxisStyle "mean error"
|> Chart.withYAxis(LayoutObjects.LinearAxis.init(AxisType=StyleParam.AxisType.Log))
|> Chart.withYErrorStyle (Array= snd errorSplinekf)
|> Chart.withTitle "shuffle_and_split polynomial error"
(*** condition: ipynb ***)
#if IPYNB
sasp
#endif // IPYNB
(***hide***)
sasp |> GenericChart.toChartHTML
(***include-it-raw***)
let sass =
let errorSplinekf =
lambdasToCheck
|> Array.map (fun lambda ->
//basisPoints define, where the knots of the spline are located
let error = sasSpline lambda
(lambda,error.Error),error.ErrorStDev)
|> Array.unzip
fst errorSplinekf
|> Chart.Line
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withXAxisStyle "lambda"
|> Chart.withYAxisStyle "mean error"
|> Chart.withYErrorStyle (Array= snd errorSplinekf)
|> Chart.withTitle "shuffle_and_split spline error"
(*** condition: ipynb ***)
#if IPYNB
sass
#endif // IPYNB
(***hide***)
sass |> GenericChart.toChartHTML
(***include-it-raw***)