-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathkcrossvalidation.py
More file actions
157 lines (126 loc) · 5.97 KB
/
kcrossvalidation.py
File metadata and controls
157 lines (126 loc) · 5.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import data_handling as dt
from prog_bar import ProgBar
from exampleentry import *
import treerandom
import treepredict
def do_kcross_validation(fin, finy, kfolds, m_values=None, k_values=None, n_values=None):
    """Grid-search a randomized forest with k-fold cross-validation.

    The feature rows read from ``fin`` and the labels read from ``finy`` are
    joined, cut into consecutive blocks, and every block is used once as the
    validation set while the remaining blocks form the training set.  This is
    repeated for every (m, k, n) combination; the per-configuration average
    accuracy is printed and the best combination is reported at the end.

    Parameters:
        fin       -- source of the feature rows, handed to ``dt.get_lines``
                     together with ``float`` and ``" "``.
        finy      -- source of the labels, handed to ``dt.get_lines`` with ``int``.
        kfolds    -- number of folds; the block size is ``len(lines) // kfolds``.
        m_values  -- values forwarded as ``m=`` to
                     ``treerandom.build_randomized_forest`` (default ``[100]``).
        k_values  -- values forwarded as ``kcandidates=`` (default ``[5]``).
        n_values  -- values forwarded as ``nmin=`` (default ``[5]``).

    Returns:
        None.  All results are written to standard output.

    Raises:
        ZeroDivisionError -- if ``kfolds`` is 0.
    """
    # The defaults reproduce the single configuration that used to be hard-coded.
    if m_values is None:
        m_values = [100]
    if k_values is None:
        k_values = [5]
    if n_values is None:
        n_values = [5]

    print("Starting k=" + str(kfolds)+" validation for random forest")
    # (original author's note: there are 2500 tracks in the data set)
    labels = dt.get_lines(finy, int)

    pb = ProgBar()
    lines = dt.get_lines(fin, float, " ", callback=pb.callback)
    del pb

    #normalize features
    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)

    # Explicit floor division: same result as "/" on ints under Python 2, and
    # still an int if the file is ever run under Python 3.
    block_size = len(lines) // kfolds
    print("chunk size = " + str(block_size))
    # NOTE(review): when len(lines) is not a multiple of kfolds, dt.chunks
    # presumably yields a shorter trailing chunk, which would give more than
    # kfolds folds -- confirm against dt.chunks.
    example_chunks = list(dt.chunks(data, block_size))
    print("number of chunks = " +str(len(example_chunks)))

    bestm = 0
    bestk = 0
    bestn = 0
    bestaccuracy = 0

    for m_val in m_values:
        for k_val in k_values:
            for n_val in n_values:
                # Holds the per-fold accuracies of THIS configuration only.
                # It must be reset here; otherwise the average of a later
                # configuration would also include the folds of earlier ones.
                accuracy_results = []

                for leaveout, held_out in enumerate(example_chunks):
                    # The block at index `leaveout` is kept out of training.
                    print("prepare validation set")
                    print("leaving out block " + str(leaveout) + " for validation")
                    # The last element of each row is used as the label, the
                    # rest as the feature vector.
                    validationdata = [exampleentry(row[:-1], row[-1]) for row in held_out]

                    trainingdata = []
                    # Trailing comma: Python 2 idiom that suppresses the newline.
                    print("merging blocks "),
                    for j, chunk in enumerate(example_chunks):
                        if j != leaveout:
                            print(str(j) + ","),
                            # extend() avoids re-copying the growing list on
                            # every merged block.
                            trainingdata.extend(chunk)

                    print("\nprepare training set")
                    print("training on " + str(len(trainingdata)))
                    # NOTE(review): training rows still carry the label as
                    # their last element, so this count presumably includes it.
                    print("each track has " + str(len(trainingdata[0])) + " features")

                    pb = ProgBar()
                    forest = treerandom.build_randomized_forest(
                        trainingdata, m=m_val, kcandidates=k_val, nmin=n_val,
                        callback=pb.callback)
                    del pb

                    print("testing on " + str(len(validationdata)))
                    corrects = 0
                    #classify a set of entries
                    for example in validationdata:
                        result = treerandom.classify(example.features, forest)
                        if result == example.label:
                            corrects = corrects + 1

                    # Float arithmetic so the percentage is not truncated to a
                    # whole number before it is averaged.
                    accuracy_percentage = (corrects * 100.0) / len(validationdata)
                    print("accuracy = " + str(accuracy_percentage) + "%")
                    accuracy_results.append(accuracy_percentage)

                avgcc = dt.average(accuracy_results)
                print("average accuracy using m="+str(m_val) + ", k="+str(k_val)+", n="+str(n_val) + "---> " + str(avgcc) + "%")
                if avgcc > bestaccuracy:
                    bestm = m_val
                    bestk = k_val
                    bestn = n_val
                    bestaccuracy = avgcc

    print("BEST COMBINATION m="+str(bestm) + ", k="+str(bestk)+", n="+str(bestn) + "---> " + str(bestaccuracy) + "%")
def do_simpletree_kcross_validation(fin, finy, kfolds):
    """Evaluate a single decision tree with k-fold cross-validation.

    The feature rows read from ``fin`` and the labels read from ``finy`` are
    joined, cut into consecutive blocks, and every block is used once as the
    validation set while a tree built by ``treepredict.buildtree`` is trained
    on the remaining blocks.  The accuracy of each fold and the average over
    all folds are printed.

    Parameters:
        fin     -- source of the feature rows, handed to ``dt.get_lines``
                   together with ``float`` and ``" "``.
        finy    -- source of the labels, handed to ``dt.get_lines`` with ``int``.
        kfolds  -- number of folds; the block size is ``len(lines) // kfolds``.

    Returns:
        None.  All results are written to standard output.

    Raises:
        ZeroDivisionError -- if ``kfolds`` is 0.
    """
    print("Starting k=" + str(kfolds)+" validation for Simple tree")
    # (original author's note: there are 2500 tracks in the data set)
    labels = dt.get_lines(finy, int)

    pb = ProgBar()
    lines = dt.get_lines(fin, float, " ", callback=pb.callback)
    del pb

    #normalize features
    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)

    # Explicit floor division: same result as "/" on ints under Python 2, and
    # still an int if the file is ever run under Python 3.
    block_size = len(lines) // kfolds
    print("chunk size = " + str(block_size))
    # NOTE(review): when len(lines) is not a multiple of kfolds, dt.chunks
    # presumably yields a shorter trailing chunk, which would give more than
    # kfolds folds -- confirm against dt.chunks.
    example_chunks = list(dt.chunks(data, block_size))
    print("number of chunks = " +str(len(example_chunks)))

    # Per-fold accuracy percentages, averaged once all folds are done.
    accuracy_results = []

    for leaveout, held_out in enumerate(example_chunks):
        # The block at index `leaveout` is kept out of training.
        print("prepare validation set")
        print("leaving out block " + str(leaveout) + " for validation")
        # The last element of each row is used as the label, the rest as the
        # feature vector.
        validationdata = [exampleentry(row[:-1], row[-1]) for row in held_out]

        trainingdata = []
        # Trailing comma: Python 2 idiom that suppresses the newline.
        print("merging blocks "),
        for j, chunk in enumerate(example_chunks):
            if j != leaveout:
                print(str(j) + ","),
                # extend() avoids re-copying the growing list on every
                # merged block.
                trainingdata.extend(chunk)

        print("\nprepare training set")
        print("training on " + str(len(trainingdata)))
        # NOTE(review): training rows still carry the label as their last
        # element, so this count presumably includes it.
        print("each track has " + str(len(trainingdata[0])) + " features")

        tree = treepredict.buildtree(trainingdata)

        print("testing on " + str(len(validationdata)))
        corrects = 0
        #classify a set of entries
        for example in validationdata:
            result = treepredict.classify(example.features, tree)
            if result == example.label:
                corrects = corrects + 1

        # Float arithmetic so the percentage is not truncated to a whole
        # number before it is averaged.
        accuracy_percentage = (corrects * 100.0) / len(validationdata)
        print("accuracy = " + str(accuracy_percentage) + "%")
        accuracy_results.append(accuracy_percentage)

    avgcc = dt.average(accuracy_results)
    print("average accuracy ="+ str(avgcc) + "%")