-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathLogisticRregression.py
More file actions
122 lines (88 loc) · 3.22 KB
/
LogisticRregression.py
File metadata and controls
122 lines (88 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Load the HW4 training/test CSVs and bring in the ML toolkit.
import csv

import numpy as np
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in 0.20

# NOTE(review): paths are machine-specific; consider taking them from sys.argv.
data_training = np.genfromtxt('/Users/nesbtesh/Downloads/HW4.training.csv', delimiter=",")
data_test = np.genfromtxt('/Users/nesbtesh/Downloads/HW4.test.csv', delimiter=",")
# --- Baseline: logistic regression on all 22 features --------------------
# Features are columns 0..21; the label is column 22 of the training file.
X_train = data_training[:, 0:22]
X_test = data_test[:, 0:22]
Y_train = data_training[:, 22]

# Logistic-regression classifier, evaluated with 2-fold cross-validation
# (non-shuffled splits, same folds the old KFold(len(X), n_folds=2) gave).
regr = linear_model.LogisticRegression()
kf = KFold(n_splits=2)

score_model1 = []
for train_index, test_index in kf.split(X_train):
    # Fit on one fold, score accuracy on the held-out fold.
    regr.fit(X_train[train_index], Y_train[train_index])
    score_model1.append(regr.score(X_train[test_index], Y_train[test_index]))

# Mean cross-validated accuracy of the baseline model.
print(sum(score_model1) / len(score_model1))

# Predictions for the real test data (written to CSV at the end of the script).
prediction = regr.predict(X_test)
####################################### With PCA (15 components) #########
# Fit a full 22-component PCA first so the explained-variance ratios can
# be inspected before choosing how many components to keep.
pca = PCA(n_components=22)
pca.fit(X_train)
print(pca.explained_variance_ratio_)

# From the variance ratios, keep the first 15 principal components.
pca = PCA(n_components=15)
X_reduced = pca.fit_transform(X_train)

# 2-fold cross-validation on the PCA-reduced feature space.
kf = KFold(n_splits=2)
score_model1 = []
for train_index, test_index in kf.split(X_reduced):
    # Fit on one fold, score accuracy on the held-out fold.
    regr.fit(X_reduced[train_index], Y_train[train_index])
    score_model1.append(regr.score(X_reduced[test_index], Y_train[test_index]))

# Mean cross-validated accuracy with 15 components.
print(sum(score_model1) / len(score_model1))
####################################### With PCA (12 components) #########
# Same experiment as above, but keeping only 12 principal components.
pca = PCA(n_components=22)
pca.fit(X_train)
print(pca.explained_variance_ratio_)

pca = PCA(n_components=12)
X_reduced = pca.fit_transform(X_train)

# 2-fold cross-validation on the 12-component feature space.
kf = KFold(n_splits=2)
score_model1 = []
for train_index, test_index in kf.split(X_reduced):
    # Fit on one fold, score accuracy on the held-out fold.
    regr.fit(X_reduced[train_index], Y_train[train_index])
    score_model1.append(regr.score(X_reduced[test_index], Y_train[test_index]))

# Mean cross-validated accuracy with 12 components.
print(sum(score_model1) / len(score_model1))
# --- Final model: train on the chosen variables and write predictions ----
# NOTE(review): the CV above selected 12 *principal components*, but this
# section takes the first 12 raw columns instead — confirm that is intended
# (applying pca.transform to the test data would match the CV experiment).
X_train = data_training[:, 0:12]
# Bug fix: train on ALL training rows, not the stale `train_index` left
# over from the last cross-validation fold.
regr.fit(X_train, Y_train)

# Predict on the matching 12 columns of the test data.
X_test = data_test[:, 0:12]
prediction = regr.predict(X_test)

# Write one prediction per row. Text mode with newline='' is the documented
# way to use the csv module on Python 3; the context manager also closes the
# file (the original opened 'wb' and never closed it).
with open('results2.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, dialect='excel')
    for row in prediction:
        wr.writerow([row])