-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathLogisticRregression.py
More file actions
122 lines (88 loc) · 3.22 KB
/
LogisticRregression.py
File metadata and controls
122 lines (88 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Load the HW4 training/test CSVs and bring in the ML toolkit.
import csv

import numpy as np
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in 0.20

# NOTE(review): paths are machine-specific; consider taking them from sys.argv.
data_training = np.genfromtxt('/Users/nesbtesh/Downloads/HW4.training.csv', delimiter=",")
data_test = np.genfromtxt('/Users/nesbtesh/Downloads/HW4.test.csv', delimiter=",")
# --- Baseline: logistic regression on all 22 features --------------------
# Features are columns 0..21; the label is column 22 of the training file.
X_train = data_training[:, 0:22]
X_test = data_test[:, 0:22]
Y_train = data_training[:, 22]

# Logistic-regression classifier, evaluated with 2-fold cross-validation
# (non-shuffled splits, same folds the old KFold(len(X), n_folds=2) gave).
regr = linear_model.LogisticRegression()
kf = KFold(n_splits=2)

score_model1 = []
for train_index, test_index in kf.split(X_train):
    # Fit on one fold, score accuracy on the held-out fold.
    regr.fit(X_train[train_index], Y_train[train_index])
    score_model1.append(regr.score(X_train[test_index], Y_train[test_index]))

# Mean cross-validated accuracy of the baseline model.
print(sum(score_model1) / len(score_model1))

# Predictions for the real test data (written to CSV at the end of the script).
prediction = regr.predict(X_test)
####################################### With PCA (15 components) #########
# Fit a full 22-component PCA first so the explained-variance ratios can
# be inspected before choosing how many components to keep.
pca = PCA(n_components=22)
pca.fit(X_train)
print(pca.explained_variance_ratio_)

# From the variance ratios, keep the first 15 principal components.
pca = PCA(n_components=15)
X_reduced = pca.fit_transform(X_train)

# 2-fold cross-validation on the PCA-reduced feature space.
kf = KFold(n_splits=2)
score_model1 = []
for train_index, test_index in kf.split(X_reduced):
    # Fit on one fold, score accuracy on the held-out fold.
    regr.fit(X_reduced[train_index], Y_train[train_index])
    score_model1.append(regr.score(X_reduced[test_index], Y_train[test_index]))

# Mean cross-validated accuracy with 15 components.
print(sum(score_model1) / len(score_model1))
####################################### With PCA (12 components) #########
# Same experiment as above, but keeping only 12 principal components.
pca = PCA(n_components=22)
pca.fit(X_train)
print(pca.explained_variance_ratio_)

pca = PCA(n_components=12)
X_reduced = pca.fit_transform(X_train)

# 2-fold cross-validation on the 12-component feature space.
kf = KFold(n_splits=2)
score_model1 = []
for train_index, test_index in kf.split(X_reduced):
    # Fit on one fold, score accuracy on the held-out fold.
    regr.fit(X_reduced[train_index], Y_train[train_index])
    score_model1.append(regr.score(X_reduced[test_index], Y_train[test_index]))

# Mean cross-validated accuracy with 12 components.
print(sum(score_model1) / len(score_model1))
# --- Final model: train on the chosen variables and write predictions ----
# NOTE(review): the CV above selected 12 *principal components*, but this
# section takes the first 12 raw columns instead — confirm that is intended
# (applying pca.transform to the test data would match the CV experiment).
X_train = data_training[:, 0:12]
# Bug fix: train on ALL training rows, not the stale `train_index` left
# over from the last cross-validation fold.
regr.fit(X_train, Y_train)

# Predict on the matching 12 columns of the test data.
X_test = data_test[:, 0:12]
prediction = regr.predict(X_test)

# Write one prediction per row. Text mode with newline='' is the documented
# way to use the csv module on Python 3; the context manager also closes the
# file (the original opened 'wb' and never closed it).
with open('results2.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, dialect='excel')
    for row in prediction:
        wr.writerow([row])