-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcensus_XGBoost.py
More file actions
90 lines (52 loc) · 2.32 KB
/
census_XGBoost.py
File metadata and controls
90 lines (52 loc) · 2.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 12 17:47:29 2018
@author: David
XGBoost run on cen2010 dataframe
Regressing median house value for census tract against various demographic features from 2010 census data
Adapted from example with Boston Housing Dataset:
http://www.blopig.com/blog/2017/07/using-random-forests-in-python-with-scikit-learn/
and these XGBoost tutorials/Kaggle submissions:
https://machinelearningmastery.com/xgboost-python-mini-course/
https://www.kaggle.com/mburakergenc/predictions-with-xgboost-and-linear-regression
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from xgboost import XGBRegressor
### read CSV file containing BR census data
cen2010 = pd.read_csv('C:\\Users\\David\\Documents\\Data Science Related\\Project_Bremer\\cen2010_reduced2010CensusData.csv')
### Check how many columns contain missing data
print(cen2010.isnull().any().sum(), ' / ', len(cen2010.columns))
### Check how many entries in total are missing
print(cen2010.isnull().any(axis=1).sum(), ' / ', len(cen2010))
### drop any rows with missing data
cen2010_dropna = cen2010.dropna()
### select columns for features
features = cen2010_dropna.iloc[: , :-1]
### select target variable
target = cen2010_dropna.iloc[: , -1]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)
### instantiate XGboost regressor
xgb = XGBRegressor(n_estimators=200, learning_rate=0.08, gamma=0, subsample=0.75,
colsample_bytree=1, max_depth=4)
### fit xgb regressor to training data
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test)
print(explained_variance_score(predictions,y_test))
'''
### evaluate model fit on test data with several metrics
predicted_train = rf.predict(X_train)
predicted_test = rf.predict(X_test)
test_score = r2_score(y_test, predicted_test)
spearman = spearmanr(y_test, predicted_test)
pearson = pearsonr(y_test, predicted_test)
print(f'Out-of-bag R-2 score estimate: {rf.oob_score_:>5.3}')
print(f'Test data R-2 score: {test_score:>5.3}')
print(f'Test data Spearman correlation: {spearman[0]:.3}')
print(f'Test data Pearson correlation: {pearson[0]:.3}')
'''