-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDyMMMFeatureImportance.py
More file actions
102 lines (84 loc) · 3.32 KB
/
DyMMMFeatureImportance.py
File metadata and controls
102 lines (84 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
#https://explained.ai/rf-importance/
#https://www.kaggle.com/raviolli77/random-forest-in-python
from sklearn.base import clone
def dropcol_importances(rf, X_train, y_train):
    """Compute drop-column feature importances from out-of-bag (OOB) scores.

    A fresh clone of ``rf`` is fitted on all columns of ``X_train`` to get a
    baseline OOB score. Then, for every column, another fresh clone is fitted
    on ``X_train`` with that column removed. The importance of a column is
    ``baseline - score_without_column``, so a positive value means the OOB
    score dropped when the column was removed.

    Parameters
    ----------
    rf : estimator
        Estimator that can be copied with ``sklearn.base.clone``, accepts the
        ``oob_score`` and ``random_state`` parameters and exposes
        ``oob_score_`` after ``fit``. It is never fitted or modified here;
        only clones are.
    X_train : pandas.DataFrame
        Training features; its columns are dropped one at a time.
    y_train : array-like
        Training targets passed unchanged to every ``fit`` call.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Feature'`` with a single ``'Importance'`` column,
        sorted by importance in ascending order.
    """

    def _oob_score(features):
        # clone() yields an unfitted copy, so every fit starts from scratch.
        model = clone(rf)
        # OOB scoring must be switched on for *every* clone: previously only
        # the baseline clone enabled it, so the per-column fits raised
        # AttributeError on ``oob_score_`` unless ``rf`` already had it set.
        # The fixed seed keeps baseline and per-column forests comparable.
        model.set_params(oob_score=True, random_state=999)
        model.fit(features, y_train)
        return model.oob_score_

    baseline = _oob_score(X_train)
    imp = np.array([
        baseline - _oob_score(X_train.drop(col, axis=1))
        for col in X_train.columns
    ])

    result = pd.DataFrame(
        data={'Feature': X_train.columns,
              'Importance': imp})
    result = result.set_index('Feature')
    return result.sort_values('Importance', ascending=True)
def _plot_importance_bars(labels, values, title, yerr=None):
    """Show a red bar chart of ``values`` with one x tick per entry of ``labels``."""
    positions = range(len(labels))
    plt.figure()
    plt.title(title)
    plt.bar(positions, values, color="r", yerr=yerr, align="center")
    plt.xticks(positions, labels)
    plt.xlim([-1, len(labels)])
    plt.show()


def randomForrest(fileName):
    """Fit a random forest on a results CSV and show three importance views.

    The CSV at ``fileName`` is read with pandas. The columns ``CSI``,
    ``biomass1_SS``, ``biomass2_SS``, ``biomass1`` and ``biomass2`` are
    removed from the feature matrix, and the binary target is ``1`` where
    ``CSI >= 0.8`` and ``0`` otherwise. After a stratified train/test split
    a ``RandomForestClassifier`` is fitted and three importance estimates
    are printed and plotted, each in its own figure:

    1. impurity-based importances (``feature_importances_``) with the
       standard deviation across the individual trees as error bars,
    2. permutation importances evaluated on the test split,
    3. drop-column importances computed by ``dropcol_importances`` on the
       training split.

    Parameters
    ----------
    fileName : str or path-like
        Path of the CSV file to load.

    Returns
    -------
    None
        Results are reported through ``print`` and matplotlib figures only.
    """
    df = pd.read_csv(fileName)
    #df = df[df.columns[1:]]
    X = df.drop(['CSI', 'biomass1_SS', 'biomass2_SS', 'biomass1', 'biomass2'], axis=1)
    feature_names = X.columns.to_list()
    y = np.where(df['CSI'] >= 0.8, 1, 0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=42)

    # Build a forest and compute the impurity-based feature importances.
    forest = RandomForestClassifier(n_estimators=120, warm_start=True, max_features=None,
                                    oob_score=True,
                                    random_state=0)
    forest.fit(X_train, y_train)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    # Column positions ordered from most to least important.
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature %d (%f)" % (rank, idx, importances[idx]))
    colNames = [X.columns[idx] for idx in indices]
    print(colNames)

    ranking = pd.DataFrame(data=colNames, columns=['param'])
    ranking['importances'] = importances[indices]
    ranking['std'] = std[indices]
    print(ranking)

    _plot_importance_bars(colNames, importances[indices],
                          "Feature importances", yerr=std[indices])

    # Permutation importance on the held-out split.
    result = permutation_importance(
        forest, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
    forest_importances = pd.Series(result.importances_mean, index=feature_names)
    print(forest_importances)
    fig, ax = plt.subplots()
    forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
    ax.set_title("Feature importances using permutation on full model")
    ax.set_ylabel("Mean accuracy decrease")
    fig.tight_layout()
    plt.show()

    # Drop-column importance. The result used to be computed and then ignored
    # while the impurity-based chart was drawn a second time; plot the
    # drop-column values themselves, most important feature first.
    dropcol = dropcol_importances(forest, X_train, y_train)
    dropcol = dropcol.sort_values('Importance', ascending=False)
    print(dropcol)
    _plot_importance_bars(dropcol.index.to_list(), dropcol['Importance'].to_numpy(),
                          "Feature importances using drop-column OOB score")
if __name__ == '__main__':
    import sys

    # Script entry point: analyse the CSV given as the first command-line
    # argument, falling back to the original hard-coded results file so that
    # running the script without arguments behaves exactly as before.
    default_csv = "C:/dymmdir/code/worktree/ruhycode/communities/communitycoop_cstr/V8s/params_00000_RESULT.csv"
    randomForrest(sys.argv[1] if len(sys.argv) > 1 else default_csv)