-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathRFE.py
More file actions
126 lines (95 loc) · 5.44 KB
/
RFE.py
File metadata and controls
126 lines (95 loc) · 5.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# ===================== #
# Compute Recursive Feature Elimination
# ===================== #
# Author: Jesse Wolf, jwolf@uoguelph.ca | Thomas Papp-Simon, tpappsim@uoguelph.ca
# Date: March 18, 2023
# How to run: python3 RFE.py
# This script loops through training splits to generate a csv file for each split, containing only the features RFE selects.
# ================= #
# Import relevant libraries
import pandas as pd
import os
import re
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
print ("\nBeginning RFE.py.\n")

# Set the directory where the scaled data are located
directory = './scaled_training_sets/'

# Output directory for the per-split RFE results
path = "RFE_splits"
# exist_ok=True creates the directory only when it is missing, so no separate
# existence check (and no window between the check and the creation) is needed
os.makedirs(path, exist_ok=True)

# Get a list of every file in the directory whose name starts with 'training'
files = [os.path.join(directory, entry) for entry in os.listdir(directory) if entry.startswith('training')]
# Run RFE on every training split and write one reduced CSV per split to RFE_splits.
N_CANDIDATE_FEATURES = 23  # the first 23 columns of each split are offered to RFE
LABEL_COLUMN_INDEX = 27    # NOTE(review): presumably the 'wl_home' column - confirm against the CSV layout
N_FEATURES_TO_SELECT = 17  # number of features RFE keeps for each split

for filename in files:
    # Use regex to get only the 'trainingyear1-year2' part of the file name.
    # This is done before the fit so an unusable name is reported without wasted work.
    filename_re = re.search(r'\/(\w+-\w+)\.', filename)
    if not filename_re:
        # Without a match there is no valid output name: skip the file rather than
        # fail with a NameError or reuse (and overwrite) the previous split's name.
        print(f"Skipping {filename}: file name does not match the expected 'name-name.' pattern.")
        continue
    new_filename = filename_re.group(1)

    # load the dataset
    merged_df = pd.read_csv(filename)
    # Identifier/label columns that are carried through to the output unchanged
    df_extra = merged_df[['team_abbreviation_home', 'team_abbreviation_away', 'game_date', 'game_yearEnd', 'wl_home']]
    X = merged_df.values[:, :N_CANDIDATE_FEATURES]
    Y = merged_df.values[:, LABEL_COLUMN_INDEX].astype(int)

    # Feature selection. RFE clones the estimator and fits the clone itself,
    # so the model does not need to be fitted beforehand.
    model = LogisticRegression(solver='lbfgs', max_iter=1000)
    rfe = RFE(model, n_features_to_select=N_FEATURES_TO_SELECT)
    rfe.fit(X, Y)

    # Map the boolean support mask back to the candidate column names
    cols = list(merged_df.columns[:N_CANDIDATE_FEATURES])
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp].index

    # Selected features first, then the identifier/label columns
    df_rfe_clean = pd.concat([merged_df[selected_features_rfe].reset_index(drop=True), df_extra], axis=1)
    df_rfe_clean.to_csv(f'./RFE_splits/RFE_{new_filename}.csv', index=False)
### Build a test set holding only the columns that are also present in
### RFE_training2015-2021_outliers_removed_scaled.csv
RFE_training = pd.read_csv("./RFE_splits/RFE_training2015-2021_outliers_removed_scaled.csv")
scaled_test_set = pd.read_csv("./scaled_training_sets/test_set_outliers_removed_scaled.csv")

# Every test-set column that is absent from the RFE training file is removed;
# the remaining columns keep the order they have in the test set.
not_in_training = [name for name in scaled_test_set.columns if name not in RFE_training.columns]
scaled_test_set = scaled_test_set.drop(columns=not_in_training)

scaled_test_set.to_csv('./RFE_splits/test_RFE_all.csv', index=False)
# Load every file in RFE_splits whose name starts with "RFE" into a list of data
# frames, then work out which columns are present in all of them.
directory = "RFE_splits"
file_extension = "RFE"  # NOTE: despite its name, this is matched as a file-name prefix

# Create an empty list to store each dataframe
frames = []

# Loop over all files in the directory. The listing is sorted so that the first
# frame (which defines the column order below) is the same on every run.
for file_name in sorted(os.listdir(directory)):
    # Check if the file name has the expected prefix
    if file_name.startswith(file_extension):
        # Load the data frame from the CSV file
        df = pd.read_csv(os.path.join(directory, file_name))
        frames.append(df)

if not frames:
    # set.intersection() with no arguments would fail with an obscure TypeError
    raise FileNotFoundError(f"No files starting with '{file_extension}' were found in '{directory}'")

# Create a list of the columns (features) that are shared among all datasets.
# The list follows the column order of the first frame: iterating the set directly
# would give an order that can change from run to run (string hash randomisation),
# which would make the column order of the downstream CSVs irreproducible.
shared_names = set.intersection(*(set(frame.columns) for frame in frames))
common_cols = [name for name in frames[0].columns if name in shared_names]
# # Print the common column names, or a message if there are no common column names
# if len(common_cols) > 0:
#     print(f"Common column names: {common_cols}")
# else:
#     print("No common column names")
# Build the RFE-common train and test sets: both are cut down to the columns in
# common_cols, with the identifier/label columns moved to the right-hand side.
identifier_cols = ['team_abbreviation_home', 'team_abbreviation_away', 'game_date', 'game_yearEnd', 'wl_home']


def _identifiers_last(frame, shared_cols, id_cols):
    """Return frame limited to shared_cols, with id_cols placed after all other columns."""
    subset = frame.loc[:, shared_cols]
    id_part = subset[id_cols]
    feature_part = subset.drop(columns=id_cols)
    return pd.concat([feature_part, id_part], axis=1)


# Read in the train and test sets
train2015_2021 = pd.read_csv('./RFE_splits/RFE_training2015-2021_outliers_removed_scaled.csv')
test = pd.read_csv('./scaled_training_sets/test_set_outliers_removed_scaled.csv')

# Keep only the RFE-common columns; the remaining feature columns stay in the
# order given by common_cols and are followed by the identifier/label columns
train2015_2021 = _identifiers_last(train2015_2021, common_cols, identifier_cols)
test = _identifiers_last(test, common_cols, identifier_cols)

# Create new csvs for downstream use
train2015_2021.to_csv('./RFE_splits/train2015_2021_RFEcommon.csv', index=False)
test.to_csv('./RFE_splits/test_RFEcommon.csv', index=False)

print ("RFE.py has finished running, on to featureImportance.py.\n")