-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpca_features.py
More file actions
54 lines (38 loc) · 1.8 KB
/
pca_features.py
File metadata and controls
54 lines (38 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import sklearn as sk
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Rank the dataset's columns by how strongly they load on the leading
# principal components. Everything below runs at import time: the workbook is
# read, PCA is fitted, and the top-ranked columns are printed.

# Number of principal components whose loadings feed the ranking.
N_COMPONENTS = 3

# Load sheet "F4" and discard the three timestamp columns so they are not
# passed to the scaler / PCA.
df = pd.read_excel("WaitData.Published.xlsx", sheet_name="F4")
df = df.drop(columns=["x_ArrivalDTTM", "x_ScheduledDTTM", "x_BeginDTTM"])

# Drop every column that contains at least one missing value (PCA rejects NaN
# input). This is column-wise, so it can also remove the target column.
df = df.dropna(axis=1)

# Fail with an explanatory message instead of a bare KeyError('Wait') when the
# target is absent or was just removed by the NaN filter above.
if "Wait" not in df.columns:
    raise KeyError(
        'Target column "Wait" is missing from sheet "F4" or was dropped '
        "because it contains missing values"
    )

# Separate the target from the features. `y` is kept for later use; it is not
# consumed anywhere in this script.
y = df["Wait"]
X = df.drop(columns=["Wait"])

# Standardize the features so that no column dominates the PCA purely because
# of its scale.
# NOTE(review): assumes every remaining column is numeric - confirm against
# the workbook; StandardScaler raises on non-numeric data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # array output by default; labels stay on X

# Fit the PCA and project the data in a single call.
pca = PCA(n_components=N_COMPONENTS)
X_pca = pca.fit_transform(X_scaled)

# Loadings table: one row per principal component, one column per original
# feature (pca.components_ has shape (n_components, n_features)).
components_df = pd.DataFrame(
    pca.components_,
    columns=X.columns,
    index=[f"PC{i + 1}" for i in range(pca.n_components_)],
)

# Score each feature by the sum of its absolute loadings over all retained
# components. Every component counts equally here; the loadings are not
# weighted by explained variance.
importance = components_df.abs().sum(axis=0)
sorted_importance = importance.sort_values(ascending=False)

# Report the five highest-scoring features.
print("Most important columns based on PCA:")
print(sorted_importance.head())