Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 42 additions & 43 deletions ai_model/complete_ml_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, confusion_matrix, classification_report,
mean_squared_error, mean_absolute_error, r2_score
roc_auc_score, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import xgboost as xgb
Expand All @@ -27,17 +26,17 @@
warnings.filterwarnings('ignore')

# Database utilities
import sys
import os
import sys # noqa: E402
import os # noqa: E402
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from operations.db_utils import (
save_dataframe_to_db,
save_model_performance,
from operations.db_utils import ( # noqa: E402
save_dataframe_to_db,
save_model_performance,
create_model_performance_table,
test_connection
)
from operations.db_config import TABLES, DB_CONFIG
from operations.db_config import TABLES, DB_CONFIG # noqa: E402

# Get the project root directory
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
Expand Down Expand Up @@ -178,7 +177,7 @@ def assign_credential_type(row):

df['target_credential_type'] = df.apply(assign_credential_type, axis=1)

print(f"Created target variables:")
print("Created target variables:")
print(f" - Retention: {df['target_retention'].value_counts().to_dict()}")
print(f" - At Risk: {df['target_at_risk'].value_counts().to_dict()}")
print(f" - Credential Type: {df['target_credential_type'].value_counts().to_dict()}")
Expand Down Expand Up @@ -285,8 +284,8 @@ def preprocess_features(df, feature_list):
print("TESTING MULTIPLE MODELS WITH CROSS-VALIDATION")
print("-" * 80)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression # noqa: E402
from sklearn.model_selection import StratifiedKFold # noqa: E402

models_to_test = {
'Logistic Regression': LogisticRegression(
Expand Down Expand Up @@ -349,11 +348,11 @@ def preprocess_features(df, feature_list):
print(f" Gap: {gap:.4f} ({gap*100:.2f}%)")

if gap < 0.05:
print(f" ✓ No overfitting (gap < 5%)")
print(" ✓ No overfitting (gap < 5%)")
elif gap < 0.10:
print(f" ⚠ Minimal overfitting (gap < 10%)")
print(" ⚠ Minimal overfitting (gap < 10%)")
else:
print(f" ✗ Overfitting detected (gap > 10%)")
print(" ✗ Overfitting detected (gap > 10%)")

model_comparison.append({
'Model': model_name,
Expand Down Expand Up @@ -413,8 +412,8 @@ def preprocess_features(df, feature_list):

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(f" Predicted")
print(f" Not Ret Retained")
print(" Predicted")
print(" Not Ret Retained")
print(f"Actual Not {cm[0,0]:6d} {cm[0,1]:6d}")
print(f" Ret {cm[1,0]:6d} {cm[1,1]:6d}")

Expand Down Expand Up @@ -545,8 +544,8 @@ def assign_alert_level(risk_score):
low_retention_low_risk = df[(df['retention_probability'] < 0.3) & (df['at_risk_alert'] == 'LOW')]
print(f"Students with <30% retention flagged as LOW: {len(low_retention_low_risk)} (should be very few)")

print(f"\nEarly warning system aligned with retention predictions")
print(f"\nAlert distribution:")
print("\nEarly warning system aligned with retention predictions")
print("\nAlert distribution:")
print(df['at_risk_alert'].value_counts().sort_index())

# ============================================================================
Expand Down Expand Up @@ -611,7 +610,7 @@ def assign_alert_level(risk_score):
df['predicted_time_to_credential'] = time_model.predict(X_full_retention)
df['predicted_graduation_year'] = df['Cohort'].str[:4].astype(float) + df['predicted_time_to_credential']

print(f"Time predictions generated")
print("Time predictions generated")
else:
print("Warning: Insufficient data for time-to-credential model")
df['predicted_time_to_credential'] = np.nan
Expand All @@ -630,7 +629,7 @@ def assign_alert_level(risk_score):
y_credential = y_credential[valid_idx]

print(f"\nDataset size: {len(X_cred):,} students")
print(f"Credential type distribution:")
print("Credential type distribution:")
cred_labels = {0: 'No Credential', 1: 'Certificate', 2: 'Associate', 3: 'Bachelor'}
for k, v in y_credential.value_counts().sort_index().items():
print(f" {cred_labels.get(k, k)}: {v:,} ({v/len(y_credential)*100:.1f}%)")
Expand Down Expand Up @@ -677,7 +676,7 @@ def assign_alert_level(risk_score):
model_name='Credential Type Prediction',
model_type='classification',
metrics={'accuracy': cred_accuracy, 'f1': cred_f1},
notes=f'Random Forest Classifier - 4 classes (No Credential, Certificate, Associate, Bachelor)'
notes='Random Forest Classifier - 4 classes (No Credential, Certificate, Associate, Bachelor)'
)

# Generate predictions for all students
Expand All @@ -699,7 +698,7 @@ def assign_alert_level(risk_score):
if class_idx < len(prob_labels):
df[prob_labels[int(class_idx)]] = proba[:, i]

print(f"Credential type predictions generated")
print("Credential type predictions generated")

# ============================================================================
# STEP 8: MODEL 5 - GATEWAY MATH SUCCESS PREDICTION
Expand Down Expand Up @@ -784,8 +783,8 @@ def assign_alert_level(risk_score):

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(f" Predicted")
print(f" No Pass Pass")
print(" Predicted")
print(" No Pass Pass")
print(f"Actual No {cm[0,0]:6d} {cm[0,1]:6d}")
print(f" Pass {cm[1,0]:6d} {cm[1,1]:6d}")

Expand All @@ -795,7 +794,7 @@ def assign_alert_level(risk_score):
model_name='Gateway Math Success Prediction',
model_type='classification',
metrics={'accuracy': math_accuracy, 'auc_roc': math_auc, 'precision': math_precision, 'recall': math_recall, 'f1_score': math_f1},
notes=f'XGBoost - Predicts gateway math completion Year 1'
notes='XGBoost - Predicts gateway math completion Year 1'
)

# Generate predictions for all students
Expand All @@ -810,7 +809,7 @@ def assign_alert_level(risk_score):
labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass']
)

print(f"Gateway math predictions generated")
print("Gateway math predictions generated")

# ============================================================================
# STEP 9: MODEL 6 - GATEWAY ENGLISH SUCCESS PREDICTION (NEW!)
Expand Down Expand Up @@ -895,8 +894,8 @@ def assign_alert_level(risk_score):

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(f" Predicted")
print(f" No Pass Pass")
print(" Predicted")
print(" No Pass Pass")
print(f"Actual No {cm[0,0]:6d} {cm[0,1]:6d}")
print(f" Pass {cm[1,0]:6d} {cm[1,1]:6d}")

Expand All @@ -906,7 +905,7 @@ def assign_alert_level(risk_score):
model_name='Gateway English Success Prediction',
model_type='classification',
metrics={'accuracy': english_accuracy, 'auc_roc': english_auc, 'precision': english_precision, 'recall': english_recall, 'f1_score': english_f1},
notes=f'XGBoost - Predicts gateway English completion Year 1'
notes='XGBoost - Predicts gateway English completion Year 1'
)

# Generate predictions for all students
Expand All @@ -921,7 +920,7 @@ def assign_alert_level(risk_score):
labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass']
)

print(f"Gateway English predictions generated")
print("Gateway English predictions generated")

# ============================================================================
# STEP 10: MODEL 7 - FIRST-SEMESTER GPA < 2.0 PREDICTION (NEW! - FIXED DATA LEAKAGE)
Expand Down Expand Up @@ -1009,8 +1008,8 @@ def assign_alert_level(risk_score):

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(f" Predicted")
print(f" GPA>=2.0 GPA<2.0")
print(" Predicted")
print(" GPA>=2.0 GPA<2.0")
print(f"Actual >=2.0 {cm[0,0]:6d} {cm[0,1]:6d}")
print(f" <2.0 {cm[1,0]:6d} {cm[1,1]:6d}")

Expand All @@ -1020,7 +1019,7 @@ def assign_alert_level(risk_score):
model_name='First-Semester Low GPA Prediction',
model_type='classification',
metrics={'accuracy': gpa_accuracy, 'auc_roc': gpa_auc, 'precision': gpa_precision, 'recall': gpa_recall, 'f1_score': gpa_f1},
notes=f'XGBoost - Predicts GPA < 2.0 risk (NO DATA LEAKAGE)'
notes='XGBoost - Predicts GPA < 2.0 risk (NO DATA LEAKAGE)'
)

# Generate predictions for all students
Expand All @@ -1033,7 +1032,7 @@ def assign_alert_level(risk_score):
labels=['Low Risk', 'Moderate Risk', 'High Risk', 'Critical Risk']
)

print(f"Low GPA predictions generated")
print("Low GPA predictions generated")

# ============================================================================
# STEP 11: SAVE PREDICTIONS TO STUDENT-LEVEL FILE
Expand Down Expand Up @@ -1066,7 +1065,7 @@ def assign_alert_level(risk_score):
if_exists='replace'
)
if success:
print(f"✓ Student-level predictions saved to database")
print("✓ Student-level predictions saved to database")
print(f" Table: {TABLES['student_predictions']}")
print(f" Records: {len(df):,}")
print(f" Columns: {len(df.columns)}")
Expand All @@ -1077,7 +1076,7 @@ def assign_alert_level(risk_score):
# Always save CSV files for backup and local analysis
output_file = os.path.join(DATA_DIR, 'kctcs_student_level_with_predictions.csv')
df.to_csv(output_file, index=False)
print(f"\n✓ Saved student-level predictions to CSV:")
print("\n✓ Saved student-level predictions to CSV:")
print(f" File: {output_file}")
print(f" Records: {len(df):,}")
print(f" Columns: {len(df.columns)}")
Expand Down Expand Up @@ -1117,15 +1116,15 @@ def assign_alert_level(risk_score):
if_exists='replace'
)
if success:
print(f"✓ Course-level predictions saved to database")
print("✓ Course-level predictions saved to database")
print(f" Table: {TABLES['course_predictions']}")
print(f" Records: {len(merged_with_predictions):,}")
print(f" Columns: {len(merged_with_predictions.columns)}")

# Always save CSV files for backup and local analysis
output_file = os.path.join(DATA_DIR, 'kctcs_merged_with_predictions.csv')
merged_with_predictions.to_csv(output_file, index=False)
print(f"\n✓ Saved course-level predictions to CSV:")
print("\n✓ Saved course-level predictions to CSV:")
print(f" File: {output_file}")
print(f" Records: {len(merged_with_predictions):,}")
print(f" Columns: {len(merged_with_predictions.columns)}")
Expand Down Expand Up @@ -1165,7 +1164,7 @@ def assign_alert_level(risk_score):
pct = count / len(df) * 100
summary_report += f" {cat:20s} {count:6,} ({pct:5.1f}%)\n"

summary_report += f"""
summary_report += """
2. EARLY WARNING SYSTEM
Algorithm: Composite Risk Score (Retention + Performance Metrics)
Approach: Aligned with retention predictions to eliminate contradictions
Expand Down Expand Up @@ -1331,14 +1330,14 @@ def assign_alert_level(risk_score):
print(" ✗ Database connection failed - used CSV fallback")

# Record counts loaded to database
print(f"\nRecords Loaded to Database:")
print("\nRecords Loaded to Database:")
if db_connected == 1:
print(f" - student_predictions table: {len(df):,} records")
print(f" - course_predictions table: {len(merged_with_predictions):,} records")
print(f" - ml_model_performance table: 4 model records")
print(" - ml_model_performance table: 4 model records")
print(f"\n Total records saved: {len(df) + len(merged_with_predictions) + 4:,}")
else:
print(f" - No records loaded to database (CSV fallback used)")
print(" - No records loaded to database (CSV fallback used)")
print(f" - student_predictions.csv: {len(df):,} records")
print(f" - course_predictions.csv: {len(merged_with_predictions):,} records")

Expand Down
Loading