devcolor · William-Hill · Feb 18, 2026 · Feb 18, 2026
diff --git a/ai_model/complete_ml_pipeline.py b/ai_model/complete_ml_pipeline.py
@@ -14,11 +14,10 @@
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split, cross_val_score
-from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import (
     accuracy_score, precision_score, recall_score, f1_score, 
-    roc_auc_score, confusion_matrix, classification_report,
-    mean_squared_error, mean_absolute_error, r2_score
+    roc_auc_score, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
 )
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 import xgboost as xgb
@@ -27,17 +26,17 @@
 warnings.filterwarnings('ignore')
 
 # Database utilities
-import sys
-import os
+import sys  # noqa: E402
+import os  # noqa: E402
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from operations.db_utils import (
-    save_dataframe_to_db, 
-    save_model_performance, 
+from operations.db_utils import (  # noqa: E402
+    save_dataframe_to_db,
+    save_model_performance,
     create_model_performance_table,
     test_connection
 )
-from operations.db_config import TABLES, DB_CONFIG
+from operations.db_config import TABLES, DB_CONFIG  # noqa: E402
 
 # Get the project root directory
 PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -178,7 +177,7 @@ def assign_credential_type(row):
 
 df['target_credential_type'] = df.apply(assign_credential_type, axis=1)
 
-print(f"Created target variables:")
+print("Created target variables:")
 print(f"  - Retention: {df['target_retention'].value_counts().to_dict()}")
 print(f"  - At Risk: {df['target_at_risk'].value_counts().to_dict()}")
 print(f"  - Credential Type: {df['target_credential_type'].value_counts().to_dict()}")
@@ -285,8 +284,8 @@ def preprocess_features(df, feature_list):
 print("TESTING MULTIPLE MODELS WITH CROSS-VALIDATION")
 print("-" * 80)
 
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import StratifiedKFold
+from sklearn.linear_model import LogisticRegression  # noqa: E402
+from sklearn.model_selection import StratifiedKFold  # noqa: E402
 
 models_to_test = {
     'Logistic Regression': LogisticRegression(
@@ -349,11 +348,11 @@ def preprocess_features(df, feature_list):
     print(f"  Gap:          {gap:.4f} ({gap*100:.2f}%)")
 
     if gap < 0.05:
-        print(f"  ✓ No overfitting (gap < 5%)")
+        print("  ✓ No overfitting (gap < 5%)")
     elif gap < 0.10:
-        print(f"  ⚠ Minimal overfitting (gap < 10%)")
+        print("  ⚠ Minimal overfitting (gap < 10%)")
     else:
-        print(f"  ✗ Overfitting detected (gap > 10%)")
+        print("  ✗ Overfitting detected (gap > 10%)")
 
     model_comparison.append({
         'Model': model_name,
@@ -413,8 +412,8 @@ def preprocess_features(df, feature_list):
 
 print("\nConfusion Matrix:")
 cm = confusion_matrix(y_test, y_pred)
-print(f"                Predicted")
-print(f"              Not Ret  Retained")
+print("                Predicted")
+print("              Not Ret  Retained")
 print(f"Actual Not    {cm[0,0]:6d}    {cm[0,1]:6d}")
 print(f"       Ret    {cm[1,0]:6d}    {cm[1,1]:6d}")
 
@@ -545,8 +544,8 @@ def assign_alert_level(risk_score):
 low_retention_low_risk = df[(df['retention_probability'] < 0.3) & (df['at_risk_alert'] == 'LOW')]
 print(f"Students with <30% retention flagged as LOW: {len(low_retention_low_risk)} (should be very few)")
 
-print(f"\nEarly warning system aligned with retention predictions")
-print(f"\nAlert distribution:")
+print("\nEarly warning system aligned with retention predictions")
+print("\nAlert distribution:")
 print(df['at_risk_alert'].value_counts().sort_index())
 
 # ============================================================================
@@ -611,7 +610,7 @@ def assign_alert_level(risk_score):
     df['predicted_time_to_credential'] = time_model.predict(X_full_retention)
     df['predicted_graduation_year'] = df['Cohort'].str[:4].astype(float) + df['predicted_time_to_credential']
 
-    print(f"Time predictions generated")
+    print("Time predictions generated")
 else:
     print("Warning: Insufficient data for time-to-credential model")
     df['predicted_time_to_credential'] = np.nan
@@ -630,7 +629,7 @@ def assign_alert_level(risk_score):
 y_credential = y_credential[valid_idx]
 
 print(f"\nDataset size: {len(X_cred):,} students")
-print(f"Credential type distribution:")
+print("Credential type distribution:")
 cred_labels = {0: 'No Credential', 1: 'Certificate', 2: 'Associate', 3: 'Bachelor'}
 for k, v in y_credential.value_counts().sort_index().items():
     print(f"  {cred_labels.get(k, k)}: {v:,} ({v/len(y_credential)*100:.1f}%)")
@@ -677,7 +676,7 @@ def assign_alert_level(risk_score):
         model_name='Credential Type Prediction',
         model_type='classification',
         metrics={'accuracy': cred_accuracy, 'f1': cred_f1},
-        notes=f'Random Forest Classifier - 4 classes (No Credential, Certificate, Associate, Bachelor)'
+        notes='Random Forest Classifier - 4 classes (No Credential, Certificate, Associate, Bachelor)'
     )
 
 # Generate predictions for all students
@@ -699,7 +698,7 @@ def assign_alert_level(risk_score):
     if class_idx < len(prob_labels):
         df[prob_labels[int(class_idx)]] = proba[:, i]
 
-print(f"Credential type predictions generated")
+print("Credential type predictions generated")
 
 # ============================================================================
 # STEP 8: MODEL 5 - GATEWAY MATH SUCCESS PREDICTION
@@ -784,8 +783,8 @@ def assign_alert_level(risk_score):
 
 print("\nConfusion Matrix:")
 cm = confusion_matrix(y_test, y_pred)
-print(f"                Predicted")
-print(f"              No Pass    Pass")
+print("                Predicted")
+print("              No Pass    Pass")
 print(f"Actual No     {cm[0,0]:6d}    {cm[0,1]:6d}")
 print(f"       Pass   {cm[1,0]:6d}    {cm[1,1]:6d}")
 
@@ -795,7 +794,7 @@ def assign_alert_level(risk_score):
         model_name='Gateway Math Success Prediction',
         model_type='classification',
         metrics={'accuracy': math_accuracy, 'auc_roc': math_auc, 'precision': math_precision, 'recall': math_recall, 'f1_score': math_f1},
-        notes=f'XGBoost - Predicts gateway math completion Year 1'
+        notes='XGBoost - Predicts gateway math completion Year 1'
     )
 
 # Generate predictions for all students
@@ -810,7 +809,7 @@ def assign_alert_level(risk_score):
     labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass']
 )
 
-print(f"Gateway math predictions generated")
+print("Gateway math predictions generated")
 
 # ============================================================================
 # STEP 9: MODEL 6 - GATEWAY ENGLISH SUCCESS PREDICTION (NEW!)
@@ -895,8 +894,8 @@ def assign_alert_level(risk_score):
 
 print("\nConfusion Matrix:")
 cm = confusion_matrix(y_test, y_pred)
-print(f"                Predicted")
-print(f"              No Pass    Pass")
+print("                Predicted")
+print("              No Pass    Pass")
 print(f"Actual No     {cm[0,0]:6d}    {cm[0,1]:6d}")
 print(f"       Pass   {cm[1,0]:6d}    {cm[1,1]:6d}")
 
@@ -906,7 +905,7 @@ def assign_alert_level(risk_score):
         model_name='Gateway English Success Prediction',
         model_type='classification',
         metrics={'accuracy': english_accuracy, 'auc_roc': english_auc, 'precision': english_precision, 'recall': english_recall, 'f1_score': english_f1},
-        notes=f'XGBoost - Predicts gateway English completion Year 1'
+        notes='XGBoost - Predicts gateway English completion Year 1'
     )
 
 # Generate predictions for all students
@@ -921,7 +920,7 @@ def assign_alert_level(risk_score):
     labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass']
 )
 
-print(f"Gateway English predictions generated")
+print("Gateway English predictions generated")
 
 # ============================================================================
 # STEP 10: MODEL 7 - FIRST-SEMESTER GPA < 2.0 PREDICTION (NEW! - FIXED DATA LEAKAGE)
@@ -1009,8 +1008,8 @@ def assign_alert_level(risk_score):
 
 print("\nConfusion Matrix:")
 cm = confusion_matrix(y_test, y_pred)
-print(f"                Predicted")
-print(f"              GPA>=2.0  GPA<2.0")
+print("                Predicted")
+print("              GPA>=2.0  GPA<2.0")
 print(f"Actual >=2.0  {cm[0,0]:6d}    {cm[0,1]:6d}")
 print(f"       <2.0   {cm[1,0]:6d}    {cm[1,1]:6d}")
 
@@ -1020,7 +1019,7 @@ def assign_alert_level(risk_score):
         model_name='First-Semester Low GPA Prediction',
         model_type='classification',
         metrics={'accuracy': gpa_accuracy, 'auc_roc': gpa_auc, 'precision': gpa_precision, 'recall': gpa_recall, 'f1_score': gpa_f1},
-        notes=f'XGBoost - Predicts GPA < 2.0 risk (NO DATA LEAKAGE)'
+        notes='XGBoost - Predicts GPA < 2.0 risk (NO DATA LEAKAGE)'
     )
 
 # Generate predictions for all students
@@ -1033,7 +1032,7 @@ def assign_alert_level(risk_score):
     labels=['Low Risk', 'Moderate Risk', 'High Risk', 'Critical Risk']
 )
 
-print(f"Low GPA predictions generated")
+print("Low GPA predictions generated")
 
 # ============================================================================
 # STEP 11: SAVE PREDICTIONS TO STUDENT-LEVEL FILE
@@ -1066,7 +1065,7 @@ def assign_alert_level(risk_score):
         if_exists='replace'
     )
     if success:
-        print(f"✓ Student-level predictions saved to database")
+        print("✓ Student-level predictions saved to database")
         print(f"  Table: {TABLES['student_predictions']}")
         print(f"  Records: {len(df):,}")
         print(f"  Columns: {len(df.columns)}")
@@ -1077,7 +1076,7 @@ def assign_alert_level(risk_score):
 # Always save CSV files for backup and local analysis
 output_file = os.path.join(DATA_DIR, 'kctcs_student_level_with_predictions.csv')
 df.to_csv(output_file, index=False)
-print(f"\n✓ Saved student-level predictions to CSV:")
+print("\n✓ Saved student-level predictions to CSV:")
 print(f"  File: {output_file}")
 print(f"  Records: {len(df):,}")
 print(f"  Columns: {len(df.columns)}")
@@ -1117,15 +1116,15 @@ def assign_alert_level(risk_score):
         if_exists='replace'
     )
     if success:
-        print(f"✓ Course-level predictions saved to database")
+        print("✓ Course-level predictions saved to database")
         print(f"  Table: {TABLES['course_predictions']}")
         print(f"  Records: {len(merged_with_predictions):,}")
         print(f"  Columns: {len(merged_with_predictions.columns)}")
 
 # Always save CSV files for backup and local analysis
 output_file = os.path.join(DATA_DIR, 'kctcs_merged_with_predictions.csv')
 merged_with_predictions.to_csv(output_file, index=False)
-print(f"\n✓ Saved course-level predictions to CSV:")
+print("\n✓ Saved course-level predictions to CSV:")
 print(f"  File: {output_file}")
 print(f"  Records: {len(merged_with_predictions):,}")
 print(f"  Columns: {len(merged_with_predictions.columns)}")
@@ -1165,7 +1164,7 @@ def assign_alert_level(risk_score):
     pct = count / len(df) * 100
     summary_report += f"     {cat:20s} {count:6,} ({pct:5.1f}%)\n"
 
-summary_report += f"""
+summary_report += """
 2. EARLY WARNING SYSTEM
    Algorithm: Composite Risk Score (Retention + Performance Metrics)
    Approach: Aligned with retention predictions to eliminate contradictions
@@ -1331,14 +1330,14 @@ def assign_alert_level(risk_score):
     print("  ✗ Database connection failed - used CSV fallback")
 
 # Record counts loaded to database
-print(f"\nRecords Loaded to Database:")
+print("\nRecords Loaded to Database:")
 if db_connected == 1:
     print(f"  - student_predictions table: {len(df):,} records")
     print(f"  - course_predictions table: {len(merged_with_predictions):,} records")
-    print(f"  - ml_model_performance table: 4 model records")
+    print("  - ml_model_performance table: 4 model records")
     print(f"\n  Total records saved: {len(df) + len(merged_with_predictions) + 4:,}")
 else:
-    print(f"  - No records loaded to database (CSV fallback used)")
+    print("  - No records loaded to database (CSV fallback used)")
     print(f"  - student_predictions.csv: {len(df):,} records")
     print(f"  - course_predictions.csv: {len(merged_with_predictions):,} records")