diff --git a/ai_model/complete_ml_pipeline.py b/ai_model/complete_ml_pipeline.py index da63bf5..c23fe99 100644 --- a/ai_model/complete_ml_pipeline.py +++ b/ai_model/complete_ml_pipeline.py @@ -14,11 +14,10 @@ import pandas as pd import numpy as np from sklearn.model_selection import train_test_split, cross_val_score -from sklearn.preprocessing import LabelEncoder, StandardScaler +from sklearn.preprocessing import LabelEncoder from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score, - roc_auc_score, confusion_matrix, classification_report, - mean_squared_error, mean_absolute_error, r2_score + roc_auc_score, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score ) from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import xgboost as xgb @@ -27,17 +26,17 @@ warnings.filterwarnings('ignore') # Database utilities -import sys -import os +import sys # noqa: E402 +import os # noqa: E402 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from operations.db_utils import ( - save_dataframe_to_db, - save_model_performance, +from operations.db_utils import ( # noqa: E402 + save_dataframe_to_db, + save_model_performance, create_model_performance_table, test_connection ) -from operations.db_config import TABLES, DB_CONFIG +from operations.db_config import TABLES, DB_CONFIG # noqa: E402 # Get the project root directory PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -178,7 +177,7 @@ def assign_credential_type(row): df['target_credential_type'] = df.apply(assign_credential_type, axis=1) -print(f"Created target variables:") +print("Created target variables:") print(f" - Retention: {df['target_retention'].value_counts().to_dict()}") print(f" - At Risk: {df['target_at_risk'].value_counts().to_dict()}") print(f" - Credential Type: {df['target_credential_type'].value_counts().to_dict()}") @@ -285,8 +284,8 @@ def preprocess_features(df, feature_list): print("TESTING MULTIPLE MODELS WITH CROSS-VALIDATION") print("-" * 80) -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import StratifiedKFold +from sklearn.linear_model import LogisticRegression # noqa: E402 +from sklearn.model_selection import StratifiedKFold # noqa: E402 models_to_test = { 'Logistic Regression': LogisticRegression( @@ -349,11 +348,11 @@ def preprocess_features(df, feature_list): print(f" Gap: {gap:.4f} ({gap*100:.2f}%)") if gap < 0.05: - print(f" ✓ No overfitting (gap < 5%)") + print(" ✓ No overfitting (gap < 5%)") elif gap < 0.10: - print(f" ⚠ Minimal overfitting (gap < 10%)") + print(" ⚠ Minimal overfitting (gap < 10%)") else: - print(f" ✗ Overfitting detected (gap > 10%)") + print(" ✗ Overfitting detected (gap > 10%)") model_comparison.append({ 'Model': model_name, @@ -413,8 +412,8 @@ def preprocess_features(df, feature_list): print("\nConfusion Matrix:") cm = confusion_matrix(y_test, y_pred) -print(f" Predicted") -print(f" Not Ret Retained") +print(" Predicted") +print(" Not Ret Retained") print(f"Actual Not {cm[0,0]:6d} {cm[0,1]:6d}") print(f" Ret {cm[1,0]:6d} {cm[1,1]:6d}") @@ -545,8 +544,8 @@ def assign_alert_level(risk_score): low_retention_low_risk = df[(df['retention_probability'] < 0.3) & (df['at_risk_alert'] == 'LOW')] print(f"Students with <30% retention flagged as LOW: {len(low_retention_low_risk)} (should be very few)") -print(f"\nEarly warning system aligned with retention predictions") -print(f"\nAlert distribution:") +print("\nEarly warning system aligned with retention predictions") +print("\nAlert distribution:") print(df['at_risk_alert'].value_counts().sort_index()) # ============================================================================ @@ -611,7 +610,7 @@ def assign_alert_level(risk_score): df['predicted_time_to_credential'] = time_model.predict(X_full_retention) df['predicted_graduation_year'] = df['Cohort'].str[:4].astype(float) + df['predicted_time_to_credential'] - print(f"Time predictions generated") + print("Time predictions generated") else: print("Warning: Insufficient data for time-to-credential model") df['predicted_time_to_credential'] = np.nan @@ -630,7 +629,7 @@ def assign_alert_level(risk_score): y_credential = y_credential[valid_idx] print(f"\nDataset size: {len(X_cred):,} students") -print(f"Credential type distribution:") +print("Credential type distribution:") cred_labels = {0: 'No Credential', 1: 'Certificate', 2: 'Associate', 3: 'Bachelor'} for k, v in y_credential.value_counts().sort_index().items(): print(f" {cred_labels.get(k, k)}: {v:,} ({v/len(y_credential)*100:.1f}%)") @@ -677,7 +676,7 @@ def assign_alert_level(risk_score): model_name='Credential Type Prediction', model_type='classification', metrics={'accuracy': cred_accuracy, 'f1': cred_f1}, - notes=f'Random Forest Classifier - 4 classes (No Credential, Certificate, Associate, Bachelor)' + notes='Random Forest Classifier - 4 classes (No Credential, Certificate, Associate, Bachelor)' ) # Generate predictions for all students @@ -699,7 +698,7 @@ def assign_alert_level(risk_score): if class_idx < len(prob_labels): df[prob_labels[int(class_idx)]] = proba[:, i] -print(f"Credential type predictions generated") +print("Credential type predictions generated") # ============================================================================ # STEP 8: MODEL 5 - GATEWAY MATH SUCCESS PREDICTION @@ -784,8 +783,8 @@ def assign_alert_level(risk_score): print("\nConfusion Matrix:") cm = confusion_matrix(y_test, y_pred) -print(f" Predicted") -print(f" No Pass Pass") +print(" Predicted") +print(" No Pass Pass") print(f"Actual No {cm[0,0]:6d} {cm[0,1]:6d}") print(f" Pass {cm[1,0]:6d} {cm[1,1]:6d}") @@ -795,7 +794,7 @@ def assign_alert_level(risk_score): model_name='Gateway Math Success Prediction', model_type='classification', metrics={'accuracy': math_accuracy, 'auc_roc': math_auc, 'precision': math_precision, 'recall': math_recall, 'f1_score': math_f1}, - notes=f'XGBoost - Predicts gateway math completion Year 1' + notes='XGBoost - Predicts gateway math completion Year 1' ) # Generate predictions for all students @@ -810,7 +809,7 @@ def assign_alert_level(risk_score): labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass'] ) -print(f"Gateway math predictions generated") +print("Gateway math predictions generated") # ============================================================================ # STEP 9: MODEL 6 - GATEWAY ENGLISH SUCCESS PREDICTION (NEW!) @@ -895,8 +894,8 @@ def assign_alert_level(risk_score): print("\nConfusion Matrix:") cm = confusion_matrix(y_test, y_pred) -print(f" Predicted") -print(f" No Pass Pass") +print(" Predicted") +print(" No Pass Pass") print(f"Actual No {cm[0,0]:6d} {cm[0,1]:6d}") print(f" Pass {cm[1,0]:6d} {cm[1,1]:6d}") @@ -906,7 +905,7 @@ def assign_alert_level(risk_score): model_name='Gateway English Success Prediction', model_type='classification', metrics={'accuracy': english_accuracy, 'auc_roc': english_auc, 'precision': english_precision, 'recall': english_recall, 'f1_score': english_f1}, - notes=f'XGBoost - Predicts gateway English completion Year 1' + notes='XGBoost - Predicts gateway English completion Year 1' ) # Generate predictions for all students @@ -921,7 +920,7 @@ def assign_alert_level(risk_score): labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass'] ) -print(f"Gateway English predictions generated") +print("Gateway English predictions generated") # ============================================================================ # STEP 10: MODEL 7 - FIRST-SEMESTER GPA < 2.0 PREDICTION (NEW! - FIXED DATA LEAKAGE) @@ -1009,8 +1008,8 @@ def assign_alert_level(risk_score): print("\nConfusion Matrix:") cm = confusion_matrix(y_test, y_pred) -print(f" Predicted") -print(f" GPA>=2.0 GPA<2.0") +print(" Predicted") +print(" GPA>=2.0 GPA<2.0") print(f"Actual >=2.0 {cm[0,0]:6d} {cm[0,1]:6d}") print(f" <2.0 {cm[1,0]:6d} {cm[1,1]:6d}") @@ -1020,7 +1019,7 @@ def assign_alert_level(risk_score): model_name='First-Semester Low GPA Prediction', model_type='classification', metrics={'accuracy': gpa_accuracy, 'auc_roc': gpa_auc, 'precision': gpa_precision, 'recall': gpa_recall, 'f1_score': gpa_f1}, - notes=f'XGBoost - Predicts GPA < 2.0 risk (NO DATA LEAKAGE)' + notes='XGBoost - Predicts GPA < 2.0 risk (NO DATA LEAKAGE)' ) # Generate predictions for all students @@ -1033,7 +1032,7 @@ def assign_alert_level(risk_score): labels=['Low Risk', 'Moderate Risk', 'High Risk', 'Critical Risk'] ) -print(f"Low GPA predictions generated") +print("Low GPA predictions generated") # ============================================================================ # STEP 11: SAVE PREDICTIONS TO STUDENT-LEVEL FILE @@ -1066,7 +1065,7 @@ def assign_alert_level(risk_score): if_exists='replace' ) if success: - print(f"✓ Student-level predictions saved to database") + print("✓ Student-level predictions saved to database") print(f" Table: {TABLES['student_predictions']}") print(f" Records: {len(df):,}") print(f" Columns: {len(df.columns)}") @@ -1077,7 +1076,7 @@ def assign_alert_level(risk_score): # Always save CSV files for backup and local analysis output_file = os.path.join(DATA_DIR, 'kctcs_student_level_with_predictions.csv') df.to_csv(output_file, index=False) -print(f"\n✓ Saved student-level predictions to CSV:") +print("\n✓ Saved student-level predictions to CSV:") print(f" File: {output_file}") print(f" Records: {len(df):,}") print(f" Columns: {len(df.columns)}") @@ -1117,7 +1116,7 @@ def assign_alert_level(risk_score): if_exists='replace' ) if success: - print(f"✓ Course-level predictions saved to database") + print("✓ Course-level predictions saved to database") print(f" Table: {TABLES['course_predictions']}") print(f" Records: {len(merged_with_predictions):,}") print(f" Columns: {len(merged_with_predictions.columns)}") @@ -1125,7 +1124,7 @@ def assign_alert_level(risk_score): # Always save CSV files for backup and local analysis output_file = os.path.join(DATA_DIR, 'kctcs_merged_with_predictions.csv') merged_with_predictions.to_csv(output_file, index=False) -print(f"\n✓ Saved course-level predictions to CSV:") +print("\n✓ Saved course-level predictions to CSV:") print(f" File: {output_file}") print(f" Records: {len(merged_with_predictions):,}") print(f" Columns: {len(merged_with_predictions.columns)}") @@ -1165,7 +1164,7 @@ def assign_alert_level(risk_score): pct = count / len(df) * 100 summary_report += f" {cat:20s} {count:6,} ({pct:5.1f}%)\n" -summary_report += f""" +summary_report += """ 2. EARLY WARNING SYSTEM Algorithm: Composite Risk Score (Retention + Performance Metrics) Approach: Aligned with retention predictions to eliminate contradictions @@ -1331,14 +1330,14 @@ def assign_alert_level(risk_score): print(" ✗ Database connection failed - used CSV fallback") # Record counts loaded to database -print(f"\nRecords Loaded to Database:") +print("\nRecords Loaded to Database:") if db_connected == 1: print(f" - student_predictions table: {len(df):,} records") print(f" - course_predictions table: {len(merged_with_predictions):,} records") - print(f" - ml_model_performance table: 4 model records") + print(" - ml_model_performance table: 4 model records") print(f"\n Total records saved: {len(df) + len(merged_with_predictions) + 4:,}") else: - print(f" - No records loaded to database (CSV fallback used)") + print(" - No records loaded to database (CSV fallback used)") print(f" - student_predictions.csv: {len(df):,} records") print(f" - course_predictions.csv: {len(merged_with_predictions):,} records") diff --git a/ai_model/complete_ml_pipeline_csv_only.py b/ai_model/complete_ml_pipeline_csv_only.py index 5e5609b..8321304 100644 --- a/ai_model/complete_ml_pipeline_csv_only.py +++ b/ai_model/complete_ml_pipeline_csv_only.py @@ -17,11 +17,10 @@ import pandas as pd import numpy as np from sklearn.model_selection import train_test_split, cross_val_score -from sklearn.preprocessing import LabelEncoder, StandardScaler +from sklearn.preprocessing import LabelEncoder from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score, - roc_auc_score, confusion_matrix, classification_report, - mean_squared_error, mean_absolute_error, r2_score + roc_auc_score, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score ) from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import xgboost as xgb @@ -157,7 +156,7 @@ def assign_credential_type(row): df['target_credential_type'] = df.apply(assign_credential_type, axis=1) -print(f"Created target variables:") +print("Created target variables:") print(f" - Retention: {df['target_retention'].value_counts().to_dict()}") print(f" - At Risk: {df['target_at_risk'].value_counts().to_dict()}") print(f" - Credential Type: {df['target_credential_type'].value_counts().to_dict()}") @@ -264,8 +263,8 @@ def preprocess_features(df, feature_list): print("TESTING MULTIPLE MODELS WITH CROSS-VALIDATION") print("-" * 80) -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import StratifiedKFold +from sklearn.linear_model import LogisticRegression # noqa: E402 +from sklearn.model_selection import StratifiedKFold # noqa: E402 models_to_test = { 'Logistic Regression': LogisticRegression( @@ -328,11 +327,11 @@ def preprocess_features(df, feature_list): print(f" Gap: {gap:.4f} ({gap*100:.2f}%)") if gap < 0.05: - print(f" ✓ No overfitting (gap < 5%)") + print(" ✓ No overfitting (gap < 5%)") elif gap < 0.10: - print(f" ⚠ Minimal overfitting (gap < 10%)") + print(" ⚠ Minimal overfitting (gap < 10%)") else: - print(f" ✗ Overfitting detected (gap > 10%)") + print(" ✗ Overfitting detected (gap > 10%)") model_comparison.append({ 'Model': model_name, @@ -392,8 +391,8 @@ def preprocess_features(df, feature_list): print("\nConfusion Matrix:") cm = confusion_matrix(y_test, y_pred) -print(f" Predicted") -print(f" Not Ret Retained") +print(" Predicted") +print(" Not Ret Retained") print(f"Actual Not {cm[0,0]:6d} {cm[0,1]:6d}") print(f" Ret {cm[1,0]:6d} {cm[1,1]:6d}") @@ -515,8 +514,8 @@ def assign_alert_level(risk_score): low_retention_low_risk = df[(df['retention_probability'] < 0.3) & (df['at_risk_alert'] == 'LOW')] print(f"Students with <30% retention flagged as LOW: {len(low_retention_low_risk)} (should be very few)") -print(f"\nEarly warning system aligned with retention predictions") -print(f"\nAlert distribution:") +print("\nEarly warning system aligned with retention predictions") +print("\nAlert distribution:") print(df['at_risk_alert'].value_counts().sort_index()) # ============================================================================ @@ -572,7 +571,7 @@ def assign_alert_level(risk_score): df['predicted_time_to_credential'] = time_model.predict(X_full_retention) df['predicted_graduation_year'] = df['Cohort'].str[:4].astype(float) + df['predicted_time_to_credential'] - print(f"Time predictions generated") + print("Time predictions generated") else: print("Warning: Insufficient data for time-to-credential model") df['predicted_time_to_credential'] = np.nan @@ -591,7 +590,7 @@ def assign_alert_level(risk_score): y_credential = y_credential[valid_idx] print(f"\nDataset size: {len(X_cred):,} students") -print(f"Credential type distribution:") +print("Credential type distribution:") cred_labels = {0: 'No Credential', 1: 'Certificate', 2: 'Associate', 3: 'Bachelor'} for k, v in y_credential.value_counts().sort_index().items(): print(f" {cred_labels.get(k, k)}: {v:,} ({v/len(y_credential)*100:.1f}%)") @@ -651,7 +650,7 @@ def assign_alert_level(risk_score): if class_idx < len(prob_labels): df[prob_labels[int(class_idx)]] = proba[:, i] -print(f"Credential type predictions generated") +print("Credential type predictions generated") # ============================================================================ # STEP 8: MODEL 5 - GATEWAY MATH SUCCESS PREDICTION @@ -736,8 +735,8 @@ def assign_alert_level(risk_score): print("\nConfusion Matrix:") cm = confusion_matrix(y_test, y_pred) -print(f" Predicted") -print(f" No Pass Pass") +print(" Predicted") +print(" No Pass Pass") print(f"Actual No {cm[0,0]:6d} {cm[0,1]:6d}") print(f" Pass {cm[1,0]:6d} {cm[1,1]:6d}") @@ -753,7 +752,7 @@ def assign_alert_level(risk_score): labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass'] ) -print(f"Gateway math predictions generated") +print("Gateway math predictions generated") # ============================================================================ # STEP 9: MODEL 6 - GATEWAY ENGLISH SUCCESS PREDICTION @@ -838,8 +837,8 @@ def assign_alert_level(risk_score): print("\nConfusion Matrix:") cm = confusion_matrix(y_test, y_pred) -print(f" Predicted") -print(f" No Pass Pass") +print(" Predicted") +print(" No Pass Pass") print(f"Actual No {cm[0,0]:6d} {cm[0,1]:6d}") print(f" Pass {cm[1,0]:6d} {cm[1,1]:6d}") @@ -855,7 +854,7 @@ def assign_alert_level(risk_score): labels=['High Risk', 'Moderate Risk', 'Likely Pass', 'Very Likely Pass'] ) -print(f"Gateway English predictions generated") +print("Gateway English predictions generated") # ============================================================================ # STEP 10: MODEL 7 - FIRST-SEMESTER GPA < 2.0 PREDICTION @@ -943,8 +942,8 @@ def assign_alert_level(risk_score): print("\nConfusion Matrix:") cm = confusion_matrix(y_test, y_pred) -print(f" Predicted") -print(f" GPA>=2.0 GPA<2.0") +print(" Predicted") +print(" GPA>=2.0 GPA<2.0") print(f"Actual >=2.0 {cm[0,0]:6d} {cm[0,1]:6d}") print(f" <2.0 {cm[1,0]:6d} {cm[1,1]:6d}") @@ -958,7 +957,7 @@ def assign_alert_level(risk_score): labels=['Low Risk', 'Moderate Risk', 'High Risk', 'Critical Risk'] ) -print(f"Low GPA predictions generated") +print("Low GPA predictions generated") # ============================================================================ # STEP 11: MODEL 8 - GPA PREDICTION (CONTINUOUS) @@ -1027,9 +1026,9 @@ def assign_alert_level(risk_score): axis=1 ) -print(f"GPA predictions generated") +print("GPA predictions generated") print(f"Mean predicted GPA: {df['predicted_gpa'].mean():.2f}") -print(f"\nPerformance Distribution:") +print("\nPerformance Distribution:") print(df['gpa_performance'].value_counts().to_string()) # ============================================================================ @@ -1042,7 +1041,7 @@ def assign_alert_level(risk_score): # Save student-level predictions with all columns output_file = os.path.join(DATA_DIR, 'kctcs_student_level_with_predictions.csv') df.to_csv(output_file, index=False) -print(f"\n✓ Saved student-level predictions to CSV:") +print("\n✓ Saved student-level predictions to CSV:") print(f" File: {output_file}") print(f" Records: {len(df):,}") print(f" Columns: {len(df.columns)}") @@ -1093,7 +1092,7 @@ def assign_alert_level(risk_score): # Save course-level predictions output_file = os.path.join(DATA_DIR, 'kctcs_merged_with_predictions.csv') merged_with_predictions.to_csv(output_file, index=False) -print(f"\n✓ Saved course-level predictions to CSV:") +print("\n✓ Saved course-level predictions to CSV:") print(f" File: {output_file}") print(f" Records: {len(merged_with_predictions):,}") print(f" Columns: {len(merged_with_predictions.columns)}") @@ -1133,7 +1132,7 @@ def assign_alert_level(risk_score): pct = count / len(df) * 100 summary_report += f" {cat:20s} {count:6,} ({pct:5.1f}%)\n" -summary_report += f""" +summary_report += """ 2. EARLY WARNING SYSTEM Algorithm: Composite Risk Score (Retention + Performance Metrics) Approach: Aligned with retention predictions to eliminate contradictions diff --git a/ai_model/merge_kctcs_data.py b/ai_model/merge_kctcs_data.py index 935c853..d81fd80 100644 --- a/ai_model/merge_kctcs_data.py +++ b/ai_model/merge_kctcs_data.py @@ -1,5 +1,4 @@ import pandas as pd -from datetime import datetime print("=" * 80) print("MERGING KCTCS DATA FILES") @@ -98,12 +97,12 @@ students_with_courses = final_merged[final_merged['course_id'].notna()]['Student_GUID'].nunique() students_without_courses = final_merged[final_merged['course_id'].isna()]['Student_GUID'].nunique() -print(f"\nData breakdown:") +print("\nData breakdown:") print(f" - Students with course records: {students_with_courses:,}") print(f" - Students without course records: {students_without_courses:,}") print(f" - Average courses per student (for those with courses): {len(final_merged[final_merged['course_id'].notna()]) / students_with_courses:.1f}") -print(f"\nColumn categories:") +print("\nColumn categories:") cohort_cols = [col for col in final_merged.columns if not col.startswith('ar_') and not col.startswith('course_') and col not in ['ar_id', 'course_id']] ar_cols = [col for col in final_merged.columns if col.startswith('ar_') or col == 'ar_id'] course_cols = [col for col in final_merged.columns if col.startswith('course_') or col == 'course_id'] @@ -112,7 +111,7 @@ print(f" - AR columns: {len(ar_cols)}") print(f" - Course columns: {len(course_cols)}") -print(f"\nFirst few column names:") +print("\nFirst few column names:") print(f" First 10 columns: {list(final_merged.columns[:10])}") print("\n" + "=" * 80) diff --git a/operations/db_utils.py b/operations/db_utils.py index b1e77de..9552d34 100644 --- a/operations/db_utils.py +++ b/operations/db_utils.py @@ -4,10 +4,9 @@ Helper functions for database operations """ -import pandas as pd import pymysql from sqlalchemy import create_engine, text -from .db_config import DB_CONFIG, TABLES +from .db_config import DB_CONFIG import warnings warnings.filterwarnings('ignore') @@ -39,7 +38,7 @@ def get_sqlalchemy_engine(): f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}" ) engine = create_engine(connection_string, pool_pre_ping=True) - print(f"✓ SQLAlchemy engine created") + print("✓ SQLAlchemy engine created") return engine except Exception as e: print(f"✗ Engine creation failed: {e}") @@ -145,7 +144,7 @@ def create_model_performance_table(): cursor.execute(create_table_sql) connection.commit() - print(f"✓ Model performance table created/verified") + print("✓ Model performance table created/verified") cursor.close() connection.close()