-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
114 lines (94 loc) · 3.57 KB
/
model.py
File metadata and controls
114 lines (94 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# code/model.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import snowflake.connector
import pandas as pd
import joblib
import logging
import configparser
import os
# Configure logging: every record at INFO or above is written both to the
# pipeline.log file and to a stream handler.
_log_handlers = [
    logging.FileHandler('pipeline.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
# Read the config file that sits next to this module.
config = configparser.ConfigParser()
_config_path = os.path.join(os.path.dirname(__file__), 'config.ini')
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means config.ini was not
# loaded. Fail here with the path instead of a bare KeyError on the section.
if not config.read(_config_path):
    raise FileNotFoundError(
        f"Snowflake config file not found or unreadable: {_config_path}"
    )
snowflake_config = config['SNOWFLAKE']

# Connect to Snowflake using the credentials from the [SNOWFLAKE] section.
# Only the account value has surrounding quote characters stripped; every
# other setting is passed through exactly as written in the file.
conn = snowflake.connector.connect(
    user=snowflake_config['user'],
    password=snowflake_config['password'],
    account=snowflake_config['account'].strip('"\''),
    warehouse=snowflake_config['warehouse'],
    database=snowflake_config['database'],
    schema=snowflake_config['schema'],
    role=snowflake_config['role'],
)
def _load_training_data(connection):
    """Fetch the training columns from the SILVER student table as a DataFrame."""
    return pd.read_sql(
        """
        SELECT
            AGE,
            ACADEMIC_PRESSURE,
            FINANCIAL_STRESS,
            FAMILY_HISTORY,
            SLEEP_DURATION,
            GENDER,
            DEPRESSION
        FROM STUDENT_DEPRESSION_DATASET.PUBLIC.SILVER_STUDENT_DATA
        WHERE AGE IS NOT NULL
          AND ACADEMIC_PRESSURE IS NOT NULL
          AND FINANCIAL_STRESS IS NOT NULL
          AND DEPRESSION IS NOT NULL
        """,
        connection,
    )


def _prepare_features(df):
    """Encode categorical columns, drop incomplete rows and return ``(X, y)``."""
    if df.empty:
        raise ValueError("No valid data returned from the database")

    # Convert the Yes/No column to 1/0 unless it is already numeric. Testing
    # for "not numeric" rather than ``dtype == 'object'`` also covers frames
    # where pandas stores text in a dedicated string dtype; with the old check
    # those columns skipped the mapping and reached the classifier as strings.
    if not pd.api.types.is_numeric_dtype(df['FAMILY_HISTORY']):
        df = df.assign(FAMILY_HISTORY=df['FAMILY_HISTORY'].map({'Yes': 1, 'No': 0}))

    # One-hot encode the remaining categorical variables.
    df = pd.get_dummies(df, columns=['SLEEP_DURATION', 'GENDER'])

    # Values outside the Yes/No mapping become NaN above; drop those rows
    # (and any other incomplete ones) and report how many were lost.
    rows_before = len(df)
    df = df.dropna()
    dropped = rows_before - len(df)
    if dropped:
        logging.warning("Dropped %d rows containing missing values", dropped)
    if df.empty:
        raise ValueError("No valid data remaining after dropping NA values")

    return df.drop('DEPRESSION', axis=1), df['DEPRESSION']


def train_model(model_path='model/depression_model.joblib'):
    """Train a depression prediction model and save it with joblib.

    Reads the training rows through the module-level ``conn``, encodes the
    categorical columns, fits a ``RandomForestClassifier`` on an 80/20
    train/test split, logs the accuracy and classification report for the
    held-out part, and writes the fitted model to ``model_path``.

    Args:
        model_path: Destination file for the serialized model. Its parent
            directory is created when missing. Defaults to the location the
            pipeline has always used.

    Returns:
        The fitted classifier, or ``None`` when any step fails. Failures are
        logged with their traceback rather than raised.
    """
    try:
        logging.info("Starting model training")

        # Ensure the output directory exists before doing any expensive work.
        model_dir = os.path.dirname(model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)

        X, y = _prepare_features(_load_training_data(conn))

        # Fixed random_state keeps the split and the forest reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Evaluate on the held-out rows.
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        logging.info("Model accuracy: %.2f", accuracy)
        logging.info("Classification Report:\n%s", report)

        joblib.dump(model, model_path)
        logging.info("Model saved to %s", model_path)
        return model
    except Exception as e:
        # Top-level boundary of the training step: callers get None instead of
        # an exception, so keep the traceback in the log for diagnosis.
        logging.exception("Model training failed: %s", e)
        return None
if __name__ == "__main__":
    # Release the Snowflake connection even when training is interrupted
    # (e.g. KeyboardInterrupt) or an error escapes train_model().
    try:
        train_model()
    finally:
        conn.close()