Data-Driven-Computer-Architecture-project/code/model.py at main · codexaslam/Data-Driven-Computer-Architecture-project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# code/model.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import snowflake.connector
import pandas as pd
import joblib
import logging
import configparser
import os

# Configure root logging: every record at INFO or above is written both to
# the pipeline.log file and to a stream handler, using one shared format.
_log_handlers = [
    logging.FileHandler('pipeline.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
    level=logging.INFO,
)

# Read the config file that sits next to this module.
config = configparser.ConfigParser()
config_path = os.path.join(os.path.dirname(__file__), 'config.ini')
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means config.ini is missing
# or unreadable. Fail here with a clear message instead of an opaque
# KeyError on the section lookup below.
if not config.read(config_path):
    raise FileNotFoundError(
        f"Configuration file not found or unreadable: {config_path}"
    )
snowflake_config = config['SNOWFLAKE']

# Connect to Snowflake using the credentials from the [SNOWFLAKE] section.
# The connection is opened at import time and shared via the module-level
# `conn` name.
conn = snowflake.connector.connect(
    user=snowflake_config['user'],
    password=snowflake_config['password'],
    # Remove any quote characters wrapped around the account identifier.
    account=snowflake_config['account'].strip('"\''),
    warehouse=snowflake_config['warehouse'],
    database=snowflake_config['database'],
    schema=snowflake_config['schema'],
    role=snowflake_config['role']
)

def _prepare_features(df):
    """Turn the raw query result into a feature matrix and a target vector.

    Args:
        df: DataFrame with the columns selected by ``train_model``'s query
            (AGE, ACADEMIC_PRESSURE, FINANCIAL_STRESS, FAMILY_HISTORY,
            SLEEP_DURATION, GENDER, DEPRESSION).

    Returns:
        A ``(X, y)`` tuple: ``X`` is every column except DEPRESSION after
        encoding, ``y`` is the DEPRESSION column.

    Raises:
        ValueError: If no rows remain after rows with missing values are
            dropped.
    """
    # Map Yes/No to 1/0 whenever the column is not already numeric. Checking
    # for "not numeric" rather than dtype == 'object' also covers pandas'
    # dedicated string dtypes, which would otherwise skip the mapping and
    # leave raw strings in the feature matrix.
    family_history = df['FAMILY_HISTORY']
    if not pd.api.types.is_numeric_dtype(family_history):
        df = df.assign(
            FAMILY_HISTORY=family_history.map({'Yes': 1, 'No': 0})
        )

    # One-hot encode the remaining categorical variables
    df = pd.get_dummies(df, columns=['SLEEP_DURATION', 'GENDER'])

    # Values the Yes/No mapping did not recognise become NaN; drop those rows
    # (and any other incomplete ones) and report how many were lost so the
    # data loss is visible in the log.
    rows_before = len(df)
    df = df.dropna()
    dropped = rows_before - len(df)
    if dropped:
        logging.warning("Dropped %d row(s) containing missing values", dropped)
    if df.empty:
        raise ValueError("No valid data remaining after dropping NA values")

    return df.drop('DEPRESSION', axis=1), df['DEPRESSION']


def train_model():
    """Train a depression prediction model.

    Reads the training columns from the SILVER_STUDENT_DATA table through the
    module-level ``conn``, fits a ``RandomForestClassifier`` on an 80/20
    train/test split, logs the accuracy and classification report for the
    held-out part, and saves the fitted model to
    ``model/depression_model.joblib`` (the ``model`` directory is created
    relative to the current working directory if needed).

    Returns:
        The fitted ``RandomForestClassifier``, or ``None`` if any step fails.
        Failures are logged with their traceback rather than raised.
    """
    try:
        logging.info("Starting model training")

        # Ensure model directory exists
        os.makedirs("model", exist_ok=True)

        # Fetch data - using SILVER table which has cleaned data
        df = pd.read_sql("""
            SELECT
                AGE,
                ACADEMIC_PRESSURE,
                FINANCIAL_STRESS,
                FAMILY_HISTORY,
                SLEEP_DURATION,
                GENDER,
                DEPRESSION
            FROM STUDENT_DEPRESSION_DATASET.PUBLIC.SILVER_STUDENT_DATA
            WHERE AGE IS NOT NULL
              AND ACADEMIC_PRESSURE IS NOT NULL
              AND FINANCIAL_STRESS IS NOT NULL
              AND DEPRESSION IS NOT NULL
        """, conn)

        # Check if we got any data
        if df.empty:
            raise ValueError("No valid data returned from the database")

        # Encode categoricals, drop incomplete rows, split features/target
        X, y = _prepare_features(df)

        # Split data; the fixed random_state keeps the split reproducible
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train model
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Evaluate model on the held-out split
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        logging.info("Model accuracy: %.2f", accuracy)
        logging.info("Classification Report:\n%s", report)

        # Save model
        joblib.dump(model, 'model/depression_model.joblib')
        logging.info("Model saved to model/depression_model.joblib")

        return model
    except Exception as e:
        # Top-level boundary for the training run: log the failure together
        # with its traceback and signal it to the caller by returning None.
        logging.exception("Model training failed: %s", e)
        return None


if __name__ == "__main__":
    # train_model() handles ordinary exceptions itself, but an interruption
    # such as KeyboardInterrupt would still propagate; the finally clause
    # makes sure the Snowflake connection is closed in that case too.
    try:
        train_model()
    finally:
        conn.close()