# Source: Surrender_Analysis/__data3__analyzeImbalanceOfData.py (GitHub: mtkier94/Surrender_Analysis, branch master)
import numpy as np
import pandas as pd
import pickle5 as pickle
import os
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


from functions.sub_surrender_profiles import get_risk_drivers
from global_vars import path_tables, getDataPath

def reviewDataBalance(surrender_profile):
    """Record sample counts and label balance for one surrender profile.

    Reads ``X_train.csv``, ``y_train.csv`` and ``y_test.csv`` from the
    directory returned by ``getDataPath(surrender_profile)``, restricts the
    training features to the profile's risk drivers, resamples the training
    data with random under-sampling (RUS) and SMOTE, and pickles a dict with
    the resulting counts to ``<path_tables>/<surrender_profile>_DataCounts.pkl``.

    The stored dict has the keys ``numb_train``, ``imb_train``, ``numb_rus``,
    ``numb_smote``, ``numb_test`` and ``imb_test``. The ``imb_*`` entries are
    the mean of the labels rounded to 4 decimals (i.e. the share of positive
    labels, assuming the labels are coded 0/1 -- TODO confirm).

    Parameters
    ----------
    surrender_profile
        Identifier of the surrender profile; forwarded to ``getDataPath`` and
        ``get_risk_drivers`` and used as prefix of the output file name.

    Returns
    -------
    None
        The result is written to disk and a status line is printed.
    """

    path_data = getDataPath(surrender_profile)

    # Only the training features and both label vectors are needed for the
    # counts below; the test features and the scaling/beta0 pickles were
    # loaded by an earlier version but never used, so they are not read here.
    X_train = pd.read_csv(os.path.join(path_data, r'X_train.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(path_data, r'y_train.csv'), index_col=0).values.flatten()
    y_test = pd.read_csv(os.path.join(path_data, r'y_test.csv'), index_col=0).values.flatten()

    # restrict data to relevant features -> assume proper exploratory data analysis
    # 'Premium_freq' is represented by the two columns 'Premium_freq_0' and
    # 'Premium_freq_1' in the feature table (presumably a dummy encoding).
    features_profile_lst = []
    for driver in get_risk_drivers(profile=surrender_profile):
        if driver == 'Premium_freq':
            features_profile_lst.extend(['Premium_freq_0', 'Premium_freq_1'])
        else:
            features_profile_lst.append(driver)
    X_train = X_train[features_profile_lst]

    # RUS resampling (only the resampled labels are needed for counting)
    _, y_train_rus = RandomUnderSampler(sampling_strategy='majority').fit_resample(X_train, y_train)
    # SMOTE resampling
    _, y_train_smote = SMOTE().fit_resample(X_train, y_train)

    # analyze balance of data
    # np.round replaces np.round_, which was removed in NumPy 2.0.
    dict_data = {
        'numb_train': len(y_train),
        'imb_train': np.round(y_train.mean(), 4),
        'numb_rus': len(y_train_rus),
        'numb_smote': len(y_train_smote),
        'numb_test': len(y_test),
        'imb_test': np.round(y_test.mean(), 4),
    }

    # save result for this profile
    with open(os.path.join(path_tables, r'{}_DataCounts.pkl'.format(surrender_profile)), 'wb') as f:
        pickle.dump(dict_data, f, pickle.HIGHEST_PROTOCOL)
    print('Profile ', surrender_profile, ' analyzed.')

def createSummaryOfDataBalance(profiles=range(4)):
    """Collect the per-profile data counts and export them as a LaTeX table.

    For every profile in ``profiles`` the file
    ``<path_tables>/<profile>_DataCounts.pkl`` is loaded if it exists;
    profiles without such a file are skipped. The collected dicts are
    combined into one DataFrame (one row per profile), printed, and written
    to ``<path_tables>/Data_review.tex`` via ``DataFrame.to_latex``.

    Parameters
    ----------
    profiles : iterable, optional
        Profile identifiers to include. Defaults to ``range(4)``, i.e. the
        profiles 0-3 that were hard-coded before.

    Returns
    -------
    None
    """

    data = {}
    for profile in profiles:
        counts_file = os.path.join(path_tables, r'{}_DataCounts.pkl'.format(profile))
        # Try to open directly instead of checking os.path.exists first:
        # one path computation, no check-then-open race.
        try:
            f = open(counts_file, 'rb')
        except FileNotFoundError:
            # no counts stored for this profile -> leave it out of the table
            continue
        with f:
            data[profile] = pickle.load(f)

    df_data = pd.DataFrame.from_dict(data, orient='index')
    print(df_data)

    with open(os.path.join(path_tables, r'Data_review.tex'), 'w') as f:
        f.write(df_data.to_latex())


if __name__ == '__main__':

    # Step 1: store the data counts of each of the four surrender profiles (0-3).
    for profile in range(4):
        reviewDataBalance(profile)

    # Step 2: merge the stored counts into one LaTeX table covering all profiles.
    createSummaryOfDataBalance()