-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTrain.py
More file actions
89 lines (69 loc) · 3.38 KB
/
Train.py
File metadata and controls
89 lines (69 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# This script:
# -Train a single BDT from given hyperparameters
# -Plot roc curve, compute auc
# -Saves the model for further use
# -Kfolding (not finished)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split # To use method with same name
from sklearn.metrics import roc_curve, auc # Roc: receiver operating characteristic, auc: Area under the ROC Curve
from sklearn.model_selection import KFold # K-fold data selection for training
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import joblib
# Location of the pre-processed feature CSVs.
pathCSV = '/home/mjacquar/TP4b/csv'

# Load the full feature table; the 'sig' column holds the signal/background label.
X = pd.read_csv(f'{pathCSV}/X.csv')
y = X['sig']
print("csv loaded")

# 70/30 train/test split with a fixed seed so the split is reproducible.
# (random_state: seed for the random assignment of rows to each split; done with KFold below.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, train_size=0.7, random_state=0)
# ---------/!\ OLD feature set, kept for reference ---------
#   B_s0_TAU             B lifetime
#   MIN_IPCHI2_emu       minimum of the two impact-parameter chi2, large for signal (secondary vertex)
#   B_s0_IP_OWNPV        impact parameter
#   SUM_isolation_emu    isolation: we want nothing in the cone around e or mu
#   B_s0_PT              transverse momentum of B0
#   LOG1_cosDIRA         log(1-cos(DIRA)), DIRA = angle between the emu sum and the reconstructed B0
#   B_s0_CDFiso          different measure of isolation (mathematical def in the analysis document)
#   MAX_PT_emu           B0 has high mass -> high p_T for daughter particles
#   B_s0_ENDVERTEX_CHI2  quality of the reconstructed decay vertex
#   DIFF_ETA_emu         absolute difference in pseudorapidity of emu

# Current input features used to train the BDT.
features = [
    'B_s0_ENDVERTEX_CHI2',
    'B_s0_CDFiso',
    'eplus_ProbNNe',
    'eplus_ETA',
    'B_s0_IPCHI2_OWNPV',
    'B_s0_minPT',
    'muminus_ProbNNmuk',
    'MIN_IPCHI2_emu',
    'SUM_isolation_emu',
    'LOG1_cosDIRA',
]
# Boosted decision tree: shallow trees (depth 3) as weak learners, combined
# with AdaBoost (SAMME.R boosts on class probabilities; learning_rate=0.1
# shrinks each boosting step).
dt = DecisionTreeClassifier(max_depth=3)  # Define the decision tree (weak learner)
model = AdaBoostClassifier(dt, algorithm='SAMME.R', n_estimators=50, learning_rate=0.1)  # Define the model using the decision tree
print("Model defined")

# Train on the selected feature columns only; labels come from the 'sig' column.
model.fit(X_train[features], y_train)
print("model trained")

# Persist the trained model for later reuse.
# https://medium.com/@harsz89/persist-reuse-trained-machine-learning-models-using-joblib-or-pickle-in-python-76f7e4fd707
pathModel = '/home/mjacquar/TP4b/model'
joblib.dump(model, f'{pathModel}/bdt_model.pkl')  # Save trained model for later
print("Model saved")

# K-folding (cross-validation, not finished):
# NOTE(review): the original draft never incremented kNumber (`kNumber+1` is a
# no-op, so every fold would overwrite bdt_model_k0.pkl) and indexed the
# DataFrame with X[train], which treats the positional indices as column
# labels and raises KeyError. Fixed with enumerate() and .iloc below.
# cv = KFold(n_splits=5, shuffle=True, random_state=0)  # K-fold is a Cross-validation method
# for kNumber, (train, test) in enumerate(cv.split(X)):  # Yields (train_index_array, test_index_array)
#     model.fit(X.iloc[train][features], y.iloc[train])  # .iloc: positional row selection
#     print(f"model {kNumber} trained")
#     joblib.dump(model, f'{pathModel}/bdt_model_k{kNumber}.pkl')  # Save each fold's model for later
# Use the trained model to predict the test sample: column 1 of
# predict_proba is the probability of class 1 (signal).
y_pred = model.predict_proba(X_test[features])[:, 1]
print("Predictions done")

# ROC curve: false/true positive rates as the decision threshold varies,
# computed from the true labels and the predicted signal probabilities.
fpr, tpr, threshold = roc_curve(y_test, y_pred)

# FIX: the original did `auc = auc(fpr, tpr)`, rebinding the imported
# sklearn.metrics.auc function to a float — any later call to auc() (e.g. in
# the K-fold loop) would crash with "TypeError: 'float' object is not callable".
auc_score = auc(fpr, tpr)  # Area under the ROC curve
print(f'Auc={auc_score}')

# Plot the result: TPR vs 1-FPR (signal efficiency vs background rejection).
plt.figure(figsize=(8, 8), dpi=300)
plt.plot(tpr, 1 - fpr, linestyle='-', label=f'Auc={auc_score}')
plt.xlabel('True positive rate')
plt.ylabel('1-False positive rate')
plt.legend()
plt.savefig('plots/roc5.pdf')
plt.close()