-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentiment_model_training.py
More file actions
124 lines (95 loc) · 4.78 KB
/
sentiment_model_training.py
File metadata and controls
124 lines (95 loc) · 4.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Sentiment classifier using a Multinomial Naive Bayes classifier
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, roc_auc_score, accuracy_score
import pickle
import re
import string
import configparser
# Shared WordNet lemmatizer instance; used by lemmaStemma() and by the
# comment-normalisation step in main().
lemmatizer = WordNetLemmatizer()
# Porter stemmer instance. NOTE(review): not referenced by any code visible
# in this file -- confirm whether it is still needed.
port = PorterStemmer()
#############Lemmatize/Stem######################
def lemmaStemma(text):
    """Return the lemma of ``text`` as produced by the shared lemmatizer.

    The module-level ``lemmatizer`` is called with no part-of-speech
    argument, so its default applies. Despite the function name, no
    stemming is performed here.

    Args:
        text: The token to lemmatize.

    Returns:
        Whatever ``lemmatizer.lemmatize(text)`` returns for the token.
    """
    lemma = lemmatizer.lemmatize(text)
    return lemma
#######################Model Evaluation##############
def evaluation(predictions, predictions_prob, test_labels):
    """Print MAE, accuracy and the confusion matrix for a set of predictions.

    Args:
        predictions: Predicted class labels (array-like of numeric labels).
        predictions_prob: Predicted class probabilities. Accepted so existing
            callers keep working; no metric computed here uses it.
        test_labels: Ground-truth labels, same length and order as
            ``predictions``.

    Returns:
        None. All metrics are written to stdout.
    """
    # Coerce both inputs to plain arrays so the subtraction below is strictly
    # positional: Python lists would raise TypeError, and two pandas Series
    # would be aligned on their index instead of their position.
    predicted = np.asarray(predictions)
    actual = np.asarray(test_labels)

    errors = np.abs(predicted - actual)
    print('Mean Absolute Error(MAE):', round(np.mean(errors), 2))
    print('Accuracy:', round(accuracy_score(actual, predicted) * 100, 2), '%')
    print("****Confusion Matrix****")
    print(confusion_matrix(actual, predicted))
def main():
    """Train, evaluate and persist the sentiment classifier.

    Reads settings from ``config.ini`` in the current working directory,
    loads the training CSV, normalises the ``comment`` column, fits a
    ``CountVectorizer`` and a ``MultinomialNB`` model, prints evaluation
    metrics for a stratified hold-out split, and pickles both the fitted
    vectorizer and the fitted model to the configured paths.

    Raises:
        FileNotFoundError: If ``config.ini`` cannot be read.
        KeyError: If a required section or option is missing from the config.
    """
    # Read parameters from ini file
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it parsed; fail loudly here instead of with a confusing
    # KeyError on the first section lookup.
    if not config.read('config.ini'):
        raise FileNotFoundError("Configuration file 'config.ini' was not found or is unreadable")

    # Parameter defining
    sentiment_training_data_path = config['TRAINING_PATH']['sentiment_training_data_path']
    sentiment_vector = config['MODEL']['sentiment_vector']
    sentiment_classifier = config['MODEL']['sentiment_classifier']
    classification_test_size = config['PARAMETERS']['classification_test_size']

    df = pd.read_csv(sentiment_training_data_path, header=0, encoding='unicode_escape').dropna()

    # Compile the clean-up patterns once instead of on every row.
    non_word_run = re.compile(r'\W+')
    control_or_high_char = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]')

    def _normalise(comment):
        """Collapse non-word runs, blank out control/high chars, lemmatize each token."""
        comment = non_word_run.sub(' ', comment)
        comment = control_or_high_char.sub(' ', comment)
        return ' '.join(lemmatizer.lemmatize(word) for word in comment.split())

    df['comment'] = df['comment'].str.lower().map(_normalise)

    features = df["comment"]
    labels = df["label"]

    # NOTE(review): the vocabulary is fitted on the full data set, i.e. before
    # the train/test split below -- confirm this is intended.
    vectorizer = CountVectorizer(max_features=1500, min_df=2, max_df=0.7, stop_words="english")
    # Keep the document-term matrix sparse: train_test_split and MultinomialNB
    # both accept scipy sparse input, so a dense copy is not needed.
    comment_counts = vectorizer.fit_transform(features)

    # Save Vectorizer
    print("Saving santiment vectorizer....")
    with open(sentiment_vector, 'wb') as handle:
        pickle.dump(vectorizer, handle)

    print("Data Splitting")
    # Split train/test; the cleaned comments are split alongside the features
    # so each test row can be paired with its text in the results table.
    train_features, test_features, train_labels, test_labels, _, test_org = train_test_split(
        comment_counts,
        labels,
        features,
        stratify=labels,
        test_size=float(classification_test_size),
        random_state=42)

    # MNB CLASSIFICATION
    mnbModel = MultinomialNB().fit(train_features, train_labels)
    mnb_prediction = mnbModel.predict(test_features)
    mnb_prob = mnbModel.predict_proba(test_features)

    # Build the per-row results table column by column so each column keeps
    # its own dtype (going through one mixed np.array turns everything into
    # strings). np.asarray drops the pandas index so rows pair up by position.
    # NOTE(review): this table is not printed or saved anywhere in this
    # function -- confirm whether it should be exported.
    mnb_results = pd.DataFrame({
        'test_org': np.asarray(test_org),
        'actual': np.asarray(test_labels),
        'prediction': mnb_prediction,
    })
    # Probability columns follow the order of mnbModel.classes_. The
    # negative/neutral/positive names assume exactly three classes in that
    # order -- TODO confirm against the label encoding. For any other class
    # count, fall back to names derived from the class labels instead of
    # indexing a column that does not exist.
    probability_columns = ['negative', 'neutral', 'positive']
    if mnb_prob.shape[1] != len(probability_columns):
        probability_columns = ['class_{}'.format(label) for label in mnbModel.classes_]
    for position, column in enumerate(probability_columns):
        mnb_results[column] = mnb_prob[:, position]

    print("****Model Evaluation****")
    evaluation(mnb_prediction, mnb_prob, test_labels)

    # Save Model
    print("Saving the sentiment model....")
    with open(sentiment_classifier, 'wb') as handle:
        pickle.dump(mnbModel, handle)
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()