Spoken.py

# Libraries
import pickle

import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
# from nltk import word_tokenize
# from nltk.stem import WordNetLemmatizer

# Load the dataset, keeping only the two columns we use
fields = ['Content', 'Label']
df = pd.read_csv('STdataset.csv', skipinitialspace=True, usecols=fields)

# Stripping function: remove HTML markup, then drop very short tokens
def remove_tags(text):
    soup = BeautifulSoup(text, "lxml")
    # Remove every <style> block, not just the first one
    for style in soup.find_all('style'):
        style.decompose()
    string = soup.get_text()
    # Normalise non-breaking spaces; str.split() below already splits on
    # '\n', '\r' and '\t'
    string = string.replace('\xa0', ' ')
    string = ' '.join(w for w in string.split() if len(w) >= 3)
    return string

df['Content'] = df['Content'].apply(remove_tags)
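
# For illustration (hypothetical input):
#   remove_tags('<p>Hi&nbsp;there, <b>general</b> Kenobi!</p>')
# returns 'there, general Kenobi!' -- the markup is stripped and 'Hi'
# is dropped because it is shorter than three characters.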

# Lemmatizer (optional; pass tokenizer=LemmaTokenizer() to TfidfVectorizer)
'''
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
'''

# Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
x = vectorizer.fit_transform(df['Content'])

# Minority oversampling with SMOTE
# (fit_sample was renamed fit_resample in imbalanced-learn 0.4)
sm = SMOTE(random_state=42)
x, y = sm.fit_resample(x, df['Label'])

# Model fitting
model = LinearSVC(random_state=42, tol=5, fit_intercept=False)
model.fit(x, y)

# Persist the trained model to disk
filename = 'tutorial_model.sav'
with open(filename, 'wb') as f:
    pickle.dump(model, f)

# Predictor function
def predictor(comment):
    simplified = remove_tags(comment)
    tester = [simplified]
    print(simplified)
    contest = vectorizer.transform(tester)
    # Reload the persisted model for every call
    with open(filename, 'rb') as f:
        load_model = pickle.load(f)
    a = load_model.predict(contest)
    return a[0]
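
# Minimal usage sketch. The HTML snippet below is a made-up example, and
# this assumes the training steps above have run so 'tutorial_model.sav'
# exists on disk.
if __name__ == '__main__':
    sample = '<p>How do I merge two branches in <b>git</b>?</p>'
    print(predictor(sample))  # prints the predicted Label for the comment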