-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstreamlit.py
More file actions
156 lines (132 loc) · 4.97 KB
/
streamlit.py
File metadata and controls
156 lines (132 loc) · 4.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
import re
from urllib.parse import parse_qs, urlparse
import contractions
import gdown
import joblib
import nltk
import numpy as np
import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import streamlit as st
# Download the necessary NLTK models
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
# # Helper function to download file from Google Drive
def download_from_google_drive(drive_id, destination):
# Check if drive_id is a full URL or just an ID
if drive_id.startswith('http'):
parsed_url = urlparse(drive_id)
params = parse_qs(parsed_url.query)
file_id = params.get('id', [None])[0]
if not file_id:
raise ValueError("Could not extract file ID from the provided URL")
else:
file_id = drive_id
url = f"https://drive.google.com/uc?id={file_id}"
try:
output = gdown.download(url, destination, quiet=False)
if output is None:
print(f"Failed to download file with ID: {file_id}")
return False
print(f"File successfully downloaded to {output}")
return True
except Exception as e:
print(f"An error occurred while downloading: {str(e)}")
return False
# Download models and TF-IDF vectorizer
@st.cache_resource
def load_tfidf_vectorizer():
# Replace with your actual Google Drive ID
tfidf_drive_id = '1kavw9Dwgtgwuy4gVDdTtkUpcAeACw68I'
if not os.path.exists('tfidf_vectorizer.pkl'):
download_from_google_drive(tfidf_drive_id, 'tfidf_vectorizer.pkl')
return joblib.load('tfidf_vectorizer.pkl')
tfidf_vectorizer = load_tfidf_vectorizer()
@st.cache_resource
def load_model(model_name):
drive_ids = {
"SVM": "1etPvpOO6qyHEQgpND93mf5wG52zgbGQv",
"Random Forest": "1AVP7XbqaRjMb5qE3kc829jA_FweLH_aO",
"Naive Bayes": "1iauulrpF2FISna1PhaGQay83y1c3fZ0X"
}
model_file = {
"SVM": "svm_model.pkl",
"Random Forest": "rf_model.pkl",
"Naive Bayes": "nb_model.pkl"
}.get(model_name, "svm_model.pkl")
# Download the model file if not already downloaded
if not os.path.exists(model_file):
download_from_google_drive(drive_ids[model_name], model_file)
return joblib.load(model_file)
# Preprocessing functions
def handle_negations(text):
negation_words = {"not", "n't", "never", "no", "neither", "nor"}
tokens = word_tokenize(text)
negated_tokens = []
negation_active = False
for i in range(len(tokens)):
token = tokens[i].lower()
if token in negation_words:
negation_active = True
elif token in {'.', ',', ';', '!', '?'}:
negation_active = False
if negation_active and i+1 < len(tokens):
negated_tokens.append(f"NOT_{tokens[i+1]}")
negation_active = False
continue
else:
negated_tokens.append(tokens[i])
return ' '.join(negated_tokens)
def preprocess_text(text):
text = BeautifulSoup(text, "html.parser").get_text()
text = re.sub(r'\[.*?\]', '', text)
text = contractions.fix(text)
text = re.sub(r'https?://\S+|www\.\S+', '', text)
text = re.sub(r'(.)\1{2,}', r'\1\1', text)
text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
text = text.lower()
text = handle_negations(text)
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(word) for word in tokens]
text = ' '.join(tokens)
return text
# Prediction function
def predict_sentiment(manual_text, model_name):
model = load_model(model_name)
preprocessed_text = preprocess_text(manual_text)
text_features = tfidf_vectorizer.transform([preprocessed_text])
sentiment = model.predict(text_features)
sentiment_label = 'positive' if sentiment[0] == 0 else 'negative'
return sentiment_label
# Ensure that the model option is stored in session state
if "model_option" not in st.session_state:
st.session_state.model_option = "SVM"
# Streamlit UI
st.title("Sentiment Analysis App")
st.write("Enter a movie review, and select the model to predict the sentiment.")
# Text input
user_input = st.text_area("Enter your review here:", "")
# Model selection (default to SVM)
model_option = st.selectbox(
"Select a model:",
("SVM", "Random Forest", "Naive Bayes"),
index=("SVM", "Random Forest", "Naive Bayes").index(st.session_state.model_option)
)
# Update session state when model selection changes
st.session_state.model_option = model_option
# Prediction
if st.button("Predict Sentiment"):
prediction = predict_sentiment(user_input, st.session_state.model_option)
st.write(f"The predicted sentiment is: **{prediction}**")