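"""Gradio UI for speech emotion recognition.

Saves the uploaded or recorded audio to a temporary WAV file, extracts
mean MFCC features, and predicts an emotion with pre-trained SVM, CNN,
and LSTM models, returning the majority vote across all predictions.
"""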
import os
import gradio as gr
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler
import joblib
import soundfile as sf
import tensorflow as tf
import traceback
# Placeholder functions (replace with actual implementations)
def noise_reduction(audio):
    # Placeholder: currently passes the signal through unchanged.
    print("Noise reduction step")
    return audio
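
# A possible drop-in body for the placeholder above: spectral-gating noise
# reduction via the third-party `noisereduce` package (an assumption, not
# part of the original; install with `pip install noisereduce`). A minimal
# sketch:
def noise_reduction_spectral_gate(audio, sr):
    import noisereduce as nr  # local import so the script runs without it
    # Estimate the noise profile from the signal itself and gate it out.
    return nr.reduce_noise(y=audio, sr=sr)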

def feature_extraction(audio, sr):
    # Extract 13 MFCCs and average them over time into a fixed-length vector
    print("Feature extraction step")
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return np.mean(mfccs.T, axis=0)

def feature_scaling(features):
    # Scale features (see the note after this function)
    print("Feature scaling step")
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features.reshape(-1, 1))
    return features_scaled.flatten()
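
# NOTE: fitting a fresh StandardScaler on every input (as above) standardizes
# the 13 MFCC means against each other rather than against the training
# distribution, so inference-time features are not on the scale the models
# were trained on. The usual fix is to persist the scaler fitted on the
# training set and reload it here. A minimal sketch, assuming a hypothetical
# "scaler.pkl" saved alongside the other model files:
def feature_scaling_with_trained_scaler(features, scaler_path="scaler.pkl"):
    scaler = joblib.load(scaler_path)  # hypothetical artifact from training
    return scaler.transform(features.reshape(1, -1)).flatten()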

def split_audio(audio, sr):
    # Placeholder: treats the whole recording as a single chunk
    print("Audio splitting step")
    return [audio]
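
# If longer recordings should really be classified chunk by chunk, a simple
# fixed-length splitter could look like this (the 3-second window is an
# assumption, not a value from the original code):
def split_audio_fixed(audio, sr, seconds=3):
    step = int(seconds * sr)
    return [audio[i:i + step] for i in range(0, len(audio), step)] or [audio]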

def load_models():
    # Paths are hard-coded to the author's machine; adjust them for your setup
    print("Loading models")
    svm_model = joblib.load("D:/MCA/4th sem/SER3/models/svm_model.pkl")
    cnn_model = tf.keras.models.load_model("D:/MCA/4th sem/SER3/models/cnn_model.h5", compile=False)
    lstm_model = tf.keras.models.load_model("D:/MCA/4th sem/SER3/models/lstm_model.h5", compile=False)
    label_encoder = joblib.load("D:/MCA/4th sem/SER3/models/label_encoder.pkl")
    return svm_model, cnn_model, lstm_model, label_encoder
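
# load_models() is invoked on every prediction, so all four artifacts are
# reloaded from disk per request. One common remedy (an addition, not part of
# the original) is to cache them for the lifetime of the process:
_MODEL_CACHE = None
def load_models_cached():
    global _MODEL_CACHE
    if _MODEL_CACHE is None:
        _MODEL_CACHE = load_models()
    return _MODEL_CACHE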

def audio_classification(svm_model, cnn_model, lstm_model, label_encoder, features_scaled):
    print("Predicting emotion using SVM")
    svm_pred = svm_model.predict(features_scaled.reshape(1, -1))
    svm_emotion = label_encoder.inverse_transform(svm_pred)[0]
    print("Predicting emotion using CNN")
    # The CNN and LSTM expect input of shape (batch, timesteps, channels)
    features_cnn = features_scaled.reshape((1, features_scaled.shape[0], 1))
    cnn_pred = cnn_model.predict(features_cnn)
    cnn_emotion = label_encoder.inverse_transform([np.argmax(cnn_pred)])[0]
    print("Predicting emotion using LSTM")
    lstm_pred = lstm_model.predict(features_cnn)
    lstm_emotion = label_encoder.inverse_transform([np.argmax(lstm_pred)])[0]
    return svm_emotion, cnn_emotion, lstm_emotion

def predict_emotion(audio_tuple):
    print("Received audio data and sample rate")
    if audio_tuple is None:
        print("No audio file provided")
        return "No audio file provided"
    print("Audio tuple:", audio_tuple)
    if len(audio_tuple) != 2:
        print("Invalid audio tuple format")
        return "Invalid audio tuple format"
    try:
        # Save the audio data to a temporary file
        temp_folder = "temp"
        if not os.path.exists(temp_folder):
            os.makedirs(temp_folder)
        temp_audio_path = os.path.join(temp_folder, "temp_audio.wav")
        sample_rate, audio_data = audio_tuple
        # soundfile expects shape (frames, channels); add a channel axis for mono input
        if audio_data.ndim == 1:
            audio_data = np.expand_dims(audio_data, axis=1)
        sf.write(temp_audio_path, audio_data, sample_rate, subtype='PCM_24')
        print("Audio saved successfully")
        # Reload the saved file; librosa downmixes to mono and resamples to its 22050 Hz default
        audio, sr = librosa.load(temp_audio_path)
        print("Audio loaded successfully")
    except Exception as e:
        print("Error processing audio data:")
        traceback.print_exc()  # Print the full traceback
        return f"Error processing audio data: {e}"
    audio = noise_reduction(audio)
    chunks = split_audio(audio, sr)
    svm_model, cnn_model, lstm_model, label_encoder = load_models()
    predictions = []
    for chunk in chunks:
        features = feature_extraction(chunk, sr)
        features_scaled = feature_scaling(features)
        svm_emotion, cnn_emotion, lstm_emotion = audio_classification(svm_model, cnn_model, lstm_model, label_encoder, features_scaled)
        predictions.extend([svm_emotion, cnn_emotion, lstm_emotion])
    # Label set for reference; label_encoder already returns string labels,
    # so this mapping is not applied below
    label_mapping = {0: "Angry", 1: "Calm", 2: "Disgust", 3: "Fear", 4: "Happy", 5: "Neutral", 6: "Pleasant Surprise", 7: "Sad"}
    # Majority vote across the SVM/CNN/LSTM predictions for all chunks
    final_prediction = max(set(predictions), key=predictions.count)
    return final_prediction

# Create the Gradio interface
input_audio = gr.Audio(label="Upload Audio File", type="numpy")
output_text = gr.Textbox(label="Predicted Emotion")
gr.Interface(fn=predict_emotion, inputs=input_audio, outputs=output_text,
             title="Speech Emotion Recognition",
             description="Upload an audio file to predict the emotion present in the audio.").launch()
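
# Running `python 8_ser_ui.py` starts the Gradio server; by default Gradio
# serves the UI locally at http://127.0.0.1:7860.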