CodeWhisper-ASR-AI/main.py at master · ChelseaYanxin/CodeWhisper-ASR-AI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301

from flask import Blueprint, Flask, render_template, request, jsonify
import fitz  # pymupdf
import re
import speechbrain as sb
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from github import Github
from rake_nltk import Rake
import os
from openai import OpenAI
from datetime import datetime
import logging
import pytesseract
from pdf2image import convert_from_path
import numpy as np
from PIL import Image
import cv2
from ASR.Model import ASR
from ASR.Model import transcribe_audio
# represent the application
app = Flask(__name__)

GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', '')
chatbot_bp = Blueprint('chatbot', __name__, template_folder='templates')
client = OpenAI(api_key='your_openai_api_key')

pytesseract.pytesseract.tesseract_cmd = r'D:\Tesseract-OCR\tesseract.exe' # your path to Tesseract executable

def preprocess_image(image):


    if isinstance(image, Image.Image):
        image = np.array(image)

    # transform to grayscale
    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    else:
        gray = image

    # binarize the image
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # denoise the image
    denoised = cv2.fastNlMeansDenoising(thresh)

    # sharpen the image
    kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
    sharpened = cv2.filter2D(denoised, -1, kernel)

    return Image.fromarray(sharpened)

def perform_ocr(image, lang='eng+chi_sim'):

    # process the image
    processed_image = preprocess_image(image)

    # convert the image to text
    custom_config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(processed_image, lang=lang, config=custom_config)

    return text

def extract_clean_pdf_text(pdf_file_path):
    # open the PDF file
    doc = fitz.open(pdf_file_path)
    full_text = ""

    for page_num in range(len(doc)):
        page = doc[page_num]

        # extract text from the page
        text = page.get_text()

        # if the text is too short, perform OCR on the page
        if len(text.strip()) < 50:  # arbitrary threshold
            # convert the page to an image
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # perform OCR on the image
            text = perform_ocr(img)

        full_text += text + "\n"

    # clean the text
    # remove excessive line breaks and whitespace
    cleaned_text = re.sub(r'\n{2,}', '\n', full_text)
    cleaned_text = re.sub(r'\s{2,}', ' ', cleaned_text)

    # split paragraphs
    paragraphs = cleaned_text.split("\n")

    # append a space to paragraphs that don't end with punctuation
    formatted_paragraphs = []
    for para in paragraphs:
        if para and not re.search(r'[.!?]$', para.strip()):
            formatted_paragraphs.append(para.strip() + " ")
        else:
            formatted_paragraphs.append(para.strip() + "\n")

    # add titles and bullet points
    final_paragraphs = []
    for para in formatted_paragraphs:
        if re.match(r'^[A-Z\s]+$', para.strip()):
            final_paragraphs.append(f"\n\n# {para.strip()}\n\n")
        elif re.match(r'^\d+\.\s', para.strip()):
            final_paragraphs.append(f"\n## {para.strip()}\n")
        elif re.match(r'^[-*]\s', para.strip()):
            final_paragraphs.append(f"- {para.strip()[2:]}\n")
        else:
            final_paragraphs.append(para)

    final_text = ''.join(final_paragraphs)
    return final_text

@app.route('/process_pdf', methods=['POST'])
def process_pdf():
    if 'file' not in request.files:
        return jsonify({'error': 'No file uploaded'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    # Save the uploaded PDF to a temporary location
    file_path = os.path.join('uploads', file.filename)
    file.save(file_path)

    try:
        # extract and clean text from the PDF
        cleaned_text = extract_clean_pdf_text(file_path)

        # remove the temporary file
        os.remove(file_path)

        return jsonify({'cleaned_text': cleaned_text})
    except Exception as e:
        # remove the temporary file
        if os.path.exists(file_path):
            os.remove(file_path)
        return jsonify({'error': str(e)}), 500


def asr_audio(speech_file_path):
    asr_model = ASR()
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

    result = transcribe_audio(speech_file_path, asr_model, processor)
    return result


# Route for the HTML page
# 创建一个路由和视图函数的映射
@app.route('/')
def index():
    # return render_template('index.html')
    return render_template('CodeWhisper.html')

@app.route('/NotesHelper')
def NotesHelper():
    return render_template('NotesHelper.html')

@app.route('/SlideToNote')
def SlideToNote():
    return render_template('SlideToNote.html')

@app.route('/SpeechToNote')
def SpeechToNote():
    return render_template('SpeechToNote.html')

@app.route('/AdditionalResources')
def AdditionalResources():
    return render_template('AdditionalResources.html')

# Route to handle file upload and processing
@app.route('/process_audio', methods=['POST'])
def process_audio():
    if 'file' not in request.files:
        return jsonify({'error': 'No file uploaded'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    # check the file type
    if not file.filename.endswith(('.wav', '.mp3', '.flac', '.ogg', '.mp4','.m4a')):
        return jsonify({'error': 'Unsupported file format'}), 400

    # Save the uploaded audio to a temporary location
    file_path = f'./uploads/{file.filename}'
    file.save(file_path)

    try:
        transcription = asr_audio(file_path)
    except Exception as e:
        return jsonify({'error': f'ASR processing failed: {str(e)}'}), 500

    # get the ASR result
    # transcription = asr_audio(file_path)  可能回来

    # # Extract and clean text from the PDF
    # cleaned_text = extract_clean_pdf_text(file_path)

    # Return the cleaned text as a response
    return jsonify({'cleaned_text': transcription})


@app.route('/keyword_search', methods=['GET', 'POST'])
def key_search():
    if request.method == 'POST':
        # Get text input from user
        text = request.form.get('text', '')

        # Extract keywords using RAKE algorithm
        rake = Rake()
        rake.extract_keywords_from_text(text)

        # Get the top 3 keywords, making sure to only take the phrases (not the scores)
        keyword_scores = rake.get_ranked_phrases_with_scores()[:3]
        keywords = [phrase for score, phrase in keyword_scores]  # Fixed: properly unpack score and phrase

        # Search GitHub projects
        try:
            g = Github(GITHUB_TOKEN)
            # Combine keywords into search query
            query = ' OR '.join(keywords)
            repositories = []

            # Search repositories and get top 5 results
            repos = g.search_repositories(query, sort='stars', order='desc')
            for repo in repos[:5]:
                repositories.append({
                    'name': repo.name,
                    'url': repo.html_url,
                    'description': repo.description,
                    'stars': repo.stargazers_count
                })

            return render_template('results.html',
                                   keywords=keywords,
                                   repositories=repositories,
                                   text=text)
        except Exception as e:
            error = f"An error occurred during search: {str(e)}"
            return render_template('AdditionalResources.html', error=error)

    return render_template('AdditionalResources.html')


@app.route('/TeachingAssistant')
def TeachingAssistant():
    return render_template('TeachingAssistant.html')


@app.route('/chat', methods=['POST'])
def chat():
    try:
        user_message = request.json['message']

        # format the message in the way OpenAI expects
        messages = [
            {"role": "system",
             "content": "You are a helpful Computer Science Teaching Assistant. Please respond in English."},
            {"role": "user", "content": user_message}
        ]

        # send the message to OpenAI
        response = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=messages,
            temperature=0.7,
            max_tokens=800
        )

        assistant_message = response.choices[0].message.content

        return jsonify({
            "response": assistant_message,
            "timestamp": datetime.now().strftime("%H:%M")
        })

    except Exception as e:
        print(f"Error: {str(e)}")  # log the error
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True)