samples-python/flask-langchain-app/app.py at a90a95237e5403e0afbb8efbce9a993b9213aec3 · keploy/samples-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
from flask import Flask, render_template, request, jsonify
from werkzeug.utils import secure_filename
import fitz  # PyMuPDF
from docx import Document
import magic
from datetime import datetime
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms.fake import FakeListLLM

# Application object plus its upload-related settings, configured in one place.
app = Flask(__name__)
app.config.update(
    UPLOAD_FOLDER='static/uploads',
    MAX_CONTENT_LENGTH=16 * 1024 * 1024,  # 16 MB upload size cap
    ALLOWED_EXTENSIONS={'pdf', 'docx', 'txt'},
)

# Process-local store of uploaded documents; each entry is a dict with
# 'filename' and 'text' keys. Contents are lost when the process exits.
documents = []

# Helper functions

def allowed_file(filename):
    """Return True when *filename* carries an extension the app accepts.

    The extension is whatever follows the last dot, compared
    case-insensitively against ``app.config['ALLOWED_EXTENSIONS']``.
    A name without any dot is never allowed.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in app.config['ALLOWED_EXTENSIONS']

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated in page order."""
    with fitz.open(file_path) as pdf:
        # Gather the per-page text while the document is still open.
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

def extract_text_from_docx(file_path):
    """Return the paragraph text of the .docx file at *file_path*.

    Each paragraph is followed by a newline, so a non-empty document
    always ends with ``"\\n"``.
    """
    paragraphs = Document(file_path).paragraphs
    return "".join(f"{para.text}\n" for para in paragraphs)

def extract_text_from_txt(file_path):
    """Return the full contents of the text file at *file_path*, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page

@app.route('/upload', methods=['POST'])
def upload_file():
    """Store an uploaded document, extract its text and register it for querying.

    Expects a multipart form upload under the field name ``file``. The file
    is saved into ``app.config['UPLOAD_FOLDER']`` under a timestamp-prefixed,
    sanitized name, its MIME type is detected from the saved content, and the
    matching extractor is run. The extracted text is appended to the
    module-level ``documents`` list.

    Returns:
        A JSON response. On success it contains ``message`` and the stored
        ``filename``. On failure it contains ``error`` with status 400: no
        ``file`` part, empty filename, disallowed extension, or a detected
        MIME type that has no extractor.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    if not file or not allowed_file(file.filename):
        return jsonify({'error': 'Invalid file type'}), 400

    # The folder is otherwise only created when this module is run as a
    # script, so make sure it exists for every other way of starting the app.
    upload_dir = app.config['UPLOAD_FOLDER']
    os.makedirs(upload_dir, exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_')
    filename = timestamp + secure_filename(file.filename)
    file_path = os.path.join(upload_dir, filename)
    file.save(file_path)

    # The extension was only a first filter; pick the extractor from the
    # MIME type detected in the saved file's content.
    extractors = {
        'application/pdf': extract_text_from_pdf,
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': extract_text_from_docx,
        'text/plain': extract_text_from_txt,
    }
    file_type = magic.from_file(file_path, mime=True)
    extractor = extractors.get(file_type)
    if extractor is None:
        # Do not leave rejected uploads behind on disk. Removal is best
        # effort: the client still gets the same 400 if it fails.
        try:
            os.remove(file_path)
        except OSError:
            app.logger.warning('Could not remove rejected upload %s', file_path)
        return jsonify({'error': 'Unsupported file type'}), 400

    text = extractor(file_path)
    documents.append({'filename': filename, 'text': text})
    return jsonify({'message': 'File uploaded and processed successfully', 'filename': filename})

@app.route('/query', methods=['POST'])
def query():
    """Answer a question against the combined text of all uploaded documents.

    Expects a JSON object body with a non-empty ``query`` value. The text of
    every entry in ``documents`` is joined, split into chunks, indexed in a
    FAISS vector store and passed through a ``RetrievalQA`` chain. Embeddings
    and LLM are the fake demo implementations, so the answer is a canned
    string that echoes the query.

    Returns:
        A JSON response. On success it contains ``results``, a one-element
        list holding the answer. On failure it contains ``error`` with
        status 400: missing or invalid query payload, no documents uploaded,
        or no text to index.
    """
    # silent=True makes a missing or malformed JSON body come back as None
    # rather than raising, and the isinstance check covers valid JSON that is
    # not an object (null, list, scalar). All of these get the same 400.
    data = request.get_json(silent=True)
    query_text = data.get('query') if isinstance(data, dict) else None
    if not query_text:
        return jsonify({'error': 'No query provided'}), 400
    if not documents:
        return jsonify({'error': 'No documents uploaded yet.'}), 400
    # Combine all docs for demo; in production, use per-doc QA
    all_text = '\n'.join(doc['text'] for doc in documents)
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = splitter.split_text(all_text)
    if not texts:
        # Happens when every upload yielded no text (e.g. image-only PDFs);
        # building a vector index from zero chunks would fail.
        return jsonify({'error': 'Uploaded documents contain no extractable text.'}), 400
    # Use fake embeddings and LLM for demo; replace with real ones for production
    embeddings = FakeEmbeddings(size=32)
    vectordb = FAISS.from_texts(texts, embeddings)
    retriever = vectordb.as_retriever()
    llm = FakeListLLM(responses=[f"Pretend answer for: {query_text}"])
    qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    answer = qa.run(query_text)
    return jsonify({'results': [answer]})

@app.route('/documents', methods=['GET'])
def list_documents():
    """Return the stored filename of every uploaded document as JSON."""
    sources = [{'source': entry['filename']} for entry in documents]
    return jsonify({'documents': sources})

if __name__ == '__main__':
    # Create the upload directory up front so saved files have somewhere to land.
    upload_dir = app.config['UPLOAD_FOLDER']
    os.makedirs(upload_dir, exist_ok=True)
    # NOTE(review): debug=True is hard-coded; fine for a local demo, but it
    # should not be used if this entry point is ever exposed publicly.
    app.run(debug=True)