# Uploaded by vanhai123 · commit "Update app.py" (2df143d, verified)
import gradio as gr
import os
import json
import shutil
import html
from datetime import datetime
from retriever import retriever, reload_retriever
from generator import answer_query
from langchain_community.document_loaders import (
PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Path to the custom CSS file handed to gr.Blocks
CUSTOM_CSS_PATH = "gradio_theme.css"
# Bookkeeping for uploaded files: JSON file the upload list is saved to and loaded from
UPLOADED_FILES_JSON = "uploaded_files.json"
# In-memory upload list; each record is a dict with "name" and "timestamp" keys
uploaded_files = []
def save_uploaded_files_to_json():
    """Write the in-memory upload list to ``UPLOADED_FILES_JSON``.

    The file is overwritten with UTF-8 encoded JSON, indented by two spaces;
    non-ASCII characters are written as-is rather than as ``\\u`` escapes.
    """
    with open(UPLOADED_FILES_JSON, mode="w", encoding="utf-8") as json_file:
        json.dump(
            uploaded_files,
            json_file,
            indent=2,
            ensure_ascii=False,
        )
def load_uploaded_files_from_json():
    """Reload the module-level ``uploaded_files`` list from ``UPLOADED_FILES_JSON``.

    The list is reset to empty when the file is missing, cannot be read, is
    not valid JSON, or does not hold a JSON array, so a damaged bookkeeping
    file cannot stop the app from starting. Array entries that are not
    objects with string ``"name"`` and ``"timestamp"`` values are dropped,
    because the listing code indexes and slices both fields.
    """
    global uploaded_files
    import logging  # local import: only needed on the error path

    try:
        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # Normal on first run: nothing has been uploaded yet.
        loaded = []
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logging.getLogger(__name__).warning(
            "Ignoring unreadable upload list %s: %s", UPLOADED_FILES_JSON, exc
        )
        loaded = []

    if not isinstance(loaded, list):
        loaded = []

    uploaded_files = [
        entry
        for entry in loaded
        if isinstance(entry, dict)
        and isinstance(entry.get("name"), str)
        and isinstance(entry.get("timestamp"), str)
    ]
def update_uploaded_files():
    """Render the upload list as Markdown.

    Returns a placeholder line when the list is empty; otherwise a heading
    followed by one bullet per record showing the file name and the first
    19 characters of its timestamp.
    """
    if uploaded_files:
        bullets = []
        for record in uploaded_files:
            name = record['name']
            uploaded_at = record['timestamp'][:19]
            bullets.append(f"- {name} (Uploaded: {uploaded_at})")
        return "### 📚 Danh sách tài liệu đã xử lý:\n" + "\n".join(bullets)
    return "_Chưa có tài liệu nào được tải lên._"
# Load the saved upload list at import time, before the UI below is built
load_uploaded_files_from_json()
def process_document(file):
    """Index an uploaded document and record it in the upload list.

    The document is loaded with a loader chosen by file extension
    (case-insensitive), split into 1000-character chunks with 100 characters
    of overlap, embedded, and saved as a FAISS index under ``vectorstore/``.
    The previous index is removed only after the new one has been built, so
    a rejected or unreadable upload leaves the existing index intact.

    Args:
        file: Value supplied by the ``gr.File`` input: an object exposing the
            path as ``.name``, a plain path string, or ``None`` when no file
            was selected.

    Returns:
        A ``(status_message, uploaded_files_markdown)`` tuple.
    """
    if file is None:
        return "Vui lòng chọn một tài liệu để tải lên.", update_uploaded_files()

    # Accept both objects carrying the path in ``.name`` and bare path strings.
    file_path = getattr(file, "name", file)
    lowered = file_path.lower()

    try:
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(".csv"):
            loader = CSVLoader(file_path)
        elif lowered.endswith(".txt"):
            loader = TextLoader(file_path, autodetect_encoding=True)
        elif lowered.endswith((".docx", ".doc")):
            loader = UnstructuredWordDocumentLoader(file_path)
        else:
            return "Định dạng file không hỗ trợ.", update_uploaded_files()
        documents = loader.load()
    except Exception as e:
        # Boundary with third-party loaders: report the failure in the UI
        # instead of letting it propagate.
        return f"Lỗi khi tải tài liệu: {e}", update_uploaded_files()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    if not docs:
        return "Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)

    # Replace the old index only now that the new one exists in memory.
    # NOTE(review): the index is rebuilt from this file alone while the upload
    # list keeps earlier entries — confirm that is intended.
    if os.path.exists("vectorstore"):
        shutil.rmtree("vectorstore")
    db.save_local("vectorstore")
    reload_retriever()

    uploaded_files.append({
        "name": os.path.basename(file_path),
        "timestamp": datetime.now().isoformat(),
    })
    save_uploaded_files_to_json()
    return f"Đã xử lý {len(docs)} đoạn từ **{file_path}**", update_uploaded_files()
def delete_file(filename):
    """Drop every upload record whose name equals *filename*.

    Leading and trailing whitespace in *filename* is ignored. The reduced
    list is saved to disk and the refreshed Markdown listing is returned.
    This function itself only edits the bookkeeping list.
    """
    global uploaded_files
    target = filename.strip()
    remaining = [
        record
        for record in uploaded_files
        if record["name"] != target
    ]
    uploaded_files = remaining
    save_uploaded_files_to_json()
    return update_uploaded_files()
def clear_inputs():
    """Return two empty strings, used to blank the question box and the answer panel."""
    blank_question, blank_answer = "", ""
    return blank_question, blank_answer
def _collect_source_labels(docs):
    """Return sorted, de-duplicated, HTML-escaped source labels for *docs*.

    A document's label is its ``section`` metadata when present, otherwise the
    base name of its ``source`` metadata, otherwise ``"Unknown"``.
    """
    labels = set()
    for doc in docs:
        section = doc.metadata.get("section")
        if section:
            label = str(section).strip()
        else:
            # ``or`` also covers a ``source`` key that is present but None/empty.
            source = doc.metadata.get("source") or "Unknown"
            label = os.path.basename(str(source)).strip()
        if label:  # skip labels that were only whitespace
            labels.add(label)
    # Sort the raw labels first so ordering does not depend on escaping.
    return [html.escape(label) for label in sorted(labels)]


def query_function(question, temperature, include_sources):
    """Answer *question* and optionally append the list of sources.

    Args:
        question: Text entered in the question box.
        temperature: Value forwarded to ``answer_query`` as ``temperature``.
        include_sources: When true and documents were returned, a
            "Nguồn tham khảo" bullet list is appended to the answer.

    Returns:
        The HTML-escaped answer, followed by the HTML-escaped source list
        when one was requested and is non-empty.
    """
    # NOTE(review): the same id is used for the embedding model when indexing
    # — confirm answer_query expects it as ``model``.
    fixed_model = "sentence-transformers/all-MiniLM-L6-v2"
    answer, docs = answer_query(question, model=fixed_model, temperature=temperature)
    answer = html.escape(answer)

    if include_sources and docs:
        labels = _collect_source_labels(docs)
        if labels:
            bullets = "\n".join(f"- {label}" for label in labels)
            answer += "\n\n**Nguồn tham khảo:**\n" + bullets
    return answer
# Build the Gradio interface
# NOTE(review): the nesting below is inferred from the component order — confirm against the intended layout.
with gr.Blocks(css=CUSTOM_CSS_PATH) as demo:
    # Page header
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown("## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ", elem_classes="container-box")
    with gr.Tabs():
        # Search tab
        with gr.TabItem("🔍 Tìm kiếm"):
            with gr.Column(elem_classes="container-box"):
                question = gr.Textbox(lines=3, label="Câu hỏi")
                with gr.Row():
                    temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
                with gr.Row():
                    search_btn = gr.Button("🔍 Tìm kiếm", variant="primary", elem_classes="button-primary")
                    clear_btn = gr.Button("🗑️ Xóa", variant="secondary", elem_classes="button-secondary")
                output = gr.Markdown(elem_classes="output-box")
            # Search button: run the query and show the answer in the Markdown panel
            search_btn.click(query_function,
                             inputs=[question, temperature, include_sources],
                             outputs=[output])
            # Clear button: blank both the question box and the answer panel
            clear_btn.click(clear_inputs,
                            outputs=[question, output])
        # Document management tab
        with gr.TabItem("📚 Quản lý tài liệu"):
            with gr.Column(elem_classes="container-box"):
                upload_file = gr.File(label="Tải lên tài liệu", file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"])
                upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
                # Initial listing comes from the upload list loaded at startup
                uploaded_files_list = gr.Markdown(value=update_uploaded_files(), elem_classes="scroll-box")
            with gr.Column(elem_classes="container-box"):
                delete_filename = gr.Textbox(label="Tên file muốn xóa")
                delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")
            # Upload button: index the file, then refresh the status box and the listing
            upload_btn.click(process_document,
                             inputs=[upload_file],
                             outputs=[upload_status, uploaded_files_list])
            # Delete button: remove the named record and refresh the listing
            delete_btn.click(delete_file,
                             inputs=[delete_filename],
                             outputs=[uploaded_files_list])
# Start the app with share=True
demo.launch(share=True)