"""Gradio front end for a small retrieval-augmented search application.

The module exposes two tabs: one to ask questions against the current vector
index, and one to upload a document (which rebuilds the index) and manage the
list of uploaded file names.
"""

import html
import json
import logging
import os
import shutil
from datetime import datetime

import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    CSVLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredWordDocumentLoader,
)
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

from generator import answer_query
from retriever import retriever, reload_retriever

logger = logging.getLogger(__name__)

# Path to the CSS file
CUSTOM_CSS_PATH = "gradio_theme.css"

# Bookkeeping for the list of uploaded files
UPLOADED_FILES_JSON = "uploaded_files.json"
uploaded_files = []

# Directory the FAISS index is saved to, and the embedding model used to build it.
VECTORSTORE_DIR = "vectorstore"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


def save_uploaded_files_to_json():
    """Write the in-memory ``uploaded_files`` list to ``UPLOADED_FILES_JSON``."""
    with open(UPLOADED_FILES_JSON, "w", encoding="utf-8") as f:
        json.dump(uploaded_files, f, ensure_ascii=False, indent=2)


def load_uploaded_files_from_json():
    """Replace ``uploaded_files`` with the contents of ``UPLOADED_FILES_JSON``.

    A missing file yields an empty list. A file that is not valid JSON, or
    whose top-level value is not a list, is ignored with a warning so that a
    damaged bookkeeping file cannot prevent the application from starting.
    """
    global uploaded_files
    if not os.path.exists(UPLOADED_FILES_JSON):
        uploaded_files = []
        return

    try:
        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except json.JSONDecodeError:
        logger.warning("%s is not valid JSON; starting with an empty list",
                       UPLOADED_FILES_JSON)
        loaded = []

    if not isinstance(loaded, list):
        logger.warning("%s does not contain a list; starting with an empty list",
                       UPLOADED_FILES_JSON)
        loaded = []
    uploaded_files = loaded


def update_uploaded_files():
    """Return a Markdown summary of ``uploaded_files`` for display in the UI.

    Each entry is rendered with its name and the first 19 characters of its
    timestamp (the ``YYYY-MM-DDTHH:MM:SS`` part of an ISO-8601 string).
    """
    if not uploaded_files:
        return "_Chưa có tài liệu nào được tải lên._"
    return "### 📚 Danh sách tài liệu đã xử lý:\n" + "\n".join(
        f"- {entry['name']} (Uploaded: {entry['timestamp'][:19]})"
        for entry in uploaded_files
    )


# Load at startup
load_uploaded_files_from_json()


def process_document(file):
    """Index an uploaded document and record it in the uploaded-files list.

    The document is loaded with a loader chosen by file extension, split into
    chunks, embedded, and saved as a FAISS index in ``VECTORSTORE_DIR``; the
    retriever is then reloaded.

    NOTE(review): the index is rebuilt from this one document only, while
    ``uploaded_files`` keeps accumulating names — confirm that is intended.

    Args:
        file: The value delivered by the ``gr.File`` component; either an
            object with a ``name`` attribute holding the path, a plain path
            string, or ``None`` when nothing was selected.

    Returns:
        A ``(status_message, uploaded_files_markdown)`` tuple.
    """
    if file is None:
        return "Vui lòng chọn một tài liệu để tải lên.", update_uploaded_files()

    # Accept both file-like objects exposing ``.name`` and plain path strings.
    file_path = getattr(file, "name", file)
    lowered_path = file_path.lower()  # extensions are matched case-insensitively

    try:
        if lowered_path.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered_path.endswith(".csv"):
            loader = CSVLoader(file_path)
        elif lowered_path.endswith(".txt"):
            loader = TextLoader(file_path, autodetect_encoding=True)
        elif lowered_path.endswith((".docx", ".doc")):
            loader = UnstructuredWordDocumentLoader(file_path)
        else:
            return "Định dạng file không hỗ trợ.", update_uploaded_files()
        documents = loader.load()
    except Exception as e:
        # UI boundary: report any loader failure to the user instead of raising.
        return f"Lỗi khi tải tài liệu: {e}", update_uploaded_files()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    if not docs:
        return "Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    db = FAISS.from_documents(docs, embeddings)

    # Remove the previous index only now that the new one has been built, so a
    # rejected or unreadable upload never destroys the working index.
    if os.path.exists(VECTORSTORE_DIR):
        shutil.rmtree(VECTORSTORE_DIR)
    db.save_local(VECTORSTORE_DIR)
    reload_retriever()

    uploaded_files.append(
        {
            "name": os.path.basename(file_path),
            "timestamp": datetime.now().isoformat(),
        }
    )
    save_uploaded_files_to_json()
    return f"Đã xử lý {len(docs)} đoạn từ **{file_path}**", update_uploaded_files()


def delete_file(filename):
    """Remove every entry named ``filename`` from the uploaded-files list.

    Only the bookkeeping list (and its JSON file) is updated; the vector index
    is not touched by this function.

    Args:
        filename: Name to remove; surrounding whitespace is ignored.

    Returns:
        The refreshed Markdown summary of uploaded files.
    """
    global uploaded_files
    filename = filename.strip()
    uploaded_files = [f for f in uploaded_files if f["name"] != filename]
    save_uploaded_files_to_json()
    return update_uploaded_files()


def clear_inputs():
    """Return empty values for the question box and the answer area."""
    return "", ""


def _source_label(doc):
    """Return the display label for a retrieved document.

    Uses the ``section`` metadata entry when it is present and non-empty,
    otherwise the base name of the ``source`` metadata entry ("Unknown" when
    that key is absent).
    """
    section = doc.metadata.get("section")
    if section:
        return section.strip()
    return os.path.basename(doc.metadata.get("source", "Unknown")).strip()


def query_function(question, temperature, include_sources):
    """Answer ``question`` and optionally append the list of sources.

    Args:
        question: The user's question text.
        temperature: Sampling temperature forwarded to ``answer_query``.
        include_sources: When true, append a de-duplicated, sorted Markdown
            list of source labels for the returned documents.

    Returns:
        The HTML-escaped answer, followed by the sources section if requested
        and any documents were returned.
    """
    fixed_model = EMBEDDING_MODEL_NAME
    answer, docs = answer_query(question, model=fixed_model, temperature=temperature)
    # Escape before rendering in a Markdown component so that the answer text
    # cannot inject raw HTML into the page.
    answer = html.escape(answer)

    if include_sources and docs:
        unique_sources = {_source_label(doc) for doc in docs}
        if unique_sources:
            sources_list = [f"- {src}" for src in sorted(unique_sources)]
            sources_text = "\n\n**Nguồn tham khảo:**\n" + "\n".join(sources_list)
            answer += sources_text
    return answer


# Build the Gradio interface
with gr.Blocks(css=CUSTOM_CSS_PATH) as demo:
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(
                "## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ",
                elem_classes="container-box",
            )
            with gr.Tabs():
                # Search tab
                with gr.TabItem("🔍 Tìm kiếm"):
                    with gr.Column(elem_classes="container-box"):
                        question = gr.Textbox(lines=3, label="Câu hỏi")
                        with gr.Row():
                            temperature = gr.Slider(
                                0, 1, value=0.2, step=0.1, label="Temperature"
                            )
                            include_sources = gr.Checkbox(
                                label="Hiển thị nguồn", value=True
                            )
                        with gr.Row():
                            search_btn = gr.Button(
                                "🔍 Tìm kiếm",
                                variant="primary",
                                elem_classes="button-primary",
                            )
                            clear_btn = gr.Button(
                                "🗑️ Xóa",
                                variant="secondary",
                                elem_classes="button-secondary",
                            )
                        output = gr.Markdown(elem_classes="output-box")
                    search_btn.click(
                        query_function,
                        inputs=[question, temperature, include_sources],
                        outputs=[output],
                    )
                    clear_btn.click(clear_inputs, outputs=[question, output])

                # Document management tab
                with gr.TabItem("📚 Quản lý tài liệu"):
                    with gr.Column(elem_classes="container-box"):
                        upload_file = gr.File(
                            label="Tải lên tài liệu",
                            file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"],
                        )
                        upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
                        upload_status = gr.Textbox(
                            label="Trạng thái", lines=3, interactive=False
                        )
                        uploaded_files_list = gr.Markdown(
                            value=update_uploaded_files(), elem_classes="scroll-box"
                        )
                    with gr.Column(elem_classes="container-box"):
                        delete_filename = gr.Textbox(label="Tên file muốn xóa")
                        delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")
                    upload_btn.click(
                        process_document,
                        inputs=[upload_file],
                        outputs=[upload_status, uploaded_files_list],
                    )
                    delete_btn.click(
                        delete_file,
                        inputs=[delete_filename],
                        outputs=[uploaded_files_list],
                    )

demo.launch(share=True)