Spaces:

vanhai123
/

ragflow-enterprise-search-app

Running

App Files Files Community

ragflow-enterprise-search-app / app.py

vanhai123

Update app.py

2df143d verified about 1 month ago

raw

history blame contribute delete

6.46 kB

	import gradio as gr
	import os
	import json
	import shutil
	import html
	from datetime import datetime
	from retriever import retriever, reload_retriever
	from generator import answer_query
	from langchain_community.document_loaders import (
	PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
	)
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS

	# Path to the CSS file used to style the Gradio interface.
	CUSTOM_CSS_PATH = "gradio_theme.css"

	# Upload registry: the JSON file that persists it, and the in-memory list
	# (one dict with "name" and "timestamp" keys per processed document).
	UPLOADED_FILES_JSON = "uploaded_files.json"
	uploaded_files = []

	def save_uploaded_files_to_json():
	    """Write the in-memory upload registry to ``UPLOADED_FILES_JSON``.

	    The module-level ``uploaded_files`` list is stored as UTF-8 JSON with a
	    two-space indent; non-ASCII characters are written verbatim instead of
	    being ``\\u``-escaped.
	    """
	    with open(UPLOADED_FILES_JSON, mode="w", encoding="utf-8") as registry_file:
	        json.dump(
	            uploaded_files,
	            registry_file,
	            ensure_ascii=False,
	            indent=2,
	        )

	def load_uploaded_files_from_json():
	    """Load the upload registry from ``UPLOADED_FILES_JSON`` into ``uploaded_files``.

	    The registry falls back to an empty list when the file is missing,
	    unreadable, not valid JSON, or does not hold a JSON list. Entries that are
	    not dicts carrying both ``"name"`` and ``"timestamp"`` are dropped, because
	    the registry renderer indexes those two keys directly. This function is
	    called at module import, so a damaged registry file must not raise here.
	    """
	    global uploaded_files
	    try:
	        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
	            loaded = json.load(f)
	    except (OSError, ValueError):
	        # OSError covers a missing/unreadable file; ValueError covers both
	        # json.JSONDecodeError and UnicodeDecodeError.
	        loaded = []

	    if not isinstance(loaded, list):
	        loaded = []

	    uploaded_files = [
	        entry
	        for entry in loaded
	        if isinstance(entry, dict) and "name" in entry and "timestamp" in entry
	    ]

	def update_uploaded_files():
	    """Render the upload registry as a Markdown string.

	    Returns a placeholder sentence when ``uploaded_files`` is empty; otherwise
	    a heading followed by one bullet per entry showing its name and the first
	    19 characters of its timestamp.
	    """
	    if not uploaded_files:
	        return "_Chưa có tài liệu nào được tải lên._"

	    bullet_lines = []
	    for entry in uploaded_files:
	        bullet_lines.append(
	            f"- {entry['name']} (Uploaded: {entry['timestamp'][:19]})"
	        )

	    heading = "### 📚 Danh sách tài liệu đã xử lý:\n"
	    return heading + "\n".join(bullet_lines)

	# Populate the in-memory registry from disk when the module starts up.
	load_uploaded_files_from_json()

	def process_document(file):
	    """Index an uploaded document and record it in the upload registry.

	    The document is loaded with a loader chosen by file extension, split into
	    chunks, embedded, and saved as a FAISS index under ``vectorstore``; the
	    retriever is then reloaded and the file is appended to the registry.

	    The previous ``vectorstore`` directory is removed only after the new index
	    has been built, so an unsupported, unreadable, or empty upload leaves the
	    existing index intact.

	    Args:
	        file: Value passed by the Gradio file input. Its ``.name`` attribute is
	            used as the on-disk path. ``None`` (nothing selected) yields a
	            status message instead of raising.

	    Returns:
	        A ``(status_message, registry_markdown)`` tuple.
	    """
	    if file is None:
	        return "Vui lòng chọn một tài liệu trước khi tải lên.", update_uploaded_files()

	    # NOTE(review): falls back to the value itself when there is no ``.name``,
	    # on the assumption that some Gradio configurations hand over a plain
	    # path string -- confirm against the installed Gradio version.
	    file_path = getattr(file, "name", file)

	    # Match extensions case-insensitively so "REPORT.PDF" is accepted too.
	    lowered_path = file_path.lower()

	    try:
	        if lowered_path.endswith(".pdf"):
	            loader = PyPDFLoader(file_path)
	        elif lowered_path.endswith(".csv"):
	            loader = CSVLoader(file_path)
	        elif lowered_path.endswith(".txt"):
	            loader = TextLoader(file_path, autodetect_encoding=True)
	        elif lowered_path.endswith((".docx", ".doc")):
	            loader = UnstructuredWordDocumentLoader(file_path)
	        else:
	            return "Định dạng file không hỗ trợ.", update_uploaded_files()

	        documents = loader.load()
	    except Exception as e:
	        # UI boundary: report any loader failure as a status message.
	        return f"Lỗi khi tải tài liệu: {e}", update_uploaded_files()

	    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
	    docs = splitter.split_documents(documents)

	    if not docs:
	        return "Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()

	    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
	    db = FAISS.from_documents(docs, embeddings)

	    # Replace the old index only now that a new one exists in memory.
	    # NOTE(review): the index holds just the latest upload, while the registry
	    # below keeps accumulating earlier file names -- confirm this is intended.
	    if os.path.exists("vectorstore"):
	        shutil.rmtree("vectorstore")
	    db.save_local("vectorstore")
	    reload_retriever()

	    uploaded_files.append({"name": os.path.basename(file_path), "timestamp": datetime.now().isoformat()})
	    save_uploaded_files_to_json()

	    return f"Đã xử lý {len(docs)} đoạn từ {file_path}", update_uploaded_files()

	def delete_file(filename):
	    """Drop every registry entry named ``filename`` and return the new listing.

	    Surrounding whitespace in ``filename`` is ignored. The registry is saved
	    to disk even when no entry matched. Only the registry is edited here;
	    nothing in this function touches the vector store.
	    """
	    global uploaded_files
	    target_name = filename.strip()

	    remaining = []
	    for entry in uploaded_files:
	        if entry["name"] != target_name:
	            remaining.append(entry)
	    uploaded_files = remaining

	    save_uploaded_files_to_json()
	    return update_uploaded_files()

	def clear_inputs():
	    """Return a pair of empty strings, one per output wired to the clear button."""
	    blank = ""
	    return (blank, blank)

	def query_function(question, temperature, include_sources):
	    """Answer ``question`` and optionally append a list of reference sources.

	    Args:
	        question: Text typed by the user. Empty, whitespace-only, or ``None``
	            input returns a prompt message without calling ``answer_query``.
	        temperature: Forwarded unchanged to ``answer_query``.
	        include_sources: When truthy and documents were returned, a sorted,
	            de-duplicated bullet list of sources is appended to the answer.

	    Returns:
	        The HTML-escaped answer text, followed by the source list if requested.
	    """
	    if not question or not question.strip():
	        return "Vui lòng nhập câu hỏi."

	    # NOTE(review): this is the same name used for the embedding model in
	    # process_document; confirm it is what answer_query expects as ``model``.
	    fixed_model = "sentence-transformers/all-MiniLM-L6-v2"
	    answer, docs = answer_query(question, model=fixed_model, temperature=temperature)
	    answer = html.escape(answer)

	    if include_sources and docs:
	        unique_sources = set()
	        for doc in docs:
	            section = doc.metadata.get("section")
	            if section:
	                label = str(section).strip()
	            else:
	                # ``or`` also covers a "source" key that is present but None/empty.
	                label = os.path.basename(doc.metadata.get("source") or "Unknown").strip()
	            if label:
	                # Labels come from document metadata (e.g. uploaded file names),
	                # so escape them just like the answer text.
	                unique_sources.add(html.escape(label))
	        if unique_sources:
	            sources_list = [f"- {src}" for src in sorted(unique_sources)]
	            answer += "\n\nNguồn tham khảo:\n" + "\n".join(sources_list)
	    return answer

	# Tạo giao diện Gradio
	# Build the Gradio interface.
	# The stylesheet's contents are passed instead of its path: Gradio releases
	# differ on whether ``css`` accepts a file path, while CSS text is accepted
	# either way. A missing stylesheet simply means no custom CSS.
	custom_css = None
	if os.path.exists(CUSTOM_CSS_PATH):
	    with open(CUSTOM_CSS_PATH, "r", encoding="utf-8") as css_file:
	        custom_css = css_file.read()

	# NOTE(review): the nesting below was reconstructed from a whitespace-flattened
	# copy of this file -- confirm the layout against the running app.
	with gr.Blocks(css=custom_css) as demo:
	    with gr.Row():
	        with gr.Column(scale=5):
	            gr.Markdown("## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ", elem_classes="container-box")

	    with gr.Tabs():
	        # Search tab
	        with gr.TabItem("🔍 Tìm kiếm"):
	            with gr.Column(elem_classes="container-box"):
	                question = gr.Textbox(lines=3, label="Câu hỏi")
	                with gr.Row():
	                    temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
	                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
	                with gr.Row():
	                    search_btn = gr.Button("🔍 Tìm kiếm", variant="primary", elem_classes="button-primary")
	                    clear_btn = gr.Button("🗑️ Xóa", variant="secondary", elem_classes="button-secondary")
	                output = gr.Markdown(elem_classes="output-box")

	            search_btn.click(query_function,
	                             inputs=[question, temperature, include_sources],
	                             outputs=[output])
	            clear_btn.click(clear_inputs,
	                            outputs=[question, output])

	        # Document management tab
	        with gr.TabItem("📚 Quản lý tài liệu"):
	            with gr.Column(elem_classes="container-box"):
	                upload_file = gr.File(label="Tải lên tài liệu", file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"])
	                upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
	                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
	                uploaded_files_list = gr.Markdown(value=update_uploaded_files(), elem_classes="scroll-box")
	            with gr.Column(elem_classes="container-box"):
	                delete_filename = gr.Textbox(label="Tên file muốn xóa")
	                delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")

	            upload_btn.click(process_document,
	                             inputs=[upload_file],
	                             outputs=[upload_status, uploaded_files_list])
	            delete_btn.click(delete_file,
	                             inputs=[delete_filename],
	                             outputs=[uploaded_files_list])

	# Launch only when run as a script, so importing this module has no side effect
	# beyond building ``demo``.
	# NOTE(review): share=True requests a public share link; confirm that is
	# intended for an app that searches internal documents.
	if __name__ == "__main__":
	    demo.launch(share=True)