# ✅ app.py (SmartManuals-AI)
# Hugging Face Space-ready app with multi-model support, PDF upload, and live progress feedback
import os
import json
import io
import fitz  # PyMuPDF
import nltk
import chromadb
import shutil
import pytesseract
import gradio as gr
from PIL import Image
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# ---------------------------
# 🔧 CONFIG
# ---------------------------
pdf_folder = "Manuals"
output_jsonl_chunks = "chunks.jsonl"
chroma_path = "./chroma_store"
collection_name = "manual_chunks"
chunk_size = 750
chunk_overlap = 100
MAX_CONTEXT_CHUNKS = 3
HF_TOKEN = os.environ.get("HF_TOKEN")

MODEL_MAP = {
    "LLaMA 3 (8B)": "meta-llama/Meta-Llama-3-8B-Instruct",
    "LLaMA 4 Scout (17B)": "meta-llama/Meta-Llama-4-Scout-17B-16E-Instruct",
    "Gemma 3 (27B)": "google/gemma-3-27b-it",
    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen3 (30B)": "Qwen/Qwen3-30B-A3B"
}
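# Note: several of the checkpoints above are gated on the Hugging Face Hub, so HF_TOKEN must
# belong to an account that has accepted the corresponding model licenses for load_llm() below
# to be able to download them.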
# ---------------------------
# 📥 UTILITIES
# ---------------------------
def clean_text(text):
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

def tokenize_sentences(text):
    # 'punkt_tab' is required by newer NLTK releases; downloading both is harmless on older ones
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)
    return sent_tokenize(text)
def split_into_chunks(sentences, max_tokens=750, overlap=100):
    chunks, current_chunk, current_len = [], [], 0
    for sentence in sentences:
        token_count = len(sentence.split())
        if current_len + token_count > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            # Carry over trailing sentences worth roughly `overlap` tokens so adjacent chunks
            # share context (the overlap is measured in tokens, not sentences)
            carry, carry_len = [], 0
            for prev in reversed(current_chunk):
                prev_len = len(prev.split())
                if carry_len + prev_len > overlap:
                    break
                carry.insert(0, prev)
                carry_len += prev_len
            current_chunk, current_len = carry, carry_len
        current_chunk.append(sentence)
        current_len += token_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
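# Example (illustrative): with max_tokens=750 and overlap=100, a ~2,000-word page yields roughly
# three chunks, each chunk after the first opening with ~100 words repeated from the end of the
# previous one.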
def extract_metadata_from_filename(filename):
    name = filename.lower().replace("_", " ").replace("-", " ")
    meta = {"model": "unknown", "doc_type": "unknown", "brand": "life fitness"}
    if "om" in name: meta["doc_type"] = "owner manual"
    elif "sm" in name: meta["doc_type"] = "service manual"
    elif "assembly" in name: meta["doc_type"] = "assembly instructions"
    elif "alert" in name: meta["doc_type"] = "installer alert"
    elif "parts" in name: meta["doc_type"] = "parts manual"
    known_models = ["se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl", "everest", "engage"]
    for model in known_models:
        if model.replace(" ", "") in name.replace(" ", ""):
            meta["model"] = model
            break  # stop at the first (most specific) match so "se3hd" is not overwritten by "se3"
    return meta
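# Example (illustrative): a file named "SE3HD_OM_EN.pdf" is tagged as
# {"model": "se3hd", "doc_type": "owner manual", "brand": "life fitness"}.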
def extract_text_with_ocr(page):
    text = page.get_text().strip()
    if text:
        return text
    # No embedded text layer: rasterize the page and fall back to Tesseract OCR
    pix = page.get_pixmap(dpi=300)
    img = Image.open(io.BytesIO(pix.tobytes("png")))
    return pytesseract.image_to_string(img).strip()
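# Note: the OCR fallback requires the Tesseract binary to be installed in the Space
# (e.g. via packages.txt); pytesseract is only a wrapper around that executable.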
# ---------------------------
# 🧠 EMBEDDING + CHROMA
# ---------------------------
def embed_pdfs_from_uploaded(files, progress=gr.Progress(track_tqdm=True)):
    os.makedirs(pdf_folder, exist_ok=True)
    temp_chunks = []
    for file in files:
        filename = os.path.basename(file.name)
        dst = os.path.join(pdf_folder, filename)
        shutil.copy(file.name, dst)
        doc = fitz.open(dst)
        meta = extract_metadata_from_filename(filename)
        for page_num, page in enumerate(doc, start=1):
            text = extract_text_with_ocr(page)
            sents = tokenize_sentences(clean_text(text))
            chunks = split_into_chunks(sents, chunk_size, chunk_overlap)
            for i, chunk in enumerate(chunks):
                temp_chunks.append({
                    "chunk_id": f"{filename}::page_{page_num}::chunk_{i+1}",
                    "source_file": filename,
                    "page": page_num,
                    "text": chunk,
                    **meta
                })
    with open(output_jsonl_chunks, "w", encoding="utf-8") as f:
        for c in temp_chunks:
            json.dump(c, f)
            f.write("\n")
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=chroma_path)
    # Rebuild the collection from scratch so stale chunks from previous uploads are dropped
    try:
        client.delete_collection(collection_name)
    except Exception:
        pass
    collection = client.create_collection(collection_name)
    for i in tqdm(range(0, len(temp_chunks), 16)):
        batch = temp_chunks[i:i + 16]
        texts = [b["text"] for b in batch]
        # Keep everything except the chunk text itself as metadata to avoid storing it twice
        metadatas = [{k: v for k, v in b.items() if k != "text"} for b in batch]
        ids = [b["chunk_id"] for b in batch]
        embeddings = embedder.encode(texts).tolist()
        collection.add(documents=texts, ids=ids, metadatas=metadatas, embeddings=embeddings)
    return collection, embedder
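# Note: every call re-reads and re-embeds all uploaded PDFs and also writes the chunk records
# to chunks.jsonl, which is handy for inspecting what was actually indexed.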
# ---------------------------
# 🤖 LLM INFERENCE
# ---------------------------
def load_llm(model_key):
    model_id = MODEL_MAP.get(model_key)
    if not model_id or not HF_TOKEN:
        return None, None, None
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, device_map="auto")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
    return tokenizer, model, pipe
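# Note: device_map="auto" relies on the accelerate package and, for the larger checkpoints in
# MODEL_MAP, on GPU hardware with enough memory; on a CPU-only Space these loads will be very
# slow or fail outright.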
def generate_answer(pipe, tokenizer, context, query):
    messages = [
        {"role": "system", "content": "You are an expert manual assistant. Answer accurately using only the context."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # return_full_text=False strips the prompt so only the newly generated answer is returned
    output = pipe(prompt, return_full_text=False)[0]["generated_text"]
    return output.strip()
# ---------------------------
# 🎯 FULL PIPELINE
# ---------------------------
def rag_pipeline(query, model_key, files):
    if not files:
        return "Please upload at least one PDF manual first."
    collection, embedder = embed_pdfs_from_uploaded(files)
    # Query with the same embedding model that was used to index the chunks
    query_embedding = embedder.encode([query]).tolist()
    results = collection.query(query_embeddings=query_embedding, n_results=MAX_CONTEXT_CHUNKS)
    if not results["documents"] or not results["documents"][0]:
        return "No matches found."
    context = "\n\n".join(results["documents"][0])
    tokenizer, model, pipe = load_llm(model_key)
    if pipe:
        return generate_answer(pipe, tokenizer, context, query)
    return "Model could not be loaded (check HF_TOKEN and model access)."
# ---------------------------
# 🖥️ GRADIO UI
# ---------------------------
with gr.Blocks() as demo:
    gr.Markdown("""# 🧠 SmartManuals-AI with Multi-Model RAG
Upload your PDF manuals and ask smart questions. Choose your preferred LLM.""")
    with gr.Row():
        file_upload = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload Manuals")
    with gr.Row():
        query_box = gr.Textbox(label="Question")
        model_selector = gr.Dropdown(label="Choose Model", choices=list(MODEL_MAP.keys()), value="LLaMA 3 (8B)")
    submit_btn = gr.Button("Run Query")
    answer_box = gr.Textbox(label="Answer", lines=8)
    submit_btn.click(fn=rag_pipeline, inputs=[query_box, model_selector, file_upload], outputs=[answer_box])

demo.launch()
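# To run outside a Space: install the Python dependencies (gradio, pymupdf, nltk, chromadb,
# pytesseract, pillow, sentence-transformers, transformers, accelerate, torch, tqdm) plus the
# Tesseract binary, set HF_TOKEN in the environment, and start the app with `python app.py`.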