# ✅ SmartManuals-AI app.py (for Hugging Face Spaces)
# Optimized to support multiple LLMs, Gradio UI, and secure on-device document QA

import io
import os

import chromadb
import fitz  # PyMuPDF
import gradio as gr
import nltk
import pytesseract
import torch
from nltk.tokenize import sent_tokenize
from PIL import Image
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ----------------------
# 🔧 Configuration
# ----------------------
PDF_DIR = "./Manuals"
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
MAX_CONTEXT_CHUNKS = 3
CHUNK_SIZE = 750     # approximate words per chunk
CHUNK_OVERLAP = 100  # approximate words shared between consecutive chunks

MODEL_OPTIONS = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "google/gemma-1.1-7b-it",
    "Qwen/Qwen1.5-14B-Chat",
    "mistralai/Mistral-7B-Instruct-v0.3",
]

HF_TOKEN = os.environ.get("HF_TOKEN")

# ----------------------
# 📚 NLTK Setup
# ----------------------
for resource in ("punkt", "punkt_tab"):  # recent NLTK releases also require punkt_tab
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource)

# ----------------------
# 📄 Utility Functions
# ----------------------
def extract_text_or_ocr(page):
    """Return (text, used_ocr); fall back to Tesseract OCR on scanned pages."""
    text = page.get_text().strip()
    if text:
        return text, False
    pix = page.get_pixmap(dpi=300)
    img = Image.open(io.BytesIO(pix.tobytes("png")))
    return pytesseract.image_to_string(img).strip(), True


def clean_text(text):
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())


def tokenize_sentences(text):
    return sent_tokenize(text)


def split_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Greedily pack sentences into ~max_tokens-word chunks with word-level overlap."""
    chunks, chunk, length = [], [], 0
    for sentence in sentences:
        count = len(sentence.split())
        if length + count > max_tokens and chunk:
            chunks.append(" ".join(chunk))
            # Carry over trailing sentences totalling at most `overlap` words,
            # so the overlap is measured in words rather than whole sentences.
            carried, carried_len = [], 0
            for s in reversed(chunk):
                s_len = len(s.split())
                if carried_len + s_len > overlap:
                    break
                carried.insert(0, s)
                carried_len += s_len
            chunk, length = carried, carried_len
        chunk.append(sentence)
        length += count
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks


def extract_metadata(filename):
    name = filename.lower().replace("_", " ").replace("-", " ")
    meta = {"model": "unknown", "doc_type": "unknown", "brand": "life fitness"}
    if "om" in name or "owner" in name:
        meta["doc_type"] = "owner manual"
    elif "sm" in name or "service" in name:
        meta["doc_type"] = "service manual"
    elif "assembly" in name:
        meta["doc_type"] = "assembly instructions"
    elif "alert" in name:
        meta["doc_type"] = "installer alert"
    elif "parts" in name:
        meta["doc_type"] = "parts manual"
    elif "bulletin" in name:
        meta["doc_type"] = "service bulletin"
    for kw in ["se3hd", "se3", "se4", "symbio", "explore", "integrity x",
               "integrity sl", "everest", "engage", "inspire", "discover",
               "95t", "95x", "95c", "95r", "97c"]:
        if kw.replace(" ", "") in name.replace(" ", ""):
            meta["model"] = kw
            break  # keep the first (most specific) match, e.g. "se3hd" over "se3"
    return meta

# ----------------------
# 🧠 Load LLM
# ----------------------
def load_llm(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=HF_TOKEN,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1,
        max_new_tokens=512,  # the pipeline's default generation length can truncate answers
    )
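# ----------------------
# 💬 Prompt Helper (optional sketch)
# ----------------------
# Instruct-tuned models generally answer more reliably when the prompt is
# rendered through their own chat template instead of a raw string. This is a
# minimal sketch, not part of the original pipeline: `build_prompt` is our own
# name, and it assumes a transformers version where tokenizers expose
# `apply_chat_template`. Note that some templates (e.g. Gemma's) reject a
# system role; fold the instruction into the user turn for those models.
def build_prompt(tokenizer, context, question):
    messages = [
        {"role": "system",
         "content": "Answer strictly from the provided manual excerpts."},
        {"role": "user",
         "content": f"Context:\n{context}\n\nQuestion: {question}"},
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )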
# ----------------------
# 🧠 Chroma + Embed
# ----------------------
def embed_pdfs():
    os.makedirs(CHROMA_PATH, exist_ok=True)
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    # Rebuild from scratch; delete_collection raises if the collection
    # does not exist yet, so that case is simply ignored.
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass
    collection = client.create_collection(COLLECTION_NAME)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    for file in tqdm(sorted(os.listdir(PDF_DIR))):
        if not file.lower().endswith(".pdf"):
            continue
        doc = fitz.open(os.path.join(PDF_DIR, file))
        meta = extract_metadata(file)
        for page_num, page in enumerate(doc, 1):
            text, _ = extract_text_or_ocr(page)
            if not text.strip():
                continue
            chunks = split_chunks(tokenize_sentences(clean_text(text)))
            if not chunks:
                continue
            # Embed all chunks from a page in one batch rather than one at a time
            embeddings = embedder.encode(chunks).tolist()
            collection.add(
                documents=chunks,
                ids=[f"{file}::p{page_num}::c{i}" for i in range(len(chunks))],
                embeddings=embeddings,
                metadatas=[{**meta, "source_file": file, "page": page_num}
                           for _ in chunks],
            )
        doc.close()
    return collection, embedder

# ----------------------
# 🔍 RAG Pipeline
# ----------------------
# Cache the vector store and each loaded model so a query does not re-embed
# every PDF and reload weights from scratch.
_collection, _embedder = None, None
_pipelines = {}


def answer_query(q, model_id):
    global _collection, _embedder
    if _collection is None:
        _collection, _embedder = embed_pdfs()
    if model_id not in _pipelines:
        _pipelines[model_id] = load_llm(model_id)
    pipe = _pipelines[model_id]

    emb_q = _embedder.encode([q])[0].tolist()
    results = _collection.query(query_embeddings=[emb_q], n_results=MAX_CONTEXT_CHUNKS)
    docs = results["documents"][0] if results["documents"] else []
    if not docs:
        return "No relevant passages were found in the indexed manuals."
    context = "\n\n".join(docs)
    prompt = (
        "Use the context below to answer the question.\n"
        f"Context:\n{context}\n\nQuestion: {q}\nAnswer:"
    )
    return pipe(prompt)[0]["generated_text"].split("Answer:")[-1].strip()

# ----------------------
# 🚀 Gradio UI
# ----------------------
with gr.Blocks() as app:
    gr.Markdown(
        """# SmartManuals-AI
**Local-first document QA** powered by OCR, ChromaDB & your choice of LLM (via Hugging Face).
"""
    )
    with gr.Row():
        question = gr.Textbox(
            placeholder="Ask a question from the manuals...", label="Question"
        )
        model_choice = gr.Dropdown(
            label="Choose Model", choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0]
        )
    output = gr.Textbox(label="Answer", lines=10)
    run = gr.Button("Run RAG")
    run.click(fn=answer_query, inputs=[question, model_choice], outputs=output)

if __name__ == "__main__":
    app.launch()
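# ----------------------
# 📝 Usage Notes
# ----------------------
# Gated checkpoints (Llama, Gemma, Mistral) require HF_TOKEN to be set as an
# environment variable or Space secret before the app starts.
# A quick smoke test without launching the UI (the example question below is
# hypothetical; it assumes this file is saved as app.py):
#   >>> from app import answer_query, MODEL_OPTIONS
#   >>> answer_query("How do I calibrate the incline motor?", MODEL_OPTIONS[0])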