# โœ… SmartManuals-AI: Hugging Face Space App (RAM Safe, Multi-model, No Preview) import os, json, fitz, torch, chromadb, docx import gradio as gr from PIL import Image from nltk.tokenize import sent_tokenize from sentence_transformers import SentenceTransformer, util from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline from tqdm import tqdm # --------------------------- # โš™๏ธ Constants # --------------------------- MANUALS_DIR = "Manuals" CHROMA_PATH = "./chroma_store" CHUNKS_JSONL = "manual_chunks.jsonl" COLLECTION_NAME = "manual_chunks" HF_TOKEN = os.environ.get("HF_TOKEN") CHUNK_SIZE = 750 CHUNK_OVERLAP = 100 TOP_K = 3 MODEL_OPTIONS = { "LLaMA 3.1 (8B)": "meta-llama/Llama-3.1-8B-Instruct", "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3", "Gemma 7B": "google/gemma-7b-it", "Qwen3 7B": "Qwen/Qwen1.5-7B-Chat" } # --------------------------- # ๐Ÿ“„ Extract Text from PDFs and DOCX # --------------------------- def extract_text_from_pdf(path): text = "" try: doc = fitz.open(path) for page in doc: page_text = page.get_text() text += page_text + "\n" doc.close() except Exception as e: print(f"โŒ PDF Error in {path}: {e}") return text def extract_text_from_docx(path): try: doc = docx.Document(path) return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) except Exception as e: print(f"โŒ DOCX Error in {path}: {e}") return "" # --------------------------- # ๐Ÿงน Clean + Chunk # --------------------------- def clean(text): return "\n".join([line.strip() for line in text.splitlines() if line.strip()]) def split_sentences(text): return sent_tokenize(text) def chunk_text(sentences, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP): chunks, chunk, length = [], [], 0 for sent in sentences: n = len(sent.split()) if length + n > size: if chunk: chunks.append(" ".join(chunk)) chunk = chunk[-overlap:] length = sum(len(s.split()) for s in chunk) chunk.append(sent) length += n if chunk: chunks.append(" ".join(chunk)) return chunks # --------------------------- # ๐Ÿ“ฆ Embed and Store in Chroma # --------------------------- def embed_all(): print("๐Ÿ” Scanning manuals and embedding...") os.makedirs(CHROMA_PATH, exist_ok=True) embedder = SentenceTransformer("all-MiniLM-L6-v2") client = chromadb.PersistentClient(path=CHROMA_PATH) if COLLECTION_NAME in [c.name for c in client.list_collections()]: client.delete_collection(COLLECTION_NAME) collection = client.create_collection(COLLECTION_NAME) all_chunks = [] files = [f for f in os.listdir(MANUALS_DIR) if f.lower().endswith((".pdf", ".docx"))] for fname in tqdm(files): path = os.path.join(MANUALS_DIR, fname) text = extract_text_from_pdf(path) if fname.endswith(".pdf") else extract_text_from_docx(path) text = clean(text) sents = split_sentences(text) chunks = chunk_text(sents) for i, chunk in enumerate(chunks): all_chunks.append({ "id": f"{fname}::chunk_{i+1}", "text": chunk, "metadata": {"source": fname} }) # Batch embed and store for i in range(0, len(all_chunks), 16): batch = all_chunks[i:i+16] docs = [c["text"] for c in batch] ids = [c["id"] for c in batch] metas = [c["metadata"] for c in batch] embs = embedder.encode(docs).tolist() collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs) print(f"โœ… Embedded {len(all_chunks)} chunks.") return collection, embedder # --------------------------- # ๐Ÿ” RAG Search & LLM Answer # --------------------------- def ask(query, model_key): model_id = MODEL_OPTIONS[model_key] try: tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN) model = 
AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32) model.to("cuda" if torch.cuda.is_available() else "cpu") gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1) except Exception as e: return f"โŒ Model loading failed: {e}" results = db.query(query_texts=[query], n_results=TOP_K) chunks = results["documents"][0] context = "\n\n".join(chunks) prompt = f"Answer this using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:" try: res = gen(prompt, max_new_tokens=300, do_sample=False)[0]['generated_text'] return res.split("Answer:", 1)[-1].strip() except Exception as e: return f"โŒ LLM failed: {e}" # --------------------------- # โ–ถ๏ธ UI # --------------------------- db, embedder = embed_all() with gr.Blocks() as demo: gr.Markdown("## ๐Ÿง  SmartManuals-AI โ€” Ask Your PDF and Word Docs") with gr.Row(): qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I calibrate SE3 console?") model_pick = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose a Model", value="Mistral 7B") answer = gr.Textbox(label="Answer", lines=8) ask_btn = gr.Button("Ask") ask_btn.click(fn=ask, inputs=[qbox, model_pick], outputs=[answer]) demo.launch()
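# ---------------------------
# 🗒️ Assumed Space dependencies
# ---------------------------
# A minimal requirements.txt sketch inferred from the imports above; this list is
# not part of the original file and version pins are left as an exercise:
#   gradio
#   torch
#   transformers
#   sentence-transformers
#   chromadb
#   nltk
#   pymupdf        # imported as `fitz`
#   python-docx
#   pillow
#   tqdm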