Spaces:

damoojeje
/

SmartManuals-AI

Running

App Files Files Community

damoojeje committed about 1 month ago

Commit

6728736

verified ·

1 Parent(s): e6fa21e

Update app.py

Browse files

Files changed (1) hide show

app.py +149 -118

app.py CHANGED Viewed

@@ -1,156 +1,187 @@
-# ✅ SmartManuals-AI app.py (for Hugging Face Spaces)
-# Optimized to support multiple LLMs, Gradio UI, and secure on-device document QA
 import os
 import json
-import io
-import fitz
 import nltk
 import chromadb
 import pytesseract
-import numpy as np
-import torch
 from PIL import Image
 from tqdm import tqdm
 from nltk.tokenize import sent_tokenize
 from sentence_transformers import SentenceTransformer, util
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-import gradio as gr
-# ----------------------
-# 🔧 Configurations
-# ----------------------
-PDF_DIR = "./Manuals"
-CHROMA_PATH = "./chroma_store"
-COLLECTION_NAME = "manual_chunks"
 MAX_CONTEXT_CHUNKS = 3
-CHUNK_SIZE = 750
-CHUNK_OVERLAP = 100
-MODEL_OPTIONS = [
-    "meta-llama/Llama-3.1-8B-Instruct",
-    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    "google/gemma-1.1-7b-it",
-    "Qwen/Qwen1.5-14B-Chat",
-    "mistralai/Mistral-7B-Instruct-v0.3"
-]
 HF_TOKEN = os.environ.get("HF_TOKEN")
-# ----------------------
-# 📚 NLTK Setup
-# ----------------------
-try:
-    nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt')
-# ----------------------
-# 📄 Utility Functions
-# ----------------------
-def extract_text_or_ocr(page):
-    text = page.get_text().strip()
-    if text:
-        return text, False
-    pix = page.get_pixmap(dpi=300)
-    img_data = pix.tobytes("png")
-    img = Image.open(io.BytesIO(img_data))
-    return pytesseract.image_to_string(img).strip(), True
 def clean_text(text):
     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
 def tokenize_sentences(text):
     return sent_tokenize(text)
-def split_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
-    chunks, chunk, length = [], [], 0
     for sentence in sentences:
-        count = len(sentence.split())
-        if length + count > max_tokens and chunk:
-            chunks.append(" ".join(chunk))
-            chunk = chunk[-overlap:]
-            length = sum(len(s.split()) for s in chunk)
-        chunk.append(sentence)
-        length += count
-    if chunk: chunks.append(" ".join(chunk))
     return chunks
-def extract_metadata(filename):
     name = filename.lower().replace("_", " ").replace("-", " ")
     meta = {"model": "unknown", "doc_type": "unknown", "brand": "life fitness"}
-    if "om" in name or "owner" in name: meta["doc_type"] = "owner manual"
-    elif "sm" in name or "service" in name: meta["doc_type"] = "service manual"
     elif "assembly" in name: meta["doc_type"] = "assembly instructions"
     elif "alert" in name: meta["doc_type"] = "installer alert"
     elif "parts" in name: meta["doc_type"] = "parts manual"
-    elif "bulletin" in name: meta["doc_type"] = "service bulletin"
-    for kw in ["se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl", "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"]:
-        if kw.replace(" ", "") in name.replace(" ", ""): meta["model"] = kw
     return meta
-# ----------------------
-# 🧠 Load LLM
-# ----------------------
-def load_llm(model_id):
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
-    model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, torch_dtype=torch.float32)
-    return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
-# ----------------------
-# 🧠 Chroma + Embed
-# ----------------------
-def embed_pdfs():
-    os.makedirs(CHROMA_PATH, exist_ok=True)
-    client = chromadb.PersistentClient(path=CHROMA_PATH)
-    if COLLECTION_NAME in [c.name for c in client.list_collections()]:
-        client.delete_collection(COLLECTION_NAME)
-    collection = client.create_collection(COLLECTION_NAME)
-    embedder = SentenceTransformer("all-MiniLM-L6-v2")
-    for file in tqdm(os.listdir(PDF_DIR)):
-        if not file.lower().endswith(".pdf"): continue
-        doc = fitz.open(os.path.join(PDF_DIR, file))
-        meta = extract_metadata(file)
-        for page_num, page in enumerate(doc, 1):
-            text, _ = extract_text_or_ocr(page)
-            if not text.strip(): continue
             sents = tokenize_sentences(clean_text(text))
-            chunks = split_chunks(sents)
             for i, chunk in enumerate(chunks):
-                chunk_id = f"{file}::p{page_num}::c{i}"
-                emb = embedder.encode([chunk])[0].tolist()
-                collection.add(
-                    documents=[chunk],
-                    ids=[chunk_id],
-                    embeddings=[emb],
-                    metadatas=[{**meta, "source_file": file, "page": page_num}]
-                )
     return collection, embedder
-# ----------------------
-# 🔍 RAG Pipeline
-# ----------------------
-def answer_query(q, model_id):
-    collection, embedder = embed_pdfs()
-    pipe = load_llm(model_id)
-    emb_q = embedder.encode([q])[0].tolist()
-    results = collection.query(query_embeddings=[emb_q], n_results=MAX_CONTEXT_CHUNKS)
-    context = "\n\n".join(results['documents'][0])
-    prompt = f"Use the context below to answer the question.\nContext:\n{context}\n\nQuestion: {q}\nAnswer:"
-    return pipe(prompt)[0]['generated_text'].split("Answer:")[-1].strip()
-# ----------------------
-# 🚀 Gradio UI
-# ----------------------
-with gr.Blocks() as app:
-    gr.Markdown("""# SmartManuals-AI
-**Local-first document QA** powered by OCR, ChromaDB & your choice of LLM (via Hugging Face).
-""")
     with gr.Row():
-        question = gr.Textbox(placeholder="Ask a question from the manuals...", label="Question")
-        model_choice = gr.Dropdown(label="Choose Model", choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0])
-    output = gr.Textbox(label="Answer", lines=10)
-    run = gr.Button("Run RAG")
-    run.click(fn=answer_query, inputs=[question, model_choice], outputs=output)
-if __name__ == "__main__":
-    app.launch()

+# ✅ app.py (SmartManuals-AI)
+# Hugging Face Space-ready app with multi-model support, PDF upload, and live progress feedback
 import os
 import json
+import fitz  # PyMuPDF
 import nltk
 import chromadb
+import tempfile
+import shutil
 import pytesseract
+import gradio as gr
 from PIL import Image
 from tqdm import tqdm
 from nltk.tokenize import sent_tokenize
 from sentence_transformers import SentenceTransformer, util
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+# ---------------------------
+# 🔧 CONFIG
+# ---------------------------
# Folder into which uploaded PDFs are copied before they are parsed.
pdf_folder = "Manuals"
# JSONL dump of the extracted chunks (one JSON object per line).
output_jsonl_chunks = "chunks.jsonl"
# On-disk location and collection name of the Chroma vector store.
chroma_path = "./chroma_store"
collection_name = "manual_chunks"
# Chunking parameters handed to split_into_chunks (size is counted in
# whitespace-separated words).
chunk_size = 750
chunk_overlap = 100
# Number of retrieved chunks that are placed into the LLM prompt.
MAX_CONTEXT_CHUNKS = 3
# Hugging Face access token; None when the environment variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
# UI label -> Hugging Face Hub repository id.
MODEL_MAP = {
    "LLaMA 3 (8B)": "meta-llama/Meta-Llama-3-8B-Instruct",
    # The Llama 4 repositories do not carry the "Meta-" prefix.
    "LLaMA 4 Scout (17B)": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "Gemma 3 (27B)": "google/gemma-3-27b-it",
    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen3 (30B)": "Qwen/Qwen3-30B-A3B",
}
+# ---------------------------
+# 📥 UTILITIES
+# ---------------------------
 def clean_text(text):
     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
 def tokenize_sentences(text):
+    nltk.download('punkt', quiet=True)
     return sent_tokenize(text)
def _overlap_tail(chunk_sentences, budget):
    """Return the trailing sentences whose combined word count fits *budget*.

    Gives back ``(sentences, word_count)``; the sentences keep their
    original order.
    """
    kept, used = [], 0
    for sentence in reversed(chunk_sentences):
        words = len(sentence.split())
        if used + words > budget:
            break
        kept.append(sentence)
        used += words
    kept.reverse()
    return kept, used


def split_into_chunks(sentences, max_tokens=750, overlap=100):
    """Group sentences into chunks of roughly ``max_tokens`` words.

    Args:
        sentences: Iterable of sentence strings.
        max_tokens: Word budget per chunk ("tokens" are whitespace-separated
            words). A single sentence longer than the budget is kept whole
            and becomes an oversized chunk rather than being split.
        overlap: Maximum number of words repeated from the end of one chunk
            at the start of the next. Only whole sentences are carried over.

    Returns:
        List of chunk strings, each the sentences joined by single spaces.
    """
    # The overlap is measured in words, like max_tokens. Slicing the last
    # ``overlap`` sentences instead would keep the entire chunk for
    # overlap=0 ([-0:] is the full list) and for any realistic sentence
    # count, which produces near-duplicate chunks.
    overlap = max(0, min(overlap, max_tokens - 1))
    chunks, current_chunk, current_len = [], [], 0
    for sentence in sentences:
        token_count = len(sentence.split())
        if current_len + token_count > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_len = _overlap_tail(current_chunk, overlap)
        current_chunk.append(sentence)
        current_len += token_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def extract_metadata_from_filename(filename):
    """Derive document metadata from a manual's file name.

    The name is lower-cased and ``_``/``-`` are treated as spaces.

    Returns:
        Dict with the keys ``model``, ``doc_type`` and ``brand``. ``model``
        and ``doc_type`` stay ``"unknown"`` when nothing matches; ``brand``
        is always ``"life fitness"``.
    """
    name = filename.lower().replace("_", " ").replace("-", " ")
    meta = {"model": "unknown", "doc_type": "unknown", "brand": "life fitness"}
    # NOTE(review): these are plain substring tests, so "om"/"sm" also match
    # inside unrelated words (e.g. "custom", "smart").
    if "om" in name:
        meta["doc_type"] = "owner manual"
    elif "sm" in name:
        meta["doc_type"] = "service manual"
    elif "assembly" in name:
        meta["doc_type"] = "assembly instructions"
    elif "alert" in name:
        meta["doc_type"] = "installer alert"
    elif "parts" in name:
        meta["doc_type"] = "parts manual"
    # Order is priority: a specific variant must precede its prefix
    # ("se3hd" before "se3").
    known_models = ["se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl", "everest", "engage"]
    compact_name = name.replace(" ", "")
    for model in known_models:
        if model.replace(" ", "") in compact_name:
            meta["model"] = model
            # Stop at the first hit; otherwise "se3" would overwrite "se3hd".
            break
    return meta
def extract_text_with_ocr(page):
    """Return the text of a PDF page, falling back to OCR when it has none.

    Args:
        page: Object providing ``get_text()`` and ``get_pixmap(dpi=...)``
            (the caller passes pages obtained by iterating ``fitz.open``).

    Returns:
        The stripped embedded text if there is any; otherwise the stripped
        Tesseract output for a 300 dpi rendering of the page.
    """
    import io

    text = page.get_text().strip()
    if text:
        return text
    pix = page.get_pixmap(dpi=300)
    img_data = pix.tobytes("png")
    # Wrap the PNG bytes in a buffer that already holds them. Opening an
    # empty temporary file and writing to it afterwards fails, because
    # Image.open needs the image header at open time.
    with Image.open(io.BytesIO(img_data)) as img:
        return pytesseract.image_to_string(img).strip()
+# ---------------------------
+# 🧠 EMBEDDING + CHROMA
+# ---------------------------
def embed_pdfs_from_uploaded(files, progress=gr.Progress(track_tqdm=True)):
    """Parse the uploaded PDFs, chunk their text and index it in Chroma.

    Each upload is copied into ``pdf_folder``, read page by page (with OCR
    fallback), cleaned, split into sentences and grouped into chunks. All
    chunks are written to ``output_jsonl_chunks`` and then embedded in
    batches of 16 into a freshly recreated Chroma collection.

    Args:
        files: Uploaded files, either path strings or objects exposing the
            path as ``.name``. ``None`` is treated as "no files".
        progress: Gradio progress tracker; not used directly in the body.

    Returns:
        Tuple ``(collection, embedder)``: the Chroma collection and the
        SentenceTransformer used to fill it.
    """
    os.makedirs(pdf_folder, exist_ok=True)
    temp_chunks = []
    for file in files or []:
        # Depending on the Gradio version/config the upload arrives as a
        # plain path string or as a tempfile-like wrapper with `.name`.
        src = getattr(file, "name", file)
        filename = os.path.basename(src)
        dst = os.path.join(pdf_folder, filename)
        shutil.copy(src, dst)
        meta = extract_metadata_from_filename(filename)
        # Context manager so the document handle is released per file.
        with fitz.open(dst) as doc:
            for page_num, page in enumerate(doc, start=1):
                text = extract_text_with_ocr(page)
                sents = tokenize_sentences(clean_text(text))
                chunks = split_into_chunks(sents, chunk_size, chunk_overlap)
                for i, chunk in enumerate(chunks):
                    temp_chunks.append({
                        "chunk_id": f"{filename}::page_{page_num}::chunk_{i+1}",
                        "source_file": filename,
                        "page": page_num,
                        "text": chunk,
                        **meta
                    })
    with open(output_jsonl_chunks, "w", encoding="utf-8") as f:
        for c in temp_chunks:
            json.dump(c, f)
            f.write("\n")
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=chroma_path)
    # Start from an empty collection on every run.
    if collection_name in [c.name for c in client.list_collections()]:
        client.delete_collection(collection_name)
    collection = client.create_collection(collection_name)
    for start in tqdm(range(0, len(temp_chunks), 16)):
        batch = temp_chunks[start:start + 16]
        texts = [b["text"] for b in batch]
        metadatas = list(batch)
        ids = [b["chunk_id"] for b in batch]
        embeddings = embedder.encode(texts).tolist()
        collection.add(documents=texts, ids=ids, metadatas=metadatas, embeddings=embeddings)
    return collection, embedder
+# ---------------------------
+# 🤖 LLM INFERENCE
+# ---------------------------
def load_llm(model_key):
    """Load tokenizer, model and text-generation pipeline for a UI label.

    Args:
        model_key: A key of ``MODEL_MAP``.

    Returns:
        Tuple ``(tokenizer, model, pipe)``, or ``(None, None, None)`` when
        the key is unknown, ``HF_TOKEN`` is not set, or loading fails.
    """
    import logging

    model_id = MODEL_MAP.get(model_key)
    if not model_id or not HF_TOKEN:
        return None, None, None
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, device_map="auto")
    except (OSError, ValueError, ImportError):
        # Report failures through the same None-triple the guard above
        # uses, so the caller can show its "could not be loaded" message
        # instead of crashing the request.
        logging.getLogger(__name__).exception("Failed to load model %s", model_id)
        return None, None, None
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
    return tokenizer, model, pipe
def generate_answer(pipe, tokenizer, context, query):
    """Ask the model to answer *query* using only *context*.

    Args:
        pipe: Text-generation pipeline; called with the prompt and expected
            to return ``[{"generated_text": ...}]``.
        tokenizer: Tokenizer providing ``apply_chat_template``.
        context: Retrieved manual text placed in the user message.
        query: The user's question.

    Returns:
        The complete generated answer, stripped of surrounding whitespace.
    """
    messages = [
        {"role": "system", "content": "You are an expert manual assistant. Answer accurately using only the context."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Ask for the continuation only. Taking the last line of the full text
    # would cut every multi-line answer down to its final line.
    output = pipe(prompt, return_full_text=False)[0]["generated_text"]
    # Fallback in case the pipeline still echoes the prompt.
    if output.startswith(prompt):
        output = output[len(prompt):]
    return output.strip()
+# ---------------------------
+# 🎯 FULL PIPELINE
+# ---------------------------
def rag_pipeline(query, model_key, files):
    """Index the uploaded manuals, retrieve context and answer *query*.

    Args:
        query: The user's question.
        model_key: A key of ``MODEL_MAP`` selecting the LLM.
        files: Uploaded PDF files as delivered by the Gradio file input.

    Returns:
        The generated answer, or a short message when the question or the
        uploads are missing, nothing was retrieved, or the model could not
        be loaded.
    """
    if not query or not query.strip():
        return "Please enter a question."
    if not files:
        return "Please upload at least one PDF manual."
    collection, embedder = embed_pdfs_from_uploaded(files)
    if collection.count() == 0:
        return "No matches found."
    # Query with the same embedder that built the index, so query and
    # documents share one vector space.
    query_embedding = embedder.encode(query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=MAX_CONTEXT_CHUNKS)
    # "documents" holds one list per query; an empty hit list is [[]],
    # which is truthy, so the inner list has to be inspected.
    documents = results.get("documents") or [[]]
    matches = documents[0]
    if not matches:
        return "No matches found."
    context = "\n\n".join(matches)
    tokenizer, _model, pipe = load_llm(model_key)
    if pipe is None:
        return "Model could not be loaded."
    return generate_answer(pipe, tokenizer, context, query)
+# ---------------------------
+# 🖥️ GRADIO UI
+# ---------------------------
# Build the UI: PDF upload, question box, model selector and answer box.
with gr.Blocks() as demo:
    gr.Markdown("""# 🧠 SmartManuals-AI with Multi-Model RAG
Upload your PDF manuals and ask smart questions. Choose your preferred LLM.""")
    with gr.Row():
        file_upload = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload Manuals")
    with gr.Row():
        query_box = gr.Textbox(label="Question")
        model_selector = gr.Dropdown(label="Choose Model", choices=list(MODEL_MAP.keys()), value="LLaMA 3 (8B)")
    submit_btn = gr.Button("Run Query")
    answer_box = gr.Textbox(label="Answer", lines=8)
    # The button runs the full pipeline: index uploads, retrieve, generate.
    submit_btn.click(fn=rag_pipeline, inputs=[query_box, model_selector, file_upload], outputs=[answer_box])

# Launch only when executed as a script, so importing this module (for
# tests or tooling) does not start a web server.
if __name__ == "__main__":
    demo.launch()