Spaces:

damoojeje
/

SmartManuals-AI

Running

App Files Files Community

damoojeje committed 17 days ago

Commit

57fff59

verified ·

1 Parent(s): d06b252

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -100

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import fitz
 import json
@@ -6,100 +5,85 @@ import gradio as gr
 import pytesseract
 import chromadb
 import torch
-import asyncio
-import docx2txt
 import nltk
 import traceback
 from PIL import Image
 from io import BytesIO
 from tqdm import tqdm
-from transformers import (
-    pipeline,
-    AutoModelForCausalLM,
-    AutoTokenizer
-)
 from sentence_transformers import SentenceTransformer, util
 from nltk.tokenize import sent_tokenize
-# Ensure punkt is available
 try:
     nltk.data.find("tokenizers/punkt")
 except LookupError:
     nltk.download("punkt")
-# ---------------- Config ----------------
 MANUALS_DIR = "Manuals"
 CHROMA_PATH = "chroma_store"
 COLLECTION_NAME = "manual_chunks"
 CHUNK_SIZE = 750
 CHUNK_OVERLAP = 100
 MAX_CONTEXT_CHUNKS = 3
-MODELS = {
-    "LLaMA 3 (8B)": "meta-llama/Llama-3.1-8B-Instruct",
-    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
-    "Gemma 2B": "google/gemma-1.1-2b-it",
-    "LLaMA 4 (Scout 17B)": "meta-llama/Llama-4-Scout-17B-16E",
-    "Qwen 30B": "Qwen/Qwen3-30B-A3B"
-}
-HF_TOKEN = os.environ.get("HF_TOKEN")
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# ---------------- Utils ----------------
 def clean(text):
     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
 def split_sentences(text):
     try:
         return sent_tokenize(text)
-    except Exception as e:
-        print("[Tokenizer Error]", e)
         return text.split(". ")
-def split_into_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
     chunks = []
-    current_chunk, current_len = [], 0
-    for sentence in sentences:
-        words = sentence.split()
-        if current_len + len(words) > max_tokens and current_chunk:
             chunks.append(" ".join(current_chunk))
             current_chunk = current_chunk[-overlap:]
-            current_len = sum(len(s.split()) for s in current_chunk)
-        current_chunk.append(sentence)
-        current_len += len(words)
     if current_chunk:
         chunks.append(" ".join(current_chunk))
     return chunks
-def extract_pdf_text(pdf_path):
-    text_chunks = []
     try:
-        doc = fitz.open(pdf_path)
         for i, page in enumerate(doc):
             text = page.get_text().strip()
             if not text:
-                pix = page.get_pixmap(dpi=300)
-                img = Image.open(BytesIO(pix.tobytes("png")))
                 text = pytesseract.image_to_string(img)
-            text_chunks.append((pdf_path, i + 1, clean(text)))
     except Exception as e:
-        print("❌ Error reading PDF:", pdf_path, e)
-    return text_chunks
-def extract_docx_text(docx_path):
     try:
-        text = clean(docx2txt.process(docx_path))
-        return [(docx_path, 1, text)]
     except Exception as e:
-        print("❌ Error reading DOCX:", docx_path, e)
         return []
-# ---------------- Background Embed ----------------
 def embed_all():
     embedder = SentenceTransformer("all-MiniLM-L6-v2")
     embedder.eval()
@@ -111,9 +95,8 @@ def embed_all():
         pass
     collection = client.get_or_create_collection(COLLECTION_NAME)
-    chunks, ids, metas = [], [], []
-    idx = 0
-    print("📄 Scanning Manuals folder...")
     for fname in os.listdir(MANUALS_DIR):
         fpath = os.path.join(MANUALS_DIR, fname)
@@ -124,80 +107,77 @@ def embed_all():
         else:
             continue
-        for filepath, page, text in pages:
-            sentences = split_sentences(text)
-            subchunks = split_into_chunks(sentences)
-            for i, subchunk in enumerate(subchunks):
-                chunks.append(subchunk)
-                ids.append(f"{fname}::{page}::{i}")
                 metas.append({"source": fname, "page": page})
-                if len(chunks) >= 16:
-                    embs = embedder.encode(chunks).tolist()
-                    collection.add(documents=chunks, ids=ids, metadatas=metas, embeddings=embs)
-                    chunks, ids, metas = [], [], []
-    if chunks:
-        embs = embedder.encode(chunks).tolist()
-        collection.add(documents=chunks, ids=ids, metadatas=metas, embeddings=embs)
     print(f"✅ Embedded {len(ids)} chunks.")
     return collection, embedder
-# ---------------- Model Loader ----------------
-def load_model(model_id):
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
-    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, token=HF_TOKEN)
     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
     return pipe, tokenizer
-# ---------------- Query ----------------
-def query_llm(context, question, pipe, tokenizer):
-    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You are a helpful assistant. Use only the following context to answer. If uncertain, say: 'I don't know.'
 {context}
-<|start_header_id|>user<|end_header_id|>
-{question}
-<|start_header_id|>assistant<|end_header_id|>
-"""
-    out = pipe(prompt, max_new_tokens=512)[0]["generated_text"]
-    return out.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
-def answer_question(question, model_choice):
     try:
-        model_id = MODELS[model_choice]
-        pipe, tokenizer = load_model(model_id)
         query_emb = embedder.encode(question, convert_to_tensor=True)
         results = db.query(query_texts=[question], n_results=MAX_CONTEXT_CHUNKS)
-        context_chunks = results["documents"][0]
-        context = "\n\n".join(context_chunks)
-        answer = query_llm(context, question, pipe, tokenizer)
-        return answer
     except Exception as e:
-        traceback.print_exc()
-        return f"❌ Error: {str(e)}"
-# ---------------- Run App ----------------
 with gr.Blocks() as demo:
-    gr.Markdown("### 📘 Ask Questions About Your Manuals")
-    model_choice = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value="LLaMA 3 (8B)")
-    question = gr.Textbox(label="Your Question", placeholder="e.g. How do I reset the treadmill?")
-    submit = gr.Button("🔍 Get Answer")
-    answer = gr.Textbox(label="Answer", lines=10)
-    submit.click(fn=answer_question, inputs=[question, model_choice], outputs=answer)
-# Run background embed on startup
 try:
     db, embedder = embed_all()
 except Exception as e:
-    print("❌ Failed to embed docs:", e)
     db, embedder = None, None
-# Only launch if in HF Space
 if __name__ == "__main__":
     demo.launch()

 import os
 import fitz
 import json
 import pytesseract
 import chromadb
 import torch
 import nltk
 import traceback
+import docx2txt
 from PIL import Image
 from io import BytesIO
 from tqdm import tqdm
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer, util
 from nltk.tokenize import sent_tokenize
+# Ensure punkt is downloaded
 try:
     nltk.data.find("tokenizers/punkt")
 except LookupError:
     nltk.download("punkt")
+# Configuration
+HF_TOKEN = os.getenv("HF_TOKEN")
 MANUALS_DIR = "Manuals"
 CHROMA_PATH = "chroma_store"
 COLLECTION_NAME = "manual_chunks"
 CHUNK_SIZE = 750
 CHUNK_OVERLAP = 100
 MAX_CONTEXT_CHUNKS = 3
+MODEL_ID = "ibm-granite/granite-vision-3.2-2b"
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# ---------------- Text Helpers ----------------
 def clean(text):
     """Normalize whitespace line by line.

     Each line of *text* is stripped of leading and trailing whitespace,
     lines that end up empty are dropped, and the remaining lines are
     joined back together with a single newline between them.
     """
     stripped_lines = (raw.strip() for raw in text.splitlines())
     return "\n".join(kept for kept in stripped_lines if kept)
 def split_sentences(text):
     """Split *text* into sentences.

     Tries NLTK's ``sent_tokenize`` first. If the tokenizer raises for any
     reason, a warning is printed and the text is split naively on ". ".

     Args:
         text: the text to split.

     Returns:
         list[str]: the sentences in document order.
     """
     try:
         return sent_tokenize(text)
     except Exception as e:
         # Catch Exception rather than using a bare ``except`` so that
         # KeyboardInterrupt / SystemExit still propagate, and report the
         # cause so a broken tokenizer setup is visible in the logs.
         print("⚠️ Tokenizer fallback: simple split.", e)
         return text.split(". ")
def split_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Group sentences into overlapping chunks bounded by a word budget.

    Sizes are measured in whitespace-separated words. Sentences are
    appended to the current chunk until adding the next one would exceed
    ``max_tokens``; the chunk is then emitted and the next chunk is seeded
    with the trailing sentences of the previous one, up to ``overlap``
    words in total.

    Args:
        sentences: iterable of sentence strings, in document order.
        max_tokens: maximum number of words per chunk. A single sentence
            longer than this still becomes (part of) a chunk.
        overlap: maximum number of words carried over from the end of one
            chunk into the start of the next. Values <= 0 disable overlap.

    Returns:
        list[str]: the chunks, each the space-joined text of its sentences.
    """
    chunks = []
    current_chunk, length = [], 0
    for sent in sentences:
        n_words = len(sent.split())
        if length + n_words > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            # Carry over whole trailing sentences while they fit in the
            # overlap budget. The budget is in words, not sentences:
            # slicing the last `overlap` *sentences* would keep the entire
            # chunk and the window would never shrink.
            carried, carried_len = [], 0
            for prev in reversed(current_chunk):
                prev_len = len(prev.split())
                if carried_len + prev_len > overlap:
                    break
                carried.append(prev)
                carried_len += prev_len
            carried.reverse()
            current_chunk, length = carried, carried_len
        current_chunk.append(sent)
        length += n_words
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
+# ---------------- File Readers ----------------
def extract_pdf_text(path):
    """Extract text from every page of a PDF, with OCR for text-less pages.

    Args:
        path: filesystem path of the PDF to read.

    Returns:
        list[tuple]: one ``(path, page_number, text)`` tuple per page, with
        1-based page numbers and text passed through ``clean``. Errors are
        printed rather than raised; if reading fails part-way, the pages
        collected so far are returned (an empty list if none were read).
    """
    chunks = []
    try:
        # Open the document as a context manager so it is closed even when
        # rendering or OCR raises on some page.
        with fitz.open(path) as doc:
            for i, page in enumerate(doc):
                text = page.get_text().strip()
                if not text:
                    # No extractable text on this page: render it at
                    # 300 dpi and run Tesseract OCR on the image.
                    png_bytes = page.get_pixmap(dpi=300).tobytes("png")
                    img = Image.open(BytesIO(png_bytes))
                    text = pytesseract.image_to_string(img)
                chunks.append((path, i + 1, clean(text)))
    except Exception as e:
        print("❌ PDF read error:", path, e)
    return chunks
def extract_docx_text(path):
    """Read a DOCX file and return its cleaned text as one pseudo-page.

    Args:
        path: filesystem path of the DOCX file.

    Returns:
        list[tuple]: ``[(path, 1, text)]`` on success, where ``text`` is the
        document text passed through ``clean``; an empty list if reading
        fails (the error is printed, not raised).
    """
    try:
        raw_text = docx2txt.process(path)
        cleaned = clean(raw_text)
    except Exception as e:
        print("❌ DOCX read error:", path, e)
        return []
    # The whole document is reported as page 1.
    return [(path, 1, cleaned)]
+# ---------------- Embedding ----------------
 def embed_all():
     embedder = SentenceTransformer("all-MiniLM-L6-v2")
     embedder.eval()
         pass
     collection = client.get_or_create_collection(COLLECTION_NAME)
+    docs, ids, metas = [], [], []
+    print("📄 Processing manuals...")
     for fname in os.listdir(MANUALS_DIR):
         fpath = os.path.join(MANUALS_DIR, fname)
         else:
             continue
+        for path, page, text in pages:
+            for i, chunk in enumerate(split_chunks(split_sentences(text))):
+                chunk_id = f"{fname}::{page}::{i}"
+                docs.append(chunk)
+                ids.append(chunk_id)
                 metas.append({"source": fname, "page": page})
+                if len(docs) >= 16:
+                    embs = embedder.encode(docs).tolist()
+                    collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
+                    docs, ids, metas = [], [], []
+    if docs:
+        embs = embedder.encode(docs).tolist()
+        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
     print(f"✅ Embedded {len(ids)} chunks.")
     return collection, embedder
+# ---------------- Model Setup ----------------
def load_model():
    """Load the tokenizer and model for ``MODEL_ID`` and wrap them in a pipeline.

    On a CUDA machine the model is loaded in bfloat16 with
    ``device_map="auto"`` and the pipeline is created without a ``device``
    argument. On CPU the model is loaded in float32 and the pipeline is
    pinned to ``device=-1``.

    Returns:
        tuple: ``(pipe, tokenizer)`` - the text-generation pipeline and the
        tokenizer it uses.
    """
    # NOTE(review): confirm that AutoModelForCausalLM supports the checkpoint
    # named by MODEL_ID; if it is a multimodal checkpoint, a different Auto
    # class may be required.
    use_cuda = torch.cuda.is_available()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)

    model_kwargs = {
        "token": HF_TOKEN,
        "torch_dtype": torch.bfloat16 if use_cuda else torch.float32,
    }
    if use_cuda:
        # Placement is delegated to accelerate. Such a model must not be
        # moved again with .to(), nor handed to pipeline() together with an
        # explicit device.
        model_kwargs["device_map"] = "auto"
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)

    pipe_kwargs = {} if use_cuda else {"device": -1}
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **pipe_kwargs)
    return pipe, tokenizer
def ask_model(question, context, pipe, tokenizer):
    """Ask the language model *question*, restricted to *context*.

    Args:
        question: the user's question.
        context: retrieved manual text the answer must be based on.
        pipe: callable invoked as ``pipe(prompt, max_new_tokens=512)`` and
            returning a list whose first item has a ``"generated_text"`` key.
        tokenizer: accepted for interface compatibility; not used here.

    Returns:
        str: the generated answer with surrounding whitespace removed.
    """
    prompt = f"""Use only the following context to answer. If uncertain, say "I don't know."
<context>
{context}
</context>
Q: {question}
A:"""
    output = pipe(prompt, max_new_tokens=512)[0]["generated_text"]
    if output.startswith(prompt):
        # The pipeline echoed the prompt: drop exactly that prefix. Splitting
        # on "A:" instead would cut off any answer that itself contains
        # "A:" (e.g. "Option A: ...").
        answer = output[len(prompt):]
    else:
        # Prompt was not echoed verbatim; fall back to the text after the
        # last "A:" marker.
        answer = output.split("A:")[-1]
    return answer.strip()
+# ---------------- Query ----------------
def get_answer(question):
    """Answer *question* from the indexed manuals.

    The question is embedded with the module-level ``embedder``, the
    ``MAX_CONTEXT_CHUNKS`` nearest chunks are fetched from ``db``, and the
    joined chunks are passed with the question to ``ask_model``.

    Args:
        question: the text typed by the user.

    Returns:
        str: the model's answer, or a human-readable message when the
        question is blank or something fails. This function does not raise.
    """
    if not question or not question.strip():
        return "Please enter a question."
    try:
        if db is None or embedder is None or model_pipe is None:
            # Startup assigns None to these when embedding or model loading
            # failed; say so instead of surfacing an AttributeError.
            return "Error: the manuals index or the model failed to load at startup."
        # Query with an embedding from the same SentenceTransformer that
        # produced the stored vectors, rather than passing raw text and
        # letting the collection embed it with its own embedding function.
        query_emb = embedder.encode(question).tolist()
        results = db.query(query_embeddings=[query_emb], n_results=MAX_CONTEXT_CHUNKS)
        context = "\n\n".join(results["documents"][0])
        return ask_model(question, context, model_pipe, model_tokenizer)
    except Exception as e:
        print("❌ Query error:", e)
        return f"Error: {e}"
+# ---------------- UI ----------------
 with gr.Blocks() as demo:
+    gr.Markdown("## 🤖 SmartManuals-AI (Granite 3.2-2B)")
+    with gr.Row():
+        question = gr.Textbox(label="Ask your question")
+        ask = gr.Button("Ask")
+    answer = gr.Textbox(label="Answer", lines=8)
+    ask.click(fn=get_answer, inputs=question, outputs=answer)
+# Embed + Load Model at Startup
 try:
     db, embedder = embed_all()
+    model_pipe, model_tokenizer = load_model()
 except Exception as e:
+    print("❌ Startup failure:", e)
     db, embedder = None, None
+    model_pipe, model_tokenizer = None, None
 if __name__ == "__main__":
     demo.launch()