damoojeje committed on
Commit
8ab0a40
·
verified ·
1 Parent(s): 43b8a1d

Update app.py

Files changed (1)
  1. app.py +110 -125
app.py CHANGED
@@ -1,156 +1,141 @@
  import os
- import fitz # PyMuPDF
- import docx
- import io
  import json
- import gradio as gr
  import pytesseract
  from PIL import Image
- from tqdm import tqdm
- import chromadb
- import torch
  import nltk
- from sentence_transformers import SentenceTransformer, util
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-
- # ----------------------------
- # ✅ Ensure nltk punkt is available
- # ----------------------------
- try:
-     nltk.data.find("tokenizers/punkt")
- except LookupError:
-     nltk.download("punkt")
-
  from nltk.tokenize import sent_tokenize

- # ----------------------------
- # ⚙️ Config
- # ----------------------------
- MANUAL_DIR = "./Manuals"
- CHROMA_DIR = "./chroma_store"
- CHUNK_SIZE = 750
- CHUNK_OVERLAP = 100
- MAX_CONTEXT = 3
-
- DEFAULT_MODEL = "meta-llama/Llama-3-8b-Instruct"
- MODEL_OPTIONS = [
-     "meta-llama/Llama-3-8b-Instruct",
-     "mistralai/Mistral-7B-Instruct-v0.3",
-     "google/gemma-1.1-7b-it"
- ]
-
- HF_TOKEN = os.environ.get("HF_TOKEN")
-
- # ----------------------------
- # 🔍 Utility functions
- # ----------------------------
- def extract_pdf_text(path):
-     text_blocks = []
-     doc = fitz.open(path)
-     for i, page in enumerate(doc):
-         text = page.get_text()
-         if not text.strip():
-             img = Image.open(io.BytesIO(page.get_pixmap().tobytes("png")))
-             text = pytesseract.image_to_string(img)
-         text_blocks.append({"page": i + 1, "text": text})
-     return text_blocks
-
- def extract_docx_text(path):
-     doc = docx.Document(path)
-     full_text = "\n".join([para.text for para in doc.paragraphs])
-     return [{"page": 1, "text": full_text}]
-
  def split_sentences(text):
      try:
          return sent_tokenize(text)
-     except Exception:
          return text.split(". ")

- def chunk_text(sentences):
-     chunks = []
-     current = []
-     count = 0
-     for sentence in sentences:
-         tokens = sentence.split()
-         if count + len(tokens) > CHUNK_SIZE:
-             chunks.append(" ".join(current))
-             current = current[-CHUNK_OVERLAP:]
-             count = sum(len(s.split()) for s in current)
-         current.append(sentence)
-         count += len(tokens)
-     if current:
-         chunks.append(" ".join(current))
-     return chunks
-
  def embed_all():
-     client = chromadb.PersistentClient(path=CHROMA_DIR)
-     if "manual_chunks" in [c.name for c in client.list_collections()]:
-         client.delete_collection("manual_chunks")
-     collection = client.create_collection("manual_chunks")
      embedder = SentenceTransformer("all-MiniLM-L6-v2")

-     for fname in os.listdir(MANUAL_DIR):
-         fpath = os.path.join(MANUAL_DIR, fname)
          if fname.lower().endswith(".pdf"):
              pages = extract_pdf_text(fpath)
-         elif fname.lower().endswith(".docx"):
-             pages = extract_docx_text(fpath)
-         else:
-             continue
-
-         for page in pages:
-             sents = split_sentences(page["text"])
-             chunks = chunk_text(sents)
-             for idx, chunk in enumerate(chunks):
-                 cid = f"{fname}::p{page['page']}::c{idx}"
-                 collection.add(documents=[chunk], ids=[cid], metadatas=[{"source": fname, "page": page["page"]}])
-
      return collection, embedder

- def get_model(model_id):
-     tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
-     model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, torch_dtype=torch.float32)
-     return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
-
- def run_query(question, model_name):
-     results = db.query(query_texts=[question], n_results=MAX_CONTEXT)
-     if not results or not results.get("documents"):
-         return "No matching information found."
-
      context = "\n\n".join(results["documents"][0])
-     prompt = f"""
- You are a helpful assistant. Use the following context to answer the question.

- Context:
  {context}
-
- Question: {question}
- Answer:
  """
-     model = get_model(model_name)
-     res = model(prompt, max_new_tokens=300)[0]['generated_text']
-     return res.split("Answer:")[-1].strip()
-
- # ----------------------------
- # ✅ Startup: Embed manuals
- # ----------------------------
- db, embedder = embed_all()
-
- # ----------------------------
- # 🎛️ Gradio UI
- # ----------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("""
-     # 📘 SmartManuals-AI (Docker)
-     Ask any question from the preloaded manuals (PDF + Word).
-     """)

      with gr.Row():
-         question = gr.Textbox(label="Ask a Question")
-         model = gr.Dropdown(choices=MODEL_OPTIONS, value=DEFAULT_MODEL, label="Choose LLM")
-     btn = gr.Button("Ask")
-     answer = gr.Textbox(label="Answer", lines=10)

-     btn.click(fn=run_query, inputs=[question, model], outputs=answer)

- demo.launch(server_name="0.0.0.0", server_port=7860)

  import os
  import json
+ import fitz # PyMuPDF
  import pytesseract
  from PIL import Image
+ import io
  import nltk
+ import chromadb
+ from tqdm import tqdm
+ from sentence_transformers import SentenceTransformer
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+ import gradio as gr

+ # ---------------------------
+ # 📦 Paths and Constants
+ # ---------------------------
+ MANUALS_DIR = "./Manuals"
+ CHROMA_PATH = "./chroma_store"
+ COLLECTION_NAME = "manual_chunks"

+ # Ensure NLTK punkt is available
+ nltk.download("punkt")
  from nltk.tokenize import sent_tokenize

+ # ---------------------------
+ # 🧼 Text cleaning utilities
+ # ---------------------------
+ def clean(text):
+     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

  def split_sentences(text):
      try:
          return sent_tokenize(text)
+     except Exception as e:
+         print("[Tokenizer Error]", e, "\nFalling back to simple split.")
          return text.split(". ")

+ # ---------------------------
+ # 📄 PDF and DOCX extraction
+ # ---------------------------
+ def extract_pdf_text(pdf_path):
+     doc = fitz.open(pdf_path)
+     pages = []
+     for i, page in enumerate(doc):
+         text = page.get_text().strip()
+         if not text:
+             try:
+                 pix = page.get_pixmap(dpi=300)
+                 img = Image.open(io.BytesIO(pix.tobytes("png")))
+                 text = pytesseract.image_to_string(img)
+             except pytesseract.TesseractNotFoundError:
+                 print("❌ Tesseract not found. Skipping OCR for page.")
+                 text = ""
+         pages.append((i + 1, text))
+     return pages
+
+ # ---------------------------
+ # 🧠 Embed text using MiniLM
+ # ---------------------------
  def embed_all():
+     client = chromadb.PersistentClient(path=CHROMA_PATH)
+     if COLLECTION_NAME in [c.name for c in client.list_collections()]:
+         client.delete_collection(COLLECTION_NAME)
+     collection = client.create_collection(COLLECTION_NAME)
+
      embedder = SentenceTransformer("all-MiniLM-L6-v2")

+     chunk_id = 0
+     for fname in os.listdir(MANUALS_DIR):
+         fpath = os.path.join(MANUALS_DIR, fname)
          if fname.lower().endswith(".pdf"):
              pages = extract_pdf_text(fpath)
+             for page_num, text in pages:
+                 sents = split_sentences(clean(text))
+                 for i in range(0, len(sents), 5):
+                     chunk = " ".join(sents[i:i + 5])
+                     if chunk.strip():
+                         collection.add(
+                             documents=[chunk],
+                             metadatas=[{"source": fname, "page": page_num}],
+                             ids=[f"{fname}-{page_num}-{i}-{chunk_id}"]
+                         )
+                         chunk_id += 1
+
+     print(f"✅ Embedded {chunk_id} chunks.")
      return collection, embedder

+ # ---------------------------
+ # 🤖 Load model
+ # ---------------------------
+ def load_llm():
+     model_id = "meta-llama/Llama-3.1-8B-Instruct"
+     token = os.environ.get("HF_TOKEN")
+     tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id, token=token, torch_dtype=None, device_map="auto"
+     )
+     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
+     return pipe, tokenizer
+
+ # ---------------------------
+ # ❓ Ask a question
+ # ---------------------------
+ def ask_question(question, db, embedder, pipe, tokenizer):
+     results = db.query(query_texts=[question], n_results=5)
      context = "\n\n".join(results["documents"][0])

+     prompt = f"""
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are a helpful assistant that answers questions from technical manuals using only the provided context.
+ <context>
  {context}
+ </context>
+ <|start_header_id|>user<|end_header_id|>
+ {question}<|start_header_id|>assistant<|end_header_id|>
  """
+
+     out = pipe(prompt)[0]["generated_text"]
+     final = out.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+     return final
+
+ # ---------------------------
+ # 🚀 Build interface
+ # ---------------------------
  with gr.Blocks() as demo:
+     gr.Markdown("# 🤖 SmartManuals-AI (Hugging Face Space Edition)")

      with gr.Row():
+         qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I access diagnostics on the SE3 console?")
+         submit = gr.Button("🔍 Ask")
+
+     abox = gr.Textbox(label="Answer", lines=8)
+
+     db, embedder = embed_all()
+     pipe, tokenizer = load_llm()

+     submit.click(fn=lambda q: ask_question(q, db, embedder, pipe, tokenizer), inputs=qbox, outputs=abox)

+ # For Hugging Face Spaces
+ if __name__ == "__main__":
+     demo.launch()
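
The new `ask_question` hand-writes Llama 3.1 header tokens and then recovers the answer by splitting the generated text on `<|start_header_id|>assistant<|end_header_id|>`, which breaks if the template ever changes. A minimal alternative sketch (not part of this commit; it assumes the `db`, `pipe`, and `tokenizer` objects returned by `embed_all()` and `load_llm()`, and the helper name `ask_question_via_template` is hypothetical) lets the tokenizer's chat template emit those tokens instead:

    # Hypothetical sketch, not in this commit: same Chroma retrieval, but
    # tokenizer.apply_chat_template emits the model's special tokens rather
    # than hand-written <|start_header_id|> markers.
    def ask_question_via_template(question, db, pipe, tokenizer, n_results=5):
        results = db.query(query_texts=[question], n_results=n_results)
        context = "\n\n".join(results["documents"][0])
        messages = [
            {"role": "system", "content": (
                "You are a helpful assistant that answers questions from "
                "technical manuals using only the provided context.\n"
                f"<context>\n{context}\n</context>"
            )},
            {"role": "user", "content": question},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # return_full_text=False makes the pipeline return only the completion,
        # so no string-splitting on header tokens is needed.
        out = pipe(prompt, return_full_text=False)[0]["generated_text"]
        return out.strip()

Retrieval behavior is unchanged; only the prompt assembly and answer extraction differ.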