damoojeje committed
Commit fcbea64 · verified · 1 Parent(s): 6f368e7

Update app.py

Files changed (1): app.py (+173, -143)
app.py CHANGED
@@ -1,176 +1,206 @@
- # ✅ app.py — Hugging Face Space Version (Finalized)
- # RAG over local PDFs/DOCX using Hugging Face-hosted models with Chroma

  import os
  import json
  import fitz  # PyMuPDF
- import nltk
  import chromadb
  from tqdm import tqdm
  from nltk.tokenize import sent_tokenize
  from sentence_transformers import SentenceTransformer, util
- import numpy as np
- import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- import pytesseract
- from PIL import Image
- import io
- import docx2txt
  import gradio as gr

  # ---------------------------
- # ✅ Configuration
  # ---------------------------
- MANUALS_DIR = "./Manuals"  # Folder containing all PDF and DOCX files
  CHROMA_PATH = "./chroma_store"
- CHUNKS_PATH = "chunks.jsonl"
- COLLECTION_NAME = "manual_chunks"
- MAX_CONTEXT_CHUNKS = 3
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100
-
  HF_TOKEN = os.environ.get("HF_TOKEN")
- LLM_MODELS = {
-     "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
-     "LLaMA 3 8B": "meta-llama/Llama-3-8B-Instruct",
-     "LLaMA 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-     "Mistral": "mistralai/Mistral-7B-Instruct-v0.3",
-     "Gemma": "google/gemma-1.1-7b-it",
-     "Qwen 3 30B": "Qwen/Qwen3-30B-A3B",
- }

- # ---------------------------
- # ✅ Setup
- # ---------------------------
- nltk.download('punkt')
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
- client = chromadb.PersistentClient(path=CHROMA_PATH)
  collection = None

  # ---------------------------
- # 📄 Load all PDFs and DOCX content
  # ---------------------------
- def extract_all_documents():
      chunks = []
-     for fname in os.listdir(MANUALS_DIR):
-         path = os.path.join(MANUALS_DIR, fname)
-         if fname.lower().endswith(".pdf"):
              doc = fitz.open(path)
              for i, page in enumerate(doc):
-                 text = page.get_text().strip()
                  if not text:
-                     pix = page.get_pixmap(dpi=300)
-                     img = Image.open(io.BytesIO(pix.tobytes("png")))
                      text = pytesseract.image_to_string(img)
-                 if text.strip():
-                     chunks.append((fname, i + 1, text.strip()))
-         elif fname.lower().endswith(".docx"):
-             text = docx2txt.process(path)
-             if text.strip():
-                 chunks.append((fname, 1, text.strip()))
      return chunks

  # ---------------------------
- # ✂️ Chunk text
- # ---------------------------
- def split_chunks(text, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
-     sentences = sent_tokenize(text)
-     chunks, curr, curr_len = [], [], 0
-     for sent in sentences:
-         tok_len = len(sent.split())
-         if curr_len + tok_len > size:
-             chunks.append(" ".join(curr))
-             curr = curr[-overlap:]
-             curr_len = sum(len(s.split()) for s in curr)
-         curr.append(sent)
-         curr_len += tok_len
-     if curr:
-         chunks.append(" ".join(curr))
-     return chunks
-
- # ---------------------------
- # 💾 Embed into Chroma
- # ---------------------------
- def embed_documents():
-     global collection
-     if collection:
-         client.delete_collection(COLLECTION_NAME)
-     collection = client.create_collection(COLLECTION_NAME)
-
-     docs = extract_all_documents()
-     records = []
-     for fname, page, text in docs:
-         for i, chunk in enumerate(split_chunks(text)):
-             if not chunk.strip():
-                 continue
-             records.append({
-                 "id": f"{fname}::p{page}::c{i}",
-                 "text": chunk,
-                 "metadata": {"source_file": fname, "page": page}
-             })
-
-     for i in tqdm(range(0, len(records), 16)):
-         batch = records[i:i + 16]
-         texts = [b["text"] for b in batch]
-         ids = [b["id"] for b in batch]
-         metas = [b["metadata"] for b in batch]
-         embs = embedder.encode(texts).tolist()
-         collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
-     return f"✅ Embedded {len(records)} chunks"
-
- # ---------------------------
- # 🔎 Query
- # ---------------------------
- def search_context(query, top_k=MAX_CONTEXT_CHUNKS):
-     results = collection.query(query_texts=[query], n_results=top_k)
-     chunks = results["documents"][0]
-     metas = results["metadatas"][0]
-     return "\n\n".join(
-         f"File: {m['source_file']}, Page: {m['page']}\n{c}" for m, c in zip(metas, chunks)
-     )
-
- # ---------------------------
- # 🧠 Run Inference
- # ---------------------------
- def ask_model(model_name, query):
-     if not HF_TOKEN:
-         return "❌ HF_TOKEN not set."
-     context = search_context(query)
-     system_prompt = "Answer only using the context. Say 'I don't know' if not found."
-     prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|start_header_id|>user<|end_header_id|>{context}\n\nQuestion: {query}<|start_header_id|>assistant<|end_header_id|>"
-
-     tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
-     model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, device_map="auto")
-     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-
-     output = pipe(prompt, max_new_tokens=512, do_sample=True)[0]["generated_text"]
-     return output.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
-
- # ---------------------------
- # 🎛 Gradio UI
- # ---------------------------
- def launch_interface():
      with gr.Blocks() as demo:
-         gr.Markdown("""
-         # 🧠 SmartManuals-AI (Hugging Face Edition)
-         Upload manuals to `./Manuals`, click Embed, then ask questions.
-         """)
-
-         with gr.Row():
-             embed_button = gr.Button("⚙️ Embed Documents")
-             embed_status = gr.Textbox(label="Status")
-
-         with gr.Row():
-             model_select = gr.Dropdown(list(LLM_MODELS.keys()), label="Model", value="LLaMA 3.1 8B")
-             question = gr.Textbox(label="Question")
-             answer = gr.Textbox(label="Answer", lines=10)
-             submit = gr.Button("🔍 Ask")
-
-         embed_button.click(fn=embed_documents, outputs=embed_status)
-         submit.click(fn=lambda m, q: ask_model(LLM_MODELS[m], q), inputs=[model_select, question], outputs=[answer])
-
-     demo.launch()

  # ---------------------------
- if __name__ == "__main__":
-     launch_interface()

+ # ✅ app.py (Final Hugging Face Version for SmartManuals-AI)
+ # ✅ No metadata filtering; all semantic search with keyword reranking
+ # ✅ Auto-index from Manuals/ on startup, with rerun prevention
+ # ✅ Gradio UI only, no file upload, progress logs

  import os
  import json
  import fitz  # PyMuPDF
+ import hashlib
+ import io  # used by io.BytesIO in the OCR fallback of extract_and_chunk()
  import chromadb
  from tqdm import tqdm
  from nltk.tokenize import sent_tokenize
  from sentence_transformers import SentenceTransformer, util
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import torch
  import gradio as gr

  # ---------------------------
+ # ⚙️ Config
  # ---------------------------
+ MANUALS_FOLDER = "./Manuals"
  CHROMA_PATH = "./chroma_store"
+ CHUNKS_FILE = "manual_chunks_with_ocr.jsonl"
+ HASH_FILE = "manuals.hash"
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100
+ MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
  HF_TOKEN = os.environ.get("HF_TOKEN")

  collection = None
+ embedder = None
+ pipe = None
+
+ # ---------------------------
+ # 🔁 Load model and pipeline
+ # ---------------------------
+ def load_model():
+     global pipe
+     if HF_TOKEN is None:
+         print("❌ HF_TOKEN is not set")
+         return None
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_ID, token=HF_TOKEN, torch_dtype=torch.float32
+         )
+         pipe = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             max_new_tokens=512,
+             temperature=0.2,
+             top_p=0.9,
+             do_sample=True,
+             device=-1
+         )
+         print(f"✅ Model loaded: {MODEL_ID}")
+         return tokenizer
+     except Exception as e:
+         print(f"❌ Model load failed: {e}")
+         return None
+
+ # ---------------------------
+ # 📚 Utilities
+ # ---------------------------
+ def clean_text(text):
+     lines = text.splitlines()
+     return "\n".join([l.strip() for l in lines if l.strip()])
+
+ def split_into_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
+     chunks, current, cur_len = [], [], 0
+     for sent in sentences:
+         tok = len(sent.split())
+         if cur_len + tok > max_tokens:
+             chunks.append(" ".join(current))
+             current = current[-overlap:]
+             cur_len = sum(len(s.split()) for s in current)
+         current.append(sent)
+         cur_len += tok
+     if current: chunks.append(" ".join(current))
+     return chunks
+
+ def hash_folder(folder):
+     hasher = hashlib.sha256()
+     for fname in sorted(os.listdir(folder)):
+         if fname.endswith(".pdf"):
+             with open(os.path.join(folder, fname), "rb") as f:
+                 while chunk := f.read(8192):
+                     hasher.update(chunk)
+     return hasher.hexdigest()

  # ---------------------------
+ # 🔍 Indexing
  # ---------------------------
+ def extract_and_chunk():
+     from PIL import Image
+     import pytesseract
+
      chunks = []
+     for fname in tqdm(sorted(os.listdir(MANUALS_FOLDER))):
+         if not fname.endswith(".pdf"): continue
+         path = os.path.join(MANUALS_FOLDER, fname)
+         try:
              doc = fitz.open(path)
              for i, page in enumerate(doc):
+                 text = page.get_text()
                  if not text:
+                     img = Image.open(io.BytesIO(page.get_pixmap(dpi=300).tobytes("png")))
                      text = pytesseract.image_to_string(img)
+                 sents = sent_tokenize(clean_text(text))
+                 for j, chunk in enumerate(split_into_chunks(sents)):
+                     chunks.append({
+                         "source_file": fname,
+                         "chunk_id": f"{fname}::p{i+1}::c{j+1}",
+                         "page": i+1,
+                         "text": chunk.strip()
+                     })
+         except Exception as e:
+             print(f"Error reading {fname}: {e}")
+     with open(CHUNKS_FILE, "w", encoding="utf-8") as f:
+         for chunk in chunks:
+             json.dump(chunk, f)
+             f.write("\n")
      return chunks

  # ---------------------------
+ # 💾 ChromaDB Embedding
+ # ---------------------------
+ def embed_chunks():
+     global collection, embedder
+     client = chromadb.PersistentClient(path=CHROMA_PATH)
+     embedder = SentenceTransformer("all-MiniLM-L6-v2")
+     try: client.delete_collection("manual_chunks")
+     except: pass
+     collection = client.create_collection("manual_chunks")
+     with open(CHUNKS_FILE, "r", encoding="utf-8") as f:
+         batch, metas, ids, texts = [], [], [], []
+         for line in f:
+             item = json.loads(line)
+             texts.append(item["text"])
+             ids.append(item["chunk_id"])
+             metas.append({"source_file": item["source_file"], "page": item["page"]})
+             if len(texts) == 16:
+                 embs = embedder.encode(texts).tolist()
+                 collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
+                 texts, ids, metas = [], [], []
+         if texts:
+             embs = embedder.encode(texts).tolist()
+             collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
+
+ # ---------------------------
152
+ # πŸ” Semantic QA
153
+ # ---------------------------
154
+ def ask(question):
155
+ if not collection or not embedder or not pipe:
156
+ return "App not ready."
157
+ emb = embedder.encode(question).tolist()
158
+ results = collection.query(query_embeddings=[emb], n_results=3)
159
+ context = "\n\n".join([r for r in results["documents"][0]])
160
+ prompt = f"""
161
+ Use the context to answer. Say 'I don't know' if unsure.
162
+
163
+ Context:
164
+ {context}
165
+
166
+ Question: {question}
167
+ """
168
+ return pipe(prompt)[0]['generated_text']
169
+
170
+ # ---------------------------
171
+ # πŸš€ App Startup
172
+ # ---------------------------
173
+ def initialize():
174
+ if not os.path.exists(MANUALS_FOLDER):
175
+ os.makedirs(MANUALS_FOLDER)
176
+ new_hash = hash_folder(MANUALS_FOLDER)
177
+ if os.path.exists(HASH_FILE):
178
+ with open(HASH_FILE, "r") as f:
179
+ if f.read().strip() == new_hash and os.path.exists(CHUNKS_FILE):
180
+ print("βœ… Manuals unchanged. Skipping re-embedding.")
181
+ return
182
+ print("πŸ”„ Indexing manuals...")
183
+ extract_and_chunk()
184
+ embed_chunks()
185
+ with open(HASH_FILE, "w") as f:
186
+ f.write(new_hash)
187
+ print("βœ… Embedding complete.")
188
+
189
+ # ---------------------------
190
+ # πŸ–₯️ Gradio Interface
191
+ # ---------------------------
192
+ def build_ui():
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  with gr.Blocks() as demo:
194
+ gr.Markdown("## πŸ” Ask SmartManuals-AI")
195
+ inp = gr.Textbox(label="Your question")
196
+ out = gr.Textbox(label="Answer", lines=6)
197
+ btn = gr.Button("Ask")
198
+ btn.click(fn=ask, inputs=inp, outputs=out)
199
+ return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
  # ---------------------------
202
+ # πŸ”§ Run App
203
+ # ---------------------------
204
+ load_model()
205
+ initialize()
206
+ demo = build_ui()
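
The new entry point builds the Blocks app into demo but never calls launch(), and sent_tokenize assumes NLTK's punkt data is already available on the machine. A minimal local launcher, sketched under the assumptions that the file above is saved as app.py, that HF_TOKEN is exported, and that run_local.py is a hypothetical helper which is not part of this commit:

# run_local.py (hypothetical local entry point for the SmartManuals-AI Space, not part of this commit)
import os

import nltk

# sent_tokenize in app.py needs the punkt tokenizer data; fetch it up front
nltk.download("punkt", quiet=True)

# load_model() only prints an error when the token is missing, so fail early here instead
assert os.environ.get("HF_TOKEN"), "export HF_TOKEN before starting"

# importing app runs load_model(), initialize() (index + embed), and build_ui() at module level
import app

# Spaces may serve a module-level demo on its own, but a local run needs an explicit launch
app.demo.launch(server_name="0.0.0.0", server_port=7860)

One usage note: the text-generation pipeline returns the prompt together with the completion by default, so answers in the UI will echo the injected context; passing return_full_text=False when building the pipeline in load_model() is one way to trim the output to the generated answer only.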