Spaces:

damoojeje
/

SmartManuals-AI

Running

File size: 5,212 Bytes

# app.py
# SmartManuals-AI: Hugging Face Space version

import os, json, fitz, nltk, chromadb, io
import torch
from tqdm import tqdm
from PIL import Image
from docx import Document
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from nltk.tokenize import sent_tokenize
import pytesseract
import gradio as gr

# ----------------------
# Configuration
# ----------------------
MANUALS_FOLDER = "./Manuals"
CHUNKS_JSONL = "chunks.jsonl"
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
HF_TOKEN = os.getenv("HF_TOKEN")

# ----------------------
# Ensure punkt is downloaded
# ----------------------
nltk.download("punkt")

# ----------------------
# Utilities
# ----------------------
def extract_text_from_pdf(path):
    doc = fitz.open(path)
    text = ""
    for page in doc:
        t = page.get_text()
        if not t.strip():
            pix = page.get_pixmap(dpi=300)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            t = pytesseract.image_to_string(img)
        text += t + "\n"
    return text

def extract_text_from_docx(path):
    doc = Document(path)
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())

def clean(text):
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

def split_sentences(text):
    return sent_tokenize(text)

def chunk_sentences(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    chunks, chunk, count = [], [], 0
    for s in sentences:
        words = s.split()
        if count + len(words) > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = chunk[-overlap:] if overlap > 0 else []
            count = sum(len(x.split()) for x in chunk)
        chunk.append(s)
        count += len(words)
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

def get_metadata(filename):
    name = filename.lower()
    return {
        "source_file": filename,
        "doc_type": "service manual" if "sm" in name else "owner manual" if "om" in name else "unknown",
        "model": "se3hd" if "se3hd" in name else "unknown"
    }

# ----------------------
# Embedding
# ----------------------
def embed_all():
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(COLLECTION_NAME)
    except:
        pass
    collection = client.create_collection(COLLECTION_NAME)
    chunks, metadatas, ids = [], [], []
    files = os.listdir(MANUALS_FOLDER)
    idx = 0
    for file in tqdm(files):
        path = os.path.join(MANUALS_FOLDER, file)
        text = extract_text_from_pdf(path) if file.endswith(".pdf") else extract_text_from_docx(path)
        meta = get_metadata(file)
        sents = split_sentences(clean(text))
        for i, chunk in enumerate(chunk_sentences(sents)):
            chunks.append(chunk)
            ids.append(f"{file}::chunk_{i}")
            metadatas.append(meta)
            if len(chunks) >= 16:
                emb = embedder.encode(chunks).tolist()
                collection.add(documents=chunks, ids=ids, metadatas=metadatas, embeddings=emb)
                chunks, ids, metadatas = [], [], []
    if chunks:
        emb = embedder.encode(chunks).tolist()
        collection.add(documents=chunks, ids=ids, metadatas=metadatas, embeddings=emb)
    return collection, embedder

# ----------------------
# Model setup
# ----------------------
def load_model():
    device = 0 if torch.cuda.is_available() else -1
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=HF_TOKEN)
    return pipeline("text-generation", model=model, tokenizer=tokenizer, device=device, max_new_tokens=512)

# ----------------------
# RAG Pipeline
# ----------------------
def answer_query(question):
    results = db.query(query_texts=[question], n_results=5)
    context = "\n\n".join(results["documents"][0])
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant. Use the provided context to answer questions. If you don't know, say 'I don't know.'
<context>
{context}
</context>
<|start_header_id|>user<|end_header_id|>
{question}<|start_header_id|>assistant<|end_header_id|>"""
    return llm(prompt)[0]["generated_text"].split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()

# ----------------------
# UI
# ----------------------
with gr.Blocks() as demo:
    status = gr.Textbox(label="Status", value="Embedding manuals... Please wait.", interactive=False)
    question = gr.Textbox(label="Ask a Question")
    submit = gr.Button("🔍 Ask")
    answer = gr.Textbox(label="Answer", lines=8)

    def handle_query(q):
        return answer_query(q)

    submit.click(fn=handle_query, inputs=question, outputs=answer)

# ----------------------
# Startup
# ----------------------
status_text = "Embedding manuals and loading model..."
db, embedder = embed_all()
llm = load_model()
status_text = "Ready!"
demo.launch()