# ✅ SmartManuals-AI app.py (for Hugging Face Spaces)
# Optimized to support multiple LLMs, Gradio UI, and secure on-device document QA

import io
import os

import chromadb
import fitz  # PyMuPDF
import gradio as gr
import nltk
import pytesseract
import torch
from nltk.tokenize import sent_tokenize
from PIL import Image
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ----------------------
# 🔧 Configuration
# ----------------------
PDF_DIR = "./Manuals"
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
MAX_CONTEXT_CHUNKS = 3
CHUNK_SIZE = 750     # approximate words per chunk
CHUNK_OVERLAP = 100  # approximate words shared between consecutive chunks

MODEL_OPTIONS = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "google/gemma-1.1-7b-it",
    "Qwen/Qwen1.5-14B-Chat",
    "mistralai/Mistral-7B-Instruct-v0.3",
]

HF_TOKEN = os.environ.get("HF_TOKEN")

# ----------------------
# 📚 NLTK Setup
# ----------------------
for resource in ("punkt", "punkt_tab"):  # recent NLTK releases also require punkt_tab
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource)

# ----------------------
# 📄 Utility Functions
# ----------------------
def extract_text_or_ocr(page):
    """Return (text, used_ocr); fall back to Tesseract OCR on scanned pages."""
    text = page.get_text().strip()
    if text:
        return text, False
    pix = page.get_pixmap(dpi=300)
    img = Image.open(io.BytesIO(pix.tobytes("png")))
    return pytesseract.image_to_string(img).strip(), True


def clean_text(text):
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())


def tokenize_sentences(text):
    return sent_tokenize(text)


def split_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Greedily pack sentences into ~max_tokens-word chunks with word-level overlap."""
    chunks, chunk, length = [], [], 0
    for sentence in sentences:
        count = len(sentence.split())
        if length + count > max_tokens and chunk:
            chunks.append(" ".join(chunk))
            # Carry over trailing sentences totalling at most `overlap` words,
            # so the overlap is measured in words rather than whole sentences.
            carried, carried_len = [], 0
            for s in reversed(chunk):
                s_len = len(s.split())
                if carried_len + s_len > overlap:
                    break
                carried.insert(0, s)
                carried_len += s_len
            chunk, length = carried, carried_len
        chunk.append(sentence)
        length += count
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks


def extract_metadata(filename):
    name = filename.lower().replace("_", " ").replace("-", " ")
    meta = {"model": "unknown", "doc_type": "unknown", "brand": "life fitness"}
    if "om" in name or "owner" in name:
        meta["doc_type"] = "owner manual"
    elif "sm" in name or "service" in name:
        meta["doc_type"] = "service manual"
    elif "assembly" in name:
        meta["doc_type"] = "assembly instructions"
    elif "alert" in name:
        meta["doc_type"] = "installer alert"
    elif "parts" in name:
        meta["doc_type"] = "parts manual"
    elif "bulletin" in name:
        meta["doc_type"] = "service bulletin"
    for kw in ["se3hd", "se3", "se4", "symbio", "explore", "integrity x",
               "integrity sl", "everest", "engage", "inspire", "discover",
               "95t", "95x", "95c", "95r", "97c"]:
        if kw.replace(" ", "") in name.replace(" ", ""):
            meta["model"] = kw
            break  # keep the first (most specific) match, e.g. "se3hd" over "se3"
    return meta

# ----------------------
# 🧠 Load LLM
# ----------------------
def load_llm(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=HF_TOKEN,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1,
        max_new_tokens=512,  # the pipeline's default generation length can truncate answers
    )
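# ----------------------
# 💬 Prompt Helper (optional sketch)
# ----------------------
# Instruct-tuned models generally answer more reliably when the prompt is
# rendered through their own chat template instead of a raw string. This is a
# minimal sketch, not part of the original pipeline: `build_prompt` is our own
# name, and it assumes a transformers version where tokenizers expose
# `apply_chat_template`. Note that some templates (e.g. Gemma's) reject a
# system role; fold the instruction into the user turn for those models.
def build_prompt(tokenizer, context, question):
    messages = [
        {"role": "system",
         "content": "Answer strictly from the provided manual excerpts."},
        {"role": "user",
         "content": f"Context:\n{context}\n\nQuestion: {question}"},
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )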
# ----------------------
# 🧠 Chroma + Embed
# ----------------------
def embed_pdfs():
    os.makedirs(CHROMA_PATH, exist_ok=True)
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    # Rebuild from scratch; delete_collection raises if the collection
    # does not exist yet, so that case is simply ignored.
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass
    collection = client.create_collection(COLLECTION_NAME)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    for file in tqdm(sorted(os.listdir(PDF_DIR))):
        if not file.lower().endswith(".pdf"):
            continue
        doc = fitz.open(os.path.join(PDF_DIR, file))
        meta = extract_metadata(file)
        for page_num, page in enumerate(doc, 1):
            text, _ = extract_text_or_ocr(page)
            if not text.strip():
                continue
            chunks = split_chunks(tokenize_sentences(clean_text(text)))
            if not chunks:
                continue
            # Embed all chunks from a page in one batch rather than one at a time
            embeddings = embedder.encode(chunks).tolist()
            collection.add(
                documents=chunks,
                ids=[f"{file}::p{page_num}::c{i}" for i in range(len(chunks))],
                embeddings=embeddings,
                metadatas=[{**meta, "source_file": file, "page": page_num}
                           for _ in chunks],
            )
        doc.close()
    return collection, embedder

# ----------------------
# 🔍 RAG Pipeline
# ----------------------
# Cache the vector store and each loaded model so a query does not re-embed
# every PDF and reload weights from scratch.
_collection, _embedder = None, None
_pipelines = {}


def answer_query(q, model_id):
    global _collection, _embedder
    if _collection is None:
        _collection, _embedder = embed_pdfs()
    if model_id not in _pipelines:
        _pipelines[model_id] = load_llm(model_id)
    pipe = _pipelines[model_id]

    emb_q = _embedder.encode([q])[0].tolist()
    results = _collection.query(query_embeddings=[emb_q], n_results=MAX_CONTEXT_CHUNKS)
    docs = results["documents"][0] if results["documents"] else []
    if not docs:
        return "No relevant passages were found in the indexed manuals."
    context = "\n\n".join(docs)
    prompt = (
        "Use the context below to answer the question.\n"
        f"Context:\n{context}\n\nQuestion: {q}\nAnswer:"
    )
    return pipe(prompt)[0]["generated_text"].split("Answer:")[-1].strip()

# ----------------------
# 🚀 Gradio UI
# ----------------------
with gr.Blocks() as app:
    gr.Markdown(
        """# SmartManuals-AI
**Local-first document QA** powered by OCR, ChromaDB & your choice of LLM (via Hugging Face).
"""
    )
    with gr.Row():
        question = gr.Textbox(
            placeholder="Ask a question from the manuals...", label="Question"
        )
        model_choice = gr.Dropdown(
            label="Choose Model", choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0]
        )
    output = gr.Textbox(label="Answer", lines=10)
    run = gr.Button("Run RAG")
    run.click(fn=answer_query, inputs=[question, model_choice], outputs=output)

if __name__ == "__main__":
    app.launch()
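# ----------------------
# 📝 Usage Notes
# ----------------------
# Gated checkpoints (Llama, Gemma, Mistral) require HF_TOKEN to be set as an
# environment variable or Space secret before the app starts.
# A quick smoke test without launching the UI (the example question below is
# hypothetical; it assumes this file is saved as app.py):
#   >>> from app import answer_query, MODEL_OPTIONS
#   >>> answer_query("How do I calibrate the incline motor?", MODEL_OPTIONS[0])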