# ✅ app.py (SmartManuals-AI)
# Hugging Face Space-ready app with multi-model support, PDF upload, and live progress feedback
import os
import json
import fitz # PyMuPDF
import nltk
import chromadb
import io
import shutil
import pytesseract
import gradio as gr
from PIL import Image
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# ---------------------------
# 🔧 CONFIG
# ---------------------------
pdf_folder = "Manuals"
output_jsonl_chunks = "chunks.jsonl"
chroma_path = "./chroma_store"
collection_name = "manual_chunks"
chunk_size = 750        # approximate chunk size, in whitespace-delimited words
chunk_overlap = 100     # approximate overlap between consecutive chunks, in words
MAX_CONTEXT_CHUNKS = 3  # top-k chunks retrieved from Chroma per query
HF_TOKEN = os.environ.get("HF_TOKEN")
MODEL_MAP = {
"LLaMA 3 (8B)": "meta-llama/Meta-Llama-3-8B-Instruct",
"LLaMA 4 Scout (17B)": "meta-llama/Meta-Llama-4-Scout-17B-16E-Instruct",
"Gemma 3 (27B)": "google/gemma-3-27b-it",
"Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
"Qwen3 (30B)": "Qwen/Qwen3-30B-A3B"
}
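# Note: the LLaMA and Gemma checkpoints above are gated on the Hugging Face Hub;
# HF_TOKEN must belong to an account that has been granted access to them.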
# ---------------------------
# 📥 UTILITIES
# ---------------------------
def clean_text(text):
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
def tokenize_sentences(text):
    nltk.download('punkt', quiet=True)
    return sent_tokenize(text)
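# Greedy sentence packing: sentences are accumulated until the chunk reaches
# ~max_tokens whitespace-delimited words, then a new chunk starts, carrying over
# roughly `overlap` words of trailing context so adjacent chunks share material.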
def split_into_chunks(sentences, max_tokens=750, overlap=100):
    chunks, current_chunk, current_len = [], [], 0
    for sentence in sentences:
        token_count = len(sentence.split())
        if current_len + token_count > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            # Carry over trailing sentences totalling at most ~`overlap` words
            overlap_sents, overlap_len = [], 0
            for s in reversed(current_chunk):
                s_len = len(s.split())
                if overlap_len + s_len > overlap:
                    break
                overlap_sents.insert(0, s)
                overlap_len += s_len
            current_chunk, current_len = overlap_sents, overlap_len
        current_chunk.append(sentence)
        current_len += token_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
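# Infer document metadata (doc type + machine model) from the PDF filename.
# Example (hypothetical filename): "SE3HD_OM_en.pdf" ->
#   {"model": "se3hd", "doc_type": "owner manual", "brand": "life fitness"}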
def extract_metadata_from_filename(filename):
    name = filename.lower().replace("_", " ").replace("-", " ")
    meta = {"model": "unknown", "doc_type": "unknown", "brand": "life fitness"}
    if "om" in name: meta["doc_type"] = "owner manual"
    elif "sm" in name: meta["doc_type"] = "service manual"
    elif "assembly" in name: meta["doc_type"] = "assembly instructions"
    elif "alert" in name: meta["doc_type"] = "installer alert"
    elif "parts" in name: meta["doc_type"] = "parts manual"
    known_models = ["se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl", "everest", "engage"]
    for model in known_models:
        if model.replace(" ", "") in name.replace(" ", ""):
            meta["model"] = model
            break  # list is ordered most-specific first (e.g. "se3hd" before "se3")
    return meta
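# Prefer the PDF's embedded text layer; fall back to OCR (Tesseract) only for
# pages with no extractable text, rasterizing at 300 DPI for OCR quality.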
def extract_text_with_ocr(page):
    text = page.get_text().strip()
    if text:
        return text
    pix = page.get_pixmap(dpi=300)
    img = Image.open(io.BytesIO(pix.tobytes("png")))
    return pytesseract.image_to_string(img).strip()
# ---------------------------
# 🧠 EMBEDDING + CHROMA
# ---------------------------
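# Ingestion pipeline: copy the uploaded PDFs into the Manuals folder, extract and
# chunk their text, dump the chunks to JSONL, embed them with all-MiniLM-L6-v2,
# and rebuild the Chroma collection from scratch.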
def embed_pdfs_from_uploaded(files, progress=gr.Progress(track_tqdm=True)):
    os.makedirs(pdf_folder, exist_ok=True)
    temp_chunks = []
    for file in files:
        filename = os.path.basename(file.name)
        dst = os.path.join(pdf_folder, filename)
        shutil.copy(file.name, dst)
        doc = fitz.open(dst)
        meta = extract_metadata_from_filename(filename)
        for page_num, page in enumerate(doc, start=1):
            text = extract_text_with_ocr(page)
            sents = tokenize_sentences(clean_text(text))
            chunks = split_into_chunks(sents, chunk_size, chunk_overlap)
            for i, chunk in enumerate(chunks):
                temp_chunks.append({
                    "chunk_id": f"{filename}::page_{page_num}::chunk_{i+1}",
                    "source_file": filename,
                    "page": page_num,
                    "text": chunk,
                    **meta
                })
        doc.close()
    with open(output_jsonl_chunks, "w", encoding="utf-8") as f:
        for c in temp_chunks:
            json.dump(c, f)
            f.write("\n")
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=chroma_path)
    if collection_name in [c.name for c in client.list_collections()]:
        client.delete_collection(collection_name)
    collection = client.create_collection(collection_name)
    for i in tqdm(range(0, len(temp_chunks), 16)):
        batch = temp_chunks[i:i+16]
        texts = [b["text"] for b in batch]
        # Keep the raw text out of the metadata; it is already stored as the document
        metadatas = [{k: v for k, v in b.items() if k != "text"} for b in batch]
        ids = [b["chunk_id"] for b in batch]
        embeddings = embedder.encode(texts).tolist()
        collection.add(documents=texts, ids=ids, metadatas=metadatas, embeddings=embeddings)
    return collection, embedder
# ---------------------------
# 🤖 LLM INFERENCE
# ---------------------------
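# Load the selected chat model lazily. Requires a valid HF_TOKEN with access to the
# chosen repo; returns (None, None, None) if no token is set or the key is unknown.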
def load_llm(model_key):
    model_id = MODEL_MAP.get(model_key)
    if not model_id or not HF_TOKEN:
        return None, None, None
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, device_map="auto")
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
    return tokenizer, model, pipe
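# Build a chat prompt from the retrieved context and run generation. The system
# prompt restricts the model to answering from the provided context only.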
def generate_answer(pipe, tokenizer, context, query):
    messages = [
        {"role": "system", "content": "You are an expert manual assistant. Answer accurately using only the context."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Return only the newly generated text, not the echoed prompt
    output = pipe(prompt, return_full_text=False)[0]["generated_text"]
    return output.strip()
# ---------------------------
# 🎯 FULL PIPELINE
# ---------------------------
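# End-to-end RAG pipeline: (re)index the uploaded PDFs, retrieve the top-k chunks
# for the query from Chroma, and answer with the selected LLM.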
def rag_pipeline(query, model_key, files):
    if not files:
        return "Please upload at least one PDF manual."
    collection, embedder = embed_pdfs_from_uploaded(files)
    # Query with our own embedding so retrieval uses the same model as indexing
    query_embedding = embedder.encode([query]).tolist()
    results = collection.query(query_embeddings=query_embedding, n_results=MAX_CONTEXT_CHUNKS)
    if not results["documents"] or not results["documents"][0]:
        return "No matches found."
    context = "\n\n".join(results["documents"][0])
    tokenizer, model, pipe = load_llm(model_key)
    if pipe:
        return generate_answer(pipe, tokenizer, context, query)
    return "Model could not be loaded."
# ---------------------------
# 🖥️ GRADIO UI
# ---------------------------
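# Gradio UI: file upload for the manuals, a question box, a model selector, and an
# answer box wired to the RAG pipeline above.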
with gr.Blocks() as demo:
    gr.Markdown("""# 🧠 SmartManuals-AI with Multi-Model RAG
Upload your PDF manuals and ask smart questions. Choose your preferred LLM.""")
    with gr.Row():
        file_upload = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload Manuals")
    with gr.Row():
        query_box = gr.Textbox(label="Question")
        model_selector = gr.Dropdown(label="Choose Model", choices=list(MODEL_MAP.keys()), value="LLaMA 3 (8B)")
    submit_btn = gr.Button("Run Query")
    answer_box = gr.Textbox(label="Answer", lines=8)
    submit_btn.click(fn=rag_pipeline, inputs=[query_box, model_selector, file_upload], outputs=[answer_box])
demo.launch()
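# On a Hugging Face Space this script is executed directly, so demo.launch() starts
# the app; for local testing, `python app.py` does the same (assuming the selected
# model's weights fit on the available hardware).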