# SmartManuals-AI: Hugging Face Space App (RAM Safe, Multi-model, No Preview)
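# Assumed runtime dependencies (inferred from the imports below; versions are not
# pinned here, adjust as needed for your Space): gradio, pymupdf (fitz), python-docx,
# nltk, sentence-transformers, transformers, torch, chromadb, tqdm, pillow.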
import os, json, fitz, torch, chromadb, docx
import gradio as gr
import nltk
from PIL import Image
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# sent_tokenize requires the NLTK "punkt" tokenizer data; fetch it once at startup
nltk.download("punkt", quiet=True)
# ---------------------------
# Constants
# ---------------------------
MANUALS_DIR = "Manuals"
CHROMA_PATH = "./chroma_store"
CHUNKS_JSONL = "manual_chunks.jsonl"
COLLECTION_NAME = "manual_chunks"
HF_TOKEN = os.environ.get("HF_TOKEN")
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
TOP_K = 3
MODEL_OPTIONS = {
    "LLaMA 3.1 (8B)": "meta-llama/Llama-3.1-8B-Instruct",
    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
    "Gemma 7B": "google/gemma-7b-it",
    "Qwen1.5 7B": "Qwen/Qwen1.5-7B-Chat"
}
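# Note: the meta-llama and google/gemma checkpoints are gated on the Hugging Face Hub,
# so HF_TOKEN must belong to an account that has accepted their licenses; otherwise
# loading those models will fail at request time.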
# ---------------------------
# Extract Text from PDFs and DOCX
# ---------------------------
def extract_text_from_pdf(path):
    text = ""
    try:
        doc = fitz.open(path)
        for page in doc:
            page_text = page.get_text()
            text += page_text + "\n"
        doc.close()
    except Exception as e:
        print(f"❌ PDF Error in {path}: {e}")
    return text

def extract_text_from_docx(path):
    try:
        doc = docx.Document(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        print(f"❌ DOCX Error in {path}: {e}")
        return ""
# ---------------------------
# Clean + Chunk
# ---------------------------
def clean(text):
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

def split_sentences(text):
    return sent_tokenize(text)
def chunk_text(sentences, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    chunks, chunk, length = [], [], 0
    for sent in sentences:
        n = len(sent.split())
        if length + n > size and chunk:
            chunks.append(" ".join(chunk))
            # carry over the trailing sentences totalling at most `overlap` words,
            # so the overlap is measured in words (matching CHUNK_SIZE), not sentences
            carried, carried_len = [], 0
            for s in reversed(chunk):
                s_len = len(s.split())
                if carried_len + s_len > overlap:
                    break
                carried.insert(0, s)
                carried_len += s_len
            chunk, length = carried, carried_len
        chunk.append(sent)
        length += n
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
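# Illustrative example: for a ~2,000-word manual, chunk_text yields roughly three
# ~750-word chunks, with consecutive chunks sharing about 100 trailing words of
# context so sentences near a boundary remain retrievable from either chunk.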
# ---------------------------
# Embed and Store in Chroma
# ---------------------------
def embed_all():
    print("Scanning manuals and embedding...")
    os.makedirs(CHROMA_PATH, exist_ok=True)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    if COLLECTION_NAME in [c.name for c in client.list_collections()]:
        client.delete_collection(COLLECTION_NAME)
    collection = client.create_collection(COLLECTION_NAME)
    all_chunks = []
    files = [f for f in os.listdir(MANUALS_DIR) if f.lower().endswith((".pdf", ".docx"))]
    for fname in tqdm(files):
        path = os.path.join(MANUALS_DIR, fname)
        # case-insensitive check so e.g. "GUIDE.PDF" is not routed to the DOCX parser
        text = extract_text_from_pdf(path) if fname.lower().endswith(".pdf") else extract_text_from_docx(path)
        text = clean(text)
        sents = split_sentences(text)
        chunks = chunk_text(sents)
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "id": f"{fname}::chunk_{i+1}",
                "text": chunk,
                "metadata": {"source": fname}
            })
    # Batch embed and store
    for i in range(0, len(all_chunks), 16):
        batch = all_chunks[i:i+16]
        docs = [c["text"] for c in batch]
        ids = [c["id"] for c in batch]
        metas = [c["metadata"] for c in batch]
        embs = embedder.encode(docs).tolist()
        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
    print(f"✅ Embedded {len(all_chunks)} chunks.")
    return collection, embedder
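# Note: the collection is deleted and rebuilt on every startup, so embedding cost
# scales with the number and size of files in Manuals/ each time the Space restarts.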
# ---------------------------
# RAG Search & LLM Answer
# ---------------------------
def ask(query, model_key):
    model_id = MODEL_OPTIONS[model_key]
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=HF_TOKEN,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )
        model.to("cuda" if torch.cuda.is_available() else "cpu")
        gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
    except Exception as e:
        return f"❌ Model loading failed: {e}"
    # embed the query with the same SentenceTransformer used for the stored chunks
    query_emb = embedder.encode(query).tolist()
    results = db.query(query_embeddings=[query_emb], n_results=TOP_K)
    chunks = results["documents"][0]
    context = "\n\n".join(chunks)
    prompt = f"Answer this using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
    try:
        res = gen(prompt, max_new_tokens=300, do_sample=False)[0]['generated_text']
        return res.split("Answer:", 1)[-1].strip()
    except Exception as e:
        return f"❌ LLM failed: {e}"
# ---------------------------
# UI
# ---------------------------
db, embedder = embed_all()

with gr.Blocks() as demo:
    gr.Markdown("## SmartManuals-AI - Ask Your PDF and Word Docs")
    with gr.Row():
        qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I calibrate SE3 console?")
        model_pick = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose a Model", value="Mistral 7B")
    answer = gr.Textbox(label="Answer", lines=8)
    ask_btn = gr.Button("Ask")
    ask_btn.click(fn=ask, inputs=[qbox, model_pick], outputs=[answer])

demo.launch()