damoojeje committed
Commit 2975595 · verified · 1 Parent(s): bfb4fda

Update app.py

Files changed (1): app.py (+100 −109)
app.py CHANGED
@@ -1,153 +1,144 @@
-# ✅ SmartManuals-AI: Hugging Face Space App (RAM Safe, Multi-model, No Preview)
-
-import os, json, fitz, torch, chromadb, docx
 import gradio as gr
 from PIL import Image
 from nltk.tokenize import sent_tokenize
 from sentence_transformers import SentenceTransformer, util
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from tqdm import tqdm
 
-# ---------------------------
-# ⚙️ Constants
-# ---------------------------
-MANUALS_DIR = "Manuals"
 CHROMA_PATH = "./chroma_store"
-CHUNKS_JSONL = "manual_chunks.jsonl"
 COLLECTION_NAME = "manual_chunks"
-HF_TOKEN = os.environ.get("HF_TOKEN")
-CHUNK_SIZE = 750
-CHUNK_OVERLAP = 100
-TOP_K = 3
-
 MODEL_OPTIONS = {
-    "LLaMA 3.1 (8B)": "meta-llama/Llama-3.1-8B-Instruct",
     "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
-    "Gemma 7B": "google/gemma-7b-it",
-    "Qwen3 7B": "Qwen/Qwen1.5-7B-Chat"
 }
 
-# ---------------------------
-# 📄 Extract Text from PDFs and DOCX
-# ---------------------------
 def extract_text_from_pdf(path):
-    text = ""
     try:
         doc = fitz.open(path)
-        for page in doc:
-            page_text = page.get_text()
-            text += page_text + "\n"
-        doc.close()
-    except Exception as e:
-        print(f"❌ PDF Error in {path}: {e}")
-    return text
 
 def extract_text_from_docx(path):
     try:
         doc = docx.Document(path)
-        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
-    except Exception as e:
-        print(f"❌ DOCX Error in {path}: {e}")
         return ""
 
-# ---------------------------
-# 🧹 Clean + Chunk
-# ---------------------------
 def clean(text):
     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
 
 def split_sentences(text):
-    return sent_tokenize(text)
-
-def chunk_text(sentences, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
-    chunks, chunk, length = [], [], 0
-    for sent in sentences:
-        n = len(sent.split())
-        if length + n > size:
-            if chunk:
-                chunks.append(" ".join(chunk))
-            chunk = chunk[-overlap:]
-            length = sum(len(s.split()) for s in chunk)
-        chunk.append(sent)
-        length += n
-    if chunk:
-        chunks.append(" ".join(chunk))
     return chunks
 
-# ---------------------------
-# 📦 Embed and Store in Chroma
-# ---------------------------
 def embed_all():
-    print("🔍 Scanning manuals and embedding...")
-    os.makedirs(CHROMA_PATH, exist_ok=True)
-    embedder = SentenceTransformer("all-MiniLM-L6-v2")
-    client = chromadb.PersistentClient(path=CHROMA_PATH)
-    if COLLECTION_NAME in [c.name for c in client.list_collections()]:
-        client.delete_collection(COLLECTION_NAME)
-    collection = client.create_collection(COLLECTION_NAME)
 
     all_chunks = []
-    files = [f for f in os.listdir(MANUALS_DIR) if f.lower().endswith((".pdf", ".docx"))]
-    for fname in tqdm(files):
-        path = os.path.join(MANUALS_DIR, fname)
-        text = extract_text_from_pdf(path) if fname.endswith(".pdf") else extract_text_from_docx(path)
-        text = clean(text)
-        sents = split_sentences(text)
-        chunks = chunk_text(sents)
-        for i, chunk in enumerate(chunks):
-            all_chunks.append({
-                "id": f"{fname}::chunk_{i+1}",
-                "text": chunk,
-                "metadata": {"source": fname}
-            })
-
-    # Batch embed and store
     for i in range(0, len(all_chunks), 16):
         batch = all_chunks[i:i+16]
-        docs = [c["text"] for c in batch]
-        ids = [c["id"] for c in batch]
-        metas = [c["metadata"] for c in batch]
         embs = embedder.encode(docs).tolist()
         collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
 
-    print(f"✅ Embedded {len(all_chunks)} chunks.")
     return collection, embedder
 
-# ---------------------------
-# 🔍 RAG Search & LLM Answer
-# ---------------------------
-def ask(query, model_key):
-    model_id = MODEL_OPTIONS[model_key]
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
-        model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
-        model.to("cuda" if torch.cuda.is_available() else "cpu")
-        gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-    except Exception as e:
-        return f"❌ Model loading failed: {e}"
 
-    results = db.query(query_texts=[query], n_results=TOP_K)
-    chunks = results["documents"][0]
-    context = "\n\n".join(chunks)
-    prompt = f"Answer this using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
 
-    try:
-        res = gen(prompt, max_new_tokens=300, do_sample=False)[0]['generated_text']
-        return res.split("Answer:", 1)[-1].strip()
-    except Exception as e:
-        return f"❌ LLM failed: {e}"
 
-# ---------------------------
-# ▶️ UI
-# ---------------------------
-db, embedder = embed_all()
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧠 SmartManuals-AI — Ask Your PDF and Word Docs")
     with gr.Row():
-        qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I calibrate SE3 console?")
-        model_pick = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose a Model", value="Mistral 7B")
-    answer = gr.Textbox(label="Answer", lines=8)
-    ask_btn = gr.Button("Ask")
-    ask_btn.click(fn=ask, inputs=[qbox, model_pick], outputs=[answer])
 
 demo.launch()
+import os
+import json
+import fitz  # PyMuPDF
+import docx
+import chromadb
+import torch
+import nltk
 import gradio as gr
+from tqdm import tqdm
+from typing import List
 from PIL import Image
 from nltk.tokenize import sent_tokenize
 from sentence_transformers import SentenceTransformer, util
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
+# --- Ensure punkt tokenizer is available ---
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt")
+
+# --- Configuration ---
+MANUALS_FOLDER = "./Manuals"
 CHROMA_PATH = "./chroma_store"
 COLLECTION_NAME = "manual_chunks"
 MODEL_OPTIONS = {
+    "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
     "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
+    "Gemma 7B": "google/gemma-1.1-7b-it"
 }
+HF_TOKEN = os.environ.get("HF_TOKEN")
+MAX_CONTEXT_CHUNKS = 3
 
+# --- Utility Functions ---
 def extract_text_from_pdf(path):
     try:
         doc = fitz.open(path)
+        return "\n".join([page.get_text().strip() for page in doc])
+    except Exception:
+        return ""
 
 def extract_text_from_docx(path):
     try:
         doc = docx.Document(path)
+        return "\n".join([para.text.strip() for para in doc.paragraphs])
+    except Exception:
         return ""
 
 def clean(text):
     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
 
 def split_sentences(text):
+    try:
+        return sent_tokenize(text)
+    except Exception as e:
+        print(f"[Tokenizer Error] {e}. Falling back to simple split.")
+        return text.split(". ")
+
+def chunk_sentences(sentences, max_tokens=500, overlap=50):
+    chunks = []
+    current = []
+    total = 0
+    for sentence in sentences:
+        count = len(sentence.split())
+        if total + count > max_tokens:
+            if current:
+                chunks.append(" ".join(current))
+            current = current[-overlap:]  # carry the last sentences over as overlap
+            total = sum(len(s.split()) for s in current)
+        current.append(sentence)
+        total += count
+    if current:
+        chunks.append(" ".join(current))
     return chunks
 
 def embed_all():
+    db = chromadb.PersistentClient(path=CHROMA_PATH)
+    if COLLECTION_NAME in [c.name for c in db.list_collections()]:
+        db.delete_collection(COLLECTION_NAME)
+    collection = db.create_collection(COLLECTION_NAME)
 
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
     all_chunks = []
+
+    for fname in tqdm(os.listdir(MANUALS_FOLDER)):
+        path = os.path.join(MANUALS_FOLDER, fname)
+        if fname.lower().endswith(".pdf"):
+            text = extract_text_from_pdf(path)
+        elif fname.lower().endswith(".docx"):
+            text = extract_text_from_docx(path)
+        else:
+            continue
+
+        sents = split_sentences(clean(text))
+        chunks = chunk_sentences(sents)
+        for idx, chunk in enumerate(chunks):
+            chunk_id = f"{fname}::chunk_{idx}"
+            all_chunks.append({"id": chunk_id, "text": chunk, "metadata": {"source": fname}})
+
     for i in range(0, len(all_chunks), 16):
         batch = all_chunks[i:i+16]
+        docs = [x["text"] for x in batch]
+        ids = [x["id"] for x in batch]
+        metas = [x["metadata"] for x in batch]
         embs = embedder.encode(docs).tolist()
         collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
 
     return collection, embedder
 
+# Build the index once at startup; rebuilding it on every query would be slow
+# and would drop and recreate the collection mid-session.
+collection, embedder = embed_all()
+
+def answer_query(query, model_choice):
+    results = collection.query(
+        query_embeddings=embedder.encode([query]).tolist(),
+        n_results=MAX_CONTEXT_CHUNKS
+    )
+    context = "\n\n".join(results["documents"][0])
+    model_id = MODEL_OPTIONS.get(model_choice)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        token=HF_TOKEN,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+    )
+    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
+                    device=0 if torch.cuda.is_available() else -1)
+
+    prompt = f"""
+Context:
+{context}
+
+Question: {query}
+Answer:"""
+
+    out = pipe(prompt, max_new_tokens=300, do_sample=False)
+    return out[0]["generated_text"].split("Answer:")[-1].strip()
+
+# --- UI ---
 with gr.Blocks() as demo:
+    gr.Markdown("""# 📘 SmartManuals-AI
+Ask technical questions from manuals (PDF & DOCX) with RAG + an LLM.
+""")
+
     with gr.Row():
+        question = gr.Textbox(label="Your Question", placeholder="e.g., How do I reset the console?")
+        model_choice = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="LLaMA 3.1 8B", label="Model")
+    answer = gr.Textbox(label="Answer")
+    submit = gr.Button("Ask")
+    submit.click(fn=answer_query, inputs=[question, model_choice], outputs=answer)
 
 demo.launch()
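
For a quick sanity check of the retrieval half in isolation, here is a minimal sketch (not part of the commit; it assumes the Space has already run once so ./chroma_store is populated, and reuses the same collection name and MiniLM embedder as the script above):

import chromadb
from sentence_transformers import SentenceTransformer

# Open the store the app persisted and grab its collection.
client = chromadb.PersistentClient(path="./chroma_store")
collection = client.get_collection("manual_chunks")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Embed a test question and fetch the top matching chunks.
query = "How do I reset the console?"
results = collection.query(
    query_embeddings=embedder.encode([query]).tolist(),
    n_results=3
)
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["source"], "->", doc[:80])

Because the chunk IDs encode their source file (fname::chunk_idx) and each chunk carries a {"source": fname} metadata entry, this prints which manual each retrieved passage came from.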