damoojeje committed
Commit d06b252 · verified · 1 Parent(s): 8ab0a40

Update app.py

Files changed (1)
  1. app.py +158 -96
app.py CHANGED
@@ -1,30 +1,52 @@
  import os
  import json
- import fitz  # PyMuPDF
  import pytesseract
- from PIL import Image
- import io
- import nltk
  import chromadb
  from tqdm import tqdm
- from sentence_transformers import SentenceTransformer
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
- import gradio as gr
-
- # ---------------------------
- # 📦 Paths and Constants
- # ---------------------------
- MANUALS_DIR = "./Manuals"
- CHROMA_PATH = "./chroma_store"
- COLLECTION_NAME = "manual_chunks"
-
- # Ensure NLTK punkt is available
- nltk.download("punkt")
  from nltk.tokenize import sent_tokenize

- # ---------------------------
- # 🧼 Text cleaning utilities
- # ---------------------------
  def clean(text):
      return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
 
@@ -32,110 +54,150 @@ def split_sentences(text):
      try:
          return sent_tokenize(text)
      except Exception as e:
-         print("[Tokenizer Error]", e, "\nFalling back to simple split.")
          return text.split(". ")

- # ---------------------------
- # 📄 PDF and DOCX extraction
- # ---------------------------
  def extract_pdf_text(pdf_path):
-     doc = fitz.open(pdf_path)
-     pages = []
-     for i, page in enumerate(doc):
-         text = page.get_text().strip()
-         if not text:
-             try:
                  pix = page.get_pixmap(dpi=300)
-                 img = Image.open(io.BytesIO(pix.tobytes("png")))
                  text = pytesseract.image_to_string(img)
-             except pytesseract.TesseractNotFoundError:
-                 print("❌ Tesseract not found. Skipping OCR for page.")
-                 text = ""
-         pages.append((i + 1, text))
-     return pages
-
- # ---------------------------
- # 🧠 Embed text using MiniLM
- # ---------------------------
  def embed_all():
      client = chromadb.PersistentClient(path=CHROMA_PATH)
-     if COLLECTION_NAME in [c.name for c in client.list_collections()]:
          client.delete_collection(COLLECTION_NAME)
-     collection = client.create_collection(COLLECTION_NAME)

-     embedder = SentenceTransformer("all-MiniLM-L6-v2")

-     chunk_id = 0
      for fname in os.listdir(MANUALS_DIR):
          fpath = os.path.join(MANUALS_DIR, fname)
          if fname.lower().endswith(".pdf"):
              pages = extract_pdf_text(fpath)
-             for page_num, text in pages:
-                 sents = split_sentences(clean(text))
-                 for i in range(0, len(sents), 5):
-                     chunk = " ".join(sents[i:i + 5])
-                     if chunk.strip():
-                         collection.add(
-                             documents=[chunk],
-                             metadatas=[{"source": fname, "page": page_num}],
-                             ids=[f"{fname}-{page_num}-{i}-{chunk_id}"]
-                         )
-                         chunk_id += 1
-
-     print(f"✅ Embedded {chunk_id} chunks.")
      return collection, embedder

- # ---------------------------
- # 🤖 Load model
- # ---------------------------
- def load_llm():
-     model_id = "meta-llama/Llama-3.1-8B-Instruct"
-     token = os.environ.get("HF_TOKEN")
-     tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
-     model = AutoModelForCausalLM.from_pretrained(
-         model_id, token=token, torch_dtype=None, device_map="auto"
-     )
-     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
      return pipe, tokenizer

- # ---------------------------
- # ❓ Ask a question
- # ---------------------------
- def ask_question(question, db, embedder, pipe, tokenizer):
-     results = db.query(query_texts=[question], n_results=5)
-     context = "\n\n".join(results["documents"][0])
-
-     prompt = f"""
-     <|begin_of_text|><|start_header_id|>system<|end_header_id|>
-     You are a helpful assistant that answers questions from technical manuals using only the provided context.
-     <context>
      {context}
-     </context>
      <|start_header_id|>user<|end_header_id|>
-     {question}<|start_header_id|>assistant<|end_header_id|>
      """

-     out = pipe(prompt)[0]["generated_text"]
-     final = out.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
-     return final

- # ---------------------------
- # 🚀 Build interface
- # ---------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("# 🤖 SmartManuals-AI (Hugging Face Space Edition)")

-     with gr.Row():
-         qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I access diagnostics on the SE3 console?")
-         submit = gr.Button("🔍 Ask")

-     abox = gr.Textbox(label="Answer", lines=8)

      db, embedder = embed_all()
-     pipe, tokenizer = load_llm()
-
-     submit.click(fn=lambda q: ask_question(q, db, embedder, pipe, tokenizer), inputs=qbox, outputs=abox)

- # For Hugging Face Spaces
  if __name__ == "__main__":
      demo.launch()
 
+
  import os
+ import fitz
  import json
+ import gradio as gr
  import pytesseract
  import chromadb
+ import torch
+ import asyncio
+ import docx2txt
+ import nltk
+ import traceback
+ from PIL import Image
+ from io import BytesIO
  from tqdm import tqdm
+ from transformers import (
+     pipeline,
+     AutoModelForCausalLM,
+     AutoTokenizer
+ )
+ from sentence_transformers import SentenceTransformer, util
  from nltk.tokenize import sent_tokenize

+ # Ensure punkt is available
+ try:
+     nltk.data.find("tokenizers/punkt")
+ except LookupError:
+     nltk.download("punkt")
+
+ # ---------------- Config ----------------
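+ # NOTE: CHUNK_SIZE and CHUNK_OVERLAP are counted in whitespace-separated
+ # words (see split_into_chunks below), not in model tokens.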
+ MANUALS_DIR = "Manuals"
+ CHROMA_PATH = "chroma_store"
+ COLLECTION_NAME = "manual_chunks"
+ CHUNK_SIZE = 750
+ CHUNK_OVERLAP = 100
+ MAX_CONTEXT_CHUNKS = 3
+
+ MODELS = {
+     "LLaMA 3 (8B)": "meta-llama/Llama-3.1-8B-Instruct",
+     "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
+     "Gemma 2B": "google/gemma-1.1-2b-it",
+     "LLaMA 4 (Scout 17B)": "meta-llama/Llama-4-Scout-17B-16E",
+     "Qwen 30B": "Qwen/Qwen3-30B-A3B"
+ }
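+ # The Meta and Google checkpoints above are gated on the Hub; HF_TOKEN must
+ # belong to an account that has been granted access to them.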
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # ---------------- Utils ----------------
  def clean(text):
      return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

  def split_sentences(text):
      try:
          return sent_tokenize(text)
      except Exception as e:
+         print("[Tokenizer Error]", e)
          return text.split(". ")
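+
+ # Greedy sentence packer: fill each chunk up to ~max_tokens whitespace words,
+ # then start the next chunk with roughly `overlap` words of trailing context.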
+ def split_into_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
+     chunks = []
+     current_chunk, current_len = [], 0
+
+     for sentence in sentences:
+         words = sentence.split()
+         if current_len + len(words) > max_tokens and current_chunk:
+             chunks.append(" ".join(current_chunk))
+             # carry over trailing sentences totalling at most `overlap` words;
+             # slicing the sentence list by `overlap` would keep 100 sentences,
+             # not 100 words, and re-emit nearly identical chunks
+             kept, kept_len = [], 0
+             for s in reversed(current_chunk):
+                 s_words = len(s.split())
+                 if kept_len + s_words > overlap:
+                     break
+                 kept.insert(0, s)
+                 kept_len += s_words
+             current_chunk, current_len = kept, kept_len
+
+         current_chunk.append(sentence)
+         current_len += len(words)
+
+     if current_chunk:
+         chunks.append(" ".join(current_chunk))
+
+     return chunks
+
  def extract_pdf_text(pdf_path):
+     text_chunks = []
+     try:
+         doc = fitz.open(pdf_path)
+         for i, page in enumerate(doc):
+             text = page.get_text().strip()
+             if not text:
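+                 # page has no text layer: rasterize at 300 dpi and OCR it
+                 # (requires the tesseract binary on the host)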
                  pix = page.get_pixmap(dpi=300)
+                 img = Image.open(BytesIO(pix.tobytes("png")))
                  text = pytesseract.image_to_string(img)
+             text_chunks.append((pdf_path, i + 1, clean(text)))
+     except Exception as e:
+         print("❌ Error reading PDF:", pdf_path, e)
+     return text_chunks
+
+ def extract_docx_text(docx_path):
+     try:
+         text = clean(docx2txt.process(docx_path))
+         return [(docx_path, 1, text)]
+     except Exception as e:
+         print("❌ Error reading DOCX:", docx_path, e)
+         return []
+
+ # ---------------- Background Embed ----------------
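+ # Rebuilds the Chroma collection from scratch on startup; chunks are embedded
+ # with MiniLM in batches of 16.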
  def embed_all():
+     embedder = SentenceTransformer("all-MiniLM-L6-v2")
+     embedder.eval()
      client = chromadb.PersistentClient(path=CHROMA_PATH)
+
+     try:
          client.delete_collection(COLLECTION_NAME)
+     except Exception:
+         pass
+     collection = client.get_or_create_collection(COLLECTION_NAME)

+     chunks, ids, metas = [], [], []
+     total = 0
+     print("📄 Scanning Manuals folder...")

      for fname in os.listdir(MANUALS_DIR):
          fpath = os.path.join(MANUALS_DIR, fname)
          if fname.lower().endswith(".pdf"):
              pages = extract_pdf_text(fpath)
+         elif fname.lower().endswith(".docx"):
+             pages = extract_docx_text(fpath)
+         else:
+             continue
+
+         for filepath, page, text in pages:
+             sentences = split_sentences(text)
+             subchunks = split_into_chunks(sentences)
+             for i, subchunk in enumerate(subchunks):
+                 chunks.append(subchunk)
+                 ids.append(f"{fname}::{page}::{i}")
+                 metas.append({"source": fname, "page": page})
+                 total += 1
+
+                 if len(chunks) >= 16:
+                     embs = embedder.encode(chunks).tolist()
+                     collection.add(documents=chunks, ids=ids, metadatas=metas, embeddings=embs)
+                     chunks, ids, metas = [], [], []
+
+     if chunks:
+         embs = embedder.encode(chunks).tolist()
+         collection.add(documents=chunks, ids=ids, metadatas=metas, embeddings=embs)
+
+     # report the running total, not len(ids): ids is cleared on every flush
+     print(f"✅ Embedded {total} chunks.")
      return collection, embedder

+ # ---------------- Model Loader ----------------
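+ # NOTE: load_model is invoked on every question, so each query re-loads the
+ # full checkpoint; caching the pipeline per model_id would avoid that cost.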
+ def load_model(model_id):
+     tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         device_map="auto",
+         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+         token=HF_TOKEN
+     )
+     # device_map="auto" already places the weights; passing a `device`
+     # argument to the pipeline as well is rejected for accelerate-loaded models
+     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
      return pipe, tokenizer

+ # ---------------- Query ----------------
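+ # NOTE: the prompt below hard-codes the Llama 3 chat tokens; Mistral, Gemma
+ # and Qwen use different chat formats, so tokenizer.apply_chat_template
+ # would be the more portable way to build this prompt.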
+ def query_llm(context, question, pipe, tokenizer):
+     prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are a helpful assistant. Use only the following context to answer. If uncertain, say: 'I don't know.'
+
  {context}
  <|start_header_id|>user<|end_header_id|>
+ {question}
+ <|start_header_id|>assistant<|end_header_id|>
  """
+     out = pipe(prompt, max_new_tokens=512)[0]["generated_text"]
+     return out.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+
+ def answer_question(question, model_choice):
+     try:
+         model_id = MODELS[model_choice]
+         pipe, tokenizer = load_model(model_id)
+
+         # embed the query with the same MiniLM model used at indexing time,
+         # rather than letting Chroma fall back to its default embedder
+         query_emb = embedder.encode([question]).tolist()
+         results = db.query(query_embeddings=query_emb, n_results=MAX_CONTEXT_CHUNKS)
+         context_chunks = results["documents"][0]
+         context = "\n\n".join(context_chunks)

+         answer = query_llm(context, question, pipe, tokenizer)
+         return answer
+     except Exception as e:
+         traceback.print_exc()
+         return f"❌ Error: {str(e)}"

+ # ---------------- Run App ----------------
  with gr.Blocks() as demo:
+     gr.Markdown("### 📘 Ask Questions About Your Manuals")

+     model_choice = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value="LLaMA 3 (8B)")
+     question = gr.Textbox(label="Your Question", placeholder="e.g. How do I reset the treadmill?")
+     submit = gr.Button("🔍 Get Answer")
+     answer = gr.Textbox(label="Answer", lines=10)

+     submit.click(fn=answer_question, inputs=[question, model_choice], outputs=answer)

+ # Run background embed on startup
+ try:
      db, embedder = embed_all()
+ except Exception as e:
+     print("❌ Failed to embed docs:", e)
+     db, embedder = None, None

+ # Only launch if in HF Space
  if __name__ == "__main__":
      demo.launch()