damoojeje committed
Commit bfb4fda · verified
1 Parent(s): df365ca

Update app.py

Files changed (1)
  1. app.py +103 -127
app.py CHANGED
@@ -1,177 +1,153 @@
- import os
- import json
- import fitz  # PyMuPDF
- import re
- from tqdm import tqdm
- from docx import Document
  from PIL import Image
- import pytesseract
- import io
- import torch
- import chromadb
  from sentence_transformers import SentenceTransformer, util
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- import gradio as gr

  # ---------------------------
- # 📁 Configuration
  # ---------------------------
- MANUALS_FOLDER = "./Manuals"
  CHROMA_PATH = "./chroma_store"
  COLLECTION_NAME = "manual_chunks"
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100
- MAX_CONTEXT_CHUNKS = 3
- HF_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
- HF_TOKEN = os.environ.get("HF_TOKEN")

  # ---------------------------
- # 🧹 Helpers
  # ---------------------------
  def clean(text):
-     lines = text.splitlines()
-     return "\n".join(line.strip() for line in lines if line.strip())

  def split_sentences(text):
-     return re.split(r'(?<=[.!?])\s+', text.strip())

- def chunk_sentences(sentences, max_len=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
      chunks, chunk, length = [], [], 0
      for sent in sentences:
-         tokens = len(sent.split())
-         if length + tokens > max_len and chunk:
-             chunks.append(" ".join(chunk))
-             chunk = chunk[-overlap:] if overlap else []
-             length = sum(len(s.split()) for s in chunk)
          chunk.append(sent)
-         length += tokens
      if chunk:
          chunks.append(" ".join(chunk))
      return chunks

- def extract_text_from_pdf(path):
-     doc = fitz.open(path)
-     full_text = []
-     for page in doc:
-         text = page.get_text().strip()
-         if not text:
-             try:
-                 pix = page.get_pixmap(dpi=300)
-                 img_data = pix.tobytes("png")
-                 img = Image.open(io.BytesIO(img_data))
-                 text = pytesseract.image_to_string(img).strip()
-             except Exception:
-                 text = ""
-         full_text.append(text)
-     return "\n".join(full_text)
-
- def extract_text_from_docx(path):
-     doc = Document(path)
-     return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-
- def extract_metadata(filename):
-     name = filename.lower()
-     model = next((m for m in ["se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl", "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"] if m in name), "unknown")
-     if "om" in name or "owner" in name:
-         doc_type = "owner manual"
-     elif "sm" in name or "service" in name:
-         doc_type = "service manual"
-     elif "assembly" in name:
-         doc_type = "assembly instructions"
-     elif "alert" in name:
-         doc_type = "installer alert"
-     elif "parts" in name:
-         doc_type = "parts manual"
-     elif "bulletin" in name:
-         doc_type = "service bulletin"
-     else:
-         doc_type = "unknown"
-     return model, doc_type
-
  # ---------------------------
- # 🚀 Build ChromaDB at Startup
  # ---------------------------
  def embed_all():
      client = chromadb.PersistentClient(path=CHROMA_PATH)
      if COLLECTION_NAME in [c.name for c in client.list_collections()]:
          client.delete_collection(COLLECTION_NAME)
      collection = client.create_collection(COLLECTION_NAME)

-     embedder = SentenceTransformer("all-MiniLM-L6-v2")
-     records = []
-
-     for fname in os.listdir(MANUALS_FOLDER):
-         path = os.path.join(MANUALS_FOLDER, fname)
-         if not fname.lower().endswith((".pdf", ".docx")):
-             continue
          text = extract_text_from_pdf(path) if fname.endswith(".pdf") else extract_text_from_docx(path)
-         sents = split_sentences(clean(text))
-         chunks = chunk_sentences(sents)
-         model, doc_type = extract_metadata(fname)
          for i, chunk in enumerate(chunks):
-             records.append({
                  "id": f"{fname}::chunk_{i+1}",
                  "text": chunk,
-                 "metadata": {"source_file": fname, "model": model, "doc_type": doc_type}
              })

-     for i in range(0, len(records), 16):
-         batch = records[i:i+16]
-         texts = [r["text"] for r in batch]
-         ids = [r["id"] for r in batch]
-         metas = [r["metadata"] for r in batch]
-         embeddings = embedder.encode(texts).tolist()
-         collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embeddings)

      return collection, embedder

  # ---------------------------
- # 💬 Load HF Model
  # ---------------------------
- llm_pipe = None
- if HF_TOKEN:
-     tokenizer = AutoTokenizer.from_pretrained(HF_MODEL, token=HF_TOKEN)
-     model = AutoModelForCausalLM.from_pretrained(HF_MODEL, token=HF_TOKEN, torch_dtype=torch.float32)
-     llm_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

  # ---------------------------
- # 🔎 RAG Function
- # ---------------------------
- def run_query(question):
-     if not question.strip():
-         return "Please enter a question."
-     if not db or not embedder:
-         return "Chroma or embedder not ready."
-
-     q_embed = embedder.encode(question).tolist()
-     res = db.query(query_embeddings=[q_embed], n_results=MAX_CONTEXT_CHUNKS)
-     contexts = res["documents"][0]
-     prompt = """
- You are a technical assistant.
- Answer only using the context below.
- Say 'I don't know' if not found.
-
- """
-     context_text = "\n\n".join(contexts)
-     final_prompt = prompt + f"Context:\n{context_text}\n\nQuestion: {question}\nAnswer:"
-     if llm_pipe:
-         result = llm_pipe(final_prompt, max_new_tokens=300)[0]['generated_text']
-         return result.split("Answer:")[-1].strip()
-     return "Model not loaded."
-
- # ---------------------------
- # 🧠 Init embeddings once
  # ---------------------------
  db, embedder = embed_all()
-
- # ---------------------------
- # 🎛️ Gradio Interface
- # ---------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("# 🤖 SmartManuals-AI: Ask Technical Questions about Your Manuals")
-     question = gr.Textbox(placeholder="e.g. How do I reset the treadmill console?", label="Enter Question")
-     submit = gr.Button("Get Answer")
-     output = gr.Textbox(label="Answer")
-     submit.click(fn=run_query, inputs=question, outputs=output)

  demo.launch()
 
+ # ✅ SmartManuals-AI: Hugging Face Space App (RAM Safe, Multi-model, No Preview)
+
+ import os, json, fitz, torch, chromadb, docx
+ import gradio as gr
  from PIL import Image
+ from nltk.tokenize import sent_tokenize
  from sentence_transformers import SentenceTransformer, util
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from tqdm import tqdm

  # ---------------------------
+ # ⚙️ Constants
  # ---------------------------
+ MANUALS_DIR = "Manuals"
  CHROMA_PATH = "./chroma_store"
+ CHUNKS_JSONL = "manual_chunks.jsonl"
  COLLECTION_NAME = "manual_chunks"
+ HF_TOKEN = os.environ.get("HF_TOKEN")
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100
+ TOP_K = 3
+
+ MODEL_OPTIONS = {
+     "LLaMA 3.1 (8B)": "meta-llama/Llama-3.1-8B-Instruct",
+     "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
+     "Gemma 7B": "google/gemma-7b-it",
+     "Qwen3 7B": "Qwen/Qwen1.5-7B-Chat"
+ }
+
+ # ---------------------------
+ # 📄 Extract Text from PDFs and DOCX
+ # ---------------------------
+ def extract_text_from_pdf(path):
+     text = ""
+     try:
+         doc = fitz.open(path)
+         for page in doc:
+             page_text = page.get_text()
+             text += page_text + "\n"
+         doc.close()
+     except Exception as e:
+         print(f"❌ PDF Error in {path}: {e}")
+     return text
+
+ def extract_text_from_docx(path):
+     try:
+         doc = docx.Document(path)
+         return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+     except Exception as e:
+         print(f"❌ DOCX Error in {path}: {e}")
+     return ""

  # ---------------------------
+ # 🧹 Clean + Chunk
  # ---------------------------
  def clean(text):
+     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

  def split_sentences(text):
+     return sent_tokenize(text)

+ def chunk_text(sentences, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
      chunks, chunk, length = [], [], 0
      for sent in sentences:
+         n = len(sent.split())
+         if length + n > size:
+             if chunk:
+                 chunks.append(" ".join(chunk))
+             chunk = chunk[-overlap:]
+             length = sum(len(s.split()) for s in chunk)
          chunk.append(sent)
+         length += n
      if chunk:
          chunks.append(" ".join(chunk))
      return chunks

  # ---------------------------
+ # 📦 Embed and Store in Chroma
  # ---------------------------
  def embed_all():
+     print("🔍 Scanning manuals and embedding...")
+     os.makedirs(CHROMA_PATH, exist_ok=True)
+     embedder = SentenceTransformer("all-MiniLM-L6-v2")
      client = chromadb.PersistentClient(path=CHROMA_PATH)
      if COLLECTION_NAME in [c.name for c in client.list_collections()]:
          client.delete_collection(COLLECTION_NAME)
      collection = client.create_collection(COLLECTION_NAME)

+     all_chunks = []
+     files = [f for f in os.listdir(MANUALS_DIR) if f.lower().endswith((".pdf", ".docx"))]
+     for fname in tqdm(files):
+         path = os.path.join(MANUALS_DIR, fname)
          text = extract_text_from_pdf(path) if fname.endswith(".pdf") else extract_text_from_docx(path)
+         text = clean(text)
+         sents = split_sentences(text)
+         chunks = chunk_text(sents)
          for i, chunk in enumerate(chunks):
+             all_chunks.append({
                  "id": f"{fname}::chunk_{i+1}",
                  "text": chunk,
+                 "metadata": {"source": fname}
              })

+     # Batch embed and store
+     for i in range(0, len(all_chunks), 16):
+         batch = all_chunks[i:i+16]
+         docs = [c["text"] for c in batch]
+         ids = [c["id"] for c in batch]
+         metas = [c["metadata"] for c in batch]
+         embs = embedder.encode(docs).tolist()
+         collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)

+     print(f"✅ Embedded {len(all_chunks)} chunks.")
      return collection, embedder

  # ---------------------------
+ # 🔍 RAG Search & LLM Answer
  # ---------------------------
+ def ask(query, model_key):
+     model_id = MODEL_OPTIONS[model_key]
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
+         model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
+         model.to("cuda" if torch.cuda.is_available() else "cpu")
+         gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
+     except Exception as e:
+         return f"❌ Model loading failed: {e}"
+
+     results = db.query(query_texts=[query], n_results=TOP_K)
+     chunks = results["documents"][0]
+     context = "\n\n".join(chunks)
+     prompt = f"Answer this using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
+
+     try:
+         res = gen(prompt, max_new_tokens=300, do_sample=False)[0]['generated_text']
+         return res.split("Answer:", 1)[-1].strip()
+     except Exception as e:
+         return f"❌ LLM failed: {e}"

  # ---------------------------
+ # ▶️ UI
  # ---------------------------
  db, embedder = embed_all()
  with gr.Blocks() as demo:
+     gr.Markdown("## 🧠 SmartManuals-AI — Ask Your PDF and Word Docs")
+     with gr.Row():
+         qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I calibrate SE3 console?")
+         model_pick = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose a Model", value="Mistral 7B")
+     answer = gr.Textbox(label="Answer", lines=8)
+     ask_btn = gr.Button("Ask")
+     ask_btn.click(fn=ask, inputs=[qbox, model_pick], outputs=[answer])

  demo.launch()
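
Two practical notes on running this revision. First, `from nltk.tokenize import sent_tokenize` needs NLTK's punkt data to be downloaded at least once, which app.py never does, so `split_sentences` raises a LookupError on a fresh Space. Second, the new `ask()` retrieves with `query_texts`, which makes Chroma embed the query with its own default embedding function rather than the `SentenceTransformer("all-MiniLM-L6-v2")` instance used when the chunks were indexed; the pre-commit code embedded the query itself and passed `query_embeddings`. A minimal sketch of both fixes, with a hypothetical `retrieve` helper that is not part of this commit:

# Hypothetical patch sketch, not part of commit bfb4fda.
# Assumes the `db` (Chroma collection) and `embedder` returned by embed_all().
import nltk

# sent_tokenize fails with LookupError until the punkt models are present.
nltk.download("punkt", quiet=True)

def retrieve(db, embedder, query, top_k=3):
    # Embed the query with the same SentenceTransformer used for the
    # documents, mirroring the pre-commit query_embeddings behavior.
    q_emb = embedder.encode(query).tolist()
    res = db.query(query_embeddings=[q_emb], n_results=top_k)
    return res["documents"][0]

Inside ask(), `results = db.query(query_texts=[query], n_results=TOP_K)` would then become `chunks = retrieve(db, embedder, query, TOP_K)`, keeping retrieval consistent with indexing.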