damoojeje committed on
Commit d6e6c98 · verified · 1 Parent(s): 05604a9

Update app.py

Files changed (1)
  1. app.py +158 -113
app.py CHANGED
@@ -1,147 +1,192 @@
 
 
 
import os
import json
import fitz  # PyMuPDF
- import docx
import chromadb
import torch
- import nltk
import gradio as gr
from tqdm import tqdm
- from typing import List
from PIL import Image
- from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-

- nltk.download('punkt')
-
- # --- Ensure punkt tokenizer is available ---
- try:
-     nltk.data.find("tokenizers/punkt")
- except LookupError:
-     nltk.download("punkt")
-
- # --- Configuration ---
- MANUALS_FOLDER = "./Manuals"
CHROMA_PATH = "./chroma_store"
- COLLECTION_NAME = "manual_chunks"
- MODEL_OPTIONS = {
-     "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
-     "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
-     "Gemma 7B": "google/gemma-1.1-7b-it"
- }

HF_TOKEN = os.environ.get("HF_TOKEN")
- MAX_CONTEXT_CHUNKS = 3
 
- # --- Utility Functions ---
- def extract_text_from_pdf(path):
-     try:
-         doc = fitz.open(path)
-         return "\n".join([page.get_text().strip() for page in doc])
-     except:
-         return ""

- def extract_text_from_docx(path):
-     try:
-         doc = docx.Document(path)
-         return "\n".join([para.text.strip() for para in doc.paragraphs])
-     except:
-         return ""

def clean(text):
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

- def split_sentences(text):
-     try:
-         return sent_tokenize(text)
-     except Exception as e:
-         print(f"[Tokenizer Error] {e}. Falling back to simple split.")
-         return text.split(". ")
-
- def chunk_sentences(sentences, max_tokens=500, overlap=50):
    chunks = []
    current = []
-     total = 0
-     for sentence in sentences:
-         count = len(sentence.split())
-         if total + count > max_tokens:
            chunks.append(" ".join(current))
            current = current[-overlap:]
-             total = sum(len(s.split()) for s in current)
-         current.append(sentence)
-         total += count
    if current:
        chunks.append(" ".join(current))
    return chunks

def embed_all():
-     db = chromadb.PersistentClient(path=CHROMA_PATH)
-     if COLLECTION_NAME in [c.name for c in db.list_collections()]:
-         db.delete_collection(COLLECTION_NAME)
-     collection = db.create_collection(COLLECTION_NAME)
-
-     embedder = SentenceTransformer("all-MiniLM-L6-v2")
-     all_chunks = []
-
-     for fname in os.listdir(MANUALS_FOLDER):
-         path = os.path.join(MANUALS_FOLDER, fname)
-         text = ""
-         if fname.lower().endswith(".pdf"):
-             text = extract_text_from_pdf(path)
-         elif fname.lower().endswith(".docx"):
-             text = extract_text_from_docx(path)
        else:
            continue
-
-         sents = split_sentences(clean(text))
-         chunks = chunk_sentences(sents)
-         for idx, chunk in enumerate(chunks):
-             chunk_id = f"{fname}::chunk_{idx}"
-             all_chunks.append({"id": chunk_id, "text": chunk, "metadata": {"source": fname}})
-
-     for i in range(0, len(all_chunks), 16):
-         batch = all_chunks[i:i+16]
-         docs = [x["text"] for x in batch]
-         ids = [x["id"] for x in batch]
-         metas = [x["metadata"] for x in batch]
-         embs = embedder.encode(docs).tolist()
-         collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
-
-     return collection, embedder
-
- def answer_query(query, model_choice):
-     db, embedder = embed_all()
-     results = db.get_collection(COLLECTION_NAME).query(query_texts=[query], n_results=MAX_CONTEXT_CHUNKS)
-
    context = "\n\n".join(results["documents"][0])
-     model_id = MODEL_OPTIONS.get(model_choice)
-
-     tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
-     model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN)
-     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-
    prompt = f"""
- Context:
- {context}
-
- Question: {query}
- Answer:"""
-
-     out = pipe(prompt, max_new_tokens=300, do_sample=False)
-     return out[0]["generated_text"].split("Answer:")[-1].strip()
-
- # --- UI ---
- with gr.Blocks() as demo:
-     gr.Markdown("""# 📘 SmartManuals-AI
- Ask technical questions from manuals (PDF & DOCX) with LLM + OCR + RAG.
- """)
-
    with gr.Row():
-         question = gr.Textbox(label="Your Question", placeholder="e.g., How do I reset the console?")
-         model_choice = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="LLaMA 3.1 8B", label="Model")
-         answer = gr.Textbox(label="Answer")
-         submit = gr.Button("Ask")
-     submit.click(fn=answer_query, inputs=[question, model_choice], outputs=answer)

demo.launch()
 
+ # ✅ SmartManuals-AI App for Hugging Face Spaces
+ # Full app.py with spaCy-based sentence segmentation and model dropdown selection
+
import os
import json
import fitz  # PyMuPDF
+ import io  # needed for the OCR fallback in extract_pdf_text (io.BytesIO)
import chromadb
import torch
+ import docx
import gradio as gr
+ import pytesseract
+ import numpy as np
+ import spacy
from tqdm import tqdm
from PIL import Image
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
 
 
+ # ---------------------------
+ # ⚙️ Configuration
+ # ---------------------------
+ MANUALS_DIR = "./Manuals"
CHROMA_PATH = "./chroma_store"
+ CHROMA_COLLECTION = "manual_chunks"
+ CHUNK_SIZE = 750
+ CHUNK_OVERLAP = 100
+ EMBED_MODEL = "all-MiniLM-L6-v2"
+ DEFAULT_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"  # full Hub repo id for Llama 3 8B Instruct
+ AVAILABLE_MODELS = [
+     "meta-llama/Meta-Llama-3-8B-Instruct",
+     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+     "google/gemma-1.1-7b-it",
+     "mistralai/Mistral-7B-Instruct-v0.3",
+     "Qwen/Qwen1.5-7B-Chat"
+ ]
HF_TOKEN = os.environ.get("HF_TOKEN")
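+ # NOTE: the Meta Llama and Gemma checkpoints are gated on the Hugging Face Hub;
+ # HF_TOKEN must belong to an account that has been granted access to them.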
 
+ # ---------------------------
+ # 📚 Load NLP model for sentence splitting
+ # ---------------------------
+ try:
+     nlp = spacy.load("en_core_web_sm")
+ except OSError:
+     # first run: the small English model is not installed yet, so fetch it once
+     os.system("python -m spacy download en_core_web_sm")
+     nlp = spacy.load("en_core_web_sm")
+
+ def split_sentences(text):
+     return [sent.text.strip() for sent in nlp(text).sents if sent.text.strip()]
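+ # (A lighter alternative, if the model download is unwanted: nlp = spacy.blank("en")
+ #  with nlp.add_pipe("sentencizer") gives rule-based sentence splitting only.)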
 
 
 
 
+ # ---------------------------
+ # 🧹 Text cleanup
+ # ---------------------------
def clean(text):
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

+ # ---------------------------
+ # 📄 PDF and DOCX extractors
+ # ---------------------------
+ def extract_pdf_text(path):
+     doc = fitz.open(path)
+     pages = []
+     for i, page in enumerate(doc):
+         text = page.get_text()
+         if not text.strip():
+             # scanned page with no text layer: rasterize at 300 dpi and OCR it
+             pix = page.get_pixmap(dpi=300)
+             img = Image.open(io.BytesIO(pix.tobytes("png")))
+             text = pytesseract.image_to_string(img)
+         pages.append((i + 1, text))
+     return pages
+
+ def extract_docx_text(path):
+     doc = docx.Document(path)
+     full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
+     return [(1, full_text)]
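+ # (python-docx exposes no pagination, so the whole DOCX is treated as page 1.)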
+
+ # ---------------------------
+ # 📦 Chunk splitter
+ # ---------------------------
+ def chunkify(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    chunks = []
    current = []
+     length = 0
+     for s in sentences:
+         tokens = len(s.split())
+         if length + tokens > max_tokens:
            chunks.append(" ".join(current))
            current = current[-overlap:]
+             length = sum(len(w.split()) for w in current)
+         current.append(s)
+         length += tokens
    if current:
        chunks.append(" ".join(current))
    return chunks
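+ # NOTE: `overlap` is carried over as a number of trailing *sentences*, while
+ # `max_tokens` counts whitespace-separated words, not model tokens.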

+ # ---------------------------
+ # 🔎 Metadata from file
+ # ---------------------------
+ def extract_meta(name):
+     name = name.lower()
+     return {
+         "model": next((m for m in ["se3", "se4", "symbio", "explore"] if m in name), "unknown"),
+         "doc_type": next((d for d in ["owner", "service", "parts"] if d in name), "unknown"),
+         "brand": "life fitness"
+     }
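+ # e.g. a hypothetical "SE3-Console-Service-Manual.pdf" would map to
+ # {"model": "se3", "doc_type": "service", "brand": "life fitness"}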
+
+ # ---------------------------
+ # 🔠 Embed and store chunks
+ # ---------------------------
def embed_all():
+     embedder = SentenceTransformer(EMBED_MODEL)
+     client = chromadb.PersistentClient(path=CHROMA_PATH)
+     try:
+         client.delete_collection(CHROMA_COLLECTION)
+     except Exception:
+         pass
+     db = client.create_collection(CHROMA_COLLECTION)
+
+     for fname in os.listdir(MANUALS_DIR):
+         path = os.path.join(MANUALS_DIR, fname)
+         if fname.endswith(".pdf"):
+             pages = extract_pdf_text(path)
+         elif fname.endswith(".docx"):
+             pages = extract_docx_text(path)
        else:
            continue
+         meta = extract_meta(fname)
+         for page, text in pages:
+             sents = split_sentences(clean(text))
+             chunks = chunkify(sents)
+             for i, chunk in enumerate(chunks):
+                 db.add(
+                     ids=[f"{fname}::p{page}::c{i}"],
+                     documents=[chunk],
+                     metadatas=[{**meta, "source": fname, "page": page}],
+                     # store vectors from the MiniLM embedder so retrieval uses the same model
+                     embeddings=[embedder.encode(chunk).tolist()]
+                 )
+     return db, embedder
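+ # (One db.add() per chunk keeps the code simple; Chroma's add() also accepts
+ #  batched lists of ids/documents/embeddings, which is much faster at scale.)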
+
+ # ---------------------------
+ # 🤖 Load selected LLM model
+ # ---------------------------
+ def load_model(repo):
+     tokenizer = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN)
+     model = AutoModelForCausalLM.from_pretrained(
+         repo,
+         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+         device_map="auto" if torch.cuda.is_available() else None,
+         token=HF_TOKEN
+     )
+     # no `device=` here: with device_map="auto" the model is already placed, and
+     # transformers rejects a pipeline device on top of an accelerate-dispatched model
+     return pipeline("text-generation", model=model, tokenizer=tokenizer)
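+ # NOTE: answer_query() below reloads the checkpoint on every request; caching the
+ # returned pipeline per repo id would avoid repeated multi-gigabyte loads.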
+
+ # ---------------------------
+ # 📥 Retrieval-Augmented QA
+ # ---------------------------
+ def answer_query(q, model_choice):
+     # embed the query with the same SentenceTransformer used at indexing time
+     results = db.query(query_embeddings=[embedder.encode(q).tolist()], n_results=3)
    context = "\n\n".join(results["documents"][0])
    prompt = f"""
+ You are a helpful assistant. Answer based on the context. If unsure, say "I don't know".
+
+ Context:
+ {context}
+
+ Question: {q}
+ Answer:
+ """
+     pipe = load_model(model_choice)
+     out = pipe(prompt, max_new_tokens=300, do_sample=False)[0]["generated_text"]
+     return out.split("Answer:")[-1].strip()
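+ # (do_sample=False means greedy decoding, so a given prompt and model always
+ #  produce the same answer.)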
+
+ # ---------------------------
+ # 🚀 Initialize app
+ # ---------------------------
+ print("Embedding documents...")
+ db, embedder = embed_all()
+ print("Done embedding.")
+
+ # ---------------------------
+ # 🎛️ Gradio UI
+ # ---------------------------
+ demo = gr.Blocks()
+
+ with demo:
+     gr.Markdown("""# 🧠 SmartManuals-AI
+ Ask any question and let the model answer from your uploaded manuals.
+ """)
    with gr.Row():
+         qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How to reset the SE3 console?")
+         model_select = gr.Dropdown(choices=AVAILABLE_MODELS, label="Choose LLM", value=DEFAULT_MODEL)
+     ansbox = gr.Textbox(label="Answer", lines=10)
+     btn = gr.Button("🔍 Submit")
+     btn.click(fn=answer_query, inputs=[qbox, model_select], outputs=ansbox)

  demo.launch()