damoojeje committed
Commit 835a614 · verified · 1 Parent(s): 6728736

Update app.py

Files changed (1)
  1. app.py +157 -165
app.py CHANGED
@@ -1,187 +1,179 @@
- # ✅ app.py (SmartManuals-AI)
- # Hugging Face Space-ready app with multi-model support, PDF upload, and live progress feedback

  import os
- import json
  import fitz  # PyMuPDF
  import nltk
- import chromadb
- import tempfile
- import shutil
  import pytesseract
  import gradio as gr
- from PIL import Image
  from tqdm import tqdm
- from nltk.tokenize import sent_tokenize
  from sentence_transformers import SentenceTransformer, util
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-
- # ---------------------------
- # 🔧 CONFIG
- # ---------------------------
- pdf_folder = "Manuals"
- output_jsonl_chunks = "chunks.jsonl"
- chroma_path = "./chroma_store"
- collection_name = "manual_chunks"
- chunk_size = 750
- chunk_overlap = 100
- MAX_CONTEXT_CHUNKS = 3
  HF_TOKEN = os.environ.get("HF_TOKEN")

- MODEL_MAP = {
-     "LLaMA 3 (8B)": "meta-llama/Meta-Llama-3-8B-Instruct",
-     "LLaMA 4 Scout (17B)": "meta-llama/Meta-Llama-4-Scout-17B-16E-Instruct",
-     "Gemma 3 (27B)": "google/gemma-3-27b-it",
-     "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
-     "Qwen3 (30B)": "Qwen/Qwen3-30B-A3B"
- }
-
- # ---------------------------
- # 📥 UTILITIES
- # ---------------------------
  def clean_text(text):
      return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

- def tokenize_sentences(text):
-     nltk.download('punkt', quiet=True)
      return sent_tokenize(text)

- def split_into_chunks(sentences, max_tokens=750, overlap=100):
-     chunks, current_chunk, current_len = [], [], 0
-     for sentence in sentences:
-         token_count = len(sentence.split())
-         if current_len + token_count > max_tokens and current_chunk:
-             chunks.append(" ".join(current_chunk))
-             current_chunk = current_chunk[-overlap:]
-             current_len = sum(len(s.split()) for s in current_chunk)
-         current_chunk.append(sentence)
-         current_len += token_count
-     if current_chunk:
-         chunks.append(" ".join(current_chunk))
      return chunks

- def extract_metadata_from_filename(filename):
-     name = filename.lower().replace("_", " ").replace("-", " ")
-     meta = {"model": "unknown", "doc_type": "unknown", "brand": "life fitness"}
-     if "om" in name: meta["doc_type"] = "owner manual"
-     elif "sm" in name: meta["doc_type"] = "service manual"
-     elif "assembly" in name: meta["doc_type"] = "assembly instructions"
-     elif "alert" in name: meta["doc_type"] = "installer alert"
-     elif "parts" in name: meta["doc_type"] = "parts manual"
-     known_models = ["se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl", "everest", "engage"]
-     for model in known_models:
-         if model.replace(" ", "") in name.replace(" ", ""):
-             meta["model"] = model
-     return meta
-
- def extract_text_with_ocr(page):
-     text = page.get_text().strip()
-     if text:
-         return text
-     pix = page.get_pixmap(dpi=300)
-     img_data = pix.tobytes("png")
-     img = Image.open(tempfile.SpooledTemporaryFile())
-     img.fp.write(img_data)
-     img.fp.seek(0)
-     return pytesseract.image_to_string(img).strip()
-
- # ---------------------------
- # 🧠 EMBEDDING + CHROMA
- # ---------------------------
- def embed_pdfs_from_uploaded(files, progress=gr.Progress(track_tqdm=True)):
-     os.makedirs(pdf_folder, exist_ok=True)
-     temp_chunks = []
-     for file in files:
          filename = os.path.basename(file.name)
-         dst = os.path.join(pdf_folder, filename)
-         shutil.copy(file.name, dst)
-         doc = fitz.open(dst)
-         meta = extract_metadata_from_filename(filename)
-         for page_num, page in enumerate(doc, start=1):
-             text = extract_text_with_ocr(page)
-             sents = tokenize_sentences(clean_text(text))
-             chunks = split_into_chunks(sents, chunk_size, chunk_overlap)
-             for i, chunk in enumerate(chunks):
-                 temp_chunks.append({
-                     "chunk_id": f"{filename}::page_{page_num}::chunk_{i+1}",
-                     "source_file": filename,
-                     "page": page_num,
-                     "text": chunk,
-                     **meta
-                 })
-
-     with open(output_jsonl_chunks, "w", encoding="utf-8") as f:
-         for c in temp_chunks:
-             json.dump(c, f)
-             f.write("\n")
-
      embedder = SentenceTransformer("all-MiniLM-L6-v2")
-     client = chromadb.PersistentClient(path=chroma_path)
-     if collection_name in [c.name for c in client.list_collections()]:
-         client.delete_collection(collection_name)
-     collection = client.create_collection(collection_name)
-
-     for i in tqdm(range(0, len(temp_chunks), 16)):
-         batch = temp_chunks[i:i+16]
-         texts = [b["text"] for b in batch]
-         metadatas = [b for b in batch]
-         ids = [b["chunk_id"] for b in batch]
-         embeddings = embedder.encode(texts).tolist()
-         collection.add(documents=texts, ids=ids, metadatas=metadatas, embeddings=embeddings)
-
-     return collection, embedder
-
- # ---------------------------
- # 🤖 LLM INFERENCE
- # ---------------------------
- def load_llm(model_key):
-     model_id = MODEL_MAP.get(model_key)
-     if not model_id or not HF_TOKEN:
-         return None, None, None
-     tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
-     model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, device_map="auto")
-     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
-     return tokenizer, model, pipe
-
- def generate_answer(pipe, tokenizer, context, query):
-     messages = [
-         {"role": "system", "content": "You are an expert manual assistant. Answer accurately using only the context."},
-         {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
-     ]
-     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     output = pipe(prompt)[0]["generated_text"]
-     return output.split("\n")[-1].strip()
-
- # ---------------------------
- # 🎯 FULL PIPELINE
- # ---------------------------
- def rag_pipeline(query, model_key, files):
-     collection, embedder = embed_pdfs_from_uploaded(files)
-     query_embedding = embedder.encode(query, convert_to_tensor=True)
-     results = collection.query(query_texts=[query], n_results=MAX_CONTEXT_CHUNKS)
-     if not results["documents"]:
-         return "No matches found."
-
-     context = "\n\n".join(results["documents"][0])
-     tokenizer, model, pipe = load_llm(model_key)
-     if pipe:
-         return generate_answer(pipe, tokenizer, context, query)
-     return "Model could not be loaded."
-
- # ---------------------------
- # 🖥️ GRADIO UI
- # ---------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("""# 🧠 SmartManuals-AI with Multi-Model RAG
- Upload your PDF manuals and ask smart questions. Choose your preferred LLM.""")
-     with gr.Row():
-         file_upload = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload Manuals")
-     with gr.Row():
-         query_box = gr.Textbox(label="Question")
-         model_selector = gr.Dropdown(label="Choose Model", choices=list(MODEL_MAP.keys()), value="LLaMA 3 (8B)")
-     submit_btn = gr.Button("Run Query")
-     answer_box = gr.Textbox(label="Answer", lines=8)
-
-     submit_btn.click(fn=rag_pipeline, inputs=[query_box, model_selector, file_upload], outputs=[answer_box])

  demo.launch()
+ # ✅ Hugging Face-ready `app.py` for SmartManuals-AI
+ # Supports PDF/DOCX upload, embedding, querying via multiple HF models, and OCR fallback

  import os
  import fitz  # PyMuPDF
  import nltk
+ import json
+ import io
+ import docx2txt
  import pytesseract
+ import chromadb
  import gradio as gr
+ import torch
  from tqdm import tqdm
+ from PIL import Image
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
  from sentence_transformers import SentenceTransformer, util
+ from nltk.tokenize import sent_tokenize
+
+ nltk.download("punkt")
+
+ # ----------------------------
+ # Configuration
+ # ----------------------------
+ CHROMA_PATH = "./chroma_store"
+ COLLECTION_NAME = "manual_chunks"
+ CHUNK_SIZE = 750
+ CHUNK_OVERLAP = 100
+ MAX_CONTEXT = 3
+ HF_MODELS = [
+     "meta-llama/Meta-Llama-3-8B-Instruct",
+     "meta-llama/Llama-3.1-8B-Instruct",
+     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+     "mistralai/Mistral-7B-Instruct-v0.3",
+     "google/gemma-1.1-7b-it",
+     "Qwen/Qwen3-30B-A3B",
+ ]
  HF_TOKEN = os.environ.get("HF_TOKEN")
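+ # NOTE: the Meta Llama and Gemma checkpoints above are gated on the Hugging Face Hub,
+ # so HF_TOKEN must belong to an account that has been granted access to them.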

+ # ----------------------------
+ # Utilities
+ # ----------------------------
  def clean_text(text):
      return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

+ def split_sentences(text):
      return sent_tokenize(text)

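+ # Greedy sentence packer: a chunk is capped at CHUNK_SIZE whitespace-separated words,
+ # and each new chunk restarts from the last CHUNK_OVERLAP sentences of the previous one.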
+ def chunk_sentences(sentences):
+     chunks, chunk, length = [], [], 0
+     for sent in sentences:
+         tokens = len(sent.split())
+         if length + tokens > CHUNK_SIZE and chunk:
+             chunks.append(" ".join(chunk))
+             chunk = chunk[-CHUNK_OVERLAP:]
+             length = sum(len(s.split()) for s in chunk)
+         chunk.append(sent)
+         length += tokens
+     if chunk:
+         chunks.append(" ".join(chunk))
      return chunks

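+ # Per-page text extraction: use the PDF text layer when present, otherwise fall back
+ # to Tesseract OCR on a 300-dpi render of the page.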
+ def extract_text_pdf(file):
+     doc = fitz.open(file.name)
+     texts = []
+     for page in doc:
+         text = page.get_text()
+         if not text.strip():
+             pix = page.get_pixmap(dpi=300)
+             img = Image.open(io.BytesIO(pix.tobytes("png")))
+             text = pytesseract.image_to_string(img)
+         texts.append(text)
+     return texts
+
+ def extract_text_docx(file):
+     return [docx2txt.process(file.name)]
+
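+ # Infer machine model and document type from the filename (e.g. "om" -> owner manual,
+ # "sm" -> service manual); anything unrecognized is left as "unknown".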
+ def extract_metadata(filename):
+     lower = filename.lower()
+     model = next((m for m in [
+         "se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl",
+         "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"
+     ] if m.replace(" ", "") in lower.replace(" ", "")), "unknown")
+
+     doc_type = "unknown"
+     if "om" in lower or "owner" in lower:
+         doc_type = "owner manual"
+     elif "sm" in lower or "service" in lower:
+         doc_type = "service manual"
+     elif "assembly" in lower:
+         doc_type = "assembly instructions"
+     elif "parts" in lower:
+         doc_type = "parts manual"
+     elif "bulletin" in lower:
+         doc_type = "service bulletin"
+
+     return model, doc_type
+
+ # ----------------------------
+ # Embedding pipeline
+ # ----------------------------
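+ # Rebuilds the Chroma collection from scratch, then embeds chunks with all-MiniLM-L6-v2
+ # and adds them to the collection in batches of 16.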
+ def embed_docs(files, progress=gr.Progress()):
+     embedder = SentenceTransformer("all-MiniLM-L6-v2")
+     client = chromadb.PersistentClient(path=CHROMA_PATH)
+     try:
+         client.delete_collection(COLLECTION_NAME)
+     except Exception:
+         pass
+     collection = client.create_collection(COLLECTION_NAME)
+
+     texts, ids, metadatas = [], [], []
+     i = 0
+     for file in progress.tqdm(files, desc="Embedding files"):
          filename = os.path.basename(file.name)
+         ext = filename.lower().split(".")[-1]
+         raw_texts = extract_text_pdf(file) if ext == "pdf" else extract_text_docx(file)
+         model, doc_type = extract_metadata(filename)
+         for page, text in enumerate(raw_texts):
+             sents = split_sentences(clean_text(text))
+             for j, chunk in enumerate(chunk_sentences(sents)):
+                 texts.append(chunk)
+                 ids.append(f"{filename}::p{page+1}::c{j+1}")
+                 metadatas.append({"source_file": filename, "page": page+1, "model": model, "doc_type": doc_type})
+                 i += 1
+                 if len(texts) >= 16:
+                     collection.add(documents=texts, metadatas=metadatas, ids=ids,
+                                    embeddings=embedder.encode(texts).tolist())
+                     texts, metadatas, ids = [], [], []
+     if texts:
+         collection.add(documents=texts, metadatas=metadatas, ids=ids,
+                        embeddings=embedder.encode(texts).tolist())
+     return f"✅ Embedded {i} chunks from {len(files)} files."
+
+ # ----------------------------
+ # Querying pipeline
+ # ----------------------------
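+ # Retrieve the MAX_CONTEXT most similar chunks from Chroma and prompt the selected model.
+ # NOTE: the <|start_header_id|> prompt template below is Llama 3 specific; the other
+ # entries in HF_MODELS expect different chat formats.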
+ def query_rag(q, model_name):
      embedder = SentenceTransformer("all-MiniLM-L6-v2")
+     client = chromadb.PersistentClient(path=CHROMA_PATH)
+     collection = client.get_collection(COLLECTION_NAME)
+     chunks = collection.query(query_texts=[q], n_results=MAX_CONTEXT)
+
+     context = "\n\n".join(chunks['documents'][0])
+     prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are a helpful assistant. Only answer from the provided manual context below.
+ If unsure, say 'I don't know'.
+ <context>
+ {context}
+ </context>
+ <|start_header_id|>user<|end_header_id|>
+ {q}<|start_header_id|>assistant<|end_header_id|>"""
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+     model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, torch_dtype=torch.float32)
+     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
+     result = pipe(prompt, max_new_tokens=300)[0]["generated_text"]
+     return result.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+
+ # ----------------------------
+ # Gradio Interface
+ # ----------------------------
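+ # Two-tab UI: one tab uploads and embeds documents, the other queries them.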
  with gr.Blocks() as demo:
+     gr.Markdown("""# 🧠 SmartManuals-AI (HF Edition)
+ Upload PDF or Word documents, embed them locally, and ask technical questions using LLMs (LLaMA 3, Mistral, etc).""")
+
+     with gr.Tab("📥 Upload & Embed"):
+         uploader = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
+         embed_btn = gr.Button("🚀 Embed Files")
+         embed_output = gr.Textbox(label="Embed Log")
+
+     with gr.Tab("❓ Ask a Question"):
+         question = gr.Textbox(label="Your Question")
+         model_select = gr.Dropdown(choices=HF_MODELS, label="Model", value=HF_MODELS[0])
+         ask_btn = gr.Button("💬 Ask")
+         response = gr.Textbox(label="Answer", lines=8)
+
+     embed_btn.click(embed_docs, inputs=uploader, outputs=embed_output)
+     ask_btn.click(query_rag, inputs=[question, model_select], outputs=response)

  demo.launch()