damoojeje committed · commit 6f368e7 (verified) · parent: 86bcdf2

Update app.py

Files changed (1):
  1. app.py +154 -157
app.py CHANGED
@@ -1,179 +1,176 @@
- # ✅ Hugging Face-ready `app.py` for SmartManuals-AI
- # Supports PDF/DOCX upload, embedding, querying via multiple HF models, and OCR fallback

  import os
  import fitz  # PyMuPDF
  import nltk
- import json
- import io
- import docx2txt
- import pytesseract
  import chromadb
- import gradio as gr
- import torch
  from tqdm import tqdm
- from PIL import Image
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
- from sentence_transformers import SentenceTransformer, util
  from nltk.tokenize import sent_tokenize

- nltk.download("punkt")
-
- # ----------------------------
- # Configuration
- # ----------------------------
  CHROMA_PATH = "./chroma_store"
  COLLECTION_NAME = "manual_chunks"
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100
- MAX_CONTEXT = 3
- HF_MODELS = [
-     "meta-llama/Llama-3-8B-Instruct",
-     "meta-llama/Llama-3.1-8B-Instruct",
-     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-     "mistralai/Mistral-7B-Instruct-v0.3",
-     "google/gemma-1.1-7b-it",
-     "Qwen/Qwen3-30B-A3B",
- ]
- HF_TOKEN = os.environ.get("HF_TOKEN")
-
- # ----------------------------
- # Utilities
- # ----------------------------
- def clean_text(text):
-     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
-
- def split_sentences(text):
-     return sent_tokenize(text)
-
- def chunk_sentences(sentences):
-     chunks, chunk, length = [], [], 0
      for sent in sentences:
-         tokens = len(sent.split())
-         if length + tokens > CHUNK_SIZE:
-             chunks.append(" ".join(chunk))
-             chunk = chunk[-CHUNK_OVERLAP:]
-             length = sum(len(s.split()) for s in chunk)
-         chunk.append(sent)
-         length += tokens
-     if chunk:
-         chunks.append(" ".join(chunk))
      return chunks

- def extract_text_pdf(file):
-     doc = fitz.open(stream=file.read(), filetype="pdf")
-     texts = []
-     for page in doc:
-         text = page.get_text()
-         if not text.strip():
-             pix = page.get_pixmap(dpi=300)
-             img = Image.open(io.BytesIO(pix.tobytes("png")))
-             text = pytesseract.image_to_string(img)
-         texts.append(text)
-     return texts
-
- def extract_text_docx(file):
-     return [docx2txt.process(file)]
-
- def extract_metadata(filename):
-     lower = filename.lower()
-     model = next((m for m in [
-         "se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl",
-         "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"
-     ] if m in lower.replace(" ", "")), "unknown")
-
-     doc_type = "unknown"
-     if "om" in lower or "owner" in lower:
-         doc_type = "owner manual"
-     elif "sm" in lower or "service" in lower:
-         doc_type = "service manual"
-     elif "assembly" in lower:
-         doc_type = "assembly instructions"
-     elif "parts" in lower:
-         doc_type = "parts manual"
-     elif "bulletin" in lower:
-         doc_type = "service bulletin"
-
-     return model, doc_type
-
- # ----------------------------
- # Embedding pipeline
- # ----------------------------
- def embed_docs(files, progress=gr.Progress()):
-     embedder = SentenceTransformer("all-MiniLM-L6-v2")
-     client = chromadb.PersistentClient(path=CHROMA_PATH)
-     try:
          client.delete_collection(COLLECTION_NAME)
-     except: pass
      collection = client.create_collection(COLLECTION_NAME)

-     texts, ids, metadatas = [], [], []
-     i = 0
-     for file in progress.tqdm(files, desc="Embedding files"):
-         filename = os.path.basename(file.name)
-         ext = filename.lower().split(".")[-1]
-         raw_texts = extract_text_pdf(file) if ext == "pdf" else extract_text_docx(file)
-         model, doc_type = extract_metadata(filename)
-         for page, text in enumerate(raw_texts):
-             sents = split_sentences(clean_text(text))
-             for j, chunk in enumerate(chunk_sentences(sents)):
-                 texts.append(chunk)
-                 ids.append(f"{filename}::p{page+1}::c{j+1}")
-                 metadatas.append({"source_file": filename, "page": page+1, "model": model, "doc_type": doc_type})
-                 i += 1
-                 if len(texts) >= 16:
-                     collection.add(documents=texts, metadatas=metadatas, ids=ids,
-                                    embeddings=embedder.encode(texts).tolist())
-                     texts, metadatas, ids = [], [], []
-     if texts:
-         collection.add(documents=texts, metadatas=metadatas, ids=ids,
-                        embeddings=embedder.encode(texts).tolist())
-     return f"✅ Embedded {i} chunks from {len(files)} files."
-
- # ----------------------------
- # Querying pipeline
- # ----------------------------
- def query_rag(q, model_name):
-     embedder = SentenceTransformer("all-MiniLM-L6-v2")
-     client = chromadb.PersistentClient(path=CHROMA_PATH)
-     collection = client.get_collection(COLLECTION_NAME)
-     chunks = collection.query(query_texts=[q], n_results=MAX_CONTEXT)
-
-     context = "\n\n".join(chunks['documents'][0])
-     prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
- You are a helpful assistant. Only answer from the provided manual context below.
- If unsure, say 'I don't know'.
- <context>
- {context}
- </context>
- <|start_header_id|>user<|end_header_id|>
- {q}<|start_header_id|>assistant<|end_header_id|>"""

      tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
-     model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, torch_dtype=torch.float32)
-     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
-     result = pipe(prompt, max_new_tokens=300)[0]["generated_text"]
-     return result.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
-
- # ----------------------------
- # Gradio Interface
- # ----------------------------
- with gr.Blocks() as demo:
-     gr.Markdown("""# 🧠 SmartManuals-AI (HF Edition)
- Upload PDF or Word documents, embed them locally, and ask technical questions using LLMs (LLaMA 3, Mistral, etc).""")
-
-     with gr.Tab("📥 Upload & Embed"):
-         uploader = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
-         embed_btn = gr.Button("🚀 Embed Files")
-         embed_output = gr.Textbox(label="Embed Log")
-
-     with gr.Tab("❓ Ask a Question"):
-         question = gr.Textbox(label="Your Question")
-         model_select = gr.Dropdown(choices=HF_MODELS, label="Model", value=HF_MODELS[0])
-         ask_btn = gr.Button("💬 Ask")
-         response = gr.Textbox(label="Answer", lines=8)
-
-     embed_btn.click(embed_docs, inputs=uploader, outputs=embed_output)
-     ask_btn.click(query_rag, inputs=[question, model_select], outputs=response)
-
- demo.launch()
+ # ✅ app.py — Hugging Face Space Version (Finalized)
+ # RAG over local PDFs/DOCX using Hugging Face-hosted models with Chroma

  import os
+ import json
  import fitz  # PyMuPDF
  import nltk
  import chromadb
  from tqdm import tqdm
  from nltk.tokenize import sent_tokenize
+ from sentence_transformers import SentenceTransformer, util
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import pytesseract
+ from PIL import Image
+ import io
+ import docx2txt
+ import gradio as gr

+ # ---------------------------
+ # ✅ Configuration
+ # ---------------------------
+ MANUALS_DIR = "./Manuals"  # Folder containing all PDF and DOCX files
  CHROMA_PATH = "./chroma_store"
+ CHUNKS_PATH = "chunks.jsonl"
  COLLECTION_NAME = "manual_chunks"
+ MAX_CONTEXT_CHUNKS = 3
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100

+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ LLM_MODELS = {
+     "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
+     "LLaMA 3 8B": "meta-llama/Llama-3-8B-Instruct",
+     "LLaMA 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+     "Mistral": "mistralai/Mistral-7B-Instruct-v0.3",
+     "Gemma": "google/gemma-1.1-7b-it",
+     "Qwen 3 30B": "Qwen/Qwen3-30B-A3B",
+ }
+
+ # ---------------------------
+ # ✅ Setup
+ # ---------------------------
+ nltk.download('punkt')
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
+ client = chromadb.PersistentClient(path=CHROMA_PATH)
+ collection = None
+
+ # ---------------------------
+ # 📄 Load all PDFs and DOCX content
+ # ---------------------------
+ def extract_all_documents():
+     chunks = []
+     for fname in os.listdir(MANUALS_DIR):
+         path = os.path.join(MANUALS_DIR, fname)
+         if fname.lower().endswith(".pdf"):
+             doc = fitz.open(path)
+             for i, page in enumerate(doc):
+                 text = page.get_text().strip()
+                 if not text:
+                     pix = page.get_pixmap(dpi=300)
+                     img = Image.open(io.BytesIO(pix.tobytes("png")))
+                     text = pytesseract.image_to_string(img)
+                 if text.strip():
+                     chunks.append((fname, i + 1, text.strip()))
+         elif fname.lower().endswith(".docx"):
+             text = docx2txt.process(path)
+             if text.strip():
+                 chunks.append((fname, 1, text.strip()))
+     return chunks

+ # ---------------------------
+ # ✂️ Chunk text
+ # ---------------------------
+ def split_chunks(text, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
+     sentences = sent_tokenize(text)
+     chunks, curr, curr_len = [], [], 0
      for sent in sentences:
+         tok_len = len(sent.split())
+         if curr_len + tok_len > size:
+             chunks.append(" ".join(curr))
+             curr = curr[-overlap:]
+             curr_len = sum(len(s.split()) for s in curr)
+         curr.append(sent)
+         curr_len += tok_len
+     if curr:
+         chunks.append(" ".join(curr))
      return chunks

+ # ---------------------------
+ # 💾 Embed into Chroma
+ # ---------------------------
+ def embed_documents():
+     global collection
+     if collection:
          client.delete_collection(COLLECTION_NAME)
      collection = client.create_collection(COLLECTION_NAME)

+     docs = extract_all_documents()
+     records = []
+     for fname, page, text in docs:
+         for i, chunk in enumerate(split_chunks(text)):
+             if not chunk.strip():
+                 continue
+             records.append({
+                 "id": f"{fname}::p{page}::c{i}",
+                 "text": chunk,
+                 "metadata": {"source_file": fname, "page": page}
+             })
+
+     for i in tqdm(range(0, len(records), 16)):
+         batch = records[i:i + 16]
+         texts = [b["text"] for b in batch]
+         ids = [b["id"] for b in batch]
+         metas = [b["metadata"] for b in batch]
+         embs = embedder.encode(texts).tolist()
+         collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
+     return f"✅ Embedded {len(records)} chunks"
+
+ # ---------------------------
+ # 🔎 Query
+ # ---------------------------
+ def search_context(query, top_k=MAX_CONTEXT_CHUNKS):
+     results = collection.query(query_texts=[query], n_results=top_k)
+     chunks = results["documents"][0]
+     metas = results["metadatas"][0]
+     return "\n\n".join(
+         f"File: {m['source_file']}, Page: {m['page']}\n{c}" for m, c in zip(metas, chunks)
+     )
+
+ # ---------------------------
+ # 🧠 Run Inference
+ # ---------------------------
+ def ask_model(model_name, query):
+     if not HF_TOKEN:
+         return "❌ HF_TOKEN not set."
+     context = search_context(query)
+     system_prompt = "Answer only using the context. Say 'I don't know' if not found."
+     prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|start_header_id|>user<|end_header_id|>{context}\n\nQuestion: {query}<|start_header_id|>assistant<|end_header_id|>"

      tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+     model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, device_map="auto")
+     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
+
+     output = pipe(prompt, max_new_tokens=512, do_sample=True)[0]["generated_text"]
+     return output.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+
+ # ---------------------------
+ # 🎛 Gradio UI
+ # ---------------------------
+ def launch_interface():
+     with gr.Blocks() as demo:
+         gr.Markdown("""
+         # 🧠 SmartManuals-AI (Hugging Face Edition)
+         Upload manuals to `./Manuals`, click Embed, then ask questions.
+         """)
+
+         with gr.Row():
+             embed_button = gr.Button("⚙️ Embed Documents")
+             embed_status = gr.Textbox(label="Status")
+
+         with gr.Row():
+             model_select = gr.Dropdown(list(LLM_MODELS.keys()), label="Model", value="LLaMA 3.1 8B")
+             question = gr.Textbox(label="Question")
+             answer = gr.Textbox(label="Answer", lines=10)
+             submit = gr.Button("🔍 Ask")
+
+         embed_button.click(fn=embed_documents, outputs=embed_status)
+         submit.click(fn=lambda m, q: ask_model(LLM_MODELS[m], q), inputs=[model_select, question], outputs=[answer])
+
+     demo.launch()
+
+ # ---------------------------
+ if __name__ == "__main__":
+     launch_interface()
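For reviewers, a minimal smoke test of the revised pipeline run outside the Gradio UI could look like the sketch below. It is not part of this commit: it assumes the new app.py is importable from the Space's working directory, that ./Manuals contains at least one PDF or DOCX, and that HF_TOKEN is exported; the question string is purely illustrative.

# smoke_test.py: minimal sketch, not part of this commit.
# Assumptions: the new app.py is importable, ./Manuals holds at least one manual,
# and HF_TOKEN is set so gated models can be downloaded.
import app

print(app.embed_documents())           # builds the Chroma collection from ./Manuals

question = "How do I level the unit after assembly?"   # hypothetical example question
print(app.search_context(question))    # top-3 chunks with file/page provenance

# Full RAG answer through one of the configured models (slow on CPU-only Spaces)
print(app.ask_model(app.LLM_MODELS["Mistral"], question))

Calling embed_documents() before search_context() matters here, since the module-level `collection` is only populated by that step.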