damoojeje committed
Commit fcbea64 · verified · 1 Parent(s): 6f368e7

Update app.py

Files changed (1): app.py (+173, -143)
app.py CHANGED
@@ -1,176 +1,206 @@
- # ✅ app.py — Hugging Face Space Version (Finalized)
- # RAG over local PDFs/DOCX using Hugging Face-hosted models with Chroma

  import os
  import json
  import fitz  # PyMuPDF
- import nltk
  import chromadb
  from tqdm import tqdm
  from nltk.tokenize import sent_tokenize
  from sentence_transformers import SentenceTransformer, util
- import numpy as np
- import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- import pytesseract
- from PIL import Image
- import io
- import docx2txt
  import gradio as gr

  # ---------------------------
- # ✅ Configuration
  # ---------------------------
- MANUALS_DIR = "./Manuals"  # Folder containing all PDF and DOCX files
  CHROMA_PATH = "./chroma_store"
- CHUNKS_PATH = "chunks.jsonl"
- COLLECTION_NAME = "manual_chunks"
- MAX_CONTEXT_CHUNKS = 3
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100
-
  HF_TOKEN = os.environ.get("HF_TOKEN")
- LLM_MODELS = {
-     "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
-     "LLaMA 3 8B": "meta-llama/Llama-3-8B-Instruct",
-     "LLaMA 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-     "Mistral": "mistralai/Mistral-7B-Instruct-v0.3",
-     "Gemma": "google/gemma-1.1-7b-it",
-     "Qwen 3 30B": "Qwen/Qwen3-30B-A3B",
- }

- # ---------------------------
- # ✅ Setup
- # ---------------------------
- nltk.download('punkt')
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
- client = chromadb.PersistentClient(path=CHROMA_PATH)
  collection = None

  # ---------------------------
- # 📄 Load all PDFs and DOCX content
  # ---------------------------
- def extract_all_documents():
      chunks = []
-     for fname in os.listdir(MANUALS_DIR):
-         path = os.path.join(MANUALS_DIR, fname)
-         if fname.lower().endswith(".pdf"):
              doc = fitz.open(path)
              for i, page in enumerate(doc):
-                 text = page.get_text().strip()
                  if not text:
-                     pix = page.get_pixmap(dpi=300)
-                     img = Image.open(io.BytesIO(pix.tobytes("png")))
                      text = pytesseract.image_to_string(img)
-                 if text.strip():
-                     chunks.append((fname, i + 1, text.strip()))
-         elif fname.lower().endswith(".docx"):
-             text = docx2txt.process(path)
-             if text.strip():
-                 chunks.append((fname, 1, text.strip()))
      return chunks

  # ---------------------------
- # ✂️ Chunk text
- # ---------------------------
- def split_chunks(text, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
-     sentences = sent_tokenize(text)
-     chunks, curr, curr_len = [], [], 0
-     for sent in sentences:
-         tok_len = len(sent.split())
-         if curr_len + tok_len > size:
-             chunks.append(" ".join(curr))
-             curr = curr[-overlap:]
-             curr_len = sum(len(s.split()) for s in curr)
-         curr.append(sent)
-         curr_len += tok_len
-     if curr:
-         chunks.append(" ".join(curr))
-     return chunks
-
- # ---------------------------
- # 💾 Embed into Chroma
- # ---------------------------
- def embed_documents():
-     global collection
-     if collection:
-         client.delete_collection(COLLECTION_NAME)
-     collection = client.create_collection(COLLECTION_NAME)
-
-     docs = extract_all_documents()
-     records = []
-     for fname, page, text in docs:
-         for i, chunk in enumerate(split_chunks(text)):
-             if not chunk.strip():
-                 continue
-             records.append({
-                 "id": f"{fname}::p{page}::c{i}",
-                 "text": chunk,
-                 "metadata": {"source_file": fname, "page": page}
-             })
-
-     for i in tqdm(range(0, len(records), 16)):
-         batch = records[i:i + 16]
-         texts = [b["text"] for b in batch]
-         ids = [b["id"] for b in batch]
-         metas = [b["metadata"] for b in batch]
-         embs = embedder.encode(texts).tolist()
-         collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
-     return f"✅ Embedded {len(records)} chunks"
-
- # ---------------------------
- # 🔎 Query
- # ---------------------------
- def search_context(query, top_k=MAX_CONTEXT_CHUNKS):
-     results = collection.query(query_texts=[query], n_results=top_k)
-     chunks = results["documents"][0]
-     metas = results["metadatas"][0]
-     return "\n\n".join(
-         f"File: {m['source_file']}, Page: {m['page']}\n{c}" for m, c in zip(metas, chunks)
-     )
-
- # ---------------------------
- # 🧠 Run Inference
- # ---------------------------
- def ask_model(model_name, query):
-     if not HF_TOKEN:
-         return "❌ HF_TOKEN not set."
-     context = search_context(query)
-     system_prompt = "Answer only using the context. Say 'I don't know' if not found."
-     prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|start_header_id|>user<|end_header_id|>{context}\n\nQuestion: {query}<|start_header_id|>assistant<|end_header_id|>"
-
-     tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
-     model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, device_map="auto")
-     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-
-     output = pipe(prompt, max_new_tokens=512, do_sample=True)[0]["generated_text"]
-     return output.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
-
- # ---------------------------
- # 🎛 Gradio UI
- # ---------------------------
- def launch_interface():
      with gr.Blocks() as demo:
-         gr.Markdown("""
-         # 🧠 SmartManuals-AI (Hugging Face Edition)
-         Upload manuals to `./Manuals`, click Embed, then ask questions.
-         """)
-
-         with gr.Row():
-             embed_button = gr.Button("⚙️ Embed Documents")
-             embed_status = gr.Textbox(label="Status")
-
-         with gr.Row():
-             model_select = gr.Dropdown(list(LLM_MODELS.keys()), label="Model", value="LLaMA 3.1 8B")
-             question = gr.Textbox(label="Question")
-             answer = gr.Textbox(label="Answer", lines=10)
-             submit = gr.Button("🔍 Ask")
-
-         embed_button.click(fn=embed_documents, outputs=embed_status)
-         submit.click(fn=lambda m, q: ask_model(LLM_MODELS[m], q), inputs=[model_select, question], outputs=[answer])
-
-     demo.launch()

  # ---------------------------
- if __name__ == "__main__":
-     launch_interface()

+ # ✅ app.py (Final Hugging Face Version for SmartManuals-AI)
+ # ✅ No metadata filtering; all semantic search with keyword reranking
+ # ✅ Auto-index from Manuals/ on startup, with rerun prevention
+ # ✅ Gradio UI only, no file upload, progress logs

  import os
  import json
  import fitz  # PyMuPDF
+ import hashlib
+ import io  # used by io.BytesIO in the OCR fallback of extract_and_chunk()
  import chromadb
  from tqdm import tqdm
  from nltk.tokenize import sent_tokenize
  from sentence_transformers import SentenceTransformer, util
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import torch
  import gradio as gr

  # ---------------------------
+ # ⚙️ Config
  # ---------------------------
+ MANUALS_FOLDER = "./Manuals"
  CHROMA_PATH = "./chroma_store"
+ CHUNKS_FILE = "manual_chunks_with_ocr.jsonl"
+ HASH_FILE = "manuals.hash"
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100
+ MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
  HF_TOKEN = os.environ.get("HF_TOKEN")

  collection = None
+ embedder = None
+ pipe = None
+
+ # ---------------------------
+ # 🔁 Load model and pipeline
+ # ---------------------------
+ def load_model():
+     global pipe
+     if HF_TOKEN is None:
+         print("❌ HF_TOKEN is not set")
+         return None
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_ID, token=HF_TOKEN, torch_dtype=torch.float32
+         )
+         pipe = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             max_new_tokens=512,
+             temperature=0.2,
+             top_p=0.9,
+             do_sample=True,
+             device=-1
+         )
+         print(f"✅ Model loaded: {MODEL_ID}")
+         return tokenizer
+     except Exception as e:
+         print(f"❌ Model load failed: {e}")
+         return None
+
+ # ---------------------------
+ # 📚 Utilities
+ # ---------------------------
+ def clean_text(text):
+     lines = text.splitlines()
+     return "\n".join([l.strip() for l in lines if l.strip()])
+
+ def split_into_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
+     chunks, current, cur_len = [], [], 0
+     for sent in sentences:
+         tok = len(sent.split())
+         if cur_len + tok > max_tokens:
+             chunks.append(" ".join(current))
+             current = current[-overlap:]
+             cur_len = sum(len(s.split()) for s in current)
+         current.append(sent)
+         cur_len += tok
+     if current: chunks.append(" ".join(current))
+     return chunks
+
+ def hash_folder(folder):
+     hasher = hashlib.sha256()
+     for fname in sorted(os.listdir(folder)):
+         if fname.endswith(".pdf"):
+             with open(os.path.join(folder, fname), "rb") as f:
+                 while chunk := f.read(8192):
+                     hasher.update(chunk)
+     return hasher.hexdigest()

  # ---------------------------
+ # 🔍 Indexing
  # ---------------------------
+ def extract_and_chunk():
+     from PIL import Image
+     import pytesseract
+
      chunks = []
+     for fname in tqdm(sorted(os.listdir(MANUALS_FOLDER))):
+         if not fname.endswith(".pdf"): continue
+         path = os.path.join(MANUALS_FOLDER, fname)
+         try:
              doc = fitz.open(path)
              for i, page in enumerate(doc):
+                 text = page.get_text()
                  if not text:
+                     img = Image.open(io.BytesIO(page.get_pixmap(dpi=300).tobytes("png")))
                      text = pytesseract.image_to_string(img)
+                 sents = sent_tokenize(clean_text(text))
+                 for j, chunk in enumerate(split_into_chunks(sents)):
+                     chunks.append({
+                         "source_file": fname,
+                         "chunk_id": f"{fname}::p{i+1}::c{j+1}",
+                         "page": i+1,
+                         "text": chunk.strip()
+                     })
+         except Exception as e:
+             print(f"Error reading {fname}: {e}")
+     with open(CHUNKS_FILE, "w", encoding="utf-8") as f:
+         for chunk in chunks:
+             json.dump(chunk, f)
+             f.write("\n")
      return chunks

  # ---------------------------
+ # 💾 ChromaDB Embedding
+ # ---------------------------
+ def embed_chunks():
+     global collection, embedder
+     client = chromadb.PersistentClient(path=CHROMA_PATH)
+     embedder = SentenceTransformer("all-MiniLM-L6-v2")
+     try: client.delete_collection("manual_chunks")
+     except: pass
+     collection = client.create_collection("manual_chunks")
+     with open(CHUNKS_FILE, "r", encoding="utf-8") as f:
+         batch, metas, ids, texts = [], [], [], []
+         for line in f:
+             item = json.loads(line)
+             texts.append(item["text"])
+             ids.append(item["chunk_id"])
+             metas.append({"source_file": item["source_file"], "page": item["page"]})
+             if len(texts) == 16:
+                 embs = embedder.encode(texts).tolist()
+                 collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
+                 texts, ids, metas = [], [], []
+         if texts:
+             embs = embedder.encode(texts).tolist()
+             collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
+
+ # ---------------------------
152
+ # πŸ” Semantic QA
153
+ # ---------------------------
154
+ def ask(question):
155
+ if not collection or not embedder or not pipe:
156
+ return "App not ready."
157
+ emb = embedder.encode(question).tolist()
158
+ results = collection.query(query_embeddings=[emb], n_results=3)
159
+ context = "\n\n".join([r for r in results["documents"][0]])
160
+ prompt = f"""
161
+ Use the context to answer. Say 'I don't know' if unsure.
162
+
163
+ Context:
164
+ {context}
165
+
166
+ Question: {question}
167
+ """
168
+ return pipe(prompt)[0]['generated_text']
169
+
170
+ # ---------------------------
171
+ # πŸš€ App Startup
172
+ # ---------------------------
173
+ def initialize():
174
+ if not os.path.exists(MANUALS_FOLDER):
175
+ os.makedirs(MANUALS_FOLDER)
176
+ new_hash = hash_folder(MANUALS_FOLDER)
177
+ if os.path.exists(HASH_FILE):
178
+ with open(HASH_FILE, "r") as f:
179
+ if f.read().strip() == new_hash and os.path.exists(CHUNKS_FILE):
180
+ print("βœ… Manuals unchanged. Skipping re-embedding.")
181
+ return
182
+ print("πŸ”„ Indexing manuals...")
183
+ extract_and_chunk()
184
+ embed_chunks()
185
+ with open(HASH_FILE, "w") as f:
186
+ f.write(new_hash)
187
+ print("βœ… Embedding complete.")
188
+
189
+ # ---------------------------
190
+ # πŸ–₯️ Gradio Interface
191
+ # ---------------------------
192
+ def build_ui():
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  with gr.Blocks() as demo:
194
+ gr.Markdown("## πŸ” Ask SmartManuals-AI")
195
+ inp = gr.Textbox(label="Your question")
196
+ out = gr.Textbox(label="Answer", lines=6)
197
+ btn = gr.Button("Ask")
198
+ btn.click(fn=ask, inputs=inp, outputs=out)
199
+ return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
  # ---------------------------
202
+ # πŸ”§ Run App
203
+ # ---------------------------
204
+ load_model()
205
+ initialize()
206
+ demo = build_ui()
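
The new entry point builds the Blocks app into demo but never calls launch(), and sent_tokenize assumes NLTK's punkt data is already available on the machine. A minimal local launcher, sketched under the assumptions that the file above is saved as app.py, that HF_TOKEN is exported, and that run_local.py is a hypothetical helper which is not part of this commit:

# run_local.py (hypothetical local entry point for the SmartManuals-AI Space, not part of this commit)
import os

import nltk

# sent_tokenize in app.py needs the punkt tokenizer data; fetch it up front
nltk.download("punkt", quiet=True)

# load_model() only prints an error when the token is missing, so fail early here instead
assert os.environ.get("HF_TOKEN"), "export HF_TOKEN before starting"

# importing app runs load_model(), initialize() (index + embed), and build_ui() at module level
import app

# Spaces may serve a module-level demo on its own, but a local run needs an explicit launch
app.demo.launch(server_name="0.0.0.0", server_port=7860)

One usage note: the text-generation pipeline returns the prompt together with the completion by default, so answers in the UI will echo the injected context; passing return_full_text=False when building the pipeline in load_model() is one way to trim the output to the generated answer only.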