Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,176 +1,206 @@
|
|
1 |
-
# ✅ app.py
|
2 |
-
#
|
|
|
|
|
3 |
|
4 |
import os
|
5 |
import json
|
6 |
import fitz # PyMuPDF
|
7 |
-
import
|
8 |
import chromadb
|
9 |
from tqdm import tqdm
|
10 |
from nltk.tokenize import sent_tokenize
|
11 |
from sentence_transformers import SentenceTransformer, util
|
12 |
-
import numpy as np
|
13 |
-
import torch
|
14 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
15 |
-
import
|
16 |
-
from PIL import Image
|
17 |
-
import io
|
18 |
-
import docx2txt
|
19 |
import gradio as gr
|
20 |
|
21 |
# ---------------------------
|
22 |
-
#
|
23 |
# ---------------------------
|
24 |
-
|
25 |
CHROMA_PATH = "./chroma_store"
|
26 |
-
|
27 |
-
|
28 |
-
MAX_CONTEXT_CHUNKS = 3
|
29 |
CHUNK_SIZE = 750
|
30 |
CHUNK_OVERLAP = 100
|
31 |
-
|
32 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
33 |
-
LLM_MODELS = {
|
34 |
-
"LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
|
35 |
-
"LLaMA 3 8B": "meta-llama/Llama-3-8B-Instruct",
|
36 |
-
"LLaMA 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
37 |
-
"Mistral": "mistralai/Mistral-7B-Instruct-v0.3",
|
38 |
-
"Gemma": "google/gemma-1.1-7b-it",
|
39 |
-
"Qwen 3 30B": "Qwen/Qwen3-30B-A3B",
|
40 |
-
}
|
41 |
|
42 |
-
# ---------------------------
|
43 |
-
# ✅ Setup
|
44 |
-
# ---------------------------
|
45 |
-
nltk.download('punkt')
|
46 |
-
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
47 |
-
client = chromadb.PersistentClient(path=CHROMA_PATH)
|
48 |
collection = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
# ---------------------------
|
51 |
-
#
|
52 |
# ---------------------------
|
53 |
-
def
|
|
|
|
|
|
|
54 |
chunks = []
|
55 |
-
for fname in os.listdir(
|
56 |
-
|
57 |
-
|
|
|
58 |
doc = fitz.open(path)
|
59 |
for i, page in enumerate(doc):
|
60 |
-
text = page.get_text()
|
61 |
if not text:
|
62 |
-
|
63 |
-
img = Image.open(io.BytesIO(pix.tobytes("png")))
|
64 |
text = pytesseract.image_to_string(img)
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
return chunks
|
72 |
|
73 |
# ---------------------------
|
74 |
-
#
|
75 |
-
# ---------------------------
|
76 |
-
def
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
)
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|start_header_id|>user<|end_header_id|>{context}\n\nQuestion: {query}<|start_header_id|>assistant<|end_header_id|>"
|
141 |
-
|
142 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
|
143 |
-
model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, device_map="auto")
|
144 |
-
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
|
145 |
-
|
146 |
-
output = pipe(prompt, max_new_tokens=512, do_sample=True)[0]["generated_text"]
|
147 |
-
return output.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
|
148 |
-
|
149 |
-
# ---------------------------
|
150 |
-
# π Gradio UI
|
151 |
-
# ---------------------------
|
152 |
-
def launch_interface():
|
153 |
with gr.Blocks() as demo:
|
154 |
-
gr.Markdown(""
|
155 |
-
|
156 |
-
|
157 |
-
""
|
158 |
-
|
159 |
-
|
160 |
-
embed_button = gr.Button("βοΈ Embed Documents")
|
161 |
-
embed_status = gr.Textbox(label="Status")
|
162 |
-
|
163 |
-
with gr.Row():
|
164 |
-
model_select = gr.Dropdown(list(LLM_MODELS.keys()), label="Model", value="LLaMA 3.1 8B")
|
165 |
-
question = gr.Textbox(label="Question")
|
166 |
-
answer = gr.Textbox(label="Answer", lines=10)
|
167 |
-
submit = gr.Button("π Ask")
|
168 |
-
|
169 |
-
embed_button.click(fn=embed_documents, outputs=embed_status)
|
170 |
-
submit.click(fn=lambda m, q: ask_model(LLM_MODELS[m], q), inputs=[model_select, question], outputs=[answer])
|
171 |
-
|
172 |
-
demo.launch()
|
173 |
|
174 |
# ---------------------------
|
175 |
-
|
176 |
-
|
|
|
|
|
|
|
|
1 |
+
# ✅ app.py (Final Hugging Face Version for SmartManuals-AI)
|
2 |
+
# ✅ No metadata filtering; all semantic search with keyword reranking
|
3 |
+
# ✅ Auto-index from Manuals/ on startup, with rerun prevention
|
4 |
+
# ✅ Gradio UI only, no file upload, progress logs
|
5 |
|
6 |
import os
|
7 |
import json
|
8 |
import fitz # PyMuPDF
|
9 |
+
import hashlib
|
10 |
import chromadb
|
11 |
from tqdm import tqdm
|
12 |
from nltk.tokenize import sent_tokenize
|
13 |
from sentence_transformers import SentenceTransformer, util
|
|
|
|
|
14 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
15 |
+
import torch
|
|
|
|
|
|
|
16 |
import gradio as gr
|
17 |
|
18 |
# ---------------------------
|
19 |
+
# βοΈ Config
|
20 |
# ---------------------------
|
21 |
+
MANUALS_FOLDER = "./Manuals"
|
22 |
CHROMA_PATH = "./chroma_store"
|
23 |
+
CHUNKS_FILE = "manual_chunks_with_ocr.jsonl"
|
24 |
+
HASH_FILE = "manuals.hash"
|
|
|
25 |
CHUNK_SIZE = 750
|
26 |
CHUNK_OVERLAP = 100
|
27 |
+
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
|
28 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
collection = None
|
31 |
+
embedder = None
|
32 |
+
pipe = None
|
33 |
+
|
34 |
+
# ---------------------------
|
35 |
+
# π Load model and pipeline
|
36 |
+
# ---------------------------
|
37 |
+
def load_model():
    """Build the module-level text-generation pipeline for ``MODEL_ID``.

    Loads the tokenizer and causal-LM weights using ``HF_TOKEN`` and stores
    the resulting pipeline in the global ``pipe``.

    Returns:
        The loaded tokenizer on success, or ``None`` when ``HF_TOKEN`` is
        unset/empty or loading fails (``pipe`` is left untouched then).
    """
    global pipe
    # An empty token is as unusable as a missing one, so reject both up front.
    # NOTE(review): the leading glyph in the two failure messages is mojibake
    # from the page scrape; the original emoji could not be recovered.
    if not HF_TOKEN:
        print("β HF_TOKEN is not set")
        return None
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, token=HF_TOKEN, torch_dtype=torch.float32
        )
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
            device=-1,  # -1 selects CPU in the transformers pipeline API
        )
    except Exception as e:
        # Top-level boundary: report the failure and let the app start
        # without a model (ask() then answers "App not ready.").
        print(f"β Model load failed: {e}")
        return None
    print(f"✅ Model loaded: {MODEL_ID}")
    return tokenizer
|
62 |
+
|
63 |
+
# ---------------------------
|
64 |
+
# π Utilities
|
65 |
+
# ---------------------------
|
66 |
+
def clean_text(text):
    """Return *text* with every line stripped and blank lines removed.

    The surviving lines are re-joined with a single newline each.
    """
    stripped = (raw.strip() for raw in text.splitlines())
    return "\n".join(line for line in stripped if line)
|
69 |
+
|
70 |
+
def split_into_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Group sentences into chunks of roughly ``max_tokens`` words.

    Sizes are measured in whitespace-separated words. When the next sentence
    would push the current chunk past ``max_tokens``, the chunk is emitted
    and the following chunk is seeded with the trailing sentences of the
    previous one, up to ``overlap`` words in total.

    Args:
        sentences: Iterable of sentence strings, in document order.
        max_tokens: Word budget per chunk. A single sentence longer than the
            budget is still emitted, as its own chunk.
        overlap: Maximum number of words carried over between chunks.

    Returns:
        A list of chunk strings; each is its sentences joined by one space.
    """
    chunks, current, cur_len = [], [], 0
    for sent in sentences:
        tok = len(sent.split())
        # `current` must be non-empty, otherwise an oversized first sentence
        # would emit an empty chunk.
        if current and cur_len + tok > max_tokens:
            chunks.append(" ".join(current))
            # Carry over whole trailing sentences while they fit in the
            # overlap budget. The overlap is a word count, so it cannot be
            # used directly as a slice length over the sentence list.
            carried, carried_len = [], 0
            for prev in reversed(current):
                prev_len = len(prev.split())
                if carried_len + prev_len > overlap:
                    break
                carried.append(prev)
                carried_len += prev_len
            # Drop the overlap if it would push the new chunk over budget.
            if carried_len + tok > max_tokens:
                carried, carried_len = [], 0
            carried.reverse()
            current, cur_len = carried, carried_len
        current.append(sent)
        cur_len += tok
    if current:
        chunks.append(" ".join(current))
    return chunks
|
82 |
+
|
83 |
+
def hash_folder(folder):
    """Return a SHA-256 hex digest fingerprinting the PDFs in *folder*.

    Only direct entries whose name ends in ``.pdf`` are considered, in
    sorted name order. Each file contributes its name, its size and its
    bytes, so renaming a file or moving bytes between files changes the
    digest as well as editing content does.

    Args:
        folder: Path of the directory to fingerprint.

    Returns:
        The hexadecimal SHA-256 digest as a string.
    """
    hasher = hashlib.sha256()
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(".pdf"):
            continue
        path = os.path.join(folder, fname)
        # Name and size act as a per-file header; without them the digest
        # only covers the concatenated bytes of all files.
        size = str(os.path.getsize(path)).encode("ascii")
        hasher.update(os.fsencode(fname) + b"\0" + size + b"\0")
        with open(path, "rb") as f:
            while chunk := f.read(8192):
                hasher.update(chunk)
    return hasher.hexdigest()
|
91 |
|
92 |
# ---------------------------
|
93 |
+
# π Indexing
|
94 |
# ---------------------------
|
95 |
+
def extract_and_chunk():
    """Extract text from every PDF in ``MANUALS_FOLDER`` and chunk it.

    Each page's text layer is read with PyMuPDF; pages without usable text
    are rendered at 300 dpi and passed through Tesseract OCR. Page text is
    cleaned, sentence-tokenized and split with ``split_into_chunks``. All
    chunk records are written to ``CHUNKS_FILE`` as JSON Lines.

    Returns:
        A list of dicts with the keys ``source_file``, ``chunk_id``,
        ``page`` (1-based) and ``text``.
    """
    # Local imports: `io` is not imported at file level, and the OCR
    # dependencies are only needed by this function.
    import io

    from PIL import Image
    import pytesseract

    chunks = []
    for fname in tqdm(sorted(os.listdir(MANUALS_FOLDER))):
        if not fname.endswith(".pdf"):
            continue
        path = os.path.join(MANUALS_FOLDER, fname)
        try:
            # Context manager closes the document even if a page fails.
            with fitz.open(path) as doc:
                for i, page in enumerate(doc):
                    text = page.get_text()
                    # Whitespace-only text layers count as empty, so scanned
                    # pages still reach the OCR fallback.
                    if not text.strip():
                        png_bytes = page.get_pixmap(dpi=300).tobytes("png")
                        img = Image.open(io.BytesIO(png_bytes))
                        text = pytesseract.image_to_string(img)
                    sents = sent_tokenize(clean_text(text))
                    for j, chunk in enumerate(split_into_chunks(sents)):
                        chunks.append({
                            "source_file": fname,
                            "chunk_id": f"{fname}::p{i+1}::c{j+1}",
                            "page": i+1,
                            "text": chunk.strip()
                        })
        except Exception as e:
            # Best effort per file: one unreadable PDF must not abort the
            # whole indexing run.
            print(f"Error reading {fname}: {e}")
    with open(CHUNKS_FILE, "w", encoding="utf-8") as f:
        for chunk in chunks:
            json.dump(chunk, f)
            f.write("\n")
    return chunks
|
125 |
|
126 |
# ---------------------------
|
127 |
+
# πΎ ChromaDB Embedding
|
128 |
+
# ---------------------------
|
129 |
+
def embed_chunks():
    """Rebuild the ``manual_chunks`` Chroma collection from ``CHUNKS_FILE``.

    Creates the persistent client and the sentence embedder, drops any
    existing ``manual_chunks`` collection, then embeds the JSONL records in
    batches of 16 and adds them with their source file and page metadata.
    Sets the module-level ``collection`` and ``embedder``.
    """
    global collection, embedder
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    try:
        client.delete_collection("manual_chunks")
    except Exception:
        # Presumably raised when there is no collection to delete yet (first
        # run). `Exception` rather than a bare except, so KeyboardInterrupt
        # and SystemExit still propagate.
        pass
    collection = client.create_collection("manual_chunks")

    batch_size = 16

    def flush(texts, ids, metas):
        """Embed one batch and add it to the collection."""
        embs = embedder.encode(texts).tolist()
        collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)

    with open(CHUNKS_FILE, "r", encoding="utf-8") as f:
        metas, ids, texts = [], [], []
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            item = json.loads(line)
            texts.append(item["text"])
            ids.append(item["chunk_id"])
            metas.append({"source_file": item["source_file"], "page": item["page"]})
            if len(texts) == batch_size:
                flush(texts, ids, metas)
                texts, ids, metas = [], [], []
        if texts:
            flush(texts, ids, metas)
|
150 |
+
|
151 |
+
# ---------------------------
|
152 |
+
# π Semantic QA
|
153 |
+
# ---------------------------
|
154 |
+
def ask(question):
    """Answer *question* from the three closest manual chunks.

    Embeds the question, queries the Chroma collection for three results,
    puts the retrieved documents into the prompt as context and returns the
    text generated by the pipeline.

    Args:
        question: The user's question as plain text.

    Returns:
        The generated answer, or a short status message when the app is not
        initialised or the question is empty.
    """
    # Explicit None checks: readiness should not depend on how the
    # collection/embedder/pipeline objects define truthiness.
    if collection is None or embedder is None or pipe is None:
        return "App not ready."
    if not question or not question.strip():
        return "Please enter a question."
    emb = embedder.encode(question).tolist()
    results = collection.query(query_embeddings=[emb], n_results=3)
    context = "\n\n".join(results["documents"][0])
    prompt = f"""
Use the context to answer. Say 'I don't know' if unsure.

Context:
{context}

Question: {question}
"""
    # return_full_text=False keeps the prompt (context included) out of the
    # returned text; by default the pipeline echoes it before the answer.
    generated = pipe(prompt, return_full_text=False)
    return generated[0]["generated_text"].strip()
|
169 |
+
|
170 |
+
# ---------------------------
|
171 |
+
# π App Startup
|
172 |
+
# ---------------------------
|
173 |
+
def initialize():
    """Prepare the vector index, re-indexing only when the manuals changed.

    Compares the current hash of ``MANUALS_FOLDER`` with the one stored in
    ``HASH_FILE``. If they match and ``CHUNKS_FILE`` exists, the existing
    ``manual_chunks`` collection and the embedder are loaded into the
    module-level ``collection``/``embedder``. Otherwise, or if that load
    fails, the manuals are extracted, chunked and embedded again and the
    new hash is stored.
    """
    global collection, embedder
    os.makedirs(MANUALS_FOLDER, exist_ok=True)
    new_hash = hash_folder(MANUALS_FOLDER)
    if os.path.exists(HASH_FILE) and os.path.exists(CHUNKS_FILE):
        with open(HASH_FILE, "r") as f:
            unchanged = f.read().strip() == new_hash
        if unchanged:
            # Skipping the re-index must still populate the globals that
            # ask() checks, otherwise the app is never ready after a restart.
            try:
                client = chromadb.PersistentClient(path=CHROMA_PATH)
                collection = client.get_collection("manual_chunks")
                embedder = SentenceTransformer("all-MiniLM-L6-v2")
            except Exception as e:
                # Stored index missing or unreadable: rebuild it below.
                collection = None
                print(f"Existing index could not be loaded, rebuilding: {e}")
            else:
                print("✅ Manuals unchanged. Skipping re-embedding.")
                return
    # NOTE(review): the leading glyph below is mojibake from the page scrape;
    # the original emoji could not be recovered.
    print("π Indexing manuals...")
    extract_and_chunk()
    embed_chunks()
    # The hash is written last, so an interrupted run is retried next start.
    with open(HASH_FILE, "w") as f:
        f.write(new_hash)
    print("✅ Embedding complete.")
|
188 |
+
|
189 |
+
# ---------------------------
|
190 |
+
# 🖥️ Gradio Interface
|
191 |
+
# ---------------------------
|
192 |
+
def build_ui():
    """Assemble the question/answer page and return it without launching.

    The page holds a heading, a question textbox, a six-line answer textbox
    and a button that routes the question through ``ask``.
    """
    with gr.Blocks() as page:
        gr.Markdown("## π Ask SmartManuals-AI")
        question_box = gr.Textbox(label="Your question")
        answer_box = gr.Textbox(label="Answer", lines=6)
        ask_button = gr.Button("Ask")
        ask_button.click(fn=ask, inputs=question_box, outputs=answer_box)
    return page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
|
201 |
# ---------------------------
|
202 |
+
# π§ Run App
|
203 |
+
# ---------------------------
|
204 |
+
# Startup order: load the LLM, make sure the index is ready, then build the UI.
load_model()
initialize()
demo = build_ui()

# Start the server only when executed as a script; the Blocks object was
# previously built but never launched.
if __name__ == "__main__":
    demo.launch()
|