Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,20 +8,19 @@ import torch
|
|
8 |
import nltk
|
9 |
import traceback
|
10 |
import docx2txt
|
|
|
11 |
from PIL import Image
|
12 |
from io import BytesIO
|
13 |
from tqdm import tqdm
|
14 |
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
|
15 |
from sentence_transformers import SentenceTransformer, util
|
16 |
-
from nltk.tokenize import
|
17 |
|
18 |
-
#
|
19 |
-
|
20 |
-
|
21 |
-
except LookupError:
|
22 |
-
nltk.download("punkt")
|
23 |
|
24 |
-
#
|
25 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
26 |
MANUALS_DIR = "Manuals"
|
27 |
CHROMA_PATH = "chroma_store"
|
@@ -33,15 +32,23 @@ MODEL_ID = "ibm-granite/granite-vision-3.2-2b"
|
|
33 |
|
34 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
# ---------------- Text Helpers ----------------
|
37 |
def clean(text):
|
38 |
return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
|
39 |
|
40 |
def split_sentences(text):
|
41 |
try:
|
42 |
-
return
|
43 |
-
except:
|
44 |
-
|
45 |
return text.split(". ")
|
46 |
|
47 |
def split_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
|
@@ -73,14 +80,14 @@ def extract_pdf_text(path):
|
|
73 |
text = pytesseract.image_to_string(img)
|
74 |
chunks.append((path, i + 1, clean(text)))
|
75 |
except Exception as e:
|
76 |
-
|
77 |
return chunks
|
78 |
|
79 |
def extract_docx_text(path):
|
80 |
try:
|
81 |
return [(path, 1, clean(docx2txt.process(path)))]
|
82 |
except Exception as e:
|
83 |
-
|
84 |
return []
|
85 |
|
86 |
# ---------------- Embedding ----------------
|
@@ -96,7 +103,7 @@ def embed_all():
|
|
96 |
collection = client.get_or_create_collection(COLLECTION_NAME)
|
97 |
|
98 |
docs, ids, metas = [], [], []
|
99 |
-
|
100 |
|
101 |
for fname in os.listdir(MANUALS_DIR):
|
102 |
fpath = os.path.join(MANUALS_DIR, fname)
|
@@ -123,7 +130,7 @@ def embed_all():
|
|
123 |
embs = embedder.encode(docs).tolist()
|
124 |
collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
|
125 |
|
126 |
-
|
127 |
return collection, embedder
|
128 |
|
129 |
# ---------------- Model Setup ----------------
|
@@ -156,9 +163,13 @@ def get_answer(question):
|
|
156 |
query_emb = embedder.encode(question, convert_to_tensor=True)
|
157 |
results = db.query(query_texts=[question], n_results=MAX_CONTEXT_CHUNKS)
|
158 |
context = "\n\n".join(results["documents"][0])
|
159 |
-
|
|
|
|
|
|
|
|
|
160 |
except Exception as e:
|
161 |
-
|
162 |
return f"Error: {e}"
|
163 |
|
164 |
# ---------------- UI ----------------
|
@@ -167,7 +178,7 @@ with gr.Blocks() as demo:
|
|
167 |
with gr.Row():
|
168 |
question = gr.Textbox(label="Ask your question")
|
169 |
ask = gr.Button("Ask")
|
170 |
-
answer = gr.Textbox(label="Answer", lines=
|
171 |
ask.click(fn=get_answer, inputs=question, outputs=answer)
|
172 |
|
173 |
# Embed + Load Model at Startup
|
@@ -175,7 +186,7 @@ try:
|
|
175 |
db, embedder = embed_all()
|
176 |
model_pipe, model_tokenizer = load_model()
|
177 |
except Exception as e:
|
178 |
-
|
179 |
db, embedder = None, None
|
180 |
model_pipe, model_tokenizer = None, None
|
181 |
|
|
|
8 |
import nltk
|
9 |
import traceback
|
10 |
import docx2txt
|
11 |
+
import logging
|
12 |
from PIL import Image
|
13 |
from io import BytesIO
|
14 |
from tqdm import tqdm
|
15 |
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
|
16 |
from sentence_transformers import SentenceTransformer, util
|
17 |
+
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
|
18 |
|
19 |
+
# ---------------- Logger Setup ----------------
|
20 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
21 |
+
logger = logging.getLogger("SmartManuals")
|
|
|
|
|
22 |
|
23 |
+
# ---------------- Config ----------------
|
24 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
25 |
MANUALS_DIR = "Manuals"
|
26 |
CHROMA_PATH = "chroma_store"
|
|
|
32 |
|
33 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
34 |
|
35 |
+
# ---------------- Sentence Tokenizer (Persistent) ----------------
|
36 |
+
try:
|
37 |
+
nltk.data.find("tokenizers/punkt")
|
38 |
+
except LookupError:
|
39 |
+
nltk.download("punkt")
|
40 |
+
|
41 |
+
tokenizer_punkt = PunktSentenceTokenizer()
|
42 |
+
|
43 |
# ---------------- Text Helpers ----------------
|
44 |
def clean(text):
|
45 |
return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
|
46 |
|
47 |
def split_sentences(text):
|
48 |
try:
|
49 |
+
return tokenizer_punkt.tokenize(text)
|
50 |
+
except Exception as e:
|
51 |
+
logger.warning("Tokenizer fallback: simple split. Reason: %s", e)
|
52 |
return text.split(". ")
|
53 |
|
54 |
def split_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
|
|
|
80 |
text = pytesseract.image_to_string(img)
|
81 |
chunks.append((path, i + 1, clean(text)))
|
82 |
except Exception as e:
|
83 |
+
logger.error("PDF read error [%s]: %s", path, e)
|
84 |
return chunks
|
85 |
|
86 |
def extract_docx_text(path):
|
87 |
try:
|
88 |
return [(path, 1, clean(docx2txt.process(path)))]
|
89 |
except Exception as e:
|
90 |
+
logger.error("DOCX read error [%s]: %s", path, e)
|
91 |
return []
|
92 |
|
93 |
# ---------------- Embedding ----------------
|
|
|
103 |
collection = client.get_or_create_collection(COLLECTION_NAME)
|
104 |
|
105 |
docs, ids, metas = [], [], []
|
106 |
+
logger.info("π Processing manuals...")
|
107 |
|
108 |
for fname in os.listdir(MANUALS_DIR):
|
109 |
fpath = os.path.join(MANUALS_DIR, fname)
|
|
|
130 |
embs = embedder.encode(docs).tolist()
|
131 |
collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
|
132 |
|
133 |
+
logger.info("✅ Embedded %d chunks.", len(ids))
|
134 |
return collection, embedder
|
135 |
|
136 |
# ---------------- Model Setup ----------------
|
|
|
163 |
query_emb = embedder.encode(question, convert_to_tensor=True)
|
164 |
results = db.query(query_texts=[question], n_results=MAX_CONTEXT_CHUNKS)
|
165 |
context = "\n\n".join(results["documents"][0])
|
166 |
+
source_info = "\n\n".join([
|
167 |
+
f"π Source: {m.get('source', 'N/A')} (Page {m.get('page', 'N/A')})" for m in results["metadatas"][0]
|
168 |
+
])
|
169 |
+
answer = ask_model(question, context, model_pipe, model_tokenizer)
|
170 |
+
return f"{answer}\n\n---\n{source_info}"
|
171 |
except Exception as e:
|
172 |
+
logger.error("❌ Query error: %s", e)
|
173 |
return f"Error: {e}"
|
174 |
|
175 |
# ---------------- UI ----------------
|
|
|
178 |
with gr.Row():
|
179 |
question = gr.Textbox(label="Ask your question")
|
180 |
ask = gr.Button("Ask")
|
181 |
+
answer = gr.Textbox(label="Answer", lines=10)
|
182 |
ask.click(fn=get_answer, inputs=question, outputs=answer)
|
183 |
|
184 |
# Embed + Load Model at Startup
|
|
|
186 |
db, embedder = embed_all()
|
187 |
model_pipe, model_tokenizer = load_model()
|
188 |
except Exception as e:
|
189 |
+
logger.exception("❌ Startup failure: %s", e)
|
190 |
db, embedder = None, None
|
191 |
model_pipe, model_tokenizer = None, None
|
192 |
|