damoojeje committed
Commit ad0baa1 · verified · 1 Parent(s): 05755ed

Update app.py

Files changed (1)
  1. app.py +29 -18
app.py CHANGED
@@ -8,20 +8,19 @@ import torch
 import nltk
 import traceback
 import docx2txt
+import logging
 from PIL import Image
 from io import BytesIO
 from tqdm import tqdm
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer, util
-from nltk.tokenize import sent_tokenize
+from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
 
-# Ensure punkt is downloaded
-try:
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")
+# ---------------- Logger Setup ----------------
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger("SmartManuals")
 
-# Configuration
+# ---------------- Config ----------------
 HF_TOKEN = os.getenv("HF_TOKEN")
 MANUALS_DIR = "Manuals"
 CHROMA_PATH = "chroma_store"
@@ -33,15 +32,23 @@ MODEL_ID = "ibm-granite/granite-vision-3.2-2b"
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+# ---------------- Sentence Tokenizer (Persistent) ----------------
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt")
+
+tokenizer_punkt = PunktSentenceTokenizer()
+
 # ---------------- Text Helpers ----------------
 def clean(text):
     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
 
 def split_sentences(text):
     try:
-        return sent_tokenize(text)
-    except:
-        print("⚠️ Tokenizer fallback: simple split.")
+        return tokenizer_punkt.tokenize(text)
+    except Exception as e:
+        logger.warning("Tokenizer fallback: simple split. Reason: %s", e)
         return text.split(". ")
 
 def split_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
@@ -73,14 +80,14 @@ def extract_pdf_text(path):
             text = pytesseract.image_to_string(img)
             chunks.append((path, i + 1, clean(text)))
     except Exception as e:
-        print("❌ PDF read error:", path, e)
+        logger.error("PDF read error [%s]: %s", path, e)
     return chunks
 
 def extract_docx_text(path):
     try:
         return [(path, 1, clean(docx2txt.process(path)))]
     except Exception as e:
-        print("❌ DOCX read error:", path, e)
+        logger.error("DOCX read error [%s]: %s", path, e)
         return []
 
 # ---------------- Embedding ----------------
@@ -96,7 +103,7 @@ def embed_all():
     collection = client.get_or_create_collection(COLLECTION_NAME)
 
     docs, ids, metas = [], [], []
-    print("📄 Processing manuals...")
+    logger.info("📄 Processing manuals...")
 
     for fname in os.listdir(MANUALS_DIR):
         fpath = os.path.join(MANUALS_DIR, fname)
@@ -123,7 +130,7 @@ def embed_all():
     embs = embedder.encode(docs).tolist()
     collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
 
-    print(f"✅ Embedded {len(ids)} chunks.")
+    logger.info("✅ Embedded %d chunks.", len(ids))
     return collection, embedder
 
 # ---------------- Model Setup ----------------
@@ -156,9 +163,13 @@ def get_answer(question):
         query_emb = embedder.encode(question, convert_to_tensor=True)
         results = db.query(query_texts=[question], n_results=MAX_CONTEXT_CHUNKS)
         context = "\n\n".join(results["documents"][0])
-        return ask_model(question, context, model_pipe, model_tokenizer)
+        source_info = "\n\n".join([
+            f"📄 Source: {m.get('source', 'N/A')} (Page {m.get('page', 'N/A')})" for m in results["metadatas"][0]
+        ])
+        answer = ask_model(question, context, model_pipe, model_tokenizer)
+        return f"{answer}\n\n---\n{source_info}"
     except Exception as e:
-        print("❌ Query error:", e)
+        logger.error("❌ Query error: %s", e)
        return f"Error: {e}"
 
 # ---------------- UI ----------------
@@ -167,7 +178,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         question = gr.Textbox(label="Ask your question")
         ask = gr.Button("Ask")
-        answer = gr.Textbox(label="Answer", lines=8)
+        answer = gr.Textbox(label="Answer", lines=10)
     ask.click(fn=get_answer, inputs=question, outputs=answer)
 
 # Embed + Load Model at Startup
@@ -175,7 +186,7 @@ try:
     db, embedder = embed_all()
     model_pipe, model_tokenizer = load_model()
 except Exception as e:
-    print("❌ Startup failure:", e)
+    logger.exception("❌ Startup failure: %s", e)
    db, embedder = None, None
    model_pipe, model_tokenizer = None, None
 
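
The get_answer() change above appends a source footer built from the chunk metadata returned by the vector store. Below is a minimal, standalone sketch of just that formatting step, assuming (as get_answer does) that each metadata dict carries "source" and "page" keys; the sample documents and the hard-coded answer string are illustrative stand-ins, not content from the repository.

# Shaped like the dict returned by a chromadb Collection.query call:
# parallel lists per query, one entry per retrieved chunk.
results = {
    "documents": [[
        "Loosen the drive belt before removing the motor cover.",
        "Torque the anchor bolts to 25 Nm.",
    ]],
    "metadatas": [[
        {"source": "treadmill_manual.pdf", "page": 12},
        {"source": "treadmill_manual.pdf", "page": 34},
    ]],
}

# Context fed to the model and the footer shown to the user, as in the diff.
context = "\n\n".join(results["documents"][0])
source_info = "\n\n".join(
    f"📄 Source: {m.get('source', 'N/A')} (Page {m.get('page', 'N/A')})"
    for m in results["metadatas"][0]
)

answer = "Loosen the belt, then remove the cover."  # stand-in for ask_model(...)
print(f"{answer}\n\n---\n{source_info}")

Because .get() is used with "N/A" defaults, chunks embedded without source or page metadata still render a footer instead of raising a KeyError.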
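The other recurring change is the sentence tokenizer: instead of importing sent_tokenize and rechecking the punkt resource on every call, the new code resolves punkt once at startup and reuses a module-level PunktSentenceTokenizer. A condensed, runnable sketch of that pattern; the sample text at the bottom is illustrative only.

import logging

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("SmartManuals")

# Resolve the punkt resource once at import time; download only if missing.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# Built once, reused by every split_sentences() call.
tokenizer_punkt = PunktSentenceTokenizer()

def split_sentences(text):
    try:
        return tokenizer_punkt.tokenize(text)
    except Exception as e:
        # Same fallback as the diff: log a warning and degrade to a naive split.
        logger.warning("Tokenizer fallback: simple split. Reason: %s", e)
        return text.split(". ")

print(split_sentences("Tighten the bolt. Check the belt tension. Close the cover."))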