damoojeje committed on
Commit
8ab0a40
·
verified ·
1 Parent(s): 43b8a1d

Update app.py

Files changed (1)
  1. app.py +110 -125
app.py CHANGED
@@ -1,156 +1,141 @@
  import os
- import fitz # PyMuPDF
- import docx
- import io
  import json
- import gradio as gr
  import pytesseract
  from PIL import Image
- from tqdm import tqdm
- import chromadb
- import torch
  import nltk
- from sentence_transformers import SentenceTransformer, util
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-
- # ----------------------------
- # ✅ Ensure nltk punkt is available
- # ----------------------------
- try:
-     nltk.data.find("tokenizers/punkt")
- except LookupError:
-     nltk.download("punkt")
-
  from nltk.tokenize import sent_tokenize

- # ----------------------------
- # ⚙️ Config
- # ----------------------------
- MANUAL_DIR = "./Manuals"
- CHROMA_DIR = "./chroma_store"
- CHUNK_SIZE = 750
- CHUNK_OVERLAP = 100
- MAX_CONTEXT = 3
-
- DEFAULT_MODEL = "meta-llama/Llama-3-8b-Instruct"
- MODEL_OPTIONS = [
-     "meta-llama/Llama-3-8b-Instruct",
-     "mistralai/Mistral-7B-Instruct-v0.3",
-     "google/gemma-1.1-7b-it"
- ]
-
- HF_TOKEN = os.environ.get("HF_TOKEN")
-
- # ----------------------------
- # 🔍 Utility functions
- # ----------------------------
- def extract_pdf_text(path):
-     text_blocks = []
-     doc = fitz.open(path)
-     for i, page in enumerate(doc):
-         text = page.get_text()
-         if not text.strip():
-             img = Image.open(io.BytesIO(page.get_pixmap().tobytes("png")))
-             text = pytesseract.image_to_string(img)
-         text_blocks.append({"page": i + 1, "text": text})
-     return text_blocks
-
- def extract_docx_text(path):
-     doc = docx.Document(path)
-     full_text = "\n".join([para.text for para in doc.paragraphs])
-     return [{"page": 1, "text": full_text}]
-
  def split_sentences(text):
      try:
          return sent_tokenize(text)
-     except Exception:
          return text.split(". ")

- def chunk_text(sentences):
-     chunks = []
-     current = []
-     count = 0
-     for sentence in sentences:
-         tokens = sentence.split()
-         if count + len(tokens) > CHUNK_SIZE:
-             chunks.append(" ".join(current))
-             current = current[-CHUNK_OVERLAP:]
-             count = sum(len(s.split()) for s in current)
-         current.append(sentence)
-         count += len(tokens)
-     if current:
-         chunks.append(" ".join(current))
-     return chunks
-
  def embed_all():
-     client = chromadb.PersistentClient(path=CHROMA_DIR)
-     if "manual_chunks" in [c.name for c in client.list_collections()]:
-         client.delete_collection("manual_chunks")
-     collection = client.create_collection("manual_chunks")
      embedder = SentenceTransformer("all-MiniLM-L6-v2")

-     for fname in os.listdir(MANUAL_DIR):
-         fpath = os.path.join(MANUAL_DIR, fname)
          if fname.lower().endswith(".pdf"):
              pages = extract_pdf_text(fpath)
-         elif fname.lower().endswith(".docx"):
-             pages = extract_docx_text(fpath)
-         else:
-             continue
-
-         for page in pages:
-             sents = split_sentences(page["text"])
-             chunks = chunk_text(sents)
-             for idx, chunk in enumerate(chunks):
-                 cid = f"{fname}::p{page['page']}::c{idx}"
-                 collection.add(documents=[chunk], ids=[cid], metadatas=[{"source": fname, "page": page["page"]}])
-
      return collection, embedder

- def get_model(model_id):
-     tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
-     model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, torch_dtype=torch.float32)
-     return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
-
- def run_query(question, model_name):
-     results = db.query(query_texts=[question], n_results=MAX_CONTEXT)
-     if not results or not results.get("documents"):
-         return "No matching information found."
-
      context = "\n\n".join(results["documents"][0])
-     prompt = f"""
- You are a helpful assistant. Use the following context to answer the question.

- Context:
  {context}
-
- Question: {question}
- Answer:
  """
-     model = get_model(model_name)
-     res = model(prompt, max_new_tokens=300)[0]['generated_text']
-     return res.split("Answer:")[-1].strip()
-
- # ----------------------------
- # ✅ Startup: Embed manuals
- # ----------------------------
- db, embedder = embed_all()
-
- # ----------------------------
- # 🎛️ Gradio UI
- # ----------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("""
-     # 📘 SmartManuals-AI (Docker)
-     Ask any question from the preloaded manuals (PDF + Word).
-     """)

      with gr.Row():
-         question = gr.Textbox(label="Ask a Question")
-         model = gr.Dropdown(choices=MODEL_OPTIONS, value=DEFAULT_MODEL, label="Choose LLM")
-     btn = gr.Button("Ask")
-     answer = gr.Textbox(label="Answer", lines=10)

-     btn.click(fn=run_query, inputs=[question, model], outputs=answer)

- demo.launch(server_name="0.0.0.0", server_port=7860)

  import os
  import json
+ import fitz # PyMuPDF
  import pytesseract
  from PIL import Image
+ import io
  import nltk
+ import chromadb
+ from tqdm import tqdm
+ from sentence_transformers import SentenceTransformer
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+ import gradio as gr

+ # ---------------------------
+ # 📦 Paths and Constants
+ # ---------------------------
+ MANUALS_DIR = "./Manuals"
+ CHROMA_PATH = "./chroma_store"
+ COLLECTION_NAME = "manual_chunks"

+ # Ensure NLTK punkt is available
+ nltk.download("punkt")
  from nltk.tokenize import sent_tokenize

+ # ---------------------------
+ # 🧼 Text cleaning utilities
+ # ---------------------------
+ def clean(text):
+     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

  def split_sentences(text):
      try:
          return sent_tokenize(text)
+     except Exception as e:
+         print("[Tokenizer Error]", e, "\nFalling back to simple split.")
          return text.split(". ")

+ # ---------------------------
+ # 📄 PDF and DOCX extraction
+ # ---------------------------
+ def extract_pdf_text(pdf_path):
+     doc = fitz.open(pdf_path)
+     pages = []
+     for i, page in enumerate(doc):
+         text = page.get_text().strip()
+         if not text:
+             try:
+                 pix = page.get_pixmap(dpi=300)
+                 img = Image.open(io.BytesIO(pix.tobytes("png")))
+                 text = pytesseract.image_to_string(img)
+             except pytesseract.TesseractNotFoundError:
+                 print("❌ Tesseract not found. Skipping OCR for page.")
+                 text = ""
+         pages.append((i + 1, text))
+     return pages
+
+ # ---------------------------
+ # 🧠 Embed text using MiniLM
+ # ---------------------------
  def embed_all():
+     client = chromadb.PersistentClient(path=CHROMA_PATH)
+     if COLLECTION_NAME in [c.name for c in client.list_collections()]:
+         client.delete_collection(COLLECTION_NAME)
+     collection = client.create_collection(COLLECTION_NAME)
+
      embedder = SentenceTransformer("all-MiniLM-L6-v2")

+     chunk_id = 0
+     for fname in os.listdir(MANUALS_DIR):
+         fpath = os.path.join(MANUALS_DIR, fname)
          if fname.lower().endswith(".pdf"):
              pages = extract_pdf_text(fpath)
+             for page_num, text in pages:
+                 sents = split_sentences(clean(text))
+                 for i in range(0, len(sents), 5):
+                     chunk = " ".join(sents[i:i + 5])
+                     if chunk.strip():
+                         collection.add(
+                             documents=[chunk],
+                             metadatas=[{"source": fname, "page": page_num}],
+                             ids=[f"{fname}-{page_num}-{i}-{chunk_id}"]
+                         )
+                         chunk_id += 1
+
+     print(f"✅ Embedded {chunk_id} chunks.")
      return collection, embedder

+ # ---------------------------
+ # 🤖 Load model
+ # ---------------------------
+ def load_llm():
+     model_id = "meta-llama/Llama-3.1-8B-Instruct"
+     token = os.environ.get("HF_TOKEN")
+     tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id, token=token, torch_dtype=None, device_map="auto"
+     )
+     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
+     return pipe, tokenizer
+
+ # ---------------------------
+ # ❓ Ask a question
+ # ---------------------------
+ def ask_question(question, db, embedder, pipe, tokenizer):
+     results = db.query(query_texts=[question], n_results=5)
      context = "\n\n".join(results["documents"][0])

+     prompt = f"""
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are a helpful assistant that answers questions from technical manuals using only the provided context.
+ <context>
  {context}
+ </context>
+ <|start_header_id|>user<|end_header_id|>
+ {question}<|start_header_id|>assistant<|end_header_id|>
  """
+
+     out = pipe(prompt)[0]["generated_text"]
+     final = out.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+     return final
+
+ # ---------------------------
+ # 🚀 Build interface
+ # ---------------------------
  with gr.Blocks() as demo:
+     gr.Markdown("# 🤖 SmartManuals-AI (Hugging Face Space Edition)")

      with gr.Row():
+         qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I access diagnostics on the SE3 console?")
+         submit = gr.Button("🔍 Ask")
+
+     abox = gr.Textbox(label="Answer", lines=8)
+
+     db, embedder = embed_all()
+     pipe, tokenizer = load_llm()

+     submit.click(fn=lambda q: ask_question(q, db, embedder, pipe, tokenizer), inputs=qbox, outputs=abox)

+ # For Hugging Face Spaces
+ if __name__ == "__main__":
+     demo.launch()
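
The new `ask_question` hand-writes Llama 3.1 header tokens and then recovers the answer by splitting the generated text on `<|start_header_id|>assistant<|end_header_id|>`, which breaks if the template ever changes. A minimal alternative sketch (not part of this commit; it assumes the `db`, `pipe`, and `tokenizer` objects returned by `embed_all()` and `load_llm()`, and the helper name `ask_question_via_template` is hypothetical) lets the tokenizer's chat template emit those tokens instead:

    # Hypothetical sketch, not in this commit: same Chroma retrieval, but
    # tokenizer.apply_chat_template emits the model's special tokens rather
    # than hand-written <|start_header_id|> markers.
    def ask_question_via_template(question, db, pipe, tokenizer, n_results=5):
        results = db.query(query_texts=[question], n_results=n_results)
        context = "\n\n".join(results["documents"][0])
        messages = [
            {"role": "system", "content": (
                "You are a helpful assistant that answers questions from "
                "technical manuals using only the provided context.\n"
                f"<context>\n{context}\n</context>"
            )},
            {"role": "user", "content": question},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # return_full_text=False makes the pipeline return only the completion,
        # so no string-splitting on header tokens is needed.
        out = pipe(prompt, return_full_text=False)[0]["generated_text"]
        return out.strip()

Retrieval behavior is unchanged; only the prompt assembly and answer extraction differ.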