# βœ… app.py β€” Hugging Face Space Version (Finalized)
# RAG over local PDFs/DOCX using Hugging Face-hosted models with Chroma

import os
import json
import fitz  # PyMuPDF
import nltk
import chromadb
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import pytesseract
from PIL import Image
import io
import docx2txt
import gradio as gr

# ---------------------------
# βœ… Configuration
# ---------------------------
MANUALS_DIR = "./Manuals"  # Folder containing all PDF and DOCX files
CHROMA_PATH = "./chroma_store"
CHUNKS_PATH = "chunks.jsonl"
COLLECTION_NAME = "manual_chunks"
MAX_CONTEXT_CHUNKS = 3   # number of retrieved chunks passed to the LLM
CHUNK_SIZE = 750         # approximate chunk size, in words
CHUNK_OVERLAP = 100      # overlap between consecutive chunks, in words

HF_TOKEN = os.environ.get("HF_TOKEN")
LLM_MODELS = {
    "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
    "LLaMA 3 8B": "meta-llama/Llama-3-8B-Instruct",
    "LLaMA 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "Mistral": "mistralai/Mistral-7B-Instruct-v0.3",
    "Gemma": "google/gemma-1.1-7b-it",
    "Qwen 3 30B": "Qwen/Qwen3-30B-A3B",
}
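# Note: several of these repos (e.g. the meta-llama and google models) are gated
# on the Hugging Face Hub; HF_TOKEN must belong to an account with access granted.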

# ---------------------------
# βœ… Setup
# ---------------------------
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases load the sentence tokenizer from punkt_tab
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = None

# ---------------------------
# πŸ“„ Load all PDFs and DOCX content
# ---------------------------
def extract_all_documents():
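    """Read every PDF/DOCX in MANUALS_DIR and return (filename, page, text) tuples.

    PDF pages with no extractable text are rendered at 300 DPI and run through
    Tesseract OCR as a fallback for scanned manuals.
    """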
    chunks = []
    for fname in os.listdir(MANUALS_DIR):
        path = os.path.join(MANUALS_DIR, fname)
        if fname.lower().endswith(".pdf"):
            doc = fitz.open(path)
            for i, page in enumerate(doc):
                text = page.get_text().strip()
                if not text:
                    pix = page.get_pixmap(dpi=300)
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    text = pytesseract.image_to_string(img)
                if text.strip():
                    chunks.append((fname, i + 1, text.strip()))
        elif fname.lower().endswith(".docx"):
            text = docx2txt.process(path)
            if text.strip():
                chunks.append((fname, 1, text.strip()))
    return chunks

# ---------------------------
# βœ‚οΈ Chunk text
# ---------------------------
def split_chunks(text, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
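    """Split text into roughly `size`-word chunks of whole sentences, carrying
    over about `overlap` words of trailing context between consecutive chunks."""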
    sentences = sent_tokenize(text)
    chunks, curr, curr_len = [], [], 0
    for sent in sentences:
        tok_len = len(sent.split())
        if curr and curr_len + tok_len > size:
            chunks.append(" ".join(curr))
            # Carry forward trailing sentences totalling at most `overlap` words
            # so consecutive chunks share a small amount of context.
            carried, carried_len = [], 0
            for prev in reversed(curr):
                prev_len = len(prev.split())
                if carried_len + prev_len > overlap:
                    break
                carried.insert(0, prev)
                carried_len += prev_len
            curr, curr_len = carried, carried_len
        curr.append(sent)
        curr_len += tok_len
    if curr:
        chunks.append(" ".join(curr))
    return chunks

# ---------------------------
# πŸ’Ύ Embed into Chroma
# ---------------------------
def embed_documents():
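    """(Re)build the Chroma collection from every document under MANUALS_DIR."""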
    global collection
    # A collection may already exist in the persistent store even when the
    # module-level variable is still None, so delete by name and ignore errors.
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass
    collection = client.create_collection(COLLECTION_NAME)

    docs = extract_all_documents()
    records = []
    for fname, page, text in docs:
        for i, chunk in enumerate(split_chunks(text)):
            if not chunk.strip():
                continue
            records.append({
                "id": f"{fname}::p{page}::c{i}",
                "text": chunk,
                "metadata": {"source_file": fname, "page": page}
            })

    for i in tqdm(range(0, len(records), 16)):
        batch = records[i:i + 16]
        texts = [b["text"] for b in batch]
        ids = [b["id"] for b in batch]
        metas = [b["metadata"] for b in batch]
        embs = embedder.encode(texts).tolist()
        collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
    return f"βœ… Embedded {len(records)} chunks"

# ---------------------------
# πŸ”Ž Query
# ---------------------------
def search_context(query, top_k=MAX_CONTEXT_CHUNKS):
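    """Retrieve the top-k most similar chunks and prefix each with its source file and page."""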
    results = collection.query(query_texts=[query], n_results=top_k)
    chunks = results["documents"][0]
    metas = results["metadatas"][0]
    return "\n\n".join(
        f"File: {m['source_file']}, Page: {m['page']}\n{c}" for m, c in zip(metas, chunks)
    )

# ---------------------------
# 🧠 Run Inference
# ---------------------------
def ask_model(model_name, query):
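    """Answer `query` with the selected Hub model, grounded in retrieved manual context."""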
    if not HF_TOKEN:
        return "❌ HF_TOKEN not set."
    if collection is None:
        return "❌ No index found. Click 'βš™οΈ Embed Documents' first."
    context = search_context(query)
    system_prompt = "Answer only using the context. Say 'I don't know' if not found."

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, token=HF_TOKEN, device_map="auto",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )

    # Use each model's own chat template rather than hardcoded LLaMA-3 special
    # tokens, so the Mistral/Gemma/Qwen entries work too. The instruction is
    # folded into the user turn because some templates reject a system role.
    messages = [{"role": "user", "content": f"{system_prompt}\n\n{context}\n\nQuestion: {query}"}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # device_map="auto" already places the weights; don't also pass `device` to the pipeline.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    output = pipe(prompt, max_new_tokens=512, do_sample=True, return_full_text=False)
    return output[0]["generated_text"].strip()

# ---------------------------
# πŸŽ› Gradio UI
# ---------------------------
def launch_interface():
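    """Build and launch the Gradio UI: one button to embed the manuals, one box to ask questions."""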
    with gr.Blocks() as demo:
        gr.Markdown("""
        # 🧠 SmartManuals-AI (Hugging Face Edition)
        Upload manuals to `./Manuals`, click Embed, then ask questions.
        """)

        with gr.Row():
            embed_button = gr.Button("βš™οΈ Embed Documents")
            embed_status = gr.Textbox(label="Status")

        with gr.Row():
            model_select = gr.Dropdown(list(LLM_MODELS.keys()), label="Model", value="LLaMA 3.1 8B")
            question = gr.Textbox(label="Question")
        answer = gr.Textbox(label="Answer", lines=10)
        submit = gr.Button("πŸ” Ask")

        embed_button.click(fn=embed_documents, outputs=embed_status)
        submit.click(fn=lambda m, q: ask_model(LLM_MODELS[m], q), inputs=[model_select, question], outputs=[answer])

    demo.launch()

# ---------------------------
if __name__ == "__main__":
    launch_interface()