# βœ… SmartManuals-AI: Hugging Face Space App (RAM Safe, Multi-model, No Preview)

import os, json, fitz, torch, chromadb, docx  # fitz is PyMuPDF
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# sent_tokenize needs NLTK's punkt tokenizer data; fetch it once at startup
nltk.download("punkt", quiet=True)

# ---------------------------
# βš™οΈ Constants
# ---------------------------
MANUALS_DIR = "Manuals"
CHROMA_PATH = "./chroma_store"
CHUNKS_JSONL = "manual_chunks.jsonl"
COLLECTION_NAME = "manual_chunks"
HF_TOKEN = os.environ.get("HF_TOKEN")
CHUNK_SIZE = 750      # max words per chunk
CHUNK_OVERLAP = 100   # words shared between consecutive chunks
TOP_K = 3             # chunks retrieved per query

MODEL_OPTIONS = {
    "LLaMA 3.1 (8B)": "meta-llama/Llama-3.1-8B-Instruct",
    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
    "Gemma 7B": "google/gemma-7b-it",
    "Qwen3 7B": "Qwen/Qwen1.5-7B-Chat"
}
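
# Note: the meta-llama and google/gemma checkpoints are gated on the Hugging Face
# Hub, so HF_TOKEN must belong to an account that has accepted their licenses;
# the Mistral and Qwen models typically load without gating.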

# ---------------------------
# πŸ“„ Extract Text from PDFs and DOCX
# ---------------------------
def extract_text_from_pdf(path):
    text = ""
    try:
        doc = fitz.open(path)
        for page in doc:
            page_text = page.get_text()
            text += page_text + "\n"
        doc.close()
    except Exception as e:
        print(f"❌ PDF Error in {path}: {e}")
    return text

def extract_text_from_docx(path):
    try:
        doc = docx.Document(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        print(f"❌ DOCX Error in {path}: {e}")
        return ""

# ---------------------------
# 🧹 Clean + Chunk
# ---------------------------
def clean(text):
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

def split_sentences(text):
    return sent_tokenize(text)

def chunk_text(sentences, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    # `size` and `overlap` are both measured in words
    chunks, chunk, length = [], [], 0
    for sent in sentences:
        n = len(sent.split())
        if length + n > size and chunk:
            chunks.append(" ".join(chunk))
            # carry the trailing ~overlap words forward so adjacent chunks share context
            carry, carry_len = [], 0
            for s in reversed(chunk):
                carry.insert(0, s)
                carry_len += len(s.split())
                if carry_len >= overlap:
                    break
            chunk, length = carry, carry_len
        chunk.append(sent)
        length += n
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
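
# Sketch of the overlap behavior with small illustrative values (not the defaults):
#   chunk_text(["A b c d e.", "F g h i j.", "K l m n o."], size=12, overlap=5)
#   -> ["A b c d e. F g h i j.", "F g h i j. K l m n o."]
# Each chunk begins with the trailing ~overlap words of the previous one, so
# sentences near a chunk boundary stay retrievable from either chunk.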

# ---------------------------
# πŸ“¦ Embed and Store in Chroma
# ---------------------------
def embed_all():
    print("πŸ” Scanning manuals and embedding...")
    os.makedirs(MANUALS_DIR, exist_ok=True)
    os.makedirs(CHROMA_PATH, exist_ok=True)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(COLLECTION_NAME)  # start fresh on every run
    except Exception:
        pass  # collection did not exist yet
    collection = client.create_collection(COLLECTION_NAME)

    all_chunks = []
    files = [f for f in os.listdir(MANUALS_DIR) if f.lower().endswith((".pdf", ".docx"))]
    for fname in tqdm(files):
        path = os.path.join(MANUALS_DIR, fname)
        text = extract_text_from_pdf(path) if fname.lower().endswith(".pdf") else extract_text_from_docx(path)
        text = clean(text)
        if not text:
            continue
        sents = split_sentences(text)
        chunks = chunk_text(sents)
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "id": f"{fname}::chunk_{i+1}",
                "text": chunk,
                "metadata": {"source": fname}
            })

    # Persist chunks to CHUNKS_JSONL so they can be inspected without re-parsing
    with open(CHUNKS_JSONL, "w") as f:
        for c in all_chunks:
            f.write(json.dumps(c) + "\n")

    # Batch embed and store (small batches keep peak memory low)
    for i in range(0, len(all_chunks), 16):
        batch = all_chunks[i:i+16]
        docs = [c["text"] for c in batch]
        ids = [c["id"] for c in batch]
        metas = [c["metadata"] for c in batch]
        embs = embedder.encode(docs).tolist()
        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)

    print(f"βœ… Embedded {len(all_chunks)} chunks.")
    return collection, embedder
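
# Quick sanity check (sketch, not wired into the app): retrieve the nearest
# chunks for a sample query without invoking an LLM, using the values returned
# by embed_all():
#   collection, embedder = embed_all()
#   hits = collection.query(
#       query_embeddings=embedder.encode(["How do I calibrate the console?"]).tolist(),
#       n_results=TOP_K)
#   for doc, meta in zip(hits["documents"][0], hits["metadatas"][0]):
#       print(meta["source"], "->", doc[:80])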

# ---------------------------
# πŸ” RAG Search & LLM Answer
# ---------------------------
_GEN_CACHE = {"model_id": None, "pipe": None}  # hold at most one loaded model in memory

def ask(query, model_key):
    model_id = MODEL_OPTIONS[model_key]
    try:
        # load lazily and cache, so repeat questions don't reload the model
        if _GEN_CACHE["model_id"] != model_id:
            _GEN_CACHE["pipe"] = None  # free the previous model before loading the next
            tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
            model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
            gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
            _GEN_CACHE["model_id"], _GEN_CACHE["pipe"] = model_id, gen
        gen = _GEN_CACHE["pipe"]
    except Exception as e:
        return f"❌ Model loading failed: {e}"

    # embed the query with the same model used at indexing time
    q_emb = embedder.encode([query]).tolist()
    results = db.query(query_embeddings=q_emb, n_results=TOP_K)
    chunks = results["documents"][0]
    context = "\n\n".join(chunks)
    prompt = f"Answer this using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"

    try:
        res = gen(prompt, max_new_tokens=300, do_sample=False)[0]['generated_text']
        return res.split("Answer:", 1)[-1].strip()
    except Exception as e:
        return f"❌ LLM failed: {e}"

# ---------------------------
# ▢️ UI
# ---------------------------
# Build the index at startup; the Chroma store under CHROMA_PATH is persistent,
# but rebuilding keeps it in sync with the current contents of Manuals/.
db, embedder = embed_all()
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 SmartManuals-AI β€” Ask Your PDF and Word Docs")
    with gr.Row():
        qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I calibrate the SE3 console?")
        model_pick = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose a Model", value="Mistral 7B")
    answer = gr.Textbox(label="Answer", lines=8)
    ask_btn = gr.Button("Ask")
    ask_btn.click(fn=ask, inputs=[qbox, model_pick], outputs=[answer])

demo.launch()