# SmartManuals-AI: Hugging Face Space app (RAM-safe, multi-model, no preview)
import os, json, fitz, torch, chromadb, docx
import nltk
import gradio as gr
from PIL import Image
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# sent_tokenize needs the NLTK punkt tokenizer data; fetch it once at startup
nltk.download("punkt", quiet=True)
# ---------------------------
# Constants
# ---------------------------
MANUALS_DIR = "Manuals"
CHROMA_PATH = "./chroma_store"
CHUNKS_JSONL = "manual_chunks.jsonl"
COLLECTION_NAME = "manual_chunks"
HF_TOKEN = os.environ.get("HF_TOKEN")
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
TOP_K = 3
MODEL_OPTIONS = {
    "LLaMA 3.1 (8B)": "meta-llama/Llama-3.1-8B-Instruct",
    "Mistral 7B": "mistralai/Mistral-7B-Instruct-v0.3",
    "Gemma 7B": "google/gemma-7b-it",
    "Qwen 1.5 (7B)": "Qwen/Qwen1.5-7B-Chat"
}
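# Note: some of these checkpoints (e.g. the Llama and Gemma ones) are gated on the
# Hugging Face Hub, so HF_TOKEN must belong to an account that has accepted their
# licenses. Assumption, not enforced by this app: the Space has enough memory for a
# 7B-8B model; on a small CPU Space a smaller instruct model would be a safer choice.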
# ---------------------------
# Extract text from PDFs and DOCX
# ---------------------------
def extract_text_from_pdf(path):
    text = ""
    try:
        doc = fitz.open(path)
        for page in doc:
            page_text = page.get_text()
            text += page_text + "\n"
        doc.close()
    except Exception as e:
        print(f"PDF error in {path}: {e}")
    return text
def extract_text_from_docx(path):
    try:
        doc = docx.Document(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        print(f"DOCX error in {path}: {e}")
        return ""
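# Caveat: page.get_text() only reads the embedded text layer, so scanned/image-only
# PDFs come back empty. Handling those would require OCR (e.g. rasterizing pages and
# running an OCR engine), which this app does not attempt.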
# ---------------------------
# Clean + chunk
# ---------------------------
def clean(text):
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())

def split_sentences(text):
    return sent_tokenize(text)
def chunk_text(sentences, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Pack sentences into ~`size`-word chunks with ~`overlap` words carried over."""
    chunks, chunk, length = [], [], 0
    for sent in sentences:
        n = len(sent.split())
        if length + n > size and chunk:
            chunks.append(" ".join(chunk))
            # Carry over trailing sentences until about `overlap` words are kept,
            # so consecutive chunks share context (overlap is in words, like `size`).
            carried, carried_len = [], 0
            for s in reversed(chunk):
                carried.insert(0, s)
                carried_len += len(s.split())
                if carried_len >= overlap:
                    break
            chunk, length = carried, carried_len
        chunk.append(sent)
        length += n
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
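# Illustrative example (hypothetical numbers): chunk_text(sent_tokenize(text), size=50,
# overlap=10) yields roughly 50-word chunks, each starting with the trailing sentences
# (about 10 words) of the previous chunk so neighboring chunks share context.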
# ---------------------------
# Embed and store in Chroma
# ---------------------------
def embed_all():
    print("Scanning manuals and embedding...")
    os.makedirs(MANUALS_DIR, exist_ok=True)
    os.makedirs(CHROMA_PATH, exist_ok=True)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    # Drop any existing collection so the index is rebuilt from scratch on startup
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass
    collection = client.create_collection(COLLECTION_NAME)
    all_chunks = []
    files = [f for f in os.listdir(MANUALS_DIR) if f.lower().endswith((".pdf", ".docx"))]
    for fname in tqdm(files):
        path = os.path.join(MANUALS_DIR, fname)
        # Match the extension case-insensitively, like the filter above
        if fname.lower().endswith(".pdf"):
            text = extract_text_from_pdf(path)
        else:
            text = extract_text_from_docx(path)
        text = clean(text)
        sents = split_sentences(text)
        chunks = chunk_text(sents)
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "id": f"{fname}::chunk_{i+1}",
                "text": chunk,
                "metadata": {"source": fname}
            })
    # Batch embed and store (small batches keep RAM use modest)
    for i in range(0, len(all_chunks), 16):
        batch = all_chunks[i:i + 16]
        docs = [c["text"] for c in batch]
        ids = [c["id"] for c in batch]
        metas = [c["metadata"] for c in batch]
        embs = embedder.encode(docs).tolist()
        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
    print(f"Embedded {len(all_chunks)} chunks.")
    return collection, embedder
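# Note: embed_all() re-parses and re-embeds every manual on each startup. For large
# manual sets it may be worth writing the chunks to CHUNKS_JSONL (declared above but
# unused) and skipping re-embedding when the Chroma store already exists; that is a
# possible optimization, not something this app currently does.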
# ---------------------------
# RAG search & LLM answer
# ---------------------------
def ask(query, model_key):
    model_id = MODEL_OPTIONS[model_key]
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
        model.to("cuda" if torch.cuda.is_available() else "cpu")
        gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
    except Exception as e:
        return f"Model loading failed: {e}"
    # Embed the query with the same sentence-transformer used at indexing time,
    # so the query and the stored chunks live in the same embedding space.
    q_emb = embedder.encode([query]).tolist()
    results = db.query(query_embeddings=q_emb, n_results=TOP_K)
    chunks = results["documents"][0]
    context = "\n\n".join(chunks)
    prompt = f"Answer this using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
    try:
        res = gen(prompt, max_new_tokens=300, do_sample=False)[0]["generated_text"]
        return res.split("Answer:", 1)[-1].strip()
    except Exception as e:
        return f"LLM failed: {e}"
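# Note: ask() downloads and loads the selected model on every question. Caching the
# pipeline per model_id (e.g. in a module-level dict) would make repeated questions
# much faster; that is a suggested optimization, not behavior of the current code.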
# ---------------------------
# UI
# ---------------------------
db, embedder = embed_all()
with gr.Blocks() as demo:
    gr.Markdown("## SmartManuals-AI: Ask Your PDF and Word Docs")
    with gr.Row():
        qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I calibrate SE3 console?")
        model_pick = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose a Model", value="Mistral 7B")
    answer = gr.Textbox(label="Answer", lines=8)
    ask_btn = gr.Button("Ask")
    ask_btn.click(fn=ask, inputs=[qbox, model_pick], outputs=[answer])

demo.launch()