Spaces:

zamal
/

Multimodal-Chat-Playground

Running on Zero

App Files Community

zamal committed 13 days ago

Commit

1e770e5

verified ·

1 Parent(s): 0a3438b

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -238

app.py CHANGED Viewed

@@ -5,273 +5,141 @@ import gc
 from huggingface_hub.utils import HfHubHTTPError
 from langchain_core.prompts import PromptTemplate
 from langchain_huggingface import HuggingFaceEndpoint
-import io, base64
-from PIL import Image
-import torch
-import gradio as gr
-import spaces
-import numpy as np
-import pandas as pd
-import pymupdf
-from PIL import Image
-from pypdf import PdfReader
-from dotenv import load_dotenv
-import shutil
-from chromadb.config import Settings, DEFAULT_TENANT, DEFAULT_DATABASE
-from welcome_text import WELCOME_INTRO
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
-from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
 import chromadb
 from chromadb.utils import embedding_functions
-from chromadb.utils.data_loaders import ImageLoader
-from langchain_core.prompts import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_huggingface import HuggingFaceEndpoint
-from utils import extract_pdfs, extract_images, clean_text, image_to_bytes
-from utils import *
 # ─────────────────────────────────────────────────────────────────────────────
-# Load .env
-load_dotenv()
-HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 processor = None
 vision_model = None
-# hold the in-memory vectordb
-CURRENT_VDB = None
-# OCR + multimodal image description setup
-ocr_model = ocr_predictor(
-    "db_resnet50", "crnn_mobilenet_v3_large", pretrained=True, assume_straight_pages=True
 )
-processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-vision_model = LlavaNextForConditionalGeneration.from_pretrained(
-    "llava-hf/llava-v1.6-mistral-7b-hf",
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True
-).to("cuda")
-# Add at the top of your module, alongside your other globals
-PERSIST_DIR = "./chroma_db"
-if os.path.exists(PERSIST_DIR):
-    shutil.rmtree(PERSIST_DIR)
-@spaces.GPU()
-def get_image_description(image: Image.Image) -> str:
-    """
-    Lazy-loads the Llava processor + model inside the GPU worker,
-    runs captioning, and returns a one-sentence description.
-    """
-    global processor, vision_model
-    # On first call, instantiate + move to CUDA
     if processor is None or vision_model is None:
-        processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
         vision_model = LlavaNextForConditionalGeneration.from_pretrained(
-            "llava-hf/llava-v1.6-mistral-7b-hf",
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True
         ).to("cuda")
-    torch.cuda.empty_cache()
-    gc.collect()
     prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
-    inputs = processor(prompt, image, return_tensors="pt").to("cuda")
-    output = vision_model.generate(**inputs, max_new_tokens=100)
-    return processor.decode(output[0], skip_special_tokens=True)
-# Vector DB setup
-# at top of file, alongside your other imports
-from chromadb.utils import embedding_functions
-from chromadb.utils.data_loaders import ImageLoader
-import chromadb
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from utils import image_to_bytes  # your helper
-# 1) Create one shared embedding function (defaulting to All-MiniLM-L6-v2, 384-dim)
-SHARED_EMB_FN = embedding_functions.SentenceTransformerEmbeddingFunction(
-    model_name="all-MiniLM-L6-v2"
-)
-def get_vectordb(text: str, images: list[Image.Image], img_names: list[str]):
-    client = chromadb.EphemeralClient()
-    # wipe old
-    for name in ("text_db", "image_db"):
-        if name in [c.name for c in client.list_collections()]:
-            client.delete_collection(name)
-    text_col = client.get_or_create_collection("text_db", embedding_function=SHARED_EMB_FN)
-    img_col  = client.get_or_create_collection(
-        "image_db",
-        embedding_function=SHARED_EMB_FN,
-        metadata={"hnsw:space": "cosine"}
-    )
-    # add images
-    if images:
-        descs, metas = [], []
-        for i, img in enumerate(images):
-            try:
-                cap = get_image_description(img)
-            except:
-                cap = "⚠️ could not describe image"
-            descs.append(f"{img_names[i]}: {cap}")
-            metas.append({"image": image_to_bytes(img)})
-        img_col.add(ids=[str(i) for i in range(len(images))],
-                    documents=descs,
-                    metadatas=metas)
-    # chunk + add text
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    docs = splitter.create_documents([text])
-    text_col.add(ids=[str(i) for i in range(len(docs))],
-                 documents=[d.page_content for d in docs])
-    return client
-# Text extraction
-def result_to_text(result, as_text=False):
-    pages = []
-    for pg in result.pages:
-        txt = " ".join(w.value for block in pg.blocks for line in block.lines for w in line.words)
-        pages.append(clean_text(txt))
-    return "\n\n".join(pages) if as_text else pages
-OCR_CHOICES = {
-    "db_resnet50 + crnn_mobilenet_v3_large": ("db_resnet50", "crnn_mobilenet_v3_large"),
-    "db_resnet50 + crnn_resnet31":          ("db_resnet50", "crnn_resnet31"),
-}
-@spaces.GPU()
 def extract_data_from_pdfs(
-    docs: list[str],
-    session: dict,
-    include_images: str,
-    do_ocr: str,
-    ocr_choice: str,
-    vlm_choice: str,
-    progress=gr.Progress()
 ):
     if not docs:
         raise gr.Error("No documents to process")
-    # 1) OCR pipeline if requested
     if do_ocr == "Get Text With OCR":
         db_m, crnn_m = OCR_CHOICES[ocr_choice]
         local_ocr = ocr_predictor(db_m, crnn_m, pretrained=True, assume_straight_pages=True)
-    else:
-        local_ocr = None
-    # 2) Vision–language model
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
-    vis = (
-        LlavaNextForConditionalGeneration
-        .from_pretrained(vlm_choice, torch_dtype=torch.float16, low_cpu_mem_usage=True)
-        .to("cuda")
-    )
-    # 3) Monkey‐patch caption fn
     def describe(img: Image.Image) -> str:
         torch.cuda.empty_cache(); gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
         inp = proc(prompt, img, return_tensors="pt").to("cuda")
         out = vis.generate(**inp, max_new_tokens=100)
         return proc.decode(out[0], skip_special_tokens=True)
-    global get_image_description
     get_image_description = describe
-    # 4) Extract text & images
     progress(0.2, "Extracting text and images…")
-    all_text = ""
-    images, names = [], []
-    for path in docs:
         if local_ocr:
-            pdf = DocumentFile.from_pdf(path)
             res = local_ocr(pdf)
-            all_text += result_to_text(res, as_text=True) + "\n\n"
         else:
-            all_text += (PdfReader(path).pages[0].extract_text() or "") + "\n\n"
         if include_images == "Include Images":
-            imgs = extract_images([path])
             images.extend(imgs)
-            names.extend([os.path.basename(path)] * len(imgs))
-    # 5) Build the in‐memory vector DB once
     progress(0.6, "Indexing in vector DB…")
-    global CURRENT_VDB
-    CURRENT_VDB = get_vectordb(all_text, images, names)
-    # 6) Mark session and return UI outputs
-    session["processed"] = True
-    sample = images[:4] if include_images == "Include Images" else []
-    return (
-        session,
-        all_text[:2000] + "...",
-        sample,
-        "<h3>Done!</h3>"
-    )
-# Chat function
-def conversation(
-    session: dict,
-    question: str,
-    num_ctx: int,
-    img_ctx: int,
-    history: list,
-    temp: float,
-    max_tok: int,
-    model_id: str
-):
-    """
-    Uses the in-memory CURRENT_VDB (set by extract_data_from_pdfs) to answer the user.
-    """
     global CURRENT_VDB
-    # 0) Guard: make sure we've extracted at least once
     if not session.get("processed") or CURRENT_VDB is None:
         raise gr.Error("Please extract data first")
-    # 1) Retrieve top-k text chunks
-    text_col = CURRENT_VDB.get_collection("text_db")
-    docs = text_col.query(
-        query_texts=[question],
-        n_results=int(num_ctx),
-        include=["documents"]
-    )["documents"][0]
-    # 2) Retrieve top-k images
-    img_col = CURRENT_VDB.get_collection("image_db")
-    img_q = img_col.query(
-        query_texts=[question],
-        n_results=int(img_ctx),
-        include=["metadatas", "documents"]
-    )
     img_descs = img_q["documents"][0] or ["No images found"]
     images = []
-    for meta in img_q["metadatas"][0]:
-        b64 = meta.get("image", "")
-        try:
-            images.append(Image.open(io.BytesIO(base64.b64decode(b64))))
-        except:
-            pass
     img_desc = "\n".join(img_descs)
-    # 3) Build the prompt
     prompt = PromptTemplate(
         template="""
 Context:
@@ -284,39 +152,23 @@ Question:
 {q}
 Answer:
-""",
-        input_variables=["text", "img_desc", "q"],
-    )
-    user_input = prompt.format(
-        text="\n\n".join(docs),
-        img_desc=img_desc,
-        q=question
-    )
-    # 4) Call the LLM
     llm = HuggingFaceEndpoint(
-        repo_id=model_id,
-        task="text-generation",
-        temperature=temp,
-        max_new_tokens=max_tok,
-        # the client will pick up HUGGINGFACEHUB_API_TOKEN from env automatically
     )
-    try:
-        answer = llm.invoke(user_input)
     except HfHubHTTPError as e:
-        if e.response.status_code == 404:
-            answer = f"❌ Model `{model_id}` not hosted on HF Inference API."
-        else:
-            answer = f"⚠️ HF API error: {e}"
     except Exception as e:
-        answer = f"⚠️ Unexpected error: {e}"
-    # 5) Append to chat history and return
-    new_history = history + [
-        {"role": "user",      "content": question},
-        {"role": "assistant", "content": answer}
-    ]
-    return new_history, docs, images

 from huggingface_hub.utils import HfHubHTTPError
 from langchain_core.prompts import PromptTemplate
 from langchain_huggingface import HuggingFaceEndpoint
+from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
+from pypdf import PdfReader
+from PIL import Image
 import chromadb
 from chromadb.utils import embedding_functions
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+import gradio as gr
 # ─────────────────────────────────────────────────────────────────────────────
+# Globals
+CURRENT_VDB = None
 processor = None
 vision_model = None
+# OCR & V+L defaults
+OCR_CHOICES = {
+    "db_resnet50 + crnn_mobilenet_v3_large": ("db_resnet50", "crnn_mobilenet_v3_large"),
+    "db_resnet50 + crnn_resnet31": ("db_resnet50", "crnn_resnet31"),
+}
+SHARED_EMB_FN = embedding_functions.SentenceTransformerEmbeddingFunction(
+    model_name="all-MiniLM-L6-v2"
 )
+def get_image_description(img: Image.Image) -> str:
+    global processor, vision_model
     if processor is None or vision_model is None:
+        # use the same default V+L model everywhere
+        vlm = "llava-hf/llava-v1.6-mistral-7b-hf"
+        processor = LlavaNextProcessor.from_pretrained(vlm)
         vision_model = LlavaNextForConditionalGeneration.from_pretrained(
+            vlm, torch_dtype=torch.float16, low_cpu_mem_usage=True
         ).to("cuda")
+    torch.cuda.empty_cache(); gc.collect()
     prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
+    inputs = processor(prompt, img, return_tensors="pt").to("cuda")
+    out = vision_model.generate(**inputs, max_new_tokens=100)
+    return processor.decode(out[0], skip_special_tokens=True)
 def extract_data_from_pdfs(
+    docs, session, include_images, do_ocr, ocr_choice, vlm_choice, progress=gr.Progress()
 ):
     if not docs:
         raise gr.Error("No documents to process")
+    # 1) Optional OCR
+    local_ocr = None
     if do_ocr == "Get Text With OCR":
         db_m, crnn_m = OCR_CHOICES[ocr_choice]
         local_ocr = ocr_predictor(db_m, crnn_m, pretrained=True, assume_straight_pages=True)
+    # 2) Prepare V+L
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
+    vis = LlavaNextForConditionalGeneration.from_pretrained(
+        vlm_choice, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    ).to("cuda")
+    # 3) Patch get_image_description to use this choice
     def describe(img: Image.Image) -> str:
         torch.cuda.empty_cache(); gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
         inp = proc(prompt, img, return_tensors="pt").to("cuda")
         out = vis.generate(**inp, max_new_tokens=100)
         return proc.decode(out[0], skip_special_tokens=True)
+    global get_image_description, CURRENT_VDB
     get_image_description = describe
+    # 4) Pull text + images
     progress(0.2, "Extracting text and images…")
+    full_text, images, names = "", [], []
+    for p in docs:
         if local_ocr:
+            pdf = DocumentFile.from_pdf(p)
             res = local_ocr(pdf)
+            full_text += " ".join(w.value for blk in res.pages for line in blk.lines for w in line.words) + "\n\n"
         else:
+            full_text += (PdfReader(p).pages[0].extract_text() or "") + "\n\n"
         if include_images == "Include Images":
+            imgs = extract_images([p])
             images.extend(imgs)
+            names.extend([os.path.basename(p)] * len(imgs))
+    # 5) Build in-memory Chroma
     progress(0.6, "Indexing in vector DB…")
+    client = chromadb.EphemeralClient()
+    for col in ("text_db", "image_db"):
+        if col in [c.name for c in client.list_collections()]:
+            client.delete_collection(col)
+    text_col = client.get_or_create_collection("text_db", embedding_function=SHARED_EMB_FN)
+    img_col = client.get_or_create_collection("image_db", embedding_function=SHARED_EMB_FN,
+                                              metadata={"hnsw:space":"cosine"})
+    if images:
+        descs, metas = [], []
+        for i, im in enumerate(images):
+            cap = get_image_description(im)
+            descs.append(f"{names[i]}: {cap}")
+            metas.append({"image": image_to_bytes(im)})
+        img_col.add(ids=[str(i) for i in range(len(images))],
+                    documents=descs, metadatas=metas)
+    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    docs_ = splitter.create_documents([full_text])
+    text_col.add(ids=[str(i) for i in range(len(docs_))],
+                 documents=[d.page_content for d in docs_])
+    CURRENT_VDB = client
+    session["processed"] = True
+    sample = images[:4] if include_images=="Include Images" else []
+    return session, full_text[:2000]+"...", sample, "<h3>Done!</h3>"
+def conversation(session, question, num_ctx, img_ctx, history, temp, max_tok, model_id):
     global CURRENT_VDB
     if not session.get("processed") or CURRENT_VDB is None:
         raise gr.Error("Please extract data first")
+    # a) text retrieval
+    docs = CURRENT_VDB.get_collection("text_db")\
+         .query(query_texts=[question], n_results=int(num_ctx), include=["documents"])["documents"][0]
+    # b) image retrieval
+    img_q = CURRENT_VDB.get_collection("image_db")\
+           .query(query_texts=[question], n_results=int(img_ctx),
+                  include=["metadatas","documents"])
     img_descs = img_q["documents"][0] or ["No images found"]
     images = []
+    for m in img_q["metadatas"][0]:
+        b = m.get("image","")
+        try: images.append(Image.open(io.BytesIO(base64.b64decode(b))))
+        except: pass
     img_desc = "\n".join(img_descs)
+    # c) prompt & LLM
     prompt = PromptTemplate(
         template="""
 Context:
 {q}
 Answer:
+""", input_variables=["text","img_desc","q"])
+    inp = prompt.format(text="\n\n".join(docs), img_desc=img_desc, q=question)
     llm = HuggingFaceEndpoint(
+        repo_id=model_id, task="text-generation",
+        temperature=temp, max_new_tokens=max_tok,
+        huggingfacehub_api_token=HF_TOKEN
     )
+    try:    ans = llm.invoke(inp)
     except HfHubHTTPError as e:
+        ans = f"❌ Model `{model_id}` not hosted." if e.response.status_code==404 else f"⚠️ HF API error: {e}"
     except Exception as e:
+        ans = f"⚠️ Unexpected error: {e}"
+    new_hist = history + [{"role":"user","content":question},
+                          {"role":"assistant","content":ans}]
+    return new_hist, docs, images