zamal committed on
Commit 3ad87bd · verified · 1 Parent(s): 6177313

Update app.py

Files changed (1)
  1. app.py +69 -75
app.py CHANGED
@@ -41,6 +41,9 @@ load_dotenv()
 HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 processor = None
 vision_model = None
+# hold the in-memory vectordb
+CURRENT_VDB = None
+
 # OCR + multimodal image description setup
 ocr_model = ocr_predictor(
     "db_resnet50", "crnn_mobilenet_v3_large", pretrained=True, assume_straight_pages=True
@@ -97,52 +100,34 @@ SHARED_EMB_FN = embedding_functions.SentenceTransformerEmbeddingFunction(
 )

 def get_vectordb(text: str, images: list[Image.Image], img_names: list[str]):
-    """
-    Build a *persistent* ChromaDB instance on disk, with two collections:
-      text_db (chunks of the PDF text)
-      image_db (image descriptions + raw image bytes)
-    """
-    # 1) Make or clean the on-disk folder
-    shutil.rmtree(PERSIST_DIR, ignore_errors=True)
-    os.makedirs(PERSIST_DIR, exist_ok=True)
-
-    client = chromadb.PersistentClient(
-        path=PERSIST_DIR,
-        settings=Settings(),
-        tenant=DEFAULT_TENANT,
-        database=DEFAULT_DATABASE
-    )
-
-    # 3) Create / wipe collections
-    for col in ("text_db", "image_db"):
-        if col in [c.name for c in client.list_collections()]:
-            client.delete_collection(col)
-
-    text_col = client.get_or_create_collection(
-        name="text_db",
-        embedding_function=SHARED_EMB_FN
-    )
-    img_col = client.get_or_create_collection(
-        name="image_db",
+    client = chromadb.EphemeralClient()
+    # wipe old
+    for name in ("text_db", "image_db"):
+        if name in [c.name for c in client.list_collections()]:
+            client.delete_collection(name)
+
+    text_col = client.get_or_create_collection("text_db", embedding_function=SHARED_EMB_FN)
+    img_col = client.get_or_create_collection(
+        "image_db",
         embedding_function=SHARED_EMB_FN,
         metadata={"hnsw:space": "cosine"}
     )

-    # 4) Add images
+    # add images
     if images:
         descs, metas = [], []
-        for idx, img in enumerate(images):
+        for i, img in enumerate(images):
             try:
                 cap = get_image_description(img)
             except:
                 cap = "⚠️ could not describe image"
-            descs.append(f"{img_names[idx]}: {cap}")
+            descs.append(f"{img_names[i]}: {cap}")
             metas.append({"image": image_to_bytes(img)})
         img_col.add(ids=[str(i) for i in range(len(images))],
                     documents=descs,
                     metadatas=metas)

-    # 5) Chunk & add text
+    # chunk + add text
     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     docs = splitter.create_documents([text])
     text_col.add(ids=[str(i) for i in range(len(docs))],
@@ -153,6 +138,7 @@ def get_vectordb(text: str, images: list[Image.Image], img_names: list[str]):



+
 # Text extraction
 def result_to_text(result, as_text=False):
     pages = []
@@ -224,15 +210,15 @@ def extract_data_from_pdfs(
     progress(0.6, "Indexing in vector DB…")
     client = get_vectordb(all_text, images, names)

-    # 6) Mark session and return UI outputs
-    session["processed"] = True
-    session["persist_directory"] = PERSIST_DIR
-    sample_imgs = images[:4] if include_images == "Include Images" else []
+    global CURRENT_VDB
+    CURRENT_VDB = get_vectordb(all_text, images, names)

+    session["processed"] = True
+    sample = images[:4] if include_images=="Include Images" else []
     return (
-        session, # gr.State
+        session,
         all_text[:2000] + "...",
-        sample_imgs,
+        sample,
         "<h3>Done!</h3>"
     )

@@ -250,49 +236,41 @@ def conversation(
     max_tok: int,
     model_id: str
 ):
-    pd = session.get("persist_directory")
-    if not session.get("processed") or not pd:
+    """
+    Uses the in-memory CURRENT_VDB (set by extract_data_from_pdfs) to answer the user.
+    """
+    global CURRENT_VDB
+
+    # 0) Guard: make sure we've extracted at least once
+    if not session.get("processed") or CURRENT_VDB is None:
         raise gr.Error("Please extract data first")

-    # 1) Reopen the same persistent client (new API)
-    client = chromadb.PersistentClient(
-        path=pd,
-        settings=Settings(),
-        tenant=DEFAULT_TENANT,
-        database=DEFAULT_DATABASE
+    # 1) Retrieve top-k text chunks
+    text_col = CURRENT_VDB.get_collection("text_db")
+    docs = text_col.query(
+        query_texts=[question],
+        n_results=int(num_ctx),
+        include=["documents"]
+    )["documents"][0]
+
+    # 2) Retrieve top-k images
+    img_col = CURRENT_VDB.get_collection("image_db")
+    img_q = img_col.query(
+        query_texts=[question],
+        n_results=int(img_ctx),
+        include=["metadatas", "documents"]
     )
-
-
-    # 2) Text retrieval
-    text_col = client.get_collection("text_db")
-    docs = text_col.query(query_texts=[question],
-                          n_results=int(num_ctx),
-                          include=["documents"])["documents"][0]
-
-    # 3) Image retrieval
-    img_col = client.get_collection("image_db")
-    img_q = img_col.query(query_texts=[question],
-                          n_results=int(img_ctx),
-                          include=["metadatas","documents"])
     img_descs = img_q["documents"][0] or ["No images found"]
     images = []
     for meta in img_q["metadatas"][0]:
-        b64 = meta.get("image","")
+        b64 = meta.get("image", "")
         try:
             images.append(Image.open(io.BytesIO(base64.b64decode(b64))))
         except:
             pass
     img_desc = "\n".join(img_descs)

-    # 4) Build prompt & call LLM
-    llm = HuggingFaceEndpoint(
-        repo_id=model_id,
-        task="text-generation",
-        temperature=temp,
-        max_new_tokens=max_tok,
-        huggingfacehub_api_token=HF_TOKEN
-    )
-
+    # 3) Build the prompt
     prompt = PromptTemplate(
         template="""
 Context:
@@ -305,27 +283,43 @@ Question:
 {q}

 Answer:
-    """, input_variables=["text","img_desc","q"]
+    """,
+        input_variables=["text", "img_desc", "q"],
+    )
+    user_input = prompt.format(
+        text="\n\n".join(docs),
+        img_desc=img_desc,
+        q=question
     )
-    inp = prompt.format(text="\n\n".join(docs), img_desc=img_desc, q=question)

+    # 4) Call the LLM
+    llm = HuggingFaceEndpoint(
+        repo_id=model_id,
+        task="text-generation",
+        temperature=temp,
+        max_new_tokens=max_tok,
+        # the client will pick up HUGGINGFACEHUB_API_TOKEN from env automatically
+    )
     try:
-        answer = llm.invoke(inp)
+        answer = llm.invoke(user_input)
     except HfHubHTTPError as e:
-        answer = "❌ Model not hosted" if e.response.status_code==404 else f"⚠️ HF error: {e}"
+        if e.response.status_code == 404:
+            answer = f"❌ Model `{model_id}` not hosted on HF Inference API."
+        else:
+            answer = f"⚠️ HF API error: {e}"
     except Exception as e:
         answer = f"⚠️ Unexpected error: {e}"

+    # 5) Append to chat history and return
     new_history = history + [
-        {"role":"user", "content":question},
-        {"role":"assistant","content":answer}
+        {"role": "user", "content": question},
+        {"role": "assistant", "content": answer}
    ]
     return new_history, docs, images




-
 # ─────────────────────────────────────────────────────────────────────────────
 # Gradio UI
  CSS = """