Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -58,12 +58,12 @@ CURRENT_VDB = None
 @spaces.GPU()
 def get_image_description(image: Image.Image) -> str:
     """
-    Lazy-loads the Llava processor + model
+    Lazy-loads the Llava processor + model inside the GPU worker,
     runs captioning, and returns a one-sentence description.
     """
     global processor, vision_model
 
-    #
+    # On first call, instantiate + move to CUDA
     if processor is None or vision_model is None:
         processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
         vision_model = LlavaNextForConditionalGeneration.from_pretrained(
@@ -72,9 +72,9 @@ def get_image_description(image: Image.Image) -> str:
             low_cpu_mem_usage=True
         ).to("cuda")
 
-    # clear and run
     torch.cuda.empty_cache()
     gc.collect()
+
     prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
     inputs = processor(prompt, image, return_tensors="pt").to("cuda")
     output = vision_model.generate(**inputs, max_new_tokens=100)
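(For reference, a self-contained sketch of the lazy-load pattern this hunk relies on: the processor and model live in module-level globals and are instantiated on the first call inside the `@spaces.GPU()` worker. The checkpoint argument to `from_pretrained` and the final decode step are assumptions here, since the diff cuts them off.)

import gc

import spaces
import torch
from PIL import Image
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

processor = None
vision_model = None

@spaces.GPU()
def get_image_description(image: Image.Image) -> str:
    """Lazy-load LLaVA on the first call inside the GPU worker, then caption."""
    global processor, vision_model
    if processor is None or vision_model is None:
        processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
        vision_model = LlavaNextForConditionalGeneration.from_pretrained(
            "llava-hf/llava-v1.6-mistral-7b-hf",  # assumed: same checkpoint as the processor
            low_cpu_mem_usage=True
        ).to("cuda")

    # free cached memory before running a generation pass
    torch.cuda.empty_cache()
    gc.collect()

    prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
    inputs = processor(prompt, image, return_tensors="pt").to("cuda")
    output = vision_model.generate(**inputs, max_new_tokens=100)
    # assumed: decode the generated ids back to text (not shown in the diff)
    return processor.decode(output[0], skip_special_tokens=True)

On ZeroGPU Spaces a CUDA device is only attached while a `@spaces.GPU()` function is running, which is why the model is created inside the function rather than at import time.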
@@ -175,21 +175,21 @@ def extract_data_from_pdfs(
 ):
     """
     1) (Optional) OCR setup
-    2)
-    3) Extract text
-    4) Build and
+    2) Vision+Lang model setup & monkey-patch get_image_description
+    3) Extract text & images
+    4) Build and stash vector DB in CURRENT_VDB
     """
     if not docs:
         raise gr.Error("No documents to process")
 
-    # 1) OCR
+    # 1) OCR pipeline if requested
     if do_ocr == "Get Text With OCR":
         db_m, crnn_m = OCR_CHOICES[ocr_choice]
         local_ocr = ocr_predictor(db_m, crnn_m, pretrained=True, assume_straight_pages=True)
     else:
         local_ocr = None
 
-    # 2) Vision–language model
+    # 2) Vision–language model
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
     vis = (
         LlavaNextForConditionalGeneration
@@ -197,9 +197,10 @@ def extract_data_from_pdfs(
         .to("cuda")
     )
 
-    # Monkey-patch
+    # Monkey-patch our pipeline for image captions
     def describe(img: Image.Image) -> str:
-        torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
+        gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
         inputs = proc(prompt, img, return_tensors="pt").to("cuda")
         output = vis.generate(**inputs, max_new_tokens=100)
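(A minimal sketch of the monkey-patch above, written as a factory for clarity: the module-level captioner is rebound to a closure over the user-selected processor and model. `make_describe` and the decode line are illustrative additions, not names from the app.)

import gc

import torch
from PIL import Image

def make_describe(proc, vis):
    """Build a captioner bound to the user-selected processor + model."""
    def describe(img: Image.Image) -> str:
        # free cached memory before each caption
        torch.cuda.empty_cache()
        gc.collect()
        prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
        inputs = proc(prompt, img, return_tensors="pt").to("cuda")
        output = vis.generate(**inputs, max_new_tokens=100)
        return proc.decode(output[0], skip_special_tokens=True)
    return describe

# inside extract_data_from_pdfs the module-level hook is then rebound:
#     global get_image_description
#     get_image_description = make_describe(proc, vis)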
@@ -208,13 +209,12 @@ def extract_data_from_pdfs(
     global get_image_description, CURRENT_VDB
     get_image_description = describe
 
-    # 3) Extract text
+    # 3) Extract text + images
    progress(0.2, "Extracting text and images…")
     all_text = ""
     images, names = [], []
 
     for path in docs:
-        # text
         if local_ocr:
             pdf = DocumentFile.from_pdf(path)
             res = local_ocr(pdf)
@@ -223,29 +223,28 @@ def extract_data_from_pdfs(
         txt = PdfReader(path).pages[0].extract_text() or ""
         all_text += txt + "\n\n"
 
-        # images
         if include_images == "Include Images":
             imgs = extract_images([path])
             images.extend(imgs)
             names.extend([os.path.basename(path)] * len(imgs))
 
-    # 4) Build
+    # 4) Build + store the vector DB
     progress(0.6, "Indexing in vector DB…")
     CURRENT_VDB = get_vectordb(all_text, images, names)
 
-    # mark done & return only picklable outputs
     session["processed"] = True
     sample_imgs = images[:4] if include_images == "Include Images" else []
 
+    # ─── return *exactly four* picklable outputs ───
     return (
-        session,
-
-
-
-        "<h3>Done!</h3>"
+        session,                  # gr.State: so UI knows we're ready
+        all_text[:2000] + "...",  # preview text
+        sample_imgs,              # preview images
+        "<h3>Done!</h3>"          # Done message
     )
 
 
+
 # Chat function
 def conversation(
     session: dict,
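(For reference, a runnable sketch of the pattern this hunk switches to: the non-picklable vector DB stays in a module-level global, and the click handler returns exactly four picklable values that map one-to-one onto its outputs list. Component names and function bodies here are placeholders, not the app's real ones.)

import gradio as gr

CURRENT_VDB = None  # heavy, non-picklable object kept at module level

def extract(docs, session):
    global CURRENT_VDB
    CURRENT_VDB = object()          # stand-in for the real vector DB
    session["processed"] = True
    preview = "first 2000 characters of the extracted text..."
    sample_imgs = []                # e.g. a few PIL images or file paths
    return session, preview, sample_imgs, "<h3>Done!</h3>"

with gr.Blocks() as demo:
    session_state = gr.State({})
    docs = gr.File(file_count="multiple")
    extract_btn = gr.Button("Extract")
    preview_text = gr.Textbox(lines=10, label="Sample Text")
    preview_img = gr.Gallery(label="Sample Images")
    preview_html = gr.HTML()

    # four return values -> four output components, in order
    extract_btn.click(
        extract,
        inputs=[docs, session_state],
        outputs=[session_state, preview_text, preview_img, preview_html],
    )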
@@ -258,8 +257,7 @@ def conversation(
     model_id: str
 ):
     """
-
-    calls the HF endpoint, and returns updated chat history.
+    Uses the global CURRENT_VDB (set by extract_data_from_pdfs) to answer.
     """
     global CURRENT_VDB
     if not session.get("processed") or CURRENT_VDB is None:
@@ -272,7 +270,7 @@ def conversation(
         huggingfacehub_api_token=HF_TOKEN
     )
 
-    #
+    # 1) Text retrieval
     text_col = CURRENT_VDB.get_collection("text_db")
     docs = text_col.query(
         query_texts=[question],
@@ -280,6 +278,7 @@ def conversation(
         include=["documents"]
     )["documents"][0]
 
+    # 2) Image retrieval
     img_col = CURRENT_VDB.get_collection("image_db")
     img_q = img_col.query(
         query_texts=[question],
@@ -296,7 +295,7 @@ def conversation(
         pass
     img_desc = "\n".join(img_descs)
 
-    # Build
+    # 3) Build prompt & call LLM
     prompt = PromptTemplate(
         template="""
 Context:
@@ -336,6 +335,7 @@ Answer:
 
 
 
+
 # ─────────────────────────────────────────────────────────────────────────────
 # Gradio UI
 CSS = """
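(A sketch of the two-collection lookup conversation() performs above, assuming CURRENT_VDB is a chromadb-style client exposing get_collection/query. The retrieve wrapper and the n_results defaults are illustrative.)

import chromadb

client = chromadb.Client()
text_col = client.get_or_create_collection("text_db")
img_col = client.get_or_create_collection("image_db")

def retrieve(question: str, num_ctx: int = 3, img_ctx: int = 2):
    """Return the top text chunks and image records for a question."""
    docs = text_col.query(
        query_texts=[question],
        n_results=num_ctx,
        include=["documents"],
    )["documents"][0]
    img_hits = img_col.query(
        query_texts=[question],
        n_results=img_ctx,
        include=["metadatas"],
    )
    return docs, img_hits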
@@ -357,14 +357,13 @@ MODEL_OPTIONS = [
 ]
 
 with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
-    vdb_state
+    # We no longer need vdb_state – we keep only session_state
     session_state = gr.State({})
 
     # ─── Welcome Screen ─────────────────────────────────────────────
     with gr.Column(visible=True) as welcome_col:
-
         gr.Markdown(
-
+            f"<div style='text-align: center'>\n{WELCOME_INTRO}\n</div>",
             elem_id="welcome_md"
         )
         start_btn = gr.Button("🚀 Start")
@@ -386,6 +385,11 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
             value="Exclude Images",
             label="Images"
         )
+        ocr_radio = gr.Radio(
+            ["Get Text With OCR", "Get Available Text Only"],
+            value="Get Available Text Only",
+            label="OCR"
+        )
         ocr_dd = gr.Dropdown(
             choices=[
                 "db_resnet50 + crnn_mobilenet_v3_large",
@@ -405,28 +409,23 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
         extract_btn = gr.Button("Extract")
         preview_text = gr.Textbox(lines=10, label="Sample Text", interactive=False)
         preview_img = gr.Gallery(label="Sample Images", rows=2, value=[])
+        preview_html = gr.HTML()  # for the “Done!” message
 
         extract_btn.click(
-            extract_data_from_pdfs,
+            fn=extract_data_from_pdfs,
             inputs=[
                 docs,
                 session_state,
                 include_dd,
-                gr.Radio(
-                    ["Get Text With OCR", "Get Available Text Only"],
-                    value="Get Available Text Only",
-                    label="OCR"
-                ),
+                ocr_radio,
                 ocr_dd,
                 vlm_dd
             ],
             outputs=[
-
-
-
-
-                preview_img,
-                gr.HTML()
+                session_state,   # session “processed” flag
+                preview_text,    # preview text
+                preview_img,     # preview images
+                preview_html     # done HTML
             ]
         )
 
@@ -446,15 +445,15 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
             value=MODEL_OPTIONS[0],
             label="Choose Chat Model"
         )
-        num_ctx = gr.Slider(1,20,value=3,label="Text Contexts")
-        img_ctx = gr.Slider(1,10,value=2,label="Image Contexts")
-        temp = gr.Slider(0.1,1.0,step=0.1,value=0.4,label="Temperature")
-        max_tok = gr.Slider(10,1000,step=10,value=200,label="Max Tokens")
+        num_ctx = gr.Slider(1, 20, value=3, label="Text Contexts")
+        img_ctx = gr.Slider(1, 10, value=2, label="Image Contexts")
+        temp = gr.Slider(0.1, 1.0, step=0.1, value=0.4, label="Temperature")
+        max_tok = gr.Slider(10, 1000, step=10, value=200, label="Max Tokens")
 
         send.click(
-            conversation,
+            fn=conversation,
             inputs=[
-
+                session_state,   # now drives conversation
                 msg,
                 num_ctx,
                 img_ctx,
@@ -465,18 +464,18 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
             ],
             outputs=[
                 chat,
-                gr.Dataframe(),
+                gr.Dataframe(),   # returned docs
                 gr.Gallery(label="Relevant Images", rows=2, value=[])
             ]
        )
 
-    # Footer inside app_col
     gr.HTML("<center>Made with ❤️ by Zamal</center>")
 
     # ─── Wire the Start button ───────────────────────────────────────
     start_btn.click(
         fn=lambda: (gr.update(visible=False), gr.update(visible=True)),
-        inputs=[],
+        inputs=[],
+        outputs=[welcome_col, app_col]
     )
 
 if __name__ == "__main__":
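(For reference, a runnable sketch of the welcome-to-app visibility toggle wired at the end of the diff; the column contents are placeholders.)

import gradio as gr

with gr.Blocks() as demo:
    with gr.Column(visible=True) as welcome_col:
        gr.Markdown("Welcome")
        start_btn = gr.Button("🚀 Start")

    with gr.Column(visible=False) as app_col:
        gr.Markdown("Main app goes here")

    # Hide the welcome column and reveal the app column on click
    start_btn.click(
        fn=lambda: (gr.update(visible=False), gr.update(visible=True)),
        inputs=[],
        outputs=[welcome_col, app_col],
    )

if __name__ == "__main__":
    demo.launch()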