prithivMLmods committed
Commit 96119c1 · verified · 1 Parent(s): 9ef55f2

Update app.py

Files changed (1):
  1. app.py +253 -156
app.py CHANGED
@@ -54,7 +54,6 @@ model_k = VisionEncoderDecoderModel.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#------------------------------------------------#
 # Load SmolDocling-256M-preview
 MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
@@ -63,7 +62,6 @@ model_x = AutoModelForVision2Seq.from_pretrained(
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
-#------------------------------------------------#
 
 # Load MonkeyOCR
 MODEL_ID_G = "echo840/MonkeyOCR"
@@ -126,6 +124,104 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
 
+# Dolphin-specific functions
+def model_chat(prompt, image):
+    """Use Dolphin model for inference."""
+    processor = processor_k
+    model = model_k
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    inputs = processor(image, return_tensors="pt").to(device)
+    pixel_values = inputs.pixel_values.half()
+    prompt_inputs = processor.tokenizer(
+        f"<s>{prompt} <Answer/>",
+        add_special_tokens=False,
+        return_tensors="pt"
+    ).to(device)
+    outputs = model.generate(
+        pixel_values=pixel_values,
+        decoder_input_ids=prompt_inputs.input_ids,
+        decoder_attention_mask=prompt_inputs.attention_mask,
+        min_length=1,
+        max_length=4096,
+        pad_token_id=processor.tokenizer.pad_token_id,
+        eos_token_id=processor.tokenizer.eos_token_id,
+        use_cache=True,
+        bad_words_ids=[[processor.tokenizer.unk_token_id]],
+        return_dict_in_generate=True,
+        do_sample=False,
+        num_beams=1,
+        repetition_penalty=1.1
+    )
+    sequence = processor.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)[0]
+    cleaned = sequence.replace(f"<s>{prompt} <Answer/>", "").replace("<pad>", "").replace("</s>", "").strip()
+    return cleaned
+
+def process_elements(layout_results, image):
+    """Parse layout results and extract elements from the image."""
+    # Placeholder parsing logic based on expected Dolphin output,
+    # assuming layout_results is a string like "[((x1, y1, x2, y2), label), ...]"
+    try:
+        elements = ast.literal_eval(layout_results)
+    except:
+        elements = []  # Fallback if parsing fails
+
+    recognition_results = []
+    reading_order = 0
+
+    for bbox, label in elements:
+        try:
+            x1, y1, x2, y2 = map(int, bbox)
+            cropped = image.crop((x1, y1, x2, y2))
+            if cropped.size[0] > 0 and cropped.size[1] > 0:
+                if label == "text":
+                    text = model_chat("Read text in the image.", cropped)
+                    recognition_results.append({
+                        "label": label,
+                        "bbox": [x1, y1, x2, y2],
+                        "text": text.strip(),
+                        "reading_order": reading_order
+                    })
+                elif label == "table":
+                    table_text = model_chat("Parse the table in the image.", cropped)
+                    recognition_results.append({
+                        "label": label,
+                        "bbox": [x1, y1, x2, y2],
+                        "text": table_text.strip(),
+                        "reading_order": reading_order
+                    })
+                elif label == "figure":
+                    recognition_results.append({
+                        "label": label,
+                        "bbox": [x1, y1, x2, y2],
+                        "text": "[Figure]",  # Placeholder for figure content
+                        "reading_order": reading_order
+                    })
+                reading_order += 1
+        except Exception as e:
+            print(f"Error processing element: {e}")
+            continue
+
+    return recognition_results
+
+def generate_markdown(recognition_results):
+    """Generate markdown from extracted elements."""
+    markdown = ""
+    for element in sorted(recognition_results, key=lambda x: x["reading_order"]):
+        if element["label"] == "text":
+            markdown += f"{element['text']}\n\n"
+        elif element["label"] == "table":
+            markdown += f"**Table:**\n{element['text']}\n\n"
+        elif element["label"] == "figure":
+            markdown += f"{element['text']}\n\n"
+    return markdown.strip()
+
+def process_image_with_dolphin(image):
+    """Process a single image with Dolphin model."""
+    layout_output = model_chat("Parse the reading order of this document.", image)
+    elements = process_elements(layout_output, image)
+    markdown_content = generate_markdown(elements)
+    return markdown_content
+
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
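
Note: the Dolphin path added above is a two-stage pipeline: one model_chat call asks the model for the page layout and reading order, then each detected element is cropped and recognized individually before being stitched into markdown. A minimal driver sketch, assuming the helpers above (and the app's model_k/processor_k) are in scope; "sample_page.png" is a hypothetical placeholder path:

# Hypothetical driver for the Dolphin helpers above; not part of the commit.
from PIL import Image

page = Image.open("sample_page.png").convert("RGB")  # placeholder input

# Stage 1: ask Dolphin for the document layout / reading order.
layout = model_chat("Parse the reading order of this document.", page)

# Stage 2: crop and recognize each element, then emit markdown in order.
elements = process_elements(layout, page)
print(generate_markdown(elements))  # same result as process_image_with_dolphin(page)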
@@ -134,84 +230,82 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
-    # Model selection
-    if model_name == "Nanonets-OCR-s":
-        processor = processor_m
-        model = model_m
-    elif model_name == "MonkeyOCR-Recognition":
-        processor = processor_g
-        model = model_g
-    elif model_name == "SmolDocling-256M-preview":
-        processor = processor_x
-        model = model_x
-    elif model_name == "ByteDance-s-Dolphin":
-        processor = processor_k
-        model = model_k
+    if model_name == "ByteDance-s-Dolphin":
+        if image is None:
+            yield "Please upload an image."
+            return
+        markdown_content = process_image_with_dolphin(image)
+        yield markdown_content
     else:
-        yield "Invalid model selected."
-        return
-
-    if image is None:
-        yield "Please upload an image."
-        return
-
-    # Prepare images as a list (single image for image inference)
-    images = [image]
-
-    # SmolDocling-256M specific preprocessing
-    if model_name == "SmolDocling-256M-preview":
-        if "OTSL" in text or "code" in text:
-            images = [add_random_padding(img) for img in images]
-        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-            text = normalize_values(text, target_max=500)
-
-    # Unified message structure for all models
-    messages = [
-        {
-            "role": "user",
-            "content": [{"type": "image"} for _ in images] + [
-                {"type": "text", "text": text}
-            ]
-        }
-    ]
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-
-    # Generation with streaming
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    # Stream output and collect full response
-    buffer = ""
-    full_output = ""
-    for new_text in streamer:
-        full_output += new_text
-        buffer += new_text.replace("<|im_end|>", "")
-        yield buffer
-
-    # SmolDocling-256M specific postprocessing
-    if model_name == "SmolDocling-256M-preview":
-        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
-        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-            if "<chart>" in cleaned_output:
-                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-            cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-            markdown_output = doc.export_to_markdown()
-            yield f"**MD Output:**\n\n{markdown_output}"
+        # Existing logic for other models
+        if model_name == "Nanonets-OCR-s":
+            processor = processor_m
+            model = model_m
+        elif model_name == "MonkeyOCR-Recognition":
+            processor = processor_g
+            model = model_g
+        elif model_name == "SmolDocling-256M-preview":
+            processor = processor_x
+            model = model_x
         else:
-            yield cleaned_output
+            yield "Invalid model selected."
+            return
+
+        if image is None:
+            yield "Please upload an image."
+            return
+
+        images = [image]
+
+        if model_name == "SmolDocling-256M-preview":
+            if "OTSL" in text or "code" in text:
+                images = [add_random_padding(img) for img in images]
+            if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+                text = normalize_values(text, target_max=500)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"} for _ in images] + [
+                    {"type": "text", "text": text}
+                ]
+            }
+        ]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        full_output = ""
+        for new_text in streamer:
+            full_output += new_text
+            buffer += new_text.replace("<|im_end|>", "")
+            yield buffer
+
+        if model_name == "SmolDocling-256M-preview":
+            cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+            if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+                if "<chart>" in cleaned_output:
+                    cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+                doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+                markdown_output = doc.export_to_markdown()
+                yield f"**MD Output:**\n\n{markdown_output}"
+            else:
+                yield cleaned_output
 
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
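
Note: the non-Dolphin branch keeps the threaded streaming pattern: model.generate blocks, so it runs on a worker thread while transformers' TextIteratorStreamer hands decoded fragments back to the generator, which yields a growing buffer to Gradio. A self-contained sketch of the same pattern; gpt2 is a stand-in checkpoint here, not one of the app's models:

# Stand-alone sketch of the threaded streaming used in generate_image/_video.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")   # stand-in checkpoint
lm = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("OCR is", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs on a worker thread; the streamer is a
# thread-safe iterator over decoded text fragments.
Thread(target=lm.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32}).start()

buffer = ""
for fragment in streamer:
    buffer += fragment  # app.py yields this growing buffer to Gradio
print(buffer)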
@@ -221,85 +315,88 @@ def generate_video(model_name: str, text: str, video_path: str,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for video input using the selected model."""
-    # Model selection
-    if model_name == "Nanonets-OCR-s":
-        processor = processor_m
-        model = model_m
-    elif model_name == "MonkeyOCR-Recognition":
-        processor = processor_g
-        model = model_g
-    elif model_name == "SmolDocling-256M-preview":
-        processor = processor_x
-        model = model_x
-    elif model_name == "ByteDance-s-Dolphin":
-        processor = processor_k
-        model = model_k
+    if model_name == "ByteDance-s-Dolphin":
+        if video_path is None:
+            yield "Please upload a video."
+            return
+        frames = downsample_video(video_path)
+        markdown_contents = []
+        for frame, _ in frames:
+            markdown_content = process_image_with_dolphin(frame)
+            markdown_contents.append(markdown_content)
+        combined_markdown = "\n\n".join(markdown_contents)
+        yield combined_markdown
     else:
-        yield "Invalid model selected."
-        return
-
-    if video_path is None:
-        yield "Please upload a video."
-        return
-
-    # Extract frames from video
-    frames = downsample_video(video_path)
-    images = [frame for frame, _ in frames]
-
-    # SmolDocling-256M specific preprocessing
-    if model_name == "SmolDocling-256M-preview":
-        if "OTSL" in text or "code" in text:
-            images = [add_random_padding(img) for img in images]
-        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-            text = normalize_values(text, target_max=500)
-
-    # Unified message structure for all models
-    messages = [
-        {
-            "role": "user",
-            "content": [{"type": "image"} for _ in images] + [
-                {"type": "text", "text": text}
-            ]
-        }
-    ]
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-
-    # Generation with streaming
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    # Stream output and collect full response
-    buffer = ""
-    full_output = ""
-    for new_text in streamer:
-        full_output += new_text
-        buffer += new_text.replace("<|im_end|>", "")
-        yield buffer
-
-    # SmolDocling-256M specific postprocessing
-    if model_name == "SmolDocling-256M-preview":
-        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
-        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-            if "<chart>" in cleaned_output:
-                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-            cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-            markdown_output = doc.export_to_markdown()
-            yield f"**MD Output:**\n\n{markdown_output}"
+        # Existing logic for other models
+        if model_name == "Nanonets-OCR-s":
+            processor = processor_m
+            model = model_m
+        elif model_name == "MonkeyOCR-Recognition":
+            processor = processor_g
+            model = model_g
+        elif model_name == "SmolDocling-256M-preview":
+            processor = processor_x
+            model = model_x
         else:
-            yield cleaned_output
+            yield "Invalid model selected."
+            return
+
+        if video_path is None:
+            yield "Please upload a video."
+            return
+
+        frames = downsample_video(video_path)
+        images = [frame for frame, _ in frames]
+
+        if model_name == "SmolDocling-256M-preview":
+            if "OTSL" in text or "code" in text:
+                images = [add_random_padding(img) for img in images]
+            if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+                text = normalize_values(text, target_max=500)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"} for _ in images] + [
+                    {"type": "text", "text": text}
+                ]
+            }
+        ]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        full_output = ""
+        for new_text in streamer:
+            full_output += new_text
+            buffer += new_text.replace("<|im_end|>", "")
+            yield buffer
+
+        if model_name == "SmolDocling-256M-preview":
+            cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+            if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+                if "<chart>" in cleaned_output:
+                    cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+                doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+                markdown_output = doc.export_to_markdown()
+                yield f"**MD Output:**\n\n{markdown_output}"
+            else:
+                yield cleaned_output
 
 # Define examples for image and video inference
 image_examples = [
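
Note: both generators finish with the same SmolDocling postprocessing: the DocTags markup is paired with the source images and converted to markdown through docling-core, via the DocTagsDocument / DoclingDocument calls shown above. A minimal sketch of just that conversion, using a toy DocTags string rather than real model output (import paths follow docling-core's published SmolDocling example):

# Sketch of the DocTags -> Markdown step; the doctags string is a toy example.
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from PIL import Image

doctags = "<doctag><text><loc_10><loc_10><loc_490><loc_30>Hello OCR</text></doctag>"
page = Image.new("RGB", (500, 500), "white")  # stand-in for the uploaded image

doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [page])
doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
print(doc.export_to_markdown())  # expected: "Hello OCR"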
@@ -325,7 +422,7 @@ css = """
 
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **[OCRNet 4x 🤗](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+    gr.Markdown("# **[Core OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
 