Venkat V committed
Commit · 152df72
Parent(s): 6ea5d07
updated with fixes to all modules

Files changed:
- api_backend.py +42 -17
- app.py +66 -53
- graph_module/__init__.py +105 -43
- ocr_module/__init__.py +106 -74
- ocr_module/__init__pyt.py +0 -135
- requirements.txt +4 -0
- summarizer_module/__init__.py +28 -18
- yolo_module/__init__.py +77 -17
api_backend.py
CHANGED
@@ -1,3 +1,14 @@
"""
api_backend.py

FastAPI backend for flowchart-to-English processing. This API supports receiving
an image file, running YOLO-based detection to identify boxes and arrows, performing
OCR, and generating structured JSON + English summary of the flowchart.

Endpoints:
- POST /process-image: Accepts image input and returns structured flowchart data.
"""

from fastapi import FastAPI, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
@@ -7,25 +18,41 @@ import io
import json
import base64

# Import local processing modules
from yolo_module import run_yolo
from ocr_module import extract_text, count_elements, validate_structure
from graph_module import map_arrows, build_flowchart_json
from summarizer_module import summarize_flowchart

# Initialize FastAPI app
app = FastAPI()

# Enable CORS to allow frontend (e.g., Streamlit on localhost) to connect
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with allowed frontend domain
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/process-image")
async def process_image(
    file: UploadFile = File(...),
    debug: str = Form("false")
):
    """
    Receives an uploaded flowchart image, performs object detection and OCR,
    constructs a structured flowchart JSON, and generates a plain-English summary.

    Args:
        file (UploadFile): Flowchart image file (.png, .jpg, .jpeg).
        debug (str): "true" to enable debug mode (includes OCR logs and YOLO preview).

    Returns:
        JSONResponse: Contains flowchart structure, summary, debug output, and optional YOLO overlay.
    """
    debug_mode = debug.lower() == "true"
    debug_log = []

@@ -33,35 +60,31 @@ async def process_image(file: UploadFile = File(...), debug: str = Form("false"))
        debug_log.append("Received file upload")
    print(f"File received: {file.filename}")

    # Convert file bytes to RGB image
    contents = await file.read()
    image = Image.open(io.BytesIO(contents)).convert("RGB")
    if debug_mode:
        debug_log.append("Image converted to RGB")
        print("Image converted to RGB")

    # YOLO Detection for boxes and arrows
    boxes, arrows, vis_debug = run_yolo(image)
    if debug_mode:
        debug_log.append(f"Detected {len(boxes)} boxes, {len(arrows)} arrows")

    # Run OCR on each detected box
    for box in boxes:
        box["text"] = extract_text(image, box["bbox"], debug=debug_mode)
        print(f"OCR for {box['id']}: {box['text']}")
        if debug_mode:
            debug_log.append(f"{box['id']}: {box['text']}")

    # Build structured JSON from nodes and edges
    flowchart_json = build_flowchart_json(boxes, arrows)
    print("Flowchart JSON:", json.dumps(flowchart_json, indent=2))

    # Validate structure
    structure_info = count_elements(boxes, arrows, debug=debug_mode)
    validation = validate_structure(
        flowchart_json,
@@ -72,17 +95,18 @@ async def process_image(file: UploadFile = File(...), debug: str = Form("false"))
    if debug_mode:
        debug_log.append(f"Validation: {validation}")

    # Generate plain-English summary
    summary = summarize_flowchart(flowchart_json)
    print("Summary:", summary)

    # Encode YOLO debug image (if debug enabled)
    yolo_vis = None
    if debug_mode and vis_debug:
        vis_io = io.BytesIO()
        vis_debug.save(vis_io, format="PNG")
        yolo_vis = base64.b64encode(vis_io.getvalue()).decode("utf-8")

    # Return full response
    return JSONResponse({
        "flowchart": flowchart_json,
        "summary": summary,
@@ -92,4 +116,5 @@ async def process_image(file: UploadFile = File(...), debug: str = Form("false"))


if __name__ == "__main__":
    # Run the FastAPI app using Uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
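For quick manual testing of the updated endpoint, a client call might look like the sketch below. Assumptions: the backend is running locally on port 7860 as in the __main__ block, flowchart.png is a placeholder path, and the form field names file and debug match the handler signature above.

# Illustrative client for POST /process-image; not part of the commit.
import requests

with open("flowchart.png", "rb") as f:  # placeholder input image
    response = requests.post(
        "http://localhost:7860/process-image",
        files={"file": ("flowchart.png", f, "image/png")},
        data={"debug": "true"},
    )

payload = response.json()
print(payload["summary"])    # plain-English summary
print(payload["flowchart"])  # structured flowchart JSON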
app.py
CHANGED
@@ -1,76 +1,89 @@
# app.py
"""
Streamlit Frontend App: Uploads a flowchart image, sends it to FastAPI backend,
and displays the structured JSON and English summary. Supports multiple OCR engines.
"""

import streamlit as st
from PIL import Image
import requests
import base64
import io
import os

# Set up Streamlit UI layout
st.set_page_config(page_title="Flowchart to English", layout="wide")
st.title("Flowchart to Plain English")

# Enable debug mode toggle
debug_mode = st.toggle("Show Debug Info", value=False)

# OCR engine selection dropdown
ocr_engine = st.selectbox("Select OCR Engine", ["easyocr", "doctr"], index=0,
                          help="Choose between EasyOCR (lightweight) and Doctr (transformer-based)")

# Flowchart image uploader
uploaded_file = st.file_uploader("Upload a flowchart image", type=["png", "jpg", "jpeg"])

# Backend API URL (defaults to localhost for dev)
API_URL = os.getenv("API_URL", "http://localhost:7860/process-image")

if uploaded_file:
    # Load and resize uploaded image for preview
    image = Image.open(uploaded_file).convert("RGB")
    max_width = 600
    ratio = max_width / float(image.size[0])
    resized_image = image.resize((max_width, int(image.size[1] * ratio)))
    st.image(resized_image, caption="Uploaded Image", use_container_width=False)

    if st.button("Analyze Flowchart"):
        progress = st.progress(0, text="Sending image to backend...")

        try:
            # Send request to FastAPI backend
            response = requests.post(
                API_URL,
                files={"file": uploaded_file.getvalue()},
                data={
                    "debug": str(debug_mode).lower(),
                    "ocr_engine": ocr_engine
                }
            )

            progress.progress(40, text="Processing detection and OCR...")

            if response.status_code == 200:
                result = response.json()

                # Show debug info if enabled
                if debug_mode:
                    st.markdown("### Debug Info")
                    st.code(result.get("debug", ""), language="markdown")

                # Show YOLO visual if available
                if debug_mode and result.get("yolo_vis"):
                    st.markdown("### YOLO Detected Bounding Boxes")
                    yolo_bytes = base64.b64decode(result["yolo_vis"])
                    yolo_img = Image.open(io.BytesIO(yolo_bytes))
                    st.image(yolo_img, caption="YOLO Boxes", use_container_width=True)

                progress.progress(80, text="Finalizing output...")

                # Show flowchart JSON and generated English summary
                col1, col2 = st.columns(2)
                with col1:
                    st.subheader("Flowchart JSON")
                    st.json(result["flowchart"])
                with col2:
                    st.subheader("English Summary")
                    st.markdown(result["summary"])

                progress.progress(100, text="Done!")

            else:
                st.error(f"Backend Error: {response.status_code} - {response.text}")

        except Exception as e:
            st.error(f"Request Failed: {e}")
else:
    st.info("Upload a flowchart image to begin.")
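For reference, the response object this frontend unpacks has roughly the following shape. The field names come from the code above and from api_backend.py; the values here are invented for illustration, and the exact contents of the debug field are elided in this diff.

# Illustrative response payload consumed by app.py; values are made up.
result = {
    "flowchart": {"start": "node1", "steps": []},  # from build_flowchart_json
    "summary": "- Start\n- Validate input\n  - Yes: ...\n  - No: ...",  # from summarize_flowchart
    "debug": "Received file upload ...",           # debug log text (debug mode only)
    "yolo_vis": "<base64-encoded PNG>",            # annotated YOLO overlay (debug mode only)
}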
graph_module/__init__.py
CHANGED
@@ -1,34 +1,83 @@
from shapely.geometry import box as shapely_box, Point
from collections import defaultdict, deque
import math


MAX_FALLBACK_DIST = 150  # pixels

def map_arrows(nodes, arrows):
    """
    Map arrows to node boxes using geometric containment with fallback to nearest box.
    Returns directional edges (source_id, target_id, label).

    Args:
        nodes (list): List of node dicts with 'bbox' field.
        arrows (list): List of arrow dicts with 'tail' and 'head' coordinates.

    Returns:
        list: List of (source, target, label) tuples.
    """
    for node in nodes:
        node["shape"] = shapely_box(*node["bbox"])
        node["center"] = (
            (node["bbox"][0] + node["bbox"][2]) // 2,
            (node["bbox"][1] + node["bbox"][3]) // 2
        )

    edges = []

    def find_nearest_node(pt):
        min_dist = float("inf")
        nearest_id = None
        for n in nodes:
            cx, cy = n["center"]
            dist = math.dist(pt, (cx, cy))
            if dist < min_dist:
                min_dist = dist
                nearest_id = n["id"]
        return nearest_id, min_dist

    for arrow in arrows:
        if not isinstance(arrow, dict) or not isinstance(arrow.get("tail"), (tuple, list)) or not isinstance(arrow.get("head"), (tuple, list)):
            print(f"Skipping malformed arrow: {arrow}")
            continue

        tail_pt = Point(arrow["tail"])
        head_pt = Point(arrow["head"])
        label = arrow.get("label", "").strip()

        source = next((n["id"] for n in nodes if n["shape"].buffer(10).contains(tail_pt)), None)
        target = next((n["id"] for n in nodes if n["shape"].buffer(10).contains(head_pt)), None)

        # Fallback to nearest node if not found
        if not source:
            source, dist = find_nearest_node(arrow["tail"])
            if dist > MAX_FALLBACK_DIST:
                source = None
        if not target:
            target, dist = find_nearest_node(arrow["head"])
            if dist > MAX_FALLBACK_DIST:
                target = None

        if source and target and source != target:
            print(f"Mapped arrow from {source} -> {target} [{label}]")
            edges.append((source, target, label))
        else:
            print(f"Could not map arrow endpoints to nodes: tail={arrow.get('tail')} head={arrow.get('head')}")

    return edges


def detect_node_type(text, default_type="process"):
    """
    Heuristically infer the node type from its text.

    Args:
        text (str): Node label.
        default_type (str): Fallback type.

    Returns:
        str: Inferred node type.
    """
    text_lower = text.lower()
    if "start" in text_lower:
@@ -37,37 +86,43 @@ def detect_node_type(text):
        return "end"
    if "?" in text or "yes" in text_lower or "no" in text_lower:
        return "decision"
    return default_type


def build_flowchart_json(nodes, arrows):
    """
    Construct a structured flowchart JSON using basic graph traversal.

    Args:
        nodes (list): Detected node dicts.
        arrows (list): Detected arrow dicts.

    Returns:
        dict: JSON with 'start' and 'steps'.
    """
    edges = map_arrows(nodes, arrows)

    # Build adjacency and reverse mappings
    graph = defaultdict(list)
    reverse_links = defaultdict(list)
    edge_labels = {}

    for src, tgt, label in edges:
        graph[src].append(tgt)
        reverse_links[tgt].append(src)
        edge_labels[(src, tgt)] = label.lower()

    all_node_ids = {n["id"] for n in nodes}
    start_candidates = [nid for nid in all_node_ids if nid not in reverse_links]

    flowchart = {
        "start": start_candidates[0] if start_candidates else None,
        "steps": []
    }

    visited = set()
    queue = deque(start_candidates)
    id_to_node = {n["id"]: n for n in nodes}

    while queue:
        curr = queue.popleft()
@@ -75,21 +130,24 @@ def build_flowchart_json(nodes, edges):
            continue
        visited.add(curr)

        node = id_to_node.get(curr, {})
        text = node.get("text", "").strip()
        node_type = node.get("type") or detect_node_type(text)

        step = {
            "id": curr,
            "text": text,
            "type": node_type
        }

        parents = list(set(reverse_links[curr]))
        if len(parents) == 1:
            step["parent"] = parents[0]
        elif len(parents) > 1:
            step["parents"] = parents

        next_nodes = list(set(graph[curr]))
        if node_type == "decision" and next_nodes:
            step["branches"] = {}
            for tgt in next_nodes:
                label = edge_labels.get((curr, tgt), "")
@@ -99,14 +157,18 @@ def build_flowchart_json(nodes, edges):
                    step["branches"]["no"] = tgt
                else:
                    step["branches"].setdefault("unknown", []).append(tgt)
                if tgt not in visited:
                    queue.append(tgt)
        elif len(next_nodes) == 1:
            step["next"] = next_nodes[0]
            if next_nodes[0] not in visited:
                queue.append(next_nodes[0])
        elif len(next_nodes) > 1:
            step["next"] = next_nodes
            for n in next_nodes:
                if n not in visited:
                    queue.append(n)

        flowchart["steps"].append(step)

    return flowchart
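As an illustration of the structure build_flowchart_json now produces, a small chart with one decision would come out roughly as below; node ids, texts, and branch targets are invented.

# Illustrative output shape of build_flowchart_json (not taken from a real run).
example = {
    "start": "node1",
    "steps": [
        {"id": "node1", "text": "Start", "type": "start", "next": "node2"},
        {"id": "node2", "text": "Is input valid?", "type": "decision", "parent": "node1",
         "branches": {"yes": "node3", "no": "node4"}},
        {"id": "node3", "text": "Process input", "type": "process", "parent": "node2"},
        {"id": "node4", "text": "Show error", "type": "process", "parent": "node2"},
    ],
}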
ocr_module/__init__.py
CHANGED
@@ -1,19 +1,40 @@
"""
OCR module with support for EasyOCR and Doctr.
Provides the `extract_text` function that accepts a cropped bounding box and image,
and runs OCR based on the selected engine ("easyocr" or "doctr").
"""

import numpy as np
from PIL import Image
import cv2
from textblob import TextBlob
from device_config import get_device

# OCR engine flags
USE_EASYOCR = True
USE_DOCTR = False

# Import EasyOCR if available
try:
    import easyocr
    reader = easyocr.Reader(['en'], gpu=(get_device() == "cuda"))
    print(f"EasyOCR reader initialized on: {get_device()}")
    USE_EASYOCR = True
except ImportError:
    print("EasyOCR not installed. Falling back if Doctr is available.")

# Import Doctr if available
try:
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor
    doctr_model = ocr_predictor(pretrained=True)
    print("Doctr model loaded.")
    USE_DOCTR = True
except ImportError:
    print("Doctr not installed.")

def expand_bbox(bbox, image_size, pad=10):
    """Expand a bounding box by padding within image bounds."""
    x1, y1, x2, y2 = bbox
    x1 = max(0, x1 - pad)
    y1 = max(0, y1 - pad)
@@ -22,104 +43,115 @@ def expand_bbox(bbox, image_size, pad=10):
    return [x1, y1, x2, y2]

def clean_text(text):
    """Use TextBlob to autocorrect basic OCR errors."""
    blob = TextBlob(text)
    return str(blob.correct())

def extract_text(image, bbox, debug=False, engine="easyocr"):
    """
    Run OCR on a cropped region using EasyOCR or Doctr.

    Parameters:
        image (PIL.Image): Full input image.
        bbox (list): [x1, y1, x2, y2] bounding box.
        debug (bool): Enable debug output.
        engine (str): 'easyocr' or 'doctr'.

    Returns:
        str: Cleaned OCR output.
    """
    # Expand and crop image region
    bbox = expand_bbox(bbox, image.size, pad=10)
    x1, y1, x2, y2 = bbox
    cropped = image.crop((x1, y1, x2, y2))

    # Convert to OpenCV grayscale
    cv_img = np.array(cropped)
    gray = cv2.cvtColor(cv_img, cv2.COLOR_RGB2GRAY)

    # Enhance contrast using CLAHE (Contrast Limited Adaptive Histogram Equalization)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)

    # Apply adaptive threshold for better text separation
    thresh = cv2.adaptiveThreshold(enhanced, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 4)

    # Resize for better OCR resolution
    resized = cv2.resize(thresh, (0, 0), fx=2.5, fy=2.5, interpolation=cv2.INTER_LINEAR)

    # Convert to RGB (some OCR engines expect 3-channel images)
    preprocessed = cv2.cvtColor(resized, cv2.COLOR_GRAY2RGB)

    if debug:
        Image.fromarray(preprocessed).save(f"debug_ocr_crop_{x1}_{y1}.png")

    if engine == "doctr" and USE_DOCTR:
        try:
            doc = DocumentFile.from_images([Image.fromarray(preprocessed)])
            result = doctr_model(doc)
            out_text = " ".join([b.value for b in result.pages[0].blocks])
            if debug:
                print(f"Doctr OCR: {out_text}")
            return clean_text(out_text)
        except Exception as e:
            if debug:
                print(f"Doctr failed: {e}")
            return ""

    elif engine == "easyocr" and USE_EASYOCR:
        try:
            results = reader.readtext(preprocessed, paragraph=False, min_size=10)
            filtered = []
            for r in results:
                text = r[1].strip()
                conf = r[2]
                if conf > 0.5 and len(text) > 2 and any(c.isalnum() for c in text):
                    filtered.append(r)

            # Remove duplicates by bounding box IoU overlap
            final = []
            seen = set()
            for r in filtered:
                t = r[1].strip()
                if t.lower() not in seen:
                    seen.add(t.lower())
                    final.append(r)

            final.sort(key=lambda r: (r[0][0][1], r[0][0][0]))
            text = " ".join([r[1] for r in final]).strip()

            if debug:
                for r in final:
                    print(f"EasyOCR: {r[1]} (conf: {r[2]:.2f})")

            return clean_text(text) if text else ""
        except Exception as e:
            if debug:
                print(f"EasyOCR failed: {e}")
            return ""

    else:
        if debug:
            print(f"Unsupported OCR engine: {engine} or not available.")
        return ""

def count_elements(boxes, arrows, debug=False):
    """Return count of boxes and arrows detected."""
    box_count = len(boxes)
    arrow_count = len(arrows)
    if debug:
        print(f"Boxes: {box_count} | Arrows: {arrow_count}")
    return {"box_count": box_count, "arrow_count": arrow_count}

def validate_structure(flowchart_json, expected_boxes=None, expected_arrows=None, debug=False):
    """Validate flowchart structure consistency based on expected counts."""
    actual_boxes = len(flowchart_json.get("steps", []))
    actual_arrows = len(flowchart_json.get("edges", [])) if "edges" in flowchart_json else None

    if debug:
        print(f"JSON boxes: {actual_boxes}, edges: {actual_arrows}")

    return {
        "boxes_valid": (expected_boxes is None or expected_boxes == actual_boxes),
        "arrows_valid": (expected_arrows is None or expected_arrows == actual_arrows)
    }
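A small usage sketch of the new extract_text signature; the image path and bounding box are placeholders, and each engine silently returns an empty string when its backend is not installed, as in the code above.

# Illustrative call into ocr_module.extract_text with both engines; not part of the commit.
from PIL import Image
from ocr_module import extract_text

img = Image.open("flowchart.png").convert("RGB")  # placeholder path
bbox = [40, 60, 220, 120]                         # [x1, y1, x2, y2] of one detected box

text = extract_text(img, bbox, debug=True, engine="easyocr")
if not text:
    text = extract_text(img, bbox, debug=True, engine="doctr")
print(text)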
ocr_module/__init__pyt.py
DELETED
@@ -1,135 +0,0 @@
import easyocr
from PIL import Image
import numpy as np
import cv2
import torch
from textblob import TextBlob
import re

# Enable GPU if available
use_gpu = torch.cuda.is_available()
reader = easyocr.Reader(['en'], gpu=use_gpu)

def expand_bbox(bbox, image_size, pad=10):
    x1, y1, x2, y2 = bbox
    x1 = max(0, x1 - pad)
    y1 = max(0, y1 - pad)
    x2 = min(image_size[0], x2 + pad)
    y2 = min(image_size[1], y2 + pad)
    return [x1, y1, x2, y2]

def clean_text(text):
    # Basic cleanup
    text = re.sub(r'[^A-Za-z0-9?,.:;()\'"\s-]', '', text)  # remove noise characters
    text = re.sub(r'\s+', ' ', text).strip()

    # De-duplicate repeated words
    words = text.split()
    deduped = [words[0]] + [w for i, w in enumerate(words[1:], 1) if w.lower() != words[i - 1].lower()] if words else []
    joined = " ".join(deduped)

    # Run correction only if needed (long word or all caps)
    if len(joined) > 3 and any(len(w) > 10 or w.isupper() for w in deduped):
        blob = TextBlob(joined)
        joined = str(blob.correct())

    return joined

def extract_text(image, bbox, debug=False, use_adaptive_threshold=False):
    """
    Run OCR on a cropped region of the image using EasyOCR with preprocessing.

    Parameters:
        image (PIL.Image): The full image.
        bbox (list): [x1, y1, x2, y2] coordinates of the region to crop.
        debug (bool): If True, show intermediate debug output.
        use_adaptive_threshold (bool): Use adaptive thresholding instead of Otsu's.

    Returns:
        str: Extracted and cleaned text.
    """
    # Expand bbox slightly
    bbox = expand_bbox(bbox, image.size, pad=10)
    x1, y1, x2, y2 = bbox
    cropped = image.crop((x1, y1, x2, y2))

    # Convert to OpenCV format (numpy array)
    cv_img = np.array(cropped)

    # Convert to grayscale
    gray = cv2.cvtColor(cv_img, cv2.COLOR_RGB2GRAY)

    # Apply Gaussian blur to reduce noise
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)

    # Resize (upscale) image for better OCR accuracy
    scale_factor = 2.5
    resized = cv2.resize(blurred, (0, 0), fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR)

    # Convert to RGB as EasyOCR expects color image
    resized_rgb = cv2.cvtColor(resized, cv2.COLOR_GRAY2RGB)

    # Optional: debug save
    if debug:
        debug_image = Image.fromarray(resized_rgb)
        debug_image.save(f"debug_ocr_crop_{x1}_{y1}.png")

    # Run OCR using EasyOCR
    try:
        results = reader.readtext(resized_rgb, paragraph=False, min_size=5)
    except Exception as e:
        if debug:
            print(f"EasyOCR failed: {e}")
        return ""

    if debug:
        for res in results:
            print(f"OCR: {res[1]} (conf: {res[2]:.2f})")

    # Sort boxes top to bottom, then left to right
    results.sort(key=lambda r: (r[0][0][1], r[0][0][0]))

    # Filter by confidence
    filtered = [r for r in results if r[2] > 0.4]
    if not filtered and results:
        filtered = sorted(results, key=lambda r: -r[2])[:2]  # fallback to top-2

    lines = []
    for res in filtered:
        lines.append(res[1])

    joined_text = " ".join(lines).strip()

    # Apply correction
    if joined_text:
        joined_text = clean_text(joined_text)
        if debug:
            print(f"Cleaned OCR text: {joined_text}")

    return joined_text

def count_elements(boxes, arrows, debug=False):
    box_count = len(boxes)
    arrow_count = len(arrows)
    if debug:
        print(f"Detected {box_count} boxes")
        print(f"Detected {arrow_count} arrows")
    return {
        "box_count": box_count,
        "arrow_count": arrow_count
    }

def validate_structure(flowchart_json, expected_boxes=None, expected_arrows=None, debug=False):
    actual_boxes = len(flowchart_json.get("steps", []))
    actual_arrows = len(flowchart_json.get("edges", [])) if "edges" in flowchart_json else None

    if debug:
        print(f"Flowchart JSON has {actual_boxes} steps")
        if actual_arrows is not None:
            print(f"Flowchart JSON has {actual_arrows} edges")

    result = {
        "boxes_valid": (expected_boxes is None or expected_boxes == actual_boxes),
        "arrows_valid": (expected_arrows is None or expected_arrows == actual_arrows)
    }
    return result
requirements.txt
CHANGED
@@ -20,6 +20,10 @@ numpy # Core image array operations
easyocr              # GPU-capable OCR engine
textblob             # Optional: lightweight text post-processing (optional)

# --- Doctr dependencies (torch-based) ---
python-doctr[torch]
onnxruntime          # Required backend for Doctr inference

# Object Detection and Language Models
ultralytics          # YOLOv8/v9 detection (loads .pt models)
torch                # Backend for YOLO and EasyOCR
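With these additions, installing the requirements pulls in python-doctr with its torch extra plus onnxruntime alongside the existing EasyOCR/TextBlob stack; the try/except imports in ocr_module above then decide at runtime which OCR engine is actually available.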
summarizer_module/__init__.py
CHANGED
@@ -1,38 +1,48 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from device_config import get_device
import torch
import json

# Automatically choose device (CUDA, MPS, CPU)
device = get_device()

# Model config: Phi-4-mini instruct model
MODEL_ID = "microsoft/Phi-4-mini-instruct"

# Load tokenizer and model
model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
summarizer = pipeline("text-generation", model=model, tokenizer=tokenizer)

def summarize_flowchart(flowchart_json):
    """
    Generates a human-friendly explanation from flowchart JSON.

    Args:
        flowchart_json (dict): Contains "start" node and a list of "steps".

    Returns:
        str: Bullet-style explanation with proper nesting and flow.
    """
    # Prompt optimized for flow comprehension
    prompt = (
        "You are an expert in visual reasoning and instruction generation.\n"
        "Convert the following flowchart JSON into a clear, step-by-step summary using bullets.\n"
        "- Each bullet represents a process step.\n"
        "- Use indented sub-bullets to explain decision branches (Yes/No).\n"
        "- Maintain order based on dependencies and parent-child links.\n"
        "- Avoid repeating the same step more than once.\n"
        "- Do not include JSON in the output, only human-readable text.\n"
        "\nFlowchart:\n{flowchart}\n\nBullet Explanation:"
    ).format(flowchart=json.dumps(flowchart_json, indent=2))

    # Run the model inference
    result = summarizer(prompt, max_new_tokens=400, do_sample=False)[0]["generated_text"]

    # Extract the portion after the final prompt marker
    if "Bullet Explanation:" in result:
        explanation = result.split("Bullet Explanation:")[-1].strip()
    else:
        explanation = result.strip()

    return explanation
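A usage sketch, assuming a flowchart dict in the shape produced by graph_module; note that the Phi-4-mini-instruct weights are loaded at import time, so importing this module already requires the model to be available.

# Illustrative call; the input mirrors build_flowchart_json output (invented values).
from summarizer_module import summarize_flowchart

flowchart = {
    "start": "node1",
    "steps": [
        {"id": "node1", "text": "Start", "type": "start", "next": "node2"},
        {"id": "node2", "text": "Save record", "type": "process", "parent": "node1"},
    ],
}
print(summarize_flowchart(flowchart))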
yolo_module/__init__.py
CHANGED
@@ -1,55 +1,110 @@
"""
YOLO module for detecting flowchart elements (boxes and arrows).
Includes optional OCR for labeling arrows and deduplication to eliminate overlapping detections.
"""

from ultralytics import YOLO
from device_config import get_device
from PIL import Image
import numpy as np
import easyocr
from shapely.geometry import box as shapely_box
import torch

# Load YOLO model and move to appropriate device
MODEL_PATH = "models/best.pt"
device = get_device()
model = YOLO(MODEL_PATH).to(device)
print(f"YOLO model loaded on: {device}")

# EasyOCR reader used for detecting optional labels near arrows
reader = easyocr.Reader(['en'], gpu=(device == "cuda"))


def iou(box1, box2):
    """Compute Intersection over Union (IoU) between two bounding boxes."""
    b1 = shapely_box(*box1)
    b2 = shapely_box(*box2)
    return b1.intersection(b2).area / b1.union(b2).area


def deduplicate_boxes(boxes, iou_threshold=0.6):
    """
    Eliminate overlapping or duplicate boxes based on IoU threshold.

    Args:
        boxes (list): List of box dictionaries with 'bbox' key.
        iou_threshold (float): Threshold above which boxes are considered duplicates.

    Returns:
        list: Filtered list of unique boxes.
    """
    filtered = []
    for box in boxes:
        if all(iou(box['bbox'], other['bbox']) < iou_threshold for other in filtered):
            filtered.append(box)
    return filtered


@torch.no_grad()
def run_yolo(image: Image.Image):
    """
    Run YOLO model on input image and return detected boxes, arrows, and annotated image.

    Args:
        image (PIL.Image): Input RGB image of a flowchart.

    Returns:
        tuple:
            boxes (list of dict): Each box has id, bbox, type, label.
            arrows (list of dict): Each arrow has id, tail, head, label.
            vis_image (PIL.Image): Annotated image with detections drawn.
    """
    results = model.predict(image, conf=0.25, verbose=False)[0]

    boxes = []
    arrows = []
    np_img = np.array(image)  # Convert image to numpy array for OCR crops

    for i, box in enumerate(results.boxes):
        cls_id = int(box.cls)
        label = model.names[cls_id]

        x1, y1, x2, y2 = map(int, box.xyxy[0])
        bbox = [x1, y1, x2, y2]

        width = x2 - x1
        height = y2 - y1
        aspect_ratio = width / height

        # Default type assignment
        item_type = "arrow" if label in ["arrow", "control_flow"] else "box"

        # Adjust to 'decision' if it's nearly square (likely diamond shape)
        if item_type == "box" and 0.8 < aspect_ratio < 1.2:
            item_type = "decision"

        # Create basic detection item
        item = {
            "id": f"node{i+1}",
            "bbox": bbox,
            "type": item_type,
            "label": label
        }

        if item_type == "arrow":
            # Extract small patch at arrow center for OCR label
            cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
            pad = 20
            crop = np_img[max(cy - pad, 0):cy + pad, max(cx - pad, 0):cx + pad]
            detected_label = ""
            if crop.size > 0:
                try:
                    ocr_results = reader.readtext(crop)
                    if ocr_results:
                        detected_label = ocr_results[0][1].strip().lower()
                except Exception as e:
                    print(f"Arrow OCR failed: {e}")

            arrows.append({
                "id": f"arrow{len(arrows)+1}",
@@ -60,5 +115,10 @@ def run_yolo(image: Image.Image):
        else:
            boxes.append(item)

    # Remove overlapping duplicate boxes
    boxes = deduplicate_boxes(boxes)

    # Create annotated image with bounding boxes
    vis_image = results.plot(pil=True)

    return boxes, arrows, vis_image
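A quick sketch of calling run_yolo directly; the image path is a placeholder, and models/best.pt must exist because the module loads it at import time.

# Illustrative standalone use of yolo_module.run_yolo; not part of the commit.
from PIL import Image
from yolo_module import run_yolo

image = Image.open("flowchart.png").convert("RGB")  # placeholder path
boxes, arrows, vis = run_yolo(image)

for b in boxes:
    print(b["id"], b["type"], b["bbox"], b["label"])
print(f"{len(arrows)} arrows detected")
vis.save("yolo_annotated.png")  # annotated preview from results.plot(pil=True)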