Spaces:

ugolefoo
/

bookscanner_app

Runtime error

App Files Files Community

ugolefoo committed on Jun 4

Commit

cece48d

verified ·

1 Parent(s): d668e84

Update app.py

Browse files

Files changed (1) hide show

app.py +138 -47

app.py CHANGED Viewed

@@ -8,22 +8,84 @@ import uuid
 import os
 # ──────────────────────────────────────────────────────────────
-# 1. OCR on the full image (always)
 # ──────────────────────────────────────────────────────────────
-def ocr_full_image(image: np.ndarray) -> str:
     """
-    Run Tesseract OCR on the entire image (no thresholding).
     Return the raw OCR text.
     """
     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    # Note: we’re NOT thresholding here—sometimes stylized covers lose detail under THRESH_OTSU.
-    text = pytesseract.image_to_string(gray, config="--oem 3 --psm 6")
     return text.strip()
 # ──────────────────────────────────────────────────────────────
-# 2. Query OpenLibrary API
 # ──────────────────────────────────────────────────────────────
-def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
     """
     Search OpenLibrary by title (and optional author).
     Return a dict with title, author_name, publisher, first_publish_year, or None.
@@ -51,38 +113,66 @@ def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
     return None
 # ──────────────────────────────────────────────────────────────
-# 3. Process one uploaded image (single OCR pass)
 # ──────────────────────────────────────────────────────────────
 def process_image(image_file):
     """
-    Gradio passes a PIL image or numpy array. Convert to OpenCV BGR,
-    OCR the entire image, parse first two lines for title/author,
-    query OpenLibrary once, and return a DataFrame + CSV file path.
     """
     # Convert PIL to OpenCV BGR
     img = np.array(image_file)[:, :, ::-1].copy()
-    # 1) Run OCR on full image
-    full_text = ocr_full_image(img)
-    lines = [line.strip() for line in full_text.splitlines() if line.strip()]
     records = []
-    if lines:
-        # Use first line as title, second (if exists) as author
-        title_guess = lines[0]
-        author_guess = lines[1] if len(lines) > 1 else None
-        meta = query_openlibrary(title_guess, author_guess)
-        if meta:
-            records.append(meta)
-        else:
-            # No match → still include OCR guesses
-            records.append({
-                "title": title_guess,
-                "author_name": author_guess or "",
-                "publisher": "",
-                "first_publish_year": "",
-            })
     # Build DataFrame (even if empty)
     df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
@@ -97,34 +187,35 @@ def process_image(image_file):
     return df, temp_path
 # ──────────────────────────────────────────────────────────────
-# 4. Build the Gradio Interface
 # ──────────────────────────────────────────────────────────────
 def build_interface():
-    with gr.Blocks(title="Book Cover OCR + Lookup (Single‐Cover Mode)") as demo:
         gr.Markdown(
             """
-            ## Book Cover OCR + OpenLibrary Lookup
-            1. Upload a photo of a single book cover (or any cover‐style image).
-            2. The app will run OCR on the full image, take:
-               - the **first line** as a “title” guess, and
-               - the **second line** (if any) as an “author” guess, then
-               - query OpenLibrary once for metadata.
-            3. You’ll see the result in a table and can download a CSV.
-            > **Note:**
-            > • Because we skip rectangle detection, any visible text on your cover (large, legible fonts) should be picked up.
-            > • If you have multiple covers in one photo, only the first “title/author” will be used.
             """
         )
         with gr.Row():
-            img_in = gr.Image(type="pil", label="Upload Single Book Cover")
             run_button = gr.Button("Scan & Lookup")
         output_table = gr.Dataframe(
             headers=["title", "author_name", "publisher", "first_publish_year"],
-            label="Detected Book Metadata",
             datatype="pandas",
         )
         download_file = gr.File(label="Download CSV")

 import os
 # ──────────────────────────────────────────────────────────────
+# 1. Utility: Detect rectangular contours (approximate book covers)
 # ──────────────────────────────────────────────────────────────
def detect_book_regions(
    image: np.ndarray,
    min_area=5000,
    eps_coef=0.02,
    aspect_ranges=((0.4, 0.9), (1.0, 1.6)),
):
    """
    Detect rectangular regions in an image that likely correspond to book covers.

    Pipeline: grayscale -> Gaussian blur -> Canny edges -> morphological
    close -> external contours -> keep 4-vertex polygons whose bounding box
    has an accepted width/height ratio.

    Args:
        image: Image array; it is converted with cv2.COLOR_BGR2GRAY.
        min_area: Contours whose area is below this value are ignored.
        eps_coef: Fraction of the contour perimeter used as the
            cv2.approxPolyDP tolerance.
        aspect_ranges: Iterable of (low, high) pairs. A box is kept when its
            width/height ratio lies strictly inside any pair. The default
            reproduces the ranges that used to be hard-coded here.

    Returns:
        A list of bounding boxes (x, y, w, h), sorted top-to-bottom and then
        left-to-right.
    """
    # Nothing can be detected in a missing or empty image; return early
    # instead of letting cv2 raise on it.
    if image is None or image.size == 0:
        return []

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)

    # Dilate + erode to close gaps in the edge map
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)

    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on 3.x; the contour list is always the
    # second-to-last element, so index it rather than unpacking two values.
    contours = cv2.findContours(
        closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    boxes = []
    for cnt in contours:
        if cv2.contourArea(cnt) < min_area:
            continue
        peri = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, eps_coef * peri, True)
        # Keep only quadrilaterals
        if len(approx) != 4:
            continue
        x, y, w, h = cv2.boundingRect(approx)
        ar = w / float(h)
        # Filter by the accepted aspect-ratio ranges (strict bounds)
        if any(low < ar < high for low, high in aspect_ranges):
            boxes.append((x, y, w, h))

    # Sort top→bottom (y), then left→right (x)
    boxes.sort(key=lambda b: (b[1], b[0]))
    return boxes
+# ──────────────────────────────────────────────────────────────
+# 2. OCR on a cropped region
+# ──────────────────────────────────────────────────────────────
def ocr_on_region(image: np.ndarray, box: tuple) -> str:
    """
    Crop the image to the given box and run Tesseract OCR.

    Args:
        image: Image array; the crop is converted with cv2.COLOR_BGR2GRAY.
        box: Bounding box as (x, y, w, h).

    Returns:
        The OCR text with surrounding whitespace stripped, or "" when the
        box does not overlap the image.
    """
    x, y, w, h = box
    # Clamp to 0: a negative slice bound would make NumPy index from the far
    # edge instead of cropping at the image border. Bounds past the far edge
    # are already truncated by NumPy slicing.
    x0, y0 = max(x, 0), max(y, 0)
    x1, y1 = max(x + w, 0), max(y + h, 0)
    cropped = image[y0:y1, x0:x1]

    # An empty crop would make cv2.cvtColor raise; there is no text to read.
    if cropped.size == 0:
        return ""

    gray_crop = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    # Binarize with Otsu's threshold before OCR
    _, thresh_crop = cv2.threshold(
        gray_crop, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    custom_config = r"--oem 3 --psm 6"
    text = pytesseract.image_to_string(thresh_crop, config=custom_config)
    return text.strip()
+# ──────────────────────────────────────────────────────────────
+# 3. OCR on the full image (fallback)
+# ──────────────────────────────────────────────────────────────
def ocr_full_image(image: np.ndarray):
    """
    OCR the whole image in one pass (for when no covers were detected).

    The image is converted to grayscale and binarized with Otsu's threshold
    before being handed to Tesseract. Returns the recognized text as a
    string with surrounding whitespace stripped.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # cv2.threshold returns (threshold_value, image); only the image is used.
    binarized = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]
    recognized = pytesseract.image_to_string(
        binarized, config=r"--oem 3 --psm 6"
    )
    return recognized.strip()
 # ──────────────────────────────────────────────────────────────
+# 4. Query OpenLibrary API
 # ──────────────────────────────────────────────────────────────
+def query_openlibrary(title_text: str, author_text: str = None):
     """
     Search OpenLibrary by title (and optional author).
     Return a dict with title, author_name, publisher, first_publish_year, or None.
     return None
 # ──────────────────────────────────────────────────────────────
+# 5. Process one uploaded image
 # ──────────────────────────────────────────────────────────────
 def process_image(image_file):
     """
+    Gradio passes a PIL image or numpy array. Convert to OpenCV BGR,
+    detect covers → OCR → OpenLibrary.
+    If no covers are found, fall back to OCR on the full image once.
+    Write CSV to a temp file and return (DataFrame, filepath).
     """
     # Convert PIL to OpenCV BGR
     img = np.array(image_file)[:, :, ::-1].copy()
+    # 1) Try to detect individual covers
+    boxes = detect_book_regions(img)
     records = []
+    if boxes:
+        # If we found boxes, run OCR + lookup for each
+        for box in boxes:
+            ocr_text = ocr_on_region(img, box)
+            lines = [l.strip() for l in ocr_text.splitlines() if l.strip()]
+            if not lines:
+                continue
+            title_guess = lines[0]
+            author_guess = lines[1] if len(lines) > 1 else None
+            meta = query_openlibrary(title_guess, author_guess)
+            if meta:
+                records.append(meta)
+            else:
+                # No OpenLibrary match → still include OCR result
+                records.append(
+                    {
+                        "title": title_guess,
+                        "author_name": author_guess or "",
+                        "publisher": "",
+                        "first_publish_year": "",
+                    }
+                )
+    else:
+        # 2) FALLBACK: no boxes detected → OCR on full image once
+        full_text = ocr_full_image(img)
+        lines = [l.strip() for l in full_text.splitlines() if l.strip()]
+        if lines:
+            # Use first line as title guess, second (if any) as author guess
+            title_guess = lines[0]
+            author_guess = lines[1] if len(lines) > 1 else None
+            meta = query_openlibrary(title_guess, author_guess)
+            if meta:
+                records.append(meta)
+            else:
+                records.append(
+                    {
+                        "title": title_guess,
+                        "author_name": author_guess or "",
+                        "publisher": "",
+                        "first_publish_year": "",
+                    }
+                )
+        # If lines is empty, records remains empty
     # Build DataFrame (even if empty)
     df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
     return df, temp_path
 # ──────────────────────────────────────────────────────────────
+# 6. Build the Gradio Interface
 # ──────────────────────────────────────────────────────────────
 def build_interface():
+    with gr.Blocks(title="Book Cover Scanner") as demo:
         gr.Markdown(
             """
+            ## Book Cover Scanner + Metadata Lookup
+            1. Upload a photo containing one or multiple book covers
+            2. The app will:
+               - Detect individual covers (rectangles).
+               - If any are found, OCR each one and query OpenLibrary for metadata.
+               - If **no** rectangles are detected, OCR the **entire image** once.
+            3. Display all detected/guessed books in a table.
+            4. Download a CSV of the results.
+            **Tips:**
+            - For best cover detection: use a flat, well-lit photo with minimal glare/obstructions.
+            - You can also place each cover on a plain background (e.g., a white tabletop).
             """
         )
         with gr.Row():
+            img_in = gr.Image(type="pil", label="Upload Image of Book Covers")
             run_button = gr.Button("Scan & Lookup")
         output_table = gr.Dataframe(
             headers=["title", "author_name", "publisher", "first_publish_year"],
+            label="Detected Books + Metadata",
             datatype="pandas",
         )
         download_file = gr.File(label="Download CSV")