Spaces:

ugolefoo
/

bookscanner_app

Runtime error

App Files Files Community

ugolefoo committed on Jun 4

Commit

d668e84

verified ·

1 Parent(s): bbc77e0

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -82

app.py CHANGED Viewed

@@ -8,68 +8,22 @@ import uuid
 import os
 # ──────────────────────────────────────────────────────────────
-# 1. Utility: Detect rectangular contours (approximate book covers)
 # ──────────────────────────────────────────────────────────────
-def detect_book_regions(image: np.ndarray, min_area=10000, eps_coef=0.02):
     """
-    Detect rectangular regions in an image that likely correspond to book covers.
-    Returns a list of bounding boxes: (x, y, w, h).
-    """
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
-    edges = cv2.Canny(blurred, 50, 150)
-    # Dilate + erode to close gaps
-    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
-    closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)
-    contours, _ = cv2.findContours(
-        closed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
-    )
-    boxes = []
-    for cnt in contours:
-        area = cv2.contourArea(cnt)
-        if area < min_area:
-            continue
-        peri = cv2.arcLength(cnt, True)
-        approx = cv2.approxPolyDP(cnt, eps_coef * peri, True)
-        # Keep only quadrilaterals
-        if len(approx) == 4:
-            x, y, w, h = cv2.boundingRect(approx)
-            ar = w / float(h)
-            # Filter by typical book-cover aspect ratios
-            if 0.4 < ar < 0.9 or 1.0 < ar < 1.6:
-                boxes.append((x, y, w, h))
-    # Sort top→bottom first, then left→right (key is (y, x))
-    boxes = sorted(boxes, key=lambda b: (b[1], b[0]))
-    return boxes
-# ──────────────────────────────────────────────────────────────
-# 2. OCR on a cropped region
-# ──────────────────────────────────────────────────────────────
-def ocr_on_region(image: np.ndarray, box: tuple):
-    """
-    Crop the image to the given box and run Tesseract OCR.
     Return the raw OCR text.
     """
-    x, y, w, h = box
-    cropped = image[y : y + h, x : x + w]
-    gray_crop = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
-    _, thresh_crop = cv2.threshold(
-        gray_crop, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
-    )
-    custom_config = r"--oem 3 --psm 6"
-    text = pytesseract.image_to_string(thresh_crop, config=custom_config)
     return text.strip()
 # ──────────────────────────────────────────────────────────────
-# 3. Query OpenLibrary API
 # ──────────────────────────────────────────────────────────────
-def query_openlibrary(title_text: str, author_text: str = None):
     """
     Search OpenLibrary by title (and optional author).
     Return a dict with title, author_name, publisher, first_publish_year, or None.
@@ -97,23 +51,24 @@ def query_openlibrary(title_text: str, author_text: str = None):
     return None
 # ──────────────────────────────────────────────────────────────
-# 4. Process one uploaded image
 # ──────────────────────────────────────────────────────────────
 def process_image(image_file):
     """
-    Gradio passes a PIL image or numpy array. Convert to OpenCV BGR, detect covers → OCR → OpenLibrary.
-    Write CSV to a temp file and return (DataFrame, filepath).
     """
-    img = np.array(image_file)[:, :, ::-1].copy()  # PIL to OpenCV BGR
-    boxes = detect_book_regions(img)
-    records = []
-    for box in boxes:
-        ocr_text = ocr_on_region(img, box)
-        lines = [l.strip() for l in ocr_text.splitlines() if l.strip()]
-        if not lines:
-            continue
         title_guess = lines[0]
         author_guess = lines[1] if len(lines) > 1 else None
         meta = query_openlibrary(title_guess, author_guess)
@@ -121,20 +76,19 @@ def process_image(image_file):
         if meta:
             records.append(meta)
         else:
-            records.append(
-                {
-                    "title": title_guess,
-                    "author_name": author_guess or "",
-                    "publisher": "",
-                    "first_publish_year": "",
-                }
-            )
     # Build DataFrame (even if empty)
     df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
     csv_bytes = df.to_csv(index=False).encode()
-    # Write to a unique temporary file
     unique_name = f"books_{uuid.uuid4().hex}.csv"
     temp_path = os.path.join("/tmp", unique_name)
     with open(temp_path, "wb") as f:
@@ -143,26 +97,34 @@ def process_image(image_file):
     return df, temp_path
 # ──────────────────────────────────────────────────────────────
-# 5. Build the Gradio Interface
 # ──────────────────────────────────────────────────────────────
 def build_interface():
-    with gr.Blocks(title="Book Cover Scanner") as demo:
         gr.Markdown(
             """
-            ## Book Cover Scanner + Metadata Lookup
-            1. Upload a photo containing one or multiple book covers
-            2. The app will detect each cover, run OCR, then query OpenLibrary for metadata
-            3. Results appear in a table below, and you can download a CSV
             """
         )
         with gr.Row():
-            img_in = gr.Image(type="pil", label="Upload Image of Book Covers")
             run_button = gr.Button("Scan & Lookup")
         output_table = gr.Dataframe(
             headers=["title", "author_name", "publisher", "first_publish_year"],
-            label="Detected Books with Metadata",
             datatype="pandas",
         )
         download_file = gr.File(label="Download CSV")

 import os
 # ──────────────────────────────────────────────────────────────
+# 1. OCR on the full image (always)
 # ──────────────────────────────────────────────────────────────
+def ocr_full_image(image: np.ndarray) -> str:
     """
+    Run Tesseract OCR on the entire image (no thresholding).
     Return the raw OCR text.
     """
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    # Note: we’re NOT thresholding here—sometimes stylized covers lose detail under THRESH_OTSU.
+    text = pytesseract.image_to_string(gray, config="--oem 3 --psm 6")
     return text.strip()
 # ──────────────────────────────────────────────────────────────
+# 2. Query OpenLibrary API
 # ──────────────────────────────────────────────────────────────
+def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
     """
     Search OpenLibrary by title (and optional author).
     Return a dict with title, author_name, publisher, first_publish_year, or None.
     return None
 # ──────────────────────────────────────────────────────────────
+# 3. Process one uploaded image (single OCR pass)
 # ──────────────────────────────────────────────────────────────
 def process_image(image_file):
     """
+    Gradio passes a PIL image or numpy array. Convert to OpenCV BGR,
+    OCR the entire image, parse first two lines for title/author,
+    query OpenLibrary once, and return a DataFrame + CSV file path.
     """
+    # Convert PIL to OpenCV BGR
+    img = np.array(image_file)[:, :, ::-1].copy()
+    # 1) Run OCR on full image
+    full_text = ocr_full_image(img)
+    lines = [line.strip() for line in full_text.splitlines() if line.strip()]
+    records = []
+    if lines:
+        # Use first line as title, second (if exists) as author
         title_guess = lines[0]
         author_guess = lines[1] if len(lines) > 1 else None
         meta = query_openlibrary(title_guess, author_guess)
         if meta:
             records.append(meta)
         else:
+            # No match → still include OCR guesses
+            records.append({
+                "title": title_guess,
+                "author_name": author_guess or "",
+                "publisher": "",
+                "first_publish_year": "",
+            })
     # Build DataFrame (even if empty)
     df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
     csv_bytes = df.to_csv(index=False).encode()
+    # Write CSV to a unique temporary file
     unique_name = f"books_{uuid.uuid4().hex}.csv"
     temp_path = os.path.join("/tmp", unique_name)
     with open(temp_path, "wb") as f:
     return df, temp_path
 # ──────────────────────────────────────────────────────────────
+# 4. Build the Gradio Interface
 # ──────────────────────────────────────────────────────────────
 def build_interface():
+    with gr.Blocks(title="Book Cover OCR + Lookup (Single‐Cover Mode)") as demo:
         gr.Markdown(
             """
+            ## Book Cover OCR + OpenLibrary Lookup
+            1. Upload a photo of a single book cover (or any cover‐style image).
+            2. The app will run OCR on the full image, take:
+               - the **first line** as a “title” guess, and
+               - the **second line** (if any) as an “author” guess, then
+               - query OpenLibrary once for metadata.
+            3. You’ll see the result in a table and can download a CSV.
+            > **Note:**
+            > • Because we skip rectangle detection, any visible text on your cover (large, legible fonts) should be picked up.
+            > • If you have multiple covers in one photo, only the first “title/author” will be used.
             """
         )
         with gr.Row():
+            img_in = gr.Image(type="pil", label="Upload Single Book Cover")
             run_button = gr.Button("Scan & Lookup")
         output_table = gr.Dataframe(
             headers=["title", "author_name", "publisher", "first_publish_year"],
+            label="Detected Book Metadata",
             datatype="pandas",
         )
         download_file = gr.File(label="Download CSV")