import os
import uuid
from typing import Optional

import cv2
import gradio as gr
import numpy as np
import pandas as pd
import pytesseract
import requests

# ──────────────────────────────────────────────────────────────
# 1. Utility: Detect rectangular contours (approximate book covers)
# ──────────────────────────────────────────────────────────────
def detect_book_regions(image: np.ndarray, min_area=5000, eps_coef=0.02):
    """
    Detect rectangular regions in an image that likely correspond to book covers.

    Args:
        image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
        min_area: Contours whose area is below this value are skipped.
        eps_coef: Fraction of the contour perimeter used as the tolerance
            for ``cv2.approxPolyDP``.

    Returns:
        A list of bounding boxes ``(x, y, w, h)``, ordered by ``y`` and then
        by ``x``.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)

    # Dilate + erode to close gaps
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)

    contours, _ = cv2.findContours(
        closed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    boxes = []
    for cnt in contours:
        if cv2.contourArea(cnt) < min_area:
            continue

        peri = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, eps_coef * peri, True)

        # Keep only quadrilaterals
        if len(approx) != 4:
            continue

        x, y, w, h = cv2.boundingRect(approx)
        ar = w / float(h)

        # Filter by typical book-cover aspect ratios
        # (you can loosen/tighten these ranges if needed).
        # Note: ratios between 0.9 and 1.0 inclusive are rejected.
        if 0.4 < ar < 0.9 or 1.0 < ar < 1.6:
            boxes.append((x, y, w, h))

    # Sort top→bottom (by y), then left→right (by x)
    boxes.sort(key=lambda b: (b[1], b[0]))
    return boxes


# ──────────────────────────────────────────────────────────────
# 2. OCR on a cropped region
# ──────────────────────────────────────────────────────────────
def _ocr_bgr(image: np.ndarray) -> str:
    """Grayscale + Otsu-threshold a BGR image and return Tesseract's stripped text."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    custom_config = r"--oem 3 --psm 6"
    text = pytesseract.image_to_string(thresh, config=custom_config)
    return text.strip()


def ocr_on_region(image: np.ndarray, box: tuple):
    """
    Crop the image to the given box and run Tesseract OCR.

    Args:
        image: BGR image.
        box: Bounding box ``(x, y, w, h)``.

    Returns:
        The raw OCR text with surrounding whitespace stripped, or an empty
        string when the box selects no pixels.
    """
    x, y, w, h = box
    cropped = image[y : y + h, x : x + w]
    if cropped.size == 0:
        # An empty crop would make cv2.cvtColor raise; there is nothing to read.
        return ""
    return _ocr_bgr(cropped)


# ──────────────────────────────────────────────────────────────
# 3. OCR on the full image (fallback)
# ──────────────────────────────────────────────────────────────
def ocr_full_image(image: np.ndarray):
    """
    Run OCR on the entire image if no covers were detected.

    Returns:
        The full OCR text (string) with surrounding whitespace stripped.
    """
    return _ocr_bgr(image)


# ──────────────────────────────────────────────────────────────
# 4. Query OpenLibrary API
# ──────────────────────────────────────────────────────────────
def query_openlibrary(title_text: str, author_text: Optional[str] = None):
    """
    Search OpenLibrary by title (and optional author).

    Args:
        title_text: Title to search for; a blank title is not queried.
        author_text: Optional author filter.

    Returns:
        A dict with ``title``, ``author_name``, ``publisher`` and
        ``first_publish_year`` taken from the first search hit, or ``None``
        when the title is blank, nothing matched, or the request failed.
        Failures are printed, never raised.
    """
    title_text = (title_text or "").strip()
    if not title_text:
        return None

    base_url = "https://openlibrary.org/search.json"
    params = {
        "title": title_text,
        # Only the first hit is used, so ask for just that one document and
        # just the fields that are read below.
        "fields": "title,author_name,publisher,first_publish_year",
        "limit": 1,
    }
    if author_text:
        params["author"] = author_text

    try:
        resp = requests.get(base_url, params=params, timeout=5)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older versions of requests.
        print(f"OpenLibrary query failed: {e}")
        return None

    docs = data.get("docs") if isinstance(data, dict) else None
    if not docs:
        return None

    doc = docs[0]
    return {
        "title": doc.get("title", ""),
        "author_name": ", ".join(map(str, doc.get("author_name") or [])),
        "publisher": ", ".join(map(str, doc.get("publisher") or [])),
        "first_publish_year": doc.get("first_publish_year", ""),
    }


# ──────────────────────────────────────────────────────────────
# 5.
# Process one uploaded image
# ──────────────────────────────────────────────────────────────
_RESULT_COLUMNS = ["title", "author_name", "publisher", "first_publish_year"]


def _to_bgr(image_file):
    """Convert a PIL image or numpy array to a contiguous 3-channel BGR array."""
    if hasattr(image_file, "convert"):
        # PIL image: normalise grayscale/palette/RGBA modes to plain RGB first.
        image_file = image_file.convert("RGB")

    arr = np.asarray(image_file)
    if arr.ndim == 2:
        # Single-channel array: replicate it so BGR2GRAY conversions still work.
        arr = np.stack([arr, arr, arr], axis=-1)
    elif arr.ndim == 3 and arr.shape[2] == 1:
        arr = np.repeat(arr, 3, axis=2)
    elif arr.ndim == 3 and arr.shape[2] >= 3:
        # Drop an alpha channel if present.
        arr = arr[:, :, :3]
    else:
        raise ValueError(f"Unsupported image shape: {arr.shape}")

    # RGB → BGR
    return arr[:, :, ::-1].copy()


def _record_from_text(ocr_text):
    """Turn OCR text into one result row, or None when the text has no non-blank line."""
    lines = [ln.strip() for ln in ocr_text.splitlines() if ln.strip()]
    if not lines:
        return None

    # Use first line as title guess, second (if any) as author guess
    title_guess = lines[0]
    author_guess = lines[1] if len(lines) > 1 else None

    meta = query_openlibrary(title_guess, author_guess)
    if meta:
        return meta

    # No OpenLibrary match → still include OCR result
    return {
        "title": title_guess,
        "author_name": author_guess or "",
        "publisher": "",
        "first_publish_year": "",
    }


def _write_csv(df):
    """Write the DataFrame to a uniquely named CSV in the system temp directory."""
    import tempfile

    with tempfile.NamedTemporaryFile(
        mode="w",
        prefix="books_",
        suffix=".csv",
        delete=False,
        encoding="utf-8",
        newline="",
    ) as fh:
        df.to_csv(fh, index=False)
        return fh.name


def process_image(image_file):
    """
    Detect covers in an uploaded image, OCR them and look each one up.

    Gradio passes a PIL image or numpy array (or ``None`` when nothing was
    uploaded). The image is converted to OpenCV BGR, then: detect covers →
    OCR each one → OpenLibrary lookup. If no covers are found, OCR is run on
    the full image once instead.

    Returns:
        ``(DataFrame, filepath)``: the results table and the path of a CSV
        file holding the same rows. With no image, or no readable text, the
        table is empty and the CSV contains only the header row.
    """
    records = []

    if image_file is not None:
        img = _to_bgr(image_file)

        # 1) Try to detect individual covers
        boxes = detect_book_regions(img)
        if boxes:
            texts = (ocr_on_region(img, box) for box in boxes)
        else:
            # 2) FALLBACK: no boxes detected → OCR on full image once
            texts = (ocr_full_image(img),)

        for text in texts:
            record = _record_from_text(text)
            if record is not None:
                records.append(record)

    # Build DataFrame (even if empty)
    df = pd.DataFrame(records, columns=_RESULT_COLUMNS)
    return df, _write_csv(df)
# ──────────────────────────────────────────────────────────────
# 6. Build the Gradio Interface
# ──────────────────────────────────────────────────────────────
_INTRO_MARKDOWN = """\
## Book Cover Scanner + Metadata Lookup

1. Upload a photo containing one or multiple book covers
2. The app will:
   - Detect individual covers (rectangles).
   - If any are found, OCR each one and query OpenLibrary for metadata.
   - If **no** rectangles are detected, OCR the **entire image** once.
3. Display all detected/guessed books in a table.
4. Download a CSV of the results.

**Tips:**
- For best cover detection: use a flat, well-lit photo with minimal glare/obstructions.
- You can also place each cover on a plain background (e.g., a white tabletop).
"""


def build_interface():
    """
    Build the Gradio Blocks app.

    The layout is an image upload plus a "Scan & Lookup" button, a results
    table and a CSV download. Clicking the button runs ``process_image`` on
    the uploaded image and fills the table and the download.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    with gr.Blocks(title="Book Cover Scanner") as demo:
        gr.Markdown(_INTRO_MARKDOWN)

        with gr.Row():
            img_in = gr.Image(type="pil", label="Upload Image of Book Covers")
            run_button = gr.Button("Scan & Lookup")

        output_table = gr.Dataframe(
            headers=["title", "author_name", "publisher", "first_publish_year"],
            label="Detected Books + Metadata",
            # `type` selects the pandas value format; `datatype` is the
            # per-column cell type and does not accept "pandas".
            type="pandas",
        )
        download_file = gr.File(label="Download CSV")

        def on_run(image):
            df, filepath = process_image(image)
            return df, filepath

        run_button.click(
            fn=on_run,
            inputs=[img_in],
            outputs=[output_table, download_file],
        )

    return demo


if __name__ == "__main__":
    demo_app = build_interface()
    demo_app.launch()