# bookscanner_app / app.py
# ugolefoo's picture
# Update app.py
# 0475645 verified
# raw
# history blame
# 7.35 kB
import os
import tempfile

import cv2
import gradio as gr
import numpy as np
import pandas as pd
import pytesseract
import requests
# ──────────────────────────────────────────────────────────────
# 1. Utility: Detect rectangular contours (approximate book covers)
# ──────────────────────────────────────────────────────────────
def detect_book_regions(
    image: np.ndarray,
    min_area=10000,
    eps_coef=0.02,
    aspect_ranges=((0.4, 0.9), (1.0, 1.6)),
):
    """
    Detect rectangular regions in an image that likely correspond to book covers.

    Args:
        image: BGR image (H x W x 3); a 2-D array is treated as already grayscale.
        min_area: Contours with an area below this value are ignored.
        eps_coef: Fraction of the contour perimeter used as the tolerance
            when approximating the contour with a polygon.
        aspect_ranges: Iterable of (low, high) pairs. A box is kept when its
            width/height ratio lies strictly inside any of the pairs.

    Returns:
        A list of bounding boxes (x, y, w, h), sorted top→bottom and then
        left→right. The list is empty when the image is None or empty, or
        when no contour passes the filters.
    """
    if image is None or image.size == 0:
        return []

    gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)

    # Morphological close (dilate, then erode) to bridge small gaps in the edges.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)

    # findContours returns (contours, hierarchy) on OpenCV 4 but
    # (image, contours, hierarchy) on OpenCV 3; the contours are always
    # the second-to-last element.
    contours = cv2.findContours(
        closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    boxes = []
    for cnt in contours:
        if cv2.contourArea(cnt) < min_area:
            continue
        peri = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, eps_coef * peri, True)
        # Keep only quadrilaterals.
        if len(approx) != 4:
            continue
        x, y, w, h = cv2.boundingRect(approx)
        ratio = w / float(h)
        # Filter by the configured book-cover aspect ratios.
        if any(low < ratio < high for low, high in aspect_ranges):
            boxes.append((x, y, w, h))

    # Order by the top edge first, then by the left edge.
    boxes.sort(key=lambda b: (b[1], b[0]))
    return boxes
# ──────────────────────────────────────────────────────────────
# 2. OCR on a cropped region
# ──────────────────────────────────────────────────────────────
def ocr_on_region(image: np.ndarray, box: tuple):
    """
    Crop the image to the given box and run Tesseract OCR on the crop.

    Args:
        image: BGR image (H x W x 3); a 2-D array is treated as already grayscale.
        box: (x, y, w, h) bounding box in pixel coordinates.

    Returns:
        The OCR text with surrounding whitespace stripped, or "" when the box
        does not overlap the image.
    """
    x, y, w, h = box
    img_h, img_w = image.shape[:2]

    # Clamp the box to the image. Without this, negative coordinates would
    # wrap around under numpy slicing and select the wrong pixels.
    left, top = max(0, x), max(0, y)
    right, bottom = min(img_w, x + w), min(img_h, y + h)
    if right <= left or bottom <= top:
        # Nothing to read; an empty crop would make cvtColor raise.
        return ""

    cropped = image[top:bottom, left:right]
    gray_crop = (
        cropped if cropped.ndim == 2 else cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    )
    # Otsu picks the binarisation threshold automatically (the 0 is ignored).
    _, thresh_crop = cv2.threshold(
        gray_crop, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    custom_config = r"--oem 3 --psm 6"
    text = pytesseract.image_to_string(thresh_crop, config=custom_config)
    return text.strip()
# ──────────────────────────────────────────────────────────────
# 3. Query OpenLibrary API
# ──────────────────────────────────────────────────────────────
def _join_values(value):
    """Render an OpenLibrary field (list, scalar or missing) as a comma-separated string."""
    if not value:
        return ""
    if isinstance(value, str):
        return value
    return ", ".join(str(item) for item in value)


def query_openlibrary(title_text: str, author_text: str = None):
    """
    Search OpenLibrary by title (and optional author).

    Args:
        title_text: Title to search for. Blank or None short-circuits to None
            without making a request.
        author_text: Optional author filter; ignored when blank.

    Returns:
        A dict with the keys title, author_name, publisher and
        first_publish_year taken from the first search hit, or None when
        there is no hit or the request fails. Failures are printed, not raised.
    """
    title = (title_text or "").strip()
    if not title:
        return None

    base_url = "https://openlibrary.org/search.json"
    params = {
        "title": title,
        # Ask only for the fields read below and only for the first hit:
        # this keeps the payload small and does not rely on the API's
        # default field set.
        "fields": "title,author_name,publisher,first_publish_year",
        "limit": 1,
    }
    author = (author_text or "").strip()
    if author:
        params["author"] = author

    # Only the network round-trip and JSON decoding are best-effort.
    try:
        resp = requests.get(base_url, params=params, timeout=5)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        print(f"OpenLibrary query failed: {e}")
        return None

    docs = data.get("docs") if isinstance(data, dict) else None
    if not docs or not isinstance(docs[0], dict):
        return None

    doc = docs[0]
    return {
        "title": doc.get("title", ""),
        "author_name": _join_values(doc.get("author_name")),
        "publisher": _join_values(doc.get("publisher")),
        "first_publish_year": doc.get("first_publish_year", ""),
    }
# ──────────────────────────────────────────────────────────────
# 4. Process one uploaded image
# ──────────────────────────────────────────────────────────────
def _to_bgr(image_file):
    """Convert a PIL image or ndarray (gray, RGB or RGBA) to a 3-channel BGR array; None if unusable."""
    if image_file is None:
        return None
    arr = np.asarray(image_file)
    if arr.size == 0 or arr.ndim not in (2, 3):
        return None
    if arr.ndim == 2:
        arr = arr[:, :, None]
    if arr.shape[2] < 3:
        # Grayscale (optionally with alpha): replicate the luminance channel.
        arr = np.repeat(arr[:, :, :1], 3, axis=2)
    else:
        # Drop any alpha channel so exactly three colour channels remain.
        arr = arr[:, :, :3]
    # Reverse the channel order (RGB -> BGR) and make the result contiguous.
    return arr[:, :, ::-1].copy()


def process_image(image_file):
    """
    Run the full pipeline on one uploaded image: detect covers → OCR → OpenLibrary.

    Args:
        image_file: PIL image or numpy array in RGB channel order (grayscale
            and RGBA are also accepted), or None when nothing was uploaded.

    Returns:
        A tuple (df, ("books.csv", csv_bytes)). df is a DataFrame with the
        columns title, author_name, publisher and first_publish_year, one row
        per detected cover that produced OCR text; csv_bytes is the same
        table encoded as CSV. Both are empty (header only) when there is no
        image or nothing is detected.
    """
    columns = ["title", "author_name", "publisher", "first_publish_year"]
    records = []

    img = _to_bgr(image_file)
    boxes = detect_book_regions(img) if img is not None else []

    for box in boxes:
        ocr_text = ocr_on_region(img, box)
        lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
        if not lines:
            continue

        # The first non-empty OCR line is taken as the title, the second as the author.
        title_guess = lines[0]
        author_guess = lines[1] if len(lines) > 1 else None

        meta = query_openlibrary(title_guess, author_guess)
        if not meta:
            # No match (or lookup failed): fall back to the raw OCR guesses.
            meta = {
                "title": title_guess,
                "author_name": author_guess or "",
                "publisher": "",
                "first_publish_year": "",
            }
        records.append(meta)

    # Build the DataFrame even when there are no records so the columns are stable.
    df = pd.DataFrame(records, columns=columns)
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    return df, ("books.csv", csv_bytes)
# ──────────────────────────────────────────────────────────────
# 5. Build the Gradio Interface
# ──────────────────────────────────────────────────────────────
def build_interface():
    """
    Build the Gradio Blocks UI for the scanner.

    The layout is an image upload plus a "Scan & Lookup" button, a results
    table and a CSV download. Clicking the button runs process_image on the
    uploaded image and fills the table and the download.

    Returns:
        The constructed gr.Blocks app; the caller is responsible for launching it.
    """
    with gr.Blocks(title="Book Cover Scanner") as demo:
        gr.Markdown(
            "\n"
            "## Book Cover Scanner + Metadata Lookup\n"
            "1. Upload a photo containing one or multiple book covers\n"
            "2. The app will detect each cover, run OCR, then query OpenLibrary for metadata\n"
            "3. Results appear in a table below, and you can download a CSV\n"
        )
        with gr.Row():
            img_in = gr.Image(type="pil", label="Upload Image of Book Covers")
            run_button = gr.Button("Scan & Lookup")
        output_table = gr.Dataframe(
            headers=["title", "author_name", "publisher", "first_publish_year"],
            label="Detected Books with Metadata",
            # "pandas" is a value for `type` (the Python-side container);
            # `datatype` describes column types such as "str" or "number".
            type="pandas",
        )
        download_file = gr.File(label="Download CSV")

        def on_run(image):
            df, (filename, csv_bytes) = process_image(image)
            # gr.File is given a path on disk, so the CSV bytes are written
            # out first. A fresh directory per run keeps the download name
            # as `filename`.
            out_dir = tempfile.mkdtemp(prefix="bookscanner_")
            csv_path = os.path.join(out_dir, filename)
            with open(csv_path, "wb") as fh:
                fh.write(csv_bytes)
            return df, csv_path

        run_button.click(
            fn=on_run,
            inputs=[img_in],
            outputs=[output_table, download_file],
        )
    return demo
if __name__ == "__main__":
    # Script entry point: construct the UI, then start serving it.
    demo_app = build_interface()
    demo_app.launch()