Spaces:

ugolefoo
/

bookscanner_app

Sleeping

App Files Files Community

ugolefoo committed 6 days ago

Commit

c53dc19

verified ·

1 Parent(s): 99298f3

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -78

app.py CHANGED Viewed

@@ -1,32 +1,67 @@
-import cv2
-import numpy as np
-import pytesseract
 import requests
 import pandas as pd
-import gradio as gr
 import uuid
 import os
 # ──────────────────────────────────────────────────────────────
-# 1. OCR on the full image (always)
 # ──────────────────────────────────────────────────────────────
-def ocr_full_image(image: np.ndarray) -> str:
     """
-    Run Tesseract OCR on the entire image (no thresholding).
-    Return the raw OCR text.
     """
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    # We skip explicit thresholding—sometimes stylized covers lose detail under THRESH_OTSU.
-    text = pytesseract.image_to_string(gray, config="--oem 3 --psm 6")
-    return text.strip()
 # ──────────────────────────────────────────────────────────────
-# 2. Query OpenLibrary API
 # ──────────────────────────────────────────────────────────────
 def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
     """
-    Search OpenLibrary by title (and optional author).
-    Return a dict with title, author_name, publisher, first_publish_year, or None.
     """
     base_url = "https://openlibrary.org/search.json"
     params = {"title": title_text}
@@ -51,55 +86,42 @@ def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
     return None
 # ──────────────────────────────────────────────────────────────
-# 3. Process one uploaded image (single OCR pass)
 # ──────────────────────────────────────────────────────────────
-def process_image(image_file):
     """
-    Gradio passes either a PIL image or None.
-    If image_file is None, return an empty DataFrame and empty CSV.
-    Otherwise, convert to OpenCV BGR, OCR the entire image, parse first two lines for title/author,
-    query OpenLibrary once, and return a DataFrame + CSV file path.
     """
-    if image_file is None:
-        # No image provided → return empty table + an empty CSV file
-        df_empty = pd.DataFrame(columns=["title", "author_name", "publisher", "first_publish_year"])
-        empty_bytes = df_empty.to_csv(index=False).encode()
-        unique_name = f"books_{uuid.uuid4().hex}.csv"
-        temp_path = os.path.join("/tmp", unique_name)
-        with open(temp_path, "wb") as f:
-            f.write(empty_bytes)
-        return df_empty, temp_path
-    # Convert PIL to OpenCV BGR
-    img = np.array(image_file)[:, :, ::-1].copy()
-    # 1) Run OCR on full image
-    try:
-        full_text = ocr_full_image(img)
-    except pytesseract.pytesseract.TesseractNotFoundError:
-        # If Tesseract isn’t installed, return empty DataFrame and log the issue
-        print("ERROR: Tesseract not found. Did you add apt.txt with 'tesseract-ocr'?")
-        df_error = pd.DataFrame(columns=["title", "author_name", "publisher", "first_publish_year"])
-        error_bytes = df_error.to_csv(index=False).encode()
-        unique_name = f"books_{uuid.uuid4().hex}.csv"
-        temp_path = os.path.join("/tmp", unique_name)
-        with open(temp_path, "wb") as f:
-            f.write(error_bytes)
-        return df_error, temp_path
-    lines = [line.strip() for line in full_text.splitlines() if line.strip()]
     records = []
-    if lines:
-        # Use first line as title, second (if exists) as author
         title_guess = lines[0]
         author_guess = lines[1] if len(lines) > 1 else None
-        meta = query_openlibrary(title_guess, author_guess)
         if meta:
             records.append(meta)
         else:
-            # No OpenLibrary match → still include OCR guesses
             records.append({
                 "title": title_guess,
                 "author_name": author_guess or "",
@@ -107,11 +129,11 @@ def process_image(image_file):
                 "first_publish_year": "",
             })
-    # Build DataFrame (even if empty)
     df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
     csv_bytes = df.to_csv(index=False).encode()
-    # Write CSV to a unique temporary file
     unique_name = f"books_{uuid.uuid4().hex}.csv"
     temp_path = os.path.join("/tmp", unique_name)
     with open(temp_path, "wb") as f:
@@ -120,41 +142,50 @@ def process_image(image_file):
     return df, temp_path
 # ──────────────────────────────────────────────────────────────
-# 4. Build the Gradio Interface
 # ──────────────────────────────────────────────────────────────
 def build_interface():
-    with gr.Blocks(title="Single‐Cover OCR + OpenLibrary Lookup") as demo:
         gr.Markdown(
             """
-            ## Book Cover OCR + OpenLibrary Lookup
-            1. Upload a photo of a single book cover.
-            2. The app will run OCR on the full image, take:
-               - the **first line** as a “title” guess, and
-               - the **second line** as an “author” guess (if present), then
-               - query OpenLibrary for metadata.
-            3. Results display in a table and can be downloaded as CSV.
-            > **Note:**
-            > • Ensure Tesseract OCR is installed (see `apt.txt`).
-            > • If no image is uploaded, the table and CSV will be empty.
             """
         )
         with gr.Row():
-            img_in = gr.Image(type="pil", label="Upload Single Book Cover")
-            run_button = gr.Button("Scan & Lookup")
         output_table = gr.Dataframe(
             headers=["title", "author_name", "publisher", "first_publish_year"],
-            label="Detected Book Metadata",
             datatype="pandas",
         )
         download_file = gr.File(label="Download CSV")
-        def on_run(image):
-            df, filepath = process_image(image)
-            return df, filepath
         run_button.click(
             fn=on_run,
@@ -165,5 +196,4 @@ def build_interface():
     return demo
 if __name__ == "__main__":
-    demo_app = build_interface()
-    demo_app.launch()

+import gradio as gr
+import torch
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+from PIL import Image
 import requests
 import pandas as pd
+import numpy as np
 import uuid
 import os
 # ──────────────────────────────────────────────────────────────
+# 1. Load Qwen2-VL OCR Model & Processor (once at startup)
+# ──────────────────────────────────────────────────────────────
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+# Choose device: GPU if available, otherwise CPU
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+).to(DEVICE).eval()
 # ──────────────────────────────────────────────────────────────
+# 2. OCR Helper: Extract text from a single PIL image
+# ──────────────────────────────────────────────────────────────
+@torch.no_grad()
+def run_qwen_ocr(pil_image: Image.Image) -> str:
     """
+    Use Qwen2-VL to OCR the given PIL image.
+
+    Builds a single-turn chat prompt (a text instruction followed by the
+    image), runs generation on DEVICE, and decodes only the tokens the model
+    produced after the prompt.
+
+    Args:
+        pil_image: The image to transcribe.
+
+    Returns:
+        The extracted text as a single stripped string.
     """
+    # Build “chat” content: first a text prompt, then the image
+    user_message = [
+        {"type": "text", "text": "OCR the text in the image."},
+        {"type": "image", "image": pil_image},
+    ]
+    messages = [{"role": "user", "content": user_message}]
+    # Create the full prompt
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(
+        text=[prompt_full],
+        images=[pil_image],
+        return_tensors="pt",
+        padding=True,
+    ).to(DEVICE)
+    # Generate
+    outputs = model.generate(**inputs, max_new_tokens=1024)
+    # generate() returns the prompt tokens followed by the continuation.
+    # Drop the prompt so the chat-template text (role names, the instruction)
+    # does not leak into the OCR result and get mistaken for the cover's title.
+    prompt_len = inputs["input_ids"].shape[1]
+    generated_ids = outputs[0][prompt_len:]
+    decoded = processor.decode(generated_ids, skip_special_tokens=True).strip()
+    # The model’s response may include some markup like “<|im_end|>”; remove it
+    return decoded.replace("<|im_end|>", "").strip()
 # ──────────────────────────────────────────────────────────────
+# 3. OpenLibrary Lookup Helper
 # ──────────────────────────────────────────────────────────────
 def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
     """
+    Query OpenLibrary.search.json by title (and optional author).
+    Returns a dict with keys: title, author_name, publisher, first_publish_year.
+    If no results, returns None.
     """
     base_url = "https://openlibrary.org/search.json"
     params = {"title": title_text}
     return None
 # ──────────────────────────────────────────────────────────────
+# 4. Main Processing: OCR → Parse → OpenLibrary → CSV/DF
 # ──────────────────────────────────────────────────────────────
+def process_image_list(images: list[Image.Image]):
     """
+    Takes a list of PIL images (each ideally a single book cover).
+    Runs OCR on each via Qwen2-VL, parses first two nonempty lines as title/author,
+    looks up metadata once per image, and returns:
+      - A pandas DataFrame of all results
+      - A filepath to a CSV (written under /tmp)
     """
     records = []
+    for pil_img in images:
+        # 1) OCR
+        try:
+            ocr_text = run_qwen_ocr(pil_img)
+        except Exception as e:
+            # If model fails, skip this image
+            print(f"OCR failed on one image: {e}")
+            continue
+        # 2) Parse lines: first nonempty → title, second → author if present
+        lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
+        if not lines:
+            # No text extracted; skip
+            continue
         title_guess = lines[0]
         author_guess = lines[1] if len(lines) > 1 else None
+        # 3) Query OpenLibrary
+        meta = query_openlibrary(title_guess, author_guess)
         if meta:
             records.append(meta)
         else:
+            # Fallback: record OCR guesses if no OpenLibrary match
             records.append({
                 "title": title_guess,
                 "author_name": author_guess or "",
                 "first_publish_year": "",
             })
+    # 4) Build DataFrame (even if empty)
     df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
     csv_bytes = df.to_csv(index=False).encode()
+    # 5) Write CSV to a temporary file
     unique_name = f"books_{uuid.uuid4().hex}.csv"
     temp_path = os.path.join("/tmp", unique_name)
     with open(temp_path, "wb") as f:
     return df, temp_path
 # ──────────────────────────────────────────────────────────────
+# 5. Gradio Interface
 # ──────────────────────────────────────────────────────────────
 def build_interface():
+    with gr.Blocks(title="Book Cover Scanner (Qwen2-VL OCR)") as demo:
         gr.Markdown(
             """
+            # 📚 Book Cover Scanner + Metadata Lookup
+            1. Upload **one or more** images, each containing a single book cover.
+            2. The app will OCR each cover (via Qwen2-VL), take:
+               - the **first nonempty line** as a “title” guess, and
+               - the **second nonempty line** (if present) as an “author” guess, then
+               - query OpenLibrary once per image for metadata.
+            3. A table appears below with Title, Author(s), Publisher, Year.
+            4. Click “Download CSV” to export all results.
+            **Tips:**
+            - Use clear, high‐contrast photos (text should be legible).
+            - For best results, crop each cover to the image frame (no extra background).
+            - If Qwen2-VL fails on any image, that image is skipped in the table.
             """
         )
         with gr.Row():
+            img_in = gr.Gallery(label="Upload Book Cover(s)", elem_id="input_gallery").style(
+                height="auto"
+            )
+            run_button = gr.Button("OCR & Lookup")
         output_table = gr.Dataframe(
             headers=["title", "author_name", "publisher", "first_publish_year"],
+            label="Detected Books + Metadata",
             datatype="pandas",
         )
         download_file = gr.File(label="Download CSV")
+        def on_run(image_list):
+            # image_list is a list of numpy arrays (H×W×3). Convert to PIL:
+            pil_images = []
+            for np_img in image_list:
+                if isinstance(np_img, np.ndarray):
+                    pil_images.append(Image.fromarray(np_img))
+            df, csv_path = process_image_list(pil_images)
+            return df, csv_path
         run_button.click(
             fn=on_run,
     return demo
 if __name__ == "__main__":
+    build_interface().launch()