ugolefoo committed on
Commit
1ab41e0
·
verified ·
1 Parent(s): 60b9f6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -39
app.py CHANGED
@@ -12,8 +12,6 @@ import os
12
  # 1. Load Qwen2-VL OCR Model & Processor (once at startup)
13
  # ──────────────────────────────────────────────────────────────
14
  MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
15
-
16
- # Choose device: GPU if available, otherwise CPU
17
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18
 
19
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
@@ -30,16 +28,16 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
30
  def run_qwen_ocr(pil_image: Image.Image) -> str:
31
  """
32
  Use Qwen2-VL to OCR the given PIL image.
33
- Returns a single string of the extracted text.
34
  """
35
- # Build β€œchat” content: first a text prompt, then the image
36
  user_message = [
37
  {"type": "text", "text": "OCR the text in the image."},
38
  {"type": "image", "image": pil_image},
39
  ]
40
  messages = [{"role": "user", "content": user_message}]
41
 
42
- # Create the full prompt
43
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
44
  inputs = processor(
45
  text=[prompt_full],
@@ -48,10 +46,8 @@ def run_qwen_ocr(pil_image: Image.Image) -> str:
48
  padding=True,
49
  ).to(DEVICE)
50
 
51
- # Generate
52
  outputs = model.generate(**inputs, max_new_tokens=1024)
53
  decoded = processor.decode(outputs[0], skip_special_tokens=True).strip()
54
- # The model’s response may include some markup like β€œ<|im_end|>”; remove it
55
  return decoded.replace("<|im_end|>", "").strip()
56
 
57
  # ──────────────────────────────────────────────────────────────
@@ -59,9 +55,8 @@ def run_qwen_ocr(pil_image: Image.Image) -> str:
59
  # ──────────────────────────────────────────────────────────────
60
  def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
61
  """
62
- Query OpenLibrary.search.json by title (and optional author).
63
- Returns a dict with keys: title, author_name, publisher, first_publish_year.
64
- If no results, returns None.
65
  """
66
  base_url = "https://openlibrary.org/search.json"
67
  params = {"title": title_text}
@@ -88,40 +83,41 @@ def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
88
  # ──────────────────────────────────────────────────────────────
89
  # 4. Main Processing: OCR β†’ Parse β†’ OpenLibrary β†’ CSV/DF
90
  # ──────────────────────────────────────────────────────────────
91
- def process_image_list(images: list[Image.Image]):
92
  """
93
- Takes a list of PIL images (each ideally a single book cover).
94
- Runs OCR on each via Qwen2-VL, parses first two nonempty lines as title/author,
95
- looks up metadata once per image, and returns:
96
- - A pandas DataFrame of all results
97
- - A filepath to a CSV (written under /tmp)
98
  """
99
  records = []
100
 
101
- for pil_img in images:
 
 
 
 
 
 
102
  # 1) OCR
103
  try:
104
  ocr_text = run_qwen_ocr(pil_img)
105
  except Exception as e:
106
- # If model fails, skip this image
107
- print(f"OCR failed on one image: {e}")
108
  continue
109
 
110
- # 2) Parse lines: first nonempty β†’ title, second β†’ author if present
111
  lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
112
  if not lines:
113
- # No text extracted; skip
114
  continue
115
 
116
  title_guess = lines[0]
117
  author_guess = lines[1] if len(lines) > 1 else None
118
 
119
- # 3) Query OpenLibrary
120
  meta = query_openlibrary(title_guess, author_guess)
121
  if meta:
122
  records.append(meta)
123
  else:
124
- # Fallback: record OCR guesses if no OpenLibrary match
125
  records.append({
126
  "title": title_guess,
127
  "author_name": author_guess or "",
@@ -129,11 +125,11 @@ def process_image_list(images: list[Image.Image]):
129
  "first_publish_year": "",
130
  })
131
 
132
- # 4) Build DataFrame (even if empty)
133
  df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
134
  csv_bytes = df.to_csv(index=False).encode()
135
 
136
- # 5) Write CSV to a temporary file
137
  unique_name = f"books_{uuid.uuid4().hex}.csv"
138
  temp_path = os.path.join("/tmp", unique_name)
139
  with open(temp_path, "wb") as f:
@@ -150,7 +146,7 @@ def build_interface():
150
  """
151
  # πŸ“š Book Cover Scanner + Metadata Lookup
152
 
153
- 1. Upload **one or more** images, each containing a single book cover.
154
  2. The app will OCR each cover (via Qwen2-VL), take:
155
  - the **first nonempty line** as a β€œtitle” guess, and
156
  - the **second nonempty line** (if present) as an β€œauthor” guess, then
@@ -159,15 +155,18 @@ def build_interface():
159
  4. Click β€œDownload CSV” to export all results.
160
 
161
  **Tips:**
162
- - Use clear, high‐contrast photos (text should be legible).
163
- - For best results, crop each cover to the image frame (no extra background).
164
- - If Qwen2-VL fails on any image, that image is skipped in the table.
165
  """
166
  )
167
 
168
  with gr.Row():
169
- img_in = gr.Gallery(label="Upload Book Cover(s)", elem_id="input_gallery").style(
170
- height="auto"
 
 
 
171
  )
172
  run_button = gr.Button("OCR & Lookup")
173
 
@@ -178,18 +177,14 @@ def build_interface():
178
  )
179
  download_file = gr.File(label="Download CSV")
180
 
181
- def on_run(image_list):
182
- # image_list is a list of numpy arrays (HΓ—WΓ—3). Convert to PIL:
183
- pil_images = []
184
- for np_img in image_list:
185
- if isinstance(np_img, np.ndarray):
186
- pil_images.append(Image.fromarray(np_img))
187
- df, csv_path = process_image_list(pil_images)
188
  return df, csv_path
189
 
190
  run_button.click(
191
  fn=on_run,
192
- inputs=[img_in],
193
  outputs=[output_table, download_file],
194
  )
195
 
 
12
  # 1. Load Qwen2-VL OCR Model & Processor (once at startup)
13
  # ──────────────────────────────────────────────────────────────
14
  MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 
 
15
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
 
17
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
28
  def run_qwen_ocr(pil_image: Image.Image) -> str:
29
  """
30
  Use Qwen2-VL to OCR the given PIL image.
31
+ Returns extracted text.
32
  """
33
+ # Build prompt: text + image
34
  user_message = [
35
  {"type": "text", "text": "OCR the text in the image."},
36
  {"type": "image", "image": pil_image},
37
  ]
38
  messages = [{"role": "user", "content": user_message}]
39
 
40
+ # Create full prompt
41
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
42
  inputs = processor(
43
  text=[prompt_full],
 
46
  padding=True,
47
  ).to(DEVICE)
48
 
 
49
  outputs = model.generate(**inputs, max_new_tokens=1024)
50
  decoded = processor.decode(outputs[0], skip_special_tokens=True).strip()
 
51
  return decoded.replace("<|im_end|>", "").strip()
52
 
53
  # ──────────────────────────────────────────────────────────────
 
55
  # ──────────────────────────────────────────────────────────────
56
  def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
57
  """
58
+ Query OpenLibrary by title (and optional author).
59
+ Returns a dict with title, author_name, publisher, first_publish_year.
 
60
  """
61
  base_url = "https://openlibrary.org/search.json"
62
  params = {"title": title_text}
 
83
  # ──────────────────────────────────────────────────────────────
84
  # 4. Main Processing: OCR β†’ Parse β†’ OpenLibrary β†’ CSV/DF
85
  # ──────────────────────────────────────────────────────────────
86
+ def process_image_list(filepaths: list[str]):
87
  """
88
+ Takes a list of file paths (each a single-cover image).
89
+ Runs OCR on each via Qwen2-VL, parses first two lines as title/author,
90
+ queries OpenLibrary, and returns a DataFrame + CSV path.
 
 
91
  """
92
  records = []
93
 
94
+ for path in filepaths:
95
+ try:
96
+ pil_img = Image.open(path).convert("RGB")
97
+ except Exception as e:
98
+ print(f"Failed to open image {path}: {e}")
99
+ continue
100
+
101
  # 1) OCR
102
  try:
103
  ocr_text = run_qwen_ocr(pil_img)
104
  except Exception as e:
105
+ print(f"OCR failed on {path}: {e}")
 
106
  continue
107
 
108
+ # 2) Parse lines
109
  lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
110
  if not lines:
 
111
  continue
112
 
113
  title_guess = lines[0]
114
  author_guess = lines[1] if len(lines) > 1 else None
115
 
116
+ # 3) OpenLibrary lookup
117
  meta = query_openlibrary(title_guess, author_guess)
118
  if meta:
119
  records.append(meta)
120
  else:
 
121
  records.append({
122
  "title": title_guess,
123
  "author_name": author_guess or "",
 
125
  "first_publish_year": "",
126
  })
127
 
128
+ # 4) Build DataFrame
129
  df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
130
  csv_bytes = df.to_csv(index=False).encode()
131
 
132
+ # 5) Write CSV to temp file
133
  unique_name = f"books_{uuid.uuid4().hex}.csv"
134
  temp_path = os.path.join("/tmp", unique_name)
135
  with open(temp_path, "wb") as f:
 
146
  """
147
  # πŸ“š Book Cover Scanner + Metadata Lookup
148
 
149
+ 1. Upload **one or more** image files, each containing a single book cover.
150
  2. The app will OCR each cover (via Qwen2-VL), take:
151
  - the **first nonempty line** as a β€œtitle” guess, and
152
  - the **second nonempty line** (if present) as an β€œauthor” guess, then
 
155
  4. Click β€œDownload CSV” to export all results.
156
 
157
  **Tips:**
158
+ - Use clear, high-contrast photos (text should be legible).
159
+ - Each image should contain exactly one book cover.
160
+ - If Qwen2-VL OCR fails on any image, that image is skipped.
161
  """
162
  )
163
 
164
  with gr.Row():
165
+ file_input = gr.File(
166
+ label="Upload Book Cover(s)",
167
+ file_count="multiple",
168
+ type="filepath",
169
+ file_types=[".jpg", ".jpeg", ".png"]
170
  )
171
  run_button = gr.Button("OCR & Lookup")
172
 
 
177
  )
178
  download_file = gr.File(label="Download CSV")
179
 
180
+ def on_run(filepaths):
181
+ # filepaths is a list of local file paths
182
+ df, csv_path = process_image_list(filepaths or [])
 
 
 
 
183
  return df, csv_path
184
 
185
  run_button.click(
186
  fn=on_run,
187
+ inputs=[file_input],
188
  outputs=[output_table, download_file],
189
  )
190