ugolefoo committed on
Commit
c53dc19
·
verified ·
1 Parent(s): 99298f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -78
app.py CHANGED
@@ -1,32 +1,67 @@
1
- import cv2
2
- import numpy as np
3
- import pytesseract
 
4
  import requests
5
  import pandas as pd
6
- import gradio as gr
7
  import uuid
8
  import os
9
 
10
  # ──────────────────────────────────────────────────────────────
11
- # 1. OCR on the full image (always)
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # ──────────────────────────────────────────────────────────────
13
- def ocr_full_image(image: np.ndarray) -> str:
 
 
 
14
  """
15
- Run Tesseract OCR on the entire image (no thresholding).
16
- Return the raw OCR text.
17
  """
18
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
19
- # We skip explicit thresholding—sometimes stylized covers lose detail under THRESH_OTSU.
20
- text = pytesseract.image_to_string(gray, config="--oem 3 --psm 6")
21
- return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # ──────────────────────────────────────────────────────────────
24
- # 2. Query OpenLibrary API
25
  # ──────────────────────────────────────────────────────────────
26
  def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
27
  """
28
- Search OpenLibrary by title (and optional author).
29
- Return a dict with title, author_name, publisher, first_publish_year, or None.
 
30
  """
31
  base_url = "https://openlibrary.org/search.json"
32
  params = {"title": title_text}
@@ -51,55 +86,42 @@ def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
51
  return None
52
 
53
  # ──────────────────────────────────────────────────────────────
54
- # 3. Process one uploaded image (single OCR pass)
55
  # ──────────────────────────────────────────────────────────────
56
- def process_image(image_file):
57
  """
58
- Gradio passes either a PIL image or None.
59
- If image_file is None, return an empty DataFrame and empty CSV.
60
- Otherwise, convert to OpenCV BGR, OCR the entire image, parse first two lines for title/author,
61
- query OpenLibrary once, and return a DataFrame + CSV file path.
 
62
  """
63
- if image_file is None:
64
- # No image provided → return empty table + an empty CSV file
65
- df_empty = pd.DataFrame(columns=["title", "author_name", "publisher", "first_publish_year"])
66
- empty_bytes = df_empty.to_csv(index=False).encode()
67
- unique_name = f"books_{uuid.uuid4().hex}.csv"
68
- temp_path = os.path.join("/tmp", unique_name)
69
- with open(temp_path, "wb") as f:
70
- f.write(empty_bytes)
71
- return df_empty, temp_path
72
-
73
- # Convert PIL to OpenCV BGR
74
- img = np.array(image_file)[:, :, ::-1].copy()
75
-
76
- # 1) Run OCR on full image
77
- try:
78
- full_text = ocr_full_image(img)
79
- except pytesseract.pytesseract.TesseractNotFoundError:
80
- # If Tesseract isn’t installed, return empty DataFrame and log the issue
81
- print("ERROR: Tesseract not found. Did you add apt.txt with 'tesseract-ocr'?")
82
- df_error = pd.DataFrame(columns=["title", "author_name", "publisher", "first_publish_year"])
83
- error_bytes = df_error.to_csv(index=False).encode()
84
- unique_name = f"books_{uuid.uuid4().hex}.csv"
85
- temp_path = os.path.join("/tmp", unique_name)
86
- with open(temp_path, "wb") as f:
87
- f.write(error_bytes)
88
- return df_error, temp_path
89
-
90
- lines = [line.strip() for line in full_text.splitlines() if line.strip()]
91
-
92
  records = []
93
- if lines:
94
- # Use first line as title, second (if exists) as author
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  title_guess = lines[0]
96
  author_guess = lines[1] if len(lines) > 1 else None
97
- meta = query_openlibrary(title_guess, author_guess)
98
 
 
 
99
  if meta:
100
  records.append(meta)
101
  else:
102
- # No OpenLibrary match → still include OCR guesses
103
  records.append({
104
  "title": title_guess,
105
  "author_name": author_guess or "",
@@ -107,11 +129,11 @@ def process_image(image_file):
107
  "first_publish_year": "",
108
  })
109
 
110
- # Build DataFrame (even if empty)
111
  df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
112
  csv_bytes = df.to_csv(index=False).encode()
113
 
114
- # Write CSV to a unique temporary file
115
  unique_name = f"books_{uuid.uuid4().hex}.csv"
116
  temp_path = os.path.join("/tmp", unique_name)
117
  with open(temp_path, "wb") as f:
@@ -120,41 +142,50 @@ def process_image(image_file):
120
  return df, temp_path
121
 
122
  # ──────────────────────────────────────────────────────────────
123
- # 4. Build the Gradio Interface
124
  # ──────────────────────────────────────────────────────────────
125
  def build_interface():
126
- with gr.Blocks(title="Single‐Cover OCR + OpenLibrary Lookup") as demo:
127
  gr.Markdown(
128
  """
129
- ## Book Cover OCR + OpenLibrary Lookup
130
-
131
- 1. Upload a photo of a single book cover.
132
- 2. The app will run OCR on the full image, take:
133
- - the **first line** as a “title” guess, and
134
- - the **second line** as an “author” guess (if present), then
135
- - query OpenLibrary for metadata.
136
- 3. Results display in a table and can be downloaded as CSV.
137
-
138
- > **Note:**
139
- > • Ensure Tesseract OCR is installed (see `apt.txt`).
140
- > • If no image is uploaded, the table and CSV will be empty.
 
 
141
  """
142
  )
143
 
144
  with gr.Row():
145
- img_in = gr.Image(type="pil", label="Upload Single Book Cover")
146
- run_button = gr.Button("Scan & Lookup")
 
 
147
 
148
  output_table = gr.Dataframe(
149
  headers=["title", "author_name", "publisher", "first_publish_year"],
150
- label="Detected Book Metadata",
151
  datatype="pandas",
152
  )
153
  download_file = gr.File(label="Download CSV")
154
 
155
- def on_run(image):
156
- df, filepath = process_image(image)
157
- return df, filepath
 
 
 
 
 
158
 
159
  run_button.click(
160
  fn=on_run,
@@ -165,5 +196,4 @@ def build_interface():
165
  return demo
166
 
167
  if __name__ == "__main__":
168
- demo_app = build_interface()
169
- demo_app.launch()
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
4
+ from PIL import Image
5
  import requests
6
  import pandas as pd
7
+ import numpy as np
8
  import uuid
9
  import os
10
 
11
  # ──────────────────────────────────────────────────────────────
12
+ # 1. Load Qwen2-VL OCR Model & Processor (once at startup)
13
+ # ──────────────────────────────────────────────────────────────
14
+ MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
15
+
16
+ # Choose device: GPU if available, otherwise CPU
17
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18
+
19
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
20
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
21
+ MODEL_ID,
22
+ trust_remote_code=True,
23
+ torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
24
+ ).to(DEVICE).eval()
25
+
26
  # ──────────────────────────────────────────────────────────────
27
+ # 2. OCR Helper: Extract text from a single PIL image
28
+ # ──────────────────────────────────────────────────────────────
29
+ @torch.no_grad()
30
+ def run_qwen_ocr(pil_image: Image.Image) -> str:
31
  """
32
+ Use Qwen2-VL to OCR the given PIL image.
33
+ Returns a single string of the extracted text.
34
  """
35
+ # Build “chat” content: first a text prompt, then the image
36
+ user_message = [
37
+ {"type": "text", "text": "OCR the text in the image."},
38
+ {"type": "image", "image": pil_image},
39
+ ]
40
+ messages = [{"role": "user", "content": user_message}]
41
+
42
+ # Create the full prompt
43
+ prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
44
+ inputs = processor(
45
+ text=[prompt_full],
46
+ images=[pil_image],
47
+ return_tensors="pt",
48
+ padding=True,
49
+ ).to(DEVICE)
50
+
51
+ # Generate
52
+ outputs = model.generate(**inputs, max_new_tokens=1024)
53
+ decoded = processor.decode(outputs[0], skip_special_tokens=True).strip()
54
+ # The model’s response may include some markup like “<|im_end|>”; remove it
55
+ return decoded.replace("<|im_end|>", "").strip()
56
 
57
  # ──────────────────────────────────────────────────────────────
58
+ # 3. OpenLibrary Lookup Helper
59
  # ──────────────────────────────────────────────────────────────
60
  def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
61
  """
62
+ Query OpenLibrary.search.json by title (and optional author).
63
+ Returns a dict with keys: title, author_name, publisher, first_publish_year.
64
+ If no results, returns None.
65
  """
66
  base_url = "https://openlibrary.org/search.json"
67
  params = {"title": title_text}
 
86
  return None
87
 
88
  # ──────────────────────────────────────────────────────────────
89
+ # 4. Main Processing: OCR → Parse → OpenLibrary → CSV/DF
90
  # ──────────────────────────────────────────────────────────────
91
+ def process_image_list(images: list[Image.Image]):
92
  """
93
+ Takes a list of PIL images (each ideally a single book cover).
94
+ Runs OCR on each via Qwen2-VL, parses first two nonempty lines as title/author,
95
+ looks up metadata once per image, and returns:
96
+ - A pandas DataFrame of all results
97
+ - A filepath to a CSV (written under /tmp)
98
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  records = []
100
+
101
+ for pil_img in images:
102
+ # 1) OCR
103
+ try:
104
+ ocr_text = run_qwen_ocr(pil_img)
105
+ except Exception as e:
106
+ # If model fails, skip this image
107
+ print(f"OCR failed on one image: {e}")
108
+ continue
109
+
110
+ # 2) Parse lines: first nonempty → title, second → author if present
111
+ lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
112
+ if not lines:
113
+ # No text extracted; skip
114
+ continue
115
+
116
  title_guess = lines[0]
117
  author_guess = lines[1] if len(lines) > 1 else None
 
118
 
119
+ # 3) Query OpenLibrary
120
+ meta = query_openlibrary(title_guess, author_guess)
121
  if meta:
122
  records.append(meta)
123
  else:
124
+ # Fallback: record OCR guesses if no OpenLibrary match
125
  records.append({
126
  "title": title_guess,
127
  "author_name": author_guess or "",
 
129
  "first_publish_year": "",
130
  })
131
 
132
+ # 4) Build DataFrame (even if empty)
133
  df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
134
  csv_bytes = df.to_csv(index=False).encode()
135
 
136
+ # 5) Write CSV to a temporary file
137
  unique_name = f"books_{uuid.uuid4().hex}.csv"
138
  temp_path = os.path.join("/tmp", unique_name)
139
  with open(temp_path, "wb") as f:
 
142
  return df, temp_path
143
 
144
  # ──────────────────────────────────────────────────────────────
145
+ # 5. Gradio Interface
146
  # ──────────────────────────────────────────────────────────────
147
  def build_interface():
148
+ with gr.Blocks(title="Book Cover Scanner (Qwen2-VL OCR)") as demo:
149
  gr.Markdown(
150
  """
151
+ # 📚 Book Cover Scanner + Metadata Lookup
152
+
153
+ 1. Upload **one or more** images, each containing a single book cover.
154
+ 2. The app will OCR each cover (via Qwen2-VL), take:
155
+ - the **first nonempty line** as a “title” guess, and
156
+ - the **second nonempty line** (if present) as an “author” guess, then
157
+ - query OpenLibrary once per image for metadata.
158
+ 3. A table appears below with Title, Author(s), Publisher, Year.
159
+ 4. Click “Download CSV” to export all results.
160
+
161
+ **Tips:**
162
+ - Use clear, high‐contrast photos (text should be legible).
163
+ - For best results, crop each cover to the image frame (no extra background).
164
+ - If Qwen2-VL fails on any image, that image is skipped in the table.
165
  """
166
  )
167
 
168
  with gr.Row():
169
+ img_in = gr.Gallery(label="Upload Book Cover(s)", elem_id="input_gallery").style(
170
+ height="auto"
171
+ )
172
+ run_button = gr.Button("OCR & Lookup")
173
 
174
  output_table = gr.Dataframe(
175
  headers=["title", "author_name", "publisher", "first_publish_year"],
176
+ label="Detected Books + Metadata",
177
  datatype="pandas",
178
  )
179
  download_file = gr.File(label="Download CSV")
180
 
181
+ def on_run(image_list):
182
+ # image_list is a list of numpy arrays (H×W×3). Convert to PIL:
183
+ pil_images = []
184
+ for np_img in image_list:
185
+ if isinstance(np_img, np.ndarray):
186
+ pil_images.append(Image.fromarray(np_img))
187
+ df, csv_path = process_image_list(pil_images)
188
+ return df, csv_path
189
 
190
  run_button.click(
191
  fn=on_run,
 
196
  return demo
197
 
198
  if __name__ == "__main__":
199
+ build_interface().launch()