ugolefoo committed on
Commit
1ab41e0
·
verified ·
1 Parent(s): 60b9f6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -39
app.py CHANGED
@@ -12,8 +12,6 @@ import os
12
  # 1. Load Qwen2-VL OCR Model & Processor (once at startup)
13
  # ──────────────────────────────────────────────────────────────
14
  MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
15
-
16
- # Choose device: GPU if available, otherwise CPU
17
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18
 
19
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
@@ -30,16 +28,16 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
30
  def run_qwen_ocr(pil_image: Image.Image) -> str:
31
  """
32
  Use Qwen2-VL to OCR the given PIL image.
33
- Returns a single string of the extracted text.
34
  """
35
- # Build β€œchat” content: first a text prompt, then the image
36
  user_message = [
37
  {"type": "text", "text": "OCR the text in the image."},
38
  {"type": "image", "image": pil_image},
39
  ]
40
  messages = [{"role": "user", "content": user_message}]
41
 
42
- # Create the full prompt
43
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
44
  inputs = processor(
45
  text=[prompt_full],
@@ -48,10 +46,8 @@ def run_qwen_ocr(pil_image: Image.Image) -> str:
48
  padding=True,
49
  ).to(DEVICE)
50
 
51
- # Generate
52
  outputs = model.generate(**inputs, max_new_tokens=1024)
53
  decoded = processor.decode(outputs[0], skip_special_tokens=True).strip()
54
- # The model’s response may include some markup like β€œ<|im_end|>”; remove it
55
  return decoded.replace("<|im_end|>", "").strip()
56
 
57
  # ──────────────────────────────────────────────────────────────
@@ -59,9 +55,8 @@ def run_qwen_ocr(pil_image: Image.Image) -> str:
59
  # ──────────────────────────────────────────────────────────────
60
  def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
61
  """
62
- Query OpenLibrary.search.json by title (and optional author).
63
- Returns a dict with keys: title, author_name, publisher, first_publish_year.
64
- If no results, returns None.
65
  """
66
  base_url = "https://openlibrary.org/search.json"
67
  params = {"title": title_text}
@@ -88,40 +83,41 @@ def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
88
  # ──────────────────────────────────────────────────────────────
89
  # 4. Main Processing: OCR β†’ Parse β†’ OpenLibrary β†’ CSV/DF
90
  # ──────────────────────────────────────────────────────────────
91
- def process_image_list(images: list[Image.Image]):
92
  """
93
- Takes a list of PIL images (each ideally a single book cover).
94
- Runs OCR on each via Qwen2-VL, parses first two nonempty lines as title/author,
95
- looks up metadata once per image, and returns:
96
- - A pandas DataFrame of all results
97
- - A filepath to a CSV (written under /tmp)
98
  """
99
  records = []
100
 
101
- for pil_img in images:
 
 
 
 
 
 
102
  # 1) OCR
103
  try:
104
  ocr_text = run_qwen_ocr(pil_img)
105
  except Exception as e:
106
- # If model fails, skip this image
107
- print(f"OCR failed on one image: {e}")
108
  continue
109
 
110
- # 2) Parse lines: first nonempty β†’ title, second β†’ author if present
111
  lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
112
  if not lines:
113
- # No text extracted; skip
114
  continue
115
 
116
  title_guess = lines[0]
117
  author_guess = lines[1] if len(lines) > 1 else None
118
 
119
- # 3) Query OpenLibrary
120
  meta = query_openlibrary(title_guess, author_guess)
121
  if meta:
122
  records.append(meta)
123
  else:
124
- # Fallback: record OCR guesses if no OpenLibrary match
125
  records.append({
126
  "title": title_guess,
127
  "author_name": author_guess or "",
@@ -129,11 +125,11 @@ def process_image_list(images: list[Image.Image]):
129
  "first_publish_year": "",
130
  })
131
 
132
- # 4) Build DataFrame (even if empty)
133
  df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
134
  csv_bytes = df.to_csv(index=False).encode()
135
 
136
- # 5) Write CSV to a temporary file
137
  unique_name = f"books_{uuid.uuid4().hex}.csv"
138
  temp_path = os.path.join("/tmp", unique_name)
139
  with open(temp_path, "wb") as f:
@@ -150,7 +146,7 @@ def build_interface():
150
  """
151
  # πŸ“š Book Cover Scanner + Metadata Lookup
152
 
153
- 1. Upload **one or more** images, each containing a single book cover.
154
  2. The app will OCR each cover (via Qwen2-VL), take:
155
  - the **first nonempty line** as a β€œtitle” guess, and
156
  - the **second nonempty line** (if present) as an β€œauthor” guess, then
@@ -159,15 +155,18 @@ def build_interface():
159
  4. Click β€œDownload CSV” to export all results.
160
 
161
  **Tips:**
162
- - Use clear, high‐contrast photos (text should be legible).
163
- - For best results, crop each cover to the image frame (no extra background).
164
- - If Qwen2-VL fails on any image, that image is skipped in the table.
165
  """
166
  )
167
 
168
  with gr.Row():
169
- img_in = gr.Gallery(label="Upload Book Cover(s)", elem_id="input_gallery").style(
170
- height="auto"
 
 
 
171
  )
172
  run_button = gr.Button("OCR & Lookup")
173
 
@@ -178,18 +177,14 @@ def build_interface():
178
  )
179
  download_file = gr.File(label="Download CSV")
180
 
181
- def on_run(image_list):
182
- # image_list is a list of numpy arrays (HΓ—WΓ—3). Convert to PIL:
183
- pil_images = []
184
- for np_img in image_list:
185
- if isinstance(np_img, np.ndarray):
186
- pil_images.append(Image.fromarray(np_img))
187
- df, csv_path = process_image_list(pil_images)
188
  return df, csv_path
189
 
190
  run_button.click(
191
  fn=on_run,
192
- inputs=[img_in],
193
  outputs=[output_table, download_file],
194
  )
195
 
 
12
  # 1. Load Qwen2-VL OCR Model & Processor (once at startup)
13
  # ──────────────────────────────────────────────────────────────
14
  MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 
 
15
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
 
17
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
28
  def run_qwen_ocr(pil_image: Image.Image) -> str:
29
  """
30
  Use Qwen2-VL to OCR the given PIL image.
31
+ Returns extracted text.
32
  """
33
+ # Build prompt: text + image
34
  user_message = [
35
  {"type": "text", "text": "OCR the text in the image."},
36
  {"type": "image", "image": pil_image},
37
  ]
38
  messages = [{"role": "user", "content": user_message}]
39
 
40
+ # Create full prompt
41
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
42
  inputs = processor(
43
  text=[prompt_full],
 
46
  padding=True,
47
  ).to(DEVICE)
48
 
 
49
  outputs = model.generate(**inputs, max_new_tokens=1024)
50
  decoded = processor.decode(outputs[0], skip_special_tokens=True).strip()
 
51
  return decoded.replace("<|im_end|>", "").strip()
52
 
53
  # ──────────────────────────────────────────────────────────────
 
55
  # ──────────────────────────────────────────────────────────────
56
  def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
57
  """
58
+ Query OpenLibrary by title (and optional author).
59
+ Returns a dict with title, author_name, publisher, first_publish_year.
 
60
  """
61
  base_url = "https://openlibrary.org/search.json"
62
  params = {"title": title_text}
 
83
  # ──────────────────────────────────────────────────────────────
84
  # 4. Main Processing: OCR β†’ Parse β†’ OpenLibrary β†’ CSV/DF
85
  # ──────────────────────────────────────────────────────────────
86
+ def process_image_list(filepaths: list[str]):
87
  """
88
+ Takes a list of file paths (each a single-cover image).
89
+ Runs OCR on each via Qwen2-VL, parses first two lines as title/author,
90
+ queries OpenLibrary, and returns a DataFrame + CSV path.
 
 
91
  """
92
  records = []
93
 
94
+ for path in filepaths:
95
+ try:
96
+ pil_img = Image.open(path).convert("RGB")
97
+ except Exception as e:
98
+ print(f"Failed to open image {path}: {e}")
99
+ continue
100
+
101
  # 1) OCR
102
  try:
103
  ocr_text = run_qwen_ocr(pil_img)
104
  except Exception as e:
105
+ print(f"OCR failed on {path}: {e}")
 
106
  continue
107
 
108
+ # 2) Parse lines
109
  lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
110
  if not lines:
 
111
  continue
112
 
113
  title_guess = lines[0]
114
  author_guess = lines[1] if len(lines) > 1 else None
115
 
116
+ # 3) OpenLibrary lookup
117
  meta = query_openlibrary(title_guess, author_guess)
118
  if meta:
119
  records.append(meta)
120
  else:
 
121
  records.append({
122
  "title": title_guess,
123
  "author_name": author_guess or "",
 
125
  "first_publish_year": "",
126
  })
127
 
128
+ # 4) Build DataFrame
129
  df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
130
  csv_bytes = df.to_csv(index=False).encode()
131
 
132
+ # 5) Write CSV to temp file
133
  unique_name = f"books_{uuid.uuid4().hex}.csv"
134
  temp_path = os.path.join("/tmp", unique_name)
135
  with open(temp_path, "wb") as f:
 
146
  """
147
  # πŸ“š Book Cover Scanner + Metadata Lookup
148
 
149
+ 1. Upload **one or more** image files, each containing a single book cover.
150
  2. The app will OCR each cover (via Qwen2-VL), take:
151
  - the **first nonempty line** as a β€œtitle” guess, and
152
  - the **second nonempty line** (if present) as an β€œauthor” guess, then
 
155
  4. Click β€œDownload CSV” to export all results.
156
 
157
  **Tips:**
158
+ - Use clear, high-contrast photos (text should be legible).
159
+ - Each image should contain exactly one book cover.
160
+ - If Qwen2-VL OCR fails on any image, that image is skipped.
161
  """
162
  )
163
 
164
  with gr.Row():
165
+ file_input = gr.File(
166
+ label="Upload Book Cover(s)",
167
+ file_count="multiple",
168
+ type="filepath",
169
+ file_types=[".jpg", ".jpeg", ".png"]
170
  )
171
  run_button = gr.Button("OCR & Lookup")
172
 
 
177
  )
178
  download_file = gr.File(label="Download CSV")
179
 
180
+ def on_run(filepaths):
181
+ # filepaths is a list of local file paths
182
+ df, csv_path = process_image_list(filepaths or [])
 
 
 
 
183
  return df, csv_path
184
 
185
  run_button.click(
186
  fn=on_run,
187
+ inputs=[file_input],
188
  outputs=[output_table, download_file],
189
  )
190