Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -12,8 +12,6 @@ import os
|
|
12 |
# 1. Load Qwen2-VL OCR Model & Processor (once at startup)
|
13 |
# ──────────────────────────────────────────────────────────────
|
14 |
MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
|
15 |
-
|
16 |
-
# Choose device: GPU if available, otherwise CPU
|
17 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
18 |
|
19 |
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
|
@@ -30,16 +28,16 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
30 |
def run_qwen_ocr(pil_image: Image.Image) -> str:
|
31 |
"""
|
32 |
Use Qwen2-VL to OCR the given PIL image.
|
33 |
-
Returns
|
34 |
"""
|
35 |
-
# Build
|
36 |
user_message = [
|
37 |
{"type": "text", "text": "OCR the text in the image."},
|
38 |
{"type": "image", "image": pil_image},
|
39 |
]
|
40 |
messages = [{"role": "user", "content": user_message}]
|
41 |
|
42 |
-
# Create
|
43 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
44 |
inputs = processor(
|
45 |
text=[prompt_full],
|
@@ -48,10 +46,8 @@ def run_qwen_ocr(pil_image: Image.Image) -> str:
|
|
48 |
padding=True,
|
49 |
).to(DEVICE)
|
50 |
|
51 |
-
# Generate
|
52 |
outputs = model.generate(**inputs, max_new_tokens=1024)
|
53 |
decoded = processor.decode(outputs[0], skip_special_tokens=True).strip()
|
54 |
-
# The model’s response may include some markup like “<|im_end|>”; remove it
|
55 |
return decoded.replace("<|im_end|>", "").strip()
|
56 |
|
57 |
# ──────────────────────────────────────────────────────────────
|
@@ -59,9 +55,8 @@ def run_qwen_ocr(pil_image: Image.Image) -> str:
|
|
59 |
# ──────────────────────────────────────────────────────────────
|
60 |
def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
|
61 |
"""
|
62 |
-
Query OpenLibrary
|
63 |
-
Returns a dict with
|
64 |
-
If no results, returns None.
|
65 |
"""
|
66 |
base_url = "https://openlibrary.org/search.json"
|
67 |
params = {"title": title_text}
|
@@ -88,40 +83,41 @@ def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
|
|
88 |
# ──────────────────────────────────────────────────────────────
|
89 |
# 4. Main Processing: OCR β Parse β OpenLibrary β CSV/DF
|
90 |
# ──────────────────────────────────────────────────────────────
|
91 |
-
def process_image_list(
|
92 |
"""
|
93 |
-
Takes a list of
|
94 |
-
Runs OCR on each via Qwen2-VL, parses first two
|
95 |
-
|
96 |
-
- A pandas DataFrame of all results
|
97 |
-
- A filepath to a CSV (written under /tmp)
|
98 |
"""
|
99 |
records = []
|
100 |
|
101 |
-
for
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
# 1) OCR
|
103 |
try:
|
104 |
ocr_text = run_qwen_ocr(pil_img)
|
105 |
except Exception as e:
|
106 |
-
|
107 |
-
print(f"OCR failed on one image: {e}")
|
108 |
continue
|
109 |
|
110 |
-
# 2) Parse lines
|
111 |
lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
|
112 |
if not lines:
|
113 |
-
# No text extracted; skip
|
114 |
continue
|
115 |
|
116 |
title_guess = lines[0]
|
117 |
author_guess = lines[1] if len(lines) > 1 else None
|
118 |
|
119 |
-
# 3)
|
120 |
meta = query_openlibrary(title_guess, author_guess)
|
121 |
if meta:
|
122 |
records.append(meta)
|
123 |
else:
|
124 |
-
# Fallback: record OCR guesses if no OpenLibrary match
|
125 |
records.append({
|
126 |
"title": title_guess,
|
127 |
"author_name": author_guess or "",
|
@@ -129,11 +125,11 @@ def process_image_list(images: list[Image.Image]):
|
|
129 |
"first_publish_year": "",
|
130 |
})
|
131 |
|
132 |
-
# 4) Build DataFrame
|
133 |
df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
|
134 |
csv_bytes = df.to_csv(index=False).encode()
|
135 |
|
136 |
-
# 5) Write CSV to
|
137 |
unique_name = f"books_{uuid.uuid4().hex}.csv"
|
138 |
temp_path = os.path.join("/tmp", unique_name)
|
139 |
with open(temp_path, "wb") as f:
|
@@ -150,7 +146,7 @@ def build_interface():
|
|
150 |
"""
|
151 |
# 📚 Book Cover Scanner + Metadata Lookup
|
152 |
|
153 |
-
1. Upload **one or more**
|
154 |
2. The app will OCR each cover (via Qwen2-VL), take:
|
155 |
- the **first nonempty line** as a “title” guess, and
|
156 |
- the **second nonempty line** (if present) as an “author” guess, then
|
@@ -159,15 +155,18 @@ def build_interface():
|
|
159 |
4. Click “Download CSV” to export all results.
|
160 |
|
161 |
**Tips:**
|
162 |
-
- Use clear, high
|
163 |
-
-
|
164 |
-
- If Qwen2-VL fails on any image, that image is skipped
|
165 |
"""
|
166 |
)
|
167 |
|
168 |
with gr.Row():
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
171 |
)
|
172 |
run_button = gr.Button("OCR & Lookup")
|
173 |
|
@@ -178,18 +177,14 @@ def build_interface():
|
|
178 |
)
|
179 |
download_file = gr.File(label="Download CSV")
|
180 |
|
181 |
-
def on_run(
|
182 |
-
#
|
183 |
-
|
184 |
-
for np_img in image_list:
|
185 |
-
if isinstance(np_img, np.ndarray):
|
186 |
-
pil_images.append(Image.fromarray(np_img))
|
187 |
-
df, csv_path = process_image_list(pil_images)
|
188 |
return df, csv_path
|
189 |
|
190 |
run_button.click(
|
191 |
fn=on_run,
|
192 |
-
inputs=[
|
193 |
outputs=[output_table, download_file],
|
194 |
)
|
195 |
|
|
|
12 |
# 1. Load Qwen2-VL OCR Model & Processor (once at startup)
|
13 |
# ──────────────────────────────────────────────────────────────
|
14 |
MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
|
|
|
|
|
15 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
16 |
|
17 |
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
|
|
|
28 |
def run_qwen_ocr(pil_image: Image.Image) -> str:
|
29 |
"""
|
30 |
Use Qwen2-VL to OCR the given PIL image.
|
31 |
+
Returns extracted text.
|
32 |
"""
|
33 |
+
# Build prompt: text + image
|
34 |
user_message = [
|
35 |
{"type": "text", "text": "OCR the text in the image."},
|
36 |
{"type": "image", "image": pil_image},
|
37 |
]
|
38 |
messages = [{"role": "user", "content": user_message}]
|
39 |
|
40 |
+
# Create full prompt
|
41 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
42 |
inputs = processor(
|
43 |
text=[prompt_full],
|
|
|
46 |
padding=True,
|
47 |
).to(DEVICE)
|
48 |
|
|
|
49 |
outputs = model.generate(**inputs, max_new_tokens=1024)
|
50 |
decoded = processor.decode(outputs[0], skip_special_tokens=True).strip()
|
|
|
51 |
return decoded.replace("<|im_end|>", "").strip()
|
52 |
|
53 |
# ──────────────────────────────────────────────────────────────
|
|
|
55 |
# ──────────────────────────────────────────────────────────────
|
56 |
def query_openlibrary(title_text: str, author_text: str = None) -> dict | None:
|
57 |
"""
|
58 |
+
Query OpenLibrary by title (and optional author).
|
59 |
+
Returns a dict with title, author_name, publisher, first_publish_year.
|
|
|
60 |
"""
|
61 |
base_url = "https://openlibrary.org/search.json"
|
62 |
params = {"title": title_text}
|
|
|
83 |
# ──────────────────────────────────────────────────────────────
|
84 |
# 4. Main Processing: OCR β Parse β OpenLibrary β CSV/DF
|
85 |
# ──────────────────────────────────────────────────────────────
|
86 |
+
def process_image_list(filepaths: list[str]):
|
87 |
"""
|
88 |
+
Takes a list of file paths (each a single-cover image).
|
89 |
+
Runs OCR on each via Qwen2-VL, parses first two lines as title/author,
|
90 |
+
queries OpenLibrary, and returns a DataFrame + CSV path.
|
|
|
|
|
91 |
"""
|
92 |
records = []
|
93 |
|
94 |
+
for path in filepaths:
|
95 |
+
try:
|
96 |
+
pil_img = Image.open(path).convert("RGB")
|
97 |
+
except Exception as e:
|
98 |
+
print(f"Failed to open image {path}: {e}")
|
99 |
+
continue
|
100 |
+
|
101 |
# 1) OCR
|
102 |
try:
|
103 |
ocr_text = run_qwen_ocr(pil_img)
|
104 |
except Exception as e:
|
105 |
+
print(f"OCR failed on {path}: {e}")
|
|
|
106 |
continue
|
107 |
|
108 |
+
# 2) Parse lines
|
109 |
lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
|
110 |
if not lines:
|
|
|
111 |
continue
|
112 |
|
113 |
title_guess = lines[0]
|
114 |
author_guess = lines[1] if len(lines) > 1 else None
|
115 |
|
116 |
+
# 3) OpenLibrary lookup
|
117 |
meta = query_openlibrary(title_guess, author_guess)
|
118 |
if meta:
|
119 |
records.append(meta)
|
120 |
else:
|
|
|
121 |
records.append({
|
122 |
"title": title_guess,
|
123 |
"author_name": author_guess or "",
|
|
|
125 |
"first_publish_year": "",
|
126 |
})
|
127 |
|
128 |
+
# 4) Build DataFrame
|
129 |
df = pd.DataFrame(records, columns=["title", "author_name", "publisher", "first_publish_year"])
|
130 |
csv_bytes = df.to_csv(index=False).encode()
|
131 |
|
132 |
+
# 5) Write CSV to temp file
|
133 |
unique_name = f"books_{uuid.uuid4().hex}.csv"
|
134 |
temp_path = os.path.join("/tmp", unique_name)
|
135 |
with open(temp_path, "wb") as f:
|
|
|
146 |
"""
|
147 |
# 📚 Book Cover Scanner + Metadata Lookup
|
148 |
|
149 |
+
1. Upload **one or more** image files, each containing a single book cover.
|
150 |
2. The app will OCR each cover (via Qwen2-VL), take:
|
151 |
- the **first nonempty line** as a “title” guess, and
|
152 |
- the **second nonempty line** (if present) as an “author” guess, then
|
|
|
155 |
4. Click “Download CSV” to export all results.
|
156 |
|
157 |
**Tips:**
|
158 |
+
- Use clear, high-contrast photos (text should be legible).
|
159 |
+
- Each image should contain exactly one book cover.
|
160 |
+
- If Qwen2-VL OCR fails on any image, that image is skipped.
|
161 |
"""
|
162 |
)
|
163 |
|
164 |
with gr.Row():
|
165 |
+
file_input = gr.File(
|
166 |
+
label="Upload Book Cover(s)",
|
167 |
+
file_count="multiple",
|
168 |
+
type="filepath",
|
169 |
+
file_types=[".jpg", ".jpeg", ".png"]
|
170 |
)
|
171 |
run_button = gr.Button("OCR & Lookup")
|
172 |
|
|
|
177 |
)
|
178 |
download_file = gr.File(label="Download CSV")
|
179 |
|
180 |
+
def on_run(filepaths):
|
181 |
+
# filepaths is a list of local file paths
|
182 |
+
df, csv_path = process_image_list(filepaths or [])
|
|
|
|
|
|
|
|
|
183 |
return df, csv_path
|
184 |
|
185 |
run_button.click(
|
186 |
fn=on_run,
|
187 |
+
inputs=[file_input],
|
188 |
outputs=[output_table, download_file],
|
189 |
)
|
190 |
|