Update app.py
app.py CHANGED
@@ -8,7 +8,6 @@ from pathlib import Path
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
-from olmocr.prompts import build_finetuning_prompt
 from olmocr.prompts.anchor import get_anchor_text
 
 from ebooklib import epub
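The imports above belong to a Qwen2-VL based olmOCR pipeline; the checkpoint being loaded is not shown in this diff. A minimal sketch of how such a model and processor are typically set up (the model ids here are assumptions, not taken from app.py):

import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Assumed checkpoint ids -- the actual ones used by this Space are not visible in the diff.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")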
@@ -36,14 +35,23 @@ def process_pdf_to_epub(pdf_file, title, author):
 
     for i in range(num_pages):
         page_num = i + 1
-        print(f"Processing page {page_num}...")
+        print(f"Processing page {page_num}...")
 
         try:
             # Render page to base64 image
             image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
             anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
-            print(f"Anchor text for page {page_num}: {anchor_text}")
-
+            print(f"Anchor text for page {page_num}: {anchor_text}")
+
+            # New prompt format
+            prompt = (
+                "Below is the image of one page of a document, as well as some raw textual content that was previously "
+                "extracted for it. Just return the plain text representation of this document as if you were reading it naturally.\n"
+                "Do not hallucinate.\n"
+                "RAW_TEXT_START\n"
+                f"{anchor_text}\n"
+                "RAW_TEXT_END"
+            )
 
             messages = [
                 {
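The hunk above adds an inline prompt built around the page's anchor text (the build_finetuning_prompt import is dropped in the first hunk). The messages block that consumes it is truncated in the diff; below is a sketch of how the prompt and the base64 page image are typically packed into Qwen2-VL chat inputs (the helper name and message layout are assumptions):

import base64
from io import BytesIO

from PIL import Image


def build_inputs(processor, image_base64, prompt):
    # Hypothetical helper: the actual messages block is cut off in the diff,
    # so this follows the usual Qwen2-VL chat layout of one image plus one text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"data:image/png;base64,{image_base64}"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    page_image = Image.open(BytesIO(base64.b64decode(image_base64)))
    return processor(text=[text], images=[page_image], padding=True, return_tensors="pt")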
@@ -72,11 +80,12 @@ def process_pdf_to_epub(pdf_file, title, author):
                 num_return_sequences=1,
                 do_sample=True,
             )
+
             prompt_length = inputs["input_ids"].shape[1]
             new_tokens = output[:, prompt_length:].detach().cpu()
-
+
             decoded = "[No output generated]"
-            if new_tokens.shape[1] > 0:
+            if new_tokens is not None and new_tokens.shape[1] > 0:
                 try:
                     decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
                     decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
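The change above adds an "is not None" guard before checking the shape of the freshly generated tokens. The same logic, pulled out as a standalone sketch for clarity (the function name is illustrative; the surrounding try/except error handling is omitted):

def decode_new_tokens(processor, output, prompt_length):
    # Keep only the tokens generated after the prompt, then decode them;
    # fall back to a placeholder string when nothing was generated.
    new_tokens = output[:, prompt_length:].detach().cpu()
    decoded = "[No output generated]"
    if new_tokens is not None and new_tokens.shape[1] > 0:
        decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
        decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
    return decoded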
@@ -87,18 +96,8 @@ def process_pdf_to_epub(pdf_file, title, author):
 
                 except Exception as processing_error:
                     decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
-            else:
-                try:
-                    # Check if the tokens are empty
-                    if not new_tokens:
-                        decoded = f"[No tokens generated for page {page_num}]"
-                    else:
-                        decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
-                        decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
-                except Exception as decode_error:
-                    decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
 
-            print(f"Decoded content for page {page_num}: {decoded}")
+            print(f"Decoded content for page {page_num}: {decoded}")
 
             # Create chapter
             chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
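The context lines above create one EpubHtml chapter per page. The diff does not show how the chapters are assembled into the final book; below is a sketch of the usual ebooklib flow, with an illustrative helper name (cover handling is left out):

from ebooklib import epub


def build_epub(chapters, title, author, output_path):
    # Illustrative assembly: only the per-page EpubHtml call appears in the diff.
    book = epub.EpubBook()
    book.set_title(title)
    book.add_author(author)
    for chapter in chapters:
        book.add_item(chapter)
    book.toc = chapters
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav"] + chapters
    epub.write_epub(output_path, book)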
@@ -134,14 +133,14 @@ iface = gr.Interface(
     outputs=gr.File(label="Download EPUB"),
     title="PDF to EPUB Converter (with olmOCR)",
     description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover.",
-    allow_flagging="never"
+    allow_flagging="never"
 )
 
 if __name__ == "__main__":
     iface.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True,
-        debug=True,
-        allowed_paths=["/tmp"]
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True,
+        debug=True,
+        allowed_paths=["/tmp"]
     )
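For context, a sketch of the Gradio wiring around the arguments shown in the final hunk. The inputs list is an assumption based on the process_pdf_to_epub(pdf_file, title, author) signature in the hunk headers; the remaining arguments appear verbatim in the diff:

import gradio as gr

iface = gr.Interface(
    fn=process_pdf_to_epub,
    # Assumed inputs -- only outputs, title, description and allow_flagging are visible in the diff.
    inputs=[
        gr.File(label="PDF file"),
        gr.Textbox(label="Title"),
        gr.Textbox(label="Author"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="PDF to EPUB Converter (with olmOCR)",
    description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover.",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        debug=True,
        allowed_paths=["/tmp"],
    )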