leonarb committed on
Commit
84e3794
·
verified ·
1 Parent(s): 8d1fa76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -23
app.py CHANGED
@@ -8,7 +8,6 @@ from pathlib import Path
8
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
9
 
10
  from olmocr.data.renderpdf import render_pdf_to_base64png
11
- from olmocr.prompts import build_finetuning_prompt
12
  from olmocr.prompts.anchor import get_anchor_text
13
 
14
  from ebooklib import epub
@@ -36,14 +35,23 @@ def process_pdf_to_epub(pdf_file, title, author):
36
 
37
  for i in range(num_pages):
38
  page_num = i + 1
39
- print(f"Processing page {page_num}...") # Debugging line
40
 
41
  try:
42
  # Render page to base64 image
43
  image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
44
  anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
45
- print(f"Anchor text for page {page_num}: {anchor_text}") # Debugging line
46
- prompt = build_finetuning_prompt(anchor_text)
 
 
 
 
 
 
 
 
 
47
 
48
  messages = [
49
  {
@@ -72,11 +80,12 @@ def process_pdf_to_epub(pdf_file, title, author):
72
  num_return_sequences=1,
73
  do_sample=True,
74
  )
 
75
  prompt_length = inputs["input_ids"].shape[1]
76
  new_tokens = output[:, prompt_length:].detach().cpu()
77
-
78
  decoded = "[No output generated]"
79
- if new_tokens.shape[1] > 0:
80
  try:
81
  decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
82
  decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
@@ -87,18 +96,8 @@ def process_pdf_to_epub(pdf_file, title, author):
87
 
88
  except Exception as processing_error:
89
  decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
90
- else:
91
- try:
92
- # Check if the tokens are empty
93
- if not new_tokens:
94
- decoded = f"[No tokens generated for page {page_num}]"
95
- else:
96
- decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
97
- decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
98
- except Exception as decode_error:
99
- decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
100
 
101
- print(f"Decoded content for page {page_num}: {decoded}") # Debugging line
102
 
103
  # Create chapter
104
  chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
@@ -134,14 +133,14 @@ iface = gr.Interface(
134
  outputs=gr.File(label="Download EPUB"),
135
  title="PDF to EPUB Converter (with olmOCR)",
136
  description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover.",
137
- allow_flagging="never" # Add this line to avoid the flagged directory issue
138
  )
139
 
140
  if __name__ == "__main__":
141
  iface.launch(
142
- server_name="0.0.0.0", # Required to make app publicly accessible
143
- server_port=7860, # Can be changed if needed
144
- share=True, # Optional: creates a public Gradio link if supported
145
- debug=True, # Optional: helpful if you're troubleshooting
146
- allowed_paths=["/tmp"] # Optional: makes it explicit that Gradio can write here
147
  )
 
8
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
9
 
10
  from olmocr.data.renderpdf import render_pdf_to_base64png
 
11
  from olmocr.prompts.anchor import get_anchor_text
12
 
13
  from ebooklib import epub
 
35
 
36
  for i in range(num_pages):
37
  page_num = i + 1
38
+ print(f"Processing page {page_num}...")
39
 
40
  try:
41
  # Render page to base64 image
42
  image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
43
  anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
44
+ print(f"Anchor text for page {page_num}: {anchor_text}")
45
+
46
+ # New prompt format
47
+ prompt = (
48
+ "Below is the image of one page of a document, as well as some raw textual content that was previously "
49
+ "extracted for it. Just return the plain text representation of this document as if you were reading it naturally.\n"
50
+ "Do not hallucinate.\n"
51
+ "RAW_TEXT_START\n"
52
+ f"{anchor_text}\n"
53
+ "RAW_TEXT_END"
54
+ )
55
 
56
  messages = [
57
  {
 
80
  num_return_sequences=1,
81
  do_sample=True,
82
  )
83
+
84
  prompt_length = inputs["input_ids"].shape[1]
85
  new_tokens = output[:, prompt_length:].detach().cpu()
86
+
87
  decoded = "[No output generated]"
88
+ if new_tokens is not None and new_tokens.shape[1] > 0:
89
  try:
90
  decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
91
  decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
 
96
 
97
  except Exception as processing_error:
98
  decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
 
 
 
 
 
 
 
 
 
 
99
 
100
+ print(f"Decoded content for page {page_num}: {decoded}")
101
 
102
  # Create chapter
103
  chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
 
133
  outputs=gr.File(label="Download EPUB"),
134
  title="PDF to EPUB Converter (with olmOCR)",
135
  description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover.",
136
+ allow_flagging="never"
137
  )
138
 
139
  if __name__ == "__main__":
140
  iface.launch(
141
+ server_name="0.0.0.0",
142
+ server_port=7860,
143
+ share=True,
144
+ debug=True,
145
+ allowed_paths=["/tmp"]
146
  )