leonarb committed on
Commit af75cff · verified · 1 Parent(s): 845c4d4

Update app.py

Files changed (1)
  1. app.py +26 -36
app.py CHANGED
@@ -1,51 +1,45 @@
 import os
+import base64
+import tempfile
+from io import BytesIO
 
-# Set Hugging Face and Torch cache to a guaranteed-writable location
-cache_dir = "/tmp/huggingface_cache"
-os.environ["HF_HOME"] = cache_dir
-os.environ["TORCH_HOME"] = cache_dir
-
-# Create the directory if it doesn't exist
-os.makedirs(cache_dir, exist_ok=True)
-
-import gradio as gr
 import torch
-from PyPDF2 import PdfReader
-from io import BytesIO
+import gradio as gr
 from PIL import Image
+from PyPDF2 import PdfReader
+from ebooklib import epub
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts import build_finetuning_prompt
 from olmocr.prompts.anchor import get_anchor_text
-from ebooklib import epub
-import base64
-import tempfile
-
 
+# Set Hugging Face and Torch cache to a guaranteed-writable location
+cache_dir = "/tmp/huggingface_cache"
+os.environ["HF_HOME"] = cache_dir
+os.environ["TORCH_HOME"] = cache_dir
+os.makedirs(cache_dir, exist_ok=True)
 
-# Load model
+# Load model and processor
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = Qwen2VLForConditionalGeneration.from_pretrained(
-    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    "allenai/olmOCR-7B-0225-preview",
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
 ).eval().to(device)
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 
-
 def ocr_page(pdf_path, page_num):
-    # Render page to base64 PNG
     image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
     anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
     prompt = build_finetuning_prompt(anchor_text)
 
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
-            ],
-        }
-    ]
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": prompt},
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
+        ],
+    }]
 
     prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     main_image = Image.open(BytesIO(base64.b64decode(image_b64)))
@@ -65,7 +59,6 @@ def ocr_page(pdf_path, page_num):
     decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
     return decoded[0] if decoded else ""
 
-
 def convert_pdf_to_epub(pdf_file, title, author, language):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
         tmp_pdf.write(pdf_file.read())
@@ -74,18 +67,17 @@ def convert_pdf_to_epub(pdf_file, title, author, language):
     reader = PdfReader(tmp_pdf_path)
     num_pages = len(reader.pages)
 
-    # Create EPUB book
     book = epub.EpubBook()
     book.set_title(title)
     book.add_author(author)
     book.set_language(language)
 
-    # Use first page as cover
+    # Set cover from page 1
     cover_image_b64 = render_pdf_to_base64png(tmp_pdf_path, 1, target_longest_image_dim=1024)
     cover_image_bytes = base64.b64decode(cover_image_b64)
     book.set_cover("cover.jpg", cover_image_bytes)
 
-    # OCR and add pages
+    # Add OCR'd pages as chapters
     for i in range(num_pages):
         text = ocr_page(tmp_pdf_path, i)
         chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang=language)
@@ -102,12 +94,10 @@ def convert_pdf_to_epub(pdf_file, title, author, language):
     with open(epub_path, "rb") as f:
         return epub_path, f.read()
 
-
 def interface_fn(pdf, title, author, language):
-    epub_path, epub_bytes = convert_pdf_to_epub(pdf, title, author, language)
+    epub_path, _ = convert_pdf_to_epub(pdf, title, author, language)
     return epub_path
 
-
 demo = gr.Interface(
     fn=interface_fn,
     inputs=[
@@ -123,4 +113,4 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
-    demo.launch(share = True)
+    demo.launch(share=True)