leonarb commited on
Commit
d45f3e7
·
verified ·
1 Parent(s): bb299d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -12
app.py CHANGED
@@ -1,17 +1,44 @@
1
  import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
 
 
 
4
 
5
- # Load model and tokenizer
6
- model_name = "allenai/olmOCR-7B-0225-preview"
7
- tokenizer = AutoTokenizer.from_pretrained(model_name)
8
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
 
 
 
 
9
 
10
- def generate_text(prompt):
11
- inputs = tokenizer(prompt, return_tensors="pt")
12
- outputs = model.generate(**inputs, max_new_tokens=100)
13
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
14
 
15
- # Define Gradio UI
16
- demo = gr.Interface(fn=generate_text, inputs="text", outputs="text")
17
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
  import torch
3
+ import pypdfium2
4
+ from PIL import Image
5
+ from transformers import Qwen2VLProcessor, Qwen2VLModel
6
 
7
+ # Load model and processor
8
+ model_name = "Qwen/Qwen-VL" # You may replace with your preferred VL model
9
+ processor = Qwen2VLProcessor.from_pretrained(model_name)
10
+ model = Qwen2VLModel.from_pretrained(
11
+ model_name,
12
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
13
+ )
14
+ model.eval()
15
 
16
+ # Convert PDF to list of PIL images (one per page)
17
+ def pdf_to_images(pdf_path):
18
+ pdf = pypdfium2.PdfDocument(pdf_path)
19
+ return [page.render().to_pil() for page in pdf]
20
 
21
+ # Generate text from each image using the vision-language model
22
+ def process_pdf(pdf_file):
23
+ images = pdf_to_images(pdf_file.name)
24
+ results = []
25
+
26
+ for image in images:
27
+ inputs = processor(images=image, return_tensors="pt").to(model.device)
28
+ with torch.no_grad():
29
+ outputs = model.generate(**inputs, max_new_tokens=256)
30
+ text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
31
+ results.append(text.strip())
32
+
33
+ return "\n\n".join(results)
34
+
35
+ # Gradio UI
36
+ demo = gr.Interface(
37
+ fn=process_pdf,
38
+ inputs=gr.File(type="file", file_types=[".pdf"]),
39
+ outputs="text",
40
+ title="olmOCR PDF Processor"
41
+ )
42
+
43
+ if __name__ == "__main__":
44
+ demo.launch()