YNS-Elaine committed on
Commit 9bcdecb · verified · 1 Parent(s): b6538da

Update simple_test.py

Files changed (1)
  1. simple_test.py +24 -33
simple_test.py CHANGED
@@ -1,42 +1,33 @@
+import gradio as gr
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from PIL import Image
-import torch
 
+# Load model & processor once at startup
+processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
+model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
 
-
-def smoldocling_readimage(image, prompt_text="Convert this page to docling."):
-    # Load model and processor
-    processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
-    model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
-
-    # Create input messages
+def smoldocling_readimage(image, prompt_text):
     messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image"},
-                {"type": "text", "text": prompt_text}
-            ]
-        },
+        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
     ]
-
-
-    # Prepare inputs
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=[image], return_tensors="pt")
-    # inputs = inputs.to(device)
-
-
-    # Generate outputs
-    generated_ids = model.generate(**inputs, max_new_tokens=1024)  # Reduced for testing
+    outputs = model.generate(**inputs, max_new_tokens=1024)
     prompt_length = inputs.input_ids.shape[1]
-    trimmed_generated_ids = generated_ids[:, prompt_length:]
-    doctags = processor.batch_decode(
-        trimmed_generated_ids,
-        skip_special_tokens=False,
-    )[0].lstrip()
-
-    # Clean the output
-    doctags = doctags.replace("<end_of_utterance>", "").strip()
-
-    return doctags
+    generated = outputs[:, prompt_length:]
+    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
+    return result.replace("<end_of_utterance>", "").strip()
+
+# Gradio UI
+demo = gr.Interface(
+    fn=smoldocling_readimage,
+    inputs=[
+        gr.Image(type="pil", label="Upload Image"),
+        gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
+    ],
+    outputs="text",
+    title="SmolDocling Web App",
+    description="Upload a document image and convert it to structured docling format."
+)
+
+demo.launch()
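
Usage note (not part of the commit): demo.launch() now runs at module level, so python simple_test.py starts the web UI directly, and importing simple_test from another script will start it as well. For a headless sanity check one could call the function directly; a minimal sketch, assuming the launch call is wrapped in an if __name__ == "__main__": guard and that a sample page image exists at page.png (both the guard and the path are assumptions, not part of this diff):

# Headless check: assumes demo.launch() is guarded and page.png exists locally.
from PIL import Image
from simple_test import smoldocling_readimage

img = Image.open("page.png").convert("RGB")
print(smoldocling_readimage(img, "Convert this page to docling."))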