jackkuo committed on
Commit
c1df903
·
verified ·
1 Parent(s): a013bda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -23
app.py CHANGED
@@ -1,51 +1,95 @@
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
- from base64 import b64encode
4
- from gradio_pdf import PDF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
def read_pdf(file):
    """Extract the text of every page in a PDF.

    Args:
        file: Path of the PDF to read; passed straight to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order, as one string.
    """
    # The context manager closes the document even if a page fails to
    # parse, which the manual open()/close() pair did not guarantee.
    with fitz.open(file) as pdf_document:
        return "".join(page.get_text() for page in pdf_document)
16
 
17
 
def display_pdf(file):
    """Return the uploaded file path unchanged.

    The value is forwarded as-is so that the PDF viewer component wired to
    this callback receives the path of the uploaded file.

    Args:
        file: Path of the uploaded PDF, as delivered by the file input.

    Returns:
        The same ``file`` value that was passed in.
    """
    return file
27
 
 
 
28
 
# Page layout built with gr.Blocks: upload controls and viewer in one
# column, extracted text in the other.
# NOTE(review): nesting was reconstructed from a whitespace-stripped dump;
# confirm that the viewer belongs below the button row in the first column.
with gr.Blocks() as app:
    gr.Markdown('<h1 align="center"> PDF Text Extractor </h1>')
    gr.Markdown(
        '<p align="center">Upload a PDF file to extract its text and view it.</p>'
    )

    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" so the callbacks receive a path
            file_input = gr.File(
                label="Upload a PDF file",
                type="filepath",
            )
            with gr.Row():
                display_button = gr.Button("Display PDF", variant="primary")
                extract_button = gr.Button("Extract Text", variant="secondary")
            # PDF component from gradio_pdf acts as the viewer
            pdf_viewer = PDF(label="PDF Viewer")

        with gr.Column(scale=1):
            text_output = gr.Textbox(
                label="Extracted Text",
                interactive=False,
                placeholder="Extracted text will appear here...",
                lines=49,
            )

    # Wire the buttons to their callbacks
    extract_button.click(read_pdf, inputs=file_input, outputs=text_output)
    display_button.click(display_pdf, inputs=file_input, outputs=pdf_viewer)

app.launch()
 
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
+ from PIL import Image
4
+ from pathlib import Path
5
+ import os
6
+
7
+
def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
    """Render every page of a PDF to a PNG file.

    Pages are written to ``image_folder`` as ``page_1.png``, ``page_2.png``,
    ... (1-based), overwriting files of the same name from earlier calls.

    Args:
        pdf_path: Path of the PDF to render; passed straight to ``fitz.open``.
        image_folder: Directory that receives the PNG files; created if it
            does not exist.
        dpi: Resolution used when rasterising each page.

    Returns:
        A list of the written image paths (as strings), in page order.
    """
    output_dir = Path(image_folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    image_paths = []
    # The context manager closes the document even if rendering or saving a
    # page fails, which the manual open()/close() pair did not guarantee.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            # NOTE(review): the "RGB" mode below relies on get_pixmap()
            # producing 3-channel samples without alpha (its default).
            pix = page.get_pixmap(dpi=dpi)
            image_path = output_dir / f"page_{page_number}.png"
            Image.frombytes(
                "RGB", (pix.width, pix.height), pix.samples
            ).save(image_path)
            image_paths.append(str(image_path))

    return image_paths
26
 
27
 
def read_pdf(file):
    """Extract the text of every page in a PDF.

    Args:
        file: Path of the PDF to read; passed straight to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order, as one string.
    """
    # The context manager closes the document even if a page fails to
    # parse, which the manual open()/close() pair did not guarantee.
    with fitz.open(file) as pdf_document:
        return "".join(page.get_text() for page in pdf_document)
36
 
37
 
def display_pdf_images(file):
    """Render the PDF at ``file`` to page images and return their paths.

    Delegates to ``convert_pdf_to_images`` with its default folder and DPI;
    the returned list of image paths is what the viewer displays.
    """
    return convert_pdf_to_images(file)
42
+
 
 
 
 
43
 
# Path of the sample PDF offered as a one-click example; replace it with the
# actual location of your sample file.
example_pdf_path = "./sample.pdf"

# Custom styling for the extracted-text box (matched through its elem_id).
custom_css = """
#text-output {
    width: 100%;
    max-width: 600px;
    overflow-y: auto;
}
"""

# Page layout built with gr.Blocks. The stylesheet is passed to the
# constructor (the documented entry point) instead of being assigned to
# ``app.css`` after the layout has been built.
# NOTE(review): nesting was reconstructed from a whitespace-stripped dump;
# confirm where the Gallery and the Examples block sit in the layout.
with gr.Blocks(css=custom_css) as app:
    gr.Markdown('''<h1 align="center"> PDF Text Extractor </h1>''')
    gr.Markdown('''<p align="center">Upload a PDF file to extract its text and view it.</p>''')

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload a PDF file", type="filepath")
            with gr.Row():
                display_button = gr.Button("Display PDF", variant="secondary")
                extract_button = gr.Button("Extract Text", variant="primary")

            # A single-column Gallery serves as the PDF viewer: one rendered
            # page image per row.
            pdf_viewer = gr.Gallery(
                label="PDF Viewer",
                columns=1,
                height="auto",
                object_fit="contain",
            )

        with gr.Column(scale=1):
            text_output = gr.Textbox(
                label="Extracted Text",
                interactive=True,
                placeholder="Extracted text will appear here...",
                lines=39,
                max_lines=39,  # cap the height; longer text scrolls
                autoscroll=False,  # do not jump to the bottom when the text changes
                show_copy_button=True,
                elem_id="text-output",
            )

    # Preset example: fills the file input with the sample PDF and maps it to
    # both the page images and the extracted text.
    gr.Examples(
        examples=[[example_pdf_path]],
        inputs=file_input,
        outputs=[pdf_viewer, text_output],
        fn=lambda file: (display_pdf_images(file), read_pdf(file)),
    )

    # Wire the buttons to their callbacks
    extract_button.click(read_pdf, inputs=file_input, outputs=text_output)
    display_button.click(display_pdf_images, inputs=file_input, outputs=pdf_viewer)

app.launch()