# Page-scrape metadata (not part of the program): Spaces status "Sleeping";
# file size 1,856 bytes; revision de61289; viewer line-number gutter 1-51.
import gradio as gr
import fitz # PyMuPDF
from base64 import b64encode
from gradio_pdf import PDF
def read_pdf(file):
    """Extract the plain text of every page of a PDF.

    Args:
        file: Path of the PDF to open with ``fitz.open``. In this app the
            value comes from the ``gr.File(type="filepath")`` upload widget.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string when no file was supplied.
    """
    # Nothing uploaded yet, so there is nothing to extract.
    if not file:
        return ""
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(file) as pdf_document:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in pdf_document)
# Disabled alternative: embed the PDF as a base64 data URI inside an HTML iframe.
# def display_pdf(file):
#     # Encode the PDF file contents as base64
#     with open(file.name, "rb") as f:
#         encoded_pdf = b64encode(f.read()).decode('utf-8')
#     # Embed a PDF viewer using HTML
#     pdf_html = f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="100%" height="600px"></iframe>'
#     return pdf_html
def display_pdf(file):
    """Pass the uploaded file through to the PDF viewer unchanged.

    Args:
        file: Value supplied by the upload widget.

    Returns:
        The same ``file`` value, untouched, for the PDF component to use.
    """
    # The viewer component takes the file path itself, so no conversion is done.
    pdf_path = file
    return pdf_path
# Build the UI with a Blocks layout: upload, buttons and PDF viewer in the
# left column, extracted text in the right column.
with gr.Blocks() as app:
    gr.Markdown('''<h1 align="center"> PDF Text Extractor </h1>''')
    gr.Markdown('''<p align="center">Upload a PDF file to extract its text and view it.</p>''')
    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" makes the callbacks receive a path string.
            file_input = gr.File(label="Upload a PDF file", type="filepath")
            with gr.Row():
                display_button = gr.Button("Display PDF", variant="primary")
                extract_button = gr.Button("Extract Text", variant="secondary")
            # pdf_viewer = gr.HTML(label="PDF Viewer")
            # Use the PDF component from gradio_pdf as the viewer.
            pdf_viewer = PDF(label="PDF Viewer")
        with gr.Column(scale=1):
            text_output = gr.Textbox(
                label="Extracted Text",
                interactive=False,
                placeholder="Extracted text will appear here...",
                lines=49,
            )
    # Wire the buttons to their callbacks; this is done inside the Blocks
    # context so the events are registered on this app.
    extract_button.click(read_pdf, inputs=file_input, outputs=text_output)
    display_button.click(display_pdf, inputs=file_input, outputs=pdf_viewer)

# Start the web server (the stray trailing "|" from the page scrape is removed).
app.launch()