Spaces:
Sleeping
Sleeping
import gradio as gr | |
import fitz # PyMuPDF | |
from base64 import b64encode | |
from gradio_pdf import PDF | |
def read_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: Path to the PDF file to read. A falsy value (for example
            ``None`` when nothing has been uploaded yet) yields an empty
            string without opening any document.

    Returns:
        The text of all pages, concatenated in page order.
    """
    if not file:
        # Nothing to read: return empty text instead of calling fitz.open.
        return ""
    # The context manager closes the document even if extraction raises.
    with fitz.open(file) as pdf_document:
        # Join page texts in one pass rather than growing a string with +=.
        return "".join(page.get_text() for page in pdf_document)
# Disabled alternative implementation: embed the PDF in an HTML iframe.
# def display_pdf(file):
#     # Encode the PDF file contents as base64
#     with open(file.name, "rb") as f:
#         encoded_pdf = b64encode(f.read()).decode('utf-8')
#     # Embed a PDF viewer using HTML
#     pdf_html = f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="100%" height="600px"></iframe>'
#     return pdf_html
def display_pdf(file):
    """Return the uploaded file path unchanged so the PDF component can use it.

    Args:
        file: Path of the uploaded PDF (the file input is created with
            ``type="filepath"``).

    Returns:
        The same value that was passed in.
    """
    return file
# Build the UI with a Blocks layout.
# NOTE(review): the nesting of the `with` blocks below was reconstructed from a
# flattened source (indentation was lost) — confirm against the deployed layout.
with gr.Blocks() as app:
    gr.Markdown('''<h1 align="center"> PDF Text Extractor </h1>''')
    gr.Markdown('''<p align="center">Upload a PDF file to extract its text and view it.</p>''')
    with gr.Row():
        # First column: upload control, action buttons and the PDF viewer.
        with gr.Column(scale=1):
            # type="filepath" so the callbacks receive a path rather than a file object.
            file_input = gr.File(label="Upload a PDF file", type="filepath")
            with gr.Row():
                display_button = gr.Button("Display PDF", variant="primary")
                extract_button = gr.Button("Extract Text", variant="secondary")
            # pdf_viewer = gr.HTML(label="PDF Viewer")
            pdf_viewer = PDF(label="PDF Viewer")  # PDF component from gradio_pdf
        # Second column: read-only box showing the extracted text.
        with gr.Column(scale=1):
            text_output = gr.Textbox(label="Extracted Text", interactive=False,
                                     placeholder="Extracted text will appear here...", lines=49)
    # Wire the buttons to their handlers.
    extract_button.click(read_pdf, inputs=file_input, outputs=text_output)
    display_button.click(display_pdf, inputs=file_input, outputs=pdf_viewer)

app.launch()