# PDF Text Extractor

import gradio as gr
import fitz  # PyMuPDF
from base64 import b64encode
from gradio_pdf import PDF


def read_pdf(file):
    """Extract the plain text of every page of a PDF.

    Args:
        file: Path of the PDF to read; it is handed to ``fitz.open``. A falsy
            value (``None`` or ``""``) is treated as "nothing uploaded".

    Returns:
        The text of all pages concatenated in page order, or an empty string
        when no file was supplied.

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for an unreadable or
        invalid file; the document is closed before the error propagates.
    """
    # Nothing to read (e.g. the button was clicked before a file was chosen).
    if not file:
        return ""

    # The context manager closes the document even if text extraction fails.
    with fitz.open(file) as pdf_document:
        # Collect the text page by page and join once at the end.
        return "".join(page.get_text() for page in pdf_document)


# def display_pdf(file):
#     # Encode the PDF file contents as base64
#     with open(file.name, "rb") as f:
#         encoded_pdf = b64encode(f.read()).decode('utf-8')
#     # Embed a PDF viewer using HTML
#     pdf_html = f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="100%" height="600px"></iframe>'
#     return pdf_html
def display_pdf(file):
    """Hand the uploaded file straight through to the PDF viewer.

    Args:
        file: The value received from the file input.

    Returns:
        The very same ``file`` value, unmodified, so the PDF component can
        use it.
    """
    pdf_path = file  # no transformation: the viewer consumes the value as-is
    return pdf_path


# Build the UI with a Blocks layout
with gr.Blocks() as app:
    # Page title and a one-line usage hint, rendered from inline HTML
    gr.Markdown('''<h1 align="center"> PDF Text Extractor </h1>''')
    gr.Markdown('''<p  align="center">Upload a PDF file to extract its text and view it.</p>''')

    with gr.Row():
        # First column: upload control, the two action buttons and the PDF viewer
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload a PDF file", type="filepath")  # set to 'filepath'
            with gr.Row():
                display_button = gr.Button("Display PDF", variant="primary")
                extract_button = gr.Button("Extract Text", variant="secondary")
            # pdf_viewer = gr.HTML(label="PDF Viewer")
            pdf_viewer = PDF(label="PDF Viewer")  # use the PDF component from gradio_pdf

        # Second column: non-interactive text box that receives the extracted text
        with gr.Column(scale=1):
            text_output = gr.Textbox(label="Extracted Text", interactive=False,
                                     placeholder="Extracted text will appear here...", lines=49)

    # Wire the buttons to their handlers
    extract_button.click(read_pdf, inputs=file_input, outputs=text_output)
    display_button.click(display_pdf, inputs=file_input, outputs=pdf_viewer)

# NOTE(review): launch() runs unconditionally, so importing this module also
# starts the app; consider an `if __name__ == "__main__":` guard.
app.launch()