Spaces:

jackkuo
/

PDF-text-extractor

Sleeping

App Files Files Community

PDF-text-extractor / app.py

jackkuo

Create app.py

de61289 verified 8 months ago

raw

history blame

1.86 kB

	import gradio as gr
	import fitz # PyMuPDF
	from base64 import b64encode
	from gradio_pdf import PDF


	def read_pdf(file):
	    """Extract the text of every page of a PDF and return it as one string.

	    Args:
	        file: Path of the PDF to read; it is passed straight to
	            ``fitz.open``. ``None`` (nothing selected yet) yields an
	            empty string.

	    Returns:
	        The text of all pages, concatenated in page order.
	    """
	    if file is None:
	        # Nothing has been uploaded, so there is no text to extract.
	        return ""

	    pdf_document = fitz.open(file)
	    try:
	        # Gather the per-page text and join once instead of repeated ``+=``.
	        return "".join(page.get_text() for page in pdf_document)
	    finally:
	        # Release the document even when a page fails to parse.
	        pdf_document.close()


	# Legacy alternative, kept for reference: embed the PDF in an HTML iframe.
	# def display_pdf(file):
	#     # Encode the PDF file contents as base64
	#     with open(file.name, "rb") as f:
	#         encoded_pdf = b64encode(f.read()).decode('utf-8')
	#     # Embed a PDF viewer using HTML
	#     pdf_html = f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="100%" height="600px"></iframe>'
	#     return pdf_html
	def display_pdf(file):
	    """Hand the uploaded file back unchanged so the PDF viewer can use it.

	    Args:
	        file: Value received from the upload component.

	    Returns:
	        The very same ``file`` value that was passed in.
	    """
	    viewer_source = file
	    return viewer_source


	# Build the page with the Blocks layout API.
	with gr.Blocks() as app:
	    # Centered title and a one-line usage hint.
	    gr.Markdown('''<h1 align="center"> PDF Text Extractor </h1>''')
	    gr.Markdown('''<p align="center">Upload a PDF file to extract its text and view it.</p>''')

	    with gr.Row():
	        # Left column: upload widget, action buttons and the PDF viewer.
	        with gr.Column(scale=1):
	            # type="filepath" makes the callbacks receive a path string.
	            file_input = gr.File(
	                label="Upload a PDF file",
	                type="filepath",
	            )
	            with gr.Row():
	                display_button = gr.Button("Display PDF", variant="primary")
	                extract_button = gr.Button("Extract Text", variant="secondary")
	            # Viewer comes from gradio_pdf's PDF component
	            # (a gr.HTML viewer was used here previously).
	            pdf_viewer = PDF(label="PDF Viewer")

	        # Right column: read-only box showing the extracted text.
	        with gr.Column(scale=1):
	            text_output = gr.Textbox(
	                label="Extracted Text",
	                interactive=False,
	                placeholder="Extracted text will appear here...",
	                lines=49,
	            )

	    # Wire each button to its callback.
	    extract_button.click(read_pdf, inputs=file_input, outputs=text_output)
	    display_button.click(display_pdf, inputs=file_input, outputs=pdf_viewer)

	app.launch()