|
from openai import OpenAI |
|
import gradio as gr |
|
import fitz |
|
from PIL import Image |
|
from pathlib import Path |
|
import os |
|
import re |
|
import tiktoken |
|
import pandas as pd |
|
|
|
api_key = os.getenv("API_KEY")
|
base_url = os.getenv("BASE_URL") |
|
|
|
client = OpenAI( |
|
api_key=api_key, |
|
base_url=base_url, |
|
) |
|
|
|
|
|
def cal_tokens(message_data): |
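    """Estimate the token count of a message payload with tiktoken.

    The gpt-3.5-turbo encoding is only an approximation for the Claude model
    used downstream; the exact Claude tokenizer may count differently.
    """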
|
print("use tiktoken") |
|
try: |
|
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-0301") |
|
except KeyError: |
|
print("Warning: model not found. Using cl100k_base encoding.") |
|
encoding = tiktoken.get_encoding("cl100k_base") |
|
num_tokens = len(encoding.encode(str(message_data))) |
|
return num_tokens |
|
|
|
|
|
def del_references(lines): |
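    """Remove the References section from extracted article text.

    Tries a cascade of patterns, from LaTeX-style to Markdown-style headings,
    preserving whatever recognizable section (Tables, SUPPLEMENTARY, footnote
    markers) follows the references. Returns the text unchanged if nothing
    matches.
    """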
|
|
|
pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables' |
|
matches = re.search(pattern, lines, re.DOTALL) |
|
if matches: |
|
        lines = lines.replace(matches[0], "\\section*{Tables\n")
|
print("1.1.匹配到了References和Tables,删除了References,保留了后面的Tables") |
|
else: |
|
pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)' |
|
matches = re.search(pattern, lines, re.DOTALL) |
|
if matches: |
|
print("1.2.匹配到了References,删除了References") |
|
lines = lines.replace(matches[0], "") |
|
else: |
|
|
|
pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)' |
|
matches = re.search(pattern, lines, re.DOTALL) |
|
if matches: |
|
lines = lines.replace(matches[0], "Tables") |
|
print("2.1.匹配到了## References和Tables,删除了References,保留了后面的Tables") |
|
else: |
|
pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY' |
|
matches = re.search(pattern, lines, re.DOTALL) |
|
if matches: |
|
lines = lines.replace(matches[0], "# SUPPLEMENTARY") |
|
print("2.2.匹配到了## References和# SUPPLEMENTARY,删除了References,保留了后面的# SUPPLEMENTARY") |
|
else: |
|
pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]' |
|
matches = re.search(pattern, lines, re.DOTALL) |
|
if matches: |
|
print("2.3.匹配到了## References和\[\^0\],删除了References和\[\^0\]之间的内容") |
|
lines = lines.replace(matches[0], "[^0]") |
|
else: |
|
pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)' |
|
matches = re.search(pattern, lines, re.DOTALL) |
|
if matches: |
|
print("2.4.匹配到了## References,删除了References") |
|
lines = lines.replace(matches[0], "") |
|
else: |
|
print("没有匹配到References") |
|
return lines |
|
|
|
|
|
def extract_pdf_pypdf(pdf_dir): |
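    """Extract plain text from a PDF page by page using PyMuPDF (fitz).

    Returns the concatenated text, or None if the file cannot be opened.
    """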
|
try: |
|
doc = fitz.open(pdf_dir) |
|
except Exception as e: |
|
print(f"Error opening PDF: {e}") |
|
return None |
|
|
|
page_count = doc.page_count |
|
file_content = "" |
|
for page in range(page_count): |
|
try: |
|
text = doc.load_page(page).get_text("text") |
|
file_content += text + "\n\n" |
|
except Exception as e: |
|
print(f"Error reading page {page}: {e}") |
|
continue |
|
|
|
    doc.close()
    return file_content
|
|
|
|
|
def openai_api(messages): |
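    """Send a chat request and assemble the streamed response into one string.

    The Claude model id is assumed to be served through an OpenAI-compatible
    endpoint configured via BASE_URL. Returns None on any API error.
    """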
|
try: |
|
completion = client.chat.completions.create( |
|
model="claude-3-5-sonnet-20240620", |
|
messages=messages, |
|
temperature=0.1, |
|
max_tokens=8192, |
|
stream=True |
|
) |
|
        # Assemble the streamed deltas, skipping chunks that carry no choices.
        response = ''.join(
            chunk.choices[0].delta.content or ""
            for chunk in completion if chunk.choices
        )
|
return response |
|
except Exception as ex: |
|
print("API error:", ex) |
|
return None |
|
|
|
|
|
def openai_chat_2_step(prompt, file_content): |
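    """Two-step extraction for articles whose prompt exceeds the context limit.

    Step 1: split the text into 123,000-character chunks (a rough character
    budget for the 128k-token context window) and run the extraction prompt
    on each chunk. Step 2: ask the model to merge the per-chunk tables into
    a single table.
    """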
|
all_response = "" |
|
    for i in range(len(file_content) // 123000 + 1):
        text = file_content[i * 123000:(i + 1) * 123000]
        if not text:
            continue
|
|
|
messages = [ |
|
{ |
|
"role": "system", |
|
"content": "You are an expert in information extraction from scientific literature.", |
|
}, |
|
{"role": "user", |
|
"content": "The following is a scientific article, please read it carefully: \n{" + text + "}\n" + prompt}, |
|
] |
|
tokens = cal_tokens(messages) |
|
print("step一: 抽取部分{}:".format(i)) |
|
print("prompt tokens:", tokens) |
|
response_2_content = openai_api(messages) |
|
if response_2_content: |
|
all_response += response_2_content + "\n" |
|
|
|
messages = [ |
|
{ |
|
"role": "system", |
|
"content": "You are an expert in information extraction from scientific literature.", |
|
}, |
|
{"role": "user", "content": """Provided Text: |
|
''' |
|
{{""" + all_response + """}} |
|
''' |
|
""" + """ |
|
Combine the above tables into one table. |
|
Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction. |
|
|
|
| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | |
|
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| |
|
| Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ | |
|
| Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH | |
|
| Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ | |
|
"""} |
|
] |
|
tokens = cal_tokens(messages) |
|
print("step二: 合并部分:") |
|
print("prompt tokens:", tokens) |
|
response = openai_api(messages) |
|
return response |
|
|
|
|
|
def predict(prompt, pdf_file): |
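    """Run the extraction pipeline on an uploaded PDF.

    Extracts the text, estimates the prompt size with tiktoken, and either
    calls the API directly or, if the prompt exceeds 128k tokens, strips the
    References section and falls back to the chunked two-step extraction.
    """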
|
if pdf_file is None: |
|
return "Please upload a PDF file to proceed." |
|
|
|
    # file_input uses type="filepath", so pdf_file is a path string.
    file_content = extract_pdf_pypdf(pdf_file)
    if not file_content:
        return "Failed to extract text from the PDF. Please check the file and try again."
|
messages = [ |
|
{ |
|
"role": "system", |
|
"content": "You are an expert in information extraction from scientific literature.", |
|
}, |
|
{"role": "user", "content": """Provided Text: |
|
''' |
|
{{""" + file_content + """}} |
|
''' |
|
""" + prompt} |
|
] |
|
tokens = cal_tokens(messages) |
|
print("开始:抽取") |
|
print("prompt tokens:", tokens) |
|
|
|
if tokens > 128000: |
|
file_content = del_references(file_content) |
|
extract_result = openai_chat_2_step(prompt, file_content) |
|
else: |
|
extract_result = openai_api(messages) |
|
|
|
    return extract_result or "The extraction request failed, possibly because the service is busy. Please try again in a moment."
|
|
|
|
|
def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300): |
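    """Render each page of the PDF to a PNG in image_folder and return the image paths."""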
|
|
|
os.makedirs(image_folder, exist_ok=True) |
|
|
|
|
|
pdf_document = fitz.open(pdf_path) |
|
image_paths = [] |
|
|
|
|
|
for page_number in range(len(pdf_document)): |
|
page = pdf_document[page_number] |
|
pix = page.get_pixmap(dpi=dpi) |
|
image_path = Path(image_folder) / f"page_{page_number + 1}.png" |
|
Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path) |
|
image_paths.append(str(image_path)) |
|
|
|
pdf_document.close() |
|
return image_paths |
|
|
|
|
|
def display_pdf_images(file): |
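    """Convert the uploaded PDF into page images for the gallery viewer."""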
|
|
|
image_paths = convert_pdf_to_images(file) |
|
return image_paths |
|
|
|
|
|
en_1 = """Please read the scientific article provided and extract detailed information about enzymes from a specific organism, focusing on variants or mutants. Your focus should be on data related to the enzyme's activity on substrates at specific concentrations, under certain pH levels and temperatures, and in the presence of different cofactors or cosubstrates at various concentrations. It is essential to identify and record the enzymatic kinetics parameters: Km, Kcat, and Kcat/Km values under these conditions. |
|
|
|
Organize all this information into a table with 13 columns titled: Enzyme, Organism, Substrate, Km, Unit_Km, Kcat, Unit_Kcat, Kcat/Km, Unit_Kcat/Km, Commentary[Temp], Commentary[pH], Commentary[Mutant], and Commentary[Cosubstrate]. |
|
|
|
While performing the tasks, please pay special attention to the following points: |
|
1. Unit retention: Unit_Km, Unit_Kcat, Unit_Kcat/Km should be recorded and output exactly as they appeared in the tables from the Scientific Article Fraction. |
|
2. Scientific Notation: For values in the table that are derived from the article’s headers containing scientific notations, ensure that the actual values entered into the table reflect these notations accordingly. For instance, if an original table specifies 'Kcat/Km × 10^4 (M^-1s^-1)' in table header, then the value entered under 'Kcat/Km' of your table should be '1.4 × 10^4' without any unit if 1.4 was the original figure. Importantly, enter its respective unit 'M^-1s^-1' under 'Unit_Kcat/Km' in your table. Apply this method for each relevant entry, preserving the scientific notation detail as provided in the article. Conversely, for headers not involving scientific notations, simply transcribe values and units as they are, without adding or altering the notation form. |
|
3. Pure Numbers and Units: Please ensure that all numerical values in the columns of 'Km', 'Kcat', and 'Kcat/Km' are entered as pure numbers without any accompanying units. The corresponding units must be placed in their respective 'Unit' columns only, such as 'Unit_Km', 'Unit_Kcat', and 'Unit_Kcat/Km'. This separation of values and units is critical to maintain clarity and consistency in the data representation. |
|
4. Mean Values Only: Include only the mean values and exclude standard deviations or errors, which may be indicated after '±' or wrapped in '()'.
|
5. Full Forms: In the case that abbreviated or shortened forms are used in the entries of certain tables or other informative text, endeavor to trace back to the full forms of these abbreviations in the Scientific Article Fraction and reflect them in the tables you are organizing. |
|
6. Data Derivation: All data must be taken directly from the Scientific Article Fraction provided, not derived through calculation. For example, do not calculate the Kcat/Km ratio by dividing perceived Kcat data by Km data; only use pre-existing Kcat/Km values from the Scientific Article Fraction.
|
7. Ensure that each row of the table corresponds to a unique set of conditions and their respective kinetic parameters for the enzyme being measured. |
|
|
|
|
|
Output the table using the pipe symbol (|) as the delimiter, ensuring each entry is separated by a pipe symbol and properly aligned to maintain the structure of the table. Include only the mean values and exclude standard deviations or errors, which may be indicated after '±' or wrapped in '()'. Include all details and rows in the output, providing a comprehensive extraction of every data point without omissions. Format the complete table data clearly, ensuring that every piece of information is included and no data points are left out. Do not use ellipses or any other form of indication suggesting information is continued elsewhere. The full dataset must be provided as per the structure above, ensuring the integrity and usability of the data for subsequent analyses or applications. Present the complete table data in a clear and organized format in your response, without the need for further confirmation or prompts.
|
|
|
Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction. |
|
|
|
| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | |
|
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| |
|
| Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ | |
|
| Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH | |
|
| Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ | |
|
|
|
Structure your responses to allow for seamless concatenation, presenting all tabular data from a scientific article as a single table, even if the original content had multiple tables. Use the full response capacity to maximize data presentation, avoiding summarizations, commentaries, or introductions at the end of each response. The subsequent response should pick up precisely where the preceding one concluded, commencing from the following character, without the necessity to reiterate the table header or the fragmented words. This method ensures the table is presented completely and seamlessly, despite character limit constraints. Please start by outputting the first segment of the table according to these guidelines. |
|
""" |
|
|
|
|
|
def update_input(): |
|
return en_1 |
|
|
|
|
|
EXCEL_FILE_PATH_Golden_Benchmark = "static/golden benchmark.csv" |
|
EXCEL_FILE_PATH_Expert_Annotated_Dataset = "static/3450_merged_data.csv" |
|
|
|
|
|
def load_excel(EXCEL_FILE_PATH): |
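    """Load a dataset from disk; despite the constant names, the files are CSV.

    Returns a DataFrame on success, or an error-message string on failure.
    """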
|
try: |
|
|
|
|
|
df = pd.read_csv(EXCEL_FILE_PATH) |
|
return df |
|
except Exception as e: |
|
return f"Error loading Excel file: {e}" |
|
|
|
|
|
def get_column_names(EXCEL_FILE_PATH): |
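    """Return the dataset's column names, or an empty list if loading failed."""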
|
df = load_excel(EXCEL_FILE_PATH) |
|
if isinstance(df, str): |
|
return [] |
|
return df.columns.tolist() |
|
|
|
|
|
def search_data_golden(keyword, selected_column): |
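    """Case-insensitive keyword search over the Golden Benchmark dataset."""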
|
df = load_excel(EXCEL_FILE_PATH_Golden_Benchmark) |
|
if isinstance(df, str): |
|
return df |
|
|
|
|
|
if selected_column not in df.columns: |
|
return "Invalid column selected." |
|
|
|
    filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False, regex=False)]
|
|
|
if filtered_df.empty: |
|
return "No results found." |
|
|
|
return filtered_df.to_html(classes='data', index=False, header=True) |
|
|
|
|
|
def search_data_entire(keyword, selected_column): |
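    """Case-insensitive keyword search over the LLENKA expert-annotated dataset."""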
|
df = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset) |
|
if isinstance(df, str): |
|
return df |
|
|
|
|
|
if selected_column not in df.columns: |
|
return "Invalid column selected." |
|
|
|
    filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False, regex=False)]
|
|
|
if filtered_df.empty: |
|
return "No results found." |
|
|
|
return filtered_df.to_html(classes='data', index=False, header=True) |
|
|
|
|
|
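# Gradio UI: an extractor tab, two dataset-viewer tabs, and a paper summary tab.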
with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo: |
|
with gr.Tabs(): |
|
with gr.Tab("Automated Enzyme Kinetics Extractor"): |
|
gr.Markdown( |
|
'''<h1 align="center"> Automated Enzyme Kinetics Extractor </h1> |
|
<p>How to use: |
|
<br><strong>1</strong>: Upload your PDF. |
|
<br><strong>2</strong>: Click "View PDF" to preview it. |
|
<br><strong>3</strong>: Click "Extract Text" to extract Text. |
|
<br><strong>4</strong>: Enter your extraction prompt in the input box. |
|
<br><strong>5</strong>: Click "Generate" to extract, and the extracted information will display below. |
|
</p>''' |
|
) |
|
file_input = gr.File(label="Upload your PDF", type="filepath") |
|
example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input) |
|
with gr.Row(): |
|
viewer_button = gr.Button("View PDF", variant="secondary") |
|
extract_button = gr.Button("Extract Text", variant="primary") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
|
|
file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain") |
|
|
|
with gr.Column(scale=1): |
|
text_output = gr.Textbox( |
|
label="Extracted Text", |
|
interactive=True, |
|
placeholder="Extracted text will appear here...", |
|
lines=39, |
|
max_lines=39, |
|
autoscroll=False, |
|
show_copy_button=True, |
|
elem_id="text-output" |
|
) |
|
|
|
with gr.Column(): |
|
model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt') |
|
exp = gr.Button("Example Prompt") |
|
with gr.Row(): |
|
gen = gr.Button("Generate", variant="primary") |
|
clr = gr.Button("Clear") |
|
outputs = gr.Markdown(label='Output', value="""| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | |
|
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| |
|
| Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ | |
|
| Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH | |
|
| Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ | |
|
|
|
""") |
|
with gr.Tab("Golden Benchmark"): |
|
gr.Markdown( |
|
'''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1> |
|
</p>''' |
|
) |
|
gr.Markdown(""" |
|
The dataset can be downloaded from [LLM-Enzyme-Kinetics-Golden-Benchmark](https://huggingface.co/datasets/jackkuo/LLM-Enzyme-Kinetics-Golden-Benchmark).
|
""") |
|
|
|
with gr.Row(): |
|
|
|
column_names = get_column_names(EXCEL_FILE_PATH_Golden_Benchmark) |
|
column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names) |
|
|
|
|
|
search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...") |
|
|
|
search_button = gr.Button("Search", variant="primary") |
|
|
|
search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000) |
|
|
|
|
|
search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output) |
|
|
|
|
|
search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output) |
|
|
|
|
|
initial_output = load_excel(EXCEL_FILE_PATH_Golden_Benchmark) |
|
if isinstance(initial_output, str): |
|
search_output.value = initial_output |
|
else: |
|
search_output.value = initial_output.to_html(classes='data', index=False, header=True) |
|
|
|
with gr.Tab(" LLM Enzyme Kinetics Archive (LLENKA)"): |
|
gr.Markdown( |
|
'''<h1 align="center"> LLM Enzyme Kinetics Archive (LLENKA) Viewer with Advanced Search </h1> |
|
</p> |
|
''' |
|
) |
|
gr.Markdown(""" |
|
The dataset can be downloaded from [LLM-Enzyme-Kinetics-Archive-LLENKA](https://huggingface.co/datasets/jackkuo/LLM-Enzyme-Kinetics-Archive-LLENKA).
|
""") |
|
with gr.Row(): |
|
|
|
column_names = get_column_names(EXCEL_FILE_PATH_Expert_Annotated_Dataset) |
|
column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names) |
|
|
|
|
|
search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...") |
|
|
|
search_button = gr.Button("Search", variant="primary") |
|
|
|
search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000) |
|
|
|
|
|
            search_button.click(fn=search_data_entire, inputs=[search_box, column_dropdown], outputs=search_output)
|
|
|
|
|
            search_box.submit(fn=search_data_entire, inputs=[search_box, column_dropdown], outputs=search_output)
|
|
|
|
|
initial_output = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset) |
|
if isinstance(initial_output, str): |
|
search_output.value = initial_output |
|
else: |
|
search_output.value = initial_output.to_html(classes='data', index=False, header=True) |
|
with gr.Tab("Paper"): |
|
gr.Markdown( |
|
'''<h1 align="center"> Leveraging Large Language Models for Automated Extraction of Enzyme Kinetics Data from Scientific Literature </h1> |
|
<p><strong>Abstract:</strong> |
|
<br>Enzyme kinetics data reported in the literature are essential for guiding biomedical research, yet their extraction is traditionally performed manually, a process that is both time-consuming and prone to errors, while there is no automatic extraction pipeline available for enzyme kinetics data. Though Large Language Models (LLMs) have witnessed a significant advancement in information extraction in recent years, the inherent capabilities of processing comprehensive scientific data, both precise extraction and objective evaluation, have been less-investigated. Hence achieving fully automated extraction with satisfactory accuracy and offering a comprehensive performance evaluation standard remain a challenging task. This research introduces a novel framework leveraging LLMs for automatic information extraction from academic literature on enzyme kinetics. It integrated OCR conversion, content extraction, and output formatting through prompt engineering, marking a significant advancement in automated data extraction for scientific research. We contributed a meticulously curated golden benchmark of 156 research articles, which serves as both an accurate validation tool and a valuable resource for evaluating LLM capabilities in extraction tasks. This benchmark enables a rigorous assessment of LLMs in scientific language comprehension, biomedical concept understanding, and tabular data interpretation. The best-performing model achieved a recall rate of 92% and a precision rate of 88%. Our approach culminates in the LLM Enzyme Kinetics Archive (LLENKA), a comprehensive dataset derived from 3,435 articles, offering the research community a structured, high-quality resource for enzyme kinetics data facilitating future research endeavors. Our work leveraged the comprehensive inherent capabilities of LLMs and successfully developed an automated information extraction pipeline that enhances productivity, surpasses manual curation, and serves as a paradigm in various fields. |
|
<br>Figure 1: Pipeline for Enzyme Kinetics Data Extraction |
|
</p>''' |
|
) |
|
gr.Image("static/img.png", label="Pipeline for Enzyme Kinetics Data Extraction") |
|
gr.Markdown( |
|
''' |
|
<p align="center">Figure 1: Pipeline for Enzyme Kinetics Data Extraction |
|
</p>''' |
|
) |
|
gr.Markdown( |
|
''' |
|
|
|
| Model | Overall Entries Extracted | Overall Correct Entries | Overall Recall | Overall Precision | Mean Recall by Paper | Mean Precision by Paper | Km Entries Extracted | Km Correct Entries | Km Recall | Km Precision | Kcat Entries Extracted | Kcat Correct Entries | Kcat Recall | Kcat Precision | Kcat/Km Entries Extracted | Kcat/Km Correct Entries | Kcat/Km Recall | Kcat/Km Precision | |
|
|---------------------------|--------------------------|-------------------------|----------------|-------------------|-----------------------|--------------------------|----------------------|---------------------|-----------|--------------|------------------------|-----------------------|-------------|----------------|--------------------------|-------------------------|---------------|-------------------| |
|
| llama 3.1-405B | 8700 | 7839 | 0.72 | 0.90 | 0.80 | 0.89 | 2870 | 2648 | 0.74 | 0.92 | 2849 | 2594 | 0.73 | 0.91 | 2981 | 2597 | 0.69 | 0.87 | |
|
| claude-3.5-sonnet-20240620| 11348 | 9967 | 0.92 | 0.88 | 0.93 | 0.90 | 3840 | 3314 | 0.93 | 0.86 | 3732 | 3310 | 0.94 | 0.89 | 3776 | 3343 | 0.89 | 0.89 | |
|
| GPT-4o | 9810 | 8703 | 0.80 | 0.89 | 0.85 | 0.90 | 3294 | 2932 | 0.82 | 0.89 | 3188 | 2892 | 0.82 | 0.91 | 3328 | 2879 | 0.77 | 0.87 | |
|
| qwen-plus-0806 | 8673 | 7763 | 0.72 | 0.90 | 0.77 | 0.90 | 2932 | 2665 | 0.75 | 0.91 | 2914 | 2638 | 0.75 | 0.91 | 2827 | 2460 | 0.66 | 0.87 | |
|
|
|
''' |
|
) |
|
gr.Markdown( |
|
''' |
|
<p align="center"> |
|
Table 1: Overall Performance of Various Models Examined on 156 Papers |
|
</p> |
|
<p><strong>Please note:</strong> |
|
<br>1. Test model versions: all models were tested in September 2024, The GPT-4o interface was tested on September 23, 2024, while the other model versions are labeled by name. |
|
<br>2. Llama 3.1 is locally deployed, while the other models use online interfaces. |
|
<br>3. The temperature used for all models during testing was 0.3. |
|
<br>4. The maximum outputs of different models also vary, which is discussed in our paper: GPT-4o has 4096 tokens, Claude 3.5 has 8192 tokens, and Qwen-Plus has 8000 tokens, and Llama 3.1 has 4096 tokens. |
|
<br>5. Due to local GPU resource limitations, Llama 3.1 uses a maximum input of 32k tokens. |
|
</p> |
|
''' |
|
) |
|
|
|
extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output) |
|
exp.click(update_input, outputs=model_input) |
|
gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs) |
|
clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs]) |
|
viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out) |
|
|
|
|
|
if __name__ == "__main__":
    demo.launch()
|
|
|
|