Update app.py
Browse files
app.py
CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
6 |
import os
|
7 |
import re
|
8 |
import tiktoken
|
|
|
9 |
|
10 |
api_key = os.getenv('API_KEY')
|
11 |
base_url = os.getenv("BASE_URL")
|
@@ -241,53 +242,209 @@ def update_input():
|
|
241 |
return en_1
|
242 |
|
243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
|
245 |
-
gr.
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
)
|
|
|
|
|
|
|
|
|
277 |
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
|
290 |
-
""")
|
291 |
extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
|
292 |
exp.click(update_input, outputs=model_input)
|
293 |
gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
|
|
|
6 |
import os
|
7 |
import re
|
8 |
import tiktoken
|
9 |
+
import pandas as pd
|
10 |
|
11 |
api_key = os.getenv('API_KEY')
|
12 |
base_url = os.getenv("BASE_URL")
|
|
|
242 |
return en_1
|
243 |
|
244 |
|
245 |
+
EXCEL_FILE_PATH_Golden_Benchmark = "static/20240919_156篇金标准手工提取_v2.xlsx"
EXCEL_FILE_PATH_Expert_Annotated_Dataset = "static/3450_merged_data.xlsx"


def load_excel(EXCEL_FILE_PATH):
    """Load an Excel workbook into a pandas DataFrame.

    Returns the DataFrame on success. On any failure (missing file,
    unreadable format, ...) returns a human-readable error-message
    string instead; callers distinguish the two outcomes with
    ``isinstance(result, str)``.
    """
    try:
        return pd.read_excel(EXCEL_FILE_PATH)
    except Exception as exc:  # deliberately broad: surface any read error as text
        return f"Error loading Excel file: {exc}"
257 |
+
|
258 |
+
def get_column_names(EXCEL_FILE_PATH):
    """Return the workbook's column names, or [] if it failed to load."""
    df = load_excel(EXCEL_FILE_PATH)
    # load_excel signals failure by returning an error string.
    return [] if isinstance(df, str) else list(df.columns)
+
|
264 |
+
|
265 |
+
def search_data_golden(keyword, selected_column):
    """Search the Golden Benchmark sheet for rows whose *selected_column*
    contains *keyword* (case-insensitive, literal substring match).

    Returns an HTML table of the matching rows, or a status/error
    message string when loading fails, the column is invalid, or
    nothing matches.
    """
    df = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
    if isinstance(df, str):  # loader reports failures as strings
        return df

    if selected_column not in df.columns:
        return "Invalid column selected."

    # regex=False: treat the keyword literally so user input such as
    # "(" or "+" cannot raise a regex-compilation error and crash the handler.
    filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False, regex=False)]

    if filtered_df.empty:
        return "No results found."

    return filtered_df.to_html(classes='data', index=False, header=True)
|
280 |
+
|
281 |
+
|
282 |
+
def search_data_entire(keyword, selected_column):
    """Search the Expert Annotated (LLENKA) sheet for rows whose
    *selected_column* contains *keyword* (case-insensitive, literal
    substring match).

    Returns an HTML table of the matching rows, or a status/error
    message string when loading fails, the column is invalid, or
    nothing matches.
    """
    df = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
    if isinstance(df, str):  # loader reports failures as strings
        return df

    if selected_column not in df.columns:
        return "Invalid column selected."

    # regex=False: treat the keyword literally so user input such as
    # "(" or "+" cannot raise a regex-compilation error and crash the handler.
    filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False, regex=False)]

    if filtered_df.empty:
        return "No results found."

    return filtered_df.to_html(classes='data', index=False, header=True)
|
297 |
+
|
298 |
+
|
299 |
# Top-level UI definition. Four tabs: the extractor itself, two searchable
# dataset viewers, and the paper summary.
with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
    with gr.Tabs():
        # --- Tab 1: PDF upload + LLM-based kinetics extraction ---
        with gr.Tab("Automated Enzyme Kinetics Extractor"):
            gr.Markdown(
                '''<h1 align="center"> Automated Enzyme Kinetics Extractor </h1>
<p>How to use:
<br><strong>1</strong>: Upload your PDF.
<br><strong>2</strong>: Click "View PDF" to preview it.
<br><strong>3</strong>: Click "Extract Text" to extract Text.
<br><strong>4</strong>: Enter your extraction prompt in the input box.
<br><strong>5</strong>: Click "Generate" to extract, and the extracted information will display below.
</p>'''
            )
            file_input = gr.File(label="Upload your PDF", type="filepath")
            example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
            with gr.Row():
                viewer_button = gr.Button("View PDF", variant="secondary")
                extract_button = gr.Button("Extract Text", variant="primary")

            with gr.Row():
                with gr.Column(scale=1):
                    file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")

                with gr.Column(scale=1):
                    text_output = gr.Textbox(
                        label="Extracted Text",
                        interactive=True,
                        placeholder="Extracted text will appear here...",
                        lines=39,
                        max_lines=39,  # cap visible rows; longer text scrolls
                        autoscroll=False,  # keep the view anchored at the top
                        show_copy_button=True,
                        elem_id="text-output"
                    )

            with gr.Column():
                model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
                exp = gr.Button("Example Prompt")
                with gr.Row():
                    gen = gr.Button("Generate", variant="primary")
                    clr = gr.Button("Clear")
                # Placeholder table showing the expected output format.
                outputs = gr.Markdown(label='Output', value="""| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
| Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
| Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
| Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |

""")

        # --- Tab 2: searchable viewer over the 156-paper golden benchmark ---
        with gr.Tab("Golden Benchmark"):
            gr.Markdown(
                '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
</p>'''
            )
            with gr.Row():
                # Column the keyword is matched against.
                column_names = get_column_names(EXCEL_FILE_PATH_Golden_Benchmark)
                column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)

                search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
                search_button = gr.Button("Search", variant="primary")

            search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)

            # Both the button and the textbox's Enter key trigger the search.
            search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
            search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)

            # Pre-populate the view with the entire sheet.
            initial_output = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
            if isinstance(initial_output, str):
                search_output.value = initial_output  # loader returned an error message
            else:
                search_output.value = initial_output.to_html(classes='data', index=False, header=True)

        # --- Tab 3: searchable viewer over the full LLENKA dataset ---
        with gr.Tab(" LLM Enzyme Kinetics Archive (LLENKA)"):
            gr.Markdown(
                '''<h1 align="center"> LLM Enzyme Kinetics Archive (LLENKA) Viewer with Advanced Search </h1>
</p>'''
            )
            with gr.Row():
                # Column the keyword is matched against.
                column_names = get_column_names(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
                column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)

                search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
                search_button = gr.Button("Search", variant="primary")

            search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)

            # BUG FIX: this tab previously wired both events to
            # search_data_golden, so searches here silently queried the
            # Golden Benchmark sheet instead of the LLENKA dataset.
            search_button.click(fn=search_data_entire, inputs=[search_box, column_dropdown], outputs=search_output)
            search_box.submit(fn=search_data_entire, inputs=[search_box, column_dropdown], outputs=search_output)

            # Pre-populate the view with the entire sheet.
            initial_output = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
            if isinstance(initial_output, str):
                search_output.value = initial_output  # loader returned an error message
            else:
                search_output.value = initial_output.to_html(classes='data', index=False, header=True)

        # --- Tab 4: static paper summary (abstract, pipeline figure, results) ---
        with gr.Tab("Paper"):
            gr.Markdown(
                '''<h1 align="center"> Leveraging Large Language Models for Automated Extraction of Enzyme Kinetics Data from Scientific Literature </h1>
<p><strong>Abstract:</strong>
<br>Enzyme kinetics data reported in the literature are essential for guiding biomedical research, yet their extraction is traditionally performed manually, a process that is both time-consuming and prone to errors, while there is no automatic extraction pipeline available for enzyme kinetics data. Though Large Language Models (LLMs) have witnessed a significant advancement in information extraction in recent years, the inherent capabilities of processing comprehensive scientific data, both precise extraction and objective evaluation, have been less-investigated. Hence achieving fully automated extraction with satisfactory accuracy and offering a comprehensive performance evaluation standard remain a challenging task. This research introduces a novel framework leveraging LLMs for automatic information extraction from academic literature on enzyme kinetics. It integrated OCR conversion, content extraction, and output formatting through prompt engineering, marking a significant advancement in automated data extraction for scientific research. We contributed a meticulously curated golden benchmark of 156 research articles, which serves as both an accurate validation tool and a valuable resource for evaluating LLM capabilities in extraction tasks. This benchmark enables a rigorous assessment of LLMs in scientific language comprehension, biomedical concept understanding, and tabular data interpretation. The best-performing model achieved a recall rate of 92% and a precision rate of 88%. Our approach culminates in the LLM Enzyme Kinetics Archive (LLENKA), a comprehensive dataset derived from 3,435 articles, offering the research community a structured, high-quality resource for enzyme kinetics data facilitating future research endeavors. Our work leveraged the comprehensive inherent capabilities of LLMs and successfully developed an automated information extraction pipeline that enhances productivity, surpasses manual curation, and serves as a paradigm in various fields.
<br>Figure 1: Pipeline for Enzyme Kinetics Data Extraction
</p>'''
            )
            gr.Image("static/img.png", label="Pipeline for Enzyme Kinetics Data Extraction")
            gr.Markdown(
                '''
<p align="center">Figure 1: Pipeline for Enzyme Kinetics Data Extraction
</p>'''
            )
            gr.Markdown(
                '''

| Model | Overall Entries Extracted | Overall Correct Entries | Overall Recall | Overall Precision | Mean Recall by Paper | Mean Precision by Paper | Km Entries Extracted | Km Correct Entries | Km Recall | Km Precision | Kcat Entries Extracted | Kcat Correct Entries | Kcat Recall | Kcat Precision | Kcat/Km Entries Extracted | Kcat/Km Correct Entries | Kcat/Km Recall | Kcat/Km Precision |
|---------------------------|--------------------------|-------------------------|----------------|-------------------|-----------------------|--------------------------|----------------------|---------------------|-----------|--------------|------------------------|-----------------------|-------------|----------------|--------------------------|-------------------------|---------------|-------------------|
| llama 3.1-405B | 8700 | 7839 | 0.72 | 0.90 | 0.80 | 0.89 | 2870 | 2648 | 0.74 | 0.92 | 2849 | 2594 | 0.73 | 0.91 | 2981 | 2597 | 0.69 | 0.87 |
| claude-3.5-sonnet-20240620| 11348 | 9967 | 0.92 | 0.88 | 0.93 | 0.90 | 3840 | 3314 | 0.93 | 0.86 | 3732 | 3310 | 0.94 | 0.89 | 3776 | 3343 | 0.89 | 0.89 |
| GPT-4o | 9810 | 8703 | 0.80 | 0.89 | 0.85 | 0.90 | 3294 | 2932 | 0.82 | 0.89 | 3188 | 2892 | 0.82 | 0.91 | 3328 | 2879 | 0.77 | 0.87 |
| qwen-plus-0806 | 8673 | 7763 | 0.72 | 0.90 | 0.77 | 0.90 | 2932 | 2665 | 0.75 | 0.91 | 2914 | 2638 | 0.75 | 0.91 | 2827 | 2460 | 0.66 | 0.87 |

'''
            )
            gr.Markdown(
                '''
<p align="center">
Table 1: Overall Performance of Various Models Examined on 156 Papers
</p>
<p><strong>Please note:</strong>
<br>1. Test model versions: all models were tested in September 2024, The GPT-4o interface was tested on September 23, 2024, while the other model versions are labeled by name.
<br>2. Llama 3.1 is locally deployed, while the other models use online interfaces.
<br>3. The temperature used for all models during testing was 0.3.
<br>4. The maximum outputs of different models also vary, which is discussed in our paper: GPT-4o has 4096 tokens, Claude 3.5 has 8192 tokens, and Qwen-Plus has 8000 tokens, and Llama 3.1 has 4096 tokens.
<br>5. Due to local GPU resource limitations, Llama 3.1 uses a maximum input of 32k tokens.
</p>
'''
            )

    # Event wiring for the extractor tab (handlers defined elsewhere in the file).
    extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
    exp.click(update_input, outputs=model_input)
    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
|