jackkuo committed on
Commit
1f32a6b
·
verified ·
1 Parent(s): 0cc927f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -88
app.py CHANGED
@@ -11,10 +11,7 @@ import pandas as pd
11
  api_key = os.getenv('API_KEY')
12
  base_url = os.getenv("BASE_URL")
13
 
14
- client = OpenAI(
15
- api_key=api_key,
16
- base_url=base_url,
17
- )
18
 
19
 
20
  def cal_tokens(message_data):
@@ -29,45 +26,25 @@ def cal_tokens(message_data):
29
 
30
 
31
  def del_references(lines):
32
- # 1.mathpix md的格式:匹配\section*{REFERENCES}xxxx\section*{Table
33
- pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables'
34
- matches = re.search(pattern, lines, re.DOTALL)
35
- if matches:
36
- lines = lines.replace(matches[0], "\section*{Tables\n")
37
- print("1.1.匹配到了References和Tables,删除了References,保留了后面的Tables")
38
- else:
39
- pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
 
 
 
40
  matches = re.search(pattern, lines, re.DOTALL)
41
  if matches:
42
- print("1.2.匹配到了References,删除了References")
43
- lines = lines.replace(matches[0], "")
44
- else:
45
- # 2.md的格式:匹配 ## REFERENCES
46
- pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)'
47
- matches = re.search(pattern, lines, re.DOTALL)
48
- if matches:
49
- lines = lines.replace(matches[0], "Tables")
50
- print("2.1.匹配到了## References和Tables,删除了References,保留了后面的Tables")
51
- else:
52
- pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY'
53
- matches = re.search(pattern, lines, re.DOTALL)
54
- if matches:
55
- lines = lines.replace(matches[0], "# SUPPLEMENTARY")
56
- print("2.2.匹配到了## References和# SUPPLEMENTARY,删除了References,保留了后面的# SUPPLEMENTARY")
57
- else:
58
- pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]'
59
- matches = re.search(pattern, lines, re.DOTALL)
60
- if matches:
61
- print("2.3.匹配到了## References和\[\^0\],删除了References和\[\^0\]之间的内容")
62
- lines = lines.replace(matches[0], "[^0]")
63
- else:
64
- pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
65
- matches = re.search(pattern, lines, re.DOTALL)
66
- if matches:
67
- print("2.4.匹配到了## References,删除了References")
68
- lines = lines.replace(matches[0], "")
69
- else:
70
- print("没有匹配到References")
71
  return lines
72
 
73
 
@@ -110,8 +87,8 @@ def openai_api(messages):
110
 
111
  def openai_chat_2_step(prompt, file_content):
112
  all_response = ""
113
- for i in range(len(file_content) // 123000 + 1):
114
- text = file_content[i * 123000:(i + 1) * 123000]
115
  # step1: 拆分两部分,前半部分
116
  messages = [
117
  {
@@ -155,11 +132,9 @@ Please pay attention to the pipe format as shown in the example below. This form
155
  return response
156
 
157
 
158
- def predict(prompt, pdf_file):
159
- if pdf_file is None:
160
- return "Please upload a PDF file to proceed."
161
 
162
- file_content = extract_pdf_pypdf(pdf_file.name)
163
  messages = [
164
  {
165
  "role": "system",
@@ -176,7 +151,6 @@ def predict(prompt, pdf_file):
176
  print("prompt tokens:", tokens)
177
  # time.sleep(20) # claude 需要加这个
178
  if tokens > 128000:
179
- file_content = del_references(file_content)
180
  extract_result = openai_chat_2_step(prompt, file_content)
181
  else:
182
  extract_result = openai_api(messages)
@@ -242,32 +216,29 @@ def update_input():
242
  return en_1
243
 
244
 
245
- EXCEL_FILE_PATH_Golden_Benchmark = "static/golden benchmark.csv"
246
- EXCEL_FILE_PATH_Expert_Annotated_Dataset = "static/3450_merged_data_2000_lines.csv"
 
247
 
248
 
249
- def load_excel(EXCEL_FILE_PATH):
250
  try:
251
- # 读取 Excel 文件
252
- # df = pd.read_excel(EXCEL_FILE_PATH)
253
- df = pd.read_csv(EXCEL_FILE_PATH)
254
  return df
255
  except Exception as e:
256
- return f"Error loading Excel file: {e}"
257
 
258
 
259
- def get_column_names(EXCEL_FILE_PATH):
260
- df = load_excel(EXCEL_FILE_PATH)
261
  if isinstance(df, str):
262
  return [] # 如果加载失败,返回空列表
263
  return df.columns.tolist() # 返回列名列表
264
 
265
 
266
- def search_data_golden(keyword, selected_column):
267
- df = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
268
  if isinstance(df, str): # 检查是否加载成功
269
  return df
270
-
271
  # 过滤包含关键字的行
272
  if selected_column not in df.columns:
273
  return "Invalid column selected."
@@ -276,25 +247,21 @@ def search_data_golden(keyword, selected_column):
276
 
277
  if filtered_df.empty:
278
  return "No results found."
279
-
280
  return filtered_df.to_html(classes='data', index=False, header=True)
281
 
282
 
283
- def search_data_entire(keyword, selected_column):
284
- df = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
285
- if isinstance(df, str): # 检查是否加载成功
286
- return df
287
 
288
- # 过滤包含关键字的行
289
- if selected_column not in df.columns:
290
- return "Invalid column selected."
291
 
292
- filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False)]
293
-
294
- if filtered_df.empty:
295
- return "No results found."
296
 
297
- return filtered_df.to_html(classes='data', index=False, header=True)
 
 
298
 
299
 
300
  with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
@@ -318,6 +285,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
318
 
319
  with gr.Row():
320
  with gr.Column(scale=1):
 
321
  file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
322
 
323
  with gr.Column(scale=1):
@@ -333,8 +301,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
333
  )
334
 
335
  with gr.Column():
336
- model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here',
337
- label='Input Prompt')
338
  exp = gr.Button("Example Prompt")
339
  with gr.Row():
340
  gen = gr.Button("Generate", variant="primary")
@@ -344,9 +311,9 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
344
  | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
345
  | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
346
  | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
347
-
348
  """)
349
- with gr.Tab("Golden Benchmark"):
350
  gr.Markdown(
351
  '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
352
  </p>'''
@@ -357,7 +324,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
357
 
358
  with gr.Row():
359
  # 选择搜索字段
360
- column_names = get_column_names(EXCEL_FILE_PATH_Golden_Benchmark)
361
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
362
 
363
  # 添加搜索框
@@ -368,13 +335,47 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
368
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
369
 
370
  # 设置搜索功能
371
- search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
372
 
373
  # 将回车事件绑定到搜索按钮
374
- search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
375
 
376
- # 初始加载整个 Excel 表格
377
- initial_output = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  if isinstance(initial_output, str):
379
  search_output.value = initial_output # 直接将错误消息赋值
380
  else:
@@ -391,7 +392,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
391
  """)
392
  with gr.Row():
393
  # 选择搜索字段
394
- column_names = get_column_names(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
395
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
396
 
397
  # 添加搜索框
@@ -402,23 +403,26 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
402
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
403
 
404
  # 设置搜索功能
405
- search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
406
 
407
  # 将回车事件绑定到搜索按钮
408
- search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
409
 
410
- # 初始加载整个 Excel 表格
411
- initial_output = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
412
  if isinstance(initial_output, str):
413
  search_output.value = initial_output # 直接将错误消息赋值
414
  else:
415
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
416
 
 
 
417
  extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
418
  exp.click(update_input, outputs=model_input)
419
- gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
420
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
421
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
422
 
 
423
  demo.launch()
424
 
 
11
  api_key = os.getenv('API_KEY')
12
  base_url = os.getenv("BASE_URL")
13
 
14
+ client = OpenAI(api_key=api_key, base_url=base_url)
 
 
 
15
 
16
 
17
  def cal_tokens(message_data):
 
26
 
27
 
28
def del_references(lines):
    """Strip the references section from extracted paper text.

    Six patterns are tried in priority order and only the first one that
    matches is applied. The first two target Mathpix/LaTeX-style headings
    (``\\section*{REFERENCES}``); the remaining four target Markdown
    headings (``## References``). Where a pattern ends at a marker for
    content that follows the references (tables, supplementary material,
    footnotes), that marker is put back so the trailing content survives.

    Args:
        lines: The full document text as a single string.

    Returns:
        The text with the references section removed, or the input
        unchanged when no pattern matches.
    """
    # Shared heading alternation, kept in one place so the six patterns
    # cannot drift apart.
    heading = (r'(References|Reference|REFERENCES|LITERATURE CITED|'
               r'Referencesand notes|Notes and references)')
    latex_heading = r'\*\{.{0,5}' + heading
    md_heading = r'#.{0,15}' + heading

    # (pattern, text re-inserted in place of the match)
    patterns = [
        # The replacement is handed to str.replace, which takes it
        # literally: it must NOT be a raw string, otherwise a backslash
        # followed by "n" is inserted instead of a newline.
        (latex_heading + r'(.*?)\\section\*\{Tables', '\\section*{Tables\n'),
        (latex_heading + r'(.*)', ''),
        # The alternation always stops after "Table" (first alternative
        # wins), so only "Table" is restored; restoring "Tables" would turn
        # "Tables" into "Tabless" and "Table 1" into "Tables 1".
        (md_heading + r'(.*?)(Table|Tables)', 'Table'),
        (md_heading + r'(.*?)# SUPPLEMENTARY', '# SUPPLEMENTARY'),
        (md_heading + r'(.*?)\[\^0\]', '[^0]'),
        (md_heading + r'(.*)', ''),
    ]

    for pattern, replacement in patterns:
        matches = re.search(pattern, lines, re.DOTALL)
        if matches:
            print(f"匹配到了 {pattern}, 删除了 References, 保留了后面的 {replacement}")
            return lines.replace(matches[0], replacement)

    print("没有匹配到 References")
    return lines
49
 
50
 
 
87
 
88
  def openai_chat_2_step(prompt, file_content):
89
  all_response = ""
90
+ for i in range(len(file_content)//123000 + 1):
91
+ text = file_content[i*123000:(i+1)*123000]
92
  # step1: 拆分两部分,前半部分
93
  messages = [
94
  {
 
132
  return response
133
 
134
 
135
+ def predict(prompt, file_content):
136
+ file_content = del_references(file_content)
 
137
 
 
138
  messages = [
139
  {
140
  "role": "system",
 
151
  print("prompt tokens:", tokens)
152
  # time.sleep(20) # claude 需要加这个
153
  if tokens > 128000:
 
154
  extract_result = openai_chat_2_step(prompt, file_content)
155
  else:
156
  extract_result = openai_api(messages)
 
216
  return en_1
217
 
218
 
219
+ CSV_FILE_PATH_Golden_Benchmark_Enzyme = "static/Golden Benchmark for Enzyme Kinetics.csv"
220
+ CSV_FILE_PATH_Golden_Benchmark_Ribozyme = "static/Golden Benchmark for Ribozyme Kinetics.csv"
221
+ CSV_FILE_PATH_LLENKA_Dataset = "static/3450_merged_data_2000_lines.csv"
222
 
223
 
224
def load_csv(CSV_FILE_PATH):
    """Read a CSV file into a DataFrame.

    Returns the DataFrame on success. On any failure the exception is
    swallowed and a human-readable error string is returned instead, so
    callers must check ``isinstance(result, str)`` before using it.
    """
    try:
        return pd.read_csv(CSV_FILE_PATH)
    except Exception as err:
        message = f"Error loading CSV file: {err}"
        return message
230
 
231
 
232
def get_column_names(CSV_FILE_PATH):
    """Return the column names of the CSV at ``CSV_FILE_PATH`` as a list.

    ``load_csv`` signals failure by returning an error string; in that
    case an empty list is returned.
    """
    loaded = load_csv(CSV_FILE_PATH)
    # A string result means loading failed, so there are no columns.
    return [] if isinstance(loaded, str) else loaded.columns.tolist()
237
 
238
 
239
+ def search_data(df, keyword, selected_column):
 
240
  if isinstance(df, str): # 检查是否加载成功
241
  return df
 
242
  # 过滤包含关键字的行
243
  if selected_column not in df.columns:
244
  return "Invalid column selected."
 
247
 
248
  if filtered_df.empty:
249
  return "No results found."
 
250
  return filtered_df.to_html(classes='data', index=False, header=True)
251
 
252
 
253
def search_data_golden_Enzyme(keyword, selected_column):
    """Search the enzyme-kinetics golden benchmark CSV.

    Loads the CSV on every call and delegates filtering to ``search_data``.
    """
    return search_data(
        load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme),
        keyword,
        selected_column,
    )
 
256
 
257
def search_data_golden_Ribozyme(keyword, selected_column):
    """Search the ribozyme-kinetics golden benchmark CSV.

    Loads the CSV on every call and delegates filtering to ``search_data``.
    """
    return search_data(
        load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme),
        keyword,
        selected_column,
    )
260
 
 
 
 
 
261
 
262
def search_data_LLENKA(keyword, selected_column):
    """Search the LLENKA dataset CSV.

    Loads the CSV on every call and delegates filtering to ``search_data``.
    """
    return search_data(
        load_csv(CSV_FILE_PATH_LLENKA_Dataset),
        keyword,
        selected_column,
    )
265
 
266
 
267
  with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
 
285
 
286
  with gr.Row():
287
  with gr.Column(scale=1):
288
+
289
  file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
290
 
291
  with gr.Column(scale=1):
 
301
  )
302
 
303
  with gr.Column():
304
+ model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
 
305
  exp = gr.Button("Example Prompt")
306
  with gr.Row():
307
  gen = gr.Button("Generate", variant="primary")
 
311
  | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
312
  | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
313
  | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
314
+
315
  """)
316
+ with gr.Tab("Golden Benchmark for Enzyme Kinetics"):
317
  gr.Markdown(
318
  '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
319
  </p>'''
 
324
 
325
  with gr.Row():
326
  # 选择搜索字段
327
+ column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
328
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
329
 
330
  # 添加搜索框
 
335
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
336
 
337
  # 设置搜索功能
338
+ search_button.click(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
339
 
340
  # 将回车事件绑定到搜索按钮
341
+ search_box.submit(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
342
 
343
+ # 初始加载整个 CSV 表格
344
+ initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
345
+ if isinstance(initial_output, str):
346
+ search_output.value = initial_output # 直接将错误消息赋值
347
+ else:
348
+ search_output.value = initial_output.to_html(classes='data', index=False, header=True)
349
+
350
+ with gr.Tab("Golden Benchmark for Ribozyme Kinetics"):
351
+ gr.Markdown(
352
+ '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
353
+ </p>'''
354
+ )
355
+ gr.Markdown("""
356
+ dataset can be download in [LLM-Ribozyme-Kinetics-Golden-Benchmark](https://huggingface.co/datasets/jackkuo/LLM-Ribozyme-Kinetics-Golden-Benchmark)
357
+ """)
358
+
359
+ with gr.Row():
360
+ # 选择搜索字段
361
+ column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
362
+ column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
363
+
364
+ # 添加搜索框
365
+ search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
366
+ # 按钮点击后执行搜索
367
+ search_button = gr.Button("Search", variant="primary")
368
+
369
+ search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
370
+
371
+ # 设置搜索功能
372
+ search_button.click(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
373
+
374
+ # 将回车事件绑定到搜索按钮
375
+ search_box.submit(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
376
+
377
+ # 初始加载整个 CSV 表格
378
+ initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
379
  if isinstance(initial_output, str):
380
  search_output.value = initial_output # 直接将错误消息赋值
381
  else:
 
392
  """)
393
  with gr.Row():
394
  # 选择搜索字段
395
+ column_names = get_column_names(CSV_FILE_PATH_LLENKA_Dataset)
396
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
397
 
398
  # 添加搜索框
 
403
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
404
 
405
  # 设置搜索功能
406
+ search_button.click(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
407
 
408
  # 将回车事件绑定到搜索按钮
409
+ search_box.submit(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
410
 
411
+ # 初始加载整个 CSV 表格
412
+ initial_output = load_csv(CSV_FILE_PATH_LLENKA_Dataset)
413
  if isinstance(initial_output, str):
414
  search_output.value = initial_output # 直接将错误消息赋值
415
  else:
416
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
417
 
418
+
419
+
420
  extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
421
  exp.click(update_input, outputs=model_input)
422
+ gen.click(fn=predict, inputs=[model_input, text_output], outputs=outputs)
423
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
424
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
425
 
426
+
427
  demo.launch()
428