jackkuo committed on
Commit
ddd39dd
·
verified ·
1 Parent(s): 0e4e1ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -83
app.py CHANGED
@@ -11,10 +11,7 @@ import pandas as pd
11
  api_key = os.getenv('API_KEY')
12
  base_url = os.getenv("BASE_URL")
13
 
14
- client = OpenAI(
15
- api_key=api_key,
16
- base_url=base_url,
17
- )
18
 
19
 
20
  def cal_tokens(message_data):
@@ -29,45 +26,25 @@ def cal_tokens(message_data):
29
 
30
 
31
  def del_references(lines):
32
- # 1.mathpix md的格式:匹配\section*{REFERENCES}xxxx\section*{Table
33
- pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables'
34
- matches = re.search(pattern, lines, re.DOTALL)
35
- if matches:
36
- lines = lines.replace(matches[0], "\section*{Tables\n")
37
- print("1.1.匹配到了References和Tables,删除了References,保留了后面的Tables")
38
- else:
39
- pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
 
 
 
40
  matches = re.search(pattern, lines, re.DOTALL)
41
  if matches:
42
- print("1.2.匹配到了References,删除了References")
43
- lines = lines.replace(matches[0], "")
44
- else:
45
- # 2.md的格式:匹配 ## REFERENCES
46
- pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)'
47
- matches = re.search(pattern, lines, re.DOTALL)
48
- if matches:
49
- lines = lines.replace(matches[0], "Tables")
50
- print("2.1.匹配到了## References和Tables,删除了References,保留了后面的Tables")
51
- else:
52
- pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY'
53
- matches = re.search(pattern, lines, re.DOTALL)
54
- if matches:
55
- lines = lines.replace(matches[0], "# SUPPLEMENTARY")
56
- print("2.2.匹配到了## References和# SUPPLEMENTARY,删除了References,保留了后面的# SUPPLEMENTARY")
57
- else:
58
- pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]'
59
- matches = re.search(pattern, lines, re.DOTALL)
60
- if matches:
61
- print("2.3.匹配到了## References和\[\^0\],删除了References和\[\^0\]之间的内容")
62
- lines = lines.replace(matches[0], "[^0]")
63
- else:
64
- pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
65
- matches = re.search(pattern, lines, re.DOTALL)
66
- if matches:
67
- print("2.4.匹配到了## References,删除了References")
68
- lines = lines.replace(matches[0], "")
69
- else:
70
- print("没有匹配到References")
71
  return lines
72
 
73
 
@@ -155,11 +132,9 @@ Please pay attention to the pipe format as shown in the example below. This form
155
  return response
156
 
157
 
158
- def predict(prompt, pdf_file):
159
- if pdf_file is None:
160
- return "Please upload a PDF file to proceed."
161
 
162
- file_content = extract_pdf_pypdf(pdf_file.name)
163
  messages = [
164
  {
165
  "role": "system",
@@ -176,7 +151,6 @@ def predict(prompt, pdf_file):
176
  print("prompt tokens:", tokens)
177
  # time.sleep(20) # claude 需要加这个
178
  if tokens > 128000:
179
- file_content = del_references(file_content)
180
  extract_result = openai_chat_2_step(prompt, file_content)
181
  else:
182
  extract_result = openai_api(messages)
@@ -242,32 +216,29 @@ def update_input():
242
  return en_1
243
 
244
 
245
- EXCEL_FILE_PATH_Golden_Benchmark = "static/golden benchmark.csv"
246
- EXCEL_FILE_PATH_Expert_Annotated_Dataset = "static/3450_merged_data_2000_lines.csv"
 
247
 
248
 
249
- def load_excel(EXCEL_FILE_PATH):
250
  try:
251
- # 读取 Excel 文件
252
- # df = pd.read_excel(EXCEL_FILE_PATH)
253
- df = pd.read_csv(EXCEL_FILE_PATH)
254
  return df
255
  except Exception as e:
256
- return f"Error loading Excel file: {e}"
257
 
258
 
259
- def get_column_names(EXCEL_FILE_PATH):
260
- df = load_excel(EXCEL_FILE_PATH)
261
  if isinstance(df, str):
262
  return [] # 如果加载失败,返回空列表
263
  return df.columns.tolist() # 返回列名列表
264
 
265
 
266
- def search_data_golden(keyword, selected_column):
267
- df = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
268
  if isinstance(df, str): # 检查是否加载成功
269
  return df
270
-
271
  # 过滤包含关键字的行
272
  if selected_column not in df.columns:
273
  return "Invalid column selected."
@@ -276,25 +247,21 @@ def search_data_golden(keyword, selected_column):
276
 
277
  if filtered_df.empty:
278
  return "No results found."
279
-
280
  return filtered_df.to_html(classes='data', index=False, header=True)
281
 
282
 
283
- def search_data_entire(keyword, selected_column):
284
- df = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
285
- if isinstance(df, str): # 检查是否加载成功
286
- return df
287
 
288
- # 过滤包含关键字的行
289
- if selected_column not in df.columns:
290
- return "Invalid column selected."
291
-
292
- filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False)]
293
 
294
- if filtered_df.empty:
295
- return "No results found."
296
 
297
- return filtered_df.to_html(classes='data', index=False, header=True)
 
 
298
 
299
 
300
  with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
@@ -346,7 +313,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
346
  | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
347
 
348
  """)
349
- with gr.Tab("Golden Benchmark"):
350
  gr.Markdown(
351
  '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
352
  </p>'''
@@ -357,7 +324,41 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
357
 
358
  with gr.Row():
359
  # 选择搜索字段
360
- column_names = get_column_names(EXCEL_FILE_PATH_Golden_Benchmark)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
362
 
363
  # 添加搜索框
@@ -368,13 +369,13 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
368
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
369
 
370
  # 设置搜索功能
371
- search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
372
 
373
  # 将回车事件绑定到搜索按钮
374
- search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
375
 
376
- # 初始加载整个 Excel 表格
377
- initial_output = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
378
  if isinstance(initial_output, str):
379
  search_output.value = initial_output # 直接将错误消息赋值
380
  else:
@@ -391,7 +392,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
391
  """)
392
  with gr.Row():
393
  # 选择搜索字段
394
- column_names = get_column_names(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
395
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
396
 
397
  # 添加搜索框
@@ -402,21 +403,22 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
402
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
403
 
404
  # 设置搜索功能
405
- search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
406
 
407
  # 将回车事件绑定到搜索按钮
408
- search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
409
 
410
- # 初始加载整个 Excel 表格
411
- initial_output = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
412
  if isinstance(initial_output, str):
413
  search_output.value = initial_output # 直接将错误消息赋值
414
  else:
415
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
416
 
 
417
  extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
418
  exp.click(update_input, outputs=model_input)
419
- gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
420
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
421
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
422
 
 
11
  api_key = os.getenv('API_KEY')
12
  base_url = os.getenv("BASE_URL")
13
 
14
+ client = OpenAI(api_key=api_key, base_url=base_url)
 
 
 
15
 
16
 
17
  def cal_tokens(message_data):
 
26
 
27
 
28
  def del_references(lines):
29
+ # 定义正则表达式模式
30
+ patterns = [
31
+ (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables', r'\section*{Tables\n'),
32
+ (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', ''),
33
+ (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)', r'Tables'),
34
+ (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY', r'# SUPPLEMENTARY'),
35
+ (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\[\^0\]', r'[^0]'),
36
+ (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', '')
37
+ ]
38
+
39
+ for pattern, replacement in patterns:
40
  matches = re.search(pattern, lines, re.DOTALL)
41
  if matches:
42
+ lines = lines.replace(matches[0], replacement)
43
+ print(f"匹配到了 {pattern}, 删除了 References, 保留了后面的 {replacement}")
44
+ break
45
+ else:
46
+ print("没有匹配到 References")
47
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  return lines
49
 
50
 
 
132
  return response
133
 
134
 
135
+ def predict(prompt, file_content):
136
+ file_content = del_references(file_content)
 
137
 
 
138
  messages = [
139
  {
140
  "role": "system",
 
151
  print("prompt tokens:", tokens)
152
  # time.sleep(20) # claude 需要加这个
153
  if tokens > 128000:
 
154
  extract_result = openai_chat_2_step(prompt, file_content)
155
  else:
156
  extract_result = openai_api(messages)
 
216
  return en_1
217
 
218
 
219
+ CSV_FILE_PATH_Golden_Benchmark_Enzyme = "static/Golden Benchmark for Enzyme Kinetics.csv"
220
+ CSV_FILE_PATH_Golden_Benchmark_Ribozyme = "static/Golden Benchmark for Ribozyme Kinetics.csv"
221
+ CSV_FILE_PATH_LLENKA_Dataset = "static/3450_merged_data_2000_lines.csv"
222
 
223
 
224
+ def load_csv(CSV_FILE_PATH):
225
  try:
226
+ df = pd.read_csv(CSV_FILE_PATH)
 
 
227
  return df
228
  except Exception as e:
229
+ return f"Error loading CSV file: {e}"
230
 
231
 
232
+ def get_column_names(CSV_FILE_PATH):
233
+ df = load_csv(CSV_FILE_PATH)
234
  if isinstance(df, str):
235
  return [] # 如果加载失败,返回空列表
236
  return df.columns.tolist() # 返回列名列表
237
 
238
 
239
+ def search_data(df, keyword, selected_column):
 
240
  if isinstance(df, str): # 检查是否加载成功
241
  return df
 
242
  # 过滤包含关键字的行
243
  if selected_column not in df.columns:
244
  return "Invalid column selected."
 
247
 
248
  if filtered_df.empty:
249
  return "No results found."
 
250
  return filtered_df.to_html(classes='data', index=False, header=True)
251
 
252
 
253
+ def search_data_golden_Enzyme(keyword, selected_column):
254
+ df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
255
+ return search_data(df, keyword, selected_column)
 
256
 
257
+ def search_data_golden_Ribozyme(keyword, selected_column):
258
+ df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
259
+ return search_data(df, keyword, selected_column)
 
 
260
 
 
 
261
 
262
+ def search_data_LLENKA(keyword, selected_column):
263
+ df = load_csv(CSV_FILE_PATH_LLENKA_Dataset)
264
+ return search_data(df, keyword, selected_column)
265
 
266
 
267
  with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
 
313
  | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
314
 
315
  """)
316
+ with gr.Tab("Golden Benchmark for Enzyme Kinetics"):
317
  gr.Markdown(
318
  '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
319
  </p>'''
 
324
 
325
  with gr.Row():
326
  # 选择搜索字段
327
+ column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
328
+ column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
329
+
330
+ # 添加搜索框
331
+ search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
332
+ # 按钮点击后执行搜索
333
+ search_button = gr.Button("Search", variant="primary")
334
+
335
+ search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
336
+
337
+ # 设置搜索功能
338
+ search_button.click(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
339
+
340
+ # 将回车事件绑定到搜索按钮
341
+ search_box.submit(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
342
+
343
+ # 初始加载整个 CSV 表格
344
+ initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
345
+ if isinstance(initial_output, str):
346
+ search_output.value = initial_output # 直接将错误消息赋值
347
+ else:
348
+ search_output.value = initial_output.to_html(classes='data', index=False, header=True)
349
+
350
+ with gr.Tab("Golden Benchmark for Ribozyme Kinetics"):
351
+ gr.Markdown(
352
+ '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
353
+ </p>'''
354
+ )
355
+ gr.Markdown("""
356
+ dataset can be download in [LLM-Ribozyme-Kinetics-Golden-Benchmark](https://huggingface.co/datasets/jackkuo/LLM-Ribozyme-Kinetics-Golden-Benchmark)
357
+ """)
358
+
359
+ with gr.Row():
360
+ # 选择搜索字段
361
+ column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
362
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
363
 
364
  # 添加搜索框
 
369
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
370
 
371
  # 设置搜索功能
372
+ search_button.click(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
373
 
374
  # 将回车事件绑定到搜索按钮
375
+ search_box.submit(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
376
 
377
+ # 初始加载整个 CSV 表格
378
+ initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
379
  if isinstance(initial_output, str):
380
  search_output.value = initial_output # 直接将错误消息赋值
381
  else:
 
392
  """)
393
  with gr.Row():
394
  # 选择搜索字段
395
+ column_names = get_column_names(CSV_FILE_PATH_LLENKA_Dataset)
396
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
397
 
398
  # 添加搜索框
 
403
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
404
 
405
  # 设置搜索功能
406
+ search_button.click(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
407
 
408
  # 将回车事件绑定到搜索按钮
409
+ search_box.submit(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
410
 
411
+ # 初始加载整个 CSV 表格
412
+ initial_output = load_csv(CSV_FILE_PATH_LLENKA_Dataset)
413
  if isinstance(initial_output, str):
414
  search_output.value = initial_output # 直接将错误消息赋值
415
  else:
416
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
417
 
418
+
419
  extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
420
  exp.click(update_input, outputs=model_input)
421
+ gen.click(fn=predict, inputs=[model_input, text_output], outputs=outputs)
422
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
423
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
424