Spaces:

jackkuo
/

Automated-Enzyme-Kinetics-Extractor

Running

App Files Files Community

jackkuo committed on Jan 24

Commit

1f32a6b

verified ·

1 Parent(s): 0cc927f

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -88

app.py CHANGED Viewed

@@ -11,10 +11,7 @@ import pandas as pd
 api_key = os.getenv('API_KEY')
 base_url = os.getenv("BASE_URL")
-client = OpenAI(
-    api_key=api_key,
-    base_url=base_url,
-)
 def cal_tokens(message_data):
@@ -29,45 +26,25 @@ def cal_tokens(message_data):
 def del_references(lines):
-    # 1.mathpix md的格式：匹配\section*{REFERENCES}xxxx\section*{Table
-    pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables'
-    matches = re.search(pattern, lines, re.DOTALL)
-    if matches:
-        lines = lines.replace(matches[0], "\section*{Tables\n")
-        print("1.1.匹配到了References和Tables,删除了References，保留了后面的Tables")
-    else:
-        pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
         matches = re.search(pattern, lines, re.DOTALL)
         if matches:
-            print("1.2.匹配到了References,删除了References")
-            lines = lines.replace(matches[0], "")
-        else:
-            # 2.md的格式：匹配 ## REFERENCES
-            pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)'
-            matches = re.search(pattern, lines, re.DOTALL)
-            if matches:
-                lines = lines.replace(matches[0], "Tables")
-                print("2.1.匹配到了## References和Tables,删除了References，保留了后面的Tables")
-            else:
-                pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY'
-                matches = re.search(pattern, lines, re.DOTALL)
-                if matches:
-                    lines = lines.replace(matches[0], "# SUPPLEMENTARY")
-                    print("2.2.匹配到了## References和# SUPPLEMENTARY,删除了References，保留了后面的# SUPPLEMENTARY")
-                else:
-                    pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]'
-                    matches = re.search(pattern, lines, re.DOTALL)
-                    if matches:
-                        print("2.3.匹配到了## References和\[\^0\],删除了References和\[\^0\]之间的内容")
-                        lines = lines.replace(matches[0], "[^0]")
-                    else:
-                        pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
-                        matches = re.search(pattern, lines, re.DOTALL)
-                        if matches:
-                            print("2.4.匹配到了## References,删除了References")
-                            lines = lines.replace(matches[0], "")
-                        else:
-                            print("没有匹配到References")
     return lines
@@ -110,8 +87,8 @@ def openai_api(messages):
 def openai_chat_2_step(prompt, file_content):
     all_response = ""
-    for i in range(len(file_content) // 123000 + 1):
-        text = file_content[i * 123000:(i + 1) * 123000]
         # step1: 拆分两部分，前半部分
         messages = [
             {
@@ -155,11 +132,9 @@ Please pay attention to the pipe format as shown in the example below. This form
     return response
-def predict(prompt, pdf_file):
-    if pdf_file is None:
-        return "Please upload a PDF file to proceed."
-    file_content = extract_pdf_pypdf(pdf_file.name)
     messages = [
         {
             "role": "system",
@@ -176,7 +151,6 @@ def predict(prompt, pdf_file):
     print("prompt tokens:", tokens)
     # time.sleep(20) # claude 需要加这个
     if tokens > 128000:
-        file_content = del_references(file_content)
         extract_result = openai_chat_2_step(prompt, file_content)
     else:
         extract_result = openai_api(messages)
@@ -242,32 +216,29 @@ def update_input():
     return en_1
-EXCEL_FILE_PATH_Golden_Benchmark = "static/golden benchmark.csv"
-EXCEL_FILE_PATH_Expert_Annotated_Dataset = "static/3450_merged_data_2000_lines.csv"
-def load_excel(EXCEL_FILE_PATH):
     try:
-        # 读取 Excel 文件
-        # df = pd.read_excel(EXCEL_FILE_PATH)
-        df = pd.read_csv(EXCEL_FILE_PATH)
         return df
     except Exception as e:
-        return f"Error loading Excel file: {e}"
-def get_column_names(EXCEL_FILE_PATH):
-    df = load_excel(EXCEL_FILE_PATH)
     if isinstance(df, str):
         return []  # 如果加载失败，返回空列表
     return df.columns.tolist()  # 返回列名列表
-def search_data_golden(keyword, selected_column):
-    df = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
     if isinstance(df, str):  # 检查是否加载成功
         return df
     # 过滤包含关键字的行
     if selected_column not in df.columns:
         return "Invalid column selected."
@@ -276,25 +247,21 @@ def search_data_golden(keyword, selected_column):
     if filtered_df.empty:
         return "No results found."
     return filtered_df.to_html(classes='data', index=False, header=True)
-def search_data_entire(keyword, selected_column):
-    df = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
-    if isinstance(df, str):  # 检查是否加载成功
-        return df
-    # 过滤包含关键字的行
-    if selected_column not in df.columns:
-        return "Invalid column selected."
-    filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False)]
-    if filtered_df.empty:
-        return "No results found."
-    return filtered_df.to_html(classes='data', index=False, header=True)
 with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
@@ -318,6 +285,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
             with gr.Row():
                 with gr.Column(scale=1):
                     file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
                 with gr.Column(scale=1):
@@ -333,8 +301,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
                     )
             with gr.Column():
-                model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here',
-                                         label='Input Prompt')
                 exp = gr.Button("Example Prompt")
                 with gr.Row():
                     gen = gr.Button("Generate", variant="primary")
@@ -344,9 +311,9 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
         | Enzyme1    | Bacillus subtilis | Substrate_A | 7.3 | mM      | 6.4  | s^-1      | 1.4 × 10^4   | M^-1s^-1     | 37°C             | 5.0            | WT                 | NADP^+                  |
         | Enzyme2    | Escherichia coli  | Substrate_B | 5.9 | mM      | 9.8  | s^-1      | 29000   | mM^-1min^-1  | 60°C             | 10.0           | Q176E             | NADPH                   |
         | Enzyme3    | Homo sapiens      | Substrate_C | 6.9 | mM      | 15.6 | s^-1      | 43000   | µM^-1s^-1    | 65°C             | 8.0            | T253S             | NAD^+                   |
         """)
-        with gr.Tab("Golden Benchmark"):
             gr.Markdown(
                 '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
                 </p>'''
@@ -357,7 +324,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
             with gr.Row():
                 # 选择搜索字段
-                column_names = get_column_names(EXCEL_FILE_PATH_Golden_Benchmark)
                 column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
                 # 添加搜索框
@@ -368,13 +335,47 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
             search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
             # 设置搜索功能
-            search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
             # 将回车事件绑定到搜索按钮
-            search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
-            # 初始加载整个 Excel 表格
-            initial_output = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
             if isinstance(initial_output, str):
                 search_output.value = initial_output  # 直接将错误消息赋值
             else:
@@ -391,7 +392,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
             """)
             with gr.Row():
                 # 选择搜索字段
-                column_names = get_column_names(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
                 column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
                 # 添加搜索框
@@ -402,23 +403,26 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
             search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
             # 设置搜索功能
-            search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
             # 将回车事件绑定到搜索按钮
-            search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
-            # 初始加载整个 Excel 表格
-            initial_output = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
             if isinstance(initial_output, str):
                 search_output.value = initial_output  # 直接将错误消息赋值
             else:
                 search_output.value = initial_output.to_html(classes='data', index=False, header=True)
     extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
     exp.click(update_input, outputs=model_input)
-    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
     clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
     viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
 demo.launch()

 api_key = os.getenv('API_KEY')
 base_url = os.getenv("BASE_URL")
+client = OpenAI(api_key=api_key, base_url=base_url)
 def cal_tokens(message_data):
 def del_references(lines):
     """Remove the references/bibliography section from extracted paper text.

     The patterns are tried in order and only the first one that matches is
     applied: the two Mathpix-style patterns (``*{References ...``) first,
     then the Markdown-heading patterns (``# References ...``).  Patterns
     that end on a later marker (``\\section*{Tables``, ``Table``/``Tables``,
     ``# SUPPLEMENTARY``, ``[^0]``) cut only up to that marker and put a
     marker back; the remaining patterns cut through to the end of the text.

     Args:
         lines: The full document text as a single string.

     Returns:
         The text with the first matched references span removed, or the
         input unchanged when no pattern matches.
     """
     titles = r'(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)'
     mathpix_head = r'\*\{.{0,5}' + titles
     markdown_head = r'#.{0,15}' + titles
     # Each entry is (pattern, text inserted in place of the match).
     # None means "re-insert exactly the trailing marker that matched".
     patterns = [
         # Deliberately NOT a raw string: the replacement is inserted
         # literally, so a raw '\n' would put a backslash-n into the text
         # instead of a newline.
         (mathpix_head + r'(.*?)\\section\*\{Tables', '\\section*{Tables\n'),
         (mathpix_head + r'(.*)', ''),
         # Re-inserting a fixed 'Tables' after matching 'Table' would turn
         # 'Tables' into 'Tabless' and 'Table 1' into 'Tables 1'.
         (markdown_head + r'(.*?)(Table|Tables)', None),
         (markdown_head + r'(.*?)# SUPPLEMENTARY', '# SUPPLEMENTARY'),
         (markdown_head + r'(.*?)\[\^0\]', '[^0]'),
         (markdown_head + r'(.*)', ''),
     ]
     for pattern, replacement in patterns:
         matches = re.search(pattern, lines, re.DOTALL)
         if matches:
             if replacement is None:
                 replacement = matches[matches.lastindex]
             # Splice by position so only the span that actually matched is
             # touched, even if the same text occurs elsewhere.
             lines = lines[:matches.start()] + replacement + lines[matches.end():]
             print(f"匹配到了 {pattern}, 删除了 References, 保留了后面的 {replacement}")
             break
     else:
         print("没有匹配到 References")
     return lines
 def openai_chat_2_step(prompt, file_content):
     all_response = ""
+    for i in range(len(file_content)//123000 + 1):
+        text = file_content[i*123000:(i+1)*123000]
         # step1: 拆分两部分，前半部分
         messages = [
             {
     return response
+def predict(prompt, file_content):
+    file_content = del_references(file_content)
     messages = [
         {
             "role": "system",
     print("prompt tokens:", tokens)
     # time.sleep(20) # claude 需要加这个
     if tokens > 128000:
         extract_result = openai_chat_2_step(prompt, file_content)
     else:
         extract_result = openai_api(messages)
     return en_1
+CSV_FILE_PATH_Golden_Benchmark_Enzyme = "static/Golden Benchmark for Enzyme Kinetics.csv"
+CSV_FILE_PATH_Golden_Benchmark_Ribozyme = "static/Golden Benchmark for Ribozyme Kinetics.csv"
+CSV_FILE_PATH_LLENKA_Dataset = "static/3450_merged_data_2000_lines.csv"
def load_csv(CSV_FILE_PATH):
    """Read a CSV file into a pandas DataFrame.

    Failures are not raised: any exception from ``pd.read_csv`` is turned
    into an error-message string, so callers must check whether the result
    is a ``str`` before using it as a DataFrame.
    """
    try:
        frame = pd.read_csv(CSV_FILE_PATH)
    except Exception as err:
        return f"Error loading CSV file: {err}"
    return frame
def get_column_names(CSV_FILE_PATH):
    """Return the column names of the CSV at ``CSV_FILE_PATH`` as a list.

    An empty list is returned when the file could not be loaded.
    """
    loaded = load_csv(CSV_FILE_PATH)
    # load_csv signals failure by returning a message string, not a DataFrame.
    return [] if isinstance(loaded, str) else loaded.columns.tolist()
+def search_data(df, keyword, selected_column):
     if isinstance(df, str):  # 检查是否加载成功
         return df
     # 过滤包含关键字的行
     if selected_column not in df.columns:
         return "Invalid column selected."
     if filtered_df.empty:
         return "No results found."
     return filtered_df.to_html(classes='data', index=False, header=True)
def search_data_golden_Enzyme(keyword, selected_column):
    """Search the enzyme-kinetics golden benchmark CSV.

    Loads the file on every call and passes the loaded result, the keyword
    and the selected column straight to ``search_data``.
    """
    return search_data(
        load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme),
        keyword,
        selected_column,
    )
def search_data_golden_Ribozyme(keyword, selected_column):
    """Search the ribozyme-kinetics golden benchmark CSV.

    Loads the file on every call and passes the loaded result, the keyword
    and the selected column straight to ``search_data``.
    """
    return search_data(
        load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme),
        keyword,
        selected_column,
    )
def search_data_LLENKA(keyword, selected_column):
    """Search the LLENKA dataset CSV.

    Loads the file on every call and passes the loaded result, the keyword
    and the selected column straight to ``search_data``.
    """
    return search_data(
        load_csv(CSV_FILE_PATH_LLENKA_Dataset),
        keyword,
        selected_column,
    )
 with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
             with gr.Row():
                 with gr.Column(scale=1):
                     file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
                 with gr.Column(scale=1):
                     )
             with gr.Column():
+                model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
                 exp = gr.Button("Example Prompt")
                 with gr.Row():
                     gen = gr.Button("Generate", variant="primary")
         | Enzyme1    | Bacillus subtilis | Substrate_A | 7.3 | mM      | 6.4  | s^-1      | 1.4 × 10^4   | M^-1s^-1     | 37°C             | 5.0            | WT                 | NADP^+                  |
         | Enzyme2    | Escherichia coli  | Substrate_B | 5.9 | mM      | 9.8  | s^-1      | 29000   | mM^-1min^-1  | 60°C             | 10.0           | Q176E             | NADPH                   |
         | Enzyme3    | Homo sapiens      | Substrate_C | 6.9 | mM      | 15.6 | s^-1      | 43000   | µM^-1s^-1    | 65°C             | 8.0            | T253S             | NAD^+                   |
         """)
+        with gr.Tab("Golden Benchmark for Enzyme Kinetics"):
             gr.Markdown(
                 '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
                 </p>'''
             with gr.Row():
                 # 选择搜索字段
+                column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
                 column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
                 # 添加搜索框
             search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
             # 设置搜索功能
+            search_button.click(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
             # 将回车事件绑定到搜索按钮
+            search_box.submit(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
+            # 初始加载整个 CSV 表格
+            initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
+            if isinstance(initial_output, str):
+                search_output.value = initial_output  # 直接将错误消息赋值
+            else:
+                search_output.value = initial_output.to_html(classes='data', index=False, header=True)
+        with gr.Tab("Golden Benchmark for Ribozyme Kinetics"):
+            gr.Markdown(
+                '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
+                </p>'''
+            )
+            gr.Markdown("""
+                dataset can be download in [LLM-Ribozyme-Kinetics-Golden-Benchmark](https://huggingface.co/datasets/jackkuo/LLM-Ribozyme-Kinetics-Golden-Benchmark)
+            """)
+            with gr.Row():
+                # 选择搜索字段
+                column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
+                column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
+                # 添加搜索框
+                search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
+            # 按钮点击后执行搜索
+            search_button = gr.Button("Search", variant="primary")
+            search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
+            # 设置搜索功能
+            search_button.click(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
+            # 将回车事件绑定到搜索按钮
+            search_box.submit(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
+            # 初始加载整个 CSV 表格
+            initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
             if isinstance(initial_output, str):
                 search_output.value = initial_output  # 直接将错误消息赋值
             else:
             """)
             with gr.Row():
                 # 选择搜索字段
+                column_names = get_column_names(CSV_FILE_PATH_LLENKA_Dataset)
                 column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
                 # 添加搜索框
             search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
             # 设置搜索功能
+            search_button.click(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
             # 将回车事件绑定到搜索按钮
+            search_box.submit(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
+            # 初始加载整个 CSV 表格
+            initial_output = load_csv(CSV_FILE_PATH_LLENKA_Dataset)
             if isinstance(initial_output, str):
                 search_output.value = initial_output  # 直接将错误消息赋值
             else:
                 search_output.value = initial_output.to_html(classes='data', index=False, header=True)
     extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
     exp.click(update_input, outputs=model_input)
+    gen.click(fn=predict, inputs=[model_input, text_output], outputs=outputs)
     clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
     viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
 demo.launch()