Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,8 @@ import fitz # PyMuPDF
|
|
4 |
from PIL import Image
|
5 |
from pathlib import Path
|
6 |
import os
|
7 |
-
|
|
|
8 |
|
9 |
api_key = os.getenv('API_KEY')
|
10 |
base_url = os.getenv("BASE_URL")
|
@@ -15,6 +16,60 @@ client = OpenAI(
|
|
15 |
)
|
16 |
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def extract_pdf_pypdf(pdf_dir):
|
19 |
try:
|
20 |
doc = fitz.open(pdf_dir)
|
@@ -52,7 +107,54 @@ def openai_api(messages):
|
|
52 |
return None
|
53 |
|
54 |
|
55 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
if pdf_file is None:
|
57 |
return "Please upload a PDF file to proceed."
|
58 |
|
@@ -66,9 +168,17 @@ def predict(input_text, pdf_file):
|
|
66 |
'''
|
67 |
{{""" + file_content + """}}
|
68 |
'''
|
69 |
-
""" +
|
70 |
]
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
return extract_result or "Too many users. Please wait a moment!"
|
74 |
|
@@ -147,8 +257,10 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
|
|
147 |
with gr.Row():
|
148 |
viewer_button = gr.Button("View PDF", variant="secondary")
|
149 |
extract_button = gr.Button("Extract Text", variant="primary")
|
|
|
150 |
with gr.Row():
|
151 |
with gr.Column(scale=1):
|
|
|
152 |
file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
|
153 |
|
154 |
with gr.Column(scale=1):
|
@@ -167,7 +279,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
|
|
167 |
model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
168 |
exp = gr.Button("Example Prompt")
|
169 |
with gr.Row():
|
170 |
-
gen = gr.Button("Generate")
|
171 |
clr = gr.Button("Clear")
|
172 |
outputs = gr.Markdown(label='Output', value="""| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|
173 |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
|
|
|
4 |
from PIL import Image
|
5 |
from pathlib import Path
|
6 |
import os
|
7 |
+
import re
|
8 |
+
import tiktoken
|
9 |
|
10 |
api_key = os.getenv('API_KEY')
|
11 |
base_url = os.getenv("BASE_URL")
|
|
|
16 |
)
|
17 |
|
18 |
|
19 |
+
def cal_tokens(message_data):
    """Estimate the token count of *message_data* with tiktoken.

    The input is stringified first, so any structure (e.g. a list of chat
    message dicts) is counted via its ``str()`` representation — a close
    estimate, not an exact chat-completions token count.

    Args:
        message_data: Any object; counted as ``str(message_data)``.

    Returns:
        int: Number of tokens under the selected encoding.
    """
    print("use tiktoken")
    try:
        # NOTE(review): counts with the gpt-3.5-turbo-0301 encoding even though
        # callers compare the result against a 128k-token budget — confirm the
        # encoding matches the model actually served behind openai_api.
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() prevents encode() from raising ValueError when the
    # extracted PDF text happens to contain special-token markers such as
    # "<|endoftext|>" (tiktoken raises on them by default).
    num_tokens = len(encoding.encode(str(message_data), disallowed_special=()))
    return num_tokens
|
28 |
+
|
29 |
+
|
30 |
+
def del_references(lines):
    """Remove the references/bibliography section from extracted article text.

    A cascade of regex patterns is tried in order, from most specific to most
    permissive: Mathpix-style ``\\section*{...}`` headings first, then plain
    markdown ``#`` headings. The first pattern that matches wins; patterns
    that name a trailing anchor (``Tables``, ``# SUPPLEMENTARY``, ``[^0]``)
    put that anchor back so the content after the references survives.

    Args:
        lines (str): Full article text (markdown/Mathpix output).

    Returns:
        str: Text with the references span removed, or the input unchanged
        when no pattern matches.
    """
    # Alternation of heading spellings that introduce a reference list.
    refs = r'(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)'
    # (pattern, replacement, log message) triples, tried in order.
    attempts = [
        # 1.1 Mathpix md: \section*{References} ... \section*{Tables
        (r'\*\{.{0,5}' + refs + r'(.*?)\\section\*\{Tables',
         "\\section*{Tables\n",
         "1.1.匹配到了References和Tables,删除了References,保留了后面的Tables"),
        # 1.2 Mathpix md: \section*{References} through to the end of the text
        (r'\*\{.{0,5}' + refs + r'(.*)',
         "",
         "1.2.匹配到了References,删除了References"),
        # 2.1 plain md: ## References ... Table(s)
        (r'#.{0,15}' + refs + r'(.*?)(Table|Tables)',
         "Tables",
         "2.1.匹配到了## References和Tables,删除了References,保留了后面的Tables"),
        # 2.2 plain md: ## References ... # SUPPLEMENTARY
        (r'#.{0,15}' + refs + r'(.*?)# SUPPLEMENTARY',
         "# SUPPLEMENTARY",
         "2.2.匹配到了## References和# SUPPLEMENTARY,删除了References,保留了后面的# SUPPLEMENTARY"),
        # 2.3 plain md: ## References ... [^0] footnote marker
        (r'#.{0,15}' + refs + r'(.*)\[\^0\]',
         "[^0]",
         "2.3.匹配到了## References和\\[\\^0\\],删除了References和\\[\\^0\\]之间的内容"),
        # 2.4 plain md: ## References through to the end of the text
        (r'#.{0,15}' + refs + r'(.*)',
         "",
         "2.4.匹配到了## References,删除了References"),
    ]
    for pattern, replacement, message in attempts:
        matches = re.search(pattern, lines, re.DOTALL)
        if matches:
            print(message)
            # str.replace mirrors the original behaviour: every occurrence of
            # the matched span is substituted (normally there is exactly one).
            return lines.replace(matches[0], replacement)
    print("没有匹配到References")
    return lines
|
71 |
+
|
72 |
+
|
73 |
def extract_pdf_pypdf(pdf_dir):
|
74 |
try:
|
75 |
doc = fitz.open(pdf_dir)
|
|
|
107 |
return None
|
108 |
|
109 |
|
110 |
+
def openai_chat_2_step(prompt, file_content):
    """Two-step extraction for articles too long for a single request.

    Step 1: slice ``file_content`` into ~123,000-character chunks and run the
    extraction ``prompt`` on each chunk independently, concatenating the
    per-chunk answers. Step 2: send the concatenated answers back to the model
    and ask it to merge them into a single markdown table.

    Args:
        prompt (str): Extraction instruction appended to each chunk request.
        file_content (str): Full text of the article.

    Returns:
        The merged-table response from ``openai_api`` (``None`` on failure,
        matching ``openai_api``'s own error behaviour).
    """
    all_response = ""
    # NOTE(review): chunks are 123000 *characters*, not tokens — presumably
    # sized to stay under a 128k-token context window; confirm for dense text.
    # If len(file_content) is an exact multiple, the final chunk is empty.
    for i in range(len(file_content)//123000 + 1):
        text = file_content[i*123000:(i+1)*123000]
        # Step 1: extract from this chunk of the split document.
        messages = [
            {
                "role": "system",
                "content": "You are an expert in information extraction from scientific literature.",
            },
            {"role": "user",
             "content": "The following is a scientific article, please read it carefully: \n{" + text + "}\n" + prompt},
        ]
        # Token count is computed for logging only.
        tokens = cal_tokens(messages)
        print("step一: 抽取部分{}:".format(i))
        print("prompt tokens:", tokens)
        response_2_content = openai_api(messages)
        # openai_api returns None on failure; failed chunks are skipped.
        if response_2_content:
            all_response += response_2_content + "\n"

    # Step 2: merge the per-chunk tables into one table.
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text:
'''
{{""" + all_response + """}}
'''
""" + """
Combine the above tables into one table.
Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction.

| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
| Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
| Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
| Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
"""}
    ]
    tokens = cal_tokens(messages)
    print("step二: 合并部分:")
    print("prompt tokens:", tokens)
    response = openai_api(messages)
    return response
|
155 |
+
|
156 |
+
|
157 |
+
def predict(prompt, pdf_file):
|
158 |
if pdf_file is None:
|
159 |
return "Please upload a PDF file to proceed."
|
160 |
|
|
|
168 |
'''
|
169 |
{{""" + file_content + """}}
|
170 |
'''
|
171 |
+
""" + prompt}
|
172 |
]
|
173 |
+
tokens = cal_tokens(messages)
|
174 |
+
print("开始:抽取")
|
175 |
+
print("prompt tokens:", tokens)
|
176 |
+
# time.sleep(20) # claude 需要加这个
|
177 |
+
if tokens > 128000:
|
178 |
+
file_content = del_references(file_content)
|
179 |
+
extract_result = openai_chat_2_step(prompt, file_content)
|
180 |
+
else:
|
181 |
+
extract_result = openai_api(messages)
|
182 |
|
183 |
return extract_result or "Too many users. Please wait a moment!"
|
184 |
|
|
|
257 |
with gr.Row():
|
258 |
viewer_button = gr.Button("View PDF", variant="secondary")
|
259 |
extract_button = gr.Button("Extract Text", variant="primary")
|
260 |
+
|
261 |
with gr.Row():
|
262 |
with gr.Column(scale=1):
|
263 |
+
|
264 |
file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
|
265 |
|
266 |
with gr.Column(scale=1):
|
|
|
279 |
model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
280 |
exp = gr.Button("Example Prompt")
|
281 |
with gr.Row():
|
282 |
+
gen = gr.Button("Generate", variant="primary")
|
283 |
clr = gr.Button("Clear")
|
284 |
outputs = gr.Markdown(label='Output', value="""| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|
285 |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
|