Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,8 @@ import fitz # PyMuPDF
|
|
4 |
from PIL import Image
|
5 |
from pathlib import Path
|
6 |
import os
|
7 |
-
|
|
|
8 |
|
9 |
api_key = os.getenv('API_KEY')
|
10 |
base_url = os.getenv("BASE_URL")
|
@@ -15,6 +16,60 @@ client = OpenAI(
|
|
15 |
)
|
16 |
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def extract_pdf_pypdf(pdf_dir):
|
19 |
try:
|
20 |
doc = fitz.open(pdf_dir)
|
@@ -52,7 +107,54 @@ def openai_api(messages):
|
|
52 |
return None
|
53 |
|
54 |
|
55 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
if pdf_file is None:
|
57 |
return "Please upload a PDF file to proceed."
|
58 |
|
@@ -66,9 +168,17 @@ def predict(input_text, pdf_file):
|
|
66 |
'''
|
67 |
{{""" + file_content + """}}
|
68 |
'''
|
69 |
-
""" +
|
70 |
]
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
return extract_result or "Too many users. Please wait a moment!"
|
74 |
|
@@ -147,8 +257,10 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
|
|
147 |
with gr.Row():
|
148 |
viewer_button = gr.Button("View PDF", variant="secondary")
|
149 |
extract_button = gr.Button("Extract Text", variant="primary")
|
|
|
150 |
with gr.Row():
|
151 |
with gr.Column(scale=1):
|
|
|
152 |
file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
|
153 |
|
154 |
with gr.Column(scale=1):
|
@@ -167,7 +279,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
|
|
167 |
model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
168 |
exp = gr.Button("Example Prompt")
|
169 |
with gr.Row():
|
170 |
-
gen = gr.Button("Generate")
|
171 |
clr = gr.Button("Clear")
|
172 |
outputs = gr.Markdown(label='Output', value="""| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|
173 |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
|
|
|
4 |
from PIL import Image
|
5 |
from pathlib import Path
|
6 |
import os
|
7 |
+
import re
|
8 |
+
import tiktoken
|
9 |
|
10 |
api_key = os.getenv('API_KEY')
|
11 |
base_url = os.getenv("BASE_URL")
|
|
|
16 |
)
|
17 |
|
18 |
|
19 |
+
def cal_tokens(message_data):
    """Estimate the token count of *message_data* with tiktoken.

    The input is stringified first, so any structure (e.g. a list of chat
    message dicts) is counted via its ``str()`` representation — a close
    estimate, not an exact chat-completions token count.

    Args:
        message_data: Any object; counted as ``str(message_data)``.

    Returns:
        int: Number of tokens under the selected encoding.
    """
    print("use tiktoken")
    try:
        # NOTE(review): counts with the gpt-3.5-turbo-0301 encoding even though
        # callers compare the result against a 128k-token budget — confirm the
        # encoding matches the model actually served behind openai_api.
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() prevents encode() from raising ValueError when the
    # extracted PDF text happens to contain special-token markers such as
    # "<|endoftext|>" (tiktoken raises on them by default).
    num_tokens = len(encoding.encode(str(message_data), disallowed_special=()))
    return num_tokens
|
28 |
+
|
29 |
+
|
30 |
+
def del_references(lines):
    """Remove the references/bibliography section from extracted article text.

    A cascade of regex patterns is tried in order, from most specific to most
    permissive: Mathpix-style ``\\section*{...}`` headings first, then plain
    markdown ``#`` headings. The first pattern that matches wins; patterns
    that name a trailing anchor (``Tables``, ``# SUPPLEMENTARY``, ``[^0]``)
    put that anchor back so the content after the references survives.

    Args:
        lines (str): Full article text (markdown/Mathpix output).

    Returns:
        str: Text with the references span removed, or the input unchanged
        when no pattern matches.
    """
    # Alternation of heading spellings that introduce a reference list.
    refs = r'(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)'
    # (pattern, replacement, log message) triples, tried in order.
    attempts = [
        # 1.1 Mathpix md: \section*{References} ... \section*{Tables
        (r'\*\{.{0,5}' + refs + r'(.*?)\\section\*\{Tables',
         "\\section*{Tables\n",
         "1.1.匹配到了References和Tables,删除了References,保留了后面的Tables"),
        # 1.2 Mathpix md: \section*{References} through to the end of the text
        (r'\*\{.{0,5}' + refs + r'(.*)',
         "",
         "1.2.匹配到了References,删除了References"),
        # 2.1 plain md: ## References ... Table(s)
        (r'#.{0,15}' + refs + r'(.*?)(Table|Tables)',
         "Tables",
         "2.1.匹配到了## References和Tables,删除了References,保留了后面的Tables"),
        # 2.2 plain md: ## References ... # SUPPLEMENTARY
        (r'#.{0,15}' + refs + r'(.*?)# SUPPLEMENTARY',
         "# SUPPLEMENTARY",
         "2.2.匹配到了## References和# SUPPLEMENTARY,删除了References,保留了后面的# SUPPLEMENTARY"),
        # 2.3 plain md: ## References ... [^0] footnote marker
        (r'#.{0,15}' + refs + r'(.*)\[\^0\]',
         "[^0]",
         "2.3.匹配到了## References和\\[\\^0\\],删除了References和\\[\\^0\\]之间的内容"),
        # 2.4 plain md: ## References through to the end of the text
        (r'#.{0,15}' + refs + r'(.*)',
         "",
         "2.4.匹配到了## References,删除了References"),
    ]
    for pattern, replacement, message in attempts:
        matches = re.search(pattern, lines, re.DOTALL)
        if matches:
            print(message)
            # str.replace mirrors the original behaviour: every occurrence of
            # the matched span is substituted (normally there is exactly one).
            return lines.replace(matches[0], replacement)
    print("没有匹配到References")
    return lines
|
71 |
+
|
72 |
+
|
73 |
def extract_pdf_pypdf(pdf_dir):
|
74 |
try:
|
75 |
doc = fitz.open(pdf_dir)
|
|
|
107 |
return None
|
108 |
|
109 |
|
110 |
+
def openai_chat_2_step(prompt, file_content):
    """Two-step extraction for articles too long for a single request.

    Step 1: slice ``file_content`` into ~123,000-character chunks and run the
    extraction ``prompt`` on each chunk independently, concatenating the
    per-chunk answers. Step 2: send the concatenated answers back to the model
    and ask it to merge them into a single markdown table.

    Args:
        prompt (str): Extraction instruction appended to each chunk request.
        file_content (str): Full text of the article.

    Returns:
        The merged-table response from ``openai_api`` (``None`` on failure,
        matching ``openai_api``'s own error behaviour).
    """
    all_response = ""
    # NOTE(review): chunks are 123000 *characters*, not tokens — presumably
    # sized to stay under a 128k-token context window; confirm for dense text.
    # If len(file_content) is an exact multiple, the final chunk is empty.
    for i in range(len(file_content)//123000 + 1):
        text = file_content[i*123000:(i+1)*123000]
        # Step 1: extract from this chunk of the split document.
        messages = [
            {
                "role": "system",
                "content": "You are an expert in information extraction from scientific literature.",
            },
            {"role": "user",
             "content": "The following is a scientific article, please read it carefully: \n{" + text + "}\n" + prompt},
        ]
        # Token count is computed for logging only.
        tokens = cal_tokens(messages)
        print("step一: 抽取部分{}:".format(i))
        print("prompt tokens:", tokens)
        response_2_content = openai_api(messages)
        # openai_api returns None on failure; failed chunks are skipped.
        if response_2_content:
            all_response += response_2_content + "\n"

    # Step 2: merge the per-chunk tables into one table.
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text:
'''
{{""" + all_response + """}}
'''
""" + """
Combine the above tables into one table.
Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction.

| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
| Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
| Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
| Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
"""}
    ]
    tokens = cal_tokens(messages)
    print("step二: 合并部分:")
    print("prompt tokens:", tokens)
    response = openai_api(messages)
    return response
|
155 |
+
|
156 |
+
|
157 |
+
def predict(prompt, pdf_file):
|
158 |
if pdf_file is None:
|
159 |
return "Please upload a PDF file to proceed."
|
160 |
|
|
|
168 |
'''
|
169 |
{{""" + file_content + """}}
|
170 |
'''
|
171 |
+
""" + prompt}
|
172 |
]
|
173 |
+
tokens = cal_tokens(messages)
|
174 |
+
print("开始:抽取")
|
175 |
+
print("prompt tokens:", tokens)
|
176 |
+
# time.sleep(20) # claude 需要加这个
|
177 |
+
if tokens > 128000:
|
178 |
+
file_content = del_references(file_content)
|
179 |
+
extract_result = openai_chat_2_step(prompt, file_content)
|
180 |
+
else:
|
181 |
+
extract_result = openai_api(messages)
|
182 |
|
183 |
return extract_result or "Too many users. Please wait a moment!"
|
184 |
|
|
|
257 |
with gr.Row():
|
258 |
viewer_button = gr.Button("View PDF", variant="secondary")
|
259 |
extract_button = gr.Button("Extract Text", variant="primary")
|
260 |
+
|
261 |
with gr.Row():
|
262 |
with gr.Column(scale=1):
|
263 |
+
|
264 |
file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
|
265 |
|
266 |
with gr.Column(scale=1):
|
|
|
279 |
model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
280 |
exp = gr.Button("Example Prompt")
|
281 |
with gr.Row():
|
282 |
+
gen = gr.Button("Generate", variant="primary")
|
283 |
clr = gr.Button("Clear")
|
284 |
outputs = gr.Markdown(label='Output', value="""| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|
285 |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
|