jackkuo committed (verified)
Commit f1536fe · Parent(s): 67e6ad6

Update app.py

Files changed (1):
  app.py (+117, -5)
app.py CHANGED
@@ -4,7 +4,8 @@ import fitz # PyMuPDF
 from PIL import Image
 from pathlib import Path
 import os
-
+import re
+import tiktoken
 
 api_key = os.getenv('API_KEY')
 base_url = os.getenv("BASE_URL")
@@ -15,6 +16,60 @@ client = OpenAI(
 )
 
 
+def cal_tokens(message_data):
+    print("use tiktoken")
+    try:
+        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
+    except KeyError:
+        print("Warning: model not found. Using cl100k_base encoding.")
+        encoding = tiktoken.get_encoding("cl100k_base")
+    num_tokens = len(encoding.encode(str(message_data)))
+    return num_tokens
+
+
+def del_references(lines):
+    # 1. Mathpix Markdown format: match \section*{REFERENCES}xxxx\section*{Table
+    pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables'
+    matches = re.search(pattern, lines, re.DOTALL)
+    if matches:
+        lines = lines.replace(matches[0], "\section*{Tables\n")
+        print("1.1. Matched References and Tables; removed References and kept the trailing Tables")
+    else:
+        pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
+        matches = re.search(pattern, lines, re.DOTALL)
+        if matches:
+            print("1.2. Matched References; removed References")
+            lines = lines.replace(matches[0], "")
+        else:
+            # 2. Markdown format: match ## REFERENCES
+            pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)'
+            matches = re.search(pattern, lines, re.DOTALL)
+            if matches:
+                lines = lines.replace(matches[0], "Tables")
+                print("2.1. Matched ## References and Tables; removed References and kept the trailing Tables")
+            else:
+                pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY'
+                matches = re.search(pattern, lines, re.DOTALL)
+                if matches:
+                    lines = lines.replace(matches[0], "# SUPPLEMENTARY")
+                    print("2.2. Matched ## References and # SUPPLEMENTARY; removed References and kept the trailing # SUPPLEMENTARY")
+                else:
+                    pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]'
+                    matches = re.search(pattern, lines, re.DOTALL)
+                    if matches:
+                        print("2.3. Matched ## References and \[\^0\]; removed the content between References and \[\^0\]")
+                        lines = lines.replace(matches[0], "[^0]")
+                    else:
+                        pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
+                        matches = re.search(pattern, lines, re.DOTALL)
+                        if matches:
+                            print("2.4. Matched ## References; removed References")
+                            lines = lines.replace(matches[0], "")
+                        else:
+                            print("No References section matched")
+    return lines
+
+
 def extract_pdf_pypdf(pdf_dir):
     try:
         doc = fitz.open(pdf_dir)
@@ -52,7 +107,54 @@ def openai_api(messages):
         return None
 
 
-def predict(input_text, pdf_file):
+def openai_chat_2_step(prompt, file_content):
+    all_response = ""
+    for i in range(len(file_content)//123000 + 1):
+        text = file_content[i*123000:(i+1)*123000]
+        # step 1: split the article into chunks and extract from this chunk
+        messages = [
+            {
+                "role": "system",
+                "content": "You are an expert in information extraction from scientific literature.",
+            },
+            {"role": "user",
+             "content": "The following is a scientific article, please read it carefully: \n{" + text + "}\n" + prompt},
+        ]
+        tokens = cal_tokens(messages)
+        print("step 1: extracting part {}:".format(i))
+        print("prompt tokens:", tokens)
+        response_2_content = openai_api(messages)
+        if response_2_content:
+            all_response += response_2_content + "\n"
+
+    messages = [
+        {
+            "role": "system",
+            "content": "You are an expert in information extraction from scientific literature.",
+        },
+        {"role": "user", "content": """Provided Text:
+    '''
+    {{""" + all_response + """}}
+    '''
+    """ + """
+    Combine the above tables into one table.
+    Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction.
+
+    | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
+    |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
+    | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
+    | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
+    | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
+    """}
+    ]
+    tokens = cal_tokens(messages)
+    print("step 2: merging the partial results:")
+    print("prompt tokens:", tokens)
+    response = openai_api(messages)
+    return response
+
+
+def predict(prompt, pdf_file):
     if pdf_file is None:
         return "Please upload a PDF file to proceed."
 
@@ -66,9 +168,17 @@ def predict(input_text, pdf_file):
     '''
     {{""" + file_content + """}}
     '''
-    """ + input_text}
+    """ + prompt}
     ]
-    extract_result = openai_api(messages)
+    tokens = cal_tokens(messages)
+    print("start: extraction")
+    print("prompt tokens:", tokens)
+    # time.sleep(20)  # needed when using Claude
+    if tokens > 128000:
+        file_content = del_references(file_content)
+        extract_result = openai_chat_2_step(prompt, file_content)
+    else:
+        extract_result = openai_api(messages)
 
     return extract_result or "Too many users. Please wait a moment!"
 
@@ -147,8 +257,10 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
     with gr.Row():
        viewer_button = gr.Button("View PDF", variant="secondary")
        extract_button = gr.Button("Extract Text", variant="primary")
+
    with gr.Row():
        with gr.Column(scale=1):
+
            file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
 
        with gr.Column(scale=1):
@@ -167,7 +279,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
            model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
            exp = gr.Button("Example Prompt")
            with gr.Row():
-                gen = gr.Button("Generate")
+                gen = gr.Button("Generate", variant="primary")
                clr = gr.Button("Clear")
            outputs = gr.Markdown(label='Output', value="""| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
 |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
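For reference, the sketch below shows how the chunk-and-merge routing introduced in this commit could be exercised on its own. It is a hypothetical, minimal approximation rather than code from app.py: llm_call stands in for openai_api(), the cl100k_base encoding is used directly, the References-stripping step (del_references) is omitted, and the 128000-token threshold and 123000-character chunk size simply mirror the values in the diff.

import tiktoken

TOKEN_LIMIT = 128000   # routing threshold used in predict()
CHUNK_CHARS = 123000   # chunk size used in openai_chat_2_step()
SYSTEM_MSG = "You are an expert in information extraction from scientific literature."


def count_tokens(message_data):
    # Same idea as cal_tokens(): serialize the messages and count tokens.
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(str(message_data)))


def route_extraction(prompt, file_content, llm_call):
    # llm_call is a hypothetical stand-in for openai_api(): it takes a list of
    # chat messages and returns the model's text response (or None).
    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": "Provided Text:\n'''\n" + file_content + "\n'''\n" + prompt},
    ]
    if count_tokens(messages) <= TOKEN_LIMIT:
        # Fits in one request: single-shot extraction.
        return llm_call(messages)

    # Too long: extract from each chunk, then ask the model to merge the tables.
    # (app.py additionally strips the References section first via del_references.)
    partial = ""
    for i in range(len(file_content) // CHUNK_CHARS + 1):
        chunk = file_content[i * CHUNK_CHARS:(i + 1) * CHUNK_CHARS]
        answer = llm_call([
            {"role": "system", "content": SYSTEM_MSG},
            {"role": "user", "content": "The following is a scientific article, please read it carefully: \n{" + chunk + "}\n" + prompt},
        ])
        if answer:
            partial += answer + "\n"
    return llm_call([
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": "Provided Text:\n'''\n" + partial + "\n'''\nCombine the above tables into one table."},
    ])


if __name__ == "__main__":
    # Dummy llm_call that returns a fixed table row, just to show the call shape.
    print(route_extraction("Extract the kinetics table.", "toy article text",
                           lambda messages: "| Enzyme | ... |"))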