jackkuo committed on
Commit
bfdf08c
·
verified ·
1 Parent(s): b26838d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -29
app.py CHANGED
@@ -1,4 +1,5 @@
1
  from openai import OpenAI
 
2
  import gradio as gr
3
  import fitz # PyMuPDF
4
  from PIL import Image
@@ -28,11 +29,20 @@ def cal_tokens(message_data):
28
  def del_references(lines):
29
  # 定义正则表达式模式
30
  patterns = [
31
- (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables', r'\section*{Tables\n'),
32
- (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', ''),
33
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)', r'Tables'),
34
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY', r'# SUPPLEMENTARY'),
35
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\[\^0\]', r'[^0]'),
 
 
 
 
 
 
 
 
 
36
  (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', '')
37
  ]
38
 
@@ -68,12 +78,25 @@ def extract_pdf_pypdf(pdf_dir):
68
  return file_content
69
 
70
 
71
- def openai_api(messages):
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  try:
73
  completion = client.chat.completions.create(
74
- model="claude-3-5-sonnet-20240620",
75
  messages=messages,
76
- temperature=0.1,
77
  max_tokens=8192,
78
  stream=True
79
  )
@@ -85,10 +108,10 @@ def openai_api(messages):
85
  return None
86
 
87
 
88
- def openai_chat_2_step(prompt, file_content):
89
  all_response = ""
90
- for i in range(len(file_content)//123000 + 1):
91
- text = file_content[i*123000:(i+1)*123000]
92
  # step1: 拆分两部分,前半部分
93
  messages = [
94
  {
@@ -101,7 +124,7 @@ def openai_chat_2_step(prompt, file_content):
101
  tokens = cal_tokens(messages)
102
  print("step一: 抽取部分{}:".format(i))
103
  print("prompt tokens:", tokens)
104
- response_2_content = openai_api(messages)
105
  if response_2_content:
106
  all_response += response_2_content + "\n"
107
 
@@ -128,11 +151,11 @@ Please pay attention to the pipe format as shown in the example below. This form
128
  tokens = cal_tokens(messages)
129
  print("step二: 合并部分:")
130
  print("prompt tokens:", tokens)
131
- response = openai_api(messages)
132
  return response
133
 
134
 
135
- def predict(prompt, file_content):
136
  file_content = del_references(file_content)
137
 
138
  messages = [
@@ -151,9 +174,9 @@ def predict(prompt, file_content):
151
  print("prompt tokens:", tokens)
152
  # time.sleep(20) # claude 需要加这个
153
  if tokens > 128000:
154
- extract_result = openai_chat_2_step(prompt, file_content)
155
  else:
156
- extract_result = openai_api(messages)
157
 
158
  return extract_result or "Too many users. Please wait a moment!"
159
 
@@ -254,6 +277,7 @@ def search_data_golden_Enzyme(keyword, selected_column):
254
  df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
255
  return search_data(df, keyword, selected_column)
256
 
 
257
  def search_data_golden_Ribozyme(keyword, selected_column):
258
  df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
259
  return search_data(df, keyword, selected_column)
@@ -272,20 +296,21 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
272
  <p>How to use:
273
  <br><strong>1</strong>: Upload your PDF.
274
  <br><strong>2</strong>: Click "View PDF" to preview it.
275
- <br><strong>3</strong>: Click "Extract Text" to extract Text.
276
  <br><strong>4</strong>: Enter your extraction prompt in the input box.
277
- <br><strong>5</strong>: Click "Generate" to extract, and the extracted information will display below.
278
  </p>'''
279
  )
280
  file_input = gr.File(label="Upload your PDF", type="filepath")
281
  example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
282
  with gr.Row():
283
  viewer_button = gr.Button("View PDF", variant="secondary")
284
- extract_button = gr.Button("Extract Text", variant="primary")
 
 
285
 
286
  with gr.Row():
287
  with gr.Column(scale=1):
288
-
289
  file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
290
 
291
  with gr.Column(scale=1):
@@ -301,8 +326,17 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
301
  )
302
 
303
  with gr.Column():
304
- model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
 
305
  exp = gr.Button("Example Prompt")
 
 
 
 
 
 
 
 
306
  with gr.Row():
307
  gen = gr.Button("Generate", variant="primary")
308
  clr = gr.Button("Clear")
@@ -335,7 +369,8 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
335
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
336
 
337
  # 设置搜索功能
338
- search_button.click(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
 
339
 
340
  # 将回车事件绑定到搜索按钮
341
  search_box.submit(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
@@ -369,10 +404,12 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
369
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
370
 
371
  # 设置搜索功能
372
- search_button.click(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
 
373
 
374
  # 将回车事件绑定到搜索按钮
375
- search_box.submit(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
 
376
 
377
  # 初始加载整个 CSV 表格
378
  initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
@@ -415,14 +452,13 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
415
  else:
416
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
417
 
418
-
419
 
420
- extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
 
 
421
  exp.click(update_input, outputs=model_input)
422
- gen.click(fn=predict, inputs=[model_input, text_output], outputs=outputs)
423
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
424
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
425
 
426
-
427
  demo.launch()
428
-
 
1
  from openai import OpenAI
2
+ from ocr_mathpix import extract_pdf_mathpix
3
  import gradio as gr
4
  import fitz # PyMuPDF
5
  from PIL import Image
 
29
  def del_references(lines):
30
  # 定义正则表达式模式
31
  patterns = [
32
+ (
33
+ r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables',
34
+ r'\section*{Tables\n'),
35
+ (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)',
36
+ ''),
37
+ (
38
+ r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)',
39
+ r'Tables'),
40
+ (
41
+ r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY',
42
+ r'# SUPPLEMENTARY'),
43
+ (
44
+ r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\[\^0\]',
45
+ r'[^0]'),
46
  (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', '')
47
  ]
48
 
 
78
  return file_content
79
 
80
 
81
+ def extract_pdf_md(pdf_dir):
82
+ print(f"start convert pdf 2 md: {pdf_dir}")
83
+ try:
84
+ content = extract_pdf_mathpix(pdf_folder_dir=os.path.split(pdf_dir)[0], pdf_dir=os.path.split(pdf_dir)[1],
85
+ md_folder_dir=os.path.split(pdf_dir)[0])
86
+ except Exception as e:
87
+ print(f"Error opening PDF: {e}")
88
+ return None
89
+
90
+ return content
91
+
92
+
93
+ def openai_api(messages, model="claude-3-5-sonnet-20240620", temperature=0.1):
94
+ print("use model:", model, "temperature:", temperature)
95
  try:
96
  completion = client.chat.completions.create(
97
+ model=model,
98
  messages=messages,
99
+ temperature=temperature,
100
  max_tokens=8192,
101
  stream=True
102
  )
 
108
  return None
109
 
110
 
111
+ def openai_chat_2_step(prompt, file_content, model, temperature):
112
  all_response = ""
113
+ for i in range(len(file_content) // 123000 + 1):
114
+ text = file_content[i * 123000:(i + 1) * 123000]
115
  # step1: 拆分两部分,前半部分
116
  messages = [
117
  {
 
124
  tokens = cal_tokens(messages)
125
  print("step一: 抽取部分{}:".format(i))
126
  print("prompt tokens:", tokens)
127
+ response_2_content = openai_api(messages, model, temperature)
128
  if response_2_content:
129
  all_response += response_2_content + "\n"
130
 
 
151
  tokens = cal_tokens(messages)
152
  print("step二: 合并部分:")
153
  print("prompt tokens:", tokens)
154
+ response = openai_api(messages, model, temperature)
155
  return response
156
 
157
 
158
+ def predict(prompt, file_content, model="claude-3-5-sonnet-20240620", temperature=0.1):
159
  file_content = del_references(file_content)
160
 
161
  messages = [
 
174
  print("prompt tokens:", tokens)
175
  # time.sleep(20) # claude 需要加这个
176
  if tokens > 128000:
177
+ extract_result = openai_chat_2_step(prompt, file_content, model, temperature)
178
  else:
179
+ extract_result = openai_api(messages, model, temperature)
180
 
181
  return extract_result or "Too many users. Please wait a moment!"
182
 
 
277
  df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
278
  return search_data(df, keyword, selected_column)
279
 
280
+
281
  def search_data_golden_Ribozyme(keyword, selected_column):
282
  df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
283
  return search_data(df, keyword, selected_column)
 
296
  <p>How to use:
297
  <br><strong>1</strong>: Upload your PDF.
298
  <br><strong>2</strong>: Click "View PDF" to preview it.
299
+ <br><strong>3</strong>: Click "Convert to Markdown(Mathpix)/Convert to Text(PyMuPDF)" to extract PDF to Text.
300
  <br><strong>4</strong>: Enter your extraction prompt in the input box.
301
+ <br><strong>5</strong>: Click "Generate" to extract data, and the extracted information will display below.
302
  </p>'''
303
  )
304
  file_input = gr.File(label="Upload your PDF", type="filepath")
305
  example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
306
  with gr.Row():
307
  viewer_button = gr.Button("View PDF", variant="secondary")
308
+ with gr.Row():
309
+ extract_button_md = gr.Button("Convert to Markdown(Mathpix)", variant="primary")
310
+ extract_button_text = gr.Button("Convert to Text(PyMuPDF)", variant="primary")
311
 
312
  with gr.Row():
313
  with gr.Column(scale=1):
 
314
  file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
315
 
316
  with gr.Column(scale=1):
 
326
  )
327
 
328
  with gr.Column():
329
+ model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here',
330
+ label='Input Prompt')
331
  exp = gr.Button("Example Prompt")
332
+ with gr.Row():
333
+ # 模型选择下拉菜单
334
+ model_choices = ["claude-3-5-sonnet-20240620", "gpt-4o-2024-08-06"]
335
+ model_dropdown = gr.Dropdown(choices=model_choices, label="Select Model", value=model_choices[0])
336
+
337
+ # 温度选择滑块
338
+ temp_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label="Temperature", value=0.1)
339
+
340
  with gr.Row():
341
  gen = gr.Button("Generate", variant="primary")
342
  clr = gr.Button("Clear")
 
369
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
370
 
371
  # 设置搜索功能
372
+ search_button.click(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown],
373
+ outputs=search_output)
374
 
375
  # 将回车事件绑定到搜索按钮
376
  search_box.submit(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
 
404
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
405
 
406
  # 设置搜索功能
407
+ search_button.click(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown],
408
+ outputs=search_output)
409
 
410
  # 将回车事件绑定到搜索按钮
411
+ search_box.submit(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown],
412
+ outputs=search_output)
413
 
414
  # 初始加载整个 CSV 表格
415
  initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
 
452
  else:
453
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
454
 
 
455
 
456
+
457
+ extract_button_md.click(extract_pdf_md, inputs=file_input, outputs=text_output)
458
+ extract_button_text.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
459
  exp.click(update_input, outputs=model_input)
460
+ gen.click(fn=predict, inputs=[model_input, text_output, model_dropdown, temp_slider], outputs=outputs)
461
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
462
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
463
 
 
464
  demo.launch()