import gradio as gr
import base64
import os
from openai import OpenAI
api_key = os.getenv('API_KEY')
base_url = os.getenv("BASE_URL")
client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)
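
# Assumed setup (not stated in the file itself): API_KEY and BASE_URL are expected to be
# provided as environment variables (e.g. Space secrets) before launch. The OpenAI client
# above is pointed at an OpenAI-compatible endpoint that serves the Claude model requested
# in openai_api() below.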
def extract_pdf_pypdf(pdf_dir):
    # PyMuPDF (imported as fitz) extracts plain text from the PDF page by page.
    import fitz

    path = pdf_dir
    try:
        doc = fitz.open(path)
    except Exception:
        print("cannot read pdf")
        return None

    page_count = doc.page_count
    file_content = ""
    for page in range(page_count):
        text = doc.load_page(page).get_text("text")
        # Note: the table of contents may also contain "References".
        file_content += text + "\n\n"

    return file_content
def openai_api(messages):
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            # timeout=300,
            stream=True
        )
    except Exception as ex:
        print("The API raised the following exception: %s" % ex)
        return None

    if completion:
        try:
            # Collect the streamed chunks and join them into the full response text.
            response_2_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else ""
                               for chunk in completion]
            print("response tokens:", len(response_2_list))  # number of streamed chunks
            response_2_content = ''.join(response_2_list)
            return response_2_content
        except Exception as ex:
            print("The second round raised the following exception: %s" % ex)
            return None
    else:
        print("An exception occurred in the second round")
        return None
def predict(input_text, pdf_file):
    if pdf_file is None:
        return "Please upload a PDF file to proceed."

    file_content = extract_pdf_pypdf(pdf_file.name)
    if file_content is None:
        return "The uploaded PDF could not be read. Please try another file."

    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text:
'''
{{""" + file_content + """}}
'''
""" + input_text}
    ]
    extract_result = openai_api(messages)
    return extract_result or "Too many users. Please wait a moment!"
def view_pdf(pdf_file):
    if pdf_file is None:
        return "Please upload a PDF file to view."
    with open(pdf_file.name, 'rb') as f:
        pdf_data = f.read()
    b64_data = base64.b64encode(pdf_data).decode('utf-8')
    return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
"""
en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format?
If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
"""
examples = [[en_1], [en_2]]
with gr.Blocks(title="PaperExtractGPT") as demo:
    gr.Markdown(
        '''<p align="center">
<h1 align="center"> Paper Extract GPT </h1>
<p> How to use:
<br> <strong>1</strong>: Upload your PDF.
<br> <strong>2</strong>: Click "View PDF" to preview it.
<br> <strong>3</strong>: Enter your extraction prompt in the input box.
<br> <strong>4</strong>: Click "Generate" to extract; the extracted information will be displayed below.
</p>
'''
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown('## Upload PDF')
            file_input = gr.File(label="Upload your PDF", type="filepath")
            viewer_button = gr.Button("View PDF")
            file_out = gr.HTML(label="PDF Preview")

        with gr.Column():
            model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
            example = gr.Examples(examples=examples, inputs=model_input)
            with gr.Row():
                gen = gr.Button("Generate")
                clr = gr.Button("Clear")
            # The default value shows an example of the expected markdown-table output.
            outputs = gr.Markdown(label='Output', show_label=True, value="""| Title | Journal | Year | Author | Institution | Email |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School of Mines, Dhanbad | "" |
""")

    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
    clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
    viewer_button.click(view_pdf, inputs=file_input, outputs=file_out)
demo.launch()