File size: 25,559 Bytes
ccfd0bd
bfdf08c
ccfd0bd
 
 
 
 
f1536fe
 
8c2d709
e43077d
ccfd0bd
 
 
 
1f32a6b
ccfd0bd
 
f1536fe
 
 
 
 
 
 
 
 
 
 
 
1f32a6b
 
bfdf08c
 
 
 
 
 
 
 
 
 
 
 
 
 
1f32a6b
 
 
 
f1536fe
 
1f32a6b
 
 
 
 
 
f1536fe
 
 
ccfd0bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e43077d
 
bfdf08c
 
 
 
 
 
 
 
 
 
 
 
 
 
ccfd0bd
 
bfdf08c
ccfd0bd
bfdf08c
ccfd0bd
 
 
 
 
 
 
 
 
 
 
bfdf08c
f1536fe
bfdf08c
 
f1536fe
 
 
 
 
 
 
 
 
 
 
 
bfdf08c
f1536fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfdf08c
f1536fe
 
 
bfdf08c
1f32a6b
ccfd0bd
 
 
 
 
 
 
 
 
 
f1536fe
ccfd0bd
f1536fe
 
 
 
 
bfdf08c
f1536fe
bfdf08c
ccfd0bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71a363f
ccfd0bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6187478
 
f970a03
6187478
 
1f32a6b
 
 
8c2d709
 
1f32a6b
8c2d709
1f32a6b
8c2d709
 
1f32a6b
8c2d709
 
1f32a6b
 
8c2d709
 
 
 
 
1f32a6b
8c2d709
 
 
 
 
 
 
 
 
 
 
 
 
1f32a6b
 
 
8c2d709
bfdf08c
1f32a6b
 
 
7ce8079
8c2d709
1f32a6b
 
 
8c2d709
 
ccfd0bd
8c2d709
 
 
 
 
 
 
3a5b282
8c2d709
bfdf08c
8c2d709
 
 
 
 
 
bfdf08c
 
f6dbe48
8c2d709
 
 
72c9856
8c2d709
 
 
 
 
 
 
 
 
 
 
 
 
 
bfdf08c
 
8c2d709
bfdf08c
 
 
 
 
 
 
 
8c2d709
 
 
 
 
 
 
 
1f32a6b
8c2d709
1f32a6b
8c2d709
 
 
f970a03
dbc68cb
a7de127
dbc68cb
 
8c2d709
 
1f32a6b
8c2d709
f970a03
8c2d709
 
 
 
 
 
 
 
bfdf08c
 
8c2d709
 
1f32a6b
8c2d709
1f32a6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfdf08c
 
1f32a6b
 
bfdf08c
 
1f32a6b
 
 
8c2d709
 
 
 
 
 
 
 
dbc68cb
 
8c2d709
dbc68cb
15e6105
dbc68cb
8c2d709
 
1f32a6b
8c2d709
 
 
 
 
 
 
 
 
 
1f32a6b
8c2d709
 
1f32a6b
8c2d709
1f32a6b
 
8c2d709
 
 
 
ccfd0bd
1f32a6b
bfdf08c
 
6187478
bfdf08c
ccfd0bd
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
from openai import OpenAI
from ocr_mathpix import extract_pdf_mathpix
import gradio as gr
import fitz  # PyMuPDF
from PIL import Image
from pathlib import Path
import os
import re
import tiktoken
import pandas as pd
import functools

# API credentials and endpoint are read from the environment so secrets never
# live in source control.
api_key = os.getenv('API_KEY')
base_url = os.getenv("BASE_URL")

# Shared OpenAI-compatible client; BASE_URL may point at a proxy that also
# serves Claude models (see the model choices offered in the UI below).
client = OpenAI(api_key=api_key, base_url=base_url)


def cal_tokens(message_data):
    """Return the tiktoken token count of *message_data* (stringified).

    Falls back to the generic ``cl100k_base`` encoding when the pinned
    model name is unknown to the installed tiktoken version.
    """
    print("use tiktoken")
    try:
        encoder = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(str(message_data)))


def del_references(lines):
    """Strip the References/bibliography section from article text *lines*.

    Each (pattern, replacement) pair targets a different way the References
    heading and the section that follows it appear in Mathpix/markdown or
    plain-text output.  The first pattern that matches wins: the matched
    span is removed while the replacement re-anchors whatever comes after
    the bibliography (tables, supplementary material, footnotes).
    Returns *lines* unchanged when nothing matches.
    """
    heading = r'(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)'
    patterns = (
        (r'\*\{.{0,5}' + heading + r'(.*?)\\section\*\{Tables', r'\section*{Tables' + '\\n'),
        (r'\*\{.{0,5}' + heading + r'(.*)', ''),
        (r'#.{0,15}' + heading + r'(.*?)(Table|Tables)', r'Tables'),
        (r'#.{0,15}' + heading + r'(.*?)# SUPPLEMENTARY', r'# SUPPLEMENTARY'),
        (r'#.{0,15}' + heading + r'(.*?)\[\^0\]', r'[^0]'),
        (r'#.{0,15}' + heading + r'(.*)', ''),
    )

    matched = None
    for pattern, replacement in patterns:
        hit = re.search(pattern, lines, re.DOTALL)
        if hit is not None:
            matched = (pattern, replacement)
            lines = lines.replace(hit[0], replacement)
            print(f"匹配到了 {pattern}, 删除了 References, 保留了后面的 {replacement}")
            break
    if matched is None:
        print("没有匹配到 References")

    return lines


def extract_pdf_pypdf(pdf_dir):
    """Extract plain text from every page of the PDF at *pdf_dir* via PyMuPDF.

    Returns the page texts concatenated with blank-line separators, or None
    when the document cannot be opened.  Pages that fail to render are
    skipped with a warning instead of aborting the whole extraction.
    """
    try:
        doc = fitz.open(pdf_dir)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None

    try:
        parts = []
        for page in range(doc.page_count):
            try:
                parts.append(doc.load_page(page).get_text("text") + "\n\n")
            except Exception as e:
                print(f"Error reading page {page}: {e}")
                continue
        # join once instead of repeated += (avoids quadratic concatenation)
        return "".join(parts)
    finally:
        # fitz documents hold OS-level resources; the original leaked the
        # handle by never closing it.
        doc.close()

# Cache size can be tuned as needed.
@functools.lru_cache(maxsize=128)
def extract_pdf_md(pdf_dir):
    """Convert the PDF at *pdf_dir* to markdown via the Mathpix OCR helper.

    Results are cached per path; returns None when conversion fails.
    """
    print(f"start convert pdf 2 md: {pdf_dir}")
    folder, filename = os.path.split(pdf_dir)
    try:
        return extract_pdf_mathpix(pdf_folder_dir=folder, pdf_dir=filename,
                                   md_folder_dir=folder)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None


def openai_api(messages, model="claude-3-5-sonnet-20240620", temperature=0.1):
    """Stream a chat completion for *messages* and return the assembled text.

    Returns None when the API call fails for any reason.
    """
    print("use model:", model, "temperature:", temperature)
    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=8192,
            stream=True,
        )
        parts = []
        for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                parts.append(delta)
        return ''.join(parts)
    except Exception as ex:
        print("API error:", ex)
        return None


def openai_chat_2_step(prompt, file_content, model, temperature):
    """Two-step extraction for articles that exceed the model's context window.

    Step 1: split *file_content* into fixed-size character chunks and run the
    extraction *prompt* on each chunk, concatenating the per-chunk answers.
    Step 2: ask the model to merge the per-chunk tables into one
    pipe-delimited table.

    Returns the merged table text, or None if the final API call failed.
    """
    chunk_size = 123000  # chars per request; was a magic number repeated three times
    all_response = ""
    for i in range(len(file_content) // chunk_size + 1):
        text = file_content[i * chunk_size:(i + 1) * chunk_size]
        if not text:
            # When len(file_content) is an exact multiple of chunk_size the
            # last slice is empty -- skip the pointless API call.
            continue
        # step 1: extract from this chunk
        messages = [
            {
                "role": "system",
                "content": "You are an expert in information extraction from scientific literature.",
            },
            {"role": "user",
             "content": "The following is a scientific article, please read it carefully: \n{" + text + "}\n" + prompt},
        ]
        tokens = cal_tokens(messages)
        print("step一: 抽取部分{}:".format(i))
        print("prompt tokens:", tokens)
        response_2_content = openai_api(messages, model, temperature)
        if response_2_content:
            all_response += response_2_content + "\n"

    # step 2: merge the per-chunk tables into a single table
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text:
'''
{{""" + all_response + """}}
'''
                                    """ + """
Combine the above tables into one table.
Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction.

| Enzyme     | Organism          | Substrate   | Km  | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
| Enzyme1    | Bacillus subtilis | Substrate_A | 7.3 | mM      | 6.4  | s^-1      | 1.4 × 10^4   | M^-1s^-1     | 37°C             | 5.0            | WT                 | NADP^+                  |
| Enzyme2    | Escherichia coli  | Substrate_B | 5.9 | mM      | 9.8  | s^-1      | 29000   | mM^-1min^-1  | 60°C             | 10.0           | Q176E             | NADPH                   |
| Enzyme3    | Homo sapiens      | Substrate_C | 6.9 | mM      | 15.6 | s^-1      | 43000   | µM^-1s^-1    | 65°C             | 8.0            | T253S             | NAD^+                   |
                                    """}
    ]
    tokens = cal_tokens(messages)
    print("step二: 合并部分:")
    print("prompt tokens:", tokens)
    response = openai_api(messages, model, temperature)
    return response


def predict(prompt, file_content, model="claude-3-5-sonnet-20240620", temperature=0.1):
    """Run the extraction *prompt* against *file_content*.

    References are stripped first; when the prompt would exceed the context
    window the two-step chunked path is used instead of a single call.
    """
    file_content = del_references(file_content)

    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text:
    '''
    {{""" + file_content + """}}
    '''
                                        """ + prompt}
    ]
    tokens = cal_tokens(messages)
    print("开始:抽取")
    print("prompt tokens:", tokens)
    # time.sleep(20)  # required when calling claude
    if tokens <= 128000:
        extract_result = openai_api(messages, model, temperature)
    else:
        extract_result = openai_chat_2_step(prompt, file_content, model, temperature)

    if extract_result:
        return extract_result
    return "Too many users. Please wait a moment!"


def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
    """Render each page of *pdf_path* to a high-DPI PNG.

    Images are written to *image_folder* as ``page_<n>.png`` (1-based) and
    the list of file paths is returned in page order.
    """
    # make sure the output folder exists
    os.makedirs(image_folder, exist_ok=True)

    document = fitz.open(pdf_path)
    paths = []

    # render every page at the requested DPI
    for index, page in enumerate(document, start=1):
        pix = page.get_pixmap(dpi=dpi)
        target = Path(image_folder) / f"page_{index}.png"
        Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(target)
        paths.append(str(target))

    document.close()
    return paths


@functools.lru_cache(maxsize=128)
def display_pdf_images(file):
    """Return page-image paths for *file* (cached) for the gallery viewer."""
    return convert_pdf_to_images(file)


# Example English extraction prompt: loaded by the "Example Prompt" button and
# used as the prompt textbox's default value.  It instructs the model to emit
# a 13-column pipe-delimited table of enzyme kinetics parameters.
en_1 = """Please read the scientific article provided and extract detailed information about enzymes from a specific organism, focusing on variants or mutants. Your focus should be on data related to the enzyme's activity on substrates at specific concentrations, under certain pH levels and temperatures, and in the presence of different cofactors or cosubstrates at various concentrations. It is essential to identify and record the enzymatic kinetics parameters: Km, Kcat, and Kcat/Km values under these conditions.

Organize all this information into a table with 13 columns titled: Enzyme, Organism, Substrate, Km, Unit_Km, Kcat, Unit_Kcat, Kcat/Km, Unit_Kcat/Km, Commentary[Temp], Commentary[pH], Commentary[Mutant], and Commentary[Cosubstrate].

While performing the tasks, please pay special attention to the following points:
1. Unit retention: Unit_Km, Unit_Kcat, Unit_Kcat/Km should be recorded and output exactly as they appeared in the tables from the Scientific Article Fraction.
2. Scientific Notation: For values in the table that are derived from the article’s headers containing scientific notations, ensure that the actual values entered into the table reflect these notations accordingly. For instance, if an original table specifies 'Kcat/Km × 10^4 (M^-1s^-1)' in table header, then the value entered under 'Kcat/Km' of your table should be '1.4 × 10^4' without any unit if 1.4 was the original figure. Importantly, enter its respective unit 'M^-1s^-1' under 'Unit_Kcat/Km' in your table. Apply this method for each relevant entry, preserving the scientific notation detail as provided in the article. Conversely, for headers not involving scientific notations, simply transcribe values and units as they are, without adding or altering the notation form.
3. Pure Numbers and Units: Please ensure that all numerical values in the columns of 'Km', 'Kcat', and 'Kcat/Km' are entered as pure numbers without any accompanying units. The corresponding units must be placed in their respective 'Unit' columns only, such as 'Unit_Km', 'Unit_Kcat', and 'Unit_Kcat/Km'. This separation of values and units is critical to maintain clarity and consistency in the data representation.
4. Mean Values Only: I need you to include only the mean values, excluding standard deviations or errors, while standard deviations or errors might be indicated after '±' or be wrapped in '()'.
5. Full Forms: In the case that abbreviated or shortened forms are used in the entries of certain tables or other informative text, endeavor to trace back to the full forms of these abbreviations in the Scientific Article Fraction and reflect them in the tables you are organizing.
6. Data Derivation: All data must be derived solely from the unit conversion of the Scientific Article Fraction provided, not from any calculations. For example, do not calculate the Kcat/Km ratio by dividing perceived Kcat data by Km data; only use pre-existing Kcat/Km values from the Scientific Article Fraction.
7. Ensure that each row of the table corresponds to a unique set of conditions and their respective kinetic parameters for the enzyme being measured.


Output the table using the pipe symbol (|) as the delimiter, ensuring each entry is separated by a pipe symbol and properly aligned to maintain the structure of the table. I need you to include only the mean values, excluding standard deviations or errors, while standard deviations or errors might be indicated after '±' or be wrapped in '()'. Include all details and rows in the output, providing a comprehensive extraction of every data point without omissions. Format the complete table data clearly, ensuring that every piece of information is included and no data points are left out. Do not use ellipses or any other form of indication suggesting information is continued elsewhere. The full dataset must be provided as per the structure above, ensuring the integrity and usability of the data for subsequent analyses or applications. Present the complete table data in a clear and organized format in your response, without the need for further confirmation or prompts.

Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction.

| Enzyme     | Organism          | Substrate   | Km  | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
| Enzyme1    | Bacillus subtilis | Substrate_A | 7.3 | mM      | 6.4  | s^-1      | 1.4 × 10^4   | M^-1s^-1     | 37°C             | 5.0            | WT                 | NADP^+                  |
| Enzyme2    | Escherichia coli  | Substrate_B | 5.9 | mM      | 9.8  | s^-1      | 29000   | mM^-1min^-1  | 60°C             | 10.0           | Q176E             | NADPH                   |
| Enzyme3    | Homo sapiens      | Substrate_C | 6.9 | mM      | 15.6 | s^-1      | 43000   | µM^-1s^-1    | 65°C             | 8.0            | T253S             | NAD^+                   |

Structure your responses to allow for seamless concatenation, presenting all tabular data from a scientific article as a single table, even if the original content had multiple tables. Use the full response capacity to maximize data presentation, avoiding summarizations, commentaries, or introductions at the end of each response. The subsequent response should pick up precisely where the preceding one concluded, commencing from the following character, without the necessity to reiterate the table header or the fragmented words. This method ensures the table is presented completely and seamlessly, despite character limit constraints. Please start by outputting the first segment of the table according to these guidelines.
"""


def update_input():
    """Return the example prompt so the UI can load it into the prompt box."""
    return en_1


# CSV datasets served by the three browse/search tabs of the UI below.
CSV_FILE_PATH_Golden_Benchmark_Enzyme = "static/Golden Benchmark for Enzyme Kinetics.csv"
CSV_FILE_PATH_Golden_Benchmark_Ribozyme = "static/Golden Benchmark for Ribozyme Kinetics.csv"
CSV_FILE_PATH_LLENKA_Dataset = "static/3450_merged_data_2000_lines.csv"  # first 2000 rows of LLENKA


def load_csv(CSV_FILE_PATH):
    """Load the CSV at *CSV_FILE_PATH* into a DataFrame.

    On any failure an error-message string is returned instead of raising,
    so callers can surface the message directly in the UI.
    """
    try:
        return pd.read_csv(CSV_FILE_PATH)
    except Exception as e:
        return f"Error loading CSV file: {e}"


def get_column_names(CSV_FILE_PATH):
    """Return the CSV's column names, or [] when loading failed."""
    df = load_csv(CSV_FILE_PATH)
    # load_csv signals failure by returning a message string, not a frame
    return [] if isinstance(df, str) else list(df.columns)


def search_data(df, keyword, selected_column):
    """Filter *df* to rows whose *selected_column* contains *keyword*.

    *df* may also be an error-message string from load_csv, which is passed
    straight through.  Returns an HTML table of the matching rows, or a
    message string when the column is invalid or nothing matches.
    """
    if isinstance(df, str):  # upstream load failure: propagate the message
        return df
    if selected_column not in df.columns:
        return "Invalid column selected."

    # regex=False treats the user's keyword as a literal, so input such as
    # '(' or '+' searches as typed instead of raising a regex error.
    filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False, regex=False)]

    if filtered_df.empty:
        return "No results found."
    return filtered_df.to_html(classes='data', index=False, header=True)


def search_data_golden_Enzyme(keyword, selected_column):
    """Search the enzyme golden-benchmark CSV."""
    return search_data(load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme), keyword, selected_column)


def search_data_golden_Ribozyme(keyword, selected_column):
    """Search the ribozyme golden-benchmark CSV."""
    return search_data(load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme), keyword, selected_column)


def search_data_LLENKA(keyword, selected_column):
    """Search the LLENKA dataset CSV (first 2000 rows)."""
    return search_data(load_csv(CSV_FILE_PATH_LLENKA_Dataset), keyword, selected_column)


# ---------------------------------------------------------------------------
# Gradio UI: one tab for PDF -> table extraction, plus three read-only tabs
# for browsing/searching the benchmark and LLENKA CSV datasets.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
    with gr.Tabs():
        with gr.Tab("Automated Enzyme Kinetics Extractor"):
            gr.Markdown(
                '''<h1 align="center"> Automated Enzyme Kinetics Extractor </h1>
                <p>How to use:
                <br><strong>1</strong>: Upload your PDF.
                <br><strong>2</strong>: Click "View PDF" to preview it.
                <br><strong>3</strong>: Click "Convert to Markdown(Mathpix)" or "Convert to Text(PyMuPDF)" to Convert PDF to MD/Text.
                <br><strong>4</strong>: Enter your extraction prompt in the input box.
                <br><strong>5</strong>: Click "Generate" to extract data, and the extracted information will display below.
                </p>'''
            )
            file_input = gr.File(label="Upload your PDF", type="filepath")
            example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
            with gr.Row():
                viewer_button = gr.Button("View PDF", variant="secondary")
                with gr.Row():
                    extract_button_md = gr.Button("Convert to Markdown(Mathpix)", variant="primary")
                    extract_button_text = gr.Button("Convert to Text(PyMuPDF)", variant="secondary")

            with gr.Row():
                with gr.Column(scale=1):
                    file_out = gr.Gallery(label="PDF Viewer", columns=1, rows=1, height="820px", preview=True)

                with gr.Column(scale=1):
                    text_output = gr.Textbox(
                        label="Extracted Text",
                        interactive=True,
                        placeholder="Extracted text will appear here...",
                        lines=39,
                        max_lines=39,  # maximum rows shown; a scrollbar appears past this
                        autoscroll=False,  # do not auto-scroll to the bottom
                        show_copy_button=True,
                        elem_id="text-output"
                    )

            with gr.Column():
                model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here',
                                         label='Input Prompt')
                exp = gr.Button("Example Prompt")
                with gr.Row():
                    # model selection dropdown
                    model_choices = ["claude-3-5-sonnet-20240620", "gpt-4o-2024-08-06"]
                    model_dropdown = gr.Dropdown(choices=model_choices, label="Select Model", value=model_choices[0])

                    # sampling temperature slider
                    temp_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label="Temperature", value=0.1)

                with gr.Row():
                    gen = gr.Button("Generate", variant="primary")
                    clr = gr.Button("Clear")
                outputs = gr.Markdown(label='Output', value="""| Enzyme     | Organism          | Substrate   | Km  | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
        |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
        | Enzyme1    | Bacillus subtilis | Substrate_A | 7.3 | mM      | 6.4  | s^-1      | 1.4 × 10^4   | M^-1s^-1     | 37°C             | 5.0            | WT                 | NADP^+                  |
        | Enzyme2    | Escherichia coli  | Substrate_B | 5.9 | mM      | 9.8  | s^-1      | 29000   | mM^-1min^-1  | 60°C             | 10.0           | Q176E             | NADPH                   |
        | Enzyme3    | Homo sapiens      | Substrate_C | 6.9 | mM      | 15.6 | s^-1      | 43000   | µM^-1s^-1    | 65°C             | 8.0            | T253S             | NAD^+                   |
        
        """)
        with gr.Tab("Golden Benchmark for Enzyme Kinetics"):
            gr.Markdown(
                '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
                </p>'''
            )
            gr.Markdown("""
                dataset can be download in [LLM-Enzyme-Kinetics-Golden-Benchmark](https://huggingface.co/datasets/jackkuo/LLM-Enzyme-Kinetics-Golden-Benchmark)
            """)

            with gr.Row():
                # choose which column to search
                column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
                column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)

                # keyword search box
                search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
            # the search runs when this button is clicked
            search_button = gr.Button("Search", variant="primary")

            search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)

            # wire up the search button
            search_button.click(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown],
                                outputs=search_output)

            # bind Enter in the search box to the same search
            search_box.submit(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)

            # initial load: show the whole CSV
            initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
            if isinstance(initial_output, str):
                search_output.value = initial_output  # assign the error message directly
            else:
                search_output.value = initial_output.to_html(classes='data', index=False, header=True)

        with gr.Tab("Golden Benchmark for Ribozyme Kinetics"):
            gr.Markdown(
                '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
                </p>'''
            )
            gr.Markdown("""
                dataset can be download in [LLM-Ribozyme-Kinetics-Golden-Benchmark](https://huggingface.co/datasets/jackkuo/LLM-Ribozyme-Kinetics-Golden-Benchmark)
            """)

            with gr.Row():
                # choose which column to search
                column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
                column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)

                # keyword search box
                search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
            # the search runs when this button is clicked
            search_button = gr.Button("Search", variant="primary")

            search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)

            # wire up the search button
            search_button.click(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown],
                                outputs=search_output)

            # bind Enter in the search box to the same search
            search_box.submit(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown],
                              outputs=search_output)

            # initial load: show the whole CSV
            initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
            if isinstance(initial_output, str):
                search_output.value = initial_output  # assign the error message directly
            else:
                search_output.value = initial_output.to_html(classes='data', index=False, header=True)

        with gr.Tab(" LLM Enzyme Kinetics Archive (LLENKA)"):
            gr.Markdown(
                '''<h1 align="center">  LLM Enzyme Kinetics Archive (LLENKA) Viewer with Advanced Search </h1>
                </p>
                '''
            )
            gr.Markdown("""
                Since the entire data set is relatively large, only the first 2,000 rows are shown here. The complete dataset can be download in [LLM-Enzyme-Kinetics-Archive-LLENKA](https://huggingface.co/datasets/jackkuo/LLM-Enzyme-Kinetics-Archive-LLENKA)
            """)
            with gr.Row():
                # choose which column to search
                column_names = get_column_names(CSV_FILE_PATH_LLENKA_Dataset)
                column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)

                # keyword search box
                search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
            # the search runs when this button is clicked
            search_button = gr.Button("Search", variant="primary")

            search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)

            # wire up the search button
            search_button.click(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)

            # bind Enter in the search box to the same search
            search_box.submit(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)

            # initial load: show the whole CSV
            initial_output = load_csv(CSV_FILE_PATH_LLENKA_Dataset)
            if isinstance(initial_output, str):
                search_output.value = initial_output  # assign the error message directly
            else:
                search_output.value = initial_output.to_html(classes='data', index=False, header=True)


    # event wiring for the extractor tab
    extract_button_md.click(extract_pdf_md, inputs=file_input, outputs=text_output)
    extract_button_text.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
    exp.click(update_input, outputs=model_input)
    gen.click(fn=predict, inputs=[model_input, text_output, model_dropdown, temp_slider], outputs=outputs)
    clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
    viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)

demo.launch()