File size: 2,461 Bytes
691ea78
0231bb1
 
 
691ea78
0231bb1
691ea78
0231bb1
 
 
 
691ea78
0231bb1
 
691ea78
0231bb1
 
691ea78
0231bb1
691ea78
 
0231bb1
 
691ea78
0231bb1
 
 
 
691ea78
 
 
0231bb1
 
 
691ea78
0231bb1
 
 
691ea78
0231bb1
 
 
 
691ea78
0231bb1
 
 
 
 
 
 
 
691ea78
0231bb1
 
 
 
 
 
691ea78
0231bb1
 
 
 
 
 
 
 
 
 
 
 
553555d
0231bb1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
from openai import OpenAI
import os
import time

def predict(message, history, system_prompt, model, max_tokens, temperature, top_p):
    """Stream a chat completion from the OpenAI API for a Gradio ChatInterface.

    Yields progressively longer partial responses (throttled to roughly one
    UI update per 0.25 s), and appends first-chunk / total latency figures
    to the final yielded message.

    Args:
        message: Current user message (str).
        history: Prior conversation as a list of {"role", "content"} dicts
            (Gradio "messages" format); may be None/empty on the first turn.
        system_prompt: System instruction placed first in the message list.
        model: Model identifier, e.g. "gpt-4o".
        max_tokens: Response token cap. Gradio sliders deliver floats, so
            this is cast to int before the API call.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        str: The accumulated assistant reply so far.
    """
    # A fresh client per call is cheap and keeps the function stateless.
    client = OpenAI(
        api_key=os.environ.get("API_TOKEN"),
    )

    # Build the message list: system prompt, then history, then the new turn.
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(history or [])
    messages.append({"role": "user", "content": message})

    # Record the start time for latency reporting.
    start_time = time.time()

    # Streaming response.
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        # Sliders yield floats; the API requires an integer token count.
        max_tokens=int(max_tokens),
        temperature=temperature,
        top_p=top_p,
        stop=None,
        stream=True
    )

    full_message = ""
    first_chunk_time = None   # seconds from request to first content chunk
    last_yield_time = None    # wall-clock time of the last UI update

    for chunk in response:
        # Some chunks (e.g. the final stop chunk) carry no content delta.
        if chunk.choices and chunk.choices[0].delta.content:
            if first_chunk_time is None:
                first_chunk_time = time.time() - start_time  # Record time for the first chunk

            full_message += chunk.choices[0].delta.content
            current_time = time.time()
            chunk_time = current_time - start_time  # calculate the time delay of the chunk
            print(f"Message received {chunk_time:.2f} seconds after request: {chunk.choices[0].delta.content}")  

            # Throttle UI updates to at most ~4 per second.
            if last_yield_time is None or (current_time - last_yield_time >= 0.25):
                yield full_message
                last_yield_time = current_time

    # Ensure to yield any remaining message that didn't meet the time threshold.
    # (full_message non-empty implies first_chunk_time was set above.)
    if full_message:
        total_time = time.time() - start_time
        # Append timing information to the response message
        full_message += f" (First Chunk: {first_chunk_time:.2f}s, Total: {total_time:.2f}s)"
        yield full_message

# Assemble the chat UI, then start the local web server.
chat_ui = gr.ChatInterface(
    fn=predict,
    type="messages",
    #save_history=True,
    #editable=True,
    additional_inputs=[
        # Extra controls shown below the chat box; passed to predict() in order.
        gr.Textbox("You are a helpful AI assistant.", label="System Prompt"),
        gr.Dropdown(["gpt-4o", "gpt-4o-mini"], label="Model"),
        gr.Slider(800, 4000, value=2000, label="Max Token"),
        gr.Slider(0, 1, value=0.7, label="Temperature"),
        gr.Slider(0, 1, value=0.95, label="Top P"),
    ],
    inputs=None,
    # Hide the Gradio footer.
    css="footer{display:none !important}"
)
chat_ui.launch()