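"""Streaming chat demo: a Gradio ChatInterface backed by the OpenAI Chat
Completions API, with per-chunk latency logging and UI updates throttled
to at most one yield every 0.25 s."""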
import gradio as gr
from openai import OpenAI
import os
import time
def predict(message, history, system_prompt, model, max_tokens, temperature, top_p):
    # Initialize the OpenAI client
    client = OpenAI(
        api_key=os.environ.get("API_TOKEN"),
    )

    # Start with the system prompt
    messages = [{"role": "system", "content": system_prompt}]
    # Add the conversation history
    messages.extend(history if history else [])
    # Add the current user message
    messages.append({"role": "user", "content": message})

    # Record the start time
    start_time = time.time()

    # Request a streaming response
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=None,
        stream=True,
    )
    full_message = ""
    first_chunk_time = None
    last_yield_time = None
    for chunk in response:
        if chunk.choices and chunk.choices[0].delta.content:
            if first_chunk_time is None:
                first_chunk_time = time.time() - start_time  # Time to the first chunk
            full_message += chunk.choices[0].delta.content
            current_time = time.time()
            chunk_time = current_time - start_time  # Delay of this chunk relative to the request
            print(f"Message received {chunk_time:.2f} seconds after request: {chunk.choices[0].delta.content}")
            # Throttle UI updates to at most one every 0.25 s
            if last_yield_time is None or (current_time - last_yield_time >= 0.25):
                yield full_message
                last_yield_time = current_time

    # Yield any remaining text that didn't meet the time threshold
    if full_message:
        total_time = time.time() - start_time
        # Append timing information to the response message
        full_message += f" (First Chunk: {first_chunk_time:.2f}s, Total: {total_time:.2f}s)"
        yield full_message
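# Wire up the chat UI. With type="messages", Gradio passes history as a list
# of {"role", "content"} dicts, which predict() forwards to the API unchanged;
# additional_inputs appear in an accordion below the chat box.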
gr.ChatInterface(
    fn=predict,
    type="messages",
    # save_history=True,
    # editable=True,
    additional_inputs=[
        gr.Textbox("You are a helpful AI assistant.", label="System Prompt"),
        gr.Dropdown(["gpt-4o", "gpt-4o-mini"], value="gpt-4o", label="Model"),  # Default selection so model is never None
        gr.Slider(800, 4000, value=2000, label="Max Tokens"),
        gr.Slider(0, 1, value=0.7, label="Temperature"),
        gr.Slider(0, 1, value=0.95, label="Top P"),
    ],
    css="footer{display:none !important}",
).launch()
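# To run locally (assuming this file is saved as app.py):
#   export API_TOKEN="sk-..."   # OpenAI API key read via os.environ above
#   python app.py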