import gradio as gr
from openai import OpenAI
import os

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
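# (The token must exist in the environment before launch: in a Space it comes
# from a repository secret, locally you might run e.g. `export HF_TOKEN=hf_...`
# first. The placeholder value is hypothetical; only the name HF_TOKEN must
# match the lookup above.)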

# Initialize the OpenAI API client
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
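# (This base_url is Hugging Face's OpenAI-compatible Inference API endpoint,
# which is why the standard `openai` client can be pointed at it directly.)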

def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed
):
    # Process the incoming message
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System Message: {system_message}")
    print(f"Max Tokens: {max_tokens}, Temperature: {temperature}, Top P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")

    # Convert seed to None if -1 (random)
    if seed == -1:
        seed = None
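    # (With the `openai` client, passing seed=None effectively leaves the
    # request unseeded, so generations can differ between calls.)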

    # Construct the messages list for the API
    messages = [{"role": "system", "content": system_message}]

    # Add conversation history to the context
    for user_message, assistant_message in history:
        if user_message:
            messages.append({"role": "user", "content": user_message})
            print(f"Added user message: {user_message}")
        if assistant_message:
            messages.append({"role": "assistant", "content": assistant_message})
            print(f"Added assistant message: {assistant_message}")

    # Append the latest message
    messages.append({"role": "user", "content": message})
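    # `messages` now has the standard chat-completions shape, e.g.:
    #   [{"role": "system", "content": system_message},
    #    {"role": "user", "content": "..."},
    #    {"role": "assistant", "content": "..."},
    #    {"role": "user", "content": message}]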

    # Initialize response
    response = ""

    # Make the API request
    for chunk in client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct",
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        stream=True,
    ):
        # Extract the token text from the streaming delta; some chunks
        # (e.g. the final one) may carry no content, so guard against None
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response
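# A minimal sanity check, assuming HF_TOKEN is set (run outside Gradio):
#
#   for partial in respond("Hello!", [], "You are helpful.", 64, 0.7, 0.95, 0.0, -1):
#       print(partial)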

# Create the Gradio Chatbot component
chatbot = gr.Chatbot(height=600)

# Define the Gradio ChatInterface. ChatInterface supplies the message textbox
# and the conversation history itself and passes them to `respond` as the
# first two arguments; everything in `additional_inputs` is appended after
# them, in order, matching the rest of the `respond` signature.
demo = gr.ChatInterface(
    fn=respond,
    chatbot=chatbot,
    additional_inputs=[
        gr.Textbox(value="", label="System Message"),
        gr.Slider(minimum=10, maximum=200, value=200, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.05, label="Top P"),
        gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty"),
        gr.Slider(minimum=-1, maximum=1000000, value=-1, step=1, label="Seed (-1 for random)"),
    ],
    theme="Nymbo/Nymbo_Theme",
)

# Create the "Featured Models" accordion
with gr.Accordion("Featured Models", open=True) as featured_models:
    # Textbox for searching models
    model_search = gr.Textbox(label="Filter Models")
    # List of featured models
    models = [
        "meta-llama/Llama-3.3-70B-Instruct",
        "meta-llama/Llama-2-70B-Chat-hf",
        "TheBloke/Llama-2-13B-Chat-GGML",
        "TheBloke/Llama-2-70B-Chat-GGML",
        "TheBloke/Llama-2-13B-Chat-GGML-v2",
        "TheBloke/Llama-2-70B-Chat-GGML-v2",
        "TheBloke/Llama-2-70B-Chat-HF-API-compatible-GGML",
        "TheBloke/Llama-2-70b-chat-hf",
        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
        "TheBloke/Llama-7-13B-Chat-GGML-v2-32K",
        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
        # Add more models as needed...
    ]
    # Radio buttons for selecting a model
    model_radio = gr.Radio(choices=models, label="Select a Model")

    # Update the model list based on search input
    def filter_models(search_term):
        filtered_models = [model for model in models if search_term.lower() in model.lower()]
        return gr.update(choices=filtered_models)

    # Update the model list when the search box is used
    model_search.change(filter_models, inputs=model_search, outputs=model_radio)

# Create a "Custom Model" textbox
custom_model = gr.Textbox(label="Custom Model", placeholder="Hugging Face model path")

# Create the "Information" tab
with gr.Tab("Information"):
    # Featured Models accordion
    with gr.Accordion("Featured Models", open=False):
        gr.Markdown(
            """
            # Featured Models

            Here's a list of some popular models available on Hugging Face:

            - meta-llama/Llama-3.3-70B-Instruct
            - meta-llama/Llama-2-70B-Chat-hf
            - TheBloke/Llama-2-13B-Chat-GGML
            - TheBloke/Llama-2-70B-Chat-GGML
            - TheBloke/Llama-2-13B-Chat-GGML-v2
            - TheBloke/Llama-2-70B-Chat-GGML-v2
            - ... (and many more)

            You can search and select a model from the list above, or use your own custom model path.
            """
        )

    # Parameters Overview accordion
    with gr.Accordion("Parameters Overview", open=False):
        gr.Markdown(
            """
            # Parameters Overview

            Here's a brief explanation of the parameters you can adjust:

            - **Max Tokens**: The maximum number of tokens to generate in the response.
            - **Temperature**: Controls the randomness of the output. Higher values make the output more random.
            - **Top P**: Also known as nucleus sampling, it filters the least probable tokens, encouraging the model to be more creative.
            - **Frequency Penalty**: Penalizes repeated tokens to avoid repetition.
            - **Seed**: A fixed seed for reproducibility. Use -1 for a random seed.

            Feel free to experiment with these settings to achieve the desired output.
            """
        )

# Launch the Gradio interface
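# share=True also exposes a temporary public gradio.live URL; drop it (or set
# share=False) to serve locally only.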
demo.launch(share=True)