import os

import gradio as gr
import requests

# Endpoint / model configuration, hoisted so they are easy to change in one place.
API_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
MODEL = "meta/llama-4-maverick-17b-128e-instruct"


def talk_to_llama(prompt: str) -> str:
    """Send *prompt* to the NVIDIA-hosted LLaMA 4 Maverick model and return its reply.

    Returns the assistant message content on success; on any failure
    (network error, non-JSON body, error-shaped response) returns a
    string beginning with "Error:\n" so the Gradio UI can display it.
    """
    # SECURITY: never hard-code API keys in source — the previous version embedded
    # a live "nvapi-..." token, which must be revoked. Read it from the
    # environment instead (export NVIDIA_API_KEY=... before launching).
    api_key = os.environ.get("NVIDIA_API_KEY", "")
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 512,
        "temperature": 1.0,
        "top_p": 1.0,
        "stream": False,
    }
    try:
        # timeout keeps a stalled/unreachable endpoint from hanging the UI forever.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
        return response.json()["choices"][0]["message"]["content"]
    except requests.RequestException as exc:
        # Transport-level failure: no response body to show, report the exception.
        return f"Error:\n{exc}"
    except (KeyError, IndexError, ValueError):
        # Response arrived but was not the expected chat-completion shape
        # (API error object, HTML error page, invalid JSON) — surface the raw body.
        return f"Error:\n{response.text}"


chat = gr.Interface(
    fn=talk_to_llama,
    inputs="text",
    outputs="text",
    title="Chat with LLaMA 4 Maverick",
    description="Ask anything! This chatbot uses NVIDIA’s 3.5M token LLaMA 4 Maverick 17B model.",
)

chat.launch()