import os

import gradio as gr
import requests

# Endpoint / model configuration, hoisted so they are easy to change in one place.
API_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
MODEL = "meta/llama-4-maverick-17b-128e-instruct"


def talk_to_llama(prompt: str) -> str:
    """Send *prompt* to the NVIDIA-hosted LLaMA 4 Maverick model and return its reply.

    Returns the assistant message content on success; on any failure
    (network error, non-JSON body, error-shaped response) returns a
    string beginning with "Error:\n" so the Gradio UI can display it.
    """
    # SECURITY: never hard-code API keys in source — the previous version embedded
    # a live "nvapi-..." token, which must be revoked. Read it from the
    # environment instead (export NVIDIA_API_KEY=... before launching).
    api_key = os.environ.get("NVIDIA_API_KEY", "")
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 512,
        "temperature": 1.0,
        "top_p": 1.0,
        "stream": False,
    }
    try:
        # timeout keeps a stalled/unreachable endpoint from hanging the UI forever.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
        return response.json()["choices"][0]["message"]["content"]
    except requests.RequestException as exc:
        # Transport-level failure: no response body to show, report the exception.
        return f"Error:\n{exc}"
    except (KeyError, IndexError, ValueError):
        # Response arrived but was not the expected chat-completion shape
        # (API error object, HTML error page, invalid JSON) — surface the raw body.
        return f"Error:\n{response.text}"


chat = gr.Interface(
    fn=talk_to_llama,
    inputs="text",
    outputs="text",
    title="Chat with LLaMA 4 Maverick",
    description="Ask anything! This chatbot uses NVIDIA’s 3.5M token LLaMA 4 Maverick 17B model.",
)

chat.launch()