"""Gradio chat frontend for a locally hosted llama.cpp / OpenAI-compatible server.

Launches the inference server via ``start.sh`` on import, then streams chat
completions (including reasoning tokens, when the server emits them) into a
``gr.ChatInterface``.
"""

import socket
import subprocess

import gradio as gr
from openai import OpenAI

# NOTE(review): shell=True with a fixed, trusted command string. Fine here
# because nothing user-controlled is interpolated; do not template this.
subprocess.Popen("bash /home/user/app/start.sh", shell=True)

# Long timeout: first request may block while the model loads.
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="sk-local", timeout=600)


def respond(
    message,
    history: list[tuple[str, str]],
    system_message=None,
    max_tokens=None,
    temperature=0.7,
    top_p=0.95,
):
    """Stream an assistant reply for *message* given the chat *history*.

    Args:
        message: Latest user turn.
        history: Prior turns as ``(user, assistant)`` pairs; empty strings
            are skipped.
        system_message: Optional system prompt prepended to the conversation.
        max_tokens: Completion token cap forwarded to the server (``None`` =
            server default).
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff. (Previously referenced but never
            defined, which raised ``NameError`` on every call.)

    Yields:
        The accumulated response text after each streamed chunk, so Gradio
        re-renders the growing answer.
    """
    messages = []
    if system_message:
        messages = [{"role": "system", "content": system_message}]
    for user, assistant in history:
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    try:
        stream = client.chat.completions.create(
            model="Deepseek-R1-0528-Qwen3-8B",
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True,
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "browser_search",
                        "description": (
                            "Search the web for a given query and return the most relevant results."
                        ),
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "query": {
                                    "type": "string",
                                    "description": "The search query string.",
                                },
                                "max_results": {
                                    "type": "integer",
                                    "description": (
                                        "Maximum number of search results to return. "
                                        "If omitted the service will use its default."
                                    ),
                                    "default": 5,
                                },
                            },
                            "required": ["query"],
                        },
                    },
                },
                {"type": "code_interpreter"},
            ],
        )
        print("messages", messages)
        output = ""
        for chunk in stream:
            delta = chunk.choices[0].delta
            # Prefer reasoning tokens when the server emits them; otherwise
            # fall back to regular content. (Replaces a bare `except:` that
            # used exception handling as per-chunk control flow.)
            piece = getattr(delta, "reasoning_content", None)
            if piece is None:
                piece = delta.content or ""
            output += piece
            yield output
    except Exception as e:
        # Boundary handler: surface a friendly message to the UI, log detail.
        print(f"[Error] {e}")
        yield "⚠️ Llama.cpp server error"


demo = gr.ChatInterface(respond)

if __name__ == "__main__":
    demo.launch(show_api=False)