Spaces:

huggingface
/

inference-playground

Running on CPU Upgrade

File size: 6,087 Bytes

52c6f5c

import { last } from "$lib/utils/array.js";
import { StreamWriter } from "$lib/utils/stream.js";
import { json } from "@sveltejs/kit";
import type { ChatCompletionMessage } from "openai/resources/index.mjs";
import type { RequestHandler } from "./$types.js";
import { createAdapter, type GenerationArgs } from "./adapter.js";
import { connectToMCPServers, executeMcpTool, type MCPServerConnection } from "./mcp.js";
import type { FinishReason, GenerateRequest } from "./types.js";
import { debugLog } from "./utils.js";

type AssistantResponse = { message: ChatCompletionMessage; finish_reason: FinishReason };

type GenerateLoopArgs = {
	args: GenerationArgs;
	getAssistantResponse: (args: GenerationArgs) => Promise<AssistantResponse>;
	connections: MCPServerConnection[];
};

async function generateLoop({ args, getAssistantResponse, connections }: GenerateLoopArgs) {
	let finish_reason: FinishReason | null = null;
	const abortReasons: FinishReason[] = ["stop", "content_filter", "length"];

	while (!abortReasons.includes(finish_reason)) {
		debugLog("finish reason", finish_reason);
		switch (finish_reason) {
			case null: {
				const res = await getAssistantResponse(args);
				args.messages.push(res.message);
				finish_reason = res.finish_reason;
				break;
			}
			case "tool_calls": {
				const toolCalls = last(args.messages)?.tool_calls;
				if (!toolCalls) {
					debugLog("No tool calls found");
					finish_reason = null;
					break;
				}

				debugLog("Executing tool calls");
				debugLog(JSON.stringify(toolCalls, null, 2));

				await Promise.allSettled(
					toolCalls.map(async toolCall => {
						// eslint-disable-next-line @typescript-eslint/no-explicit-any
						const response = await executeMcpTool(connections, toolCall as any);
						debugLog("Tool call response", response);
						args.messages.push(response);
					}),
				);

				finish_reason = null;

				break;
			}
			default: {
				finish_reason = "stop";
				break;
			}
		}
	}
}

export const POST: RequestHandler = async ({ request }) => {
	try {
		const body: GenerateRequest = await request.json();
		const { model, messages, config, provider, streaming, response_format, enabledMCPs } = body;

		if (enabledMCPs?.length === 0) {
			debugLog(`MCP: Enabled MCP servers: ${enabledMCPs?.join(", ")}`);
		}

		// Connect to enabled MCP servers
		const connections = await connectToMCPServers(enabledMCPs || []);
		const tools = connections.flatMap(conn => conn.tools);

		debugLog(`MCP: Connected to ${connections.length} servers with ${tools.length} tools available`);

		const adapter = createAdapter(body);

		const args = {
			model: model.id,
			messages,
			provider,
			...config,
			tools,
			response_format,
			stream: streaming,
		};

		if (streaming) {
			const writer = new StreamWriter();

			generateLoop({
				args,
				connections,
				getAssistantResponse: async args => {
					debugLog("Generating streaming response");
					const res: AssistantResponse = {
						message: {
							role: "assistant",
							content: "",
							// refusal: null,
							// eslint-disable-next-line @typescript-eslint/no-explicit-any
						} as any,
						finish_reason: null,
					};

					try {
						const adapterStream = await adapter.stream(args);
						for await (const chunk of adapterStream) {
							const choice = chunk.choices[0];
							if (!choice) continue;

							if (choice.delta.content) {
								res.message.content += choice.delta.content;
								writer.writeChunk(choice.delta.content || "");
							}
							if (choice.delta.tool_calls) {
								res.message.tool_calls = res.message.tool_calls ?? [];

								for (const toolCall of choice.delta.tool_calls) {
									res.message.tool_calls[toolCall.index] = res.message.tool_calls[toolCall.index] ?? {
										id: toolCall.id ?? "",
										type: "function",
										function: {
											name: "",
											arguments: "",
										},
									};

									if (toolCall.function?.name) {
										res.message.tool_calls[toolCall.index]!.function.name += toolCall.function.name;
									}
									if (toolCall.function?.arguments) {
										res.message.tool_calls[toolCall.index]!.function.arguments += toolCall.function.arguments;
									}
								}
							}
							if (choice.finish_reason) {
								res.finish_reason = choice.finish_reason;
							}
						}
					} catch (error) {
						console.error("stream error", error);
						writer.error(error instanceof Error ? error : new Error(String(error)));
						res.finish_reason = "stop";
						return res;
					}

					debugLog("Generated message");
					debugLog(JSON.stringify(res.message, null, 2));
					return res;
				},
			})
				.then(() => writer.end())
				.catch(error => {
					console.error("Generation loop error:", error);
					writer.error(error instanceof Error ? error : new Error(String(error)));
				});

			debugLog("Creating response...");

			return writer.createResponse();
		}

		const message: ChatCompletionMessage = {
			role: "assistant",
			content: "",
			// refusal: null,
			// eslint-disable-next-line @typescript-eslint/no-explicit-any
		} as any;

		await generateLoop({
			args,
			connections,
			getAssistantResponse: async args => {
				debugLog("Generating non-streaming response");
				const response = await adapter.generate(args);
				debugLog("Generated the response");
				debugLog(JSON.stringify(response, null, 2));

				if (response.choices && response.choices.length > 0) {
					message.content += response.choices[0]!.message.content ?? "";
					// const { completion_tokens } = response.usage || { completion_tokens: 0 };

					return {
						message: response.choices[0]!.message,
						finish_reason: response.choices[0]!.finish_reason,
					};
				}
				throw new Error("No response from the model");
			},
		});

		return json({ message /* ,completion_tokens */ });
	} catch (error) {
		debugLog(JSON.stringify(error, null, 2));
		console.error("Generation error:", error);
		return json({ error: error instanceof Error ? error.message : "Unknown error occurred" }, { status: 500 });
	}
};