Add FastAPI chat completion service and configuration files
- Implement chat completion API
- Add Dockerfile for containerization
- Include requirements for dependencies
- Configure VSCode settings
- Create .gitignore for project files
- .gitignore +2 -0
- .vscode/settings.json +3 -0
- Dockerfile +13 -0
- main.py +8 -0
- models/chat_completion.py +24 -0
- requirements.txt +3 -0
- routes/chatCompletion.py +41 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+test.py
+.venv
.vscode/settings.json
ADDED
@@ -0,0 +1,3 @@
+{
+    "editor.wordWrap": "on"
+}
Dockerfile
ADDED
@@ -0,0 +1,13 @@
+FROM python:3.11
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py
ADDED
@@ -0,0 +1,8 @@
+from huggingface_hub import InferenceClient
+from fastapi import FastAPI
+
+app = FastAPI()
+
+@app.get("/")
+async def root():
+    return {"message": "Hello World"}
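Note that main.py, as committed, only serves the root route: the InferenceClient import is unused here and the router from routes/chatCompletion.py is never mounted, so /v1/chat-completion would not be reachable. A minimal sketch of how the router would typically be wired in (assuming routes/ and models/ are importable packages, e.g. via __init__.py files, which this commit does not add):

# Hypothetical main.py wiring; not part of this commit.
from fastapi import FastAPI
from routes.chatCompletion import router as chat_router

app = FastAPI()
app.include_router(chat_router)  # exposes POST /v1/chat-completion

@app.get("/")
async def root():
    return {"message": "Hello World"}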
models/chat_completion.py
ADDED
@@ -0,0 +1,24 @@
+from pydantic import BaseModel, Field
+from typing import List, Optional
+from huggingface_hub import ChatCompletionInputMessage, ChatCompletionInputGrammarType, ChatCompletionInputStreamOptions, ChatCompletionInputToolChoiceClass, ChatCompletionInputTool
+
+class ChatRequest(BaseModel):
+    model: str = Field(..., description="The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used. See https://huggingface.co/tasks/text-generation for more details.")
+    messages: List[ChatCompletionInputMessage] = Field(..., description="Conversation history consisting of roles and content pairs.")
+    frequency_penalty: Optional[float] = Field(0.0, ge=-2.0, le=2.0, description="Penalizes new tokens based on their existing frequency in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0.")
+    logit_bias: Optional[dict] = Field(None, description="Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens to an associated bias value from -100 to 100.")
+    logprobs: Optional[bool] = Field(None, description="Whether to return log probabilities of the output tokens or not.")
+    max_tokens: Optional[int] = Field(100, description="Maximum number of tokens allowed in the response. Defaults to 100.")
+    n: Optional[int] = Field(None, description="UNUSED.")
+    presence_penalty: Optional[float] = Field(None, ge=-2.0, le=2.0, description="Positive values penalize new tokens based on whether they appear in the text so far.")
+    response_format: Optional[ChatCompletionInputGrammarType] = Field(None, description="Grammar constraints. Can be either a JSONSchema or a regex.")
+    seed: Optional[int] = Field(None, description="Seed for reproducible control flow.")
+    stop: Optional[str] = Field(None, description="Up to four strings which trigger the end of the response.")
+    stream: Optional[bool] = Field(False, description="Enable realtime streaming of responses. Defaults to False.")
+    stream_options: Optional[ChatCompletionInputStreamOptions] = Field(None, description="Options for streaming completions.")
+    temperature: Optional[float] = Field(1.0, ge=0.0, le=2.0, description="Controls randomness of the generations. Lower values ensure less random completions.")
+    top_logprobs: Optional[int] = Field(None, ge=0, le=5, description="Specifying the number of most likely tokens to return at each token position.")
+    top_p: Optional[float] = Field(1.0, ge=0.0, le=1.0, description="Fraction of the most likely next words to sample from.")
+    tool_choice: Optional[ChatCompletionInputToolChoiceClass] = Field("auto", description="The tool to use for the completion. Defaults to 'auto'.")
+    tool_prompt: Optional[str] = Field(None, description="A prompt to be appended before the tools.")
+    tools: Optional[List[ChatCompletionInputTool]] = Field(None, description="A list of tools the model may call.")
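For illustration, a request body that the ChatRequest model above would accept might look like the following; the model ID and message contents are placeholders rather than values from this commit, and only model and messages are required:

# Hypothetical payload validated by ChatRequest; unset fields fall back to their defaults.
example_request = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model ID
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    "max_tokens": 64,
    "temperature": 0.7,
    "stream": False,
}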
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+fastapi
+uvicorn[standard]
+huggingface_hub==0.27.1
routes/chatCompletion.py
ADDED
@@ -0,0 +1,41 @@
+from fastapi import APIRouter
+from fastapi.responses import StreamingResponse
+from models.chat_completion import ChatRequest
+from huggingface_hub import InferenceClient
+import json
+
+router = APIRouter()
+
+def generate_stream(response):
+    for chunk in response:
+        yield f"data: {json.dumps(chunk.__dict__, separators=(',', ':'))}\n\n"
+
+@router.post("/v1/chat-completion", tags=["Chat Completion"])
+async def chat_completion(body: ChatRequest):
+    client = InferenceClient(model=body.model)
+
+    res = client.chat_completion(
+        messages=body.messages,
+        frequency_penalty=body.frequency_penalty,
+        logit_bias=body.logit_bias,
+        logprobs=body.logprobs,
+        max_tokens=body.max_tokens,
+        n=body.n,
+        presence_penalty=body.presence_penalty,
+        response_format=body.response_format,
+        seed=body.seed,
+        stop=body.stop,
+        stream=body.stream,
+        stream_options=body.stream_options,
+        temperature=body.temperature,
+        top_logprobs=body.top_logprobs,
+        top_p=body.top_p,
+        tool_choice=body.tool_choice,
+        tool_prompt=body.tool_prompt,
+        tools=body.tools
+    )
+
+    if not body.stream:
+        return json.dumps(res.__dict__, indent=2)
+    else:
+        return StreamingResponse(generate_stream(res), media_type="text/event-stream")
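A rough client sketch for the route above, assuming the service is running locally on the port exposed by the Dockerfile (7860) and using the requests library, which is not in this commit's requirements.txt. Because the non-streaming branch returns json.dumps(...) from the endpoint, FastAPI serializes the body as a JSON-encoded string, so the client decodes twice:

# Hypothetical client; model ID and prompt are placeholders.
import json
import requests

url = "http://localhost:7860/v1/chat-completion"
payload = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model ID
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 32,
}

# Non-streaming: the body is a JSON string that itself contains the completion payload.
resp = requests.post(url, json=payload)
completion = json.loads(resp.json())
print(completion)

# Streaming: server-sent events, one "data: {...}" line per chunk.
payload["stream"] = True
with requests.post(url, json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if line:
            print(line.decode())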