from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

app = FastAPI()

# Load the fine-tuned model once at startup: half precision on GPU,
# full precision on CPU. device_map="auto" places the weights
# automatically (requires the `accelerate` package).
model_name = "howtomakepplragequit/phi2-lora-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "")
    # Wrap the raw prompt in the instruction template the model was
    # fine-tuned on, generate, then keep only the text after the
    # "### Response:" marker (the pipeline echoes the prompt back).
    formatted = f"### Instruction:\n{prompt}\n\n### Response:\n"
    result = pipe(formatted, max_new_tokens=200)[0]["generated_text"]
    return {"response": result.split("### Response:")[-1].strip()}
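
# --- Usage sketch (hypothetical, not part of the server) ---
# Assuming this file is saved as main.py and served with
#   uvicorn main:app --port 8000
# you could exercise the endpoint with a small client like:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/generate",
#       json={"prompt": "Explain LoRA fine-tuning in two sentences."},
#   )
#   print(resp.json()["response"])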