File size: 874 Bytes
0f9dc03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch
app = FastAPI()
model_name = "microsoft/phi-2"
peft_model_id = "howtomakepplragequit/phi2-lora-instruct"
# Load tokenizer and model with LoRA
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base_model, peft_model_id)
model.eval()
@app.post("/generate")
async def generate(request: Request):
data = await request.json()
prompt = data.get("prompt", "")
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
return {"response": response}
|