from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

app = FastAPI()

model_name = "microsoft/phi-2"
peft_model_id = "howtomakepplragequit/phi2-lora-instruct"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and base model, then attach the LoRA adapter weights
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # float16 halves GPU memory; fall back to float32 on CPU, where
    # half-precision inference is generally not supported
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
model = PeftModel.from_pretrained(base_model, peft_model_id).to(device)
model.eval()

@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "")

    # Tokenize the prompt and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate up to 100 new tokens and decode, dropping special tokens
    outputs = model.generate(**inputs, max_new_tokens=100)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {"response": response}
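With the server running (for example via `uvicorn app:app --port 8000`, assuming the script above is saved as app.py), the endpoint can be exercised with a minimal client sketch like the one below; the port and prompt text are placeholders, not part of the original setup.

import requests

# Minimal client sketch: assumes the FastAPI server above is running
# locally on port 8000
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Explain LoRA fine-tuning in one sentence."},
)
print(resp.json()["response"])

Note that the generated text echoes the prompt, since the decode call covers the full output sequence; slicing off the prompt tokens before decoding is a common refinement if only the completion is wanted.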