from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

app = FastAPI()

# Load the fine-tuned model once at startup; use fp16 on GPU, fp32 on CPU.
model_name = "howtomakepplragequit/phi2-lora-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "")
    # Wrap the raw prompt in the Alpaca-style instruction template the model
    # was tuned on, then keep only the text after the "### Response:" marker
    # (the pipeline returns the prompt plus the completion by default).
    formatted = f"### Instruction:\n{prompt}\n\n### Response:\n"
    result = pipe(formatted, max_new_tokens=200)[0]["generated_text"]
    return {"response": result.split("### Response:")[-1].strip()}
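# A minimal client sketch for the endpoint above. It assumes the app is
# served locally with uvicorn on its default port, e.g. `uvicorn main:app`
# (the module name "main" and port 8000 are assumptions, not part of the
# snippet above), and uses the third-party `requests` library.

import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Explain LoRA fine-tuning in one sentence."},
)
resp.raise_for_status()
print(resp.json()["response"])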