import os
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Set Hugging Face cache directory
os.environ["HF_HOME"] = "/home/user/cache"
# Get Hugging Face API token
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if not HF_API_TOKEN:
    raise ValueError("HF_API_TOKEN environment variable is not set!")
app = FastAPI()
# Load Falcon 7B model
MODEL_NAME = "SpiceyToad/demo-falc"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=HF_API_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    use_auth_token=HF_API_TOKEN
)
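# Note: device_map="auto" requires the `accelerate` package to be installed,
# and bfloat16 weights are best suited to GPU hardware.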
# Ensure tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use the EOS token as the padding token
@app.post("/generate")
async def generate_text(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "").strip()
    max_length = data.get("max_length", 50)

    if not prompt:
        return {"error": "Prompt is required!"}

    # Cap max_length at the model's context window
    max_length = min(max_length, model.config.max_position_embeddings)

    # Tokenize with padding and attention mask
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    ).to(model.device)

    # Generate a response; max_length counts the prompt tokens plus the
    # newly generated tokens
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length
    )

    # Decode the full sequence (the prompt is included in the returned text)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"generated_text": response}