# Hugging Face Space (status when captured: Sleeping)
import os

# Set Hugging Face cache directory.
# The Hugging Face libraries resolve HF_HOME when they are first imported, so
# this assignment must run BEFORE importing transformers; setting it afterwards
# is silently ignored and the default cache location is used instead.
os.environ["HF_HOME"] = "/home/user/cache"

import torch  # noqa: E402  (deliberately imported after HF_HOME is set)
from fastapi import FastAPI, Request  # noqa: E402
from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: E402

# Get Hugging Face API token; fail fast at startup because the checkpoint
# below is downloaded with it.
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if not HF_API_TOKEN:
    raise ValueError("HF_API_TOKEN environment variable is not set!")

app = FastAPI()

# Load Falcon 7B model
MODEL_NAME = "SpiceyToad/demo-falc"
# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_API_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    token=HF_API_TOKEN,
)

# Ensure tokenizer has a padding token: fall back to the EOS token when the
# checkpoint does not define one, since requests are tokenized with padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): no route decorator (e.g. ``@app.post(...)``) is visible on this
# handler in the reviewed source; confirm it is actually registered with ``app``.
async def generate_text(request: Request):
    """Generate a text continuation for the prompt in a JSON request body.

    Expected body: ``{"prompt": "<text>", "max_length": <int>}``.
    ``max_length`` is optional (default 50) and is forwarded to
    ``model.generate`` as the total sequence length (prompt tokens plus
    generated tokens); it is capped at ``model.config.max_position_embeddings``.

    Args:
        request: Incoming request whose body is parsed as JSON.

    Returns:
        ``{"generated_text": <decoded output>}`` on success, or
        ``{"error": <message>}`` when the body is not a JSON object, the
        prompt is missing/blank/not a string, or ``max_length`` is not a
        positive integer.
    """
    # Malformed JSON raises a ValueError subclass; report it in the same
    # error-dict shape the handler already uses instead of surfacing a 500.
    try:
        data = await request.json()
    except ValueError:
        return {"error": "Request body must be valid JSON!"}
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object!"}

    prompt = data.get("prompt", "")
    # A non-string prompt (e.g. null or a number) has no .strip(); treat it
    # the same as a missing prompt.
    if not isinstance(prompt, str):
        return {"error": "Prompt is required!"}
    prompt = prompt.strip()
    if not prompt:
        return {"error": "Prompt is required!"}

    # Validate max_length before it reaches min()/the tokenizer. bool is a
    # subclass of int, so it is rejected explicitly.
    max_length = data.get("max_length", 50)
    if (
        isinstance(max_length, bool)
        or not isinstance(max_length, int)
        or max_length < 1
    ):
        return {"error": "max_length must be a positive integer!"}
    max_length = min(max_length, model.config.max_position_embeddings)

    # Tokenize with padding and attention mask.
    # NOTE(review): the prompt is truncated to max_length tokens, and
    # max_length also bounds the total generated sequence, so a prompt at or
    # over that length leaves little or no room for new tokens. Kept as-is
    # for backward compatibility.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(model.device)

    # Generate response. pad_token_id is passed explicitly so generation uses
    # the tokenizer's configured padding token.
    # NOTE(review): this call is synchronous and runs inside the async
    # handler, so it occupies the event loop until generation finishes.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        pad_token_id=tokenizer.pad_token_id,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"generated_text": response}