Spaces:
Sleeping
Sleeping
import os

import torch
from fastapi import FastAPI, HTTPException, Request
from transformers import AutoModelForCausalLM, AutoTokenizer
# Retrieve the Hugging Face API token from the environment (None when unset).
HF_API_TOKEN = os.getenv("HF_API_TOKEN")

app = FastAPI()

# Load the model and tokenizer once, at import time, so every request reuses them.
MODEL_NAME = "SpiceyToad/demo-falc"  # Replace with your Hugging Face repo name
# NOTE(review): newer transformers releases deprecate `use_auth_token` in favor
# of `token` -- confirm against the pinned transformers version before switching.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=HF_API_TOKEN)
# device_map="auto" lets Accelerate place (and, if needed, offload) the weights.
# Do not call model.to(...) afterwards: it is redundant when everything fits on
# one device and raises RuntimeError when some modules were offloaded.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    use_auth_token=HF_API_TOKEN,
)

# Device that request input tensors are moved to before generation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): no route decorator is visible on this handler. Unless it is
# registered elsewhere (e.g. via app.add_api_route), it is unreachable -- confirm.
async def generate_text(request: Request):
    """Generate a text continuation for the prompt in a JSON request body.

    The body must be a JSON object. Recognized keys:

    * ``prompt`` (str, default ``""``): text to continue.
    * ``max_length`` (int, default ``50``): forwarded to ``model.generate``
      as ``max_length``.

    Returns:
        A dict ``{"generated_text": <decoded output>}``.

    Raises:
        HTTPException: 400 when the body is not valid JSON or not a JSON
            object; 422 when ``prompt`` is not a string, ``max_length`` is
            not a positive integer, or the prompt tokenizes to zero tokens.
    """
    # Parse input JSON; json.JSONDecodeError is a ValueError subclass.
    try:
        data = await request.json()
    except ValueError as err:
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from err
    if not isinstance(data, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object."
        )

    prompt = data.get("prompt", "")
    max_length = data.get("max_length", 50)
    if not isinstance(prompt, str):
        raise HTTPException(status_code=422, detail="'prompt' must be a string.")
    # bool is a subclass of int, so reject it explicitly.
    if (
        isinstance(max_length, bool)
        or not isinstance(max_length, int)
        or max_length < 1
    ):
        raise HTTPException(
            status_code=422, detail="'max_length' must be a positive integer."
        )

    # Tokenize input and generate text
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    if input_ids.shape[-1] == 0:
        # Nothing to condition on; fail clearly instead of erroring in generate().
        raise HTTPException(
            status_code=422, detail="'prompt' must contain at least one token."
        )

    # Pass the attention mask explicitly instead of **inputs: the tokenizer
    # output may carry extra keys (e.g. token_type_ids) that generate() can
    # reject for models that do not accept them.
    outputs = model.generate(
        input_ids,
        attention_mask=inputs.get("attention_mask"),
        max_length=max_length,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"generated_text": generated_text}