"""FastAPI service exposing a ``/generate`` text-generation endpoint.

The tokenizer and causal language model named by ``MODEL_NAME`` are loaded
once at import time and shared by every request.
"""

import os
import threading

import torch
from fastapi import FastAPI, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from transformers import AutoModelForCausalLM, AutoTokenizer

# Retrieve the Hugging Face API token from the environment
HF_API_TOKEN = os.getenv("HF_API_TOKEN")

app = FastAPI()

# Load the model and tokenizer
MODEL_NAME = "SpiceyToad/demo-falc"  # Replace with your Hugging Face repo name
# NOTE(review): newer transformers releases prefer `token=` over
# `use_auth_token=`; kept as-is because the installed version is not visible
# from this file.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=HF_API_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    use_auth_token=HF_API_TOKEN,
)

# `device_map="auto"` has already placed the weights, so the model is NOT moved
# again with `model.to(...)`: that call is redundant at best and is rejected
# for models whose modules were split or offloaded. Inputs are instead sent to
# the device the loaded model reports.
device = model.device

# Generation used to run inline on the event loop, i.e. strictly one request at
# a time. It now runs in a worker thread; this lock keeps the one-at-a-time
# behaviour so concurrent requests cannot multiply memory use or share the
# tokenizer across threads.
_generation_lock = threading.Lock()


def _run_generation(prompt, max_length):
    """Tokenize *prompt*, generate a continuation, and return the decoded text.

    Blocking helper meant to be called from a worker thread.

    Args:
        prompt: Text to feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    with _generation_lock:
        encoded = tokenizer(prompt, return_tensors="pt").to(device)

        generate_kwargs = {"max_length": max_length}
        # Forward the attention mask explicitly when the tokenizer provides
        # one. `**encoded` is avoided on purpose: tokenizers may return extra
        # fields (e.g. token_type_ids) that `generate` does not accept for
        # every model.
        if "attention_mask" in encoded:
            generate_kwargs["attention_mask"] = encoded["attention_mask"]

        output_ids = model.generate(encoded["input_ids"], **generate_kwargs)
        return tokenizer.decode(output_ids[0], skip_special_tokens=True)


@app.post("/generate")
async def generate_text(request: Request):
    """Generate text for the prompt supplied in the JSON request body.

    Expected body: a JSON object with optional keys ``"prompt"`` (string,
    default ``""``) and ``"max_length"`` (positive integer, default ``50``).

    Returns:
        ``{"generated_text": <decoded model output>}``.

    Raises:
        HTTPException: 400 if the body is not valid JSON or not a JSON object;
            422 if ``prompt`` is not a string or ``max_length`` is not a
            positive integer.
    """
    # Parse input JSON
    try:
        data = await request.json()
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from exc
    if not isinstance(data, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object."
        )

    prompt = data.get("prompt", "")
    max_length = data.get("max_length", 50)

    if not isinstance(prompt, str):
        raise HTTPException(status_code=422, detail="'prompt' must be a string.")
    # An explicit JSON null is still passed through unchanged, as before.
    # `bool` is excluded explicitly because it is a subclass of `int`.
    # NOTE(review): there is no upper bound on max_length; consider capping it
    # if this endpoint is reachable by untrusted clients.
    if max_length is not None and (
        isinstance(max_length, bool)
        or not isinstance(max_length, int)
        or max_length <= 0
    ):
        raise HTTPException(
            status_code=422, detail="'max_length' must be a positive integer."
        )

    # Run the blocking tokenize/generate/decode work off the event loop so the
    # server stays responsive while the model is busy.
    generated_text = await run_in_threadpool(_run_generation, prompt, max_length)
    return {"generated_text": generated_text}