SpiceyToad committed
Commit 24242bc · 1 Parent(s): 156851a

Fix API token and device handling

Files changed (1)
  1. app.py +16 -6
app.py CHANGED
@@ -1,13 +1,23 @@
  from fastapi import FastAPI, Request
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch
+ import os
+
+ # Retrieve the Hugging Face API token from the environment
+ HF_API_TOKEN = os.getenv("HF_API_TOKEN")

  app = FastAPI()

  # Load the Falcon 7B model and tokenizer
- MODEL_NAME = "SpiceyToad/demo-falc"  # Replace with your Hub repo name
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto")
+ MODEL_NAME = "SpiceyToad/demo-falc"  # Replace with your Hugging Face repo name
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=HF_API_TOKEN)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto", use_auth_token=HF_API_TOKEN
+ )
+
+ # Automatically determine if CUDA is available for GPU support
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = model.to(device)

  @app.post("/generate")
  async def generate_text(request: Request):
@@ -17,8 +27,8 @@ async def generate_text(request: Request):
      max_length = data.get("max_length", 50)

      # Tokenize input and generate text
-     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+     inputs = tokenizer(prompt, return_tensors="pt").to(device)
      outputs = model.generate(inputs["input_ids"], max_length=max_length)
-     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

-     return {"generated_text": response}
+     return {"generated_text": generated_text}
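For reference, a minimal sketch of exercising the updated endpoint: export the token, start the server, and POST a JSON body. The "max_length" field matches the handler's data.get() call shown in the diff; the "prompt" key, the port, and the uvicorn launch command are assumptions, since the line that reads the prompt and the deployment setup fall outside the visible hunks.

    # Assumes the server was started with HF_API_TOKEN set, e.g.:
    #   HF_API_TOKEN=hf_xxx uvicorn app:app --port 8000
    import requests  # third-party HTTP client, not part of app.py

    resp = requests.post(
        "http://localhost:8000/generate",       # hypothetical local address
        json={"prompt": "Once upon a time",     # assumed request key
              "max_length": 50},                # matches data.get("max_length", 50)
    )
    print(resp.json()["generated_text"])

Note that recent transformers releases deprecate the use_auth_token argument in favor of token, so newer environments may need token=HF_API_TOKEN in the from_pretrained() calls instead.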