Spaces: Runtime error
import asyncio
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Model name (ensure it's available on the Hugging Face Hub)
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"  # use a smaller model if needed

# Detect device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the model with automatic device placement: device_map="auto" places as
# many layers as fit on the GPU (or in RAM on CPU-only hardware) and offloads
# the rest to disk, preventing memory overload. Note that
# load_checkpoint_and_dispatch() expects a local checkpoint path rather than a
# Hub model ID, and a dispatched model must not be moved with .to(device), so
# the offloading is requested directly through from_pretrained() instead.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    offload_folder="/code/model_cache",  # must be a writable folder
    offload_state_dict=True,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
# Hugging Face pipeline for text generation. Do not pass device= here: a model
# loaded with device_map="auto" is already dispatched across devices and
# cannot be moved to a single one.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
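# Optional smoke test (hypothetical prompt, not part of the original code):
# uncomment to confirm the pipeline works before wiring up streaming.
# print(generator("Hello, world!", max_new_tokens=20)[0]["generated_text"])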
async def generate_stream(query: str):
    """Stream a response using Hugging Face Transformers (Mistral-7B-Instruct)."""
    # Generate the full response first; max_new_tokens bounds only the
    # completion, whereas max_length would also count the prompt tokens.
    output = generator(query, max_new_tokens=512, do_sample=True, temperature=0.7)
    response_text = output[0]["generated_text"]
    # Simulate streaming by yielding one word at a time
    for word in response_text.split():
        yield word + " "
        await asyncio.sleep(0.05)
    yield "\n"