import asyncio

import torch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
# Model name (ensure it's available on Hugging Face)
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"  # Use a smaller model if needed
# Detect device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Build the model skeleton on the meta device (no memory is allocated for
# weights yet); disk offloading below prevents memory overload on CPU-only hosts
config = AutoConfig.from_pretrained(MODEL_NAME)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
# Dispatch the weights across available devices, offloading to disk as needed.
# load_checkpoint_and_dispatch expects a local checkpoint path, not a hub ID,
# so download the weights first.
checkpoint_path = snapshot_download(MODEL_NAME)
model = load_checkpoint_and_dispatch(
    model,
    checkpoint_path,
    device_map="auto",
    offload_folder="/code/model_cache",  # Ensure a valid folder for offloading
    offload_state_dict=True,
)
# Do not call .to(device) here: a model dispatched by accelerate hooks
# must not be moved manually.
# Hugging Face pipeline for text generation. The dispatched model already
# carries its device placement, so no `device` argument is passed here.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
async def generate_stream(query: str):
    """Stream responses using Hugging Face Transformers (Mistral 7B Instruct)."""
    # Generate the full completion (the pipeline tokenizes `query` itself)
    output = generator(query, max_length=512, do_sample=True, temperature=0.7)
    response_text = output[0]["generated_text"]
    # Simulate streaming by yielding the response word by word
    for word in response_text.split():
        yield word + " "
        await asyncio.sleep(0.05)
    yield "\n"
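# --- Usage sketch (an illustrative assumption, not part of the original
# Space): one way to drive generate_stream from the command line. The Space
# presumably feeds this generator into a web framework's streaming response
# instead; the prompt string and the consume() helper here are hypothetical.
if __name__ == "__main__":
    async def consume():
        async for chunk in generate_stream("Explain disk offloading briefly."):
            print(chunk, end="", flush=True)

    asyncio.run(consume())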