import asyncio
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Model name (make sure it is available on the Hugging Face Hub)
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"  # Swap in a smaller model if memory is tight

# Detect device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the model with automatic device placement (requires accelerate to be installed).
# Weights that do not fit in GPU/CPU memory are offloaded to disk to prevent memory overload.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    offload_folder="/code/model_cache",  # Ensure a valid, writable folder for offloading
    offload_state_dict=True,
)

# Hugging Face pipeline for text generation
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)  # device placement is already handled by accelerate

async def generate_stream(query: str):
    """Stream responses using Hugging Face Transformers (LLaMA 2)."""
    input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)

    # Generate text with controlled memory usage
    output = generator(query, max_length=512, do_sample=True, temperature=0.7)

    response_text = output[0]["generated_text"]

    # Simulate streaming output
    for word in response_text.split():
        yield word + " "
        await asyncio.sleep(0.05)
    
    yield "\n"