llm_host/utils.py
import asyncio
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import snapshot_download
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
# Model name (must be available on the Hugging Face Hub; swap in a smaller model if memory is tight)
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
# Detect device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the model with Accelerate so weights are spread across GPU/CPU and
# offloaded to disk when memory runs out (prevents memory overload on CPU-only hosts).
# load_checkpoint_and_dispatch needs a local checkpoint path, not a Hub model ID, so fetch
# (or reuse) the cached weight files; skip legacy .bin weights so there is a single index.
checkpoint_path = snapshot_download(MODEL_NAME, ignore_patterns=["*.bin*"])

# Build the model skeleton on the meta device without allocating memory for its weights.
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(MODEL_NAME))

# Load the real weights and dispatch them across the available devices, spilling to disk
# if needed. Note: no .to(device) call here -- a dispatched model must stay where it was placed.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint_path,
    device_map="auto",
    no_split_module_classes=["MistralDecoderLayer"],  # keep each decoder layer on one device
    offload_folder="/code/model_cache",  # must be a valid, writable folder
    offload_state_dict=True,
    dtype=torch.float16 if device == "cuda" else torch.float32,  # halve memory on GPU
)
# Hugging Face pipeline for text generation. Device placement is already handled by the
# Accelerate device map, so no explicit device argument is passed.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
async def generate_stream(query: str):
    """Stream a Mistral-7B-Instruct response word by word."""
    # Run the blocking pipeline call in a worker thread so the event loop stays responsive.
    # The pipeline tokenizes the query itself, so the raw string is passed directly.
    output = await asyncio.to_thread(
        generator, query, max_length=512, do_sample=True, temperature=0.7
    )
    response_text = output[0]["generated_text"]

    # Simulate streaming by yielding one word at a time.
    for word in response_text.split():
        yield word + " "
        await asyncio.sleep(0.05)
    yield "\n"
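

# Example usage (an illustrative sketch, not part of the original service wiring): consume
# the async generator from a plain asyncio entry point. In a real deployment this generator
# would typically be wrapped in a streaming HTTP response by the hosting framework.
if __name__ == "__main__":
    async def _demo():
        async for chunk in generate_stream("Explain what disk offloading does."):
            print(chunk, end="", flush=True)

    asyncio.run(_demo())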