import asyncio

import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline

# Model name (ensure it's available on Hugging Face)
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"  # use a smaller model if needed

# Detect device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build the model skeleton without allocating memory for the weights
config = AutoConfig.from_pretrained(MODEL_NAME)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Download (or reuse the cached) checkpoint files, then let accelerate place each
# layer on GPU, CPU, or disk. Disk offloading prevents memory overload on CPU-only hosts.
checkpoint_path = snapshot_download(MODEL_NAME)
model = load_checkpoint_and_dispatch(
    model,
    checkpoint_path,
    device_map="auto",
    no_split_module_classes=["MistralDecoderLayer"],  # keep each decoder block on one device
    offload_folder="/code/model_cache",  # ensure a valid, writable folder for offloading
    offload_state_dict=True,
)
# Note: do not call .to(device) here; a dispatched model already has its layers placed.

# Hugging Face pipeline for text generation.
# The model is already dispatched by accelerate, so no device argument is passed.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


async def generate_stream(query: str):
    """Stream a response generated with the Mistral-7B-Instruct model."""
    # Generate text with controlled memory usage; return_full_text=False drops the echoed prompt
    output = generator(
        query,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )
    response_text = output[0]["generated_text"]

    # Simulate streaming by yielding the response word by word
    for word in response_text.split():
        yield word + " "
        await asyncio.sleep(0.05)
    yield "\n"
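

# Minimal usage sketch (an assumption, not part of the original script): running the
# module directly drains the async generator and prints the chunks as they arrive.
# In a web app the same generator would typically feed a streaming HTTP response instead.
async def _demo():
    async for chunk in generate_stream("Explain what disk offloading does."):
        print(chunk, end="", flush=True)


if __name__ == "__main__":
    asyncio.run(_demo())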