llm_host/utils.py
import asyncio
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import snapshot_download
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
# Model name (must be available on the Hugging Face Hub; swap in a smaller model if memory is tight)
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
# Detect device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the model with Accelerate so weights are spread across GPU/CPU and
# offloaded to disk when memory runs out (prevents memory overload on CPU-only hosts).
# load_checkpoint_and_dispatch needs a local checkpoint path, not a Hub model ID, so fetch
# (or reuse) the cached weight files; skip legacy .bin weights so there is a single index.
checkpoint_path = snapshot_download(MODEL_NAME, ignore_patterns=["*.bin*"])

# Build the model skeleton on the meta device without allocating memory for its weights.
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(MODEL_NAME))

# Load the real weights and dispatch them across the available devices, spilling to disk
# if needed. Note: no .to(device) call here -- a dispatched model must stay where it was placed.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint_path,
    device_map="auto",
    no_split_module_classes=["MistralDecoderLayer"],  # keep each decoder layer on one device
    offload_folder="/code/model_cache",  # must be a valid, writable folder
    offload_state_dict=True,
    dtype=torch.float16 if device == "cuda" else torch.float32,  # halve memory on GPU
)
# Hugging Face pipeline for text generation. Device placement is already handled by the
# Accelerate device map, so no explicit device argument is passed.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
async def generate_stream(query: str):
    """Stream a Mistral-7B-Instruct response word by word."""
    # Run the blocking pipeline call in a worker thread so the event loop stays responsive.
    # The pipeline tokenizes the query itself, so the raw string is passed directly.
    output = await asyncio.to_thread(
        generator, query, max_length=512, do_sample=True, temperature=0.7
    )
    response_text = output[0]["generated_text"]

    # Simulate streaming by yielding one word at a time.
    for word in response_text.split():
        yield word + " "
        await asyncio.sleep(0.05)
    yield "\n"
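

# Example usage (an illustrative sketch, not part of the original service wiring): consume
# the async generator from a plain asyncio entry point. In a real deployment this generator
# would typically be wrapped in a streaming HTTP response by the hosting framework.
if __name__ == "__main__":
    async def _demo():
        async for chunk in generate_stream("Explain what disk offloading does."):
            print(chunk, end="", flush=True)

    asyncio.run(_demo())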