llm-compare / app.py
import os
import gradio as gr
import asyncio
from dotenv import load_dotenv
from huggingface_hub import InferenceClient, hf_hub_download, model_info
from functools import partial
# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Please set the HF_TOKEN environment variable")
# Available models
AVAILABLE_MODELS = [
"HuggingFaceH4/zephyr-7b-beta",
"NousResearch/Hermes-3-Llama-3.1-8B",
"mistralai/Mistral-Nemo-Base-2407",
"meta-llama/Llama-2-70b-hf",
"aaditya/Llama3-OpenBioLLM-8B",
]
# Initialize inference client
inference_client = InferenceClient(token=HF_TOKEN)
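# A single shared client is reused for every request; the target model is passed per text_generation call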
def get_model_card_html(model_name, title):
    """Fetch model metadata from the Hub and format it as an HTML card.
    The `title` argument is accepted from callers but is not rendered in the card."""
try:
info = model_info(model_name, token=HF_TOKEN)
return f"""
<div class="model-card-container">
<h3>{info.modelId}</h3>
<p><strong>Pipeline Tag:</strong> {info.pipeline_tag or 'Not specified'}</p>
<p><strong>Downloads:</strong> {info.downloads:,}</p>
<p><strong>Likes:</strong> {info.likes:,}</p>
<p><a href="https://huggingface.co/{model_name}" target="_blank">View on Hugging Face</a></p>
</div>
"""
    except Exception:
        # Fall back to a minimal card if the model_info lookup fails
return f"""
<div class="model-card-container">
<h3>{model_name}</h3>
<p>Unable to load full model card information.</p>
<p><a href="https://huggingface.co/{model_name}" target="_blank">View on Hugging Face</a></p>
</div>
"""
async def get_model_response(prompt, model_name, temperature_value, do_sample, max_tokens):
"""Get response from a Hugging Face model."""
try:
# Build kwargs dynamically
generation_args = {
"prompt": prompt,
"model": model_name,
"max_new_tokens": max_tokens,
"do_sample": do_sample,
"return_full_text": False
}
# Only include temperature if sampling is enabled
if do_sample and temperature_value > 0:
generation_args["temperature"] = temperature_value
# Run the inference in a thread pool to not block the event loop
        loop = asyncio.get_running_loop()
response = await loop.run_in_executor(
None,
partial(inference_client.text_generation, **generation_args)
)
# Check if response might be truncated
        if len(response) >= max_tokens * 4:  # Heuristic: ~4 characters per token, so a response this long likely hit the token limit
response += "\n\n[Warning: Response may have been truncated. Try increasing the max tokens if the response seems incomplete.]"
return response
except Exception as e:
return f"Error: {str(e)}"
async def process_single_response(prompt, model_name, temp, do_sample, max_tokens):
    """Get a single model's response and return it as a chat history."""
response = await get_model_response(prompt, model_name, temp, do_sample, max_tokens)
chat_history = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response}]
return chat_history
async def compare_models(prompt, model1, model2, temp1, temp2, do_sample1, do_sample2, max_tokens1, max_tokens2):
"""Compare outputs from two selected models."""
if not prompt.strip():
empty_response = [{"role": "user", "content": prompt}, {"role": "assistant", "content": "Please enter a prompt"}]
yield empty_response, empty_response, gr.update(interactive=True)
return # Exit the generator
# Initialize with "Generating..." messages
initial_message = [{"role": "user", "content": prompt}, {"role": "assistant", "content": "Generating..."}]
yield initial_message, initial_message, gr.update(interactive=False)
# Create tasks for both model responses
    task1 = asyncio.create_task(process_single_response(prompt, model1, temp1, do_sample1, max_tokens1))
    task2 = asyncio.create_task(process_single_response(prompt, model2, temp2, do_sample2, max_tokens2))
chat1 = chat2 = initial_message
    start_time = asyncio.get_running_loop().time()
try:
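        # Poll both tasks, refreshing the "Generating..." placeholders with the elapsed time
        # roughly every 0.1s while the blocking inference calls run in the executor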
while not (task1.done() and task2.done()):
# Update the messages with elapsed time
            elapsed = asyncio.get_running_loop().time() - start_time
if not task1.done():
chat1 = [{"role": "user", "content": prompt},
{"role": "assistant", "content": f"Generating... ({elapsed:.1f}s)"}]
if not task2.done():
chat2 = [{"role": "user", "content": prompt},
{"role": "assistant", "content": f"Generating... ({elapsed:.1f}s)"}]
# Check if any task completed
done, pending = await asyncio.wait([t for t in [task1, task2] if not t.done()],
timeout=0.1,
return_when=asyncio.FIRST_COMPLETED)
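            # asyncio.wait returns as soon as either remaining task finishes or the 0.1s timeout elapses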
for task in done:
                if task is task1:
chat1 = await task1
else:
chat2 = await task2
yield chat1, chat2, gr.update(interactive=False)
# Ensure we have both final results
if not task1.done():
chat1 = await task1
if not task2.done():
chat2 = await task2
# Final yield with both results
yield chat1, chat2, gr.update(interactive=True)
except Exception as e:
error_message = [{"role": "user", "content": prompt}, {"role": "assistant", "content": f"Error: {str(e)}"}]
yield error_message, error_message, gr.update(interactive=True)
# Update temperature slider interactivity based on sampling checkbox
def update_slider_state(enabled):
    """Enable the temperature slider when sampling is on; grey it out and reset it to 0 otherwise."""
    if enabled:
        return gr.update(interactive=True, elem_classes=[])
    return gr.update(interactive=False, elem_classes=["disabled-slider"], value=0)
# Create the Gradio interface
with gr.Blocks(css="""
.disabled-slider { opacity: 0.5; pointer-events: none; }
.model-card-container {
background-color: #f8f9fa;
font-size: 14px;
color: #666;
}
.model-card-container h3 {
margin: 0;
color: black;
}
.model-card-container p {
margin: 5px 0;
}
""") as demo:
gr.Markdown("# LLM Comparison Tool")
    gr.Markdown("Compare outputs from different `text-generation` models side by side using Hugging Face's Inference API.")
with gr.Row():
prompt = gr.Textbox(
label="Enter your prompt",
placeholder="Type your prompt here...",
lines=3
)
with gr.Row():
submit_btn = gr.Button("Generate Responses")
with gr.Row():
with gr.Column():
model1_dropdown = gr.Dropdown(
choices=AVAILABLE_MODELS,
value=AVAILABLE_MODELS[0],
label="Select Model 1"
)
model1_card = gr.HTML(
value=get_model_card_html(AVAILABLE_MODELS[0], "Model 1 Information"),
elem_classes=["model-card-container"]
)
do_sample1 = gr.Checkbox(
label="Enable sampling (random outputs)",
value=False
)
temp1 = gr.Slider(
label="Temperature (Higher = more creative, lower = more predictable)",
minimum=0,
maximum=1,
step=0.1,
value=0.0,
interactive=False,
elem_classes=["disabled-slider"]
)
max_tokens1 = gr.Slider(
label="Maximum new tokens in response",
minimum=10,
maximum=2000,
step=10,
value=10
)
chatbot1 = gr.Chatbot(
label="Model 1 Output",
show_label=True,
height=300,
type="messages"
)
with gr.Column():
model2_dropdown = gr.Dropdown(
choices=AVAILABLE_MODELS,
value=AVAILABLE_MODELS[1],
label="Select Model 2"
)
model2_card = gr.HTML(
value=get_model_card_html(AVAILABLE_MODELS[1], "Model 2 Information"),
elem_classes=["model-card-container"]
)
do_sample2 = gr.Checkbox(
label="Enable sampling (random outputs)",
value=False
)
temp2 = gr.Slider(
label="Temperature (Higher = more creative, lower = more predictable)",
minimum=0,
maximum=1,
step=0.1,
value=0.0,
interactive=False,
elem_classes=["disabled-slider"]
)
max_tokens2 = gr.Slider(
label="Maximum new tokens in response",
minimum=10,
maximum=2000,
step=10,
value=10
)
chatbot2 = gr.Chatbot(
label="Model 2 Output",
show_label=True,
height=300,
type="messages"
)
def start_loading():
return gr.update(interactive=False)
# Handle form submission
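    # Two-step chain: first disable the button immediately (unqueued), then stream the comparison,
    # whose final yield re-enables the button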
submit_btn.click(
fn=start_loading,
inputs=None,
outputs=submit_btn,
queue=False
).then(
fn=compare_models,
inputs=[prompt, model1_dropdown, model2_dropdown, temp1, temp2, do_sample1, do_sample2, max_tokens1, max_tokens2],
outputs=[chatbot1, chatbot2, submit_btn],
queue=True # Enable queuing for streaming updates
)
# Update model cards when models are changed
model1_dropdown.change(
fn=lambda x: get_model_card_html(x, "Model 1 Information"),
inputs=[model1_dropdown],
outputs=[model1_card]
)
model2_dropdown.change(
fn=lambda x: get_model_card_html(x, "Model 2 Information"),
inputs=[model2_dropdown],
outputs=[model2_card]
)
    # Toggle the temperature sliders when the sampling checkboxes change
    do_sample1.change(
        fn=update_slider_state,
        inputs=[do_sample1],
        outputs=[temp1]
    )
    do_sample2.change(
        fn=update_slider_state,
        inputs=[do_sample2],
        outputs=[temp2]
    )
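# queue() lets the generator-based compare_models handler stream its intermediate yields to the UI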
if __name__ == "__main__":
demo.queue().launch()