francismurray committed
Commit 4667b7d · Parent: b6ed54a

feat: Implement async model responses with real-time progress

- Make model calls run concurrently instead of sequentially
- Add real-time progress indicator with 0.1s precision timer
- Display responses immediately as they arrive
- Improve error handling and loading states

Files changed (1)
  1. app.py +71 -23
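
A note on the approach: the commit message above describes two blocking inference calls dispatched at once and polled on a short 0.1 s timeout so the UI can keep showing elapsed time. The same pattern can be sketched in isolation. This is a minimal sketch, independent of the app; fake_inference, call_model, and the delays are hypothetical stand-ins for the InferenceClient.text_generation calls:

import asyncio
import time
from functools import partial


def fake_inference(delay, text):
    # Hypothetical stand-in for a blocking call such as InferenceClient.text_generation
    time.sleep(delay)
    return text


async def call_model(delay, text):
    # Off-load the blocking call to the default thread pool so the event loop stays responsive
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partial(fake_inference, delay, text))


async def main():
    # Two "model calls" started concurrently, mirroring compare_models
    task1 = asyncio.create_task(call_model(0.3, "model 1 done"))
    task2 = asyncio.create_task(call_model(0.7, "model 2 done"))
    start = time.monotonic()

    # Poll roughly every 0.1 s until both tasks finish, reporting elapsed time
    while not (task1.done() and task2.done()):
        print(f"Generating... ({time.monotonic() - start:.1f}s)")
        await asyncio.wait([t for t in (task1, task2) if not t.done()],
                           timeout=0.1,
                           return_when=asyncio.FIRST_COMPLETED)

    print(await task1, "|", await task2)


if __name__ == "__main__":
    asyncio.run(main())

Off-loading the blocking client call to the default executor is what keeps the event loop free to run the 0.1 s polling loop, so the progress text can keep updating while both calls are still in flight.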
app.py CHANGED
@@ -1,7 +1,9 @@
 import os
 import gradio as gr
+import asyncio
 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient
+from functools import partial
 
 # Load environment variables
 load_dotenv()
@@ -22,7 +24,7 @@ AVAILABLE_MODELS = [
 # Initialize inference client
 inference_client = InferenceClient(token=HF_TOKEN)
 
-def get_model_response(prompt, model_name, temperature_value, do_sample):
+async def get_model_response(prompt, model_name, temperature_value, do_sample):
     """Get response from a Hugging Face model."""
     try:
         # Build kwargs dynamically
@@ -38,31 +40,80 @@ def get_model_response(prompt, model_name, temperature_value, do_sample):
         if do_sample and temperature_value > 0:
             generation_args["temperature"] = temperature_value
 
-        response = inference_client.text_generation(**generation_args)
+        # Run the inference in a thread pool to not block the event loop
+        loop = asyncio.get_event_loop()
+        response = await loop.run_in_executor(
+            None,
+            partial(inference_client.text_generation, **generation_args)
+        )
         return response
 
     except Exception as e:
         return f"Error: {str(e)}"
 
-def compare_models(prompt, model1, model2, temp1, temp2, do_sample1, do_sample2):
+async def process_single_response(prompt, model_name, temp, do_sample, chatbot):
+    """Process a single model response and update its chatbot."""
+    response = await get_model_response(prompt, model_name, temp, do_sample)
+    chat_history = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response}]
+    return chat_history
+
+async def compare_models(prompt, model1, model2, temp1, temp2, do_sample1, do_sample2):
     """Compare outputs from two selected models."""
     if not prompt.strip():
-        return (
-            [{"role": "user", "content": prompt}, {"role": "assistant", "content": "Please enter a prompt"}],
-            [{"role": "user", "content": prompt}, {"role": "assistant", "content": "Please enter a prompt"}],
-            gr.update(interactive=True)
-        )
+        empty_response = [{"role": "user", "content": prompt}, {"role": "assistant", "content": "Please enter a prompt"}]
+        yield empty_response, empty_response, gr.update(interactive=True)
+        return  # Exit the generator
 
-    response1 = get_model_response(prompt, model1, temp1, do_sample1)
-    response2 = get_model_response(prompt, model2, temp2, do_sample2)
+    # Initialize with "Generating..." messages
+    initial_message = [{"role": "user", "content": prompt}, {"role": "assistant", "content": "Generating..."}]
+    yield initial_message, initial_message, gr.update(interactive=False)
 
-    # Format responses for chatbot display
-    chat1 = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response1}]
-    chat2 = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response2}]
-
+    # Create tasks for both model responses
+    task1 = asyncio.create_task(process_single_response(prompt, model1, temp1, do_sample1, "chatbot1"))
+    task2 = asyncio.create_task(process_single_response(prompt, model2, temp2, do_sample2, "chatbot2"))
 
-    return chat1, chat2, gr.update(interactive=True)
-
+    chat1 = chat2 = initial_message
+    start_time = asyncio.get_event_loop().time()
+
+    try:
+        while not (task1.done() and task2.done()):
+            # Update the messages with elapsed time
+            elapsed = round(asyncio.get_event_loop().time() - start_time, 1)
+            chat1_content = chat1[1]["content"]
+            chat2_content = chat2[1]["content"]
+
+            if not task1.done():
+                chat1 = [{"role": "user", "content": prompt},
+                         {"role": "assistant", "content": f"Generating... ({elapsed:.1f}s)"}]
+            if not task2.done():
+                chat2 = [{"role": "user", "content": prompt},
+                         {"role": "assistant", "content": f"Generating... ({elapsed:.1f}s)"}]
+
+            # Check if any task completed
+            done, pending = await asyncio.wait([t for t in [task1, task2] if not t.done()],
+                                               timeout=0.1,
+                                               return_when=asyncio.FIRST_COMPLETED)
+
+            for task in done:
+                if task == task1:
+                    chat1 = await task1
+                else:
+                    chat2 = await task2
+
+            yield chat1, chat2, gr.update(interactive=False)
+
+        # Ensure we have both final results
+        if not task1.done():
+            chat1 = await task1
+        if not task2.done():
+            chat2 = await task2
+
+        # Final yield with both results
+        yield chat1, chat2, gr.update(interactive=True)
+
+    except Exception as e:
+        error_message = [{"role": "user", "content": prompt}, {"role": "assistant", "content": f"Error: {str(e)}"}]
+        yield error_message, error_message, gr.update(interactive=True)
 
 # Update temperature slider interactivity based on sampling checkbox
 def update_slider_state(enabled):
@@ -79,7 +130,7 @@ with gr.Blocks(css="""
     .disabled-slider { opacity: 0.5; pointer-events: none; }
 """) as demo:
     gr.Markdown("# LLM Comparison Tool")
-    gr.Markdown("Compare outputs from different Hugging Face models side by side.")
+    gr.Markdown("Using HuggingFace's Inference API, compare outputs from different `text-generation` models side by side.")
 
     with gr.Row():
         prompt = gr.Textbox(
@@ -117,7 +168,6 @@ with gr.Blocks(css="""
                 height=300,
                 type="messages"
             )
-
 
         with gr.Column():
             model2_dropdown = gr.Dropdown(
@@ -157,11 +207,10 @@ with gr.Blocks(css="""
     ).then(
         fn=compare_models,
         inputs=[prompt, model1_dropdown, model2_dropdown, temp1, temp2, do_sample1, do_sample2],
-        outputs=[chatbot1, chatbot2, submit_btn]
+        outputs=[chatbot1, chatbot2, submit_btn],
+        queue=True  # Enable queuing for streaming updates
     )
 
-
-
     do_sample1.change(
         fn=update_slider_state,
         inputs=[do_sample1],
@@ -175,5 +224,4 @@ with gr.Blocks(css="""
     )
 
 if __name__ == "__main__":
-    demo.launch()
-    # demo.launch(share=True)
+    demo.queue().launch()
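
For context on the outputs/queue=True and demo.queue().launch() changes at the end of the diff: Gradio can stream intermediate results when an event handler is a generator (or async generator) and queuing is enabled, so each yield in compare_models becomes a UI update. Below is a minimal, standalone sketch of that wiring; the count_up handler and the component names are illustrative and not part of this commit:

import asyncio
import gradio as gr


async def count_up(n):
    # Each yield becomes an intermediate update of the output textbox
    for i in range(int(n)):
        await asyncio.sleep(0.5)
        yield f"step {i + 1} of {int(n)}"


with gr.Blocks() as demo:
    steps = gr.Number(value=5, label="Steps")
    out = gr.Textbox(label="Progress")
    btn = gr.Button("Run")
    # queue=True routes this event through the queue so yielded values stream to the client
    btn.click(fn=count_up, inputs=[steps], outputs=[out], queue=True)

if __name__ == "__main__":
    # queue() enables the queue worker that delivers the intermediate updates
    demo.queue().launch()

The same mechanism is what lets compare_models push the "Generating... (Ns)" placeholders and then the final responses into the two chatbots as they arrive.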