Commit d85e33e
Parent(s): 4667b7d
feat: add max tokens control for model responses
app.py CHANGED
@@ -24,14 +24,14 @@ AVAILABLE_MODELS = [
 # Initialize inference client
 inference_client = InferenceClient(token=HF_TOKEN)
 
-async def get_model_response(prompt, model_name, temperature_value, do_sample):
+async def get_model_response(prompt, model_name, temperature_value, do_sample, max_tokens):
     """Get response from a Hugging Face model."""
     try:
         # Build kwargs dynamically
         generation_args = {
             "prompt": prompt,
             "model": model_name,
-            "max_new_tokens":
+            "max_new_tokens": max_tokens,
             "do_sample": do_sample,
             "return_full_text": False
         }
@@ -46,18 +46,23 @@ async def get_model_response(prompt, model_name, temperature_value, do_sample):
             None,
             partial(inference_client.text_generation, **generation_args)
         )
+
+        # Check if response might be truncated
+        if len(response) >= max_tokens * 4: # Rough estimate of tokens to characters ratio
+            response += "\n\n[Warning: Response may have been truncated. Try increasing the max tokens if the response seems incomplete.]"
+
         return response
 
     except Exception as e:
         return f"Error: {str(e)}"
 
-async def process_single_response(prompt, model_name, temp, do_sample, chatbot):
+async def process_single_response(prompt, model_name, temp, do_sample, max_tokens, chatbot):
     """Process a single model response and update its chatbot."""
-    response = await get_model_response(prompt, model_name, temp, do_sample)
+    response = await get_model_response(prompt, model_name, temp, do_sample, max_tokens)
     chat_history = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response}]
     return chat_history
 
-async def compare_models(prompt, model1, model2, temp1, temp2, do_sample1, do_sample2):
+async def compare_models(prompt, model1, model2, temp1, temp2, do_sample1, do_sample2, max_tokens1, max_tokens2):
     """Compare outputs from two selected models."""
     if not prompt.strip():
         empty_response = [{"role": "user", "content": prompt}, {"role": "assistant", "content": "Please enter a prompt"}]
@@ -69,8 +74,8 @@ async def compare_models(prompt, model1, model2, temp1, temp2, do_sample1, do_sa
     yield initial_message, initial_message, gr.update(interactive=False)
 
     # Create tasks for both model responses
-    task1 = asyncio.create_task(process_single_response(prompt, model1, temp1, do_sample1, "chatbot1"))
-    task2 = asyncio.create_task(process_single_response(prompt, model2, temp2, do_sample2, "chatbot2"))
+    task1 = asyncio.create_task(process_single_response(prompt, model1, temp1, do_sample1, max_tokens1, "chatbot1"))
+    task2 = asyncio.create_task(process_single_response(prompt, model2, temp2, do_sample2, max_tokens2, "chatbot2"))
 
     chat1 = chat2 = initial_message
     start_time = asyncio.get_event_loop().time()
@@ -162,6 +167,13 @@ with gr.Blocks(css="""
                 interactive=False,
                 elem_classes=["disabled-slider"]
             )
+            max_tokens1 = gr.Slider(
+                label="Maximum new tokens in response",
+                minimum=10,
+                maximum=2000,
+                step=10,
+                value=100
+            )
             chatbot1 = gr.Chatbot(
                 label="Model 1 Output",
                 show_label=True,
@@ -188,6 +200,13 @@ with gr.Blocks(css="""
                 interactive=False,
                 elem_classes=["disabled-slider"]
            )
+            max_tokens2 = gr.Slider(
+                label="Maximum new tokens in response",
+                minimum=10,
+                maximum=2000,
+                step=10,
+                value=100
+            )
             chatbot2 = gr.Chatbot(
                 label="Model 2 Output",
                 show_label=True,
@@ -206,7 +225,7 @@ with gr.Blocks(css="""
         queue=False
     ).then(
         fn=compare_models,
-        inputs=[prompt, model1_dropdown, model2_dropdown, temp1, temp2, do_sample1, do_sample2],
+        inputs=[prompt, model1_dropdown, model2_dropdown, temp1, temp2, do_sample1, do_sample2, max_tokens1, max_tokens2],
         outputs=[chatbot1, chatbot2, submit_btn],
         queue=True # Enable queuing for streaming updates
     )
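
The new truncation check estimates roughly four characters per token, so it can both miss real truncation and warn when nothing was cut off. If the inference endpoint reports generation details, the cutoff can instead be read from the finish reason. Below is a minimal sketch of that alternative, assuming huggingface_hub's InferenceClient.text_generation accepts details=True and exposes a finish_reason on the returned details (field names can vary between library versions); the helper is illustrative and not part of this commit.

from huggingface_hub import InferenceClient

def generate_with_finish_reason(client: InferenceClient, prompt: str, model: str, max_tokens: int) -> str:
    # Hypothetical helper: request generation metadata and flag truncation
    # from the reported finish reason rather than a character-count estimate.
    output = client.text_generation(
        prompt,
        model=model,
        max_new_tokens=max_tokens,
        return_full_text=False,
        details=True,  # ask the endpoint for metadata alongside the text
    )
    text = output.generated_text
    finish_reason = getattr(output.details, "finish_reason", None)
    # A finish reason of "length" means generation stopped at max_new_tokens.
    if finish_reason is not None and str(finish_reason).lower().endswith("length"):
        text += "\n\n[Warning: Response was cut off at the max tokens limit.]"
    return text

Compared with the character-count heuristic, this warns only when the endpoint actually stopped at the token limit, at the cost of requiring an endpoint that returns detailed output.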