"""Language Model Council Sandbox.

A Streamlit app that sends one prompt to a "council" of LLMs from several
providers (OpenAI, Anthropic, Together, Google), streams each member's answer,
then asks an aggregator model to synthesize the answers into one response.

Based on the LLM-as-a-council idea from https://arxiv.org/abs/2406.08598.
"""

import os
import time

import anthropic
import dotenv
import google.generativeai as genai
import openai
import streamlit as st
from openai import OpenAI
from together import Together

dotenv.load_dotenv()

# Shared password gating the app. None when APP_PASSWORD is unset, in which
# case no login attempt can succeed (equality against None is always False).
PASSWORD = os.getenv("APP_PASSWORD")

# Provider API keys, all sourced from the environment / .env file.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")

# Initialize provider clients once at import time.
together_client = Together(api_key=TOGETHER_API_KEY)
genai.configure(api_key=GOOGLE_API_KEY)

# Set up API clients for OpenAI and Anthropic.
# NOTE(review): organization/project ids are hard-coded; consider moving them
# to environment variables alongside the API keys.
openai.api_key = OPENAI_API_KEY
openai_client = OpenAI(
    organization="org-kUoRSK0nOw4W2nQYMVGWOt03",
    project="proj_zb6k1DdgnSEbiAEMWxSOVVu4",
)
# The Anthropic client reads ANTHROPIC_API_KEY from the environment itself.
anthropic_client = anthropic.Anthropic()

# Council presets. Each model is identified as "<provider>://<model-name>",
# where provider selects the API client used in get_llm_response().
# NOTE(review): "anthropic://claude-3-5-sonnet" may need a dated suffix
# (e.g. -20240620) depending on the Anthropic API version — confirm.
LLM_COUNCIL_MEMBERS = {
    "Smalls": [
        "openai://gpt-4o-mini",
        "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        "vertex://gemini-1.5-flash-001",
        "anthropic://claude-3-haiku-20240307",
    ],
    "Flagships": [
        "openai://gpt-4",
        "together://meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        "vertex://gemini-1.5-pro-001",
        "anthropic://claude-3-5-sonnet",
    ],
}

# Inline SVG avatars (data URIs) for the chat messages. Not every council
# member has an entry — look up with .get() so missing models fall back to
# Streamlit's default avatar instead of raising KeyError.
PROVIDER_TO_AVATAR_MAP = {
    "openai://gpt-4o-mini": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIwLjk5ZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjU2IDI2MCI+PHBhdGggZD0iTTIzOS4xODQgMTA2LjIwM2E2NC43MiA2NC43MiAwIDAgMC01LjU3Ni01My4xMDNDMjE5LjQ1MiAyOC40NTkgMTkxIDE1Ljc4NCAxNjMuMjEzIDIxLjc0QTY1LjU4NiA2NS41ODYgMCAwIDAgNTIuMDk2IDQ1LjIyYTY0LjcyIDY0LjcyIDAgMCAwLTQzLjIzIDMxLjM2Yy0xNC4zMSAyNC42MDItMTEuMDYxIDU1LjYzNCA4LjAzMyA3Ni43NGE2NC42NyA2NC42NyAwIDAgMCA1LjUyNSA1My4xMDJjMTQuMTc0IDI0LjY1IDQyLjY0NCAzNy4zMjQgNzAuNDQ2IDMxLjM2YTY0LjcyIDY0LjcyIDAgMCAwIDQ4Ljc1NCAyMS43NDRjMjguNDgxLjAyNSA1My43MTQtMTguMzYxIDYyLjQxNC00NS40ODFhNjQuNzcgNjQuNzcgMCAwIDAgNDMuMjI5LTMxLjM2YzE0LjEzNy0yNC41NTggMTAuODc1LTU1LjQyMy04LjA4My03Ni40ODNtLTk3LjU2IDEzNi4zMzhhNDguNCA0OC40IDAgMCAxLTMxLjEwNS0xMS4yNTVsMS41MzUtLjg3bDUxLjY3LTI5LjgyNWE4LjYgOC42IDAgMCAwIDQuMjQ3LTcuMzY3di03Mi44NWwyMS44NDUgMTIuNjM2Yy4yMTguMTExLjM3LjMyLjQwOS41NjN2NjAuMzY3Yy0uMDU2IDI2LjgxOC0yMS43ODMgNDguNTQ1LTQ4LjYwMSA0OC42MDFNMzcuMTU4IDE5Ny45M2E0OC4zNSA0OC4zNSAwIDAgMS01Ljc4MS0zMi41ODlsMS41MzQuOTIxbDUxLjcyMiAyOS44MjZhOC4zNCA4LjM0IDAgMCAwIDguNDQxIDBsNjMuMTgxLTM2LjQyNXYyNS4yMjFhLjg3Ljg3IDAgMCAxLS4zNTguNjY1bC01Mi4zMzUgMzAuMTg0Yy0yMy4yNTcgMTMuMzk4LTUyLjk3IDUuNDMxLTY2LjQwNC0xNy44MDNNMjMuNTQ5IDg1LjM4YTQ4LjUgNDguNSAwIDAgMSAyNS41OC0yMS4zMzN2NjEuMzlhOC4yOSA4LjI5IDAgMCAwIDQuMTk1IDcuMzE2bDYyLjg3NCAzNi4yNzJsLTIxLjg0NSAxMi42MzZhLjgyLjgyIDAgMCAxLS43NjcgMEw0MS4zNTMgMTUxLjUzYy0yMy4yMTEtMTMuNDU0LTMxLjE3MS00My4xNDQtMTcuODA0LTY2LjQwNXptMTc5LjQ2NiA0MS42OTVsLTYzLjA4LTM2LjYzTDE2MS43MyA3Ny44NmEuODIuODIgMCAwIDEgLjc2OCAwbDUyLjIzMyAzMC4xODRhNDguNiA0OC42IDAgMCAxLTcuMzE2IDg3LjYzNXYtNjEuMzkxYTguNTQgOC41NCAwIDAgMC00LjQtNy4yMTNtMjEuNzQyLTMyLjY5bC0xLjUzNS0uOTIybC01MS42MTktMzAuMDgxYTguMzkgOC4zOSAwIDAgMC04LjQ5MiAwTDk5Ljk4IDk5LjgwOFY3NC41ODdhLjcyLjcyIDAgMCAxIC4zMDctLjY2NWw1Mi4yMzMtMzAuMTMzYTQ4LjY1MiA0OC42NTIgMCAwIDEgNzIuMjM2IDUwLjM5MXpNODguMDYxIDEzOS4wOTdsLTIxLjg0NS0xMi41ODVhLjg3Ljg3IDAgMCAxLS40MS0uNjE0VjY1LjY4NWE0OC42NTIgNDguNjUyIDAgMCAxIDc5Ljc1Ny0zNy4zNDZsLTEuNTM1Ljg3bC01MS42NyAyOS44MjVhOC42IDguNiAwIDAgMC00LjI0NiA3LjM2N3ptMTEuODY4LTI1LjU4TDEyOC4wNjcgOTcuM2wyOC4xODggMTYuMjE4djMyLjQzNGwtMjguMDg2IDE2LjIxOGwtMjguMTg4LTE2LjIxOHoiLz48L3N2Zz4=",
    "anthropic://claude-3-5-sonnet": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
    "vertex://gemini-1.5-flash-001": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9IiM0MjY4ZmYiIGQ9Ik0yNCAxMi4wMjRjLTYuNDM3LjM4OC0xMS41OSA1LjUzOS0xMS45NzcgMTEuOTc2aC0uMDQ3QzExLjU4OCAxNy41NjMgNi40MzYgMTIuNDEyIDAgMTIuMDI0di0uMDQ3QzYuNDM3IDExLjU4OCAxMS41ODggNi40MzcgMTEuOTc2IDBoLjA0N2MuMzg4IDYuNDM3IDUuNTQgMTEuNTg4IDExLjk3NyAxMS45Nzd6Ii8+PC9zdmc+",
    "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
    "anthropic://claude-3-haiku-20240307": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
}

# Models eligible to synthesize the council's answers into a single response.
AGGREGATORS = ["openai://gpt-4", "openai://gpt-3.5-turbo"]


def anthropic_streamlit_streamer(stream):
    """Adapt an Anthropic streaming response for st.write_stream.

    :param stream: Streaming event iterator from the Anthropic Messages API.
    :return: Yields text deltas from content_block_delta events; stops at
        the message_stop event.
    """
    for event in stream:
        if hasattr(event, "type"):
            if event.type == "content_block_delta" and hasattr(event, "delta"):
                # Extract the incremental text from the delta, if any.
                text_delta = getattr(event.delta, "text", None)
                if text_delta:
                    yield text_delta
            elif event.type == "message_stop":
                break  # End of message, stop streaming.


def google_streamlit_streamer(stream):
    """Adapt a google.generativeai streaming response for st.write_stream."""
    for chunk in stream:
        yield chunk.text


def together_streamlit_streamer(stream):
    """Adapt a Together chat-completions stream for st.write_stream.

    The final chunk's delta may carry no content (None); skip such chunks so
    st.write_stream never receives None.
    """
    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            yield content


# Helper functions for LLM council and aggregator selection.
def llm_council_selector():
    """Render a radio picker for the council preset; return its model list."""
    selected_council = st.radio(
        "Choose a council configuration", options=list(LLM_COUNCIL_MEMBERS.keys())
    )
    return LLM_COUNCIL_MEMBERS[selected_council]


def aggregator_selector():
    """Render a radio picker for the aggregator model; return its identifier."""
    return st.radio("Choose an aggregator LLM", options=AGGREGATORS)


# API calls for different providers. Each returns a provider-native stream.
def get_openai_response(model_name, prompt):
    """Start a streaming OpenAI chat completion for a single user prompt."""
    return openai_client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )


# https://docs.anthropic.com/en/api/messages-streaming
def get_anthropic_response(model_name, prompt):
    """Start a streaming Anthropic message for a single user prompt."""
    return anthropic_client.messages.create(
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}],
        model=model_name,
        stream=True,
    )


def get_together_response(model_name, prompt):
    """Start a streaming Together chat completion for a single user prompt."""
    return together_client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )


# https://ai.google.dev/gemini-api/docs/text-generation?lang=python
def get_google_response(model_name, prompt):
    """Start a streaming Gemini generation for a single user prompt."""
    model = genai.GenerativeModel(model_name)
    return model.generate_content(prompt, stream=True)


# Maps the provider prefix of a model identifier to its request function.
_PROVIDER_DISPATCH = {
    "openai": get_openai_response,
    "anthropic": get_anthropic_response,
    "together": get_together_response,
    "vertex": get_google_response,
}


def get_llm_response(model_identifier, prompt):
    """Dispatch a prompt to the provider encoded in the model identifier.

    :param model_identifier: "<provider>://<model-name>" string.
    :param prompt: User prompt to send.
    :return: A provider-native response stream, or None when the provider
        prefix is unknown (callers treat None as "no stream available").
    """
    provider, _, model_name = model_identifier.partition("://")
    handler = _PROVIDER_DISPATCH.get(provider)
    if handler is None:
        return None
    return handler(model_name, prompt)


# Main Streamlit App
def main():
    st.set_page_config(
        page_title="Language Model Council Sandbox", page_icon="🏛️", layout="wide"
    )

    # Custom CSS for the chat display (currently empty placeholder).
    center_css = """ """
    st.markdown(center_css, unsafe_allow_html=True)

    # App title and description.
    st.title("Language Model Council Sandbox")
    st.markdown("###### Invoke a council of LLMs to generate and judge each other.")
    st.markdown("###### [ArXiv Paper](https://arxiv.org/abs/2406.08598)")

    # Authentication system: a single shared password from APP_PASSWORD.
    if "authenticated" not in st.session_state:
        st.session_state.authenticated = False

    cols = st.columns([2, 1, 2])
    if not st.session_state.authenticated:
        with cols[1]:
            password = st.text_input("Password", type="password")
            if st.button("Login", use_container_width=True):
                if password == PASSWORD:
                    st.session_state.authenticated = True
                else:
                    st.error("Invalid credentials")

    if st.session_state.authenticated:
        st.success("Logged in successfully!")

        # Council and aggregator selection.
        selected_models = llm_council_selector()
        st.write("Selected Models:", selected_models)
        selected_aggregator = aggregator_selector()
        st.write("Selected Aggregator:", selected_aggregator)

        # Prompt input.
        prompt = st.text_area("Enter your prompt:")

        if st.button("Submit"):
            st.write("Responses:")

            # Fetch and stream responses from each selected council member.
            for model in selected_models:
                with st.chat_message(
                    model,
                    # .get() so models without an avatar fall back to the
                    # Streamlit default instead of raising KeyError
                    # (e.g. the "Flagships" council members).
                    avatar=PROVIDER_TO_AVATAR_MAP.get(model),
                ):
                    message_placeholder = st.empty()
                    stream = get_llm_response(model, prompt)
                    if stream:
                        if model.startswith("anthropic"):
                            stream = anthropic_streamlit_streamer(stream)
                        elif model.startswith("vertex"):
                            stream = google_streamlit_streamer(stream)
                        elif model.startswith("together"):
                            stream = together_streamlit_streamer(stream)
                        # write_stream returns the full concatenated text;
                        # store it so the aggregator prompt below actually
                        # contains each member's response.
                        st.session_state[model] = message_placeholder.write_stream(
                            stream
                        )

            # Construct the aggregator prompt from the stored responses.
            aggregator_prompt = f"User prompt: {prompt}\n\n"
            aggregator_prompt += "Responses from other LLMs:\n"
            aggregator_prompt += "\n".join(
                [
                    f"{model}: {st.session_state.get(model, '')}"
                    for model in selected_models
                ]
            )
            aggregator_prompt += "\n\nPlease provide an aggregated response."

            # Fetch and stream the aggregator's synthesis.
            st.write(f"Aggregated response from {selected_aggregator}:")
            with st.chat_message(selected_aggregator):
                message_placeholder = st.empty()
                aggregator_stream = get_llm_response(
                    selected_aggregator, aggregator_prompt
                )
                if aggregator_stream:
                    message_placeholder.write_stream(aggregator_stream)
    else:
        with cols[1]:
            st.warning("Please log in to access this app.")


if __name__ == "__main__":
    main()

# Design notes / roadmap:
#
# Choose your council.
#  Pre-selected.
#   Smalls: GPT-4o-mini, llama-3.1-70b, qwen-2.0-70b
#   Flagships: GPT-4o, llama-3.1-405b, qwen-2.0-110b, gemini, claude-3.5-sonnet
#   Best: chatgpt-4o-latest, gemini-1.5-pro-exp-0827, grok-2-2024-08-13, claude-3-5-sonnet-20240620, llama-3.1-405b-instruct
#  Custom:
#   Choose from a list of available models.
#  All:
#   All available models.
#
# Choose aggregator.
#  Aggregators are models proficient in synthesizing responses from other models into a single, highquality output. An effective aggregator should maintain or enhance output quality even when
#  integrating inputs that are of lesser quality than its own.
#  Choices:
#   - 4o-latest
#   - gemini-1.5
#   - grok-2
#   - claude-3.5-sonnet
#   - llama-3.1-405b-instruct
#
# Provide a prompt. (Or pre-canned prompts.)
# Paste chat history.
#
# Checkbox, enable judging.
# If checked, Judging config:
#  Single sided
#   Provide criteria. (or default).
#  If pairwise, choose granularity (or default).
#   Choose criteria. (or default).
#   Enable position swapping?
#
# Go button.
#
# Sections.
#  1. Model outputs.
#  2. Aggregated output.
#  3. Judging underneath each output.
#   Highlight in green, the output that was best, as determined by council.
#    Show graph breakdown of scores and justifications. (by criteria, # wins and # losses)
#    Show final overall score.
#   Highlight in red, the output that was worst, as determined by council.
#
# Judging section.
#  Show agreement matrix.
#  Show bar graph of self-bias.
#  Plot contrarianism vs. conviction (scatter plot)
#  Show per-judge scores.
#
# Calculate total cost.
# Show total tokens used.