import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face repository details
MODEL_ID = "meta-llama/CodeLlama-7b-Instruct-hf"


def load_model():
    """Load the Hugging Face model and tokenizer."""
    try:
        st.write("Loading model and tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            device_map="auto",
            torch_dtype=torch.float16
        )
        st.write("Model and tokenizer successfully loaded.")
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None, None


# Load the model and tokenizer once and cache them across Streamlit reruns
@st.cache_resource
def get_model():
    return load_model()


tokenizer, model = get_model()

# Streamlit UI
st.title("Medical Chatbot")
st.write("This chatbot provides medical assistance. Type your question below!")

if model is None or tokenizer is None:
    st.error("Model failed to load. Please check the Hugging Face model path or environment configuration.")
else:
    user_input = st.text_input("You:", placeholder="Enter your medical question here...", key="input_box")

    if st.button("Send"):
        if user_input.strip():
            # Construct the prompt
            SYSTEM_PROMPT = "You are a helpful medical assistant. Provide accurate and concise answers."
            full_prompt = f"{SYSTEM_PROMPT}\nUser: {user_input}\nAssistant:"

            # Tokenize the input and move it to the same device as the model
            # (works whether the model ended up on GPU or CPU via device_map="auto")
            inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True).to(model.device)

            try:
                # Generate the response
                outputs = model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_new_tokens=200,                    # Limit the length of the generated answer
                    do_sample=True,                        # Enable sampling so temperature/top_p take effect
                    temperature=0.7,                       # Control randomness
                    top_p=0.9,                             # Nucleus (top-p) sampling
                    pad_token_id=tokenizer.eos_token_id
                )

                # Decode and display only the assistant's part of the response
                response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("Assistant:")[-1].strip()
                st.write(f"**Model:** {response}")
            except Exception as e:
                st.error(f"Error generating response: {e}")
        else:
            st.warning("Please enter a valid question.")
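
# Usage notes (a sketch, assuming this script is saved as app.py):
#   pip install streamlit transformers torch accelerate
#   streamlit run app.py
# The accelerate package is required for device_map="auto", and loading a 7B model
# in float16 needs roughly 14 GB of GPU memory. The meta-llama repositories on the
# Hugging Face Hub are gated, so accept the license and authenticate
# (e.g. `huggingface-cli login`) before the first download.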