import gradio as gr
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model name
model_name = "OpenGVLab/InternVideo2_5_Chat_8B"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the model in float16. GPU placement is handled by DeepSpeed below,
# so device_map="auto" is omitted (it conflicts with kernel injection).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Wrap the model with DeepSpeed Inference (kernel injection, not ZeRO-3).
# init_inference takes the model as its first argument and returns an
# engine whose optimized module is exposed as .module
ds_engine = deepspeed.init_inference(
    model,
    dtype=torch.float16,              # float16 for efficiency
    replace_with_kernel_inject=True,  # swap in optimized inference kernels
)
model = ds_engine.module

# Define inference function
def chat_with_model(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # max_new_tokens bounds only the generated text, unlike max_length,
    # which also counts the prompt tokens
    output = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Create Gradio UI (the legacy theme="compact" option was removed in
# Gradio 3+, so it is dropped here)
demo = gr.Interface(
    fn=chat_with_model,
    inputs=gr.Textbox(placeholder="Type your prompt here..."),
    outputs="text",
    title="InternVideo2.5 Chatbot",
    description="A chatbot powered by InternVideo2_5_Chat_8B.",
)

# Run the Gradio app
if __name__ == "__main__":
    demo.launch()
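
# Usage note (an assumption, not part of the original snippet): DeepSpeed
# inference scripts are typically started through the deepspeed launcher
# rather than plain `python`, e.g. for a single GPU:
#
#   deepspeed --num_gpus 1 app.py
#
# where "app.py" is a hypothetical filename for this script. Launching with
# plain `python app.py` also works for single-GPU kernel injection, but the
# launcher is required for multi-GPU tensor-parallel inference.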