from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Model name
model_name = "MONAI/Llama3-VILA-M3-8B"

# Load tokenizer and model with trust_remote_code=True
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use float16 for faster, lower-memory inference
    device_map="auto",          # Automatically place the model on available GPU/CPU
    trust_remote_code=True,     # Allow loading custom model code if needed
)

# Example input prompt
prompt = "Explain the findings in this chest X-ray report:"

# Tokenize input and move it to the model's device
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate response (max_new_tokens caps generated tokens, excluding the prompt)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=200)

# Decode and print response
response = tokenizer.decode(output[0], skip_special_tokens=True)
print("\nGenerated Response:\n", response)
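
# Optional: a minimal sketch of sampling-based decoding, reusing `model`,
# `tokenizer`, and `inputs` from above. do_sample, temperature, and top_p are
# standard transformers generate() arguments; the values below are illustrative
# assumptions, not settings tuned or recommended for this particular model.
with torch.no_grad():
    sampled_output = model.generate(
        **inputs,
        max_new_tokens=200,  # Cap on newly generated tokens
        do_sample=True,      # Sample instead of greedy decoding
        temperature=0.7,     # Lower values give more deterministic output
        top_p=0.9,           # Nucleus sampling threshold
    )
print(tokenizer.decode(sampled_output[0], skip_special_tokens=True))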