import torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor
import gradio as gr

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_on_gpu = device.type == "cuda"

# Single source of truth for the checkpoint used by both loaders below.
MODEL_ID = 'OpenGVLab/InternVL2_5-1B'

# Load the model.
# Half precision and Flash Attention are only requested on the GPU: the CPU
# fallback uses float32 and the default attention path so the script can
# still start on machines without CUDA.
model = AutoModel.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if _on_gpu else torch.float32,
    low_cpu_mem_usage=True,
    trust_remote_code=True,  # the checkpoint ships its own modeling code
    use_flash_attn=_on_gpu,
).to(device).eval()  # move to the selected device and switch to inference mode

# Load the image processor that turns PIL images into model-ready tensors.
image_processor = CLIPImageProcessor.from_pretrained(MODEL_ID)

# Define the function to process the image and generate outputs
def process_image(image):
    """Run one uploaded image through the model and describe the result.

    Args:
        image: The PIL image supplied by the Gradio image input, or ``None``
            when the form is submitted without an upload.

    Returns:
        ``"Output Shape: ..."`` with the shape of
        ``outputs.last_hidden_state`` on success, otherwise a string
        starting with ``"Error: "`` describing what went wrong.
    """
    # Gradio passes None when nothing was uploaded; report that directly
    # instead of surfacing an AttributeError from ``None.convert``.
    if image is None:
        return "Error: no image provided"

    try:
        # Normalise palette/alpha/greyscale inputs to three-channel RGB.
        rgb_image = image.convert('RGB')

        # Preprocess the image into a batched tensor.
        pixel_values = image_processor(images=rgb_image, return_tensors='pt').pixel_values
        # Match both the model's device and its parameter dtype: the model
        # may have been loaded in half precision, and feeding it a tensor of
        # a different floating-point dtype raises a dtype-mismatch error.
        pixel_values = pixel_values.to(device=device, dtype=model.dtype)

        # Run the model without tracking gradients (inference only).
        with torch.no_grad():
            outputs = model(pixel_values)

        # NOTE(review): assumes the model's forward accepts pixel_values alone
        # and that its output exposes ``last_hidden_state`` -- confirm against
        # the checkpoint's remote modeling code.
        return f"Output Shape: {outputs.last_hidden_state.shape}"
    except Exception as e:
        # UI boundary: show the failure in the textbox rather than crashing
        # the request handler.
        return f"Error: {str(e)}"

# Build the web UI: a single image upload wired to process_image, with the
# returned text shown in a labelled textbox.
demo = gr.Interface(
    title="InternVL2_5 Demo",
    description="Upload an image to process it using the InternVL2_5-1B model from OpenGVLab.",
    fn=process_image,
    inputs=gr.Image(type="pil"),  # hand the upload to the callback as a PIL image
    outputs=gr.Textbox(label="Model Output"),
)

# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    # Listen on every network interface, port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")