import gradio as gr
import cv2
from PIL import Image
import torch
from transformers import AutoProcessor, CLIPVisionModel

from detection import detect_image, detect_video
from model import LinearClassifier


def load_model(detection_type):
    """Load the CLIP vision backbone and the linear detection head for the
    requested detection type ("facial" or "general")."""
    device = torch.device("cpu")
    processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
    # output_attentions=True so the detect_* functions can build an attention map.
    clip_model = CLIPVisionModel.from_pretrained(
        "openai/clip-vit-large-patch14", output_attentions=True
    )

    # The checkpoint holds only the linear head; infer its input size from the
    # stored weight matrix rather than hard-coding it.
    model_path = f"pretrained_models/{detection_type}/clip_weights.pth"
    checkpoint = torch.load(model_path, map_location="cpu")
    input_dim = checkpoint["linear.weight"].shape[1]
    detection_model = LinearClassifier(input_dim)
    detection_model.load_state_dict(checkpoint)
    detection_model = detection_model.to(device)
    return processor, clip_model, detection_model


def process_image(image, detection_type):
    # Note: models are reloaded on every request; cache them if latency matters.
    processor, clip_model, detection_model = load_model(detection_type)
    results = detect_image(image, processor, clip_model, detection_model)
    return results["pred_score"], results["attn_map"]


def process_video(video, detection_type):
    processor, clip_model, detection_model = load_model(detection_type)

    # Decode the video into a list of RGB PIL frames.
    cap = cv2.VideoCapture(video)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes as BGR
        frames.append(Image.fromarray(frame))
    cap.release()

    results = detect_video(frames, processor, clip_model, detection_model)
    return results["pred_score"], results["attn_map"]


def change_input(input_type):
    """Toggle visibility of the image/video inputs to match the radio choice."""
    if input_type == "Image":
        return gr.update(visible=True), gr.update(visible=False)
    return gr.update(visible=False), gr.update(visible=True)


def process_input(input_type, model_type, image, video):
    detection_type = "facial" if model_type == "Facial" else "general"
    if input_type == "Image" and image is not None:
        return process_image(image, detection_type)
    if input_type == "Video" and video is not None:
        return process_video(video, detection_type)
    return None, None


with gr.Blocks() as demo:
    gr.Markdown("## Deepfake Detection: Facial / General")
    input_type = gr.Radio(["Image", "Video"], label="Choose Input Type", value="Image")
    model_type = gr.Radio(["Facial", "General"], label="Choose Model Type", value="General")
    image_input = gr.Image(type="pil", label="Upload Image", visible=True)
    video_input = gr.Video(label="Upload Video", visible=False)
    process_button = gr.Button("Run Model")
    pred_score_output = gr.Textbox(label="Prediction Score")
    attn_map_output = gr.Image(type="pil", label="Attention Map")

    input_type.change(fn=change_input, inputs=[input_type], outputs=[image_input, video_input])
    process_button.click(
        fn=process_input,
        inputs=[input_type, model_type, image_input, video_input],
        outputs=[pred_score_output, attn_map_output],
    )


if __name__ == "__main__":
    demo.launch()
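
# ---------------------------------------------------------------------------
# For reference only: a minimal sketch of the `LinearClassifier` imported from
# model.py, kept commented out so it does not shadow the real module. The
# checkpoint loaded in load_model() contains a "linear.weight" key, which
# suggests a single nn.Linear layer; the output width (1) and the fact that
# detect_* applies the sigmoid are assumptions, not confirmed by this file.
#
#   import torch.nn as nn
#
#   class LinearClassifier(nn.Module):
#       def __init__(self, input_dim):
#           super().__init__()
#           self.linear = nn.Linear(input_dim, 1)  # assumed one real/fake logit
#
#       def forward(self, x):
#           return self.linear(x)  # raw logit; sigmoid presumably applied downstream
# ---------------------------------------------------------------------------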