import gradio as gr
import cv2
from PIL import Image
import torch
from transformers import AutoProcessor, CLIPVisionModel

from detection import detect_image, detect_video
from model import LinearClassifier


def load_model(detection_type):
    """Load the CLIP vision backbone and the linear detection head for the
    requested detection type ("facial" or "general")."""
    device = torch.device("cpu")
    processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
    # output_attentions=True so the detect_* functions can build an attention map.
    clip_model = CLIPVisionModel.from_pretrained(
        "openai/clip-vit-large-patch14", output_attentions=True
    )

    # The checkpoint holds only the linear head; infer its input size from the
    # stored weight matrix rather than hard-coding it.
    model_path = f"pretrained_models/{detection_type}/clip_weights.pth"
    checkpoint = torch.load(model_path, map_location="cpu")
    input_dim = checkpoint["linear.weight"].shape[1]
    detection_model = LinearClassifier(input_dim)
    detection_model.load_state_dict(checkpoint)
    detection_model = detection_model.to(device)
    return processor, clip_model, detection_model


def process_image(image, detection_type):
    # Note: models are reloaded on every request; cache them if latency matters.
    processor, clip_model, detection_model = load_model(detection_type)
    results = detect_image(image, processor, clip_model, detection_model)
    return results["pred_score"], results["attn_map"]


def process_video(video, detection_type):
    processor, clip_model, detection_model = load_model(detection_type)

    # Decode the video into a list of RGB PIL frames.
    cap = cv2.VideoCapture(video)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes as BGR
        frames.append(Image.fromarray(frame))
    cap.release()

    results = detect_video(frames, processor, clip_model, detection_model)
    return results["pred_score"], results["attn_map"]


def change_input(input_type):
    """Toggle visibility of the image/video inputs to match the radio choice."""
    if input_type == "Image":
        return gr.update(visible=True), gr.update(visible=False)
    return gr.update(visible=False), gr.update(visible=True)


def process_input(input_type, model_type, image, video):
    detection_type = "facial" if model_type == "Facial" else "general"
    if input_type == "Image" and image is not None:
        return process_image(image, detection_type)
    if input_type == "Video" and video is not None:
        return process_video(video, detection_type)
    return None, None


with gr.Blocks() as demo:
    gr.Markdown("## Deepfake Detection: Facial / General")
    input_type = gr.Radio(["Image", "Video"], label="Choose Input Type", value="Image")
    model_type = gr.Radio(["Facial", "General"], label="Choose Model Type", value="General")
    image_input = gr.Image(type="pil", label="Upload Image", visible=True)
    video_input = gr.Video(label="Upload Video", visible=False)
    process_button = gr.Button("Run Model")
    pred_score_output = gr.Textbox(label="Prediction Score")
    attn_map_output = gr.Image(type="pil", label="Attention Map")

    input_type.change(fn=change_input, inputs=[input_type], outputs=[image_input, video_input])
    process_button.click(
        fn=process_input,
        inputs=[input_type, model_type, image_input, video_input],
        outputs=[pred_score_output, attn_map_output],
    )


if __name__ == "__main__":
    demo.launch()
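
# ---------------------------------------------------------------------------
# For reference only: a minimal sketch of the `LinearClassifier` imported from
# model.py, kept commented out so it does not shadow the real module. The
# checkpoint loaded in load_model() contains a "linear.weight" key, which
# suggests a single nn.Linear layer; the output width (1) and the fact that
# detect_* applies the sigmoid are assumptions, not confirmed by this file.
#
#   import torch.nn as nn
#
#   class LinearClassifier(nn.Module):
#       def __init__(self, input_dim):
#           super().__init__()
#           self.linear = nn.Linear(input_dim, 1)  # assumed one real/fake logit
#
#       def forward(self, x):
#           return self.linear(x)  # raw logit; sigmoid presumably applied downstream
# ---------------------------------------------------------------------------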