"""Gradio demo for CLIP-based deepfake detection with facial and general models."""

import gradio as gr
import cv2
from PIL import Image
import torch
from transformers import AutoProcessor, CLIPVisionModel

from detection import detect_image, detect_video
from model import LinearClassifier


def load_model(detection_type):
    """Load the CLIP vision backbone and the linear detection head for the given type."""
    device = torch.device("cpu")
    processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
    # output_attentions=True is needed to build the attention map visualization.
    clip_model = CLIPVisionModel.from_pretrained(
        "openai/clip-vit-large-patch14", output_attentions=True
    )

    model_path = f"pretrained_models/{detection_type}/clip_weights.pth"
    checkpoint = torch.load(model_path, map_location="cpu")
    # Infer the classifier's input dimension from the checkpoint weights.
    input_dim = checkpoint["linear.weight"].shape[1]
    detection_model = LinearClassifier(input_dim)
    detection_model.load_state_dict(checkpoint)
    detection_model = detection_model.to(device)
    return processor, clip_model, detection_model


def process_image(image, detection_type):
    processor, clip_model, detection_model = load_model(detection_type)
    results = detect_image(image, processor, clip_model, detection_model)
    return results["pred_score"], results["attn_map"]


def process_video(video, detection_type):
    processor, clip_model, detection_model = load_model(detection_type)

    # Decode every frame to an RGB PIL image.
    cap = cv2.VideoCapture(video)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(Image.fromarray(frame))
    cap.release()

    results = detect_video(frames, processor, clip_model, detection_model)
    return results["pred_score"], results["attn_map"]


def change_input(input_type):
    """Toggle visibility of the image/video upload components."""
    if input_type == "Image":
        return gr.update(visible=True), gr.update(visible=False)
    elif input_type == "Video":
        return gr.update(visible=False), gr.update(visible=True)
    # Unknown input type: leave both components unchanged.
    return gr.update(), gr.update()


def determine_model_type(image_path):
    """Infer the model type from an example file path."""
    if "facial" in image_path.lower():
        return "Facial"
    elif "general" in image_path.lower():
        return "General"
    else:
        return "Facial"  # default


def process_input(input_type, model_type, image, video):
    detection_type = "facial" if model_type == "Facial" else "general"
    if input_type == "Image" and image is not None:
        return process_image(image, detection_type)
    elif input_type == "Video" and video is not None:
        return process_video(video, detection_type)
    else:
        return None, None


def process_example(image_path):
    """Load an example image and select the matching model type."""
    model_type = determine_model_type(image_path)
    return Image.open(image_path), model_type


example_images = [
    "examples/fake/facial.jpg",
    "examples/fake/general.jpg",
    "examples/real/facial.jpg",
    "examples/real/general.jpg",
]

with gr.Blocks() as demo:
    gr.Markdown("## Deepfake Detection: Facial / General")

    input_type = gr.Radio(["Image", "Video"], label="Choose Input Type", value="Image")
    model_type = gr.Radio(["Facial", "General"], label="Choose Model Type", value="General")
    image_input = gr.Image(type="pil", label="Upload Image", visible=True)
    video_input = gr.Video(label="Upload Video", visible=False)
    process_button = gr.Button("Run Model")
    pred_score_output = gr.Textbox(label="Prediction Score")
    attn_map_output = gr.Image(type="pil", label="Attention Map")

    # Example images: clicking one loads it and selects the matching model type.
    gr.Examples(
        examples=example_images,
        inputs=[image_input],
        outputs=[image_input, model_type],
        fn=process_example,
        cache_examples=False,
        run_on_click=True,  # examples are not cached, so run process_example on click
    )

    input_type.change(fn=change_input, inputs=[input_type], outputs=[image_input, video_input])
    process_button.click(
        fn=process_input,
        inputs=[input_type, model_type, image_input, video_input],
        outputs=[pred_score_output, attn_map_output],
    )

if __name__ == "__main__":
    demo.launch()
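
# --- Reference sketch (not executed) ----------------------------------------
# load_model() reads the checkpoint key "linear.weight", so model.LinearClassifier
# is assumed to be a single linear head over pooled CLIP features. A minimal
# sketch under that assumption; the actual definition in model.py may differ
# (e.g., output dimension or final activation):
#
#     import torch
#     import torch.nn as nn
#
#     class LinearClassifier(nn.Module):
#         def __init__(self, input_dim):
#             super().__init__()
#             # nn.Linear stores weights as (out_features, in_features), which is
#             # why load_model() reads shape[1] as the input dimension.
#             self.linear = nn.Linear(input_dim, 1)  # one real/fake logit
#
#         def forward(self, features):
#             # features: (batch, input_dim) CLIP embeddings -> fake probability
#             return torch.sigmoid(self.linear(features)).squeeze(-1)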