Update app.py
app.py
CHANGED
@@ -3,8 +3,7 @@ import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader
 from torchvision import transforms
-from transformers import
-from transformers.models.clip import CLIPModel
+from transformers import BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
 import numpy as np
 import io
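For reference, the two classes pulled in by the new import are the standard Hugging Face BLIP captioning pair. A minimal, self-contained captioning sketch with them looks like this (the checkpoint name, image path, and generation length are illustrative assumptions, not necessarily what app.py uses):

```python
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Assumed checkpoint for illustration; the app may load a different one.
checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint)

image = Image.open("example.jpg").convert("RGB")   # placeholder path
inputs = processor(images=image, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(output_ids[0], skip_special_tokens=True))
```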
@@ -16,6 +15,7 @@ from unsloth import FastVisionModel
 import os
 import tempfile
 import warnings
+from gradcam_xception import load_xception_model, generate_smoothgrad_visualizations_xception, get_xception_transform
 warnings.filterwarnings("ignore", category=UserWarning)
 
 # App title and description
@@ -42,14 +42,13 @@ def check_gpu():
 # Sidebar components
 st.sidebar.title("About")
 st.sidebar.markdown("""
-This tool detects deepfakes using
-- **
-- **GradCAM**: Highlights suspicious regions
+This tool detects deepfakes using three AI models:
+- **Xception**: Initial Real/Fake classification
 - **BLIP**: Describes image content
 - **Llama 3.2**: Explains potential manipulations
 
 ### Quick Start
-1. **Load Models** - Start with
+1. **Load Models** - Start with Xception, add others as needed
 2. **Upload Image** - View classification and heat map
 3. **Analyze** - Get explanations and ask questions
 
@@ -72,8 +71,7 @@ if use_custom_instructions:
 else:
     custom_instruction = ""
 
-# ----- GradCAM Implementation -----
-
+# ----- GradCAM Implementation for Xception -----
 class ImageDataset(torch.utils.data.Dataset):
     def __init__(self, image, transform=None, face_only=True, dataset_name=None):
         self.image = image
@@ -149,262 +147,45 @@ class ImageDataset(torch.utils.data.Dataset):
 
         return image_tensor, label, "uploaded_image", original_image, None, self.dataset_name
 
-class GradCAM:
-    def __init__(self, model, target_layer):
-        self.model = model
-        self.target_layer = target_layer
-        self.gradients = None
-        self.activations = None
-        self._register_hooks()
-
-    def _register_hooks(self):
-        def forward_hook(module, input, output):
-            if isinstance(output, tuple):
-                self.activations = output[0]
-            else:
-                self.activations = output
-
-        def backward_hook(module, grad_in, grad_out):
-            if isinstance(grad_out, tuple):
-                self.gradients = grad_out[0]
-            else:
-                self.gradients = grad_out
-
-        layer = dict([*self.model.named_modules()])[self.target_layer]
-        layer.register_forward_hook(forward_hook)
-        layer.register_backward_hook(backward_hook)
-
-    def generate(self, input_tensor, class_idx):
-        self.model.zero_grad()
-
-        try:
-            # Use only the vision part of the model for gradient calculation
-            vision_outputs = self.model.vision_model(pixel_values=input_tensor)
-
-            # Get the pooler output
-            features = vision_outputs.pooler_output
-
-            # Create a dummy gradient for the feature based on the class idx
-            one_hot = torch.zeros_like(features)
-            one_hot[0, class_idx] = 1
-
-            # Manually backpropagate
-            features.backward(gradient=one_hot)
-
-            # Check for None values
-            if self.gradients is None or self.activations is None:
-                st.warning("Warning: Gradients or activations are None. Using fallback CAM.")
-                return np.ones((14, 14), dtype=np.float32) * 0.5
-
-            # Process gradients and activations for transformer-based model
-            gradients = self.gradients.cpu().detach().numpy()
-            activations = self.activations.cpu().detach().numpy()
-
-            if len(activations.shape) == 3:  # [batch, sequence_length, hidden_dim]
-                seq_len = activations.shape[1]
-
-                # CLIP ViT typically has 196 patch tokens (14×14) + 1 class token = 197
-                if seq_len >= 197:
-                    # Skip the class token (first token) and reshape the patch tokens into a square
-                    patch_tokens = activations[0, 1:197, :]  # Remove the class token
-                    # Take the mean across the hidden dimension
-                    token_importance = np.mean(np.abs(patch_tokens), axis=1)
-                    # Reshape to the expected grid size (14×14 for CLIP ViT)
-                    cam = token_importance.reshape(14, 14)
-                else:
-                    # Try to find factors close to a square
-                    side_len = int(np.sqrt(seq_len))
-                    # Use the mean across features as importance
-                    token_importance = np.mean(np.abs(activations[0]), axis=1)
-                    # Create as square-like shape as possible
-                    cam = np.zeros((side_len, side_len))
-                    # Fill the cam with available values
-                    flat_cam = cam.flatten()
-                    flat_cam[:min(len(token_importance), len(flat_cam))] = token_importance[:min(len(token_importance), len(flat_cam))]
-                    cam = flat_cam.reshape(side_len, side_len)
-            else:
-                # Fallback
-                st.info("Using fallback CAM shape (14x14)")
-                cam = np.ones((14, 14), dtype=np.float32) * 0.5  # Default fallback
-
-            # Ensure we have valid values
-            cam = np.maximum(cam, 0)
-            if np.max(cam) > 0:
-                cam = cam / np.max(cam)
-
-            return cam
-
-        except Exception as e:
-            st.error(f"Error in GradCAM.generate: {str(e)}")
-            return np.ones((14, 14), dtype=np.float32) * 0.5
-
-def overlay_cam_on_image(image, cam, face_box=None, alpha=0.5):
-    """Overlay the CAM on the image"""
-    if face_box is not None:
-        x, y, w, h = face_box
-        # Create a mask for the entire image (all zeros initially)
-        img_np = np.array(image)
-        full_h, full_w = img_np.shape[:2]
-        full_cam = np.zeros((full_h, full_w), dtype=np.float32)
-
-        # Resize CAM to match face region
-        face_cam = cv2.resize(cam, (w, h))
-
-        # Copy the face CAM into the full image CAM at the face position
-        full_cam[y:y+h, x:x+w] = face_cam
-
-        # Convert full CAM to image
-        cam_resized = Image.fromarray((full_cam * 255).astype(np.uint8))
-        cam_colormap = plt.cm.jet(np.array(cam_resized) / 255.0)[:, :, :3]  # Apply colormap
-        cam_colormap = (cam_colormap * 255).astype(np.uint8)
-    else:
-        # Resize CAM to match image dimensions
-        img_np = np.array(image)
-        h, w = img_np.shape[:2]
-        cam_resized = cv2.resize(cam, (w, h))
-
-        # Apply colormap
-        cam_colormap = plt.cm.jet(cam_resized)[:, :, :3]  # Apply colormap
-        cam_colormap = (cam_colormap * 255).astype(np.uint8)
-
-    # Blend the original image with the colormap
-    img_np_float = img_np.astype(float) / 255.0
-    cam_colormap_float = cam_colormap.astype(float) / 255.0
-
-    blended = img_np_float * (1 - alpha) + cam_colormap_float * alpha
-    blended = (blended * 255).astype(np.uint8)
-
-    return Image.fromarray(blended)
-
-def save_comparison(image, cam, overlay, face_box=None):
-    """Create a side-by-side comparison of the original, CAM, and overlay"""
-    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
-
-    # Original Image
-    axes[0].imshow(image)
-    axes[0].set_title("Original")
-    if face_box is not None:
-        x, y, w, h = face_box
-        rect = plt.Rectangle((x, y), w, h, edgecolor='lime', linewidth=2, fill=False)
-        axes[0].add_patch(rect)
-    axes[0].axis("off")
-
-    # CAM
-    if face_box is not None:
-        # Create a full image CAM that highlights only the face
-        img_np = np.array(image)
-        h, w = img_np.shape[:2]
-        full_cam = np.zeros((h, w))
-    else:
-    axes[1].set_title("CAM")
-    axes[1].axis("off")
-
-    # Overlay
-    axes[2].imshow(overlay)
-    axes[2].set_title("Overlay")
-    axes[2].axis("off")
-
-    # Convert plot to PIL Image for Streamlit display
-    buf = io.BytesIO()
-    plt.savefig(buf, format="png", bbox_inches="tight")
-    plt.close()
-    buf.seek(0)
-    return Image.open(buf)
-
-# Function to load GradCAM CLIP model
-@st.cache_resource
-def
-    try:
-        model =
-        model.
-        model.classification_head.weight.data.normal_(mean=0.0, std=0.02)
-        model.classification_head.bias.data.zero_()
-
-        model.eval()
-        return model
-    except Exception as e:
-        st.error(f"Error loading CLIP model: {str(e)}")
-        return None
-
-def get_target_layer_clip(model):
-    """Get the target layer for GradCAM"""
-    return "vision_model.encoder.layers.23"
-
-def process_image_with_gradcam(image, model, device, pred_class):
-    """Process an image with GradCAM"""
-    # Set up transformations
-    transform = transforms.Compose([
-        transforms.Resize((224, 224)),
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
-    ])
-
-    # Create dataset for the single image
-    dataset = ImageDataset(image, transform=transform, face_only=True)
-
-    # Custom collate function
-    def custom_collate(batch):
-        tensors = [item[0] for item in batch]
-        labels = [item[1] for item in batch]
-        paths = [item[2] for item in batch]
-        images = [item[3] for item in batch]
-        face_boxes = [item[4] for item in batch]
-        dataset_names = [item[5] for item in batch]
-
-        tensors = torch.stack(tensors)
-        labels = torch.tensor(labels)
-
-        return tensors, labels, paths, images, face_boxes, dataset_names
-
-    # Create dataloader
-    dataloader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=custom_collate)
-
-    # Extract the batch
-    for batch in dataloader:
-        input_tensor, label, img_paths, original_images, face_boxes, dataset_names = batch
-        original_image = original_images[0]
-        face_box = face_boxes[0]
-
-        # Move tensors and model to device
-        input_tensor = input_tensor.to(device)
-        model = model.to(device)
-
-        try:
-            # Create GradCAM extractor
-            target_layer = get_target_layer_clip(model)
-            cam_extractor = GradCAM(model, target_layer)
-
-            # Generate CAM
-            cam = cam_extractor.generate(input_tensor, pred_class)
-
-            # Create visualizations
-            overlay = overlay_cam_on_image(original_image, cam, face_box)
-            comparison = save_comparison(original_image, cam, overlay, face_box)
-
-            # Return results
-            return cam, overlay, comparison, face_box
-
-        except Exception as e:
-            st.error(f"Error
-            default_cam = np.ones((14, 14), dtype=np.float32) * 0.5
-            overlay = overlay_cam_on_image(original_image, default_cam, face_box)
-            comparison = save_comparison(original_image, default_cam, overlay, face_box)
-            return default_cam, overlay, comparison, face_box
+# Function to process image with Xception GradCAM
+def process_image_with_xception_gradcam(image, model, device, pred_class):
+    """Process an image with Xception GradCAM"""
+    cam_results = generate_smoothgrad_visualizations_xception(
+        model=model,
+        image=image,
+        target_class=pred_class,
+        face_only=True,
+        num_samples=5  # Can be adjusted
+    )
+
+    if cam_results and len(cam_results) == 4:
+        raw_cam, cam_img, overlay, comparison = cam_results
+
+        # Extract the face box from the dataset if needed
+        transform = get_xception_transform()
+        dataset = ImageDataset(image, transform=transform, face_only=True)
+        _, _, _, _, face_box, _ = dataset[0]
+
+        return raw_cam, overlay, comparison, face_box
+    else:
+        st.error("Failed to generate GradCAM visualization")
+        return None, None, None, None
+
+# ----- Xception Model Loading -----
+@st.cache_resource
+def load_detection_model_xception():
+    """Loads the Xception model from our module"""
+    with st.spinner("Loading Xception model for deepfake detection..."):
+        try:
+            model = load_xception_model()
+            # Get the device
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            model.to(device)
+            model.eval()
+            return model, device
+        except Exception as e:
+            st.error(f"Error loading Xception model: {str(e)}")
+            return None, None
 
 # ----- BLIP Image Captioning -----
 
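A note on the removed CLIP GradCAM path: the hard-coded 14×14 grid comes from ViT patch geometry. A 224×224 input split into 16×16 patches gives (224 / 16)² = 196 patch tokens, plus one class token, hence the seq_len >= 197 check. Below is a small sketch of that token-to-grid step on dummy activations (the hidden size and patch size are assumptions that match ViT-B/16-style backbones, not every CLIP variant):

```python
import numpy as np

# Dummy encoder output shaped [batch, tokens, hidden]: 1 class token + 196 patch tokens.
activations = np.random.rand(1, 197, 768).astype(np.float32)

patch_tokens = activations[0, 1:, :]                   # drop the class token -> (196, 768)
token_importance = np.abs(patch_tokens).mean(axis=1)   # one score per patch -> (196,)
cam = token_importance.reshape(14, 14)                 # back onto the 14x14 patch grid
cam = np.maximum(cam, 0)
if cam.max() > 0:
    cam = cam / cam.max()                              # normalize to [0, 1]
print(cam.shape)  # (14, 14)
```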
@@ -624,12 +405,47 @@ def analyze_image_with_llm(image, gradcam_overlay, face_box, pred_label, confide
         st.error(f"Error during LLM analysis: {str(e)}")
         return f"Error analyzing image: {str(e)}"
 
+# Preprocess image for Xception
+def preprocess_image_xception(image):
+    """Preprocesses image for Xception model input and face detection."""
+    face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
+    image_np = np.array(image.convert('RGB'))  # Ensure RGB
+    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
+    faces = face_detector.detectMultiScale(gray, 1.1, 5)
+
+    face_img_for_transform = image  # Default to whole image
+    face_box_display = None  # For drawing on original image
+
+    if len(faces) == 0:
+        st.warning("No face detected, using whole image for prediction/CAM.")
+    else:
+        areas = [w * h for (x, y, w, h) in faces]
+        largest_idx = np.argmax(areas)
+        x, y, w, h = faces[largest_idx]
+        padding_x = int(w * 0.05)  # Use percentages as in gradcam_xception
+        padding_y = int(h * 0.05)
+        x1, y1 = max(0, x - padding_x), max(0, y - padding_y)
+        x2, y2 = min(image_np.shape[1], x + w + padding_x), min(image_np.shape[0], y + h + padding_y)
+
+        # Use the padded face region for the model transform
+        face_img_for_transform = Image.fromarray(image_np[y1:y2, x1:x2])
+        # Use the original detected box (without padding) for display rectangle
+        face_box_display = (x, y, w, h)
+
+    # Xception specific transform
+    transform = get_xception_transform()
+    # Apply transform to the selected region (face or whole image)
+    input_tensor = transform(face_img_for_transform).unsqueeze(0)
+
+    # Return tensor, original full image, and the display face box
+    return input_tensor, image, face_box_display
+
 # Main app
 def main():
     # Initialize session state variables
-    if '
-        st.session_state.
-        st.session_state.
+    if 'xception_model_loaded' not in st.session_state:
+        st.session_state.xception_model_loaded = False
+        st.session_state.xception_model = None
 
     if 'llm_model_loaded' not in st.session_state:
         st.session_state.llm_model_loaded = False
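In preprocess_image_xception above, detectMultiScale is called positionally, so the 1.1 and 5 are the image-pyramid scale factor and the minimum-neighbors threshold. The following standalone sketch spells out the same crop-the-largest-face-with-padding idea with those parameters named (the helper name and the silent fallback are assumptions for illustration; the app's version also emits a Streamlit warning):

```python
import cv2
import numpy as np
from PIL import Image

def crop_largest_face(pil_image, pad_frac=0.05):
    """Return (face crop with a small margin, detected box) or (whole image, None)."""
    detector = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
    rgb = np.array(pil_image.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    faces = detector.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    if len(faces) == 0:
        return pil_image, None                          # fall back to the whole image
    x, y, w, h = max(faces, key=lambda b: b[2] * b[3])  # keep the largest detection
    pad_x, pad_y = int(w * pad_frac), int(h * pad_frac)
    x1, y1 = max(0, x - pad_x), max(0, y - pad_y)
    x2, y2 = min(rgb.shape[1], x + w + pad_x), min(rgb.shape[0], y + h + pad_y)
    return Image.fromarray(rgb[y1:y2, x1:x2]), (x, y, w, h)
```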
@@ -652,21 +468,22 @@ def main():
         st.write("Please load the models using the buttons below:")
 
         # Button for loading models
+        xception_col, blip_col, llm_col = st.columns(3)
 
-        with
-            if not st.session_state.
-                if st.button("📥 Load
-                    # Load
-                    model =
+        with xception_col:
+            if not st.session_state.xception_model_loaded:
+                if st.button("📥 Load Xception Model for Detection", type="primary"):
+                    # Load Xception model
+                    model, device = load_detection_model_xception()
                     if model is not None:
-                        st.session_state.
-                        st.session_state.
-                        st.
+                        st.session_state.xception_model = model
+                        st.session_state.device = device
+                        st.session_state.xception_model_loaded = True
+                        st.success("✅ Xception model loaded successfully!")
                     else:
-                        st.error("❌ Failed to load
+                        st.error("❌ Failed to load Xception model.")
             else:
-                st.success("✅
+                st.success("✅ Xception model loaded and ready!")
 
         with blip_col:
             if not st.session_state.blip_model_loaded:
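The loader in this change is wrapped in @st.cache_resource while the buttons write into st.session_state, and the two serve different purposes: Streamlit reruns the whole script on every interaction, so cache_resource keeps the heavyweight model object alive across reruns, while session_state remembers per-session flags such as "already loaded". A toy sketch of that pattern (the dummy loader stands in for load_detection_model_xception and is not the app's actual code):

```python
import time
import streamlit as st

@st.cache_resource  # created once per process and reused across script reruns
def load_big_model():
    time.sleep(2)                  # stand-in for slow weight loading
    return {"name": "dummy-model"}

if st.button("Load model"):
    st.session_state.model = load_big_model()  # cached after the first call
    st.session_state.model_loaded = True       # per-session flag survives reruns

if st.session_state.get("model_loaded"):
    st.success(f"Model ready: {st.session_state.model['name']}")
```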
@@ -724,56 +541,56 @@ def main():
                             st.session_state.original_model
                         )
                         st.session_state.image_caption = caption
-
-                    # Store caption but don't display it yet
 
-                # Detect with
-                if st.session_state.
-                    with st.spinner("Analyzing image with
-                        # Preprocess image for
-                        transform = transforms.Compose([
-                            transforms.Resize((224, 224)),
-                            transforms.ToTensor(),
-                            transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
-                        ])
-
-                        tensor = tensor.unsqueeze(0)
-
-                        # Move model and tensor to device
-                        model = st.session_state.clip_model.to(device)
-                        tensor = tensor.to(device)
+                # Detect with Xception model if loaded
+                if st.session_state.xception_model_loaded:
+                    with st.spinner("Analyzing image with Xception model..."):
+                        # Preprocess image for Xception
+                        input_tensor, original_image, face_box = preprocess_image_xception(image)
+
+                        # Get device and model
+                        device = st.session_state.device
+                        model = st.session_state.xception_model
+
+                        # Move tensor to device
+                        input_tensor = input_tensor.to(device)
 
                         # Forward pass
                         with torch.no_grad():
-                            pred_label = "Fake" if pred_class == 1 else "Real"
+                            logits = model(input_tensor)
+                            probabilities = torch.softmax(logits, dim=1)[0]
+                            pred_class = torch.argmax(probabilities).item()
+                            confidence = probabilities[pred_class].item()
+                            pred_label = "Fake" if pred_class == 0 else "Real"  # Check class mapping
 
                         # Display results
                         with col2:
                             st.markdown("### Detection Result")
                             st.markdown(f"**Classification:** {pred_label} (Confidence: {confidence:.2%})")
+
+                            # Display face box on image if detected
+                            if face_box:
+                                img_to_show = original_image.copy()
+                                img_draw = np.array(img_to_show)
+                                x, y, w, h = face_box
+                                cv2.rectangle(img_draw, (x, y), (x + w, y + h), (0, 255, 0), 2)
+                                st.image(Image.fromarray(img_draw), caption="Detected Face", width=300)
 
                         # GradCAM visualization
                         st.subheader("GradCAM Visualization")
-                        cam, overlay, comparison, detected_face_box =
+                        cam, overlay, comparison, detected_face_box = process_image_with_xception_gradcam(
                             image, model, device, pred_class
                         )
 
+                        if comparison:
+                            # Display GradCAM results (controlled size)
+                            st.image(comparison, caption="Original | CAM | Overlay", width=700)
+
+                            # Save for later use
+                            st.session_state.comparison_image = comparison
 
                         # Generate caption for GradCAM overlay image if BLIP model is loaded
-                        if st.session_state.blip_model_loaded:
+                        if st.session_state.blip_model_loaded and overlay:
                             with st.spinner("Analyzing GradCAM visualization..."):
                                 gradcam_caption = generate_gradcam_caption(
                                     overlay,
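The new forward pass converts raw logits into probabilities before picking a label, and the "# Check class mapping" note flags that the Fake-at-index-0 assumption must match how the Xception checkpoint was trained. A tiny worked example of that post-processing (the logit values and the label order are illustrative assumptions):

```python
import torch

labels = ["Fake", "Real"]                        # assumed order; verify against the training labels
logits = torch.tensor([[2.3, 0.4]])              # example output for a single image
probabilities = torch.softmax(logits, dim=1)[0]  # roughly tensor([0.87, 0.13])
pred_class = torch.argmax(probabilities).item()  # 0
confidence = probabilities[pred_class].item()    # about 0.87
print(labels[pred_class], f"{confidence:.2%}")   # -> Fake 86.99% (approximately)
```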
@@ -781,8 +598,6 @@ def main():
                                     st.session_state.finetuned_model
                                 )
                                 st.session_state.gradcam_caption = gradcam_caption
-
-                                # Store caption but don't display it yet
 
                 # Save results in session state for LLM analysis
                 st.session_state.current_image = image
@@ -793,7 +608,7 @@ def main():
 
                 st.success("✅ Initial detection and GradCAM visualization complete!")
             else:
-                st.warning("⚠️ Please load the
+                st.warning("⚠️ Please load the Xception model first to perform initial detection.")
         except Exception as e:
             st.error(f"Error processing image: {str(e)}")
             import traceback
@@ -930,7 +745,7 @@ def main():
     st.markdown("---")
 
     # Add model version indicator in sidebar
-    st.sidebar.info("Using deepfake-explainer-2
+    st.sidebar.info("Using Xception + deepfake-explainer-2 models")
 
 if __name__ == "__main__":
     main()