#!/usr/bin/env python
# coding: utf-8
# # Visualize data
# In[1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import cv2
from pathlib import Path
from collections import defaultdict
# In[2]:
data_dir = "malaria_data/cell_images"
parasitized_dir = os.path.join(data_dir, 'Parasitized')
uninfected_dir = os.path.join(data_dir, 'Uninfected')
parasitized_files = list(Path(parasitized_dir).glob('*.png'))
uninfected_files = list(Path(uninfected_dir).glob('*.png'))
print(f"Parasitized Images: {len(parasitized_files)}")
print(f"Uninfected Images: {len(uninfected_files)}")
# In[3]:
labels = ['Parasitized', 'Uninfected']
counts = [len(parasitized_files), len(uninfected_files)]
plt.figure(figsize=(6, 4))
plt.bar(labels, counts, color=['#ff7f0e', '#1f77b4'])
plt.title("Class Distribution")
plt.ylabel("Number of Images")
plt.show()
# In[4]:
def plot_samples(image_files, title, num_samples=5):
    plt.figure(figsize=(15, 3))
    for i in range(num_samples):
        img = cv2.imread(str(image_files[i]))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; convert for matplotlib
        plt.subplot(1, num_samples, i+1)
        plt.imshow(img)
        plt.axis("off")
    plt.suptitle(title)
    plt.show()
plot_samples(parasitized_files, "Parasitized Cells")
plot_samples(uninfected_files, "Uninfected Cells")
# In[5]:
def get_image_sizes(file_list):
    sizes = []
    for f in file_list:
        img = cv2.imread(str(f))
        sizes.append(img.shape[:2])  # (height, width)
    return sizes
parasitized_sizes = get_image_sizes(parasitized_files)
uninfected_sizes = get_image_sizes(uninfected_files)
all_sizes = parasitized_sizes + uninfected_sizes
unique_sizes = set(all_sizes)
print("Unique image sizes found:")
print(unique_sizes)
# In[6]:
total_images = len(parasitized_files) + len(uninfected_files)
avg_height = np.mean([size[0] for size in all_sizes])
avg_width = np.mean([size[1] for size in all_sizes])
print(f"\nTotal Images: {total_images}")
print(f"Average Image Size: {avg_width:.0f}x{avg_height:.0f}")
print(f"Min/Max Height: {min(s[0] for s in all_sizes)} / {max(s[0] for s in all_sizes)}")
print(f"Min/Max Width: {min(s[1] for s in all_sizes)} / {max(s[1] for s in all_sizes)}")
# In[7]:
sample_img = cv2.imread(str(parasitized_files[5]))
print("Image shape:", sample_img.shape)
# # Data preprocessing
# In[8]:
# Pixel intensity histogram for the sample image loaded above
plt.figure(figsize=(10, 6))
plt.hist(sample_img.ravel(), bins=256, range=(0, 256), color='blue', alpha=0.7)
plt.title('Pixel Value Distribution')
plt.xlabel('Pixel Intensity')
plt.ylabel('Frequency')
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()
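# In[ ]:
# The histogram above pools all three colour channels together. As an optional
# follow-up (a small sketch, assuming `sample_img` is still the BGR array read
# by cv2 earlier), per-channel histograms show how the colour distributions differ.
plt.figure(figsize=(10, 6))
for channel_idx, (name, color) in enumerate(zip(['Blue', 'Green', 'Red'], ['blue', 'green', 'red'])):
    # OpenCV stores images as BGR, so channel 0 is blue
    plt.hist(sample_img[:, :, channel_idx].ravel(), bins=256, range=(0, 256),
             color=color, alpha=0.5, label=name)
plt.title('Per-Channel Pixel Value Distribution')
plt.xlabel('Pixel Intensity')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()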
# # Data Splitting
# In[20]:
import os
import shutil
from pathlib import Path
import random
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import cv2
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# In[21]:
RAW_DATA_DIR = 'malaria_data/cell_images'
OUTPUT_DIR = 'malaria_ds/split_dataset'
PARASITIZED_DIR = os.path.join(RAW_DATA_DIR, 'Parasitized')
UNINFECTED_DIR = os.path.join(RAW_DATA_DIR, 'Uninfected')
# Output directories
TRAIN_DIR = os.path.join(OUTPUT_DIR, 'train')
VAL_DIR = os.path.join(OUTPUT_DIR, 'validation')
TEST_DIR = os.path.join(OUTPUT_DIR, 'test')
# Ensure output directories exist (class subfolders are created in create_split_folders below)
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(VAL_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)
print("Paths defined.")
# In[22]:
def split_class_files(class_dir, train_dir, val_dir, test_dir):
    all_files = list(Path(class_dir).glob('*.*'))
    # First hold out 10% of the files for the test set, then carve 10% of the
    # original total out of the remainder for validation (0.1 / 0.9 of what is
    # left), giving roughly an 80/10/10 split.
    train_files, test_files = train_test_split(all_files, test_size=0.1, random_state=42)
    train_files, val_files = train_test_split(train_files, test_size=0.1 / (1 - 0.1), random_state=42)
    for f in train_files:
        shutil.copy(f, train_dir)
    for f in val_files:
        shutil.copy(f, val_dir)
    for f in test_files:
        shutil.copy(f, test_dir)
    return len(all_files)


def create_split_folders():
    class_names = ['Parasitized', 'Uninfected']
    for folder in ['train', 'validation', 'test']:
        for cls in class_names:
            os.makedirs(os.path.join(OUTPUT_DIR, folder, cls), exist_ok=True)

    print("Splitting Parasitized Images:")
    total_parasitized = split_class_files(
        os.path.join(RAW_DATA_DIR, 'Parasitized'),
        os.path.join(OUTPUT_DIR, 'train', 'Parasitized'),
        os.path.join(OUTPUT_DIR, 'validation', 'Parasitized'),
        os.path.join(OUTPUT_DIR, 'test', 'Parasitized')
    )

    print("\nSplitting Uninfected Images:")
    total_uninfected = split_class_files(
        os.path.join(RAW_DATA_DIR, 'Uninfected'),
        os.path.join(OUTPUT_DIR, 'train', 'Uninfected'),
        os.path.join(OUTPUT_DIR, 'validation', 'Uninfected'),
        os.path.join(OUTPUT_DIR, 'test', 'Uninfected')
    )

    print(f"\nTotal Parasitized: {total_parasitized}, Uninfected: {total_uninfected}")
    print("Dataset split completed.")
# ## Data Augmentation and Transforms
# In[23]:
IMG_SIZE = (128, 128)
BATCH_SIZE = 32
# Custom class_to_idx mapping to fix label order: ImageFolder sorts classes
# alphabetically, which would assign Parasitized=0; we want Uninfected=0, Parasitized=1
class_to_idx = {'Uninfected': 0, 'Parasitized': 1}
idx_to_class = {v: k for k, v in class_to_idx.items()}
# Define transforms
train_transforms = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    # Apply the random augmentations before ToTensor/Normalize so they operate
    # on the PIL image and rotation padding stays plain black
    transforms.RandomRotation(20),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
val_test_transforms = transforms.Compose([
transforms.Resize(IMG_SIZE),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
# Custom Dataset class to enforce our class_to_idx mapping
class CustomImageFolder(datasets.ImageFolder):
    def __init__(self, root, transform, class_to_idx_override=None):
        super().__init__(root=root, transform=transform)
        if class_to_idx_override:
            # Relabel every sample using the overridden mapping (and drop any
            # class that is not listed in it)
            self.samples = [
                (path, class_to_idx_override[self.classes[cls_idx]])
                for path, cls_idx in self.samples
                if self.classes[cls_idx] in class_to_idx_override
            ]
            self.targets = [label for _, label in self.samples]
            self.class_to_idx = class_to_idx_override
            self.classes = list(class_to_idx_override.keys())
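# In[ ]:
# Quick sanity check on the transforms (a sketch, assuming `parasitized_files`
# from the visualization cells is still in scope): applying the validation
# transform to one image should yield a normalized float tensor of shape [3, 128, 128].
from PIL import Image

sample_pil = Image.open(parasitized_files[0]).convert('RGB')
sample_tensor = val_test_transforms(sample_pil)
print("Transformed shape:", sample_tensor.shape)   # expected: torch.Size([3, 128, 128])
print("Value range:", sample_tensor.min().item(), "to", sample_tensor.max().item())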
# In[24]:
def get_dataloaders():
    # Create datasets
    train_dataset = CustomImageFolder(root=os.path.join(OUTPUT_DIR, 'train'), transform=train_transforms, class_to_idx_override=class_to_idx)
    val_dataset = CustomImageFolder(root=os.path.join(OUTPUT_DIR, 'validation'), transform=val_test_transforms, class_to_idx_override=class_to_idx)
    test_dataset = CustomImageFolder(root=os.path.join(OUTPUT_DIR, 'test'), transform=val_test_transforms, class_to_idx_override=class_to_idx)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")
    print("Class Mapping:", train_dataset.class_to_idx)

    return train_loader, val_loader, test_loader, train_dataset, val_dataset, test_dataset
# In[26]:
def show_batch_sample(loader, dataset):
    images, labels = next(iter(loader))
    plt.figure(figsize=(12, 6))
    for i in range(min(6, BATCH_SIZE)):
        img = images[i].numpy().transpose((1, 2, 0))
        # Undo the ImageNet normalization so the image displays correctly
        img = np.clip(img * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406]), 0, 1)
        plt.subplot(2, 3, i+1)
        plt.imshow(img)
        plt.title(idx_to_class[labels[i].item()])
        plt.axis("off")
    plt.suptitle("Sample Batch from DataLoader")
    plt.show()
# In[32]:
create_split_folders()
train_loader, val_loader, test_loader, train_dataset, val_dataset, test_dataset = get_dataloaders()
show_batch_sample(train_loader, train_dataset)
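# In[ ]:
# Hedged sanity check on the split that was just written to disk: count the
# copied files per split/class folder. The proportions should be roughly
# 80/10/10, matching the train_test_split arguments in split_class_files above.
for split in ['train', 'validation', 'test']:
    for cls in ['Parasitized', 'Uninfected']:
        n_files = len(list(Path(os.path.join(OUTPUT_DIR, split, cls)).glob('*.*')))
        print(f"{split}/{cls}: {n_files} images")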
# In[34]:
print(train_dataset)