Spaces:
Sleeping
Sleeping
#!/usr/bin/env python | |
# coding: utf-8 | |
# # Visualize data | |
# In[1]: | |
import os | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import cv2 | |
from pathlib import Path | |
from collections import defaultdict | |
# In[2]: | |
data_dir = "malaria_data/cell_images" | |
parasitized_dir = os.path.join(data_dir, 'Parasitized') | |
uninfected_dir = os.path.join(data_dir, 'Uninfected') | |
parasitized_files = list(Path(parasitized_dir).glob('*.png')) | |
uninfected_files = list(Path(uninfected_dir).glob('*.png')) | |
print(f"Parasitized Images: {len(parasitized_files)}") | |
print(f"Uninfected Images: {len(uninfected_files)}") | |
# In[3]: | |
labels = ['Parasitized', 'Uninfected'] | |
counts = [len(parasitized_files), len(uninfected_files)] | |
plt.figure(figsize=(6, 4)) | |
plt.bar(labels, counts, color=['#ff7f0e', '#1f77b4']) | |
plt.title("Class Distribution") | |
plt.ylabel("Number of Images") | |
plt.show() | |
# In[4]: | |
def plot_samples(image_files, title, num_samples=5): | |
plt.figure(figsize=(15, 3)) | |
for i in range(num_samples): | |
img = cv2.imread(str(image_files[i])) | |
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) | |
plt.subplot(1, num_samples, i+1) | |
plt.imshow(img) | |
plt.axis("off") | |
plt.suptitle(title) | |
plt.show() | |
plot_samples(parasitized_files, "Parasitized Cells") | |
plot_samples(uninfected_files, "Uninfected Cells") | |
# In[5]: | |
def get_image_sizes(file_list): | |
sizes = [] | |
for f in file_list: | |
img = cv2.imread(str(f)) | |
sizes.append(img.shape[:2]) # height, width | |
return sizes | |
parasitized_sizes = get_image_sizes(parasitized_files) | |
uninfected_sizes = get_image_sizes(uninfected_files) | |
all_sizes = parasitized_sizes + uninfected_sizes | |
unique_sizes = set(all_sizes) | |
print("Unique image sizes found:") | |
print(unique_sizes) | |
# In[6]: | |
total_images = len(parasitized_files) + len(uninfected_files) | |
avg_height = np.mean([size[0] for size in all_sizes]) | |
avg_width = np.mean([size[1] for size in all_sizes]) | |
print(f"\nTotal Images: {total_images}") | |
print(f"Average Image Size: {avg_width:.0f}x{avg_height:.0f}") | |
print(f"Min/Max Height: {min(s[0] for s in all_sizes)} / {max(s[0] for s in all_sizes)}") | |
print(f"Min/Max Width: {min(s[1] for s in all_sizes)} / {max(s[1] for s in all_sizes)}") | |
# In[7]: | |
sample_img = cv2.imread(str(parasitized_files[5])) | |
print("Image shape:", sample_img.shape) | |
# # Data preprocessing | |
# In[8]: | |
import matplotlib.pyplot as plt | |
import numpy as np | |
# Assuming you have your image data in a numpy array called 'image_data' | |
# For a single image: | |
plt.figure(figsize=(10, 6)) | |
plt.hist(sample_img.ravel(), bins=256, range=(0, 256), color='blue', alpha=0.7) | |
plt.title('Pixel Value Distribution') | |
plt.xlabel('Pixel Intensity') | |
plt.ylabel('Frequency') | |
plt.grid(True, linestyle='--', alpha=0.5) | |
plt.show() | |
# # Data Splitting | |
# In[20]: | |
import os | |
import shutil | |
from pathlib import Path | |
import random | |
from sklearn.model_selection import train_test_split | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import cv2 | |
import torch | |
from torchvision import datasets, transforms | |
from torch.utils.data import DataLoader | |
# In[21]: | |
RAW_DATA_DIR = 'malaria_data/cell_images' | |
OUTPUT_DIR = 'malaria_ds/split_dataset' | |
PARASITIZED_DIR = os.path.join(RAW_DATA_DIR, 'Parasitized') | |
UNINFECTED_DIR = os.path.join(RAW_DATA_DIR, 'Uninfected') | |
# Output directories | |
TRAIN_DIR = os.path.join(OUTPUT_DIR, 'train') | |
VAL_DIR = os.path.join(OUTPUT_DIR, 'validation') | |
TEST_DIR = os.path.join(OUTPUT_DIR, 'test') | |
# Ensure output directories exist | |
os.makedirs(PARASITIZED_DIR, exist_ok=True) | |
os.makedirs(UNINFECTED_DIR, exist_ok=True) | |
print("Paths defined.") | |
# In[22]: | |
def split_class_files(class_dir, train_dir, val_dir, test_dir): | |
all_files = list(Path(class_dir).glob('*.*')) | |
train_files, test_files = train_test_split(all_files, test_size=0.1, random_state=42) | |
train_files, val_files = train_test_split(train_files, test_size=0.1 / (1 - 0.1), random_state=42) | |
for f in train_files: | |
shutil.copy(f, train_dir) | |
for f in val_files: | |
shutil.copy(f, val_dir) | |
for f in test_files: | |
shutil.copy(f, test_dir) | |
return len(all_files) | |
def create_split_folders(): | |
class_names = ['Parasitized', 'Uninfected'] | |
for folder in ['train', 'validation', 'test']: | |
for cls in class_names: | |
os.makedirs(os.path.join(OUTPUT_DIR, folder, cls), exist_ok=True) | |
print("Splitting Parasitized Images:") | |
total_parasitized = split_class_files( | |
os.path.join(RAW_DATA_DIR, 'Parasitized'), | |
os.path.join(OUTPUT_DIR, 'train', 'Parasitized'), | |
os.path.join(OUTPUT_DIR, 'validation', 'Parasitized'), | |
os.path.join(OUTPUT_DIR, 'test', 'Parasitized') | |
) | |
print("\nSplitting Uninfected Images:") | |
total_uninfected = split_class_files( | |
os.path.join(RAW_DATA_DIR, 'Uninfected'), | |
os.path.join(OUTPUT_DIR, 'train', 'Uninfected'), | |
os.path.join(OUTPUT_DIR, 'validation', 'Uninfected'), | |
os.path.join(OUTPUT_DIR, 'test', 'Uninfected') | |
) | |
print(f"\nTotal Parasitized: {total_parasitized}, Uninfected: {total_uninfected}") | |
print("Dataset split completed.") | |
# ## Data Aug and transforms | |
# In[23]: | |
IMG_SIZE = (128, 128) | |
BATCH_SIZE = 32 | |
# Custom class_to_idx mapping to fix label order | |
class_to_idx = {'Uninfected': 0, 'Parasitized': 1} | |
idx_to_class = {v: k for k, v in class_to_idx.items()} | |
# Define transforms | |
train_transforms = transforms.Compose([ | |
transforms.Resize(IMG_SIZE), | |
transforms.ToTensor(), | |
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), | |
transforms.RandomRotation(20), | |
transforms.RandomHorizontalFlip(), | |
]) | |
val_test_transforms = transforms.Compose([ | |
transforms.Resize(IMG_SIZE), | |
transforms.ToTensor(), | |
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), | |
]) | |
# Custom Dataset Class to enforce class_to_idx | |
class CustomImageFolder(datasets.ImageFolder): | |
def __init__(self, root, transform, class_to_idx_override=None): | |
super().__init__(root=root, transform=transform) | |
if class_to_idx_override: | |
self.class_to_idx = class_to_idx_override | |
self.samples = [ | |
(path, class_to_idx[cls]) | |
for path, cls_idx in self.samples | |
for cls in [self.classes[cls_idx]] | |
if cls in class_to_idx_override | |
] | |
self.classes = list(class_to_idx_override.keys()) | |
# In[24]: | |
def get_dataloaders(): | |
# Create datasets | |
train_dataset = CustomImageFolder(root=os.path.join(OUTPUT_DIR, 'train'), transform=train_transforms, class_to_idx_override=class_to_idx) | |
val_dataset = CustomImageFolder(root=os.path.join(OUTPUT_DIR, 'validation'), transform=val_test_transforms, class_to_idx_override=class_to_idx) | |
test_dataset = CustomImageFolder(root=os.path.join(OUTPUT_DIR, 'test'), transform=val_test_transforms, class_to_idx_override=class_to_idx) | |
# Create data loaders | |
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True) | |
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False) | |
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False) | |
print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}") | |
print("Class Mapping:", train_dataset.class_to_idx) | |
return train_loader, val_loader, test_loader, train_dataset, val_dataset, test_dataset | |
# In[26]: | |
def show_batch_sample(loader, dataset): | |
images, labels = next(iter(loader)) | |
plt.figure(figsize=(12, 6)) | |
for i in range(min(6, BATCH_SIZE)): | |
img = images[i].numpy().transpose((1, 2, 0)) | |
img = np.clip(img * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406]), 0, 1) | |
plt.subplot(2, 3, i+1) | |
plt.imshow(img) | |
plt.title(idx_to_class[labels[i].item()]) | |
plt.axis("off") | |
plt.suptitle("Sample Batch from DataLoader") | |
plt.show() | |
# In[32]: | |
create_split_folders() | |
train_loader, val_loader, test_loader, train_dataset, val_dataset, test_dataset = get_dataloaders() | |
show_batch_sample(train_loader, train_dataset) | |
# In[34]: | |
print(train_dataset) | |