Spaces:
Runtime error
Runtime error
File size: 4,392 Bytes
a167ff0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import random
from datasets import load_dataset
import pandas as pd
import os
from pathlib import Path
import requests
from PIL import Image
from io import BytesIO
# Full "train" split of the battle-results dataset; random edits are drawn
# from this object further down.
dataset = load_dataset("taesiri/IERv2-BattleResults_exp", split="train")

# Distinct post ids found in the same split, obtained from a second load
# that requests only the "post_id" column.
dataset_post_ids = list(
    set(
        load_dataset(
            "taesiri/IERv2-BattleResults_exp", split="train", columns=["post_id"]
        )
        .to_pandas()["post_id"]
        .tolist()
    )
)

# Read the photoexp table from CSV; its post ids define which posts are
# eligible for sampling.
photoexp = pd.read_csv("./photoexp_filtered.csv")
valid_post_ids = set(photoexp["post_id"].tolist())

# Drop every dataset row whose post_id is not in valid_post_ids. The predicate
# runs on batches of 256 rows and returns one keep/drop flag per row.
dataset = dataset.filter(
    lambda batch: [post_id in valid_post_ids for post_id in batch["post_id"]],
    batch_size=256,
    batched=True,
)
def download_and_save_image(url, save_path, timeout=30):
    """Download an image from ``url`` and save it to ``save_path``.

    This is a best-effort helper: any failure (network error, HTTP error
    status, undecodable image data, write error) is printed and reported
    through the return value instead of being raised, so the caller can move
    on to another sample.

    Args:
        url: Address of the image to fetch.
        save_path: Destination path; the output format follows its extension.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled host would block the whole script indefinitely.

    Returns:
        True if the image was fetched and written, False otherwise.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))

        # JPEG cannot store alpha or palette data, so e.g. an RGBA/P PNG
        # saved under a .jpg name would fail in img.save(). Convert anything
        # the JPEG writer does not accept to plain RGB first.
        is_jpeg_target = str(save_path).lower().endswith((".jpg", ".jpeg"))
        jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
        if is_jpeg_target and img.mode not in jpeg_modes:
            img = img.convert("RGB")

        img.save(save_path)
        return True
    except Exception as e:
        # Deliberately broad: a single bad image must not abort extraction.
        print(f"Error downloading image {url}: {e}")
        return False
def get_random_sample():
    """Build one random comparison sample for a randomly chosen post.

    A post id is drawn from ``valid_post_ids``. One random edit of that post
    is taken from ``dataset`` and paired with one random row for the same
    post from ``photoexp``.

    Returns:
        A dict with the keys ``post_id``, ``instruction``,
        ``simplified_instruction``, ``source_image``, ``edit1_image``,
        ``edit1_model``, ``edit2_image`` and ``edit2_model``, or None when the
        chosen post has no row in ``photoexp`` or no edit in ``dataset``.
    """
    random_post_id = random.choice(list(valid_post_ids))

    # Look in the in-memory table first: it is cheap, and an early exit here
    # avoids filtering the whole dataset for a post that cannot be paired.
    matching_photoexp_entries = photoexp[photoexp.post_id == random_post_id]
    if matching_photoexp_entries.empty:
        return None

    # Keep only the dataset rows that belong to the chosen post.
    post_edits = dataset.filter(
        lambda xs: [x == random_post_id for x in xs["post_id"]],
        batched=True,
        batch_size=256,
    )

    # A post id taken from photoexp is not guaranteed to occur in the dataset.
    # With zero rows the index draw below would raise ValueError, so report
    # "no sample" and let the caller try another post.
    if len(post_edits) == 0:
        return None

    sample = post_edits[random.randrange(len(post_edits))]

    random_photoexp_entry = matching_photoexp_entries.sample(n=1).iloc[0]
    additional_edited_image = random_photoexp_entry["edited_image"]

    # Series.get() yields None only when the column is absent; a column that
    # exists but has an empty CSV cell comes back as NaN. Treat both as
    # "no model recorded" and fall back to a label built from the comment id.
    model_b = random_photoexp_entry.get("model")
    if model_b is None or pd.isna(model_b):
        model_b = f"REDDIT_{random_photoexp_entry['comment_id']}"

    return {
        "post_id": sample["post_id"],
        "instruction": sample["instruction"],
        "simplified_instruction": sample["simplified_instruction"],
        "source_image": sample["source_image"],
        "edit1_image": sample["edited_image"],
        "edit1_model": sample["model"],
        "edit2_image": additional_edited_image,
        "edit2_model": model_b,
    }
def save_sample(sample, output_dir):
    """Write one sample (metadata file plus three images) to disk.

    The sample is stored under ``output_dir/<post_id>/`` as ``metadata.txt``,
    ``source.jpg``, ``edit1.jpg`` and ``edit2.jpg``.

    Args:
        sample: Dict as produced by ``get_random_sample``, or None.
        output_dir: Root directory under which the per-post folder is created.

    Returns:
        True only if all three images were downloaded and saved; False if
        ``sample`` is None or any download failed.
    """
    if sample is None:
        return False

    sample_dir = Path(output_dir) / str(sample["post_id"])
    sample_dir.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8: the instruction text may contain non-ASCII characters
    # that the platform's default encoding cannot represent, which would make
    # the write raise UnicodeEncodeError.
    with open(sample_dir / "metadata.txt", "w", encoding="utf-8") as f:
        f.writelines(
            [
                f"Post ID: {sample['post_id']}\n",
                f"Original Instruction: {sample['instruction']}\n",
                f"Simplified Instruction: {sample['simplified_instruction']}\n",
                f"Edit 1 Model: {sample['edit1_model']}\n",
                f"Edit 2 Model: {sample['edit2_model']}\n",
            ]
        )

    image_targets = (
        ("source_image", "source.jpg"),
        ("edit1_image", "edit1.jpg"),
        ("edit2_image", "edit2.jpg"),
    )
    success = True
    for key, filename in image_targets:
        # No short-circuit: every image is attempted even after a failure.
        success &= download_and_save_image(sample[key], sample_dir / filename)
    return success
def main(num_samples=100, output_dir="extracted_samples", max_attempts=None):
    """Extract random samples to disk until enough distinct posts are saved.

    Args:
        num_samples: Number of samples (distinct posts) to extract.
        output_dir: Directory that receives one sub-directory per post.
        max_attempts: Upper bound on the number of draws. Defaults to
            ``20 * num_samples``. The bound guarantees termination when
            samples keep failing (for example, with no network access) or when
            fewer than ``num_samples`` posts can be saved at all.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if max_attempts is None:
        max_attempts = 20 * num_samples

    # Samples are stored in a folder named after the post id, so drawing the
    # same post twice would overwrite the earlier sample while still being
    # counted. Track saved ids so the final count matches what is on disk.
    saved_post_ids = set()
    attempts = 0

    print(f"Extracting {num_samples} samples...")
    while len(saved_post_ids) < num_samples and attempts < max_attempts:
        attempts += 1
        sample = get_random_sample()

        if sample is not None and sample["post_id"] in saved_post_ids:
            print("Post already extracted, trying next...")
            continue

        if sample and save_sample(sample, output_dir):
            saved_post_ids.add(sample["post_id"])
            print(
                f"Successfully saved sample {len(saved_post_ids)}/{num_samples}"
            )
        else:
            print("Failed to save sample, trying next...")

    successful_samples = len(saved_post_ids)
    if successful_samples < num_samples:
        print(
            f"Stopped after {attempts} attempts with "
            f"{successful_samples}/{num_samples} samples"
        )
    print(f"Successfully extracted {successful_samples} samples to {output_dir}")


if __name__ == "__main__":
    main()
|