"""Load and concatenate the train splits of the taiko-2023 1.1-1.7 datasets.

Exposes ``ds``: the concatenated dataset in "torch" format with the known
bad rows removed.
"""

from datasets import load_dataset, concatenate_datasets

# Positions (in the concatenated dataset) of rows to drop.
# 1079 has a file problem.
BAD_INDICES = frozenset({1079})

# Load the seven releases in order; the names ds1..ds7 stay bound at module
# level so existing importers keep working.
ds1, ds2, ds3, ds4, ds5, ds6, ds7 = (
    load_dataset(f"JacobLinCool/taiko-2023-1.{minor}", split="train")
    for minor in range(1, 8)
)

ds = concatenate_datasets([ds1, ds2, ds3, ds4, ds5, ds6, ds7]).with_format("torch")

# Keep every row index except the known-bad ones.
good = [index for index in range(len(ds)) if index not in BAD_INDICES]
ds = ds.select(good)

# For local testing:
# ds = (
#     load_dataset("JacobLinCool/taiko-2023-1.6", split="train")
#     .with_format("torch")
#     .select(range(10))
# )