#!/usr/bin/env bash
#
# Two-step data preparation:
#   1. split the audio clips into a partition file (training/partition_clips.py)
#   2. extract audio latents for a caption set (training/extract_audio_latents.py)
#
# Fill in the empty defaults below, or supply the same variable names through
# the environment before running this script.

# Stop at the first failing command so step 2 never runs on a failed step 1,
# and treat any unset variable as an error.
set -euo pipefail

## split audio clips
PATH_TO_AUDIO_DIR="${PATH_TO_AUDIO_DIR:-}"          # dir to audio clips, e.g.: /home/to/audiocaps_wav
OUTPUT_PARTITION_FILE="${OUTPUT_PARTITION_FILE:-}"  # output partition tsv path, e.g.: /home/to/output/audiocaps-test-partition.tsv

# Fail fast with a clear message: an empty value would otherwise leave its
# command-line flag with a blank argument.
: "${PATH_TO_AUDIO_DIR:?set PATH_TO_AUDIO_DIR to the directory of audio clips}"
: "${OUTPUT_PARTITION_FILE:?set OUTPUT_PARTITION_FILE to the output partition tsv path}"

# NOTE(review): the flag is named --output_dir but is given a file path here,
# as in the original invocation; confirm against partition_clips.py.
python training/partition_clips.py \
  --data_dir "$PATH_TO_AUDIO_DIR" \
  --output_dir "$OUTPUT_PARTITION_FILE"

## extract audio latents
export CUDA_VISIBLE_DEVICES=0
CAPTIONS_TSV="${CAPTIONS_TSV:-./sets/audiocaps-test.tsv}"  # captions tsv path, e.g.: /home/to/audiocaps-test.tsv
OUTPUT_LATENT_DIR="${OUTPUT_LATENT_DIR:-}"                 # output latent dir, e.g.: /home/to/output/audiocaps-test-latent
OUTPUT_NPZ_DIR="${OUTPUT_NPZ_DIR:-}"                       # output npz dir, e.g.: /home/to/output/audiocaps-test-npz

: "${OUTPUT_LATENT_DIR:?set OUTPUT_LATENT_DIR to the output latent directory}"
: "${OUTPUT_NPZ_DIR:?set OUTPUT_NPZ_DIR to the output npz directory}"

# --text_encoder choices: ['clip', 't5', 't5_clap']
torchrun --standalone --nproc_per_node=1 training/extract_audio_latents.py \
  --captions_tsv "$CAPTIONS_TSV" \
  --data_dir "$PATH_TO_AUDIO_DIR" \
  --clips_tsv "$OUTPUT_PARTITION_FILE" \
  --latent_dir "$OUTPUT_LATENT_DIR" \
  --output_dir "$OUTPUT_NPZ_DIR" \
  --text_encoder='t5_clap'