MeanAudio / scripts /extract_audio_latents.sh
junxiliu's picture
add needed model with proper LFS tracking
3a1da90
raw
history blame
947 Bytes
## split audio clips
# Runs training/partition_clips.py over the clips in PATH_TO_AUDIO_DIR and
# passes OUTPUT_PARTITION_FILE as its --output_dir argument.
# Both values below are placeholders and must be filled in before running.
PATH_TO_AUDIO_DIR= # dir to audio clips e.g.: /home/to/audiocaps_wav
OUTPUT_PARTITION_FILE= # output tsv path, e.g.: /home/to/output/audiocaps-test-partition.tsv

# Fail fast with a readable message when a placeholder was left empty.
# Otherwise the empty expansion disappears from the command line and the
# following flag gets consumed as the value of the preceding one.
: "${PATH_TO_AUDIO_DIR:?set PATH_TO_AUDIO_DIR to the directory containing the audio clips}"
: "${OUTPUT_PARTITION_FILE:?set OUTPUT_PARTITION_FILE to the output partition tsv path}"

# Expansions are quoted so paths containing spaces or glob characters are
# passed through as a single argument each.
if ! python training/partition_clips.py \
  --data_dir "$PATH_TO_AUDIO_DIR" \
  --output_dir "$OUTPUT_PARTITION_FILE"; then
  # The partition file is consumed by later steps of this script, so do not
  # continue when it could not be produced.
  printf 'error: training/partition_clips.py failed\n' >&2
  exit 1
fi
## extract audio latents
# Launches training/extract_audio_latents.py through torchrun as a single
# standalone process (--nproc_per_node=1).
# Reuses PATH_TO_AUDIO_DIR and OUTPUT_PARTITION_FILE set earlier in this
# script; the two OUTPUT_* values below are placeholders to fill in.
export CUDA_VISIBLE_DEVICES=0 # expose only device index 0 to CUDA
CAPTIONS_TSV=./sets/audiocaps-test.tsv # captions tsv path, e.g.: /home/to/audiocaps-test.tsv
OUTPUT_LATENT_DIR= # output latent dir, e.g.: /home/to/output/audiocaps-test-latent
OUTPUT_NPZ_DIR= # output npz dir, e.g.: /home/to/output/audiocaps-test-npz

# Fail fast with a readable message when a required value is empty or unset.
# Otherwise the empty expansion disappears from the command line and the
# following flag gets consumed as the value of the preceding one.
: "${CAPTIONS_TSV:?set CAPTIONS_TSV to the captions tsv path}"
: "${PATH_TO_AUDIO_DIR:?set PATH_TO_AUDIO_DIR to the directory containing the audio clips}"
: "${OUTPUT_PARTITION_FILE:?set OUTPUT_PARTITION_FILE to the partition tsv path}"
: "${OUTPUT_LATENT_DIR:?set OUTPUT_LATENT_DIR to the output latent directory}"
: "${OUTPUT_NPZ_DIR:?set OUTPUT_NPZ_DIR to the output npz directory}"

# Expansions are quoted so paths containing spaces or glob characters are
# passed through as a single argument each.
torchrun --standalone --nproc_per_node=1 training/extract_audio_latents.py \
  --captions_tsv "$CAPTIONS_TSV" \
  --data_dir "$PATH_TO_AUDIO_DIR" \
  --clips_tsv "$OUTPUT_PARTITION_FILE" \
  --latent_dir "$OUTPUT_LATENT_DIR" \
  --output_dir "$OUTPUT_NPZ_DIR" \
  --text_encoder='t5_clap' # ['clip', 't5', 't5_clap']