File size: 947 Bytes
3a1da90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
## split audio clips
# Step 1: run training/partition_clips.py over the audio directory.
# The path given here as --output_dir is reused later as the --clips_tsv input
# of the latent-extraction step, so this step must succeed before that one runs.
PATH_TO_AUDIO_DIR=""         # dir to audio clips, e.g.: /home/to/audiocaps_wav
OUTPUT_PARTITION_FILE=""     # output partition file path, e.g.: /home/to/output/audiocaps-test-partition.tsv

# Fail fast while the placeholders above are still empty; otherwise the flags
# below would be handed empty values and the failure would surface much later.
: "${PATH_TO_AUDIO_DIR:?set PATH_TO_AUDIO_DIR to the directory containing the audio clips}"
: "${OUTPUT_PARTITION_FILE:?set OUTPUT_PARTITION_FILE to the partition file path to write}"

# Expansions are quoted so paths containing spaces or glob characters reach the
# program as a single argument. Stop here on failure, propagating the exit code.
python training/partition_clips.py \
    --data_dir "$PATH_TO_AUDIO_DIR" \
    --output_dir "$OUTPUT_PARTITION_FILE" || exit


## extract audio latents
# Step 2: run training/extract_audio_latents.py through torchrun as a single
# local process. It reuses PATH_TO_AUDIO_DIR and OUTPUT_PARTITION_FILE from the
# clip-splitting step as --data_dir and --clips_tsv.
export CUDA_VISIBLE_DEVICES=0   # expose only GPU 0 to the launched process

CAPTIONS_TSV=./sets/audiocaps-test.tsv     # captions tsv path, e.g.: /home/to/audiocaps-test.tsv
OUTPUT_LATENT_DIR=""     # output latent dir, e.g.: /home/to/output/audiocaps-test-latent
OUTPUT_NPZ_DIR=""        # output npz dir, e.g.: /home/to/output/audiocaps-test-npz

# Fail fast on any value that is still unset or empty, so no flag is handed an
# empty value or accidentally consumes the next flag as its argument.
: "${PATH_TO_AUDIO_DIR:?set PATH_TO_AUDIO_DIR to the directory containing the audio clips}"
: "${OUTPUT_PARTITION_FILE:?set OUTPUT_PARTITION_FILE to the partition file from the split step}"
: "${CAPTIONS_TSV:?set CAPTIONS_TSV to the captions tsv path}"
: "${OUTPUT_LATENT_DIR:?set OUTPUT_LATENT_DIR to the directory for extracted latents}"
: "${OUTPUT_NPZ_DIR:?set OUTPUT_NPZ_DIR to the directory for the npz output}"

# Every expansion is quoted so paths with spaces or glob characters stay intact.
torchrun --standalone --nproc_per_node=1 training/extract_audio_latents.py \
    --captions_tsv "$CAPTIONS_TSV" \
    --data_dir "$PATH_TO_AUDIO_DIR" \
    --clips_tsv "$OUTPUT_PARTITION_FILE" \
    --latent_dir "$OUTPUT_LATENT_DIR" \
    --output_dir "$OUTPUT_NPZ_DIR" \
    --text_encoder='t5_clap'  # ['clip', 't5', 't5_clap']