File size: 4,263 Bytes
9580089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import requests
import tarfile
import zipfile
import shutil
from pathlib import Path
from tqdm import tqdm
import subprocess

def download_file(url: str, target_path: str, timeout: float = 30.0,
                  chunk_size: int = 1024 * 1024) -> None:
    """Download ``url`` to ``target_path`` while showing a progress bar.

    The response body is streamed into ``<target_path>.part`` and renamed to
    ``target_path`` only after the transfer completed. A failed or interrupted
    download therefore never leaves a truncated file under the final name,
    which matters for callers that skip the download when the file exists.

    Args:
        url: Address of the file to fetch.
        target_path: Path the downloaded file is stored at.
        timeout: Seconds to wait for the connection or for the next bytes
            before giving up.
        chunk_size: Number of bytes requested per read from the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the file cannot be written or renamed.
    """
    partial_path = f"{target_path}.part"

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail early instead of saving an HTML error page as the archive.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            with open(partial_path, 'wb') as file, tqdm(
                desc="Downloading",
                # An unknown size is reported to tqdm as None, not as 0 bytes.
                total=total_size or None,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for data in response.iter_content(chunk_size=chunk_size):
                    size = file.write(data)
                    pbar.update(size)

        # Publish the file under its final name only once it is complete.
        os.replace(partial_path, target_path)
    finally:
        # After a successful rename the partial file is gone; otherwise drop
        # whatever was written so the next run starts from scratch.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def _find_vctk_audio_root(extract_root: Path) -> Path:
    """Return the folder below ``extract_root`` that holds the speaker folders."""
    # NOTE(review): the archive may unpack with or without a top-level
    # "VCTK-Corpus" folder, and the 0.92 release is believed to name its audio
    # folder "wav48_silence_trimmed" rather than "wav48" - confirm against the
    # actual archive. All four combinations are probed.
    for base in (extract_root, extract_root / "VCTK-Corpus"):
        for name in ("wav48", "wav48_silence_trimmed"):
            candidate = base / name
            if candidate.is_dir():
                return candidate
    raise FileNotFoundError(
        f"No VCTK audio directory (wav48 or wav48_silence_trimmed) found under {extract_root}"
    )


def download_vctk(target_dir: str = "data/raw") -> None:
    """Download the VCTK corpus and lay it out as one folder per speaker.

    Steps: download the archive (skipped when the archive or an extracted
    tree is already present), extract it into ``<target_dir>/VCTK-Corpus``,
    move every speaker folder to ``<target_dir>/<speaker>``, then delete the
    archive and the extracted tree.

    Args:
        target_dir: Directory that receives the per-speaker folders.

    Raises:
        FileNotFoundError: If the extracted archive contains no known audio
            folder. The archive and extracted tree are kept in that case.
    """
    url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
    target_dir = Path(target_dir)
    zip_path = target_dir / "vctk.zip"
    extract_root = target_dir / "VCTK-Corpus"

    # Create the target directory.
    os.makedirs(target_dir, exist_ok=True)

    if not extract_root.exists():
        # Download the dataset; only needed when nothing is extracted yet.
        if not zip_path.exists():
            print("Downloading VCTK dataset...")
            download_file(url, str(zip_path))

        # Extract into a dedicated folder so the archive's own top-level
        # layout cannot scatter files directly into target_dir.
        print("\nExtracting VCTK dataset...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_root)

    # Reorganise the file structure: one folder per speaker in target_dir.
    vctk_dir = _find_vctk_audio_root(extract_root)
    for speaker_dir in tqdm(os.listdir(vctk_dir), desc="Organizing files"):
        src_dir = vctk_dir / speaker_dir
        if not src_dir.is_dir():
            continue
        dst_dir = target_dir / speaker_dir
        if not dst_dir.exists():
            # Move rather than copy: the source tree is deleted below anyway,
            # and a move does not leave a half-copied speaker folder behind
            # when the run is interrupted.
            shutil.move(str(src_dir), str(dst_dir))

    # Clean up the downloaded archive and the extracted tree.
    if zip_path.exists():
        os.remove(zip_path)
    if extract_root.exists():
        shutil.rmtree(extract_root)

def download_librispeech(target_dir: str = "data/raw", subset: str = "dev-clean") -> None:
    """Download one LibriSpeech subset and lay it out as one folder per speaker.

    Steps: download ``<subset>.tar.gz`` (skipped when the archive or the
    extracted subset is already present), extract it into ``target_dir``,
    move every speaker folder of the subset to ``<target_dir>/libri_<speaker>``,
    then delete the archive and the extracted ``LibriSpeech`` tree.

    Args:
        target_dir: Directory that receives the ``libri_<speaker>`` folders.
        subset: Name of the subset archive on OpenSLR, e.g. ``"dev-clean"``.
    """
    url = f"https://www.openslr.org/resources/12/{subset}.tar.gz"
    target_dir = Path(target_dir)
    tar_path = target_dir / f"librispeech_{subset}.tar.gz"
    libri_dir = target_dir / "LibriSpeech" / subset

    # Create the target directory.
    os.makedirs(target_dir, exist_ok=True)

    # Check for the subset folder itself: a leftover "LibriSpeech" tree from a
    # different subset must not suppress the extraction of this one.
    if not libri_dir.exists():
        # Download the dataset; only needed when the subset is not extracted yet.
        if not tar_path.exists():
            print(f"Downloading LibriSpeech {subset} dataset...")
            download_file(url, str(tar_path))

        # Extract the dataset.
        print(f"\nExtracting LibriSpeech {subset} dataset...")
        with tarfile.open(tar_path, 'r:gz') as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: refuse members that
                # would escape target_dir (absolute paths, "..", links).
                tar.extractall(target_dir, filter="data")
            else:
                # Older Python without extraction filters.
                tar.extractall(target_dir)

    # Reorganise the file structure: one "libri_<speaker>" folder per speaker.
    for speaker_dir in tqdm(os.listdir(libri_dir), desc="Organizing files"):
        src_dir = libri_dir / speaker_dir
        if not src_dir.is_dir():
            continue
        dst_dir = target_dir / f"libri_{speaker_dir}"
        if not dst_dir.exists():
            # Move rather than copy: the source tree is deleted below anyway,
            # and a move does not leave a half-copied speaker folder behind
            # when the run is interrupted.
            shutil.move(str(src_dir), str(dst_dir))

    # Clean up the downloaded archive and the extracted tree.
    if tar_path.exists():
        os.remove(tar_path)
    if (target_dir / "LibriSpeech").exists():
        shutil.rmtree(target_dir / "LibriSpeech")

def download_aishell3(target_dir: str = "data/raw"):
    """Print manual download instructions for AISHELL-3 (needs an OpenSLR account).

    Nothing is downloaded; the function only tells the user where to get the
    dataset and that it should be extracted into ``target_dir``.
    """
    instructions = (
        "AISHELL-3 dataset needs to be downloaded manually from:",
        "https://www.openslr.org/93/",
        f"Please download and extract it to {target_dir}",
    )
    for message in instructions:
        print(message)

if __name__ == "__main__":
    import argparse

    # Map every accepted --dataset value to the function that handles it.
    handlers = {
        "vctk": download_vctk,
        "librispeech": download_librispeech,
        "aishell3": download_aishell3,
    }

    cli = argparse.ArgumentParser(description="Download speech datasets")
    cli.add_argument(
        "--dataset",
        type=str,
        choices=list(handlers),
        required=True,
        help="Dataset to download",
    )
    cli.add_argument(
        "--target_dir",
        type=str,
        default="data/raw",
        help="Directory to save the dataset",
    )
    options = cli.parse_args()

    # argparse already rejected unknown names, so the lookup cannot fail.
    handlers[options.dataset](options.target_dir)