diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..2bf35c43ed664551d57926e08095cedb08191769 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..0da7c02e5dda467db394050b263aff53bdc12ee4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +test.wav filter=lfs diff=lfs merge=lfs -text +vad/assets/silero_vad.jit filter=lfs diff=lfs merge=lfs -text diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000000000000000000000000000000000..54c5196a2b9da061074225f39dc40aed04fec0b9 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10.9 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6ebe0ff106911262124772a8f199fe9c9e68e585 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 PlayVoice + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 68b3932323aa60b5afa11d18fc39150f78d0e676..056a88d5d2640c34cafdabc6a8bb1f41148e973b 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,500 @@ --- -title: Sovits Test -emoji: 📚 +title: Whisper Vits SVC +emoji: 🎵 +python_version: 3.10.12 colorFrom: blue colorTo: purple sdk: gradio -sdk_version: 5.8.0 -app_file: app.py +sdk_version: 5.7.1 +app_file: main.py pinned: false +license: mit --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +
+

Variational Inference with adversarial learning for end-to-end Singing Voice Conversion based on VITS

+ +[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/maxmax20160403/sovits5.0) +GitHub Repo stars +GitHub forks +GitHub issues +GitHub + +[中文文档](./README_ZH.md) + +The [bigvgan-mix-v2](https://github.com/PlayVoice/whisper-vits-svc/tree/bigvgan-mix-v2) branch offers good audio quality + +The [RoFormer-HiFTNet](https://github.com/PlayVoice/whisper-vits-svc/tree/RoFormer-HiFTNet) branch offers fast inference speed + +No further upgrades are planned + +
+ +- This project targets deep learning beginners, basic knowledge of Python and PyTorch are the prerequisites for this project; +- This project aims to help deep learning beginners get rid of boring pure theoretical learning, and master the basic knowledge of deep learning by combining it with practices; +- This project does not support real-time voice converting; (need to replace whisper if real-time voice converting is what you are looking for) +- This project will not develop one-click packages for other purposes; + +![vits-5.0-frame](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/3854b281-8f97-4016-875b-6eb663c92466) + +- A minimum VRAM requirement of 6GB for training + +- Support for multiple speakers + +- Create unique speakers through speaker mixing + +- It can even convert voices with light accompaniment + +- You can edit F0 using Excel + +https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/6a09805e-ab93-47fe-9a14-9cbc1e0e7c3a + +Powered by [@ShadowVap](https://space.bilibili.com/491283091) + +## Model properties + +| Feature | From | Status | Function | +| :--- | :--- | :--- | :--- | +| whisper | OpenAI | ✅ | strong noise immunity | +| bigvgan | NVIDA | ✅ | alias and snake | The formant is clearer and the sound quality is obviously improved | +| natural speech | Microsoft | ✅ | reduce mispronunciation | +| neural source-filter | Xin Wang | ✅ | solve the problem of audio F0 discontinuity | +| pitch quantization | Xin Wang | ✅ | quantize the F0 for embedding | +| speaker encoder | Google | ✅ | Timbre Encoding and Clustering | +| GRL for speaker | Ubisoft |✅ | Preventing Encoder Leakage Timbre | +| SNAC | Samsung | ✅ | One Shot Clone of VITS | +| SCLN | Microsoft | ✅ | Improve Clone | +| Diffusion | HuaWei | ✅ | Improve sound quality | +| PPG perturbation | this project | ✅ | Improved noise immunity and de-timbre | +| HuBERT perturbation | this project | ✅ | Improved noise immunity and de-timbre | +| VAE perturbation | this project | ✅ | Improve sound quality | +| MIX encoder | this project | ✅ | Improve conversion stability | +| USP infer | this project | ✅ | Improve conversion stability | +| HiFTNet | Columbia University | ✅ | NSF-iSTFTNet for speed up | +| RoFormer | Zhuiyi Technology | ✅ | Rotary Positional Embeddings | + +due to the use of data perturbation, it takes longer to train than other projects. + +**USP : Unvoice and Silence with Pitch when infer** +![vits_svc_usp](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/ba733b48-8a89-4612-83e0-a0745587d150) + +## Why mix + +![mix_frame](https://github.com/PlayVoice/whisper-vits-svc/assets/16432329/3ffa1be0-1a21-4752-96b5-6220f98f2313) + +## Plug-In-Diffusion + +![plug-in-diffusion](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/54a61c90-a97b-404d-9cc9-a2151b2db28f) + +## Setup Environment + +1. Install [PyTorch](https://pytorch.org/get-started/locally/). + +2. Install project dependencies + ```shell + pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt + ``` + **Note: whisper is already built-in, do not install it again otherwise it will cuase conflict and error** +3. Download the Timbre Encoder: [Speaker-Encoder by @mueller91](https://drive.google.com/drive/folders/15oeBYf6Qn1edONkVLXe82MzdIi3O_9m3), put `best_model.pth.tar` into `speaker_pretrain/`. + +4. Download whisper model [whisper-large-v2](https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt). 
Make sure to download `large-v2.pt`,put it into `whisper_pretrain/`. + +5. Download [hubert_soft model](https://github.com/bshall/hubert/releases/tag/v0.1),put `hubert-soft-0d54a1f4.pt` into `hubert_pretrain/`. + +6. Download pitch extractor [crepe full](https://github.com/maxrmorrison/torchcrepe/tree/master/torchcrepe/assets),put `full.pth` into `crepe/assets`. + + **Note: crepe full.pth is 84.9 MB, not 6kb** + +7. Download pretrain model [sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0/), and put it into `vits_pretrain/`. + ```shell + python svc_inference.py --config configs/base.yaml --model ./vits_pretrain/sovits5.0.pretrain.pth --spk ./configs/singers/singer0001.npy --wave test.wav + ``` + +## Dataset preparation + +Necessary pre-processing: +1. Separate voice and accompaniment with [UVR](https://github.com/Anjok07/ultimatevocalremovergui) (skip if no accompaniment) +2. Cut audio input to shorter length with [slicer](https://github.com/flutydeer/audio-slicer), whisper takes input less than 30 seconds. +3. Manually check generated audio input, remove inputs shorter than 2 seconds or with obivous noise. +4. Adjust loudness if necessary, recommend Adobe Audiiton. +5. Put the dataset into the `dataset_raw` directory following the structure below. +``` +dataset_raw +├───speaker0 +│ ├───000001.wav +│ ├───... +│ └───000xxx.wav +└───speaker1 + ├───000001.wav + ├───... + └───000xxx.wav +``` + +## Data preprocessing +```shell +python svc_preprocessing.py -t 2 +``` +`-t`: threading, max number should not exceed CPU core count, usually 2 is enough. +After preprocessing you will get an output with following structure. +``` +data_svc/ +└── waves-16k +│ └── speaker0 +│ │ ├── 000001.wav +│ │ └── 000xxx.wav +│ └── speaker1 +│ ├── 000001.wav +│ └── 000xxx.wav +└── waves-32k +│ └── speaker0 +│ │ ├── 000001.wav +│ │ └── 000xxx.wav +│ └── speaker1 +│ ├── 000001.wav +│ └── 000xxx.wav +└── pitch +│ └── speaker0 +│ │ ├── 000001.pit.npy +│ │ └── 000xxx.pit.npy +│ └── speaker1 +│ ├── 000001.pit.npy +│ └── 000xxx.pit.npy +└── hubert +│ └── speaker0 +│ │ ├── 000001.vec.npy +│ │ └── 000xxx.vec.npy +│ └── speaker1 +│ ├── 000001.vec.npy +│ └── 000xxx.vec.npy +└── whisper +│ └── speaker0 +│ │ ├── 000001.ppg.npy +│ │ └── 000xxx.ppg.npy +│ └── speaker1 +│ ├── 000001.ppg.npy +│ └── 000xxx.ppg.npy +└── speaker +│ └── speaker0 +│ │ ├── 000001.spk.npy +│ │ └── 000xxx.spk.npy +│ └── speaker1 +│ ├── 000001.spk.npy +│ └── 000xxx.spk.npy +└── singer +│ ├── speaker0.spk.npy +│ └── speaker1.spk.npy +| +└── indexes + ├── speaker0 + │ ├── some_prefix_hubert.index + │ └── some_prefix_whisper.index + └── speaker1 + ├── hubert.index + └── whisper.index +``` + +1. Re-sampling + - Generate audio with a sampling rate of 16000Hz in `./data_svc/waves-16k` + ``` + python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 + ``` + + - Generate audio with a sampling rate of 32000Hz in `./data_svc/waves-32k` + ``` + python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 + ``` +2. Use 16K audio to extract pitch + ``` + python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch + ``` +3. Use 16K audio to extract ppg + ``` + python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper + ``` +4. Use 16K audio to extract hubert + ``` + python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert + ``` +5. 
Use 16k audio to extract timbre code + ``` + python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker + ``` +6. Extract the average value of the timbre code for inference; it can also replace a single audio timbre in generating the training index, and use it as the unified timbre of the speaker for training + ``` + python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer + ``` +7. Use 32k audio to extract the linear spectrum + ``` + python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs + ``` +8. Use 32k audio to generate training index + ``` + python prepare/preprocess_train.py + ``` +11. Training file debugging + ``` + python prepare/preprocess_zzz.py + ``` + +## Train +1. If fine-tuning is based on the pre-trained model, you need to download the pre-trained model: [sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0). Put pretrained model under project root, change this line + ``` + pretrain: "./vits_pretrain/sovits5.0.pretrain.pth" + ``` + in `configs/base.yaml`,and adjust the learning rate appropriately, eg 5e-5. + + `batch_size`: for GPU with 6G VRAM, 6 is the recommended value, 8 will work but step speed will be much slower. +2. Start training + ``` + python svc_trainer.py -c configs/base.yaml -n sovits5.0 + ``` +3. Resume training + ``` + python svc_trainer.py -c configs/base.yaml -n sovits5.0 -p chkpt/sovits5.0/sovits5.0_***.pt + ``` +4. Log visualization + ``` + tensorboard --logdir logs/ + ``` + +![sovits5 0_base](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/1628e775-5888-4eac-b173-a28dca978faa) + +![sovits_spec](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/c4223cf3-b4a0-4325-bec0-6d46d195a1fc) + +## Inference + +1. Export inference model: text encoder, Flow network, Decoder network + ``` + python svc_export.py --config configs/base.yaml --checkpoint_path chkpt/sovits5.0/***.pt + ``` +2. Inference + - if there is no need to adjust `f0`, just run the following command. + ``` + python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.spk.npy --wave test.wav --shift 0 + ``` + - if `f0` will be adjusted manually, follow the steps: + 1. use whisper to extract content encoding, generate `test.vec.npy`. + ``` + python whisper/inference.py -w test.wav -p test.ppg.npy + ``` + 2. use hubert to extract content vector, without using one-click reasoning, in order to reduce GPU memory usage + ``` + python hubert/inference.py -w test.wav -v test.vec.npy + ``` + 3. extract the F0 parameter to the csv text format, open the csv file in Excel, and manually modify the wrong F0 according to Audition or SonicVisualiser + ``` + python pitch/inference.py -w test.wav -p test.csv + ``` + 4. final inference + ``` + python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.spk.npy --wave test.wav --ppg test.ppg.npy --vec test.vec.npy --pit test.csv --shift 0 + ``` +3. 
Notes + + - when `--ppg` is specified, when the same audio is reasoned multiple times, it can avoid repeated extraction of audio content codes; if it is not specified, it will be automatically extracted; + + - when `--vec` is specified, when the same audio is reasoned multiple times, it can avoid repeated extraction of audio content codes; if it is not specified, it will be automatically extracted; + + - when `--pit` is specified, the manually tuned F0 parameter can be loaded; if not specified, it will be automatically extracted; + + - generate files in the current directory:svc_out.wav + +4. Arguments ref + + | args |--config | --model | --spk | --wave | --ppg | --vec | --pit | --shift | + | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | + | name | config path | model path | speaker | wave input | wave ppg | wave hubert | wave pitch | pitch shift | + +5. post by vad +``` +python svc_inference_post.py --ref test.wav --svc svc_out.wav --out svc_out_post.wav +``` + +## Train Feature Retrieval Index (Optional) + +To increase the stability of the generated timbre, you can use the method described in the +[Retrieval-based-Voice-Conversion](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/en/README.en.md) +repository. This method consists of 2 steps: + +1. Training the retrieval index on hubert and whisper features + Run training with default settings: + ``` + python svc_train_retrieval.py + ``` + + If the number of vectors is more than 200_000 they will be compressed to 10_000 using the MiniBatchKMeans algorithm. + You can change these settings using command line options: + ``` + usage: crate faiss indexes for feature retrieval [-h] [--debug] [--prefix PREFIX] [--speakers SPEAKERS [SPEAKERS ...]] [--compress-features-after COMPRESS_FEATURES_AFTER] + [--n-clusters N_CLUSTERS] [--n-parallel N_PARALLEL] + + options: + -h, --help show this help message and exit + --debug + --prefix PREFIX add prefix to index filename + --speakers SPEAKERS [SPEAKERS ...] + speaker names to create an index. By default all speakers are from data_svc + --compress-features-after COMPRESS_FEATURES_AFTER + If the number of features is greater than the value compress feature vectors using MiniBatchKMeans. + --n-clusters N_CLUSTERS + Number of centroids to which features will be compressed + --n-parallel N_PARALLEL + Nuber of parallel job of MinibatchKmeans. Default is cpus-1 + ``` + Compression of training vectors can speed up index inference, but reduces the quality of the retrieve. + Use vector count compression if you really have a lot of them. + + The resulting indexes will be stored in the "indexes" folder as: + ``` + data_svc + ... + └── indexes + ├── speaker0 + │ ├── some_prefix_hubert.index + │ └── some_prefix_whisper.index + └── speaker1 + ├── hubert.index + └── whisper.index + ``` +2. 
At the inference stage adding the n closest features in a certain proportion of the vits model + Enable Feature Retrieval with settings: + ``` + python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.spk.npy --wave test.wav --shift 0 \ + --enable-retrieval \ + --retrieval-ratio 0.5 \ + --n-retrieval-vectors 3 + ``` + For a better retrieval effect, you can try to cycle through different parameters: `--retrieval-ratio` and `--n-retrieval-vectors` + + If you have multiple sets of indexes, you can specify a specific set via the parameter: `--retrieval-index-prefix` + + You can explicitly specify the paths to the hubert and whisper indexes using the parameters: `--hubert-index-path` and `--whisper-index-path` + + +## Create singer +named by pure coincidence:average -> ave -> eva,eve(eva) represents conception and reproduction + +``` +python svc_eva.py +``` + +```python +eva_conf = { + './configs/singers/singer0022.npy': 0, + './configs/singers/singer0030.npy': 0, + './configs/singers/singer0047.npy': 0.5, + './configs/singers/singer0051.npy': 0.5, +} +``` + +the generated singer file will be `eva.spk.npy`. + +## Data set + +| Name | URL | +| :--- | :--- | +|KiSing |http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/| +|PopCS |https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md| +|opencpop |https://wenet.org.cn/opencpop/download/| +|Multi-Singer |https://github.com/Multi-Singer/Multi-Singer.github.io| +|M4Singer |https://github.com/M4Singer/M4Singer/blob/master/apply_form.md| +|CSD |https://zenodo.org/record/4785016#.YxqrTbaOMU4| +|KSS |https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset| +|JVS MuSic |https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_music| +|PJS |https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus| +|JUST Song |https://sites.google.com/site/shinnosuketakamichi/publication/jsut-song| +|MUSDB18 |https://sigsep.github.io/datasets/musdb.html#musdb18-compressed-stems| +|DSD100 |https://sigsep.github.io/datasets/dsd100.html| +|Aishell-3 |http://www.aishelltech.com/aishell_3| +|VCTK |https://datashare.ed.ac.uk/handle/10283/2651| +|Korean Songs |http://urisori.co.kr/urisori-en/doku.php/| + +## Code sources and references + +https://github.com/facebookresearch/speech-resynthesis [paper](https://arxiv.org/abs/2104.00355) + +https://github.com/jaywalnut310/vits [paper](https://arxiv.org/abs/2106.06103) + +https://github.com/openai/whisper/ [paper](https://arxiv.org/abs/2212.04356) + +https://github.com/NVIDIA/BigVGAN [paper](https://arxiv.org/abs/2206.04658) + +https://github.com/mindslab-ai/univnet [paper](https://arxiv.org/abs/2106.07889) + +https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts/tree/master/project/01-nsf + +https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS + +https://github.com/brentspell/hifi-gan-bwe + +https://github.com/mozilla/TTS + +https://github.com/bshall/soft-vc + +https://github.com/maxrmorrison/torchcrepe + +https://github.com/MoonInTheRiver/DiffSinger + +https://github.com/OlaWod/FreeVC [paper](https://arxiv.org/abs/2210.15418) + +https://github.com/yl4579/HiFTNet [paper](https://arxiv.org/abs/2309.09493) + +[Autoregressive neural f0 model for statistical parametric speech synthesis](https://web.archive.org/web/20210718024752id_/https://ieeexplore.ieee.org/ielx7/6570655/8356719/08341752.pdf) + +[One-shot Voice Conversion by 
Separating Speaker and Content Representations with Instance Normalization](https://arxiv.org/abs/1904.05742) + +[SNAC : Speaker-normalized Affine Coupling Layer in Flow-based Architecture for Zero-Shot Multi-Speaker Text-to-Speech](https://github.com/hcy71o/SNAC) + +[Adapter-Based Extension of Multi-Speaker Text-to-Speech Model for New Speakers](https://arxiv.org/abs/2211.00585) + +[AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/pdf/2103.00993.pdf) + +[AdaVITS: Tiny VITS for Low Computing Resource Speaker Adaptation](https://arxiv.org/pdf/2206.00208.pdf) + +[Cross-Speaker Prosody Transfer on Any Text for Expressive Speech Synthesis](https://github.com/ubisoft/ubisoft-laforge-daft-exprt) + +[Learn to Sing by Listening: Building Controllable Virtual Singer by Unsupervised Learning from Voice Recordings](https://arxiv.org/abs/2305.05401) + +[Adversarial Speaker Disentanglement Using Unannotated External Data for Self-supervised Representation Based Voice Conversion](https://arxiv.org/pdf/2305.09167.pdf) + +[Multilingual Speech Synthesis and Cross-Language Voice Cloning: GRL](https://arxiv.org/abs/1907.04448) + +[RoFormer: Enhanced Transformer with rotary position embedding](https://arxiv.org/abs/2104.09864) + +## Method of Preventing Timbre Leakage Based on Data Perturbation + +https://github.com/auspicious3000/contentvec/blob/main/contentvec/data/audio/audio_utils_1.py + +https://github.com/revsic/torch-nansy/blob/main/utils/augment/praat.py + +https://github.com/revsic/torch-nansy/blob/main/utils/augment/peq.py + +https://github.com/biggytruck/SpeechSplit2/blob/main/utils.py + +https://github.com/OlaWod/FreeVC/blob/main/preprocess_sr.py + +## Contributors + + + + + +## Thanks to + +https://github.com/Francis-Komizu/Sovits + +## Relevant Projects +- [LoRA-SVC](https://github.com/PlayVoice/lora-svc): decoder only svc +- [Grad-SVC](https://github.com/PlayVoice/Grad-SVC): diffusion based svc + +## Original evidence +2022.04.12 https://mp.weixin.qq.com/s/autNBYCsG4_SvWt2-Ll_zA + +2022.04.22 https://github.com/PlayVoice/VI-SVS + +2022.07.26 https://mp.weixin.qq.com/s/qC4TJy-4EVdbpvK2cQb1TA + +2022.09.08 https://github.com/PlayVoice/VI-SVC + +## Be copied by svc-develop-team/so-vits-svc +![coarse_f0_1](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/e2f5e5d3-d169-42c1-953f-4e1648b6da37) diff --git a/README_ZH.md b/README_ZH.md new file mode 100644 index 0000000000000000000000000000000000000000..cf9571210c7a3d319a65dae3c4a449f6473a3bd9 --- /dev/null +++ b/README_ZH.md @@ -0,0 +1,418 @@ +
+

Variational Inference with adversarial learning for end-to-end Singing Voice Conversion based on VITS

+ +[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/maxmax20160403/sovits5.0) +GitHub Repo stars +GitHub forks +GitHub issues +GitHub + +
+ +### 本项目使用简洁明了的代码结构,用于深度学习技术的研究 +### 基于学习的目的,本项目并不追求效果极限、而更多的为学生笔记本考虑,采用了低配置参数、最终预训练模型为202M(包括生成器和判别器,且为float32模型),远远小于同类项目模型大小 +### 如果你寻找的是直接可用的项目,本项目并不适合你 + +- 本项目的目标群体是:深度学习初学者,具备Python和PyTorch的基本操作是使用本项目的前置条件; +- 本项目旨在帮助深度学习初学者,摆脱枯燥的纯理论学习,通过与实践结合,熟练掌握深度学习基本知识; +- 本项目不支持实时变声;(支持需要换掉whisper) +- 本项目不会开发用于其他用途的一键包 +### 代码详解课程 +- 1-整体框架 https://www.bilibili.com/video/BV1Tj411e7pQ +- 2-数据准备和预处理 https://www.bilibili.com/video/BV1uj411v7zW +- 3-先验后验编码器 https://www.bilibili.com/video/BV1Be411Q7r5 +- 4-decoder部分 https://www.bilibili.com/video/BV19u4y1b73U +- 5-蛇形激活函数 https://www.bilibili.com/video/BV1HN4y1D7AR +- 6-Flow部分 https://www.bilibili.com/video/BV1ju411F7Fs +- 7-训练及损失函数部分 https://www.bilibili.com/video/BV1qw411W73B +- 8-训练推理以及基频矫正 https://www.bilibili.com/video/BV1eb4y1u7ER + +![vits-5.0-frame](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/3854b281-8f97-4016-875b-6eb663c92466) + +- 【无 泄漏】支持多发音人 + +- 【捏 音色】创造独有发音人 + +- 【带 伴奏】也能进行转换,轻度伴奏 + +- 【用 Excel】进行原始调教,纯手工 + +https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/63858332-cc0d-40e1-a216-6fe8bf638f7c + +Powered by [@ShadowVap](https://space.bilibili.com/491283091) + +## 模型特点: + +| Feature | From | Status | Function | +| :--- | :--- | :--- | :--- | +| whisper | OpenAI | ✅ | 强大的抗噪能力 | +| bigvgan | NVIDA | ✅ | 抗锯齿与蛇形激活,共振峰更清晰,提升音质明显 | +| natural speech | Microsoft | ✅ | 减少发音错误 | +| neural source-filter | NII | ✅ | 解决断音问题 | +| speaker encoder | Google | ✅ | 音色编码与聚类 | +| GRL for speaker | Ubisoft |✅ | 对抗去音色 | +| SNAC | Samsung | ✅ | VITS 一句话克隆 | +| SCLN | Microsoft | ✅ | 改善克隆 | +| PPG perturbation | 本项目 | ✅ | 提升抗噪性和去音色 | +| HuBERT perturbation | 本项目 | ✅ | 提升抗噪性和去音色 | +| VAE perturbation | 本项目 | ✅ | 提升音质 | +| Mix encoder | 本项目 | ✅ | 提升转换稳定性 | +| USP 推理 | 本项目 | ✅ | 提升转换稳定性 | + +**USP : 即使unvoice和silence在推理的时候,也有Pitch,这个Pitch平滑链接voice段** +![vits_svc_usp](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/ba733b48-8a89-4612-83e0-a0745587d150) + +## 为什么要mix + +![mix_frame](https://github.com/PlayVoice/whisper-vits-svc/assets/16432329/3ffa1be0-1a21-4752-96b5-6220f98f2313) + +## 安装环境 + +1. 安装[PyTorch](https://pytorch.org/get-started/locally/) + +2. 安装项目依赖 + ``` + pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt + ``` + **注意:不能额外安装whisper,否则会和代码内置whisper冲突** + +3. 下载[音色编码器](https://drive.google.com/drive/folders/15oeBYf6Qn1edONkVLXe82MzdIi3O_9m3), 把`best_model.pth.tar`放到`speaker_pretrain/`里面 (**不要解压**) + +4. 下载[whisper-large-v2模型](https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt),把`large-v2.pt`放到`whisper_pretrain/`里面 + +5. 下载[hubert_soft模型](https://github.com/bshall/hubert/releases/tag/v0.1),把`hubert-soft-0d54a1f4.pt`放到`hubert_pretrain/`里面 + +6. 下载音高提取模型[crepe full](https://github.com/maxrmorrison/torchcrepe/tree/master/torchcrepe/assets),把`full.pth`放到`crepe/assets`里面 + + **注意:full.pth为84.9M,请确认文件大小无误** + +7. 下载[sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0/), 把它放到`vits_pretrain/`里面,推理测试 + + > python svc_inference.py --config configs/base.yaml --model ./vits_pretrain/sovits5.0.pretrain.pth --spk ./configs/singers/singer0001.npy --wave test.wav + +## 数据集准备 +1. 人声分离,如果数据集没有BGM直接跳过此步骤(推荐使用[UVR](https://github.com/Anjok07/ultimatevocalremovergui)中的3_HP-Vocal-UVR模型或者htdemucs_ft模型抠出数据集中的人声) +2. 用[slicer](https://github.com/flutydeer/audio-slicer)剪切音频,whisper要求为小于30秒(建议丢弃不足2秒的音频,短音频大多没有音素,有可能会影响训练效果) +3. 手动筛选经过第1步和第2步处理过的音频,裁剪或者丢弃杂音明显的音频,如果数据集没有BGM直接跳过此步骤 +4. 
用Adobe Audition进行响度平衡处理 +5. 按下面文件结构,将数据集放入dataset_raw目录 +```shell +dataset_raw +├───speaker0 +│ ├───000001.wav +│ ├───... +│ └───000xxx.wav +└───speaker1 + ├───000001.wav + ├───... + └───000xxx.wav +``` + +## 数据预处理 + +```shell +python svc_preprocessing.py -t 2 +``` +-t:指定线程数,必须是正整数且不得超过CPU总核心数,一般写2就可以了 + +预处理完成后文件夹结构如下面所示 +```shell +data_svc/ +└── waves-16k +│ └── speaker0 +│ │ ├── 000001.wav +│ │ └── 000xxx.wav +│ └── speaker1 +│ ├── 000001.wav +│ └── 000xxx.wav +└── waves-32k +│ └── speaker0 +│ │ ├── 000001.wav +│ │ └── 000xxx.wav +│ └── speaker1 +│ ├── 000001.wav +│ └── 000xxx.wav +└── pitch +│ └── speaker0 +│ │ ├── 000001.pit.npy +│ │ └── 000xxx.pit.npy +│ └── speaker1 +│ ├── 000001.pit.npy +│ └── 000xxx.pit.npy +└── hubert +│ └── speaker0 +│ │ ├── 000001.vec.npy +│ │ └── 000xxx.vec.npy +│ └── speaker1 +│ ├── 000001.vec.npy +│ └── 000xxx.vec.npy +└── whisper +│ └── speaker0 +│ │ ├── 000001.ppg.npy +│ │ └── 000xxx.ppg.npy +│ └── speaker1 +│ ├── 000001.ppg.npy +│ └── 000xxx.ppg.npy +└── speaker +│ └── speaker0 +│ │ ├── 000001.spk.npy +│ │ └── 000xxx.spk.npy +│ └── speaker1 +│ ├── 000001.spk.npy +│ └── 000xxx.spk.npy +└── singer + ├── speaker0.spk.npy + └── speaker1.spk.npy +``` + +如果您有编程基础,推荐,逐步完成数据处理,也利于学习内部工作原理 + +- 1, 重采样 + + 生成采样率16000Hz音频, 存储路径为:./data_svc/waves-16k + + > python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 + + 生成采样率32000Hz音频, 存储路径为:./data_svc/waves-32k + + > python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 + +- 2, 使用16K音频,提取音高 + + > python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch + +- 3, 使用16k音频,提取内容编码 + > python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper + +- 4, 使用16k音频,提取内容编码 + > python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert + +- 5, 使用16k音频,提取音色编码 + > python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker + +- 6, 提取音色编码均值;用于推理,也可作为发音人统一音色用于生成训练索引(数据音色变化不大的情况下) + > python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer + +- 7, 使用32k音频,提取线性谱 + > python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs + +- 8, 使用32k音频,生成训练索引 + > python prepare/preprocess_train.py + +- 9, 训练文件调试 + > python prepare/preprocess_zzz.py + +## 训练 +0. 参数调整 + 如果基于预训练模型微调,需要下载预训练模型[sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0)并且放在项目根目录下面
+ 并且修改`configs/base.yaml`的参数`pretrain: "./vits_pretrain/sovits5.0.pretrain.pth"`,并适当调小学习率(建议从5e-5开始尝试)
+ **learning_rate & batch_size & accum_step 为三个紧密相关的参数,需要仔细调节**
+ **batch_size 乘以 accum_step 通常等于 16 或 32,对于低显存GPU,可以尝试 batch_size = 4,accum_step = 4** + +1. 开始训练 + ``` + python svc_trainer.py -c configs/base.yaml -n sovits5.0 + ``` +2. 恢复训练 + ``` + python svc_trainer.py -c configs/base.yaml -n sovits5.0 -p chkpt/sovits5.0/sovits5.0_***.pt + ``` +3. 训练日志可视化 + ``` + tensorboard --logdir logs/ + ``` + +![sovits5 0_base](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/1628e775-5888-4eac-b173-a28dca978faa) + +![sovits_spec](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/c4223cf3-b4a0-4325-bec0-6d46d195a1fc) + +## 推理 +1. 导出推理模型:文本编码器,Flow网络,Decoder网络;判别器和后验编码器等只在训练中使用 + ``` + python svc_export.py --config configs/base.yaml --checkpoint_path chkpt/sovits5.0/***.pt + ``` +2. 推理 +- 如果不想手动调整f0,只需要最终的推理结果,运行下面的命令即可 + ``` + python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/修改成对应的名称.npy --wave test.wav --shift 0 + ``` +- 如果需要手动调整f0,依据下面的流程操作 + + - 使用whisper提取内容编码,生成test.ppg.npy + ``` + python whisper/inference.py -w test.wav -p test.ppg.npy + ``` + + - 使用hubert提取内容编码,生成test.vec.npy + ``` + python hubert/inference.py -w test.wav -v test.vec.npy + ``` + + - 提取csv文本格式F0参数,用Excel打开csv文件,对照Audition或者SonicVisualiser手动修改错误的F0 + ``` + python pitch/inference.py -w test.wav -p test.csv + ``` + - 最终推理 + ``` + python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/修改成对应的名称.npy --wave test.wav --ppg test.ppg.npy --vec test.vec.npy --pit test.csv --shift 0 + ``` + +3. 一些注意点 + 当指定--ppg后,多次推理同一个音频时,可以避免重复提取音频内容编码;没有指定,也会自动提取 + + 当指定--vec后,多次推理同一个音频时,可以避免重复提取音频内容编码;没有指定,也会自动提取 + + 当指定--pit后,可以加载手工调教的F0参数;没有指定,也会自动提取 + + 生成文件在当前目录svc_out.wav + + | args | --config | --model | --spk | --wave | --ppg | --vec | --pit | --shift | + | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | + | name | 配置文件 | 模型文件 | 音色文件 | 音频文件 | ppg内容 | hubert内容 | 音高内容 | 升降调 | + +4. 
去噪后处理 +``` +python svc_inference_post.py --ref test.wav --svc svc_out.wav --out svc_out_post.wav +``` + +## 两种训练模式 +- 分散模式:训练索引中,音色文件使用音频音色 +- 统一模式:训练索引中,音色文件使用发音人音色 + +**问题:哪种情况下,哪个模式更好** + +## 模型融合 +``` +python svc_merge.py --model1 模型1.pt --model1 模型2.pt --rate 模型1占比(0~1) +``` +对不同epoch的模型进行融合,可以获得比较平均的性能、削弱过拟合 + +例如:python svc_merge.py --model1 chkpt\sovits5.0\sovits5.0_1045.pt --model2 chkpt\sovits5.0\sovits5.0_1050.pt --rate 0.4 + +## 捏音色 +纯属巧合的取名:average -> ave -> eva,夏娃代表者孕育和繁衍 +``` +python svc_eva.py +``` +```python +eva_conf = { + './configs/singers/singer0022.npy': 0, + './configs/singers/singer0030.npy': 0, + './configs/singers/singer0047.npy': 0.5, + './configs/singers/singer0051.npy': 0.5, +} +``` + +生成的音色文件为:eva.spk.npy + +## 数据集 + +| Name | URL | +| :--- | :--- | +|KiSing |http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/| +|PopCS |https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md| +|opencpop |https://wenet.org.cn/opencpop/download/| +|Multi-Singer |https://github.com/Multi-Singer/Multi-Singer.github.io| +|M4Singer |https://github.com/M4Singer/M4Singer/blob/master/apply_form.md| +|CSD |https://zenodo.org/record/4785016#.YxqrTbaOMU4| +|KSS |https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset| +|JVS MuSic |https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_music| +|PJS |https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus| +|JUST Song |https://sites.google.com/site/shinnosuketakamichi/publication/jsut-song| +|MUSDB18 |https://sigsep.github.io/datasets/musdb.html#musdb18-compressed-stems| +|DSD100 |https://sigsep.github.io/datasets/dsd100.html| +|Aishell-3 |http://www.aishelltech.com/aishell_3| +|VCTK |https://datashare.ed.ac.uk/handle/10283/2651| +|Korean Songs |http://urisori.co.kr/urisori-en/doku.php/| + +## 代码来源和参考文献 + +https://github.com/facebookresearch/speech-resynthesis [paper](https://arxiv.org/abs/2104.00355) + +https://github.com/jaywalnut310/vits [paper](https://arxiv.org/abs/2106.06103) + +https://github.com/openai/whisper/ [paper](https://arxiv.org/abs/2212.04356) + +https://github.com/NVIDIA/BigVGAN [paper](https://arxiv.org/abs/2206.04658) + +https://github.com/mindslab-ai/univnet [paper](https://arxiv.org/abs/2106.07889) + +https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts/tree/master/project/01-nsf + +https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS + +https://github.com/brentspell/hifi-gan-bwe + +https://github.com/mozilla/TTS + +https://github.com/bshall/soft-vc + +https://github.com/maxrmorrison/torchcrepe + +https://github.com/MoonInTheRiver/DiffSinger + +https://github.com/OlaWod/FreeVC [paper](https://arxiv.org/abs/2210.15418) + +https://github.com/yl4579/HiFTNet [paper](https://arxiv.org/abs/2309.09493) + +[One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization](https://arxiv.org/abs/1904.05742) + +[SNAC : Speaker-normalized Affine Coupling Layer in Flow-based Architecture for Zero-Shot Multi-Speaker Text-to-Speech](https://github.com/hcy71o/SNAC) + +[Adapter-Based Extension of Multi-Speaker Text-to-Speech Model for New Speakers](https://arxiv.org/abs/2211.00585) + +[AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/pdf/2103.00993.pdf) + +[AdaVITS: Tiny VITS for Low Computing Resource Speaker Adaptation](https://arxiv.org/pdf/2206.00208.pdf) + +[Cross-Speaker Prosody Transfer on Any Text for 
Expressive Speech Synthesis](https://github.com/ubisoft/ubisoft-laforge-daft-exprt) + +[Learn to Sing by Listening: Building Controllable Virtual Singer by Unsupervised Learning from Voice Recordings](https://arxiv.org/abs/2305.05401) + +[Adversarial Speaker Disentanglement Using Unannotated External Data for Self-supervised Representation Based Voice Conversion](https://arxiv.org/pdf/2305.09167.pdf) + +[Multilingual Speech Synthesis and Cross-Language Voice Cloning: GRL](https://arxiv.org/abs/1907.04448) + +[RoFormer: Enhanced Transformer with rotary position embedding](https://arxiv.org/abs/2104.09864))https://github.com/facebookresearch/speech-resynthesis [paper](https://arxiv.org/abs/2104.00355) + +## 基于数据扰动防止音色泄露的方法 + +https://github.com/auspicious3000/contentvec/blob/main/contentvec/data/audio/audio_utils_1.py + +https://github.com/revsic/torch-nansy/blob/main/utils/augment/praat.py + +https://github.com/revsic/torch-nansy/blob/main/utils/augment/peq.py + +https://github.com/biggytruck/SpeechSplit2/blob/main/utils.py + +https://github.com/OlaWod/FreeVC/blob/main/preprocess_sr.py + +## 贡献者 + + + + + +## 特别感谢 + +https://github.com/Francis-Komizu/Sovits + +## 原创过程 +2022.04.12 https://mp.weixin.qq.com/s/autNBYCsG4_SvWt2-Ll_zA + +2022.04.22 https://github.com/PlayVoice/VI-SVS + +2022.07.26 https://mp.weixin.qq.com/s/qC4TJy-4EVdbpvK2cQb1TA + +2022.09.08 https://github.com/PlayVoice/VI-SVC + +## 被这个项目拷贝:svc-develop-team/so-vits-svc +![coarse_f0_1](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/e2f5e5d3-d169-42c1-953f-4e1648b6da37) + +![coarse_f0_2](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/f3539c83-7c8a-425e-bf20-2c402132f0f4) + +![coarse_f0_3](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/f3cee94a-0eeb-4189-b9bb-7043d06e62ef) + +## Rcell对拷贝的真实回应 + +![Rcell](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/8ebb236d-e233-4cea-9359-8e44029b5af5) diff --git a/__pycache__/svc_inference.cpython-310.pyc b/__pycache__/svc_inference.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a79eaa7c24684227ac10306cb42310722ef8672a Binary files /dev/null and b/__pycache__/svc_inference.cpython-310.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..33e8eb3051352a1e2f93eaebcdc8681b5e6799fb --- /dev/null +++ b/app.py @@ -0,0 +1,444 @@ +import os +import subprocess +import yaml +import sys +import webbrowser +import gradio as gr +from ruamel.yaml import YAML +import shutil +import soundfile +import shlex +import locale + +class WebUI: + def __init__(self): + self.train_config_path = 'configs/train.yaml' + self.info = Info() + self.names = [] + self.names2 = [] + self.voice_names = [] + self.base_config_path = 'configs/base.yaml' + if not os.path.exists(self.train_config_path): + shutil.copyfile(self.base_config_path, self.train_config_path) + print(i18n("初始化成功")) + else: + print(i18n("就绪")) + self.main_ui() + + def main_ui(self): + with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.green)) as ui: + + gr.Markdown('# so-vits-svc5.0 WebUI') + + with gr.Tab(i18n("预处理-训练")): + + with gr.Accordion(i18n('训练说明'), open=False): + + gr.Markdown(self.info.train) + + gr.Markdown(i18n('### 预处理参数设置')) + + with gr.Row(): + + self.model_name = gr.Textbox(value='sovits5.0', label='model', info=i18n('模型名称'), interactive=True) #建议设置为不可修改 + + self.f0_extractor = gr.Textbox(value='crepe', label='f0_extractor', info=i18n('f0提取器'), interactive=False) + + self.thread_count 
= gr.Slider(minimum=1, maximum=os.cpu_count(), step=1, value=2, label='thread_count', info=i18n('预处理线程数'), interactive=True) + + gr.Markdown(i18n('### 训练参数设置')) + + with gr.Row(): + + self.learning_rate = gr.Number(value=5e-5, label='learning_rate', info=i18n('学习率'), interactive=True) + + self.batch_size = gr.Slider(minimum=1, maximum=50, step=1, value=6, label='batch_size', info=i18n('批大小'), interactive=True) + + with gr.Row(): + + self.info_interval = gr.Number(value=50, label='info_interval', info=i18n('训练日志记录间隔(step)'), interactive=True) + + self.eval_interval = gr.Number(value=1, label='eval_interval', info=i18n('验证集验证间隔(epoch)'), interactive=True) + + self.save_interval = gr.Number(value=5, label='save_interval', info=i18n('检查点保存间隔(epoch)'), interactive=True) + + self.keep_ckpts = gr.Number(value=0, label='keep_ckpts', info=i18n('保留最新的检查点文件(0保存全部)'),interactive=True) + + with gr.Row(): + + self.slow_model = gr.Checkbox(label=i18n("是否添加底模"), value=True, interactive=True) + + gr.Markdown(i18n('### 开始训练')) + + with gr.Row(): + + self.bt_open_dataset_folder = gr.Button(value=i18n('打开数据集文件夹')) + + self.bt_onekey_train = gr.Button(i18n('一键训练'), variant="primary") + + self.bt_tb = gr.Button(i18n('启动Tensorboard'), variant="primary") + + gr.Markdown(i18n('### 恢复训练')) + + with gr.Row(): + + self.resume_model = gr.Dropdown(choices=sorted(self.names), label='Resume training progress from checkpoints', info=i18n('从检查点恢复训练进度'), interactive=True) + + with gr.Column(): + + self.bt_refersh = gr.Button(i18n('刷新')) + + self.bt_resume_train = gr.Button(i18n('恢复训练'), variant="primary") + + with gr.Tab(i18n("推理")): + + with gr.Accordion(i18n('推理说明'), open=False): + + gr.Markdown(self.info.inference) + + gr.Markdown(i18n('### 推理参数设置')) + + with gr.Row(): + + with gr.Column(): + + self.keychange = gr.Slider(-24, 24, value=0, step=1, label=i18n('变调')) + + self.file_list = gr.Markdown(value="", label=i18n("文件列表")) + + with gr.Row(): + + self.resume_model2 = gr.Dropdown(choices=sorted(self.names2), label='Select the model you want to export', + info=i18n('选择要导出的模型'), interactive=True) + with gr.Column(): + + self.bt_refersh2 = gr.Button(value=i18n('刷新模型和音色')) + + + self.bt_out_model = gr.Button(value=i18n('导出模型'), variant="primary") + + with gr.Row(): + + self.resume_voice = gr.Dropdown(choices=sorted(self.voice_names), label='Select the sound file', + info=i18n('选择音色文件'), interactive=True) + + with gr.Row(): + + self.input_wav = gr.Audio(type='filepath', label=i18n('选择待转换音频'), source='upload') + + with gr.Row(): + + self.bt_infer = gr.Button(value=i18n('开始转换'), variant="primary") + + with gr.Row(): + + self.output_wav = gr.Audio(label=i18n('输出音频'), interactive=False) + + self.bt_open_dataset_folder.click(fn=self.openfolder) + self.bt_onekey_train.click(fn=self.onekey_training,inputs=[self.model_name, self.thread_count,self.learning_rate,self.batch_size, self.info_interval, self.eval_interval,self.save_interval, self.keep_ckpts, self.slow_model]) + self.bt_out_model.click(fn=self.out_model, inputs=[self.model_name, self.resume_model2]) + self.bt_tb.click(fn=self.tensorboard) + self.bt_refersh.click(fn=self.refresh_model, inputs=[self.model_name], outputs=[self.resume_model]) + self.bt_resume_train.click(fn=self.resume_train, inputs=[self.model_name, self.resume_model, self.learning_rate,self.batch_size, self.info_interval, self.eval_interval,self.save_interval, self.keep_ckpts, self.slow_model]) + self.bt_infer.click(fn=self.inference, inputs=[self.input_wav, self.resume_voice, self.keychange], 
outputs=[self.output_wav]) + self.bt_refersh2.click(fn=self.refresh_model_and_voice, inputs=[self.model_name],outputs=[self.resume_model2, self.resume_voice]) + + ui.launch(inbrowser=True, server_port=2333, share=True) + + def openfolder(self): + + try: + if sys.platform.startswith('win'): + os.startfile('dataset_raw') + elif sys.platform.startswith('linux'): + subprocess.call(['xdg-open', 'dataset_raw']) + elif sys.platform.startswith('darwin'): + subprocess.call(['open', 'dataset_raw']) + else: + print(i18n('打开文件夹失败!')) + except BaseException: + print(i18n('打开文件夹失败!')) + + def preprocessing(self, thread_count): + print(i18n('开始预处理')) + train_process = subprocess.Popen('python -u svc_preprocessing.py -t ' + str(thread_count), stdout=subprocess.PIPE) + while train_process.poll() is None: + output = train_process.stdout.readline().decode('utf-8') + print(output, end='') + + def create_config(self, model_name, learning_rate, batch_size, info_interval, eval_interval, save_interval, + keep_ckpts, slow_model): + yaml = YAML() + yaml.preserve_quotes = True + yaml.width = 1024 + with open("configs/train.yaml", "r") as f: + config = yaml.load(f) + config['train']['model'] = model_name + config['train']['learning_rate'] = learning_rate + config['train']['batch_size'] = batch_size + config["log"]["info_interval"] = int(info_interval) + config["log"]["eval_interval"] = int(eval_interval) + config["log"]["save_interval"] = int(save_interval) + config["log"]["keep_ckpts"] = int(keep_ckpts) + if slow_model: + config["train"]["pretrain"] = "vits_pretrain\sovits5.0.pretrain.pth" + else: + config["train"]["pretrain"] = "" + with open("configs/train.yaml", "w") as f: + yaml.dump(config, f) + return f"{config['log']}" + + def training(self, model_name): + print(i18n('开始训练')) + train_process = subprocess.Popen('python -u svc_trainer.py -c ' + self.train_config_path + ' -n ' + str(model_name), stdout=subprocess.PIPE, creationflags=subprocess.CREATE_NEW_CONSOLE) + while train_process.poll() is None: + output = train_process.stdout.readline().decode('utf-8') + print(output, end='') + + def onekey_training(self, model_name, thread_count, learning_rate, batch_size, info_interval, eval_interval, save_interval, keep_ckpts, slow_model): + print(self, model_name, thread_count, learning_rate, batch_size, info_interval, eval_interval, + save_interval, keep_ckpts) + self.create_config(model_name, learning_rate, batch_size, info_interval, eval_interval, save_interval, keep_ckpts, slow_model) + self.preprocessing(thread_count) + self.training(model_name) + + def out_model(self, model_name, resume_model2): + print(i18n('开始导出模型')) + try: + subprocess.Popen('python -u svc_export.py -c {} -p "chkpt/{}/{}"'.format(self.train_config_path, model_name, resume_model2),stdout=subprocess.PIPE) + print(i18n('导出模型成功')) + except Exception as e: + print(i18n("出现错误:"), e) + + + def tensorboard(self): + if sys.platform.startswith('win'): + tb_process = subprocess.Popen('tensorboard --logdir=logs --port=6006', stdout=subprocess.PIPE) + webbrowser.open("http://localhost:6006") + else: + p1 = subprocess.Popen(["ps", "-ef"], stdout=subprocess.PIPE) #ps -ef | grep tensorboard | awk '{print $2}' | xargs kill -9 + p2 = subprocess.Popen(["grep", "tensorboard"], stdin=p1.stdout, stdout=subprocess.PIPE) + p3 = subprocess.Popen(["awk", "{print $2}"], stdin=p2.stdout, stdout=subprocess.PIPE) + p4 = subprocess.Popen(["xargs", "kill", "-9"], stdin=p3.stdout) + p1.stdout.close() + p2.stdout.close() + p3.stdout.close() + p4.communicate() + tb_process = 
subprocess.Popen('tensorboard --logdir=logs --port=6007', stdout=subprocess.PIPE) # AutoDL端口设置为6007 + while tb_process.poll() is None: + output = tb_process.stdout.readline().decode('utf-8') + print(output) + + def refresh_model(self, model_name): + self.script_dir = os.path.dirname(os.path.abspath(__file__)) + self.model_root = os.path.join(self.script_dir, f"chkpt/{model_name}") + self.names = [] + try: + for self.name in os.listdir(self.model_root): + if self.name.endswith(".pt"): + self.names.append(self.name) + return {"choices": sorted(self.names), "__type__": "update"} + except FileNotFoundError: + return {"label": i18n("缺少模型文件"), "__type__": "update"} + + def refresh_model2(self, model_name): + self.script_dir = os.path.dirname(os.path.abspath(__file__)) + self.model_root = os.path.join(self.script_dir, f"chkpt/{model_name}") + self.names2 = [] + try: + for self.name in os.listdir(self.model_root): + if self.name.endswith(".pt"): + self.names2.append(self.name) + return {"choices": sorted(self.names2), "__type__": "update"} + except FileNotFoundError: + return {"label": i18n("缺少模型文件"), "__type__": "update"} + + def refresh_voice(self): + self.script_dir = os.path.dirname(os.path.abspath(__file__)) + self.model_root = os.path.join(self.script_dir, "data_svc/singer") + self.voice_names = [] + try: + for self.name in os.listdir(self.model_root): + if self.name.endswith(".npy"): + self.voice_names.append(self.name) + return {"choices": sorted(self.voice_names), "__type__": "update"} + except FileNotFoundError: + return {"label": i18n("缺少文件"), "__type__": "update"} + + def refresh_model_and_voice(self, model_name): + model_update = self.refresh_model2(model_name) + voice_update = self.refresh_voice() + return model_update, voice_update + + def resume_train(self, model_name, resume_model ,learning_rate, batch_size, info_interval, eval_interval, save_interval, keep_ckpts, slow_model): + print(i18n('开始恢复训练')) + self.create_config(model_name, learning_rate, batch_size, info_interval, eval_interval, save_interval,keep_ckpts, slow_model) + train_process = subprocess.Popen('python -u svc_trainer.py -c {} -n {} -p "chkpt/{}/{}"'.format(self.train_config_path, model_name, model_name, resume_model), stdout=subprocess.PIPE, creationflags=subprocess.CREATE_NEW_CONSOLE) + while train_process.poll() is None: + output = train_process.stdout.readline().decode('utf-8') + print(output, end='') + + def inference(self, input, resume_voice, keychange): + if os.path.exists("test.wav"): + os.remove("test.wav") + print(i18n("已清理残留文件")) + else: + print(i18n("无需清理残留文件")) + self.train_config_path = 'configs/train.yaml' + print(i18n('开始推理')) + shutil.copy(input, ".") + input_name = os.path.basename(input) + os.rename(input_name, "test.wav") + input_name = "test.wav" + if not input_name.endswith(".wav"): + data, samplerate = soundfile.read(input_name) + input_name = input_name.rsplit(".", 1)[0] + ".wav" + soundfile.write(input_name, data, samplerate) + train_config_path = shlex.quote(self.train_config_path) + keychange = shlex.quote(str(keychange)) + cmd = ["python", "-u", "svc_inference.py", "--config", train_config_path, "--model", "sovits5.0.pth", "--spk", + f"data_svc/singer/{resume_voice}", "--wave", "test.wav", "--shift", keychange] + train_process = subprocess.run(cmd, shell=False, capture_output=True, text=True) + print(train_process.stdout) + print(train_process.stderr) + print(i18n("推理成功")) + return "svc_out.wav" + +class Info: + def __init__(self) -> None: + self.train = i18n('### 
2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完') + + self.inference = i18n('### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完') + + +LANGUAGE_LIST = ['zh_CN', 'en_US'] +LANGUAGE_ALL = { + 'zh_CN': { + 'SUPER': 'END', + 'LANGUAGE': 'zh_CN', + '初始化成功': '初始化成功', + '就绪': '就绪', + '预处理-训练': '预处理-训练', + '训练说明': '训练说明', + '### 预处理参数设置': '### 预处理参数设置', + '模型名称': '模型名称', + 'f0提取器': 'f0提取器', + '预处理线程数': '预处理线程数', + '### 训练参数设置': '### 训练参数设置', + '学习率': '学习率', + '批大小': '批大小', + '训练日志记录间隔(step)': '训练日志记录间隔(step)', + '验证集验证间隔(epoch)': '验证集验证间隔(epoch)', + '检查点保存间隔(epoch)': '检查点保存间隔(epoch)', + '保留最新的检查点文件(0保存全部)': '保留最新的检查点文件(0保存全部)', + '是否添加底模': '是否添加底模', + '### 开始训练': '### 开始训练', + '打开数据集文件夹': '打开数据集文件夹', + '一键训练': '一键训练', + '启动Tensorboard': '启动Tensorboard', + '### 恢复训练': '### 恢复训练', + '从检查点恢复训练进度': '从检查点恢复训练进度', + '刷新': '刷新', + '恢复训练': '恢复训练', + '推理': '推理', + '推理说明': '推理说明', + '### 推理参数设置': '### 推理参数设置', + '变调': '变调', + '文件列表': '文件列表', + '选择要导出的模型': '选择要导出的模型', + '刷新模型和音色': '刷新模型和音色', + '导出模型': '导出模型', + '选择音色文件': '选择音色文件', + '选择待转换音频': '选择待转换音频', + '开始转换': '开始转换', + '输出音频': '输出音频', + '打开文件夹失败!': '打开文件夹失败!', + '开始预处理': '开始预处理', + '开始训练': '开始训练', + '开始导出模型': '开始导出模型', + '导出模型成功': '导出模型成功', + '出现错误:': '出现错误:', + '缺少模型文件': '缺少模型文件', + '缺少文件': '缺少文件', + '已清理残留文件': '已清理残留文件', + '无需清理残留文件': '无需清理残留文件', + '开始推理': '开始推理', + '推理成功': '推理成功', + '### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完': '### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完' + }, + 'en_US': { + 'SUPER': 'zh_CN', + 'LANGUAGE': 'en_US', + '初始化成功': 'Initialization successful', + '就绪': 'Ready', + '预处理-训练': 'Preprocessing-Training', + '训练说明': 'Training instructions', + '### 预处理参数设置': '### Preprocessing parameter settings', + '模型名称': 'Model name', + 'f0提取器': 'f0 extractor', + '预处理线程数': 'Preprocessing thread number', + '### 训练参数设置': '### Training parameter settings', + '学习率': 'Learning rate', + '批大小': 'Batch size', + '训练日志记录间隔(step)': 'Training log recording interval (step)', + '验证集验证间隔(epoch)': 'Validation set validation interval (epoch)', + '检查点保存间隔(epoch)': 'Checkpoint save interval (epoch)', + '保留最新的检查点文件(0保存全部)': 'Keep the latest checkpoint file (0 save all)', + '是否添加底模': 'Whether to add the base model', + '### 开始训练': '### Start training', + '打开数据集文件夹': 'Open the dataset folder', + '一键训练': 'One-click training', + '启动Tensorboard': 'Start Tensorboard', + '### 恢复训练': '### Resume training', + '从检查点恢复训练进度': 'Restore training progress from checkpoint', + '刷新': 'Refresh', + '恢复训练': 'Resume training', + "推理": "Inference", + "推理说明": "Inference instructions", + "### 推理参数设置": "### Inference parameter settings", + "变调": "Pitch shift", + "文件列表": "File list", + "选择要导出的模型": "Select the model to export", + "刷新模型和音色": "Refresh model and timbre", + "导出模型": "Export model", + "选择音色文件": "Select timbre file", + "选择待转换音频": "Select audio to be converted", + "开始转换": "Start conversion", + "输出音频": "Output audio", + "打开文件夹失败!": "Failed to open folder!", + "开始预处理": "Start preprocessing", + "开始训练": "Start training", + "开始导出模型": "Start exporting model", + "导出模型成功": "Model exported successfully", + "出现错误:": "An error occurred:", + "缺少模型文件": "Missing model file", + '缺少文件': 'Missing file', + "已清理残留文件": "Residual files cleaned up", + "无需清理残留文件": "No need to clean up residual files", + "开始推理": "Start inference", + '### 
2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完': '### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)first writing|[@thestmitsuk](https://github.com/thestmitsuki)second completion' + } +} + +class I18nAuto: + def __init__(self, language=None): + self.language_list = LANGUAGE_LIST + self.language_all = LANGUAGE_ALL + self.language_map = {} + self.language = language or locale.getdefaultlocale()[0] + if self.language not in self.language_list: + self.language = 'zh_CN' + self.read_language(self.language_all['zh_CN']) + while self.language_all[self.language]['SUPER'] != 'END': + self.read_language(self.language_all[self.language]) + self.language = self.language_all[self.language]['SUPER'] + + def read_language(self, lang_dict: dict): + self.language_map.update(lang_dict) + + def __call__(self, key): + return self.language_map[key] + +if __name__ == "__main__": + i18n = I18nAuto() + webui = WebUI() diff --git a/colab.ipynb b/colab.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..19943fa73562ea14b8775c9f56a92f0be663ab10 --- /dev/null +++ b/colab.ipynb @@ -0,0 +1,374 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "SggegFslkbbK" + }, + "source": [ + "https://github.com/PlayVoice/so-vits-svc-5.0/\n", + "\n", + "↑原仓库\n", + "\n", + "*《colab保持连接的方法》*https://zhuanlan.zhihu.com/p/144629818\n", + "\n", + "预览版本,可使用预设模型进行推理" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M1MdDryJP73G" + }, + "source": [ + "# **环境配置&必要文件下载**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xfJWCr_EkO2i" + }, + "outputs": [], + "source": [ + "#@title 看看抽了个啥卡~~基本都是T4~~\n", + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nMspj8t3knR6" + }, + "outputs": [], + "source": [ + "#@title 克隆github仓库\n", + "!git clone https://github.com/PlayVoice/so-vits-svc-5.0/ -b bigvgan-mix-v2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Kj2j81K6kubj" + }, + "outputs": [], + "source": [ + "#@title 安装依赖&下载必要文件\n", + "%cd /content/so-vits-svc-5.0\n", + "\n", + "!pip install -r requirements.txt\n", + "!pip install --upgrade pip setuptools numpy numba\n", + "\n", + "!wget -P hubert_pretrain/ https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt\n", + "!wget -P whisper_pretrain/ https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt\n", + "!wget -P speaker_pretrain/ https://github.com/PlayVoice/so-vits-svc-5.0/releases/download/dependency/best_model.pth.tar\n", + "!wget -P crepe/assets https://github.com/PlayVoice/so-vits-svc-5.0/releases/download/dependency/full.pth\n", + "!wget -P vits_pretrain https://github.com/PlayVoice/so-vits-svc-5.0/releases/download/5.0/sovits5.0.pretrain.pth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "v9zHS9VXly9b" + }, + "outputs": [], + "source": [ + "#@title 加载Google云端硬盘\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hZ5KH8NgQ7os" + }, + "source": [ + "# 包含多说话人的推理预览" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2o6m3D0IsphU" + }, + "outputs": [], + "source": [ + "#@title 提取内容编码\n", + "\n", + "#@markdown **将处理好的\" .wav \"输入源文件上传到云盘根目录,并修改以下选项**\n", + "\n", + "#@markdown 
**\" .wav \"文件【文件名】**\n", + "input = \"\\u30AE\\u30BF\\u30FC\\u3068\\u5B64\\u72EC\\u3068\\u84BC\\u3044\\u60D1\\u661F\" #@param {type:\"string\"}\n", + "input_path = \"/content/drive/MyDrive/\"\n", + "input_name = input_path + input\n", + "!PYTHONPATH=. python whisper/inference.py -w {input_name}.wav -p test.ppg.npy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "A7nvX5mRlwJ7" + }, + "outputs": [], + "source": [ + "#@title 推理\n", + "\n", + "#@markdown **将处理好的\" .wav \"输入源文件上传到云盘根目录,并修改以下选项**\n", + "\n", + "#@markdown **\" .wav \"文件【文件名】**\n", + "input = \"\\u30AE\\u30BF\\u30FC\\u3068\\u5B64\\u72EC\\u3068\\u84BC\\u3044\\u60D1\\u661F\" #@param {type:\"string\"}\n", + "input_path = \"/content/drive/MyDrive/\"\n", + "input_name = input_path + input\n", + "#@markdown **指定说话人(0001~0056)(推荐0022、0030、0047、0051)**\n", + "speaker = \"0002\" #@param {type:\"string\"}\n", + "!PYTHONPATH=. python svc_inference.py --config configs/base.yaml --model vits_pretrain/sovits5.0.pretrain.pth --spk ./configs/singers/singer{speaker}.npy --wave {input_name}.wav --ppg test.ppg.npy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F8oerogXyd3u" + }, + "source": [ + "推理结果保存在根目录,文件名为svc_out.wav" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qKX17GElPuso" + }, + "source": [ + "# 训练" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sVe0lEGWQBLU" + }, + "source": [ + "将音频剪裁为小于30秒的音频段,响度匹配并修改为单声道,预处理时会进行重采样所以对采样率无要求。(但是降低采样率的操作会降低你的数据质量)\n", + "\n", + "**使用Adobe Audition™的响度匹配功能可以一次性完成重采样修改声道和响度匹配。**\n", + "\n", + "之后将音频文件保存为以下文件结构:\n", + "```\n", + "dataset_raw\n", + "├───speaker0\n", + "│ ├───xxx1-xxx1.wav\n", + "│ ├───...\n", + "│ └───Lxx-0xx8.wav\n", + "└───speaker1\n", + " ├───xx2-0xxx2.wav\n", + " ├───...\n", + " └───xxx7-xxx007.wav\n", + "```\n", + "\n", + "打包为zip格式,命名为data.zip,上传到网盘根目录。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vC8IthV8VYgy" + }, + "outputs": [], + "source": [ + "#@title 从云盘获取数据集\n", + "!unzip -d /content/so-vits-svc-5.0/ /content/drive/MyDrive/data.zip #自行修改路径与文件名" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J101PiFUSL1N" + }, + "outputs": [], + "source": [ + "#@title 重采样\n", + "# 生成采样率16000Hz音频, 存储路径为:./data_svc/waves-16k\n", + "!python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000\n", + "# 生成采样率32000Hz音频, 存储路径为:./data_svc/waves-32k\n", + "!python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZpxeYJCBSbgf" + }, + "outputs": [], + "source": [ + "#@title 提取f0\n", + "!python prepare/preprocess_f0.py -w data_svc/waves-16k/ -p data_svc/pitch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7VasDGhDSlP5" + }, + "outputs": [], + "source": [ + "#@title 使用16k音频,提取内容编码\n", + "!PYTHONPATH=. python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title 使用16k音频,提取内容编码\n", + "!PYTHONPATH=. python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ovRqQUINSoII" + }, + "outputs": [], + "source": [ + "#@title 提取音色特征\n", + "!PYTHONPATH=. 
python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "s8Ba8Fd1bzzX" + }, + "outputs": [], + "source": [ + "#(解决“.ipynb_checkpoints”相关的错)\n", + "!rm -rf \"find -type d -name .ipynb_checkpoints\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ic9q599_b0Ae" + }, + "outputs": [], + "source": [ + "#(解决“.ipynb_checkpoints”相关的错)\n", + "!rm -rf .ipynb_checkpoints\n", + "!find . -name \".ipynb_checkpoints\" -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QamG3_B6o3vF" + }, + "outputs": [], + "source": [ + "#@title 提取平均音色\n", + "!PYTHONPATH=. python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3wBmyQHvSs6K" + }, + "outputs": [], + "source": [ + "#@title 提取spec\n", + "!PYTHONPATH=. python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tUcljCLbS5O3" + }, + "outputs": [], + "source": [ + "#@title 生成索引\n", + "!python prepare/preprocess_train.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "30fXnscFS7Wo" + }, + "outputs": [], + "source": [ + "#@title 训练文件调试\n", + "!PYTHONPATH=. python prepare/preprocess_zzz.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hacR8qDFVOWo" + }, + "outputs": [], + "source": [ + "#@title 设定模型备份\n", + "#@markdown **是否备份模型到云盘,colab随时爆炸建议备份,默认保存到云盘根目录Sovits5.0文件夹**\n", + "Save_to_drive = True #@param {type:\"boolean\"}\n", + "if Save_to_drive:\n", + " !mkdir -p /content/so-vits-svc-5.0/chkpt/\n", + " !rm -rf /content/so-vits-svc-5.0/chkpt/\n", + " !mkdir -p /content/drive/MyDrive/Sovits5.0\n", + " !ln -s /content/drive/MyDrive/Sovits5.0 /content/so-vits-svc-5.0/chkpt/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5BIiKIAoU3Kd" + }, + "outputs": [], + "source": [ + "#@title 开始训练\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir /content/so-vits-svc-5.0/logs/\n", + "\n", + "!PYTHONPATH=. python svc_trainer.py -c configs/base.yaml -n sovits5.0" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/configs/base.yaml b/configs/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dbf59f59eb764196566ed2479c8472ce3a1cdeb1 --- /dev/null +++ b/configs/base.yaml @@ -0,0 +1,72 @@ +train: + model: "sovits" + seed: 1234 + epochs: 10000 + learning_rate: 5e-5 + betas: [0.8, 0.99] + lr_decay: 0.999875 + eps: 1e-9 + batch_size: 8 + accum_step: 2 + c_stft: 9 + c_mel: 1. 
+ c_kl: 0.2 + port: 8001 + pretrain: "./vits_pretrain/sovits5.0.pretrain.pth" +############################# +data: + training_files: "files/train.txt" + validation_files: "files/valid.txt" + segment_size: 8000 # WARNING: base on hop_length + max_wav_value: 32768.0 + sampling_rate: 32000 + filter_length: 1024 + hop_length: 320 + win_length: 1024 + mel_channels: 100 + mel_fmin: 50.0 + mel_fmax: 16000.0 +############################# +vits: + ppg_dim: 1280 + vec_dim: 256 + spk_dim: 256 + gin_channels: 256 + inter_channels: 192 + hidden_channels: 192 + filter_channels: 640 +############################# +gen: + upsample_input: 192 + upsample_rates: [5,4,4,2,2] + upsample_kernel_sizes: [15,8,8,4,4] + upsample_initial_channel: 320 + resblock_kernel_sizes: [3,7,11] + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] +############################# +mpd: + periods: [2,3,5,7,11] + kernel_size: 5 + stride: 3 + use_spectral_norm: False + lReLU_slope: 0.2 +############################# +mrd: + resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]" # (filter_length, hop_length, win_length) + use_spectral_norm: False + lReLU_slope: 0.2 +############################# +log: + info_interval: 100 + eval_interval: 1 + save_interval: 5 + num_audio: 6 + pth_dir: 'chkpt' + log_dir: 'logs' + keep_ckpts: 0 +############################# +dist_config: + dist_backend: "nccl" + dist_url: "tcp://localhost:54321" + world_size: 1 + diff --git a/configs/singers/singer0001.npy b/configs/singers/singer0001.npy new file mode 100644 index 0000000000000000000000000000000000000000..9352a330a6ac78e14129c5062d2235c05b15668c --- /dev/null +++ b/configs/singers/singer0001.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2879921d43bdbf11fc5d6ac91f434f905a2c5e59d75368bfbf3c6bdbddcb3cf +size 1152 diff --git a/configs/singers/singer0002.npy b/configs/singers/singer0002.npy new file mode 100644 index 0000000000000000000000000000000000000000..b8ccb3f218758254f2971a3dbeaa5340e7377c7f --- /dev/null +++ b/configs/singers/singer0002.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe5c7925c2fdb514e2c5b450de1d2737ec7f86f1c65eeb488c1888c0b9a7069 +size 1152 diff --git a/configs/singers/singer0003.npy b/configs/singers/singer0003.npy new file mode 100644 index 0000000000000000000000000000000000000000..3a92f50cbd7336703910831f03ac7d1cb029a90e --- /dev/null +++ b/configs/singers/singer0003.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5665126aeb6c6fab89c79b90debf2ce2e64b321076dcb414089eff8848eac8b4 +size 1152 diff --git a/configs/singers/singer0004.npy b/configs/singers/singer0004.npy new file mode 100644 index 0000000000000000000000000000000000000000..6ef48a0a0cb042ff8c419f747261c579aa66520e --- /dev/null +++ b/configs/singers/singer0004.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f0fe5993e9adcaeae25b0fa68265d40c9c1b5539ca12d6e438477de2177819 +size 1152 diff --git a/configs/singers/singer0005.npy b/configs/singers/singer0005.npy new file mode 100644 index 0000000000000000000000000000000000000000..ebe4251d2aef83c2c9db470bec9cdff8cf97e769 --- /dev/null +++ b/configs/singers/singer0005.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1158fb447929cf9400a31675cf9992fd3ed7558e061562189d9e6bf56d83fb2a +size 1152 diff --git a/configs/singers/singer0006.npy b/configs/singers/singer0006.npy new file mode 100644 index 
0000000000000000000000000000000000000000..6336c044c3b98765c5bc1f9121ef36465bcaf79e --- /dev/null +++ b/configs/singers/singer0006.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06c1fd3a9afaa7944e4b81b7ca787e667b0dae8c7e90c6d24177245449f4e940 +size 1152 diff --git a/configs/singers/singer0007.npy b/configs/singers/singer0007.npy new file mode 100644 index 0000000000000000000000000000000000000000..b401dcf6e4f774798010ccbf27cd8622943e7174 --- /dev/null +++ b/configs/singers/singer0007.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36611b9e57545332b9fb97fd35a356fbe8d60258f2f5e2232168481bb6dfab5b +size 1152 diff --git a/configs/singers/singer0008.npy b/configs/singers/singer0008.npy new file mode 100644 index 0000000000000000000000000000000000000000..f28df113963e42afba4d76af39ec150a66de406c --- /dev/null +++ b/configs/singers/singer0008.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8584ad6f3569a1307082cd410085d9a562807e962274b89b72487c7bc79124d4 +size 1152 diff --git a/configs/singers/singer0009.npy b/configs/singers/singer0009.npy new file mode 100644 index 0000000000000000000000000000000000000000..15d125808f4475d4b9e9a2714839b61f36c2ae1c --- /dev/null +++ b/configs/singers/singer0009.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b069db4e3e5ca389ffba974c74eab46caf4c60545773e5f7e5e253310619073e +size 1152 diff --git a/configs/singers/singer0010.npy b/configs/singers/singer0010.npy new file mode 100644 index 0000000000000000000000000000000000000000..bda76fe1913f4310bb2dab2a3eb411454aac8d12 --- /dev/null +++ b/configs/singers/singer0010.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d4d92735e4bac1618e89198d113013db09061b6c1f74ba0c500b70b097cd407 +size 1152 diff --git a/configs/singers/singer0011.npy b/configs/singers/singer0011.npy new file mode 100644 index 0000000000000000000000000000000000000000..0fd56c80b7fc42933a3c1b2a0c37e2e7291e44f8 --- /dev/null +++ b/configs/singers/singer0011.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942388b4276dc06ee365f59c324ce1642e4bf810dcc99992739787e3b9ad135d +size 1152 diff --git a/configs/singers/singer0012.npy b/configs/singers/singer0012.npy new file mode 100644 index 0000000000000000000000000000000000000000..54261d088430996e5a0abf019c47632031bb8886 --- /dev/null +++ b/configs/singers/singer0012.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3411efcf4ee4f534cea2b742c2eca166ae971efbceab21fb41b77b8923a1ba3a +size 1152 diff --git a/configs/singers/singer0013.npy b/configs/singers/singer0013.npy new file mode 100644 index 0000000000000000000000000000000000000000..3eedaf46d7a1fe7865d7c2783375e0f4a010f154 --- /dev/null +++ b/configs/singers/singer0013.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e8e30cd1bce61405db194278dd7bf207d16abf656dd22f9a20f29e3657674f3 +size 1152 diff --git a/configs/singers/singer0014.npy b/configs/singers/singer0014.npy new file mode 100644 index 0000000000000000000000000000000000000000..602e8f6203eb05962b3102f319e0ec99db9c097b --- /dev/null +++ b/configs/singers/singer0014.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9cc8200753b4ba7605c9a13bf454b100025965135c5d816f7440ec53a2e6dd4 +size 1152 diff --git a/configs/singers/singer0015.npy b/configs/singers/singer0015.npy new file mode 100644 index 
0000000000000000000000000000000000000000..dfe824316e9fa153390c87f5d1fd1d4b34caab3b --- /dev/null +++ b/configs/singers/singer0015.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcb58688e51dbdeb22e5dd85d27ff3904c4594c78420b8e9c9ab481adbecc5fe +size 1152 diff --git a/configs/singers/singer0016.npy b/configs/singers/singer0016.npy new file mode 100644 index 0000000000000000000000000000000000000000..5ce37e18e3b70a9c149e29743f227b5fd4cfdffd --- /dev/null +++ b/configs/singers/singer0016.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a3c6162b8c937e9e8bbdc806b873866afce4b110664831642f7b41922bbf39 +size 1152 diff --git a/configs/singers/singer0017.npy b/configs/singers/singer0017.npy new file mode 100644 index 0000000000000000000000000000000000000000..4104cb371c4e57ca071ec299e39e7320cc3e4569 --- /dev/null +++ b/configs/singers/singer0017.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84782c98c930bd980f350837f4b3e8e193c49ef46aef9f92471c6136659975a9 +size 1152 diff --git a/configs/singers/singer0018.npy b/configs/singers/singer0018.npy new file mode 100644 index 0000000000000000000000000000000000000000..fc43cc1750632008cc8057ce24b39e759bb3a047 --- /dev/null +++ b/configs/singers/singer0018.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:731ebafda06aecedfd79941978149a0f87595f04e24eab7ed5300defe9070fc0 +size 1152 diff --git a/configs/singers/singer0019.npy b/configs/singers/singer0019.npy new file mode 100644 index 0000000000000000000000000000000000000000..5e32ca3dca1e975ba5bae37b730fb3a764fc8595 --- /dev/null +++ b/configs/singers/singer0019.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d88e620994e4413c4c58ffb9239ef46ded60ff3eab0715c7af96cbe4092198f +size 1152 diff --git a/configs/singers/singer0020.npy b/configs/singers/singer0020.npy new file mode 100644 index 0000000000000000000000000000000000000000..88a0e64f47ac03688db8b45329f4de9554f86835 --- /dev/null +++ b/configs/singers/singer0020.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e5abaabe5457a20161351dcf5f8737d63a2a92fb1de1842ea9e92e47b9ca6fe +size 1152 diff --git a/configs/singers/singer0021.npy b/configs/singers/singer0021.npy new file mode 100644 index 0000000000000000000000000000000000000000..d80f97eac1be1779490a37a503aaac6ac1f5d130 --- /dev/null +++ b/configs/singers/singer0021.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d7f99c92c89a44c1f2dd0688f033f0593c8c88b0537b092928bfbaa63a8d3e9 +size 1152 diff --git a/configs/singers/singer0022.npy b/configs/singers/singer0022.npy new file mode 100644 index 0000000000000000000000000000000000000000..64dbba2610e35274f13da90702f608f283ddb0f4 --- /dev/null +++ b/configs/singers/singer0022.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33becb1da48b12ba4957a0ef0b25bbd51e100d5762ebc4c7d381f6b957e682a2 +size 1152 diff --git a/configs/singers/singer0023.npy b/configs/singers/singer0023.npy new file mode 100644 index 0000000000000000000000000000000000000000..6cb1c218971618be8de47f9c8daec658d4578531 --- /dev/null +++ b/configs/singers/singer0023.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f49cbaf3f7653f48f80854a513a334f31dca719a09cca66e257995ce4a741a9 +size 1152 diff --git a/configs/singers/singer0024.npy b/configs/singers/singer0024.npy new file mode 100644 index 
0000000000000000000000000000000000000000..0ca92912b8959fc8743b9628f08c7595b6eb94f9 --- /dev/null +++ b/configs/singers/singer0024.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92ed584994d56473c8bab0799d213e927c5a2928facef2b93a2f95f764d868b4 +size 1152 diff --git a/configs/singers/singer0025.npy b/configs/singers/singer0025.npy new file mode 100644 index 0000000000000000000000000000000000000000..05bd93acfad034e7e83e7965c4ce4557a4b7a277 --- /dev/null +++ b/configs/singers/singer0025.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14b7e1f55393d5beaa2f3bbd0ef7f2be7e108993c680acb265ff24df19f7062b +size 1152 diff --git a/configs/singers/singer0026.npy b/configs/singers/singer0026.npy new file mode 100644 index 0000000000000000000000000000000000000000..cddbd2fddc6a51caf1a2f111f3bd6b1d3dbe2c18 --- /dev/null +++ b/configs/singers/singer0026.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92ecc9aa68f136960c00e98aaca16e92c38960bc7eb9687aee90190972974726 +size 1152 diff --git a/configs/singers/singer0027.npy b/configs/singers/singer0027.npy new file mode 100644 index 0000000000000000000000000000000000000000..aedcbf0c48465cb3273ba46d180711901ca911ea --- /dev/null +++ b/configs/singers/singer0027.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5a8a1c2a445179d38664fb55c84ee9a36350beee50efa9f850d29b394447bfa +size 1152 diff --git a/configs/singers/singer0028.npy b/configs/singers/singer0028.npy new file mode 100644 index 0000000000000000000000000000000000000000..788e6fdeb897960c0b1e203f9750cd2ae3969975 --- /dev/null +++ b/configs/singers/singer0028.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b79b8266c8d368dc99f49a347b2631e1e5cfb44056b5a9ab4470b42f9851ee35 +size 1152 diff --git a/configs/singers/singer0029.npy b/configs/singers/singer0029.npy new file mode 100644 index 0000000000000000000000000000000000000000..0340a7adc7bde1ada08e8e1962ce94c6f72a9d2f --- /dev/null +++ b/configs/singers/singer0029.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fa5fd9e8ba14d7f6d67304842f16382f7d2e739969bde9551222ff8c282775 +size 1152 diff --git a/configs/singers/singer0030.npy b/configs/singers/singer0030.npy new file mode 100644 index 0000000000000000000000000000000000000000..3597a4cf4e540f6aaae2f5f0ecd6f21d52a15658 --- /dev/null +++ b/configs/singers/singer0030.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f5070e4196c91fa713aed20aedb2a570a7b2ad8301ee61f59821dafaea3c6a7 +size 1152 diff --git a/configs/singers/singer0031.npy b/configs/singers/singer0031.npy new file mode 100644 index 0000000000000000000000000000000000000000..73be80545df0ca0a007ce914569e088082db62d8 --- /dev/null +++ b/configs/singers/singer0031.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47f4f8c065be1c5448c1b80e5c99087e7357cf1f8a8a55f2d844ccf1ca4931e6 +size 1152 diff --git a/configs/singers/singer0032.npy b/configs/singers/singer0032.npy new file mode 100644 index 0000000000000000000000000000000000000000..09d9d322a3232c0ab60cd05ea0257534b05e3c35 --- /dev/null +++ b/configs/singers/singer0032.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019f40cf49cb7ccb44fb9c6a9f6345e84f837185a1642623144b4e2969c8738b +size 1152 diff --git a/configs/singers/singer0033.npy b/configs/singers/singer0033.npy new file mode 100644 index 
0000000000000000000000000000000000000000..a6efd8966cb9e1f2ed87cbc103cdb70ceb279213 --- /dev/null +++ b/configs/singers/singer0033.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e05e212c93fc9e7b13174dd76721ee891bb4ea8bb1638a4c43523ed65d30f67 +size 1152 diff --git a/configs/singers/singer0034.npy b/configs/singers/singer0034.npy new file mode 100644 index 0000000000000000000000000000000000000000..1c23504832579cc0c9c3a7b505a7d6b8cc1efd81 --- /dev/null +++ b/configs/singers/singer0034.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:715a089dd9b3e5cbf021b0f41055f59208911e49cccf375ecf8b82544f325c3d +size 1152 diff --git a/configs/singers/singer0035.npy b/configs/singers/singer0035.npy new file mode 100644 index 0000000000000000000000000000000000000000..894595cd6678a1befdfb843a02ee09ad7badc03c --- /dev/null +++ b/configs/singers/singer0035.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9af8cd05182ec53ff573bce53dad049759bea1de5656915f414910eaf47f61ed +size 1152 diff --git a/configs/singers/singer0036.npy b/configs/singers/singer0036.npy new file mode 100644 index 0000000000000000000000000000000000000000..de86320c15d7e9e3162edc9eadb783fb306b79c3 --- /dev/null +++ b/configs/singers/singer0036.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cec474244d86acfd24d6abf7e033b24b40b838cba2fcd3b4d0e5611313d67ef +size 1152 diff --git a/configs/singers/singer0037.npy b/configs/singers/singer0037.npy new file mode 100644 index 0000000000000000000000000000000000000000..36488b5b3c83fca4e3617aaefe5138002c150cd9 --- /dev/null +++ b/configs/singers/singer0037.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:316e3435d373e352fe95fcb2ec0ab1c8afdeb270ce9f13c940ba91187eecdcf3 +size 1152 diff --git a/configs/singers/singer0038.npy b/configs/singers/singer0038.npy new file mode 100644 index 0000000000000000000000000000000000000000..9c234763efb8227d82a4770dfc0b5b885d124f13 --- /dev/null +++ b/configs/singers/singer0038.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e6458e251512dab86abce504490de6762f9c2de66ddbc853c24c3d05eb39c96 +size 1152 diff --git a/configs/singers/singer0039.npy b/configs/singers/singer0039.npy new file mode 100644 index 0000000000000000000000000000000000000000..64b2bc8072f901df785c32d47b5926e98179ee50 --- /dev/null +++ b/configs/singers/singer0039.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2e484ae33eef7ac92dd784e9e3b9bca7e6c0838d50b43c674da47620f281f20 +size 1152 diff --git a/configs/singers/singer0040.npy b/configs/singers/singer0040.npy new file mode 100644 index 0000000000000000000000000000000000000000..96dd086113c61fac151de03b22a12daf768d7f41 --- /dev/null +++ b/configs/singers/singer0040.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b3a104163ad4cf87caff70b845b2c3e70190ce430a8f21247d350ef102071dc +size 1152 diff --git a/configs/singers/singer0041.npy b/configs/singers/singer0041.npy new file mode 100644 index 0000000000000000000000000000000000000000..265f3dc086d595a0675e38fd35f5c433cc3dbcef --- /dev/null +++ b/configs/singers/singer0041.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:962ba35045f952562bbf68239c8abfda4e1888118fae7ef19814282abee2d28e +size 1152 diff --git a/configs/singers/singer0042.npy b/configs/singers/singer0042.npy new file mode 100644 index 
0000000000000000000000000000000000000000..7b13c99f9ef87b7e64bcdac1216f772239946b4f --- /dev/null +++ b/configs/singers/singer0042.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cf0871ba7e939c90f2f027862f80e11151d8b1a21b6624ee05f184d024b35a3 +size 1152 diff --git a/configs/singers/singer0043.npy b/configs/singers/singer0043.npy new file mode 100644 index 0000000000000000000000000000000000000000..11b3e4a998bb8eb219aa7b2bbfbd36234293e6e4 --- /dev/null +++ b/configs/singers/singer0043.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9494ed20a9b095d19cce17619a6372ba3371f980c643078ffda8649a30ac2f8b +size 1152 diff --git a/configs/singers/singer0044.npy b/configs/singers/singer0044.npy new file mode 100644 index 0000000000000000000000000000000000000000..a12211417bf2d237b0c164c2d275ed95bd3ff175 --- /dev/null +++ b/configs/singers/singer0044.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c12949caa6176fbe5f323cf643d29eef14af9a3ee03be27c938d8bb6fc2922f1 +size 1152 diff --git a/configs/singers/singer0045.npy b/configs/singers/singer0045.npy new file mode 100644 index 0000000000000000000000000000000000000000..04962d6066594759e2fd3ec9d69687f6179d8e74 --- /dev/null +++ b/configs/singers/singer0045.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:222adf210792d1b2745ef98b717f57e0d309d8176e9b59ff56063c1e2001728d +size 1152 diff --git a/configs/singers/singer0046.npy b/configs/singers/singer0046.npy new file mode 100644 index 0000000000000000000000000000000000000000..74976cf00a14a550f4189100409ba940e5f81c7c --- /dev/null +++ b/configs/singers/singer0046.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6070f66f028928114493a363e4117636afefb1a094c54ffc01f89ef261ad1882 +size 1152 diff --git a/configs/singers/singer0047.npy b/configs/singers/singer0047.npy new file mode 100644 index 0000000000000000000000000000000000000000..50304b9bb81a71beb480c2ce786a2c8ff0aa8db5 --- /dev/null +++ b/configs/singers/singer0047.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c8bb2fb993a55d13996df74463408c8e7e8d5e24b391887813e2ac1c204c9c4 +size 1152 diff --git a/configs/singers/singer0048.npy b/configs/singers/singer0048.npy new file mode 100644 index 0000000000000000000000000000000000000000..71f0fbde409976dbc079d27b957d269d3dd59129 --- /dev/null +++ b/configs/singers/singer0048.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b33ee26125ae840494dc2cb3839f7a2f6b48571c15ebc7f0aa9f2b0fef5022e +size 1152 diff --git a/configs/singers/singer0049.npy b/configs/singers/singer0049.npy new file mode 100644 index 0000000000000000000000000000000000000000..00eb5f5b705f1236aebc44ccc40d87fe071e12b0 --- /dev/null +++ b/configs/singers/singer0049.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9a8d97dd4d320e4049c39e112587416d06aa70ec52c05417519bac70fe76556 +size 1152 diff --git a/configs/singers/singer0050.npy b/configs/singers/singer0050.npy new file mode 100644 index 0000000000000000000000000000000000000000..23f9815b6ed783ba964a8bed34e1a7353fd3873c --- /dev/null +++ b/configs/singers/singer0050.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc8cf73923c6a567a134bffa037b3c9d1dcfde75d5a976238df222d91517d9f +size 1152 diff --git a/configs/singers/singer0051.npy b/configs/singers/singer0051.npy new file mode 100644 index 
0000000000000000000000000000000000000000..1ffac1a5057ba9bd5b8dc77ab4d8ddb1f02c8333 --- /dev/null +++ b/configs/singers/singer0051.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a142fdcdb2fa1e69d09df6de3f96bf863038da4e27f51320adb5483cb4f5d306 +size 1152 diff --git a/configs/singers/singer0052.npy b/configs/singers/singer0052.npy new file mode 100644 index 0000000000000000000000000000000000000000..ce1c360dd3c195b30157c33bb49138d55b72c586 --- /dev/null +++ b/configs/singers/singer0052.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2fb5dcbe59a636f84c9522a3278b601358584fdc340dff8db06eaa544fddd4b +size 1152 diff --git a/configs/singers/singer0053.npy b/configs/singers/singer0053.npy new file mode 100644 index 0000000000000000000000000000000000000000..e2328ccf8b490bf37b622bf663292cd0c32f92b8 --- /dev/null +++ b/configs/singers/singer0053.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:893f50a74fdf3e7f27debc52fd739cd96a147ae2dcbeb34e2fb8fd328fa698a5 +size 1152 diff --git a/configs/singers/singer0054.npy b/configs/singers/singer0054.npy new file mode 100644 index 0000000000000000000000000000000000000000..aa47e0a5d03bc259e93ce2fe0cb8cef9ddc01a2b --- /dev/null +++ b/configs/singers/singer0054.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151b0206dd960b1304f8d752e0502e9c7b0260326f7b932b278773aa0c5bb3ef +size 1152 diff --git a/configs/singers/singer0055.npy b/configs/singers/singer0055.npy new file mode 100644 index 0000000000000000000000000000000000000000..944ff6e36130ab3aaa2aea7f89edc4a50fff960d --- /dev/null +++ b/configs/singers/singer0055.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa92ab16df7f82f5bde66501629310a5a250ff61d5f467a7ff2b0c97fe6d8066 +size 1152 diff --git a/configs/singers/singer0056.npy b/configs/singers/singer0056.npy new file mode 100644 index 0000000000000000000000000000000000000000..79f339eb5ef8319b5410749e19a5c647d7c2e81f --- /dev/null +++ b/configs/singers/singer0056.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07022464dcb678dddd3957ea431e53b4c79fd1904927d2622faef5c521da1b5e +size 1152 diff --git a/configs/singers_sample/22-wave-girl/031.wav b/configs/singers_sample/22-wave-girl/031.wav new file mode 100644 index 0000000000000000000000000000000000000000..6dadf271930958b6a55bb1bb15cab164e381c0ab Binary files /dev/null and b/configs/singers_sample/22-wave-girl/031.wav differ diff --git a/configs/singers_sample/22-wave-girl/032.wav b/configs/singers_sample/22-wave-girl/032.wav new file mode 100644 index 0000000000000000000000000000000000000000..ee782d84355f46d3fb685a3d8342315887d7dd08 Binary files /dev/null and b/configs/singers_sample/22-wave-girl/032.wav differ diff --git a/configs/singers_sample/22-wave-girl/033.wav b/configs/singers_sample/22-wave-girl/033.wav new file mode 100644 index 0000000000000000000000000000000000000000..2525fa03d17d5281591e0fd4e0f35e0f8832efb8 Binary files /dev/null and b/configs/singers_sample/22-wave-girl/033.wav differ diff --git a/configs/singers_sample/22-wave-girl/034.wav b/configs/singers_sample/22-wave-girl/034.wav new file mode 100644 index 0000000000000000000000000000000000000000..12f8abacf3f69b33c4771bda9d4bd36d72c14a1d Binary files /dev/null and b/configs/singers_sample/22-wave-girl/034.wav differ diff --git a/configs/singers_sample/22-wave-girl/035.wav b/configs/singers_sample/22-wave-girl/035.wav new file mode 100644 index 
0000000000000000000000000000000000000000..67061ed6d9f01a77be336295427152235727f2f3 Binary files /dev/null and b/configs/singers_sample/22-wave-girl/035.wav differ diff --git a/configs/singers_sample/30-wave-boy/010.wav b/configs/singers_sample/30-wave-boy/010.wav new file mode 100644 index 0000000000000000000000000000000000000000..f155cd942d81afb15b154726dafabc8d52c8e039 Binary files /dev/null and b/configs/singers_sample/30-wave-boy/010.wav differ diff --git a/configs/singers_sample/30-wave-boy/011.wav b/configs/singers_sample/30-wave-boy/011.wav new file mode 100644 index 0000000000000000000000000000000000000000..4c97631687ff52e88559a46da777230332b761d3 Binary files /dev/null and b/configs/singers_sample/30-wave-boy/011.wav differ diff --git a/configs/singers_sample/30-wave-boy/012.wav b/configs/singers_sample/30-wave-boy/012.wav new file mode 100644 index 0000000000000000000000000000000000000000..ef425b68fda074ab2a269f2c52defe166169b7d8 Binary files /dev/null and b/configs/singers_sample/30-wave-boy/012.wav differ diff --git a/configs/singers_sample/30-wave-boy/013.wav b/configs/singers_sample/30-wave-boy/013.wav new file mode 100644 index 0000000000000000000000000000000000000000..a88cf259b8be0cba850407986e32542256a2a9ee Binary files /dev/null and b/configs/singers_sample/30-wave-boy/013.wav differ diff --git a/configs/singers_sample/30-wave-boy/014.wav b/configs/singers_sample/30-wave-boy/014.wav new file mode 100644 index 0000000000000000000000000000000000000000..7cf8a332d01eec2c56cd34366a428bee36dc004a Binary files /dev/null and b/configs/singers_sample/30-wave-boy/014.wav differ diff --git a/configs/singers_sample/30-wave-boy/015.wav b/configs/singers_sample/30-wave-boy/015.wav new file mode 100644 index 0000000000000000000000000000000000000000..d1bc022dca24ed3834b766a5f8307bd4739253a9 Binary files /dev/null and b/configs/singers_sample/30-wave-boy/015.wav differ diff --git a/configs/singers_sample/47-wave-girl/020.wav b/configs/singers_sample/47-wave-girl/020.wav new file mode 100644 index 0000000000000000000000000000000000000000..4176e18fb71d802f054765a58c1ca9a2935c9e22 Binary files /dev/null and b/configs/singers_sample/47-wave-girl/020.wav differ diff --git a/configs/singers_sample/47-wave-girl/021.wav b/configs/singers_sample/47-wave-girl/021.wav new file mode 100644 index 0000000000000000000000000000000000000000..9891ceecbf5563590ab88fec7c0bed19808b6b1a Binary files /dev/null and b/configs/singers_sample/47-wave-girl/021.wav differ diff --git a/configs/singers_sample/47-wave-girl/022.wav b/configs/singers_sample/47-wave-girl/022.wav new file mode 100644 index 0000000000000000000000000000000000000000..a98fc13df8bdde03a1c11df7918c2011b4b48342 Binary files /dev/null and b/configs/singers_sample/47-wave-girl/022.wav differ diff --git a/configs/singers_sample/47-wave-girl/023.wav b/configs/singers_sample/47-wave-girl/023.wav new file mode 100644 index 0000000000000000000000000000000000000000..614edb86cec4a7e61fed0e3e5d590b4ce375e914 Binary files /dev/null and b/configs/singers_sample/47-wave-girl/023.wav differ diff --git a/configs/singers_sample/47-wave-girl/024.wav b/configs/singers_sample/47-wave-girl/024.wav new file mode 100644 index 0000000000000000000000000000000000000000..83c3fce417a04c3928bae789f760ddf1c233cd86 Binary files /dev/null and b/configs/singers_sample/47-wave-girl/024.wav differ diff --git a/configs/singers_sample/47-wave-girl/025.wav b/configs/singers_sample/47-wave-girl/025.wav new file mode 100644 index 
0000000000000000000000000000000000000000..1e6d11027c8a2d120d0d457a0501c7a6b6902600 Binary files /dev/null and b/configs/singers_sample/47-wave-girl/025.wav differ diff --git a/configs/singers_sample/51-wave-boy/006.wav b/configs/singers_sample/51-wave-boy/006.wav new file mode 100644 index 0000000000000000000000000000000000000000..f2b140dbd1207bbc81d5cb207f052faa59c573a5 Binary files /dev/null and b/configs/singers_sample/51-wave-boy/006.wav differ diff --git a/configs/singers_sample/51-wave-boy/007.wav b/configs/singers_sample/51-wave-boy/007.wav new file mode 100644 index 0000000000000000000000000000000000000000..76115acb255395428ecc063fb63616137c4f58b0 Binary files /dev/null and b/configs/singers_sample/51-wave-boy/007.wav differ diff --git a/configs/singers_sample/51-wave-boy/008.wav b/configs/singers_sample/51-wave-boy/008.wav new file mode 100644 index 0000000000000000000000000000000000000000..117f33965675781598fe5200a1d73e64db797936 Binary files /dev/null and b/configs/singers_sample/51-wave-boy/008.wav differ diff --git a/configs/singers_sample/51-wave-boy/009.wav b/configs/singers_sample/51-wave-boy/009.wav new file mode 100644 index 0000000000000000000000000000000000000000..b38ba32ccb0b95ab53c4a55477d70c408970b563 Binary files /dev/null and b/configs/singers_sample/51-wave-boy/009.wav differ diff --git a/configs/singers_sample/51-wave-boy/010.wav b/configs/singers_sample/51-wave-boy/010.wav new file mode 100644 index 0000000000000000000000000000000000000000..99a43ba2013a126103bd167689b06f819fef8573 Binary files /dev/null and b/configs/singers_sample/51-wave-boy/010.wav differ diff --git a/crepe/LICENSE.txt b/crepe/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..efc01ae87f6cc931d539ee9672a4e00aa583814c --- /dev/null +++ b/crepe/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Max Morrison + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crepe/README.md b/crepe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..296537c8aee47545f5600a6e7d84731d535e84d8 --- /dev/null +++ b/crepe/README.md @@ -0,0 +1,223 @@ +
# torchcrepe

[![PyPI](https://img.shields.io/pypi/v/torchcrepe.svg)](https://pypi.python.org/pypi/torchcrepe)
[![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Downloads](https://pepy.tech/badge/torchcrepe)](https://pepy.tech/project/torchcrepe)
+ +Pytorch implementation of the CREPE [1] pitch tracker. The original Tensorflow +implementation can be found [here](https://github.com/marl/crepe/). The +provided model weights were obtained by converting the "tiny" and "full" models +using [MMdnn](https://github.com/microsoft/MMdnn), an open-source model +management framework. + + +## Installation +Perform the system-dependent PyTorch install using the instructions found +[here](https://pytorch.org/). + +`pip install torchcrepe` + + +## Usage + +### Computing pitch and periodicity from audio + + +```python +import torchcrepe + + +# Load audio +audio, sr = torchcrepe.load.audio( ... ) + +# Here we'll use a 5 millisecond hop length +hop_length = int(sr / 200.) + +# Provide a sensible frequency range for your domain (upper limit is 2006 Hz) +# This would be a reasonable range for speech +fmin = 50 +fmax = 550 + +# Select a model capacity--one of "tiny" or "full" +model = 'tiny' + +# Choose a device to use for inference +device = 'cuda:0' + +# Pick a batch size that doesn't cause memory errors on your gpu +batch_size = 2048 + +# Compute pitch using first gpu +pitch = torchcrepe.predict(audio, + sr, + hop_length, + fmin, + fmax, + model, + batch_size=batch_size, + device=device) +``` + +A periodicity metric similar to the Crepe confidence score can also be +extracted by passing `return_periodicity=True` to `torchcrepe.predict`. + + +### Decoding + +By default, `torchcrepe` uses Viterbi decoding on the softmax of the network +output. This is different than the original implementation, which uses a +weighted average near the argmax of binary cross-entropy probabilities. +The argmax operation can cause double/half frequency errors. These can be +removed by penalizing large pitch jumps via Viterbi decoding. The `decode` +submodule provides some options for decoding. + +```python +# Decode using viterbi decoding (default) +torchcrepe.predict(..., decoder=torchcrepe.decode.viterbi) + +# Decode using weighted argmax (as in the original implementation) +torchcrepe.predict(..., decoder=torchcrepe.decode.weighted_argmax) + +# Decode using argmax +torchcrepe.predict(..., decoder=torchcrepe.decode.argmax) +``` + + +### Filtering and thresholding + +When periodicity is low, the pitch is less reliable. For some problems, it +makes sense to mask these less reliable pitch values. However, the periodicity +can be noisy and the pitch has quantization artifacts. `torchcrepe` provides +submodules `filter` and `threshold` for this purpose. The filter and threshold +parameters should be tuned to your data. For clean speech, a 10-20 millisecond +window with a threshold of 0.21 has worked. + +```python +# We'll use a 15 millisecond window assuming a hop length of 5 milliseconds +win_length = 3 + +# Median filter noisy confidence value +periodicity = torchcrepe.filter.median(periodicity, win_length) + +# Remove inharmonic regions +pitch = torchcrepe.threshold.At(.21)(pitch, periodicity) + +# Optionally smooth pitch to remove quantization artifacts +pitch = torchcrepe.filter.mean(pitch, win_length) +``` + +For more fine-grained control over pitch thresholding, see +`torchcrepe.threshold.Hysteresis`. This is especially useful for removing +spurious voiced regions caused by noise in the periodicity values, but +has more parameters and may require more manual tuning to your data. + +CREPE was not trained on silent audio. Therefore, it sometimes assigns high +confidence to pitch bins in silent regions. 
You can use +`torchcrepe.threshold.Silence` to manually set the periodicity in silent +regions to zero. + +```python +periodicity = torchcrepe.threshold.Silence(-60.)(periodicity, + audio, + sr, + hop_length) +``` + + +### Computing the CREPE model output activations + +```python +batch = next(torchcrepe.preprocess(audio, sr, hop_length)) +probabilities = torchcrepe.infer(batch) +``` + + +### Computing the CREPE embedding space + +As in Differentiable Digital Signal Processing [2], this uses the output of the +fifth max-pooling layer as a pretrained pitch embedding + +```python +embeddings = torchcrepe.embed(audio, sr, hop_length) +``` + +### Computing from files + +`torchcrepe` defines the following functions convenient for predicting +directly from audio files on disk. Each of these functions also takes +a `device` argument that can be used for device placement (e.g., +`device='cuda:0'`). + +```python +torchcrepe.predict_from_file(audio_file, ...) +torchcrepe.predict_from_file_to_file( + audio_file, output_pitch_file, output_periodicity_file, ...) +torchcrepe.predict_from_files_to_files( + audio_files, output_pitch_files, output_periodicity_files, ...) + +torchcrepe.embed_from_file(audio_file, ...) +torchcrepe.embed_from_file_to_file(audio_file, output_file, ...) +torchcrepe.embed_from_files_to_files(audio_files, output_files, ...) +``` + +### Command-line interface + +```bash +usage: python -m torchcrepe + [-h] + --audio_files AUDIO_FILES [AUDIO_FILES ...] + --output_files OUTPUT_FILES [OUTPUT_FILES ...] + [--hop_length HOP_LENGTH] + [--output_periodicity_files OUTPUT_PERIODICITY_FILES [OUTPUT_PERIODICITY_FILES ...]] + [--embed] + [--fmin FMIN] + [--fmax FMAX] + [--model MODEL] + [--decoder DECODER] + [--gpu GPU] + [--no_pad] + +optional arguments: + -h, --help show this help message and exit + --audio_files AUDIO_FILES [AUDIO_FILES ...] + The audio file to process + --output_files OUTPUT_FILES [OUTPUT_FILES ...] + The file to save pitch or embedding + --hop_length HOP_LENGTH + The hop length of the analysis window + --output_periodicity_files OUTPUT_PERIODICITY_FILES [OUTPUT_PERIODICITY_FILES ...] + The file to save periodicity + --embed Performs embedding instead of pitch prediction + --fmin FMIN The minimum frequency allowed + --fmax FMAX The maximum frequency allowed + --model MODEL The model capacity. One of "tiny" or "full" + --decoder DECODER The decoder to use. One of "argmax", "viterbi", or + "weighted_argmax" + --gpu GPU The gpu to perform inference on + --no_pad Whether to pad the audio +``` + + +## Tests + +The module tests can be run as follows. + +```bash +pip install pytest +pytest +``` + + +## References +[1] J. W. Kim, J. Salamon, P. Li, and J. P. Bello, “Crepe: A +Convolutional Representation for Pitch Estimation,” in 2018 IEEE +International Conference on Acoustics, Speech and Signal +Processing (ICASSP). + +[2] J. H. Engel, L. Hantrakul, C. Gu, and A. Roberts, +“DDSP: Differentiable Digital Signal Processing,” in +2020 International Conference on Learning +Representations (ICLR). diff --git a/crepe/__init__.py b/crepe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f78e20d4a4a07cb7dfc37df643d96a34a4486ccd --- /dev/null +++ b/crepe/__init__.py @@ -0,0 +1,8 @@ +from . import decode +from .core import * +from .model import Crepe +from . import convert +from . import filter +from . import load +from . import loudness +from . 
import threshold diff --git a/crepe/__main__.py b/crepe/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..4d1a3120adea147778bc5829ae9b8037bed8efd0 --- /dev/null +++ b/crepe/__main__.py @@ -0,0 +1,148 @@ +import argparse +import os +import warnings + +import crepe + + +############################################################################### +# Entry point +############################################################################### + + +def parse_args(): + """Parse command-line arguments""" + parser = argparse.ArgumentParser() + + # Required arguments + parser.add_argument( + '--audio_files', + nargs='+', + required=True, + help='The audio file to process') + parser.add_argument( + '--output_files', + nargs='+', + required=True, + help='The file to save pitch or embedding') + parser.add_argument( + '--hop_length', + type=int, + help='The hop length of the analysis window') + + # Optionally save harmonicity [DEPRECATED] + parser.add_argument( + '--output_harmonicity_files', + nargs='+', + help='The file to save harmonicity') + # Optionally save periodicity + parser.add_argument( + '--output_periodicity_files', + nargs='+', + help='The files to save periodicity') + + # Optionally create embedding instead of pitch contour + parser.add_argument( + '--embed', + action='store_true', + help='Performs embedding instead of pitch prediction') + + # Optional arguments + parser.add_argument( + '--fmin', + default=50., + type=float, + help='The minimum frequency allowed') + parser.add_argument( + '--fmax', + default=crepe.MAX_FMAX, + type=float, + help='The maximum frequency allowed') + parser.add_argument( + '--model', + default='full', + help='The model capacity. One of "tiny" or "full"') + parser.add_argument( + '--decoder', + default='viterbi', + help='The decoder to use. One of "argmax", "viterbi", or ' + + '"weighted_argmax"') + parser.add_argument( + '--batch_size', + type=int, + help='The number of frames per batch') + parser.add_argument( + '--gpu', + type=int, + help='The gpu to perform inference on') + parser.add_argument( + '--no_pad', + action='store_true', + help='Whether to pad the audio') + + return parser.parse_args() + + +def make_parent_directory(file): + """Create parent directory for file if it does not already exist""" + parent = os.path.dirname(os.path.abspath(file)) + os.makedirs(parent, exist_ok=True) + + +def main(): + # Parse command-line arguments + args = parse_args() + + # Deprecate output_harmonicity_files + if args.output_harmonicity_files is not None: + message = ( + 'The crepe output_harmonicity_files argument is deprecated and ' + 'will be removed in a future release. Please use ' + 'output_periodicity_files. Rationale: if network confidence measured ' + 'harmonic content, the value would be low for non-harmonic, periodic ' + 'sounds (e.g., sine waves). 
But this is not observed.') + warnings.warn(message, DeprecationWarning) + args.output_periodicity_files = args.output_harmonicity_files + + # Ensure output directory exist + [make_parent_directory(file) for file in args.output_files] + if args.output_periodicity_files is not None: + [make_parent_directory(file) for file in args.output_periodicity_files] + + # Get inference device + device = 'cpu' if args.gpu is None else f'cuda:{args.gpu}' + + # Get decoder + if args.decoder == 'argmax': + decoder = crepe.decode.argmax + elif args.decoder == 'weighted_argmax': + decoder = crepe.decode.weighted_argmax + elif args.decoder == 'viterbi': + decoder = crepe.decode.viterbi + + # Infer pitch or embedding and save to disk + if args.embed: + crepe.embed_from_files_to_files(args.audio_files, + args.output_files, + args.hop_length, + args.model, + args.batch_size, + device, + not args.no_pad) + else: + crepe.predict_from_files_to_files(args.audio_files, + args.output_files, + None, + args.output_periodicity_files, + args.hop_length, + args.fmin, + args.fmax, + args.model, + decoder, + args.batch_size, + device, + not args.no_pad) + + +# Run module entry point +main() diff --git a/crepe/assets/tiny.pth b/crepe/assets/tiny.pth new file mode 100644 index 0000000000000000000000000000000000000000..79d10d896a956c54dee45257cfe6bf87425bbdf5 --- /dev/null +++ b/crepe/assets/tiny.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4993eea36ed1a0ad9ac549c740dae5265b049ce72004f00c2f59e01c0be8432 +size 1962363 diff --git a/crepe/convert.py b/crepe/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..27ace1e111bb1c824894af50125c60a73af9bc20 --- /dev/null +++ b/crepe/convert.py @@ -0,0 +1,57 @@ +import scipy +import torch + +import crepe + + +############################################################################### +# Pitch unit conversions +############################################################################### + + +def bins_to_cents(bins): + """Converts pitch bins to cents""" + cents = crepe.CENTS_PER_BIN * bins + 1997.3794084376191 + + # Trade quantization error for noise + return dither(cents) + + +def bins_to_frequency(bins): + """Converts pitch bins to frequency in Hz""" + return cents_to_frequency(bins_to_cents(bins)) + + +def cents_to_bins(cents, quantize_fn=torch.floor): + """Converts cents to pitch bins""" + bins = (cents - 1997.3794084376191) / crepe.CENTS_PER_BIN + return quantize_fn(bins).int() + + +def cents_to_frequency(cents): + """Converts cents to frequency in Hz""" + return 10 * 2 ** (cents / 1200) + + +def frequency_to_bins(frequency, quantize_fn=torch.floor): + """Convert frequency in Hz to pitch bins""" + return cents_to_bins(frequency_to_cents(frequency), quantize_fn) + + +def frequency_to_cents(frequency): + """Convert frequency in Hz to cents""" + return 1200 * torch.log2(frequency / 10.) 
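# A rough worked example of the conversions above (the 440 Hz input is only an
# illustration; CENTS_PER_BIN is 20 and the reference frequency is 10 Hz):
#
#   f = torch.tensor(440.)               # concert A
#   cents = frequency_to_cents(f)        # 1200 * log2(44) ~= 6551.3 cents
#   bins = cents_to_bins(cents)          # floor((6551.3 - 1997.38) / 20) = 227
#   cents_to_frequency(cents)            # ~= 440 Hz (exact round trip)
#   bins_to_frequency(bins)              # ~= 440 Hz, minus up to 20 cents of
#                                        # flooring error plus dither noise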
+ + +############################################################################### +# Utilities +############################################################################### + + +def dither(cents): + """Dither the predicted pitch in cents to remove quantization error""" + noise = scipy.stats.triang.rvs(c=0.5, + loc=-crepe.CENTS_PER_BIN, + scale=2 * crepe.CENTS_PER_BIN, + size=cents.size()) + return cents + cents.new_tensor(noise) diff --git a/crepe/core.py b/crepe/core.py new file mode 100644 index 0000000000000000000000000000000000000000..fa7f0dd8e794ac3475a69cf7c80dc880a5f1598d --- /dev/null +++ b/crepe/core.py @@ -0,0 +1,738 @@ +import warnings + +import numpy as np +import resampy +import torch +import tqdm + +import crepe + + +__all__ = ['CENTS_PER_BIN', + 'MAX_FMAX', + 'PITCH_BINS', + 'SAMPLE_RATE', + 'WINDOW_SIZE', + 'UNVOICED', + 'embed', + 'embed_from_file', + 'embed_from_file_to_file', + 'embed_from_files_to_files', + 'infer', + 'predict', + 'predict_from_file', + 'predict_from_file_to_file', + 'predict_from_files_to_files', + 'preprocess', + 'postprocess', + 'resample'] + + +############################################################################### +# Constants +############################################################################### + + +CENTS_PER_BIN = 20 # cents +MAX_FMAX = 2006. # hz +PITCH_BINS = 360 +SAMPLE_RATE = 16000 # hz +WINDOW_SIZE = 1024 # samples +UNVOICED = np.nan + + +############################################################################### +# Crepe pitch prediction +############################################################################### + + +def predict(audio, + sample_rate, + hop_length=None, + fmin=50., + fmax=MAX_FMAX, + model='full', + decoder=crepe.decode.viterbi, + return_harmonicity=False, + return_periodicity=False, + batch_size=None, + device='cpu', + pad=True): + """Performs pitch estimation + + Arguments + audio (torch.tensor [shape=(1, time)]) + The audio signal + sample_rate (int) + The sampling rate in Hz + hop_length (int) + The hop_length in samples + fmin (float) + The minimum allowable frequency in Hz + fmax (float) + The maximum allowable frequency in Hz + model (string) + The model capacity. One of 'full' or 'tiny'. + decoder (function) + The decoder to use. See decode.py for decoders. + return_harmonicity (bool) [DEPRECATED] + Whether to also return the network confidence + return_periodicity (bool) + Whether to also return the network confidence + batch_size (int) + The number of frames per batch + device (string) + The device used to run inference + pad (bool) + Whether to zero-pad the audio + + Returns + pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))]) + (Optional) periodicity (torch.tensor + [shape=(1, 1 + int(time // hop_length))]) + """ + # Deprecate return_harmonicity + if return_harmonicity: + message = ( + 'The crepe return_harmonicity argument is deprecated and ' + 'will be removed in a future release. Please use ' + 'return_periodicity. Rationale: if network confidence measured ' + 'harmonics, the value would be low for non-harmonic, periodic ' + 'sounds (e.g., sine waves). 
But this is not observed.') + warnings.warn(message, DeprecationWarning) + return_periodicity = return_harmonicity + + results = [] + + # Postprocessing breaks gradients, so just don't compute them + with torch.no_grad(): + + # Preprocess audio + generator = preprocess(audio, + sample_rate, + hop_length, + batch_size, + device, + pad) + for frames in generator: + + # Infer independent probabilities for each pitch bin + probabilities = infer(frames, model) + + # shape=(batch, 360, time / hop_length) + probabilities = probabilities.reshape( + audio.size(0), -1, PITCH_BINS).transpose(1, 2) + + # Convert probabilities to F0 and periodicity + result = postprocess(probabilities, + fmin, + fmax, + decoder, + return_harmonicity, + return_periodicity) + + # Place on same device as audio to allow very long inputs + if isinstance(result, tuple): + result = (result[0].to(audio.device), + result[1].to(audio.device)) + else: + result = result.to(audio.device) + + results.append(result) + + # Split pitch and periodicity + if return_periodicity: + pitch, periodicity = zip(*results) + return torch.cat(pitch, 1), torch.cat(periodicity, 1) + + # Concatenate + return torch.cat(results, 1) + + +def predict_from_file(audio_file, + hop_length=None, + fmin=50., + fmax=MAX_FMAX, + model='full', + decoder=crepe.decode.viterbi, + return_harmonicity=False, + return_periodicity=False, + batch_size=None, + device='cpu', + pad=True): + """Performs pitch estimation from file on disk + + Arguments + audio_file (string) + The file to perform pitch tracking on + hop_length (int) + The hop_length in samples + fmin (float) + The minimum allowable frequency in Hz + fmax (float) + The maximum allowable frequency in Hz + model (string) + The model capacity. One of 'full' or 'tiny'. + decoder (function) + The decoder to use. See decode.py for decoders. + return_harmonicity (bool) [DEPRECATED] + Whether to also return the network confidence + return_periodicity (bool) + Whether to also return the network confidence + batch_size (int) + The number of frames per batch + device (string) + The device used to run inference + pad (bool) + Whether to zero-pad the audio + + Returns + pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))]) + (Optional) periodicity (torch.tensor + [shape=(1, 1 + int(time // hop_length))]) + """ + # Load audio + audio, sample_rate = crepe.load.audio(audio_file) + + # Predict + return predict(audio, + sample_rate, + hop_length, + fmin, + fmax, + model, + decoder, + return_harmonicity, + return_periodicity, + batch_size, + device, + pad) + + +def predict_from_file_to_file(audio_file, + output_pitch_file, + output_harmonicity_file=None, + output_periodicity_file=None, + hop_length=None, + fmin=50., + fmax=MAX_FMAX, + model='full', + decoder=crepe.decode.viterbi, + batch_size=None, + device='cpu', + pad=True): + """Performs pitch estimation from file on disk + + Arguments + audio_file (string) + The file to perform pitch tracking on + output_pitch_file (string) + The file to save predicted pitch + output_harmonicity_file (string or None) [DEPRECATED] + The file to save predicted harmonicity + output_periodicity_file (string or None) + The file to save predicted periodicity + hop_length (int) + The hop_length in samples + fmin (float) + The minimum allowable frequency in Hz + fmax (float) + The maximum allowable frequency in Hz + model (string) + The model capacity. One of 'full' or 'tiny'. + decoder (function) + The decoder to use. See decode.py for decoders. 
+ batch_size (int) + The number of frames per batch + device (string) + The device used to run inference + pad (bool) + Whether to zero-pad the audio + """ + # Deprecate output_harmonicity_file + if output_harmonicity_file is not None: + message = ( + 'The crepe output_harmonicity_file argument is deprecated and ' + 'will be removed in a future release. Please use ' + 'output_periodicity_file. Rationale: if network confidence measured ' + 'harmonic content, the value would be low for non-harmonic, periodic ' + 'sounds (e.g., sine waves). But this is not observed.') + warnings.warn(message, DeprecationWarning) + output_periodicity_file = output_harmonicity_file + + # Predict from file + prediction = predict_from_file(audio_file, + hop_length, + fmin, + fmax, + model, + decoder, + False, + output_periodicity_file is not None, + batch_size, + device, + pad) + + # Save to disk + if output_periodicity_file is not None: + torch.save(prediction[0].detach(), output_pitch_file) + torch.save(prediction[1].detach(), output_periodicity_file) + else: + torch.save(prediction.detach(), output_pitch_file) + + +def predict_from_files_to_files(audio_files, + output_pitch_files, + output_harmonicity_files=None, + output_periodicity_files=None, + hop_length=None, + fmin=50., + fmax=MAX_FMAX, + model='full', + decoder=crepe.decode.viterbi, + batch_size=None, + device='cpu', + pad=True): + """Performs pitch estimation from files on disk without reloading model + + Arguments + audio_files (list[string]) + The files to perform pitch tracking on + output_pitch_files (list[string]) + The files to save predicted pitch + output_harmonicity_files (list[string] or None) [DEPRECATED] + The files to save predicted harmonicity + output_periodicity_files (list[string] or None) + The files to save predicted periodicity + hop_length (int) + The hop_length in samples + fmin (float) + The minimum allowable frequency in Hz + fmax (float) + The maximum allowable frequency in Hz + model (string) + The model capacity. One of 'full' or 'tiny'. + decoder (function) + The decoder to use. See decode.py for decoders. + batch_size (int) + The number of frames per batch + device (string) + The device used to run inference + pad (bool) + Whether to zero-pad the audio + """ + # Deprecate output_harmonicity_files + if output_harmonicity_files is not None: + message = ( + 'The crepe output_harmonicity_files argument is deprecated and ' + 'will be removed in a future release. Please use ' + 'output_periodicity_files. Rationale: if network confidence measured ' + 'harmonic content, the value would be low for non-harmonic, periodic ' + 'sounds (e.g., sine waves). 
But this is not observed.') + warnings.warn(message, DeprecationWarning) + output_periodicity_files = output_harmonicity_files + + if output_periodicity_files is None: + output_periodicity_files = len(audio_files) * [None] + + # Setup iterator + iterator = zip(audio_files, output_pitch_files, output_periodicity_files) + iterator = tqdm.tqdm(iterator, desc='crepe', dynamic_ncols=True) + for audio_file, output_pitch_file, output_periodicity_file in iterator: + + # Predict a file + predict_from_file_to_file(audio_file, + output_pitch_file, + None, + output_periodicity_file, + hop_length, + fmin, + fmax, + model, + decoder, + batch_size, + device, + pad) + +############################################################################### +# Crepe pitch embedding +############################################################################### + + +def embed(audio, + sample_rate, + hop_length=None, + model='full', + batch_size=None, + device='cpu', + pad=True): + """Embeds audio to the output of CREPE's fifth maxpool layer + + Arguments + audio (torch.tensor [shape=(1, time)]) + The audio signals + sample_rate (int) + The sampling rate in Hz + hop_length (int) + The hop_length in samples + model (string) + The model capacity. One of 'full' or 'tiny'. + batch_size (int) + The number of frames per batch + device (string) + The device to run inference on + pad (bool) + Whether to zero-pad the audio + + Returns + embedding (torch.tensor [shape=(1, + 1 + int(time // hop_length), 32, -1)]) + """ + results = [] + + # Preprocess audio + generator = preprocess(audio, + sample_rate, + hop_length, + batch_size, + device, + pad) + for frames in generator: + + # Infer pitch embeddings + embedding = infer(frames, model, embed=True) + + # shape=(batch, time / hop_length, 32, embedding_size) + result = embedding.reshape(audio.size(0), frames.size(0), 32, -1) + + # Place on same device as audio. This allows for large inputs. + results.append(result.to(audio.device)) + + # Concatenate + return torch.cat(results, 1) + + +def embed_from_file(audio_file, + hop_length=None, + model='full', + batch_size=None, + device='cpu', + pad=True): + """Embeds audio from disk to the output of CREPE's fifth maxpool layer + + Arguments + audio_file (string) + The wav file containing the audio to embed + hop_length (int) + The hop_length in samples + model (string) + The model capacity. One of 'full' or 'tiny'. + batch_size (int) + The number of frames per batch + device (string) + The device to run inference on + pad (bool) + Whether to zero-pad the audio + + Returns + embedding (torch.tensor [shape=(1, + 1 + int(time // hop_length), 32, -1)]) + """ + # Load audio + audio, sample_rate = crepe.load.audio(audio_file) + + # Embed + return embed(audio, + sample_rate, + hop_length, + model, + batch_size, + device, + pad) + + +def embed_from_file_to_file(audio_file, + output_file, + hop_length=None, + model='full', + batch_size=None, + device='cpu', + pad=True): + """Embeds audio from disk and saves to disk + + Arguments + audio_file (string) + The wav file containing the audio to embed + hop_length (int) + The hop_length in samples + output_file (string) + The file to save the embedding + model (string) + The model capacity. One of 'full' or 'tiny'. 
+ batch_size (int) + The number of frames per batch + device (string) + The device to run inference on + pad (bool) + Whether to zero-pad the audio + """ + # No use computing gradients if we're just saving to file + with torch.no_grad(): + + # Embed + embedding = embed_from_file(audio_file, + hop_length, + model, + batch_size, + device, + pad) + + # Save to disk + torch.save(embedding.detach(), output_file) + + +def embed_from_files_to_files(audio_files, + output_files, + hop_length=None, + model='full', + batch_size=None, + device='cpu', + pad=True): + """Embeds audio from disk and saves to disk without reloading model + + Arguments + audio_files (list[string]) + The wav files containing the audio to embed + output_files (list[string]) + The files to save the embeddings + hop_length (int) + The hop_length in samples + model (string) + The model capacity. One of 'full' or 'tiny'. + batch_size (int) + The number of frames per batch + device (string) + The device to run inference on + pad (bool) + Whether to zero-pad the audio + """ + # Setup iterator + iterator = zip(audio_files, output_files) + iterator = tqdm.tqdm(iterator, desc='crepe', dynamic_ncols=True) + for audio_file, output_file in iterator: + + # Embed a file + embed_from_file_to_file(audio_file, + output_file, + hop_length, + model, + batch_size, + device, + pad) + + +############################################################################### +# Components for step-by-step prediction +############################################################################### + + +def infer(frames, model='full', embed=False): + """Forward pass through the model + + Arguments + frames (torch.tensor [shape=(time / hop_length, 1024)]) + The network input + model (string) + The model capacity. One of 'full' or 'tiny'. 
+ embed (bool) + Whether to stop inference at the intermediate embedding layer + + Returns + logits (torch.tensor [shape=(1 + int(time // hop_length), 360)]) OR + embedding (torch.tensor [shape=(1 + int(time // hop_length), + embedding_size)]) + """ + # Load the model if necessary + if not hasattr(infer, 'model') or not hasattr(infer, 'capacity') or \ + (hasattr(infer, 'capacity') and infer.capacity != model): + crepe.load.model(frames.device, model) + + # Move model to correct device (no-op if devices are the same) + infer.model = infer.model.to(frames.device) + + # Apply model + return infer.model(frames, embed=embed) + + +def postprocess(probabilities, + fmin=0., + fmax=MAX_FMAX, + decoder=crepe.decode.viterbi, + return_harmonicity=False, + return_periodicity=False): + """Convert model output to F0 and periodicity + + Arguments + probabilities (torch.tensor [shape=(1, 360, time / hop_length)]) + The probabilities for each pitch bin inferred by the network + fmin (float) + The minimum allowable frequency in Hz + fmax (float) + The maximum allowable frequency in Hz + viterbi (bool) + Whether to use viterbi decoding + return_harmonicity (bool) [DEPRECATED] + Whether to also return the network confidence + return_periodicity (bool) + Whether to also return the network confidence + + Returns + pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))]) + periodicity (torch.tensor [shape=(1, 1 + int(time // hop_length))]) + """ + # Sampling is non-differentiable, so remove from graph + probabilities = probabilities.detach() + + # Convert frequency range to pitch bin range + minidx = crepe.convert.frequency_to_bins(torch.tensor(fmin)) + maxidx = crepe.convert.frequency_to_bins(torch.tensor(fmax), + torch.ceil) + + # Remove frequencies outside of allowable range + probabilities[:, :minidx] = -float('inf') + probabilities[:, maxidx:] = -float('inf') + + # Perform argmax or viterbi sampling + bins, pitch = decoder(probabilities) + + # Deprecate return_harmonicity + if return_harmonicity: + message = ( + 'The crepe return_harmonicity argument is deprecated and ' + 'will be removed in a future release. Please use ' + 'return_periodicity. Rationale: if network confidence measured ' + 'harmonics, the value would be low for non-harmonic, periodic ' + 'sounds (e.g., sine waves). 
But this is not observed.') + warnings.warn(message, DeprecationWarning) + return_periodicity = return_harmonicity + + if not return_periodicity: + return pitch + + # Compute periodicity from probabilities and decoded pitch bins + return pitch, periodicity(probabilities, bins) + + +def preprocess(audio, + sample_rate, + hop_length=None, + batch_size=None, + device='cpu', + pad=True): + """Convert audio to model input + + Arguments + audio (torch.tensor [shape=(1, time)]) + The audio signals + sample_rate (int) + The sampling rate in Hz + hop_length (int) + The hop_length in samples + batch_size (int) + The number of frames per batch + device (string) + The device to run inference on + pad (bool) + Whether to zero-pad the audio + + Returns + frames (torch.tensor [shape=(1 + int(time // hop_length), 1024)]) + """ + # Default hop length of 10 ms + hop_length = sample_rate // 100 if hop_length is None else hop_length + + # Resample + if sample_rate != SAMPLE_RATE: + audio = resample(audio, sample_rate) + hop_length = int(hop_length * SAMPLE_RATE / sample_rate) + + # Get total number of frames + + # Maybe pad + if pad: + total_frames = 1 + int(audio.size(1) // hop_length) + audio = torch.nn.functional.pad( + audio, + (WINDOW_SIZE // 2, WINDOW_SIZE // 2)) + else: + total_frames = 1 + int((audio.size(1) - WINDOW_SIZE) // hop_length) + + # Default to running all frames in a single batch + batch_size = total_frames if batch_size is None else batch_size + + # Generate batches + for i in range(0, total_frames, batch_size): + + # Batch indices + start = max(0, i * hop_length) + end = min(audio.size(1), + (i + batch_size - 1) * hop_length + WINDOW_SIZE) + + # Chunk + frames = torch.nn.functional.unfold( + audio[:, None, None, start:end], + kernel_size=(1, WINDOW_SIZE), + stride=(1, hop_length)) + + # shape=(1 + int(time / hop_length, 1024) + frames = frames.transpose(1, 2).reshape(-1, WINDOW_SIZE) + + # Place on device + frames = frames.to(device) + + # Mean-center + frames -= frames.mean(dim=1, keepdim=True) + + # Scale + # Note: during silent frames, this produces very large values. But + # this seems to be what the network expects. 
+ frames /= torch.max(torch.tensor(1e-10, device=frames.device), + frames.std(dim=1, keepdim=True)) + + yield frames + + +############################################################################### +# Utilities +############################################################################### + + +def periodicity(probabilities, bins): + """Computes the periodicity from the network output and pitch bins""" + # shape=(batch * time / hop_length, 360) + probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS) + + # shape=(batch * time / hop_length, 1) + bins_stacked = bins.reshape(-1, 1).to(torch.int64) + + # Use maximum logit over pitch bins as periodicity + periodicity = probs_stacked.gather(1, bins_stacked) + + # shape=(batch, time / hop_length) + return periodicity.reshape(probabilities.size(0), probabilities.size(2)) + + +def resample(audio, sample_rate): + """Resample audio""" + # Store device for later placement + device = audio.device + + # Convert to numpy + audio = audio.detach().cpu().numpy().squeeze(0) + + # Resample + # We have to use resampy if we want numbers to match Crepe + audio = resampy.resample(audio, sample_rate, SAMPLE_RATE) + + # Convert to pytorch + return torch.tensor(audio, device=device).unsqueeze(0) diff --git a/crepe/decode.py b/crepe/decode.py new file mode 100644 index 0000000000000000000000000000000000000000..559e566b8e2c09fb7634c6ac9ce867731295901b --- /dev/null +++ b/crepe/decode.py @@ -0,0 +1,80 @@ +import librosa +import numpy as np +import torch + +import crepe + + +############################################################################### +# Probability sequence decoding methods +############################################################################### + + +def argmax(logits): + """Sample observations by taking the argmax""" + bins = logits.argmax(dim=1) + + # Convert to frequency in Hz + return bins, crepe.convert.bins_to_frequency(bins) + + +def weighted_argmax(logits): + """Sample observations using weighted sum near the argmax""" + # Find center of analysis window + bins = logits.argmax(dim=1) + + # Find bounds of analysis window + start = torch.max(torch.tensor(0, device=logits.device), bins - 4) + end = torch.min(torch.tensor(logits.size(1), device=logits.device), bins + 5) + + # Mask out everything outside of window + for batch in range(logits.size(0)): + for time in range(logits.size(2)): + logits[batch, :start[batch, time], time] = -float('inf') + logits[batch, end[batch, time]:, time] = -float('inf') + + # Construct weights + if not hasattr(weighted_argmax, 'weights'): + weights = crepe.convert.bins_to_cents(torch.arange(360)) + weighted_argmax.weights = weights[None, :, None] + + # Ensure devices are the same (no-op if they are) + weighted_argmax.weights = weighted_argmax.weights.to(logits.device) + + # Convert to probabilities + with torch.no_grad(): + probs = torch.sigmoid(logits) + + # Apply weights + cents = (weighted_argmax.weights * probs).sum(dim=1) / probs.sum(dim=1) + + # Convert to frequency in Hz + return bins, crepe.convert.cents_to_frequency(cents) + + +def viterbi(logits): + """Sample observations using viterbi decoding""" + # Create viterbi transition matrix + if not hasattr(viterbi, 'transition'): + xx, yy = np.meshgrid(range(360), range(360)) + transition = np.maximum(12 - abs(xx - yy), 0) + transition = transition / transition.sum(axis=1, keepdims=True) + viterbi.transition = transition + + # Normalize logits + with torch.no_grad(): + probs = torch.nn.functional.softmax(logits, dim=1) + + # Convert to 
numpy + sequences = probs.cpu().numpy() + + # Perform viterbi decoding + bins = np.array([ + librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64) + for sequence in sequences]) + + # Convert to pytorch + bins = torch.tensor(bins, device=probs.device) + + # Convert to frequency in Hz + return bins, crepe.convert.bins_to_frequency(bins) diff --git a/crepe/filter.py b/crepe/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..dd62ef59c7e2c7dd0c2544ae17b5ef60d0b642f6 --- /dev/null +++ b/crepe/filter.py @@ -0,0 +1,195 @@ +import numpy as np +import torch +from torch.nn import functional as F + +############################################################################### +# Sequence filters +############################################################################### + + +def mean(signals, win_length=9): + """Averave filtering for signals containing nan values + + Arguments + signals (torch.tensor (shape=(batch, time))) + The signals to filter + win_length + The size of the analysis window + + Returns + filtered (torch.tensor (shape=(batch, time))) + """ + + assert signals.dim() == 2, "Input tensor must have 2 dimensions (batch_size, width)" + signals = signals.unsqueeze(1) + + # Apply the mask by setting masked elements to zero, or make NaNs zero + mask = ~torch.isnan(signals) + masked_x = torch.where(mask, signals, torch.zeros_like(signals)) + + # Create a ones kernel with the same number of channels as the input tensor + ones_kernel = torch.ones(signals.size(1), 1, win_length, device=signals.device) + + # Perform sum pooling + sum_pooled = F.conv1d( + masked_x, + ones_kernel, + stride=1, + padding=win_length // 2, + ) + + # Count the non-masked (valid) elements in each pooling window + valid_count = F.conv1d( + mask.float(), + ones_kernel, + stride=1, + padding=win_length // 2, + ) + valid_count = valid_count.clamp(min=1) # Avoid division by zero + + # Perform masked average pooling + avg_pooled = sum_pooled / valid_count + + # Fill zero values with NaNs + avg_pooled[avg_pooled == 0] = float("nan") + + return avg_pooled.squeeze(1) + + +def median(signals, win_length): + """Median filtering for signals containing nan values + + Arguments + signals (torch.tensor (shape=(batch, time))) + The signals to filter + win_length + The size of the analysis window + + Returns + filtered (torch.tensor (shape=(batch, time))) + """ + + assert signals.dim() == 2, "Input tensor must have 2 dimensions (batch_size, width)" + signals = signals.unsqueeze(1) + + mask = ~torch.isnan(signals) + masked_x = torch.where(mask, signals, torch.zeros_like(signals)) + padding = win_length // 2 + + x = F.pad(masked_x, (padding, padding), mode="reflect") + mask = F.pad(mask.float(), (padding, padding), mode="constant", value=0) + + x = x.unfold(2, win_length, 1) + mask = mask.unfold(2, win_length, 1) + + x = x.contiguous().view(x.size()[:3] + (-1,)) + mask = mask.contiguous().view(mask.size()[:3] + (-1,)) + + # Combine the mask with the input tensor + x_masked = torch.where(mask.bool(), x.double(), float("inf")).to(x) + + # Sort the masked tensor along the last dimension + x_sorted, _ = torch.sort(x_masked, dim=-1) + + # Compute the count of non-masked (valid) values + valid_count = mask.sum(dim=-1) + + # Calculate the index of the median value for each pooling window + median_idx = ((valid_count - 1) // 2).clamp(min=0) + + # Gather the median values using the calculated indices + median_pooled = x_sorted.gather(-1, median_idx.unsqueeze(-1).long()).squeeze(-1) + + # Fill 
infinite values with NaNs + median_pooled[torch.isinf(median_pooled)] = float("nan") + + return median_pooled.squeeze(1) + + +############################################################################### +# Utilities +############################################################################### + + +def nanfilter(signals, win_length, filter_fn): + """Filters a sequence, ignoring nan values + + Arguments + signals (torch.tensor (shape=(batch, time))) + The signals to filter + win_length + The size of the analysis window + filter_fn (function) + The function to use for filtering + + Returns + filtered (torch.tensor (shape=(batch, time))) + """ + # Output buffer + filtered = torch.empty_like(signals) + + # Loop over frames + for i in range(signals.size(1)): + + # Get analysis window bounds + start = max(0, i - win_length // 2) + end = min(signals.size(1), i + win_length // 2 + 1) + + # Apply filter to window + filtered[:, i] = filter_fn(signals[:, start:end]) + + return filtered + + +def nanmean(signals): + """Computes the mean, ignoring nans + + Arguments + signals (torch.tensor [shape=(batch, time)]) + The signals to filter + + Returns + filtered (torch.tensor [shape=(batch, time)]) + """ + signals = signals.clone() + + # Find nans + nans = torch.isnan(signals) + + # Set nans to 0. + signals[nans] = 0. + + # Compute average + return signals.sum(dim=1) / (~nans).float().sum(dim=1) + + +def nanmedian(signals): + """Computes the median, ignoring nans + + Arguments + signals (torch.tensor [shape=(batch, time)]) + The signals to filter + + Returns + filtered (torch.tensor [shape=(batch, time)]) + """ + # Find nans + nans = torch.isnan(signals) + + # Compute median for each slice + medians = [nanmedian1d(signal[~nan]) for signal, nan in zip(signals, nans)] + + # Stack results + return torch.tensor(medians, dtype=signals.dtype, device=signals.device) + + +def nanmedian1d(signal): + """Computes the median. 
If signal is empty, returns torch.nan + + Arguments + signal (torch.tensor [shape=(time,)]) + + Returns + median (torch.tensor [shape=(1,)]) + """ + return torch.median(signal) if signal.numel() else np.nan diff --git a/crepe/load.py b/crepe/load.py new file mode 100644 index 0000000000000000000000000000000000000000..bb5a3c355b31f0495721d6dcfc4fbc57927c4f91 --- /dev/null +++ b/crepe/load.py @@ -0,0 +1,36 @@ +import os + +import numpy as np +import torch +import crepe +from scipy.io import wavfile + + +def audio(filename): + """Load audio from disk""" + sample_rate, audio = wavfile.read(filename) + + # Convert to float32 + if audio.dtype == np.int16: + audio = audio.astype(np.float32) / np.iinfo(np.int16).max + + # PyTorch is not compatible with non-writeable arrays, so we make a copy + return torch.tensor(np.copy(audio))[None], sample_rate + + +def model(device, capacity='full'): + """Preloads model from disk""" + # Bind model and capacity + crepe.infer.capacity = capacity + crepe.infer.model = crepe.Crepe(capacity) + + # Load weights + file = os.path.join(os.path.dirname(__file__), 'assets', f'{capacity}.pth') + crepe.infer.model.load_state_dict( + torch.load(file, map_location=device)) + + # Place on device + crepe.infer.model = crepe.infer.model.to(torch.device(device)) + + # Eval mode + crepe.infer.model.eval() diff --git a/crepe/loudness.py b/crepe/loudness.py new file mode 100644 index 0000000000000000000000000000000000000000..e6f5c4a648b6adfa7c0a0c8988f4ae0bfd7b051d --- /dev/null +++ b/crepe/loudness.py @@ -0,0 +1,78 @@ +import warnings + +import librosa +import numpy as np +import resampy +import torch + +import crepe + + +############################################################################### +# Constants +############################################################################### + + +# Minimum decibel level +MIN_DB = -100. + +# Reference decibel level +REF_DB = 20. + + +############################################################################### +# A-weighted loudness +############################################################################### + + +def a_weighted(audio, sample_rate, hop_length=None, pad=True): + """Retrieve the per-frame loudness""" + # Save device + device = audio.device + + # Default hop length of 10 ms + hop_length = sample_rate // 100 if hop_length is None else hop_length + + # Convert to numpy + audio = audio.detach().cpu().numpy().squeeze(0) + + # Resample + if sample_rate != crepe.SAMPLE_RATE: + audio = resampy.resample(audio, sample_rate, crepe.SAMPLE_RATE) + hop_length = int(hop_length * crepe.SAMPLE_RATE / sample_rate) + + # Cache weights + if not hasattr(a_weighted, 'weights'): + a_weighted.weights = perceptual_weights() + + # Take stft + stft = librosa.stft(audio, + n_fft=crepe.WINDOW_SIZE, + hop_length=hop_length, + win_length=crepe.WINDOW_SIZE, + center=pad, + pad_mode='constant') + + # Compute magnitude on db scale + db = librosa.amplitude_to_db(np.abs(stft)) + + # Apply A-weighting + weighted = db + a_weighted.weights + + # Threshold + weighted[weighted < MIN_DB] = MIN_DB + + # Average over weighted frequencies + return torch.from_numpy(weighted.mean(axis=0)).float().to(device)[None] + + +def perceptual_weights(): + """A-weighted frequency-dependent perceptual loudness weights""" + frequencies = librosa.fft_frequencies(sr=crepe.SAMPLE_RATE, + n_fft=crepe.WINDOW_SIZE) + + # A warning is raised for nearly inaudible frequencies, but it ends up + # defaulting to -100 db. That default is fine for our purposes. 
+ with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + return librosa.A_weighting(frequencies)[:, None] - REF_DB diff --git a/crepe/model.py b/crepe/model.py new file mode 100644 index 0000000000000000000000000000000000000000..e1c1a5b687773211d77e89d096e0e0189014ac54 --- /dev/null +++ b/crepe/model.py @@ -0,0 +1,134 @@ +import functools + +import torch +import torch.nn.functional as F + +import crepe + + +########################################################################### +# Model definition +########################################################################### + + +class Crepe(torch.nn.Module): + """Crepe model definition""" + + def __init__(self, model='full'): + super().__init__() + + # Model-specific layer parameters + if model == 'full': + in_channels = [1, 1024, 128, 128, 128, 256] + out_channels = [1024, 128, 128, 128, 256, 512] + self.in_features = 2048 + elif model == 'tiny': + in_channels = [1, 128, 16, 16, 16, 32] + out_channels = [128, 16, 16, 16, 32, 64] + self.in_features = 256 + else: + raise ValueError(f'Model {model} is not supported') + + # Shared layer parameters + kernel_sizes = [(512, 1)] + 5 * [(64, 1)] + strides = [(4, 1)] + 5 * [(1, 1)] + + # Overload with eps and momentum conversion given by MMdnn + batch_norm_fn = functools.partial(torch.nn.BatchNorm2d, + eps=0.0010000000474974513, + momentum=0.0) + + # Layer definitions + self.conv1 = torch.nn.Conv2d( + in_channels=in_channels[0], + out_channels=out_channels[0], + kernel_size=kernel_sizes[0], + stride=strides[0]) + self.conv1_BN = batch_norm_fn( + num_features=out_channels[0]) + + self.conv2 = torch.nn.Conv2d( + in_channels=in_channels[1], + out_channels=out_channels[1], + kernel_size=kernel_sizes[1], + stride=strides[1]) + self.conv2_BN = batch_norm_fn( + num_features=out_channels[1]) + + self.conv3 = torch.nn.Conv2d( + in_channels=in_channels[2], + out_channels=out_channels[2], + kernel_size=kernel_sizes[2], + stride=strides[2]) + self.conv3_BN = batch_norm_fn( + num_features=out_channels[2]) + + self.conv4 = torch.nn.Conv2d( + in_channels=in_channels[3], + out_channels=out_channels[3], + kernel_size=kernel_sizes[3], + stride=strides[3]) + self.conv4_BN = batch_norm_fn( + num_features=out_channels[3]) + + self.conv5 = torch.nn.Conv2d( + in_channels=in_channels[4], + out_channels=out_channels[4], + kernel_size=kernel_sizes[4], + stride=strides[4]) + self.conv5_BN = batch_norm_fn( + num_features=out_channels[4]) + + self.conv6 = torch.nn.Conv2d( + in_channels=in_channels[5], + out_channels=out_channels[5], + kernel_size=kernel_sizes[5], + stride=strides[5]) + self.conv6_BN = batch_norm_fn( + num_features=out_channels[5]) + + self.classifier = torch.nn.Linear( + in_features=self.in_features, + out_features=crepe.PITCH_BINS) + + def forward(self, x, embed=False): + # Forward pass through first five layers + x = self.embed(x) + + if embed: + return x + + # Forward pass through layer six + x = self.layer(x, self.conv6, self.conv6_BN) + + # shape=(batch, self.in_features) + x = x.permute(0, 2, 1, 3).reshape(-1, self.in_features) + + # Compute logits + return torch.sigmoid(self.classifier(x)) + + ########################################################################### + # Forward pass utilities + ########################################################################### + + def embed(self, x): + """Map input audio to pitch embedding""" + # shape=(batch, 1, 1024, 1) + x = x[:, None, :, None] + + # Forward pass through first five layers + x = self.layer(x, self.conv1, 
self.conv1_BN, (0, 0, 254, 254)) + x = self.layer(x, self.conv2, self.conv2_BN) + x = self.layer(x, self.conv3, self.conv3_BN) + x = self.layer(x, self.conv4, self.conv4_BN) + x = self.layer(x, self.conv5, self.conv5_BN) + + return x + + def layer(self, x, conv, batch_norm, padding=(0, 0, 31, 32)): + """Forward pass through one layer""" + x = F.pad(x, padding) + x = conv(x) + x = F.relu(x) + x = batch_norm(x) + return F.max_pool2d(x, (2, 1), (2, 1)) diff --git a/crepe/threshold.py b/crepe/threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..85d6ec9bef2d03b0eb101c6b7fa4f3464cdf1554 --- /dev/null +++ b/crepe/threshold.py @@ -0,0 +1,134 @@ +import numpy as np +import torch + +import crepe + + +############################################################################### +# Pitch thresholding methods +############################################################################### + + +class At: + """Simple thresholding at a specified probability value""" + + def __init__(self, value): + self.value = value + + def __call__(self, pitch, periodicity): + # Make a copy to prevent in-place modification + pitch = torch.clone(pitch) + + # Threshold + pitch[periodicity < self.value] = crepe.UNVOICED + return pitch + + +class Hysteresis: + """Hysteresis thresholding""" + + def __init__(self, + lower_bound=.19, + upper_bound=.31, + width=.2, + stds=1.7, + return_threshold=False): + self.lower_bound = lower_bound + self.upper_bound = upper_bound + self.width = width + self.stds = stds + self.return_threshold = return_threshold + + def __call__(self, pitch, periodicity): + # Save output device + device = pitch.device + + # Perform hysteresis in log-2 space + pitch = torch.log2(pitch).detach().flatten().cpu().numpy() + + # Flatten periodicity + periodicity = periodicity.flatten().cpu().numpy() + + # Ignore confidently unvoiced pitch + pitch[periodicity < self.lower_bound] = crepe.UNVOICED + + # Whiten pitch + mean, std = np.nanmean(pitch), np.nanstd(pitch) + pitch = (pitch - mean) / std + + # Require high confidence to make predictions far from the mean + parabola = self.width * pitch ** 2 - self.width * self.stds ** 2 + threshold = \ + self.lower_bound + np.clip(parabola, 0, 1 - self.lower_bound) + threshold[np.isnan(threshold)] = self.lower_bound + + # Apply hysteresis to prevent short, unconfident voiced regions + i = 0 + while i < len(periodicity) - 1: + + # Detect unvoiced to voiced transition + if periodicity[i] < threshold[i] and \ + periodicity[i + 1] > threshold[i + 1]: + + # Grow region until next unvoiced or end of array + start, end, keep = i + 1, i + 1, False + while end < len(periodicity) and \ + periodicity[end] > threshold[end]: + if periodicity[end] > self.upper_bound: + keep = True + end += 1 + + # Force unvoiced if we didn't pass the confidence required by + # the hysteresis + if not keep: + threshold[start:end] = 1 + + i = end + + else: + i += 1 + + # Remove pitch with low periodicity + pitch[periodicity < threshold] = crepe.UNVOICED + + # Unwhiten + pitch = pitch * std + mean + + # Convert to Hz + pitch = torch.tensor(2 ** pitch, device=device)[None, :] + + # Optionally return threshold + if self.return_threshold: + return pitch, torch.tensor(threshold, device=device) + + return pitch + + +############################################################################### +# Periodicity thresholding methods +############################################################################### + + +class Silence: + """Set periodicity to zero in silent regions""" + + 
def __init__(self, value=-60): + self.value = value + + def __call__(self, + periodicity, + audio, + sample_rate=crepe.SAMPLE_RATE, + hop_length=None, + pad=True): + # Don't modify in-place + periodicity = torch.clone(periodicity) + + # Compute loudness + loudness = crepe.loudness.a_weighted( + audio, sample_rate, hop_length, pad) + + # Threshold silence + periodicity[loudness < self.value] = 0. + + return periodicity diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..6986a80645a34b56943e444e730047d256317745 --- /dev/null +++ b/environment.yml @@ -0,0 +1,201 @@ +name: so-vits-svc-5.0 +channels: + - pytorch + - anaconda + - nvidia + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - blas=1.0=mkl + - brotlipy=0.7.0=py311h5eee18b_1002 + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2023.08.22=h06a4308_0 + - certifi=2023.7.22=py311h06a4308_0 + - cffi=1.15.1=py311h5eee18b_3 + - cryptography=41.0.3=py311hdda0065_0 + - cuda-cudart=11.7.99=0 + - cuda-cupti=11.7.101=0 + - cuda-libraries=11.7.1=0 + - cuda-nvrtc=11.7.99=0 + - cuda-nvtx=11.7.91=0 + - cuda-runtime=11.7.1=0 + - cudatoolkit=11.3.1=h2bc3f7f_2 + - ffmpeg=4.3=hf484d3e_0 + - filelock=3.9.0=py311h06a4308_0 + - freetype=2.12.1=h4a9f257_0 + - giflib=5.2.1=h5eee18b_3 + - gmp=6.2.1=h295c915_3 + - gmpy2=2.1.2=py311hc9b5ff0_0 + - gnutls=3.6.15=he1e5248_0 + - idna=3.4=py311h06a4308_0 + - intel-openmp=2023.1.0=hdb19cb5_46305 + - jinja2=3.1.2=py311h06a4308_0 + - jpeg=9e=h5eee18b_1 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libcublas=11.10.3.66=0 + - libcufft=10.7.2.124=h4fbf590_0 + - libcufile=1.7.2.10=0 + - libcurand=10.3.3.141=0 + - libcusolver=11.4.0.1=0 + - libcusparse=11.7.4.91=0 + - libdeflate=1.17=h5eee18b_0 + - libffi=3.4.4=h6a678d5_0 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h7f8727e_2 + - libidn2=2.3.4=h5eee18b_0 + - libnpp=11.7.4.75=0 + - libnvjpeg=11.8.0.2=0 + - libpng=1.6.39=h5eee18b_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.5.1=h6a678d5_0 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.41.5=h5eee18b_0 + - libwebp=1.2.4=h11a3e52_1 + - libwebp-base=1.2.4=h5eee18b_1 + - lz4-c=1.9.4=h6a678d5_0 + - markupsafe=2.1.1=py311h5eee18b_0 + - mkl=2023.1.0=h213fc3f_46343 + - mkl-service=2.4.0=py311h5eee18b_1 + - mkl_fft=1.3.6=py311ha02d727_1 + - mkl_random=1.2.2=py311ha02d727_1 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpmath=1.3.0=py311h06a4308_0 + - ncurses=6.4=h6a678d5_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.1=py311h06a4308_0 + - numpy-base=1.25.2=py311hf175353_0 + - openh264=2.1.1=h4ff587b_0 + - openssl=3.0.10=h7f8727e_2 + - pip=23.2.1=py311h06a4308_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pyopenssl=23.2.0=py311h06a4308_0 + - pysocks=1.7.1=py311h06a4308_0 + - python=3.11.5=h955ad1f_0 + - pytorch=2.0.1=py3.11_cuda11.7_cudnn8.5.0_0 + - pytorch-cuda=11.7=h778d358_5 + - pytorch-mutex=1.0=cuda + - readline=8.2=h5eee18b_0 + - requests=2.31.0=py311h06a4308_0 + - setuptools=68.0.0=py311h06a4308_0 + - sqlite=3.41.2=h5eee18b_0 + - sympy=1.11.1=py311h06a4308_0 + - tbb=2021.8.0=hdb19cb5_0 + - tk=8.6.12=h1ccaba5_0 + - torchaudio=2.0.2=py311_cu117 + - torchtriton=2.0.0=py311 + - torchvision=0.15.2=py311_cu117 + - typing_extensions=4.7.1=py311h06a4308_0 + - urllib3=1.26.16=py311h06a4308_0 + - wheel=0.38.4=py311h06a4308_0 + - xz=5.4.2=h5eee18b_0 + - zlib=1.2.13=h5eee18b_0 + - zstd=1.5.5=hc292b87_0 + - 
pip: + - absl-py==1.4.0 + - aiofiles==23.2.1 + - aiohttp==3.8.5 + - aiosignal==1.3.1 + - altair==5.1.1 + - annotated-types==0.5.0 + - antlr4-python3-runtime==4.9.3 + - anyio==3.7.1 + - async-timeout==4.0.3 + - attrs==23.1.0 + - audioread==3.0.0 + - cachetools==5.3.1 + - chardet==5.2.0 + - charset-normalizer==3.2.0 + - click==8.1.7 + - contourpy==1.1.0 + - cycler==0.11.0 + - cython==3.0.2 + - decorator==5.1.1 + - fastapi==0.103.1 + - ffmpy==0.3.1 + - fonttools==4.42.1 + - frozenlist==1.4.0 + - fsspec==2023.9.0 + - google-auth==2.23.0 + - google-auth-oauthlib==1.0.0 + - gradio==3.36.1 + - gradio-client==0.5.0 + - grpcio==1.58.0 + - h11==0.14.0 + - httpcore==0.18.0 + - httpx==0.25.0 + - huggingface-hub==0.17.1 + - joblib==1.3.2 + - jsonschema==4.19.0 + - jsonschema-specifications==2023.7.1 + - kiwisolver==1.4.5 + - lazy-loader==0.3 + - librosa==0.10.1 + - linkify-it-py==2.0.2 + - llvmlite==0.40.1 + - markdown==3.4.4 + - markdown-it-py==2.2.0 + - matplotlib==3.7.3 + - mdit-py-plugins==0.3.3 + - mdurl==0.1.2 + - msgpack==1.0.5 + - multidict==6.0.4 + - numba==0.57.1 + - numpy==1.24.0 + - oauthlib==3.2.2 + - omegaconf==2.3.0 + - orjson==3.9.7 + - packaging==23.1 + - pandas==2.1.0 + - pillow==10.0.0 + - platformdirs==3.10.0 + - pooch==1.7.0 + - protobuf==4.24.3 + - pyasn1==0.5.0 + - pyasn1-modules==0.3.0 + - pydantic==2.3.0 + - pydantic-core==2.6.3 + - pydub==0.25.1 + - pygments==2.16.1 + - pyparsing==3.1.1 + - python-dateutil==2.8.2 + - python-multipart==0.0.6 + - pytz==2023.3.post1 + - pyworld==0.3.4 + - pyyaml==6.0.1 + - referencing==0.30.2 + - regex==2023.8.8 + - requests-oauthlib==1.3.1 + - resampy==0.4.2 + - rpds-py==0.10.2 + - rsa==4.9 + - ruamel-yaml==0.17.32 + - ruamel-yaml-clib==0.2.7 + - safetensors==0.3.3 + - scikit-learn==1.3.0 + - scipy==1.11.2 + - semantic-version==2.10.0 + - six==1.16.0 + - sniffio==1.3.0 + - soundfile==0.12.1 + - soxr==0.3.6 + - starlette==0.27.0 + - tensorboard==2.14.0 + - tensorboard-data-server==0.7.1 + - threadpoolctl==3.2.0 + - tokenizers==0.13.3 + - toolz==0.12.0 + - tqdm==4.66.1 + - transformers==4.33.1 + - tzdata==2023.3 + - uc-micro-py==1.0.2 + - uvicorn==0.23.2 + - websockets==11.0.3 + - werkzeug==2.3.7 + - yarl==1.9.2 diff --git a/feature_retrieval/__init__.py b/feature_retrieval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..80bd7dbf8d83e7c6b7659ec3bb5b896e9e251888 --- /dev/null +++ b/feature_retrieval/__init__.py @@ -0,0 +1,4 @@ +from .index import * +from .train import * +from .transform import * +from .retrieval import * diff --git a/feature_retrieval/index.py b/feature_retrieval/index.py new file mode 100644 index 0000000000000000000000000000000000000000..96c770196c6c4cb2437dd53ec0f4194c60f8f112 --- /dev/null +++ b/feature_retrieval/index.py @@ -0,0 +1,166 @@ +import abc +import logging +import math +import time +from pathlib import Path +from typing import TypeVar, Generic, cast, Any + +import numpy as np +import numpy.typing as npt + +from tqdm import tqdm + +import faiss +from faiss import IndexIVF, Index + +logger = logging.getLogger(__name__) + +T = TypeVar("T", bound=Index) +NumpyArray = npt.NDArray[np.float32] + + +class FaissFeatureIndex(Generic[T], abc.ABC): + def __init__(self, index: T) -> None: + self._index = index + + def save(self, filepath: Path, rewrite: bool = False) -> None: + if filepath.exists() and not rewrite: + raise FileExistsError(f"index already exists by path {filepath}") + faiss.write_index(self._index, str(filepath)) + + +class FaissRetrievableFeatureIndex(FaissFeatureIndex[Index], 
abc.ABC): + """retrieve voice feature vectors by faiss index""" + + def __init__(self, index: T, ratio: float, n_nearest_vectors: int) -> None: + super().__init__(index=index) + if index.metric_type != self.supported_distance: + raise ValueError(f"index metric type {index.metric_type=} is unsupported {self.supported_distance=}") + + if 1 > n_nearest_vectors: + raise ValueError("n-retrieval-vectors must be gte 1") + self._n_nearest = n_nearest_vectors + + if 0 > ratio > 1: + raise ValueError(f"{ratio=} must be in rage (0, 1)") + self._ratio = ratio + + @property + @abc.abstractmethod + def supported_distance(self) -> Any: + raise NotImplementedError + + @abc.abstractmethod + def _weight_nearest_vectors(self, nearest_vectors: NumpyArray, scores: NumpyArray) -> NumpyArray: + raise NotImplementedError + + def retriv(self, features: NumpyArray) -> NumpyArray: + # use method search_and_reconstruct instead of recreating the whole matrix + scores, _, nearest_vectors = self._index.search_and_reconstruct(features, k=self._n_nearest) + weighted_nearest_vectors = self._weight_nearest_vectors(nearest_vectors, scores) + retriv_vector = (1 - self._ratio) * features + self._ratio * weighted_nearest_vectors + return retriv_vector + + +class FaissRVCRetrievableFeatureIndex(FaissRetrievableFeatureIndex): + """ + retrieve voice encoded features with algorith from RVC repository + https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI + """ + + @property + def supported_distance(self) -> Any: + return faiss.METRIC_L2 + + def _weight_nearest_vectors(self, nearest_vectors: NumpyArray, scores: NumpyArray) -> NumpyArray: + """ + magic code from original RVC + https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/86ed98aacaa8b2037aad795abd11cdca122cf39f/vc_infer_pipeline.py#L213C18-L213C19 + + nearest_vectors dim (n_nearest, vector_dim) + scores dim (num_vectors, n_nearest) + """ + logger.debug("shape: nv=%s sc=%s", nearest_vectors.shape, scores.shape) + weight = np.square(1 / scores) + weight /= weight.sum(axis=1, keepdims=True) + weight = np.expand_dims(weight, axis=2) + weighted_nearest_vectors = np.sum(nearest_vectors * weight, axis=1) + logger.debug( + "shape: nv=%s weight=%s weight_nearest=%s", + nearest_vectors.shape, + weight.shape, + weighted_nearest_vectors.shape, + ) + return cast(NumpyArray, weighted_nearest_vectors) + + +class FaissIVFTrainableFeatureIndex(FaissFeatureIndex[IndexIVF]): + """IVF faiss index that can train and add feature vectors""" + + def __init__(self, index: IndexIVF, batch_size: int) -> None: + super().__init__(index=index) + self._batch_size = batch_size + + @property + def _trained_index(self) -> IndexIVF: + if not self._index.is_trained: + raise RuntimeError("index needs to be trained first") + return self._index + + @property + def _not_trained_index(self) -> IndexIVF: + if self._index.is_trained: + raise RuntimeError("index is already trained") + return self._index + + def _batch_count(self, feature_matrix: NumpyArray) -> int: + return math.ceil(feature_matrix.shape[0] / self._batch_size) + + def _split_matrix_by_batch(self, feature_matrix: NumpyArray) -> list[NumpyArray]: + return np.array_split(feature_matrix, indices_or_sections=self._batch_count(feature_matrix), axis=0) + + def _train_index(self, train_feature_matrix: NumpyArray) -> None: + start = time.monotonic() + self._not_trained_index.train(train_feature_matrix) + took = time.monotonic() - start + logger.info("index is trained. 
Took %.2f seconds", took) + + def add_to_index(self, feature_matrix: NumpyArray) -> None: + n_batches = self._batch_count(feature_matrix) + logger.info("adding %s batches to index", n_batches) + start = time.monotonic() + for batch in tqdm(self._split_matrix_by_batch(feature_matrix), total=n_batches): + self._trained_index.add(batch) + took = time.monotonic() - start + logger.info("all batches added. Took %.2f seconds", took) + + def add_with_train(self, feature_matrix: NumpyArray) -> None: + self._train_index(feature_matrix) + self.add_to_index(feature_matrix) + + +class FaissIVFFlatTrainableFeatureIndexBuilder: + def __init__(self, batch_size: int, distance: int) -> None: + self._batch_size = batch_size + self._distance = distance + + def _build_index(self, num_vectors: int, vector_dim: int) -> IndexIVF: + n_ivf = min(int(16 * np.sqrt(num_vectors)), num_vectors // 39) + factory_string = f"IVF{n_ivf},Flat" + index = faiss.index_factory(vector_dim, factory_string, self._distance) + logger.debug('faiss index built by string "%s" and dimension %s', factory_string, vector_dim) + index_ivf = faiss.extract_index_ivf(index) + index_ivf.nprobe = 1 + return index + + def build(self, num_vectors: int, vector_dim: int) -> FaissIVFTrainableFeatureIndex: + return FaissIVFTrainableFeatureIndex( + index=self._build_index(num_vectors, vector_dim), + batch_size=self._batch_size, + ) + + +def load_retrieve_index(filepath: Path, ratio: float, n_nearest_vectors: int) -> FaissRetrievableFeatureIndex: + return FaissRVCRetrievableFeatureIndex( + index=faiss.read_index(str(filepath)), ratio=ratio, n_nearest_vectors=n_nearest_vectors + ) diff --git a/feature_retrieval/retrieval.py b/feature_retrieval/retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..e2145ab20f1b07c3f0cde8c3f591b8571253c9b6 --- /dev/null +++ b/feature_retrieval/retrieval.py @@ -0,0 +1,44 @@ +import abc +import logging + +import torch + +from feature_retrieval import FaissRetrievableFeatureIndex + +logger = logging.getLogger(__name__) + + +class IRetrieval(abc.ABC): + @abc.abstractmethod + def retriv_whisper(self, vec: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + @abc.abstractmethod + def retriv_hubert(self, vec: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + +class DummyRetrieval(IRetrieval): + def retriv_whisper(self, vec: torch.FloatTensor) -> torch.FloatTensor: + logger.debug("start dummy retriv whisper") + return vec.clone().to(torch.device("cpu")) + + def retriv_hubert(self, vec: torch.FloatTensor) -> torch.FloatTensor: + logger.debug("start dummy retriv hubert") + return vec.clone().to(torch.device("cpu")) + + +class FaissIndexRetrieval(IRetrieval): + def __init__(self, hubert_index: FaissRetrievableFeatureIndex, whisper_index: FaissRetrievableFeatureIndex) -> None: + self._hubert_index = hubert_index + self._whisper_index = whisper_index + + def retriv_whisper(self, vec: torch.Tensor) -> torch.Tensor: + logger.debug("start retriv whisper") + np_vec = self._whisper_index.retriv(vec.numpy()) + return torch.from_numpy(np_vec) + + def retriv_hubert(self, vec: torch.Tensor) -> torch.Tensor: + logger.debug("start retriv hubert") + np_vec = self._hubert_index.retriv(vec.numpy()) + return torch.from_numpy(np_vec) diff --git a/feature_retrieval/train.py b/feature_retrieval/train.py new file mode 100644 index 0000000000000000000000000000000000000000..4a565804afe32175b2e5a83f747447773f5f56b4 --- /dev/null +++ b/feature_retrieval/train.py @@ -0,0 +1,37 @@ +from pathlib import Path +from 
typing import cast + +import numpy as np + +from feature_retrieval import NumpyArray +from feature_retrieval.index import FaissIVFFlatTrainableFeatureIndexBuilder, logger +from feature_retrieval.transform import IFeatureMatrixTransform + + +def train_index( + features_path: Path, + index_save_filepath: Path, + index_builder: FaissIVFFlatTrainableFeatureIndexBuilder, + feature_transform: IFeatureMatrixTransform, +) -> None: + logger.info("start getting feature vectors from %s", features_path.absolute()) + feature_matrix = get_feature_matrix(features_path) + logger.debug("fetched %s features", feature_matrix.shape[0]) + + logger.info("apply transform to feature matrix") + feature_matrix = feature_transform.transform(feature_matrix) + num_vectors, vector_dim = feature_matrix.shape + logger.debug("features transformed. Current features %s", num_vectors) + + feature_index = index_builder.build(num_vectors=num_vectors, vector_dim=vector_dim) + logger.info("adding features to index with training") + + feature_index.add_with_train(feature_matrix) + feature_index.save(index_save_filepath) + logger.info("index saved to %s", index_save_filepath.absolute()) + + +def get_feature_matrix(features_dir_path: Path) -> NumpyArray: + matrices = [np.load(str(features_path)) for features_path in features_dir_path.rglob("*.npy")] + feature_matrix = np.concatenate(matrices, axis=0) + return cast(NumpyArray, feature_matrix) diff --git a/feature_retrieval/transform.py b/feature_retrieval/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..2c4ecf4e817ef17631fddb065e3fd0742aad4b44 --- /dev/null +++ b/feature_retrieval/transform.py @@ -0,0 +1,72 @@ +import abc +import logging +from typing import cast, Callable + +from sklearn.cluster import MiniBatchKMeans + +from feature_retrieval.index import NumpyArray + + +logger = logging.getLogger(__name__) + + +class IFeatureMatrixTransform: + """Interface for transform encoded voice feature from (n_features,vector_dim) to (m_features,vector_dim)""" + + @abc.abstractmethod + def transform(self, matrix: NumpyArray) -> NumpyArray: + """transform given feature matrix from (n_features,vector_dim) to (m_features,vector_dim)""" + raise NotImplementedError + + +class DummyFeatureTransform(IFeatureMatrixTransform): + """do nothing""" + + def transform(self, matrix: NumpyArray) -> NumpyArray: + return matrix + + +class MinibatchKmeansFeatureTransform(IFeatureMatrixTransform): + """replaces number of examples with k-means centroids using minibatch algorythm""" + + def __init__(self, n_clusters: int, n_parallel: int) -> None: + self._n_clusters = n_clusters + self._n_parallel = n_parallel + + @property + def _batch_size(self) -> int: + return self._n_parallel * 256 + + def transform(self, matrix: NumpyArray) -> NumpyArray: + """transform given feature matrix from (n_features,vector_dim) to (n_clusters,vector_dim)""" + cluster = MiniBatchKMeans( + n_clusters=self._n_clusters, + verbose=True, + batch_size=self._batch_size, + compute_labels=False, + init="k-means++", + ) + return cast(NumpyArray, cluster.fit(matrix).cluster_centers_) + + +class OnConditionFeatureTransform(IFeatureMatrixTransform): + """call given transform if condition is True else call otherwise transform""" + + def __init__( + self, + condition: Callable[[NumpyArray], bool], + on_condition: IFeatureMatrixTransform, + otherwise: IFeatureMatrixTransform, + ) -> None: + self._condition = condition + self._on_condition = on_condition + self._otherwise = otherwise + + def transform(self, matrix: 
NumpyArray) -> NumpyArray: + if self._condition(matrix): + transform_name = self._on_condition.__class__.__name__ + logger.info(f"pass condition. Transform by rule {transform_name}") + return self._on_condition.transform(matrix) + transform_name = self._otherwise.__class__.__name__ + logger.info(f"condition is not passed. Transform by rule {transform_name}") + return self._otherwise.transform(matrix) diff --git a/hubert/LICENSE.txt b/hubert/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..6eb2af050447968cc32481fcfe67b5a4c6cdc69e --- /dev/null +++ b/hubert/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Benjamin van Niekerk + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/hubert/__init__.py b/hubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/hubert/hubert_model.py b/hubert/hubert_model.py new file mode 100644 index 0000000000000000000000000000000000000000..7fb642d89b07ca60792debab18e3454f52d8f357 --- /dev/null +++ b/hubert/hubert_model.py @@ -0,0 +1,222 @@ +import copy +import random +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as t_func +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + + +class Hubert(nn.Module): + def __init__(self, num_label_embeddings: int = 100, mask: bool = True): + super().__init__() + self._mask = mask + self.feature_extractor = FeatureExtractor() + self.feature_projection = FeatureProjection() + self.positional_embedding = PositionalConvEmbedding() + self.norm = nn.LayerNorm(768) + self.dropout = nn.Dropout(0.1) + self.encoder = TransformerEncoder( + nn.TransformerEncoderLayer( + 768, 12, 3072, activation="gelu", batch_first=True + ), + 12, + ) + self.proj = nn.Linear(768, 256) + + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) + self.label_embedding = nn.Embedding(num_label_embeddings, 256) + + def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + mask = None + if self.training and self._mask: + mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) + x[mask] = self.masked_spec_embed.to(x.dtype) + return x, mask + + def encode( + self, x: torch.Tensor, layer: Optional[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = self.feature_extractor(x) + x = self.feature_projection(x.transpose(1, 2)) + x, mask = self.mask(x) + x = x + self.positional_embedding(x) + x = self.dropout(self.norm(x)) + x = 
self.encoder(x, output_layer=layer) + return x, mask + + def logits(self, x: torch.Tensor) -> torch.Tensor: + logits = torch.cosine_similarity( + x.unsqueeze(2), + self.label_embedding.weight.unsqueeze(0).unsqueeze(0), + dim=-1, + ) + return logits / 0.1 + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x, mask = self.encode(x) + x = self.proj(x) + logits = self.logits(x) + return logits, mask + + +class HubertSoft(Hubert): + def __init__(self): + super().__init__() + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.Tensor: + wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav) + return self.proj(x) + + +class FeatureExtractor(nn.Module): + def __init__(self): + super().__init__() + self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) + self.norm0 = nn.GroupNorm(512, 512) + self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) + self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = t_func.gelu(self.norm0(self.conv0(x))) + x = t_func.gelu(self.conv1(x)) + x = t_func.gelu(self.conv2(x)) + x = t_func.gelu(self.conv3(x)) + x = t_func.gelu(self.conv4(x)) + x = t_func.gelu(self.conv5(x)) + x = t_func.gelu(self.conv6(x)) + return x + + +class FeatureProjection(nn.Module): + def __init__(self): + super().__init__() + self.norm = nn.LayerNorm(512) + self.projection = nn.Linear(512, 768) + self.dropout = nn.Dropout(0.1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.norm(x) + x = self.projection(x) + x = self.dropout(x) + return x + + +class PositionalConvEmbedding(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + 768, + 768, + kernel_size=128, + padding=128 // 2, + groups=16, + ) + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x.transpose(1, 2)) + x = t_func.gelu(x[:, :, :-1]) + return x.transpose(1, 2) + + +class TransformerEncoder(nn.Module): + def __init__( + self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int + ) -> None: + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for _ in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + self, + src: torch.Tensor, + mask: torch.Tensor = None, + src_key_padding_mask: torch.Tensor = None, + output_layer: Optional[int] = None, + ) -> torch.Tensor: + output = src + for layer in self.layers[:output_layer]: + output = layer( + output, src_mask=mask, src_key_padding_mask=src_key_padding_mask + ) + return output + + +def _compute_mask( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + device: torch.device, + min_masks: int = 0, +) -> torch.Tensor: + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + ) + + # compute number of masked spans in batch + num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) + num_masked_spans = max(num_masked_spans, min_masks) + + # make sure 
num masked indices <= sequence_length + if num_masked_spans * mask_length > sequence_length: + num_masked_spans = sequence_length // mask_length + + # SpecAugment mask to fill + mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) + + # uniform distribution to sample from, make sure that offset samples are < sequence_length + uniform_dist = torch.ones( + (batch_size, sequence_length - (mask_length - 1)), device=device + ) + + # get random indices to mask + mask_indices = torch.multinomial(uniform_dist, num_masked_spans) + + # expand masked indices to masked spans + mask_indices = ( + mask_indices.unsqueeze(dim=-1) + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + offsets = ( + torch.arange(mask_length, device=device)[None, None, :] + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + mask_idxs = mask_indices + offsets + + # scatter indices to mask + mask = mask.scatter(1, mask_idxs, True) + + return mask + + +def hubert_soft( + path: str, +) -> HubertSoft: + r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + path (str): path of a pretrained model + """ + hubert = HubertSoft() + checkpoint = torch.load(path) + consume_prefix_in_state_dict_if_present(checkpoint, "module.") + hubert.load_state_dict(checkpoint) + hubert.eval() + return hubert diff --git a/hubert/inference.py b/hubert/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..ac4fbeec8cb636569aef7aeb938ef709b115ab4a --- /dev/null +++ b/hubert/inference.py @@ -0,0 +1,67 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import numpy as np +import argparse +import torch +import librosa + +from hubert import hubert_model + + +def load_audio(file: str, sr: int = 16000): + x, sr = librosa.load(file, sr=sr) + return x + + +def load_model(path, device): + model = hubert_model.hubert_soft(path) + model.eval() + if not (device == "cpu"): + model.half() + model.to(device) + return model + + +def pred_vec(model, wavPath, vecPath, device): + audio = load_audio(wavPath) + audln = audio.shape[0] + vec_a = [] + idx_s = 0 + while (idx_s + 20 * 16000 < audln): + feats = audio[idx_s:idx_s + 20 * 16000] + feats = torch.from_numpy(feats).to(device) + feats = feats[None, None, :] + if not (device == "cpu"): + feats = feats.half() + with torch.no_grad(): + vec = model.units(feats).squeeze().data.cpu().float().numpy() + vec_a.extend(vec) + idx_s = idx_s + 20 * 16000 + if (idx_s < audln): + feats = audio[idx_s:audln] + feats = torch.from_numpy(feats).to(device) + feats = feats[None, None, :] + if not (device == "cpu"): + feats = feats.half() + with torch.no_grad(): + vec = model.units(feats).squeeze().data.cpu().float().numpy() + # print(vec.shape) # [length, dim=256] hop=320 + vec_a.extend(vec) + np.save(vecPath, vec_a, allow_pickle=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-v", "--vec", help="vec", dest="vec", required=True) + args = parser.parse_args() + print(args.wav) + print(args.vec) + + wavPath = args.wav + vecPath = args.vec + + device = "cuda" if torch.cuda.is_available() else "cpu" + hubert = load_model(os.path.join( + "hubert_pretrain", "hubert-soft-0d54a1f4.pt"), device) + pred_vec(hubert, wavPath, vecPath, device) diff 
--git a/hubert_pretrain/.DS_Store b/hubert_pretrain/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/hubert_pretrain/.DS_Store differ diff --git a/hubert_pretrain/README.md b/hubert_pretrain/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dbecfeb8bdf2b4120f3331252dd123ffba46e30c --- /dev/null +++ b/hubert_pretrain/README.md @@ -0,0 +1,3 @@ +Path for: + + hubert-soft-0d54a1f4.pt \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..276ba5e2edadb1eb5bffc5f4209d26136fe2b63d --- /dev/null +++ b/main.py @@ -0,0 +1,32 @@ +import gradio as gr + +def click_test(): + """1から10までのランダムな数値を生成する関数""" + import random + number = random.randint(1, 10) + return f"生成された数値: {number}" + +# Gradio インターフェースの作成 +with gr.Blocks() as demo: + gr.Markdown("# ランダム数値ジェネレーター") + gr.Markdown("下のボタンをクリックすると1から10までのランダムな数値が生成されます。") + + # 結果表示用のテキストボックス + output_text = gr.Text(label="結果") + + # カスタムボタンの追加 + generate_btn = gr.Button( + value="数値を生成する", # ボタンのテキスト + variant="primary", # ボタンのスタイル + size="lg" # ボタンのサイズ + ) + + # ボタンクリック時のイベント設定 + generate_btn.click( + fn=click_test, + outputs=output_text + ) + +# アプリケーションの起動 +if __name__ == "__main__": + demo.launch() \ No newline at end of file diff --git a/pitch/__init__.py b/pitch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bc418142ecbe692527dfe1f1768205a50726dd2f --- /dev/null +++ b/pitch/__init__.py @@ -0,0 +1 @@ +from .inference import load_csv_pitch \ No newline at end of file diff --git a/pitch/core/LICENCE b/pitch/core/LICENCE new file mode 100644 index 0000000000000000000000000000000000000000..7e7c9386da890afff77de85d059a03ea6139865c --- /dev/null +++ b/pitch/core/LICENCE @@ -0,0 +1,25 @@ +MIT License + +Copyright (c) 2022 Sebastian Rosenzweig, Simon Schwär, Meinard Müller, International Audio Laboratories Erlangen, Germany. +We thank the German Research Foundation (DFG) for various research grants that +allow us for conducting fundamental research in music processing. +The International Audio Laboratories Erlangen are a joint institution of the +Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU) and Fraunhofer +Institute for Integrated Circuits IIS. + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/pitch/core/README.md b/pitch/core/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f75171a8bf86370f212f0a6ae508c34a7ca9c421 --- /dev/null +++ b/pitch/core/README.md @@ -0,0 +1,41 @@ +This repository contains a Python package called libf0 which provides open-source implementations for four popular model-based F0-estimation approaches, YIN (Cheveigné & Kawahara, 2002), pYIN (Mauch & Dixon, 2014), an approach inspired by Melodia (Salamon & Gómez, 2012), and SWIPE (Camacho & Harris, 2008). + +If you use the libf0 in your research, please consider the following references. + +## References + +Sebastian Rosenzweig, Simon Schwär, and Meinard Müller. +[A Python Library for Fundamental Frequency Estimation.](https://archives.ismir.net/ismir2022/latebreaking/000003.pdf) +In Late Breaking Demos of the International Society for Music Information Retrieval Conference (ISMIR), Bengaluru, India, 2022. + +Alain de Cheveigné and Hideki Kawahara. +YIN, a fundamental frequency estimator for speech and music. Journal of the Acoustical Society of America (JASA), 111(4):1917–1930, 2002. + +Matthias Mauch and Simon Dixon. +pYIN: A fundamental frequency estimator using probabilistic threshold distributions. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pages 659–663, Florence, Italy, 2014. + +Justin Salamon and Emilia Gómez. +Melody extraction from polyphonic music signals using pitch contour characteristics. IEEE Transactions on Audio, Speech, and Language Processing, 20(6): +1759–1770, 2012. + +Arturo Camacho and John G. Harris. +A sawtooth waveform inspired pitch estimator for speech and music. The Journal of the Acoustical Society of America, 124(3):1638–1652, 2008. + +Meinard Müller. Fundamentals of Music Processing – Using Python and Jupyter Notebooks. Springer Verlag, 2nd edition, 2021. ISBN 978-3-030-69807-2. doi: 10.1007/978-3-030-69808-9. + +## Documentation +There is also an API documentation for libf0: + +https://groupmm.github.io/libf0 + +## Contributing + +We are happy for suggestions and contributions. We would be grateful for either directly contacting us via email (meinard.mueller@audiolabs-erlangen.de) or for creating an issue in our Github repository. Please do not submit a pull request without prior consultation with us. + +## Licence + +The code for this toolbox is published under an MIT licence. + +## Acknowledgements + +This work was supported by the German Research Foundation (MU 2686/13-1, SCHE 280/20-1). We thank Edgar Suárez and Vojtěch Pešek for helping with the implementations. Furthermore, we thank Fatemeh Eftekhar and Maryam Pirmoradi for testing the toolbox. The International Audio Laboratories Erlangen are a joint institution of the Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU) and Fraunhofer Institute for Integrated Circuits IIS. diff --git a/pitch/core/__init__.py b/pitch/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pitch/core/pyin.py b/pitch/core/pyin.py new file mode 100644 index 0000000000000000000000000000000000000000..8045599eaf292b00c67d1783e97e7facacb6492e --- /dev/null +++ b/pitch/core/pyin.py @@ -0,0 +1,481 @@ +""" +| Description: libf0 yin implementation +| Contributors: Sebastian Rosenzweig, Simon Schwär, Edgar Suárez, Meinard Müller +| License: The MIT license, https://opensource.org/licenses/MIT +| This file is part of libf0. 
+""" +import numpy as np +from scipy.special import beta, comb # Scipy library for binomial beta distribution +from scipy.stats import triang # Scipy library for triangular distribution +from .yin import cumulative_mean_normalized_difference_function, parabolic_interpolation +from numba import njit + + +# pYIN estimate computation +def pyin(x, Fs=22050, N=2048, H=256, F_min=55.0, F_max=1760.0, R=10, thresholds=np.arange(0.01, 1, 0.01), + beta_params=[1, 18], absolute_min_prob=0.01, voicing_prob=0.5): + """ + Implementation of the pYIN F0-estimation algorithm. + + .. [#] Matthias Mauch and Simon Dixon. + "PYIN: A fundamental frequency estimator using probabilistic threshold distributions". + IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2014): 659-663. + + Parameters + ---------- + x : ndarray + Audio signal + Fs : int + Sampling rate + N : int + Window size + H : int + Hop size + F_min : float or int + Minimal frequency + F_max : float or int + Maximal frequency + R : int + Frequency resolution given in cents + thresholds : ndarray + Range of thresholds + beta_params : tuple or list + Parameters of beta-distribution in the form [alpha, beta] + absolute_min_prob : float + Prior for voice activity + voicing_prob: float + Prior for transition probability? + Returns + ------- + f0 : ndarray + Estimated F0-trajectory + t : ndarray + Time axis + conf : ndarray + Confidence + """ + + if F_min > F_max: + raise Exception("F_min must be smaller than F_max!") + + if F_min < Fs/N: + raise Exception(f"The condition (F_min >= Fs/N) was not met. With Fs = {Fs}, N = {N} and F_min = {F_min} you have the following options: \n1) Set F_min >= {np.ceil(Fs/N)} Hz. \n2) Set N >= {np.ceil(Fs/F_min).astype(int)}. \n3) Set Fs <= {np.floor(F_min * N)} Hz.") + + x_pad = np.concatenate((np.zeros(N // 2), x, np.zeros(N // 2))) # Add zeros for centered estimates + + # Compute Beta distribution + thr_idxs = np.arange(len(thresholds)) + beta_distr = comb(len(thresholds), thr_idxs) * beta(thr_idxs+beta_params[0], + len(thresholds)-thr_idxs+beta_params[1]) / beta(beta_params[0], + beta_params[1]) + + # YIN with multiple thresholds, yielding observation matrix + B = int(np.log2(F_max / F_min) * (1200 / R)) + F_axis = F_min * np.power(2, np.arange(B) * R / 1200) # for quantizing the estimated F0s + O, rms, p_orig, val_orig = yin_multi_thr(x_pad, Fs=Fs, N=N, H=H, F_min=F_min, F_max=F_max, thresholds=thresholds, + beta_distr=beta_distr, absolute_min_prob=absolute_min_prob, F_axis=F_axis, + voicing_prob=voicing_prob) + + # Transition matrix, using triangular distribution used for pitch transition probabilities + max_step_cents = 50 # Pitch jump can be at most 50 cents from frame to frame + max_step = int(max_step_cents / R) + triang_distr = triang.pdf(np.arange(-max_step, max_step+1), 0.5, scale=2*max_step, loc=-max_step) + A = compute_transition_matrix(B, triang_distr) + + # HMM smoothing + C = np.ones((2*B, 1)) / (2*B) # uniform initialization + f0_idxs = viterbi_log_likelihood(A, C.flatten(), O) # libfmp Viterbi implementation + + # Obtain F0-trajectory + F_axis_extended = np.concatenate((F_axis, np.zeros(len(F_axis)))) + f0 = F_axis_extended[f0_idxs] + + # Suppress low power estimates + f0[0] = 0 # due to algorithmic reasons, we set the first value unvoiced + f0[rms < 0.01] = 0 + + # confidence + O_norm = O[:, np.arange(O.shape[1])]/np.max(O, axis=0) + conf = O_norm[f0_idxs, np.arange(O.shape[1])] + + # Refine estimates by choosing the closest original YIN estimate + refine_estimates = 
True + if refine_estimates: + f0 = refine_estimates_yin(f0, p_orig, val_orig, Fs, R) + + t = np.arange(O.shape[1]) * H / Fs # Time axis + + return f0, t, conf + + +@njit +def refine_estimates_yin(f0, p_orig, val_orig, Fs, tol): + """ + Refine estimates using YIN CMNDF information. + + Parameters + ---------- + f0 : ndarray + F0 in Hz + p_orig : ndarray + Original lag as computed by YIN + val_orig : ndarray + Original CMNDF values as computed by YIN + Fs : float + Sampling frequency + tol : float + Tolerance for refinements in cents + + Returns + ------- + f0_refined : ndarray + Refined F0-trajectory + """ + f0_refined = np.zeros_like(f0) + voiced_idxs = np.where(f0 > 0)[0] + + f_orig = Fs / p_orig + + # find closest original YIN estimate, maximally allowed absolute deviation: R (quantization error) + for m in voiced_idxs: + diff_cents = np.abs(1200 * np.log2(f_orig[:, m] / f0[m])) + candidate_idxs = np.where(diff_cents < tol)[0] + + if not candidate_idxs.size: + f0_refined[m] = f0[m] + else: + f0_refined[m] = f_orig[candidate_idxs[np.argmin(val_orig[candidate_idxs, m])], m] + + return f0_refined + + +@njit +def probabilistic_thresholding(cmndf, thresholds, p_min, p_max, absolute_min_prob, F_axis, Fs, beta_distr, + parabolic_interp=True): + """ + Probabilistic thresholding of the YIN CMNDF. + + Parameters + ---------- + cmndf : ndarray + Cumulative Mean Normalized Difference Function + thresholds : ndarray + Array of thresholds for CMNDF + p_min : float + Period corresponding to the lower frequency bound + p_max : float + Period corresponding to the upper frequency bound + absolute_min_prob : float + Probability to chose absolute minimum + F_axis : ndarray + Frequency axis + Fs : float + Sampling rate + beta_distr : ndarray + Beta distribution that defines mapping between thresholds and probabilities + parabolic_interp : bool + Switch to activate/deactivate parabolic interpolation + + Returns + ------- + O_m : ndarray + Observations for given frame + lag_thr : ndarray + Computed lags for every threshold + val_thr : ndarray + CMNDF values for computed lag + """ + # restrict search range to interval [p_min:p_max] + cmndf[:p_min] = np.inf + cmndf[p_max:] = np.inf + + # find local minima (assuming that cmndf is real in [p_min:p_max], you will always find a minimum, + # at least p_min or p_max) + min_idxs = (np.argwhere((cmndf[1:-1] < cmndf[0:-2]) & (cmndf[1:-1] < cmndf[2:]))).flatten().astype(np.int64) + 1 + + O_m = np.zeros(2 * len(F_axis)) + + # return if no minima are found, e.g., when frame is silence + if min_idxs.size == 0: + return O_m, np.ones_like(thresholds)*p_min, np.ones_like(thresholds) + + # Optional: Parabolic Interpolation of local minima + if parabolic_interp: + # do not interpolate at the boarders, Numba compatible workaround for np.delete() + min_idxs_interp = delete_numba(min_idxs, np.argwhere(min_idxs == p_min)) + min_idxs_interp = delete_numba(min_idxs_interp, np.argwhere(min_idxs_interp == p_max - 1)) + p_corr, cmndf[min_idxs_interp] = parabolic_interpolation(cmndf[min_idxs_interp - 1], + cmndf[min_idxs_interp], + cmndf[min_idxs_interp + 1]) + else: + p_corr = np.zeros_like(min_idxs).astype(np.float64) + + # set p_corr=0 at the boarders (no correction done later) + if min_idxs[0] == p_min: + p_corr = np.concatenate((np.array([0.0]), p_corr)) + + if min_idxs[-1] == p_max - 1: + p_corr = np.concatenate((p_corr, np.array([0.0]))) + + lag_thr = np.zeros_like(thresholds) + val_thr = np.zeros_like(thresholds) + + # loop over all thresholds + for i, threshold in 
enumerate(thresholds): + # minima below absolute threshold + min_idxs_thr = min_idxs[cmndf[min_idxs] < threshold] + + # find first local minimum + if not min_idxs_thr.size: + lag = np.argmin(cmndf) # choose absolute minimum when no local minimum is found + am_prob = absolute_min_prob + val = np.min(cmndf) + else: + am_prob = 1 + lag = np.min(min_idxs_thr) # choose first local minimum + val = cmndf[lag] + + # correct lag + if parabolic_interp: + lag += p_corr[np.argmin(min_idxs_thr)] + + # ensure that lag is in [p_min:p_max] + if lag < p_min: + lag = p_min + elif lag >= p_max: + lag = p_max - 1 + + lag_thr[i] = lag + val_thr[i] = val + + idx = np.argmin(np.abs(1200 * np.log2(F_axis / (Fs / lag)))) # quantize estimated period + O_m[idx] += am_prob * beta_distr[i] # pYIN-Paper, Formula 4/5 + + return O_m, lag_thr, val_thr + + +@njit +def yin_multi_thr(x, Fs, N, H, F_min, F_max, thresholds, beta_distr, absolute_min_prob, F_axis, voicing_prob, + parabolic_interp=True): + """ + Applies YIN multiple times on input audio signals using different thresholds for CMNDF. + + Parameters + ---------- + x : ndarray + Input audio signal + Fs : int + Sampling rate + N : int + Window size + H : int + Hop size + F_min : float + Lower frequency bound + F_max : float + Upper frequency bound + thresholds : ndarray + Array of thresholds + beta_distr : ndarray + Beta distribution that defines mapping between thresholds and probabilities + absolute_min_prob :float + Probability to chose absolute minimum + F_axis : ndarray + Frequency axis + voicing_prob : float + Probability of a frame being voiced + parabolic_interp : bool + Switch to activate/deactivate parabolic interpolation + + Returns + ------- + O : ndarray + Observations based on YIN output + rms : ndarray + Root mean square power + p_orig : ndarray + Original YIN period estimates + val_orig : ndarray + CMNDF values corresponding to original YIN period estimates + """ + + M = int(np.floor((len(x) - N) / H)) + 1 # Compute number of estimates that will be generated + B = len(F_axis) + + p_min = max(int(np.ceil(Fs / F_max)), 1) # period of maximal frequency in frames + p_max = int(np.ceil(Fs / F_min)) # period of minimal frequency in frames + + if p_max > N: + raise Exception("The condition (Fmin >= Fs/N) was not met.") + + rms = np.zeros(M) # RMS Power + O = np.zeros((2 * B, M)) # every voiced state has an unvoiced state (important for later HMM modeling) + p_orig = np.zeros((len(thresholds), M)) + val_orig = np.zeros((len(thresholds), M)) + + for m in range(M): + # Take a frame from input signal + frame = x[m * H:m * H + N] + + # Cumulative Mean Normalized Difference Function + cmndf = cumulative_mean_normalized_difference_function(frame, p_max) + + # compute RMS power + rms[m] = np.sqrt(np.mean(frame ** 2)) + + # Probabilistic Thresholding with different thresholds + O_m, p_est_thr, val_thr = probabilistic_thresholding(cmndf, thresholds, p_min, p_max, absolute_min_prob, F_axis, + Fs, beta_distr, parabolic_interp=parabolic_interp) + + O[:, m] = O_m + p_orig[:, m] = p_est_thr # store original YIN estimates for later refinement + val_orig[:, m] = val_thr # store original cmndf value of minimum corresponding to p_est + + # normalization (pYIN-Paper, Formula 6) + O[0:B, :] *= voicing_prob + O[B:2 * B, :] = (1 - voicing_prob) * (1 - np.sum(O[0:B, :], axis=0)) / B + + return O, rms, p_orig, val_orig + + +@njit +def compute_transition_matrix(M, triang_distr): + """ + Compute a transition matrix for PYIN Viterbi. 
+ + Parameters + ---------- + M : int + Matrix dimension + triang_distr : ndarray + (Triangular) distribution, defining tolerance for jumps deviating from the main diagonal + + Returns + ------- + A : ndarray + Transition matrix + """ + prob_self = 0.99 + + A = np.zeros((2*M, 2*M)) + max_step = len(triang_distr) // 2 + + for i in range(M): + if i < max_step: + A[i, 0:i+max_step] = prob_self * triang_distr[max_step - i:-1] / np.sum(triang_distr[max_step - i:-1]) + A[i+M, M:i+M+max_step] = prob_self * triang_distr[max_step - i:-1] / np.sum(triang_distr[max_step - i:-1]) + + if i >= max_step and i < M-max_step: + A[i, i-max_step:i+max_step+1] = prob_self * triang_distr + A[i+M, (i+M)-max_step:(i+M)+max_step+1] = prob_self * triang_distr + + if i >= M-max_step: + A[i, i-max_step:M] = prob_self * triang_distr[0:max_step - (i-M)] / np.sum(triang_distr[0:max_step - (i-M)]) + A[i+M, i+M-max_step:2*M] = prob_self * triang_distr[0:max_step - (i - M)] / \ + np.sum(triang_distr[0:max_step - (i - M)]) + + A[i, i+M] = 1 - prob_self + A[i+M, i] = 1 - prob_self + + return A + + +@njit +def viterbi_pyin(A, C, O): + """Viterbi algorithm (pYIN variant) + + Args: + A : ndarray + State transition probability matrix of dimension I x I + C : ndarray + Initial state distribution of dimension I X 1 + O : ndarray + Likelihood matrix of dimension I x N + + Returns: + idxs : ndarray + Optimal state sequence of length N + """ + B = O.shape[0] // 2 + M = O.shape[1] + D = np.zeros((B * 2, M)) + E = np.zeros((B * 2, M - 1)) + + idxs = np.zeros(M) + + for i in range(B * 2): + D[i, 0] = C[i, 0] * O[i, 0] # D matrix Intial state setting + + D[:, 0] = D[:, 0] / np.sum(D[:, 0]) # Normalization (using pYIN source code as a basis) + + for n in range(1, M): + for i in range(B * 2): + abyd = np.multiply(A[:, i], D[:, n-1]) + D[i, n] = np.max(abyd) * O[i, n] + E[i, n-1] = np.argmax(abyd) + + D[:, n] = D[:, n] / np.sum(D[:, n]) # Row normalization to avoid underflow (pYIN source code sparseHMM) + + idxs[M - 1] = np.argmax(D[:, M - 1]) + + for n in range(M - 2, 0, -1): + bkd = int(idxs[n+1]) # Intermediate variable to be compatible with Numba + idxs[n] = E[bkd, n] + + return idxs.astype(np.int32) + + +@njit +def viterbi_log_likelihood(A, C, B_O): + """Viterbi algorithm (log variant) for solving the uncovering problem + + Notebook: C5/C5S3_Viterbi.ipynb + + Args: + A : ndarray + State transition probability matrix of dimension I x I + C : ndarray + Initial state distribution of dimension I + B_O : ndarray + Likelihood matrix of dimension I x N + + Returns: + S_opt : ndarray + Optimal state sequence of length N + """ + I = A.shape[0] # Number of states + N = B_O.shape[1] # Length of observation sequence + tiny = np.finfo(0.).tiny + A_log = np.log(A + tiny) + C_log = np.log(C + tiny) + B_O_log = np.log(B_O + tiny) + + # Initialize D and E matrices + D_log = np.zeros((I, N)) + E = np.zeros((I, N-1)).astype(np.int32) + D_log[:, 0] = C_log + B_O_log[:, 0] + + # Compute D and E in a nested loop + for n in range(1, N): + for i in range(I): + temp_sum = A_log[:, i] + D_log[:, n-1] + D_log[i, n] = np.max(temp_sum) + B_O_log[i, n] + E[i, n-1] = np.argmax(temp_sum) + + # Backtracking + S_opt = np.zeros(N).astype(np.int32) + S_opt[-1] = np.argmax(D_log[:, -1]) + for n in range(N-2, -1, -1): + S_opt[n] = E[int(S_opt[n+1]), n] + + return S_opt + + +@njit +def delete_numba(arr, num): + """Delete number from array, Numba compatible. 
Inspired by: + https://stackoverflow.com/questions/53602663/delete-a-row-in-numpy-array-in-numba + """ + mask = np.zeros(len(arr), dtype=np.int64) == 0 + mask[np.where(arr == num)[0]] = False + return arr[mask] diff --git a/pitch/core/salience.py b/pitch/core/salience.py new file mode 100644 index 0000000000000000000000000000000000000000..54b33ab0e4caf9bdb700b01d68ee6145fd48423d --- /dev/null +++ b/pitch/core/salience.py @@ -0,0 +1,441 @@ +""" +| Description: libf0 salience-based F0 estimation implementation +| Author: Sebastian Rosenzweig, Simon Schwär, Meinard Müller +| License: The MIT license, https://opensource.org/licenses/MIT +| This file is part of libf0. +""" +import numpy as np +from librosa import stft +from scipy import ndimage, linalg +from numba import njit + + +def salience(x, Fs=22050, N=2048, H=256, F_min=55.0, F_max=1760.0, R=10.0, num_harm=10, freq_smooth_len=11, + alpha=0.9, gamma=0.0, constraint_region=None, tol=5, score_low=0.01, score_high=1.0): + """ + Implementation of a salience-based F0-estimation algorithm using pitch contours, inspired by Melodia. + + .. [#] Justin Salamon and Emilia Gómez, + "Melody Extraction From Polyphonic Music Signals Using Pitch Contour Characteristics." + IEEE Transactions on Audio, Speech, and Language Processing, vol. 20, no. 6, pp. 1759–1770, Aug. 2012. + + Parameters + ---------- + x : ndarray + Audio signal + Fs : int + Sampling rate + N : int + Window size + H : int + Hop size + F_min : float or int + Minimal frequency + F_max : float or int + Maximal frequency + R : int + Frequency resolution given in cents + num_harm : int + Number of harmonics (Default value = 10) + freq_smooth_len : int + Filter length for vertical smoothing (Default value = 11) + alpha : float + Weighting parameter for harmonics (Default value = 0.9) + gamma : float + Logarithmic compression factor (Default value = 0.0) + constraint_region : None or ndarray + Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end,hz) + (Default value = None) + tol : int + Tolerance parameter for transition matrix (Default value = 5) + score_low : float + Score (low) for transition matrix (Default value = 0.01) + score_high : float + Score (high) for transition matrix (Default value = 1.0) + + Returns + ------- + f0 : ndarray + Estimated F0-trajectory + T_coef: ndarray + Time axis + sal: ndarray + Salience value of estimated F0 + + See also + -------- + [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb + """ + + # compute salience representation via instantaneous frequency and harmonic summation + Z, F_coef_hertz = compute_salience_rep(x, Fs, N=N, H=H, F_min=F_min, F_max=F_max, R=R, + num_harm=num_harm, freq_smooth_len=freq_smooth_len, + alpha=alpha, gamma=gamma) + + # compute trajectory via dynamic programming + T_coef = (np.arange(Z.shape[1]) * H) / Fs + index_CR = compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region, + tol=tol, score_low=score_low, score_high=score_high) + + traj = F_coef_hertz[index_CR] + traj[index_CR == -1] = 0 + + # compute salience value + Z_max = np.max(Z, axis=0) + Z_norm = np.divide(Z, np.ones((Z.shape[0], 1)) * Z_max) + sal = Z_norm[index_CR, np.arange(Z.shape[1])] + sal[traj == 0] = 0 + + return traj, T_coef, sal + + +def compute_salience_rep(x, Fs, N, H, F_min, F_max, R, num_harm, freq_smooth_len, alpha, gamma): + """ + Compute salience representation [FMP, Eq. 
(8.56)] + + Parameters + ---------- + x : ndarray + Audio signal + Fs : int + Sampling rate + N : int + Window size + H : int + Hop size + F_min : float or int + Minimal frequency + F_max : float or int + Maximal frequency + R : int + Frequency resolution given in cents + num_harm : int + Number of harmonics + freq_smooth_len : int + Filter length for vertical smoothing + alpha : float + Weighting parameter for harmonics + gamma : float + Logarithmic compression factor + + Returns + ------- + Z : ndarray + Salience representation + F_coef_hertz : ndarray + Frequency axis in Hz + + See also + -------- + [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb + """ + + X = stft(x, n_fft=N, hop_length=H, win_length=N, pad_mode='constant') + Y_LF_IF_bin, F_coef_hertz = compute_y_lf_if_bin_eff(X, Fs, N, H, F_min, F_max, R) + + # smoothing + Y_LF_IF_bin = ndimage.convolve1d(Y_LF_IF_bin, np.hanning(freq_smooth_len), axis=0, mode='constant') + + Z = compute_salience_from_logfreq_spec(Y_LF_IF_bin, R, n_harmonics=num_harm, alpha=alpha, beta=1, gamma=gamma) + return Z, F_coef_hertz + + +def compute_y_lf_if_bin_eff(X, Fs, N, H, F_min, F_max, R): + """ + Binned Log-frequency Spectrogram with variable frequency resolution based on instantaneous frequency, + more efficient implementation than FMP + + Parameters + ---------- + X : ndarray + Complex spectrogram + Fs : int + Sampling rate in Hz + N : int + Window size + H : int + Hop size + F_min : float or int + Minimal frequency + F_max : float or int + Maximal frequency + R : int + Frequency resolution given in cents + + Returns + ------- + Y_LF_IF_bin : ndarray + Binned log-frequency spectrogram using instantaneous frequency (shape: [freq, time]) + F_coef_hertz : ndarray + Frequency axis in Hz + """ + + # calculate number of bins on log frequency axis + B = frequency_to_bin_index(F_max, R, F_min) + 1 + + # center frequencies of the final bins + F_coef_hertz = F_min * np.power(2, (np.arange(0, B) * R / 1200)) + + # calculate heterodyned phase increment (hpi) + k = np.arange(X.shape[0]).reshape(-1, 1) + omega = 2 * np.pi * k / N # center frequency for each bin in rad + hpi = (np.angle(X[:, 1:]) - np.angle(X[:, 0:-1])) - omega * H + + # reduce hpi to -pi:pi range + # this is much faster than using the modulo function below, but gives the same result + # hpi = np.mod(hpi + np.pi, 2 * np.pi) - np.pi + hpi = hpi - 2 * np.pi * (np.around((hpi / (2 * np.pi)) + 1) - 1) + + # calculate instantaneous frequencies in Hz + inst_f = (omega + hpi / H) * Fs / (2 * np.pi) + # repeat the first time frame to match dimensions of X + inst_f = np.hstack((np.copy(inst_f[:, 0]).reshape(-1, 1), inst_f)) + + # mask frequencies that are not relevant + mask = np.logical_and(inst_f >= F_min, inst_f < F_max) + inst_f *= mask + # set 0 to nan, so it does stay at nan in the bin assignment calculation + inst_f[np.where(inst_f == 0)] = np.nan + + # find which inst_f values belong to which bin + bin_assignment = frequency_to_bin_index(inst_f, R, F_min) + # we map the discarded values to an extra bin that we remove before returning the binned spectrogram + bin_assignment[np.where(np.isnan(inst_f))] = B + + # perform binning on power spectrogram for each time frame separately + Y = np.abs(X) ** 2 + Y_LF_IF_bin = np.zeros((B+1, Y.shape[1])) + for t in range(Y.shape[1]): + np.add.at(Y_LF_IF_bin[:, t], bin_assignment[:, t], Y[:, t]) + + return Y_LF_IF_bin[:B, :], F_coef_hertz + + +def compute_salience_from_logfreq_spec(lf_spec, R, n_harmonics, alpha, beta, gamma, harmonic_win_len=11): + """ + 
Compute salience representation using harmonic summation following [1] + + [1] J. Salamon and E. Gomez, + "Melody Extraction From Polyphonic Music Signals Using Pitch Contour Characteristics." + IEEE Transactions on Audio, Speech, and Language Processing, vol. 20, no. 6, pp. 1759–1770, Aug. 2012. + + Parameters + ---------- + lf_spec : ndarray + (F, T) log-spectrogram + R : int + Frequency resolution given in cents + n_harmonics : int + Number of harmonics + alpha : float + Weighting parameter for harmonics + beta : float + Compression parameter for spectrogram magnitudes + gamma : float + Magnitude threshold + harmonic_win_len : int + Length of a frequency weighting window in bins + + Returns + ------- + Z : ndarray + (F, T) salience representation of the input spectrogram + """ + + # magnitude thresholding and compression + eps = np.finfo(np.float32).eps + threshold_mask = (20 * np.log10(lf_spec/np.max(lf_spec) + eps)) < gamma + lf_spec = lf_spec**beta * threshold_mask + + # compute window + max_diff_bins = harmonic_win_len // 2 + window = np.cos(np.linspace(-1, 1, 2*max_diff_bins+1)*np.pi/2)**2 # cosine^2 window + + # compute indices of harmonics + harmonics = np.round(np.log2(np.arange(1, n_harmonics + 1)) * 1200 / R).astype(int) + weighting_vec = np.zeros((lf_spec.shape[0] + max_diff_bins)) + + # compute weights + for idx, h in enumerate(harmonics): + if h+harmonic_win_len > len(weighting_vec): + break # we reached the maximum length available + weighting_vec[h:h+harmonic_win_len] += window * alpha**idx + + # correlate lf_spec with the weighting vector on the frequency axis + Z = ndimage.correlate1d(lf_spec, weighting_vec[:], + axis=0, mode='constant', cval=0, origin=-len(weighting_vec)//2 + max_diff_bins) + + # magnitude thresholding and compression + threshold_mask = (20 * np.log10(Z / np.max(Z) + eps)) < gamma + Z = Z ** beta * threshold_mask + + return Z + + +def define_transition_matrix(B, tol=0, score_low=0.01, score_high=1.0): + """ + Generate transition matrix for dynamic programming + + Parameters + ---------- + B : int + Number of bins + tol : int + Tolerance parameter for transition matrix (Default value = 0) + score_low : float + Score (low) for transition matrix (Default value = 0.01) + score_high : float + Score (high) for transition matrix (Default value = 1.0) + + Returns + ------- + T : ndarray + (B, B) Transition matrix + + See also + -------- + [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb + """ + + col = np.ones((B,)) * score_low + col[0:tol+1] = np.ones((tol+1, )) * score_high + T = linalg.toeplitz(col) + return T + + +@njit +def compute_trajectory_dp(Z, T): + """ + Trajectory tracking using dynamic programming + + Parameters + ---------- + Z : ndarray + Salience representation + T : ndarray + Transisition matrix + + Returns + ------- + eta_DP : ndarray + Trajectory indices + + See also + -------- + [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb + """ + + B, N = Z.shape + eps_machine = np.finfo(np.float32).eps + Z_log = np.log(Z + eps_machine) + T_log = np.log(T + eps_machine) + + E = np.zeros((B, N)) + D = np.zeros((B, N)) + D[:, 0] = Z_log[:, 0] + + for n in np.arange(1, N): + for b in np.arange(0, B): + D[b, n] = np.max(T_log[b, :] + D[:, n-1]) + Z_log[b, n] + E[b, n-1] = np.argmax(T_log[b, :] + D[:, n-1]) + + # backtracking + eta_DP = np.zeros(N) + eta_DP[N-1] = int(np.argmax(D[:, N-1])) + + for n in np.arange(N-2, -1, -1): + eta_DP[n] = E[int(eta_DP[n+1]), n] + + return eta_DP.astype(np.int64) + + +def compute_trajectory_cr(Z, T_coef, F_coef_hertz, 
constraint_region=None, + tol=5, score_low=0.01, score_high=1.0): + """ + Trajectory tracking with constraint regions + Notebook: C8/C8S2_FundFreqTracking.ipynb + + Parameters + ---------- + Z : ndarray + Salience representation + T_coef : ndarray + Time axis + F_coef_hertz : ndarray + Frequency axis in Hz + constraint_region : ndarray or None + Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz) + (Default value = None) + tol : int + Tolerance parameter for transition matrix (Default value = 5) + score_low : float + Score (low) for transition matrix (Default value = 0.01) + score_high : float + Score (high) for transition matrix (Default value = 1.0) + + Returns + ------- + eta : ndarray + Trajectory indices, unvoiced frames are indicated with -1 + + See also + -------- + [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb + """ + + # do tracking within every constraint region + if constraint_region is not None: + # initialize contour, unvoiced frames are indicated with -1 + eta = np.full(len(T_coef), -1) + + for row_idx in range(constraint_region.shape[0]): + t_start = constraint_region[row_idx, 0] # sec + t_end = constraint_region[row_idx, 1] # sec + f_start = constraint_region[row_idx, 2] # Hz + f_end = constraint_region[row_idx, 3] # Hz + + # convert start/end values to indices + t_start_idx = np.argmin(np.abs(T_coef - t_start)) + t_end_idx = np.argmin(np.abs(T_coef - t_end)) + f_start_idx = np.argmin(np.abs(F_coef_hertz - f_start)) + f_end_idx = np.argmin(np.abs(F_coef_hertz - f_end)) + + # track in salience part + cur_Z = Z[f_start_idx:f_end_idx+1, t_start_idx:t_end_idx+1] + T = define_transition_matrix(cur_Z.shape[0], tol=tol, + score_low=score_low, score_high=score_high) + cur_eta = compute_trajectory_dp(cur_Z, T) + + # fill contour + eta[t_start_idx:t_end_idx+1] = f_start_idx + cur_eta + else: + T = define_transition_matrix(Z.shape[0], tol=tol, score_low=score_low, score_high=score_high) + eta = compute_trajectory_dp(Z, T) + + return eta + + +def frequency_to_bin_index(F, R, F_ref): + """ + Binning function with variable frequency resolution + Note: Indexing starts with 0 (opposed to [FMP, Eq. (8.49)]) + + Parameters + ---------- + F : float or ndarray + Frequency in Hz + R : float + Frequency resolution in cents (Default value = 10.0) + F_ref : float + Reference frequency in Hz (Default value = 55.0) + + Returns + ------- + bin_index (int): Index for bin (starting with index 0) + + See also + -------- + [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb + """ + bin_index = np.floor((1200 / R) * np.log2(F / F_ref) + 0.5).astype(np.int64) + return bin_index diff --git a/pitch/core/swipe.py b/pitch/core/swipe.py new file mode 100644 index 0000000000000000000000000000000000000000..99960caf303cc2403437bacb9bd50494dcaf1670 --- /dev/null +++ b/pitch/core/swipe.py @@ -0,0 +1,282 @@ +""" +| Description: libf0 SWIPE implementation +| Contributors: Sebastian Rosenzweig, Vojtěch Pešek, Simon Schwär, Meinard Müller +| License: The MIT license, https://opensource.org/licenses/MIT +| This file is part of libf0. +""" +from scipy import interpolate +import numpy as np +import librosa + + +def swipe(x, Fs=22050, H=256, F_min=55.0, F_max=1760.0, dlog2p=1 / 96, derbs=0.1, strength_threshold=0): + """ + Implementation of a sawtooth waveform inspired pitch estimator (SWIPE). + This version of the algorithm follows the original implementation, see `swipe_slim` for a more efficient + alternative. + + .. [#] Arturo Camacho and John G. 
Harris, + "A sawtooth waveform inspired pitch estimator for speech and music." + The Journal of the Acoustical Society of America, vol. 124, no. 3, pp. 1638–1652, Sep. 2008 + + Parameters + ---------- + x : ndarray + Audio signal + Fs : int + Sampling rate + H : int + Hop size + F_min : float or int + Minimal frequency + F_max : float or int + Maximal frequency + dlog2p : float + resolution of the pitch candidate bins in octaves (default value = 1/96 -> 96 bins per octave) + derbs : float + resolution of the ERB bands (default value = 0.1) + strength_threshold : float + confidence threshold [0, 1] for the pitch detection (default value = 0) + + Returns + ------- + f0 : ndarray + Estimated F0-trajectory + t : ndarray + Time axis + strength : ndarray + Confidence/Pitch Strength + """ + + t = np.arange(0, len(x), H) / Fs # Times + + # Compute pitch candidates + pc = 2 ** np.arange(np.log2(F_min), np.log2(F_max), dlog2p) + + # Pitch strength matrix + S = np.zeros((len(pc), len(t))) + + # Determine P2-WSs [max, min] + log_ws_max = np.ceil(np.log2((8 / F_min) * Fs)) + log_ws_min = np.floor(np.log2((8 / F_max) * Fs)) + + # P2-WSs - window sizes in samples + ws = 2 ** np.arange(log_ws_max, log_ws_min - 1, -1, dtype=np.int32) + # print(f'window sizes in samples: {ws}') + + # Determine window sizes used by each pitch candidate + log2pc = np.arange(np.log2(F_min), np.log2(F_max), dlog2p) + d = log2pc - np.log2(np.divide(8 * Fs, ws[0])) + + # Create ERBs spaced frequencies (in Hertz) + fERBs = erbs2hz(np.arange(hz2erbs(pc[0] / 4), hz2erbs(Fs / 2), derbs)) + + for i in range(0, len(ws)): + N = ws[i] + H = int(N / 2) + + x_zero_padded = np.concatenate([x, np.zeros(N)]) + + X = librosa.stft(x_zero_padded, n_fft=N, hop_length=H, pad_mode='constant', center=True) + ti = librosa.frames_to_time(np.arange(0, X.shape[1]), sr=Fs, hop_length=H, n_fft=N) + f = librosa.fft_frequencies(sr=Fs, n_fft=N) + + ti = np.insert(ti, 0, 0) + ti = np.delete(ti, -1) + + spectrum = np.abs(X) + magnitude = resample_ferbs(spectrum, f, fERBs) + loudness = np.sqrt(magnitude) + + # Select candidates that use this window size + # First window + if i == 0: + j = np.argwhere(d < 1).flatten() + k = np.argwhere(d[j] > 0).flatten() + # Last Window + elif i == len(ws) - 1: + j = np.argwhere(d - i > -1).flatten() + k = np.argwhere(d[j] - i < 0).flatten() + else: + j = np.argwhere(np.abs(d - i) < 1).flatten() + k = np.arange(0, len(j)) + + pc_to_compute = pc[j] + + pitch_strength = pitch_strength_all_candidates(fERBs, loudness, pc_to_compute) + + resampled_pitch_strength = resample_time(pitch_strength, t, ti) + + lambda_ = d[j[k]] - i + mu = np.ones(len(j)) + mu[k] = 1 - np.abs(lambda_) + + S[j, :] = S[j, :] + np.multiply( + np.ones(resampled_pitch_strength.shape) * mu.reshape((mu.shape[0], 1)), + resampled_pitch_strength + ) + + # Fine-tune the pitch using parabolic interpolation + pitches, strength = parabolic_int(S, strength_threshold, pc) + + pitches[np.where(np.isnan(pitches))] = 0 # avoid NaN output + + return pitches, t, strength + + +def nyquist(Fs): + """Nyquist Frequency""" + return Fs / 2 + + +def F_coef(k, N, Fs): + """Physical frequency of STFT coefficients""" + return (k * Fs) / N + + +def T_coef(m, H, Fs): + """Physical time of STFT coefficients""" + return m * H / Fs + + +def stft_with_f_t(y, N, H, Fs): + """STFT wrapper""" + x = librosa.stft(y, int(N), int(H), pad_mode='constant', center=True) + f = F_coef(np.arange(0, x.shape[0]), N, Fs) + t = T_coef(np.arange(0, x.shape[1]), H, Fs) + + return x, f, t + + +def hz2erbs(hz): 
+ """Convert Hz to ERB scale""" + return 21.4 * np.log10(1 + hz / 229) + + +def erbs2hz(erbs): + """Convert ERB to Hz""" + return (10 ** np.divide(erbs, 21.4) - 1) * 229 + + +def pitch_strength_all_candidates(ferbs, loudness, pitch_candidates): + """Compute pitch strength for all pitch candidates""" + # Normalize loudness + normalization_loudness = np.full_like(loudness, np.sqrt(np.sum(loudness * loudness, axis=0))) + with np.errstate(divide='ignore', invalid='ignore'): + loudness = loudness / normalization_loudness + + # Create pitch salience matrix + S = np.zeros((len(pitch_candidates), loudness.shape[1])) + + for j in range(0, len(pitch_candidates)): + S[j, :] = pitch_strength_one(ferbs, loudness, pitch_candidates[j]) + return S + + +def pitch_strength_one(erbs_frequencies, normalized_loudness, pitch_candidate): + """Compute pitch strength for one pitch candidate""" + number_of_harmonics = np.floor(erbs_frequencies[-1] / pitch_candidate - 0.75).astype(np.int32) + k = np.zeros(erbs_frequencies.shape) + + # f_prime / f + q = erbs_frequencies / pitch_candidate + + for i in np.concatenate(([1], primes(number_of_harmonics))): + a = np.abs(q - i) + p = a < 0.25 + k[p] = np.cos(np.dot(2 * np.pi, q[p])) + v = np.logical_and(0.25 < a, a < 0.75) + k[v] = k[v] + np.cos(np.dot(2 * np.pi, q[v])) / 2 + + # Apply envelope + k = np.multiply(k, np.sqrt(1.0 / erbs_frequencies)) + + # K+-normalize kernel + k = k / np.linalg.norm(k[k > 0]) + + # Compute pitch strength + S = np.dot(k, normalized_loudness) + return S + + +def resample_ferbs(spectrum, f, ferbs): + """Resample to ERB scale""" + magnitude = np.zeros((len(ferbs), spectrum.shape[1])) + + for t in range(spectrum.shape[1]): + spl = interpolate.splrep(f, spectrum[:, t]) + interpolate.splev(ferbs, spl) + + magnitude[:, t] = interpolate.splev(ferbs, spl) + + return np.maximum(magnitude, 0) + + +def resample_time(pitch_strength, resampled_time, ti): + """Resample time axis""" + if pitch_strength.shape[1] > 0: + pitch_strength = interpolate_one_candidate(pitch_strength, ti, resampled_time) + else: + pitch_strength = np.kron(np.ones((len(pitch_strength), len(resampled_time))), np.NaN) + return pitch_strength + + +def interpolate_one_candidate(pitch_strength, ti, resampled_time): + """Interpolate time axis""" + pitch_strength_interpolated = np.zeros((pitch_strength.shape[0], len(resampled_time))) + + for s in range(pitch_strength.shape[0]): + t_i = interpolate.interp1d(ti, pitch_strength[s, :], 'linear', bounds_error=True) + pitch_strength_interpolated[s, :] = t_i(resampled_time) + + return pitch_strength_interpolated + + +def parabolic_int(pitch_strength, strength_threshold, pc): + """Parabolic interpolation between pitch candidates using pitch strength""" + p = np.full((pitch_strength.shape[1],), np.NaN) + s = np.full((pitch_strength.shape[1],), np.NaN) + + for j in range(pitch_strength.shape[1]): + i = np.argmax(pitch_strength[:, j]) + s[j] = pitch_strength[i, j] + + if s[j] < strength_threshold: + continue + + if i == 0: + p[j] = pc[0] + elif i == len(pc) - 1: + p[j] = pc[0] + else: + I = np.arange(i - 1, i + 2) + tc = 1 / pc[I] + ntc = np.dot((tc / tc[1] - 1), 2 * np.pi) + if np.any(np.isnan(pitch_strength[I, j])): + s[j] = np.nan + p[j] = np.nan + else: + c = np.polyfit(ntc, pitch_strength[I, j], 2) + ftc = 1 / 2 ** np.arange(np.log2(pc[I[0]]), np.log2(pc[I[2]]), 1 / 12 / 64) + nftc = np.dot((ftc / tc[1] - 1), 2 * np.pi) + poly = np.polyval(c, nftc) + k = np.argmax(poly) + s[j] = poly[k] + p[j] = 2 ** (np.log2(pc[I[0]]) + k / 12 / 64) + return p, s 
+ + +def primes(n): + """Returns a set of n prime numbers""" + small_primes = np.array([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, + 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, + 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, + 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, + 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, + 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, + 631, 641, 643, 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, + 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857, 859, 863, + 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997]) + + b = small_primes <= n + return small_primes[b] diff --git a/pitch/core/swipe_slim.py b/pitch/core/swipe_slim.py new file mode 100644 index 0000000000000000000000000000000000000000..3cf23f2b561b47d654c2a5b98ba392bdf6c05f9a --- /dev/null +++ b/pitch/core/swipe_slim.py @@ -0,0 +1,180 @@ +""" +| Description: libf0 SWIPE slim implementation +| Contributors: Sebastian Rosenzweig, Simon Schwär, Meinard Müller +| License: The MIT license, https://opensource.org/licenses/MIT +| This file is part of libf0. +""" +import numpy as np +import librosa +from .yin import parabolic_interpolation +from scipy.interpolate import interp1d + + +def swipe_slim(x, Fs=22050, H=256, F_min=55.0, F_max=1760.0, R=10, strength_threshold=0): + """ + Slim and didactical implementation of a sawtooth waveform inspired pitch estimator (SWIPE). + This version uses a log-frequency spectrogram instead of ERB filters. Furthermore, it is implemented more + efficiently. See `swipe()` for the original implementation. + + .. [#] A. Camacho and J. G. Harris, + "A sawtooth waveform inspired pitch estimator for speech and music." + The Journal of the Acoustical Society of America, vol. 124, no. 3, pp. 1638–1652, Sep. 
2008 + + Parameters + ---------- + x : ndarray + Audio signal + Fs : int + Sampling rate + H : int + Hop size + F_min : float or int + Minimal frequency + F_max : float or int + Maximal frequency + R : float + resolution of the pitch candidate bins in cents (default = 10) + strength_threshold : float + confidence threshold [0, 1] for the pitch detection (default value = 0) + + Returns + ------- + f0 : ndarray + Estimated F0-trajectory + t : ndarray + Time axis + conf : ndarray + Confidence / Pitch Strength + """ + + # compute time and frequency axis + t = np.arange(0, len(x), H) / Fs # time axis + F_coef_log = np.arange(0, np.log2(Fs/2/F_min), R/1200) + F_coef_log_hz = F_min * 2 ** F_coef_log # pitch candidates + + # pre-compute kernels, one kernel for each pitch candidate in range [F_min : F_max] + F_min_idx = np.argmin(np.abs(F_coef_log_hz - F_min)) + F_max_idx = np.argmin(np.abs(F_coef_log_hz - F_max)) + B = F_max_idx - F_min_idx # Number of pitch candidates + kernels = np.zeros((B, len(F_coef_log_hz))) + for i, f in enumerate(F_coef_log_hz[F_min_idx:F_max_idx]): + kernels[i, :] = compute_kernel(f, F_coef_log_hz) + + # determine optimal window length for each candidate + L_opt = np.log2(Fs * 8 / np.array([F_min, F_max])) # exponents for optimal window sizes 2^L, see paper Section II.G + L_rnd = np.arange(np.round(L_opt[1]), np.round(L_opt[0])+1).astype(np.int32) # range of rounded exponents + N_pow2 = 2 ** L_rnd # Compute rounded power-2 windows sizes + # Quantization error between optimal window size (see paper Section II.G) and rounded power-2 windows size + # Using only the largest N here, since errors for other N can be derived from err by subtracting exponent (cyclic) + err = np.abs(np.log2(8 * Fs / F_coef_log_hz[F_min_idx:F_max_idx]) - np.log2(np.max(N_pow2))) + + S = np.zeros((B, len(t))) # "pitch-strength" matrix + + # loop through all window sizes + for octave, N in enumerate(N_pow2): + # Compute STFT + x_pad = np.pad(x, (0, N)) # to avoid problems during time axis interpolation + H = N // 2 + X = librosa.stft(x_pad, n_fft=N, hop_length=H, win_length=N, window='hann', pad_mode='constant', center=True) + Y = np.abs(X) + T_coef_lin_s = np.arange(0, X.shape[1]) * H / Fs + F_coef_lin_hz = np.arange(N // 2 + 1) * Fs / N + + # Resample to log-frequency axis + compute_Y_log = interp1d(F_coef_lin_hz, Y, kind='cubic', axis=0) + Y_log = compute_Y_log(F_coef_log_hz) + + # Normalize magnitudes + Y_log /= np.sqrt(np.sum(Y_log ** 2, axis=0)) + np.finfo(float).eps + + # Correlate kernels with log-spectrum for pitch candidates where N is optimal + S_N = np.matmul(kernels, Y_log) + + # Resample time axis + compute_S_N_res = interp1d(T_coef_lin_s, S_N, kind='linear', axis=1) + S_N_res = compute_S_N_res(t) + + # Weight pitch strength according to quantization error + candidates = (err > octave - 1) & (err < octave + 1) # consider pitches +/- 1 octave from current window + mu = 1 - np.abs(err[candidates] - octave) + + S[candidates, :] += np.multiply(mu.reshape(-1, 1), S_N_res[candidates, :]) + + # Obtain pitch estimates and corresponding confidence + max_indices = np.argmax(S, axis=0) + conf = np.max(S, axis=0) + + # Parabolic Interpolation of pitch estimates for refinement + time_idx = np.arange(S.shape[1]) + indeces_shift, _ = parabolic_interpolation(S[max_indices-1, time_idx], + S[max_indices, time_idx], + S[max_indices+1, time_idx]) + compute_f0_log = interp1d(np.arange(len(F_coef_log)), F_coef_log, kind='linear') + f0_hz = F_min * 2 ** compute_f0_log(max_indices+indeces_shift) + + # 
Thresholding + f0_hz[conf < strength_threshold] = 0 # discard estimates where confidence is low + + return f0_hz, t, conf + + +def compute_kernel(f, F_coef_log_hz): + """ + Compute a SWIPE' kernel. + + Parameters + ---------- + f : float + Frequency in Hz + F_coef_log_hz : + Logarithmic frequency axis in Hz + + Returns + ------- + k : ndarray + Kernel + """ + k = np.zeros(len(F_coef_log_hz)) + n_harmonics = np.floor(F_coef_log_hz[-1] / f).astype(np.int32) + prime_numbers = prime_and_one(100)[:n_harmonics] # only consider prime harmonics for kernel peaks + + ratio = F_coef_log_hz / f + + # loop through all prime harmonics + for p in prime_numbers: + a = np.abs(ratio - p) # normalized distance between harmonic and current pitch candidate + main_peak_bins = a < 0.25 + k[main_peak_bins] = np.cos(np.dot(np.array(2 * np.pi).reshape(-1, 1), + ratio[main_peak_bins].reshape(1, -1))).flatten() + valley_bins = np.logical_and(0.25 < a, a < 0.75) + k[valley_bins] += np.cos(np.dot(np.array(2 * np.pi).reshape(-1, 1), + ratio[valley_bins].reshape(1, -1))).flatten() / 2 + + # Apply decay + k = np.multiply(k, np.sqrt(1.0 / F_coef_log_hz)) + + # K+-normalize kernel + k = k / np.linalg.norm(k[k > 0]) + + return k + + +def prime_and_one(upto=1000000): + """ + Returns a set of prime numbers, adapted from http://rebrained.com/?p=458 + + Parameters + ---------- + upto : int + Find prime numbers up to this number + + Returns + ------- + A set of prime numbers including 1 & 2 + """ + primes = np.arange(3, upto+1, 2) + isprime = np.ones((upto-1)//2, dtype=np.bool8) + for factor in primes[:int(np.sqrt(upto))//2]: + if isprime[(factor-2)//2]: + isprime[(factor*3-2)//2::factor] = 0 + return np.concatenate((np.array([1, 2]), primes[isprime])) diff --git a/pitch/core/utils.py b/pitch/core/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bc488d104b0e9fa4ed3ab7835293aab0ae3757f6 --- /dev/null +++ b/pitch/core/utils.py @@ -0,0 +1,119 @@ +""" +| Description: libf0 utility functions +| Contributors: Sebastian Rosenzweig, Simon Schwär, Meinard Müller +| License: The MIT license, https://opensource.org/licenses/MIT +| This file is part of libf0. +""" +import numpy as np + + +def sonify_trajectory_with_sinusoid(f0, t, audio_len, confidence=None, Fs=22050, smooth_len=11): + """ + Sonification of trajectory with sinusoidal. 
Adapted from FMP notebook: C8/C8S2_FundFreqTracking.ipynb + + Parameters + ---------- + f0 : ndarray + F0-trajectory + t : ndarray + Time axis + audio_len : int + Desired audio length in samples + confidence : None or ndarray + Confidence values for amplitude control + Fs : int + Sampling rate + smooth_len : int + Smoothing filter length to avoid clicks in the sonification + + Returns + ------- + x_soni : ndarray + Sonified F0-trajectory + """ + if confidence is None: + confidence = np.ones_like(f0) + + # initialize + x_soni = np.zeros(audio_len) + amplitude_mod = np.zeros(audio_len) + + # Computation of hop size + sine_len = int(t[1] * Fs) + + t = np.arange(0, sine_len) / Fs + phase = 0 + + # loop over all F0 values, ensure continuous phase + for idx in np.arange(0, len(f0)): + cur_f = f0[idx] + cur_amp = confidence[idx] + + if cur_f == 0: + phase = 0 + continue + + cur_soni = np.sin(2*np.pi*(cur_f*t+phase)) + diff = np.maximum(0, (idx+1)*sine_len - len(x_soni)) + if diff > 0: + x_soni[idx * sine_len:(idx + 1) * sine_len - diff] = cur_soni[:-diff] + amplitude_mod[idx * sine_len:(idx + 1) * sine_len - diff] = cur_amp + else: + x_soni[idx*sine_len:(idx+1)*sine_len-diff] = cur_soni + amplitude_mod[idx*sine_len:(idx+1)*sine_len-diff] = cur_amp + + phase += cur_f * sine_len / Fs + phase -= 2 * np.round(phase/2) + + # filter amplitudes to avoid transients + amplitude_mod = np.convolve(amplitude_mod, np.hanning(smooth_len)/np.sum(np.hanning(smooth_len)), 'same') + x_soni = x_soni * amplitude_mod + return x_soni + + +def hz_to_cents(F, F_ref=55.0): + """ + Converts frequency in Hz to cents. + + Parameters + ---------- + F : float or ndarray + Frequency value in Hz + F_ref : float + Reference frequency in Hz (Default value = 55.0) + Returns + ------- + F_cents : float or ndarray + Frequency in cents + """ + + # Avoid division by 0 + F_temp = np.array(F).astype(float) + F_temp[F_temp == 0] = np.nan + + F_cents = 1200 * np.log2(F_temp / F_ref) + + return F_cents + + +def cents_to_hz(F_cents, F_ref=55.0): + """ + Converts frequency in cents to Hz. + + Parameters + ---------- + F_cents : float or ndarray + Frequency in cents + F_ref : float + Reference frequency in Hz (Default value = 55.0) + Returns + ------- + F : float or ndarray + Frequency in Hz + """ + F = F_ref * 2 ** (F_cents / 1200) + + # Avoid NaN output + F = np.nan_to_num(F, copy=False, nan=0) + + return F diff --git a/pitch/core/yin.py b/pitch/core/yin.py new file mode 100644 index 0000000000000000000000000000000000000000..7408443a59dd6c7be820aac85445fbe4fec177fc --- /dev/null +++ b/pitch/core/yin.py @@ -0,0 +1,238 @@ +""" +| Description: libf0 YIN implementation +| Contributors: Sebastian Rosenzweig, Simon Schwär, Edgar Suárez, Meinard Müller +| License: The MIT license, https://opensource.org/licenses/MIT +| This file is part of libf0. +""" +import numpy as np +from numba import njit + + +def yin(x, Fs=22050, N=2048, H=256, F_min=55.0, F_max=1760.0, threshold=0.15, verbose=False): + """ + Implementation of the YIN algorithm. + + .. [#] Alain De Cheveigné and Hideki Kawahara. + "YIN, a fundamental frequency estimator for speech and music." + The Journal of the Acoustical Society of America 111.4 (2002): 1917-1930. 
+ + Parameters + ---------- + x : ndarray [shape=(L, )], real - valued + Audio signal + Fs : int + Sampling frequency + N : int + Window size + H : int + Hop size + F_min : float + Minimal frequency + F_max : float + Maximal frequency + threshold : float + Threshold for cumulative mean normalized difference function + verbose : bool + Switch to activate/deactivate status bar + + Returns + ------- + f0 : ndarray + Estimated F0-trajectory + t : ndarray + Time axis + ap: ndarray + Aperiodicity (indicator for voicing: the lower, the more reliable the estimate) + """ + + if F_min > F_max: + raise Exception("F_min must be smaller than F_max!") + + if F_min < Fs/N: + raise Exception(f"The condition (F_min >= Fs/N) was not met. With Fs = {Fs}, N = {N} and F_min = {F_min} you have the following options: \n1) Set F_min >= {np.ceil(Fs/N)} Hz. \n2) Set N >= {np.ceil(Fs/F_min).astype(int)}. \n3) Set Fs <= {np.floor(F_min * N)} Hz.") + + x_pad = np.concatenate((np.zeros(N//2), x, np.zeros(N//2))) # Add zeros for centered estimates + M = int(np.floor((len(x_pad) - N) / H)) + 1 # Compute number of estimates that will be generated + f0 = np.zeros(M) # Estimated fundamental frequencies (0 for unspecified frames) + t = np.arange(M)*H/Fs # Time axis + ap = np.zeros(M) # Aperiodicity + + lag_min = max(int(np.ceil(Fs / F_max)), 1) # lag of maximal frequency in samples + lag_max = int(np.ceil(Fs / F_min)) # lag of minimal frequency in samples + + for m in range(M): + if verbose: + print(f"YIN Progress: {np.ceil(100*m/M).astype(int)}%", end='\r') + # Take a frame from input signal + frame = x_pad[m*H:m*H + N] + + # Cumulative Mean Normalized Difference Function + cmndf = cumulative_mean_normalized_difference_function(frame, lag_max) + + # Absolute Thresholding + lag_est = absolute_thresholding(cmndf, threshold, lag_min, lag_max, parabolic_interp=True) + + # Refine estimate by constraining search to vicinity of best local estimate (default: +/- 25 cents) + tol_cents = 25 + lag_min_local = int(np.round(Fs / ((Fs / lag_est) * 2 ** (tol_cents/1200)))) + if lag_min_local < lag_min: + lag_min_local = lag_min + lag_max_local = int(np.round(Fs / ((Fs / lag_est) * 2 ** (-tol_cents/1200)))) + if lag_max_local > lag_max: + lag_max_local = lag_max + lag_new = absolute_thresholding(cmndf, threshold=np.inf, lag_min=lag_min_local, lag_max=lag_max_local, + parabolic_interp=True) + + # Compute Fundamental Frequency Estimate + f0[m] = Fs / lag_new + + # Compute Aperiodicity + ap[m] = aperiodicity(frame, lag_new) + + return f0, t, ap + + +@njit +def cumulative_mean_normalized_difference_function(frame, lag_max): + """ + Computes Cumulative Mean Normalized Difference Function (CMNDF). + + Parameters + ---------- + frame : ndarray + Audio frame + lag_max : int + Maximum expected lag in the CMNDF + + Returns + ------- + cmndf : ndarray + Cumulative Mean Normalized Difference Function + """ + + cmndf = np.zeros(lag_max+1) # Initialize CMNDF + cmndf[0] = 1 + diff_mean = 0 + + for tau in range(1, lag_max+1): + # Difference function + diff = np.sum((frame[0:-tau] - frame[0 + tau:]) ** 2) + # Iterative mean of the difference function + diff_mean = diff_mean*(tau-1)/tau + diff/tau + + cmndf[tau] = diff / (diff_mean + np.finfo(np.float64).eps) + + return cmndf + + +def absolute_thresholding(cmndf, threshold, lag_min, lag_max, parabolic_interp=True): + """ + Absolute thresholding: + Set an absolute threshold and choose the smallest value of tau that gives a minimum of d' deeper than that + threshold. 
If none is found, the global minimum is chosen instead. + + Parameters + ---------- + cmndf : ndarray + Cumulative Mean Normalized Difference Function + threshold : float + Threshold + lag_min : float + Minimal lag + lag_max : float + Maximal lag + parabolic_interp : bool + Switch to activate/deactivate parabolic interpolation + + Returns + ------- + + """ + + # take shortcut if search range only allows for one possible lag + if lag_min == lag_max: + return lag_min + + # find local minima below absolute threshold in interval [lag_min:lag_max] + local_min_idxs = (np.argwhere((cmndf[1:-1] < cmndf[0:-2]) & (cmndf[1:-1] < cmndf[2:]))).flatten() + 1 + below_thr_idxs = np.argwhere(cmndf[lag_min:lag_max] < threshold).flatten() + lag_min + # numba compatible intersection of indices sets + min_idxs = np.unique(np.array([i for i in local_min_idxs for j in below_thr_idxs if i == j])) + + # if no local minima below threshold are found, return global minimum + if not min_idxs.size: + return np.argmin(cmndf[lag_min:lag_max]) + lag_min + + # find first local minimum + lag = np.min(min_idxs) # choose first local minimum + + # Optional: Parabolic Interpolation of local minima + if parabolic_interp: + lag_corr, cmndf[lag] = parabolic_interpolation(cmndf[lag-1], cmndf[lag], cmndf[lag+1]) + lag += lag_corr + + return lag + + +@njit +def parabolic_interpolation(y1, y2, y3): + """ + Parabolic interpolation of an extremal value given three samples with equal spacing on the x-axis. + The middle value y2 is assumed to be the extremal sample of the three. + + Parameters + ---------- + y1: f(x1) + y2: f(x2) + y3: f(x3) + + Returns + ------- + x_interp: Interpolated x-value (relative to x3-x2) + y_interp: Interpolated y-value, f(x_interp) + """ + + a = np.finfo(np.float64).eps + (y1 + y3 - 2 * y2) / 2 + b = (y3 - y1) / 2 + x_interp = -b / (2 * a) + y_interp = y2 - (b ** 2) / (4 * a) + + return x_interp, y_interp + + +def aperiodicity(frame, lag_est): + """ + Compute aperiodicity of given frame (serves as indicator for reliability or voicing detection). 
+ + Parameters + ---------- + frame : ndarray + Frame + lag_est : float + Estimated lag + + Returns + ------- + ap: float + Aperiodicity (the lower, the more reliable the estimate) + """ + + lag_int = int(np.floor(lag_est)) # uncorrected period estimate + frac = lag_est - lag_int # residual + + # Pad frame to insure constant size + frame_pad = np.concatenate((frame, np.flip(frame))) # mirror padding + + # Shift frame by estimated period + if frac == 0: + frame_shift = frame_pad[lag_int:lag_int+len(frame)] + else: + # linear interpolation between adjacent shifts + frame_shift = (1 - frac) * frame_pad[lag_int:lag_int+len(frame)] + \ + frac * frame_pad[lag_int+1:lag_int+1+len(frame)] + + pwr = (np.mean(frame ** 2) + np.mean(frame_shift ** 2)) / 2 # average power over fixed and shifted frame + res = np.mean((frame - frame_shift) ** 2) / 2 # residual power + ap = res / (pwr + np.finfo(np.float64).eps) + + return ap diff --git a/pitch/debug.py b/pitch/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..aa21c7c88889c0b401de6c6693e4b68c4a8ffc4d --- /dev/null +++ b/pitch/debug.py @@ -0,0 +1,23 @@ +import argparse +import numpy as np + + +def save_csv_pitch(pitch, path): + with open(path, "w", encoding='utf-8') as pitch_file: + for i in range(len(pitch)): + t = i * 10 + minute = t // 60000 + seconds = (t - minute * 60000) // 1000 + millisecond = t % 1000 + print( + f"{minute}m {seconds}s {millisecond:3d},{int(pitch[i])}", file=pitch_file) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) # pit for train + args = parser.parse_args() + print(args.pit) + + pitch = np.load(args.pit) + save_csv_pitch(pitch, 'pitch_debug.csv') diff --git a/pitch/inference.py b/pitch/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..4b204f09b4617fa9bade30d70ed9b7bf5c846b32 --- /dev/null +++ b/pitch/inference.py @@ -0,0 +1,134 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import torch +import librosa +import argparse +import numpy as np +import crepe + + +def move_average(a, n, mode="same"): + return (np.convolve(a, np.ones((n,))/n, mode=mode)) + + +def compute_f0_mouth(path, device): + # pip install praat-parselmouth + import parselmouth + + x, sr = librosa.load(path, sr=16000) + assert sr == 16000 + lpad = 1024 // 160 + rpad = lpad + f0 = parselmouth.Sound(x, sr).to_pitch_ac( + time_step=160 / sr, + voicing_threshold=0.5, + pitch_floor=30, + pitch_ceiling=1000).selected_array['frequency'] + f0 = np.pad(f0, [[lpad, rpad]], mode='constant') + return f0 + + +def compute_f0_salience(filename, device): + from pitch.core.salience import salience + audio, sr = librosa.load(filename, sr=16000) + assert sr == 16000 + f0, t, s = salience( + audio, + Fs=sr, + H=320, + N=2048, + F_min=45.0, + F_max=1760.0) + f0 = np.repeat(f0, 2, -1) # 320 -> 160 * 2 + f0 = move_average(f0, 3) + return f0 + + +def compute_f0_voice(filename, device): + audio, sr = librosa.load(filename, sr=16000) + assert sr == 16000 + audio = torch.tensor(np.copy(audio))[None] + audio = audio + torch.randn_like(audio) * 0.001 + # Here we'll use a 10 millisecond hop length + hop_length = 160 + fmin = 50 + fmax = 1000 + model = "full" + batch_size = 512 + pitch = crepe.predict( + audio, + sr, + hop_length, + fmin, + fmax, + model, + batch_size=batch_size, + device=device, + return_periodicity=False, + ) + pitch = crepe.filter.mean(pitch, 3) + pitch = pitch.squeeze(0) + 
return pitch + + +def compute_f0_sing(filename, device): + audio, sr = librosa.load(filename, sr=16000) + assert sr == 16000 + audio = torch.tensor(np.copy(audio))[None] + audio = audio + torch.randn_like(audio) * 0.001 + # Here we'll use a 20 millisecond hop length + hop_length = 320 + fmin = 50 + fmax = 1000 + model = "full" + batch_size = 512 + pitch = crepe.predict( + audio, + sr, + hop_length, + fmin, + fmax, + model, + batch_size=batch_size, + device=device, + return_periodicity=False, + ) + pitch = np.repeat(pitch, 2, -1) # 320 -> 160 * 2 + pitch = crepe.filter.mean(pitch, 5) + pitch = pitch.squeeze(0) + return pitch + + +def save_csv_pitch(pitch, path): + with open(path, "w", encoding='utf-8') as pitch_file: + for i in range(len(pitch)): + t = i * 10 + minute = t // 60000 + seconds = (t - minute * 60000) // 1000 + millisecond = t % 1000 + print( + f"{minute}m {seconds}s {millisecond:3d},{int(pitch[i])}", file=pitch_file) + + +def load_csv_pitch(path): + pitch = [] + with open(path, "r", encoding='utf-8') as pitch_file: + for line in pitch_file.readlines(): + pit = line.strip().split(",")[-1] + pitch.append(int(pit)) + return pitch + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) # csv for excel + args = parser.parse_args() + print(args.wav) + print(args.pit) + + device = "cuda" if torch.cuda.is_available() else "cpu" + pitch = compute_f0_sing(args.wav, device) + save_csv_pitch(pitch, args.pit) + # tmp = load_csv_pitch(args.pit) + # save_csv_pitch(tmp, "tmp.csv") diff --git a/prepare/preprocess_a.py b/prepare/preprocess_a.py new file mode 100644 index 0000000000000000000000000000000000000000..87d03b5baffc1c6f355bb59dc94e299ac37b2427 --- /dev/null +++ b/prepare/preprocess_a.py @@ -0,0 +1,58 @@ +import os +import librosa +import argparse +import numpy as np +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor, as_completed +from scipy.io import wavfile + + +def resample_wave(wav_in, wav_out, sample_rate): + wav, _ = librosa.load(wav_in, sr=sample_rate) + wav = wav / np.abs(wav).max() * 0.6 + wav = wav / max(0.01, np.max(np.abs(wav))) * 32767 * 0.6 + wavfile.write(wav_out, sample_rate, wav.astype(np.int16)) + + +def process_file(file, wavPath, spks, outPath, sr): + if file.endswith(".wav"): + file = file[:-4] + resample_wave(f"{wavPath}/{spks}/{file}.wav", f"{outPath}/{spks}/{file}.wav", sr) + + +def process_files_with_thread_pool(wavPath, spks, outPath, sr, thread_num=None): + files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] + + with ThreadPoolExecutor(max_workers=thread_num) as executor: + futures = {executor.submit(process_file, file, wavPath, spks, outPath, sr): file for file in files} + + for future in tqdm(as_completed(futures), total=len(futures), desc=f'Processing {sr} {spks}'): + future.result() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-o", "--out", help="out", dest="out", required=True) + parser.add_argument("-s", "--sr", help="sample rate", dest="sr", type=int, required=True) + parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) + + args = parser.parse_args() + print(args.wav) + print(args.out) + print(args.sr) + + os.makedirs(args.out, 
exist_ok=True) + wavPath = args.wav + outPath = args.out + + assert args.sr == 16000 or args.sr == 32000 + + for spks in os.listdir(wavPath): + if os.path.isdir(f"./{wavPath}/{spks}"): + os.makedirs(f"./{outPath}/{spks}", exist_ok=True) + if args.thread_count == 0: + process_num = os.cpu_count() // 2 + 1 + else: + process_num = args.thread_count + process_files_with_thread_pool(wavPath, spks, outPath, args.sr, process_num) diff --git a/prepare/preprocess_cdc.py b/prepare/preprocess_cdc.py new file mode 100644 index 0000000000000000000000000000000000000000..730feaacd24620136e8fb60de7989cffc7f56043 --- /dev/null +++ b/prepare/preprocess_cdc.py @@ -0,0 +1,51 @@ +import os +import argparse +import torch +import torchaudio + +from tqdm import tqdm +from scipy.io.wavfile import read +from scipy.io.wavfile import write +# torch=1.9.0 -> pip install torchaudio==0.9.0 -i https://mirrors.aliyun.com/pypi/simple/ +# this file is for VCTK + + +MAX_WAV_VALUE = 32768.0 + + +def cut_direct_content(iWave, oWave): + source, sr = torchaudio.load(iWave) + stft = torch.stft(source, 1024, 256, 1024, torch.hann_window(1024), return_complex=True) + stft[:, 0, :] = 0 + stft[:, 1, :] = 0 + istft = torch.istft(stft, 1024, 256, 1024, torch.hann_window(1024)) + audio = istft.squeeze() + audio = MAX_WAV_VALUE * audio + audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1) + audio = audio.short() + audio = audio.data.cpu().detach().numpy() + write(oWave, sr, audio) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-i", help="input path", dest="inPath", required=True) + parser.add_argument("-o", help="output path", dest="outPath", required=True) + + args = parser.parse_args() + print(args.inPath) + print(args.outPath) + + os.makedirs(args.outPath, exist_ok=True) + rootPath = args.inPath + outPath = args.outPath + + for spks in os.listdir(rootPath): + if (os.path.isdir(f"./{rootPath}/{spks}")): + os.makedirs(f"./{outPath}/{spks}", exist_ok=True) + + files = [f for f in os.listdir(f"./{rootPath}/{spks}") if f.endswith(".wav")] + for file in tqdm(files, desc=f'Processing cdc {spks}'): + iWave = f"./{rootPath}/{spks}/{file}" + oWave = f"./{outPath}/{spks}/{file}" + cut_direct_content(iWave, oWave) diff --git a/prepare/preprocess_crepe.py b/prepare/preprocess_crepe.py new file mode 100644 index 0000000000000000000000000000000000000000..6f9fda489d8fce1c8ee5d4f44aea7165a0b0534d --- /dev/null +++ b/prepare/preprocess_crepe.py @@ -0,0 +1,69 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import numpy as np +import librosa +import torch +import crepe +import argparse +from tqdm import tqdm + + +def compute_f0(filename, save, device): + audio, sr = librosa.load(filename, sr=16000) + assert sr == 16000 + # Load audio + audio = torch.tensor(np.copy(audio))[None] + audio = audio + torch.randn_like(audio) * 0.001 + # Here we'll use a 10 millisecond hop length + hop_length = 160 + # Provide a sensible frequency range for your domain (upper limit is 2006 Hz) + # This would be a reasonable range for speech + fmin = 50 + fmax = 1000 + # Select a model capacity--one of "tiny" or "full" + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + pitch, periodicity = crepe.predict( + audio, + sr, + hop_length, + fmin, + fmax, + model, + batch_size=batch_size, + device=device, + return_periodicity=True, + ) + # CREPE was not trained on silent audio. 
some error on silent need filter.pitPath + periodicity = crepe.filter.median(periodicity, 7) + pitch = crepe.filter.mean(pitch, 5) + pitch[periodicity < 0.5] = 0 + pitch = pitch.squeeze(0) + np.save(save, pitch, allow_pickle=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) + + args = parser.parse_args() + print(args.wav) + print(args.pit) + + os.makedirs(args.pit, exist_ok=True) + wavPath = args.wav + pitPath = args.pit + + device = "cuda" if torch.cuda.is_available() else "cpu" + + for spks in os.listdir(wavPath): + if os.path.isdir(f"./{wavPath}/{spks}"): + os.makedirs(f"./{pitPath}/{spks}", exist_ok=True) + + files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] + for file in tqdm(files, desc=f'Processing crepe {spks}'): + file = file[:-4] + compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit", device) diff --git a/prepare/preprocess_f0.py b/prepare/preprocess_f0.py new file mode 100644 index 0000000000000000000000000000000000000000..1b6ae384f8511455c660caf9974815f8d781bc8c --- /dev/null +++ b/prepare/preprocess_f0.py @@ -0,0 +1,62 @@ +import os +import numpy as np +import librosa +import pyworld +import argparse +from tqdm import tqdm +from concurrent.futures import ProcessPoolExecutor, as_completed + + +def compute_f0(path, save): + x, sr = librosa.load(path, sr=16000) + assert sr == 16000 + f0, t = pyworld.dio( + x.astype(np.double), + fs=sr, + f0_ceil=900, + frame_period=1000 * 160 / sr, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs=16000) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + np.save(save, f0, allow_pickle=False) + + +def process_file(file, wavPath, spks, pitPath): + if file.endswith(".wav"): + file = file[:-4] + compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit") + + +def process_files_with_process_pool(wavPath, spks, pitPath, process_num=None): + files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] + + with ProcessPoolExecutor(max_workers=process_num) as executor: + futures = {executor.submit(process_file, file, wavPath, spks, pitPath): file for file in files} + + for future in tqdm(as_completed(futures), total=len(futures), desc=f'Processing f0 {spks}'): + future.result() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) + parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) + + args = parser.parse_args() + print(args.wav) + print(args.pit) + + os.makedirs(args.pit, exist_ok=True) + wavPath = args.wav + pitPath = args.pit + + for spks in os.listdir(wavPath): + if os.path.isdir(f"./{wavPath}/{spks}"): + os.makedirs(f"./{pitPath}/{spks}", exist_ok=True) + if args.thread_count == 0: + process_num = os.cpu_count() // 2 + 1 + else: + process_num = args.thread_count + process_files_with_process_pool(wavPath, spks, pitPath, process_num) diff --git a/prepare/preprocess_f0_mouth.py b/prepare/preprocess_f0_mouth.py new file mode 100644 index 0000000000000000000000000000000000000000..0a03ff6e2403dc736beb40829a1da8c416353f00 --- /dev/null +++ b/prepare/preprocess_f0_mouth.py @@ -0,0 +1,62 @@ +import os +import numpy as np 
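+# NOTE: Praat-based F0 extraction via parselmouth; an alternative to the pyworld
+# DIO/StoneMask path in preprocess_f0.py. Both use a 160-sample hop at 16 kHz (10 ms frames).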
+import librosa +import argparse +import parselmouth +# pip install praat-parselmouth +from tqdm import tqdm +from concurrent.futures import ProcessPoolExecutor, as_completed + + +def compute_f0(path, save): + x, sr = librosa.load(path, sr=16000) + assert sr == 16000 + lpad = 1024 // 160 + rpad = lpad + f0 = parselmouth.Sound(x, sr).to_pitch_ac( + time_step=160 / sr, + voicing_threshold=0.5, + pitch_floor=30, + pitch_ceiling=1000).selected_array['frequency'] + f0 = np.pad(f0, [[lpad, rpad]], mode='constant') + np.save(save, f0, allow_pickle=False) + + +def process_file(file, wavPath, spks, pitPath): + if file.endswith(".wav"): + file = file[:-4] + compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit") + + +def process_files_with_process_pool(wavPath, spks, pitPath, process_num=None): + files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] + + with ProcessPoolExecutor(max_workers=process_num) as executor: + futures = {executor.submit(process_file, file, wavPath, spks, pitPath): file for file in files} + + for future in tqdm(as_completed(futures), total=len(futures), desc=f'Processing f0 {spks}'): + future.result() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) + parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) + + args = parser.parse_args() + print(args.wav) + print(args.pit) + + os.makedirs(args.pit, exist_ok=True) + wavPath = args.wav + pitPath = args.pit + + for spks in os.listdir(wavPath): + if os.path.isdir(f"./{wavPath}/{spks}"): + os.makedirs(f"./{pitPath}/{spks}", exist_ok=True) + if args.thread_count == 0: + process_num = os.cpu_count() // 2 + 1 + else: + process_num = args.thread_count + process_files_with_process_pool(wavPath, spks, pitPath, process_num) diff --git a/prepare/preprocess_hubert.py b/prepare/preprocess_hubert.py new file mode 100644 index 0000000000000000000000000000000000000000..dd4265b715877a9b97cb8192a3c0d9c450cb2fbb --- /dev/null +++ b/prepare/preprocess_hubert.py @@ -0,0 +1,58 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import numpy as np +import argparse +import torch +import librosa + +from tqdm import tqdm +from hubert import hubert_model + + +def load_audio(file: str, sr: int = 16000): + x, sr = librosa.load(file, sr=sr) + return x + + +def load_model(path, device): + model = hubert_model.hubert_soft(path) + model.eval() + model.half() + model.to(device) + return model + + +def pred_vec(model, wavPath, vecPath, device): + feats = load_audio(wavPath) + feats = torch.from_numpy(feats).to(device) + feats = feats[None, None, :].half() + with torch.no_grad(): + vec = model.units(feats).squeeze().data.cpu().float().numpy() + # print(vec.shape) # [length, dim=256] hop=320 + np.save(vecPath, vec, allow_pickle=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-v", "--vec", help="vec", dest="vec", required=True) + + args = parser.parse_args() + print(args.wav) + print(args.vec) + os.makedirs(args.vec, exist_ok=True) + + wavPath = args.wav + vecPath = args.vec + + device = "cuda" if torch.cuda.is_available() else "cpu" + hubert = load_model(os.path.join("hubert_pretrain", 
"hubert-soft-0d54a1f4.pt"), device) + + for spks in os.listdir(wavPath): + if os.path.isdir(f"./{wavPath}/{spks}"): + os.makedirs(f"./{vecPath}/{spks}", exist_ok=True) + + files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] + for file in tqdm(files, desc=f'Processing vec {spks}'): + file = file[:-4] + pred_vec(hubert, f"{wavPath}/{spks}/{file}.wav", f"{vecPath}/{spks}/{file}.vec", device) diff --git a/prepare/preprocess_ppg.py b/prepare/preprocess_ppg.py new file mode 100644 index 0000000000000000000000000000000000000000..999bec671c8b044c5d73c53614268ce36530b279 --- /dev/null +++ b/prepare/preprocess_ppg.py @@ -0,0 +1,71 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import numpy as np +import argparse +import torch +import random +from tqdm import tqdm +from whisper.model import Whisper, ModelDimensions +from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram + + +def load_model(path) -> Whisper: + device = "cuda" if torch.cuda.is_available() else "cpu" + checkpoint = torch.load(path, map_location="cpu") + dims = ModelDimensions(**checkpoint["dims"]) + print(dims) + model = Whisper(dims) + del model.decoder + cut = len(model.encoder.blocks) // 4 + cut = -1 * cut + del model.encoder.blocks[cut:] + model.load_state_dict(checkpoint["model_state_dict"], strict=False) + model.eval() + model.half() + model.to(device) + return model + + +def pred_ppg(whisper: Whisper, wavPath, ppgPath): + audio = load_audio(wavPath) + audln = audio.shape[0] + ppgln = audln // 320 + audio = pad_or_trim(audio) + mel = log_mel_spectrogram(audio).half().to(whisper.device) + with torch.no_grad(): + ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() + ppg = ppg[:ppgln,] # [length, dim=1280] + # print(ppg.shape) + np.save(ppgPath, ppg, allow_pickle=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-p", "--ppg", help="ppg", dest="ppg", required=True) + args = parser.parse_args() + print(args.wav) + print(args.ppg) + + os.makedirs(args.ppg, exist_ok=True) + wavPath = args.wav + ppgPath = args.ppg + + whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt")) + spkPaths = os.listdir(wavPath) + random.shuffle(spkPaths) + + for spks in spkPaths: + if os.path.isdir(f"./{wavPath}/{spks}"): + os.makedirs(f"./{ppgPath}/{spks}", exist_ok=True) + + files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] + for file in tqdm(files, desc=f'Processing ppg {spks}'): + if file.endswith(".wav"): + # print(file) + file = file[:-4] + path_wav = f"{wavPath}/{spks}/{file}.wav" + path_ppg = f"{ppgPath}/{spks}/{file}.ppg" + if os.path.isfile(f"{path_ppg}.npy"): + continue + pred_ppg(whisper, path_wav, path_ppg) diff --git a/prepare/preprocess_random.py b/prepare/preprocess_random.py new file mode 100644 index 0000000000000000000000000000000000000000..f84977bb49d090b333a382772374830d5d1318c6 --- /dev/null +++ b/prepare/preprocess_random.py @@ -0,0 +1,23 @@ +import random + + +if __name__ == "__main__": + all_items = [] + fo = open("./files/train_all.txt", "r+", encoding='utf-8') + while (True): + try: + item = fo.readline().strip() + except Exception as e: + print('nothing of except:', e) + break + if (item == None or item == ""): + break + all_items.append(item) + fo.close() + + random.shuffle(all_items) + + fw = open("./files/train_all.txt", "w", encoding="utf-8") + for strs 
in all_items: + print(strs, file=fw) + fw.close() diff --git a/prepare/preprocess_speaker.py b/prepare/preprocess_speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..797b60edbeb16f8a50a1be8bd2095f206bd8875e --- /dev/null +++ b/prepare/preprocess_speaker.py @@ -0,0 +1,103 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import torch +import numpy as np +import argparse + +from tqdm import tqdm +from functools import partial +from argparse import RawTextHelpFormatter +from multiprocessing.pool import ThreadPool + +from speaker.models.lstm import LSTMSpeakerEncoder +from speaker.config import SpeakerEncoderConfig +from speaker.utils.audio import AudioProcessor +from speaker.infer import read_json + + +def get_spk_wavs(dataset_path, output_path): + wav_files = [] + os.makedirs(f"./{output_path}", exist_ok=True) + for spks in os.listdir(dataset_path): + if os.path.isdir(f"./{dataset_path}/{spks}"): + os.makedirs(f"./{output_path}/{spks}", exist_ok=True) + for file in os.listdir(f"./{dataset_path}/{spks}"): + if file.endswith(".wav"): + wav_files.append(f"./{dataset_path}/{spks}/{file}") + elif spks.endswith(".wav"): + wav_files.append(f"./{dataset_path}/{spks}") + return wav_files + + +def process_wav(wav_file, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder): + waveform = speaker_encoder_ap.load_wav( + wav_file, sr=speaker_encoder_ap.sample_rate + ) + spec = speaker_encoder_ap.melspectrogram(waveform) + spec = torch.from_numpy(spec.T) + if args.use_cuda: + spec = spec.cuda() + spec = spec.unsqueeze(0) + embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy() + embed = embed.squeeze() + embed_path = wav_file.replace(dataset_path, output_path) + embed_path = embed_path.replace(".wav", ".spk") + np.save(embed_path, embed, allow_pickle=False) + + +def extract_speaker_embeddings(wav_files, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder, concurrency): + bound_process_wav = partial(process_wav, dataset_path=dataset_path, output_path=output_path, args=args, speaker_encoder_ap=speaker_encoder_ap, speaker_encoder=speaker_encoder) + + with ThreadPool(concurrency) as pool: + list(tqdm(pool.imap(bound_process_wav, wav_files), total=len(wav_files))) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each wav file in a dataset.""", + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("dataset_path", type=str, help="Path to dataset waves.") + parser.add_argument( + "output_path", type=str, help="path for output speaker/speaker_wavs.npy." 
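+        # embeddings are written into a mirrored folder tree as <wav name>.spk.npy (see process_wav)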
+ ) + parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) + parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) + args = parser.parse_args() + dataset_path = args.dataset_path + output_path = args.output_path + thread_count = args.thread_count + # model + args.model_path = os.path.join("speaker_pretrain", "best_model.pth.tar") + args.config_path = os.path.join("speaker_pretrain", "config.json") + # config + config_dict = read_json(args.config_path) + + # model + config = SpeakerEncoderConfig(config_dict) + config.from_dict(config_dict) + + speaker_encoder = LSTMSpeakerEncoder( + config.model_params["input_dim"], + config.model_params["proj_dim"], + config.model_params["lstm_dim"], + config.model_params["num_lstm_layers"], + ) + + speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda) + + # preprocess + speaker_encoder_ap = AudioProcessor(**config.audio) + # normalize the input audio level and trim silences + speaker_encoder_ap.do_sound_norm = True + speaker_encoder_ap.do_trim_silence = True + + wav_files = get_spk_wavs(dataset_path, output_path) + + if thread_count == 0: + process_num = os.cpu_count() + else: + process_num = thread_count + + extract_speaker_embeddings(wav_files, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder, process_num) \ No newline at end of file diff --git a/prepare/preprocess_speaker_ave.py b/prepare/preprocess_speaker_ave.py new file mode 100644 index 0000000000000000000000000000000000000000..9423f61693a91fef7ff9836f89f157856767d924 --- /dev/null +++ b/prepare/preprocess_speaker_ave.py @@ -0,0 +1,54 @@ +import os +import torch +import argparse +import numpy as np +from tqdm import tqdm + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("dataset_speaker", type=str) + parser.add_argument("dataset_singer", type=str) + + data_speaker = parser.parse_args().dataset_speaker + data_singer = parser.parse_args().dataset_singer + + os.makedirs(data_singer, exist_ok=True) + + for speaker in os.listdir(data_speaker): + subfile_num = 0 + speaker_ave = 0 + + for file in tqdm(os.listdir(os.path.join(data_speaker, speaker)), desc=f"average {speaker}"): + if not file.endswith(".npy"): + continue + source_embed = np.load(os.path.join(data_speaker, speaker, file)) + source_embed = source_embed.astype(np.float32) + speaker_ave = speaker_ave + source_embed + subfile_num = subfile_num + 1 + if subfile_num == 0: + continue + speaker_ave = speaker_ave / subfile_num + + np.save(os.path.join(data_singer, f"{speaker}.spk.npy"), + speaker_ave, allow_pickle=False) + + # rewrite timbre code by average, if similarity is larger than cmp_val + rewrite_timbre_code = False + if not rewrite_timbre_code: + continue + cmp_src = torch.FloatTensor(speaker_ave) + cmp_num = 0 + cmp_val = 0.85 + for file in tqdm(os.listdir(os.path.join(data_speaker, speaker)), desc=f"rewrite {speaker}"): + if not file.endswith(".npy"): + continue + cmp_tmp = np.load(os.path.join(data_speaker, speaker, file)) + cmp_tmp = cmp_tmp.astype(np.float32) + cmp_tmp = torch.FloatTensor(cmp_tmp) + cmp_cos = torch.cosine_similarity(cmp_src, cmp_tmp, dim=0) + if (cmp_cos > cmp_val): + cmp_num += 1 + np.save(os.path.join(data_speaker, speaker, file), + speaker_ave, allow_pickle=False) + print(f"rewrite timbre for {speaker} with :", cmp_num) diff --git a/prepare/preprocess_spec.py b/prepare/preprocess_spec.py new file mode 
100644 index 0000000000000000000000000000000000000000..e2eef6e3cde6072342dffe8e39cec82d30d0f80e --- /dev/null +++ b/prepare/preprocess_spec.py @@ -0,0 +1,62 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import torch +import argparse +import multiprocessing +from concurrent.futures import ThreadPoolExecutor +from tqdm import tqdm +from vits import spectrogram +from vits import utils +from omegaconf import OmegaConf + + +def compute_spec(hps, filename, specname): + audio, sampling_rate = utils.load_wav_to_torch(filename) + assert sampling_rate == hps.sampling_rate, f"{sampling_rate} is not {hps.sampling_rate}" + audio_norm = audio / hps.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + n_fft = hps.filter_length + sampling_rate = hps.sampling_rate + hop_size = hps.hop_length + win_size = hps.win_length + spec = spectrogram.spectrogram_torch( + audio_norm, n_fft, sampling_rate, hop_size, win_size, center=False) + spec = torch.squeeze(spec, 0) + torch.save(spec, specname) + + +def process_file(file): + if file.endswith(".wav"): + file = file[:-4] + compute_spec(hps.data, f"{wavPath}/{spks}/{file}.wav", f"{spePath}/{spks}/{file}.pt") + + +def process_files_with_thread_pool(wavPath, spks, thread_num): + files = os.listdir(f"./{wavPath}/{spks}") + with ThreadPoolExecutor(max_workers=thread_num) as executor: + list(tqdm(executor.map(process_file, files), total=len(files), desc=f'Processing spec {spks}')) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-s", "--spe", help="spe", dest="spe", required=True) + parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) + + args = parser.parse_args() + print(args.wav) + print(args.spe) + + os.makedirs(args.spe, exist_ok=True) + wavPath = args.wav + spePath = args.spe + hps = OmegaConf.load("./configs/base.yaml") + + for spks in os.listdir(wavPath): + if os.path.isdir(f"./{wavPath}/{spks}"): + os.makedirs(f"./{spePath}/{spks}", exist_ok=True) + if args.thread_count == 0: + process_num = os.cpu_count() // 2 + 1 + else: + process_num = args.thread_count + process_files_with_thread_pool(wavPath, spks, process_num) diff --git a/prepare/preprocess_train.py b/prepare/preprocess_train.py new file mode 100644 index 0000000000000000000000000000000000000000..985738ec3ef4e2c5d123558ae5a9e400b1cbba85 --- /dev/null +++ b/prepare/preprocess_train.py @@ -0,0 +1,68 @@ +import os +import random + + +def print_error(info): + print(f"\033[31m File isn't existed: {info}\033[0m") + + +IndexBySinger = False +if __name__ == "__main__": + os.makedirs("./files/", exist_ok=True) + + rootPath = "./data_svc/waves-32k/" + all_items = [] + for spks in os.listdir(f"./{rootPath}"): + if not os.path.isdir(f"./{rootPath}/{spks}"): + continue + print(f"./{rootPath}/{spks}") + for file in os.listdir(f"./{rootPath}/{spks}"): + if file.endswith(".wav"): + file = file[:-4] + + if (IndexBySinger == False): + path_spk = f"./data_svc/speaker/{spks}/{file}.spk.npy" + else: + path_spk = f"./data_svc/singer/{spks}.spk.npy" + + path_wave = f"./data_svc/waves-32k/{spks}/{file}.wav" + path_spec = f"./data_svc/specs/{spks}/{file}.pt" + path_pitch = f"./data_svc/pitch/{spks}/{file}.pit.npy" + path_hubert = f"./data_svc/hubert/{spks}/{file}.vec.npy" + path_whisper = f"./data_svc/whisper/{spks}/{file}.ppg.npy" + has_error = 0 + if not 
os.path.isfile(path_spk): + print_error(path_spk) + has_error = 1 + if not os.path.isfile(path_wave): + print_error(path_wave) + has_error = 1 + if not os.path.isfile(path_spec): + print_error(path_spec) + has_error = 1 + if not os.path.isfile(path_pitch): + print_error(path_pitch) + has_error = 1 + if not os.path.isfile(path_hubert): + print_error(path_hubert) + has_error = 1 + if not os.path.isfile(path_whisper): + print_error(path_whisper) + has_error = 1 + if has_error == 0: + all_items.append( + f"{path_wave}|{path_spec}|{path_pitch}|{path_hubert}|{path_whisper}|{path_spk}") + + random.shuffle(all_items) + valids = all_items[:10] + valids.sort() + trains = all_items[10:] + # trains.sort() + fw = open("./files/valid.txt", "w", encoding="utf-8") + for strs in valids: + print(strs, file=fw) + fw.close() + fw = open("./files/train.txt", "w", encoding="utf-8") + for strs in trains: + print(strs, file=fw) + fw.close() diff --git a/prepare/preprocess_trim.py b/prepare/preprocess_trim.py new file mode 100644 index 0000000000000000000000000000000000000000..3856fd413b5a5e781fd342f6cb2c01fd7e48f300 --- /dev/null +++ b/prepare/preprocess_trim.py @@ -0,0 +1,50 @@ +import os +import argparse + +from tqdm import tqdm +from pydub import AudioSegment +from pydub.silence import split_on_silence +from pydub import effects +# this file is for VCTK, use after CDC + + +def trim_silence(iWave, oWave): + try: + audio = AudioSegment.from_wav(iWave) + # audio = effects.normalize(audio, 6)# max - 6dB + audio_chunks = split_on_silence( + audio, + min_silence_len=200, + silence_thresh=-45, + keep_silence=200, + ) + for chunk in audio_chunks[1:]: + audio_chunks[0] += chunk + audio_chunks[0].export(oWave, format="wav") + except Exception as e: + print(str(e)) + print(iWave) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-i", help="input path", dest="inPath", required=True) + parser.add_argument("-o", help="output path", dest="outPath", required=True) + + args = parser.parse_args() + print(args.inPath) + print(args.outPath) + + os.makedirs(args.outPath, exist_ok=True) + rootPath = args.inPath + outPath = args.outPath + + for spks in os.listdir(rootPath): + if (os.path.isdir(f"./{rootPath}/{spks}")): + os.makedirs(f"./{outPath}/{spks}", exist_ok=True) + + files = [f for f in os.listdir(f"./{rootPath}/{spks}") if f.endswith(".wav")] + for file in tqdm(files, desc=f'Processing sil {spks}'): + iWave = f"./{rootPath}/{spks}/{file}" + oWave = f"./{outPath}/{spks}/{file}" + trim_silence(iWave, oWave) diff --git a/prepare/preprocess_zzz.py b/prepare/preprocess_zzz.py new file mode 100644 index 0000000000000000000000000000000000000000..79e62a97271a9c5f14e220063900d48f09207c61 --- /dev/null +++ b/prepare/preprocess_zzz.py @@ -0,0 +1,31 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from tqdm import tqdm +from torch.utils.data import DataLoader +from omegaconf import OmegaConf +from vits.data_utils import TextAudioSpeakerSet +from vits.data_utils import TextAudioSpeakerCollate +from vits.data_utils import DistributedBucketSampler + + +hps = OmegaConf.load("./configs/base.yaml") +dataset = TextAudioSpeakerSet("files/valid.txt", hps.data) + +for _ in tqdm(dataset): + pass + + +sampler = DistributedBucketSampler( + dataset, + 4, + [150, 300, 450], + num_replicas=1, + rank=0, + shuffle=True) +collate_fn = TextAudioSpeakerCollate() +loader = DataLoader(dataset, num_workers=0, shuffle=False, pin_memory=True, + collate_fn=collate_fn, 
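+                    # the bucket sampler groups clips of similar length (boundaries 150/300/450) to reduce padding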
batch_sampler=sampler) + + +for _ in tqdm(loader): + pass diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b6efa293e00ed0fe56a5f32717fb80b2d669e43 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +torch==2.2.2 +torchvision==0.17.2 +torchaudio==2.2.2 +fsspec +pyworld +matplotlib +soundfile +scikit-learn +scipy +tensorboard +transformers +tqdm +librosa +omegaconf +ruamel.yaml +resampy +numpy==1.24 +chardet +faiss-cpu==1.7.4 diff --git a/speaker/README.md b/speaker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b6f541f884f6165a37540cc7fae4df7bf2fa2ac7 --- /dev/null +++ b/speaker/README.md @@ -0,0 +1,18 @@ +### Speaker Encoder + +This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. + +With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. + +Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). + +![](umap.png) + +Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. + +To run the code, you need to follow the same flow as in TTS. + +- Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. +- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` +- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. 
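+  A minimal sketch (the file path below is illustrative) for sanity-checking one generated embedding:
+  ```python
+  import numpy as np
+
+  emb = np.load("output_path/speaker_1/utt_001.npy")
+  print(emb.shape)  # expect (256,) with the default LSTM config (proj_dim=256)
+  ```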
+- Watch training on Tensorboard as in TTS diff --git a/speaker/__init__.py b/speaker/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speaker/config.py b/speaker/config.py new file mode 100644 index 0000000000000000000000000000000000000000..7172ee231f9aaf3d9aa21e7244a1e6b48ebaad39 --- /dev/null +++ b/speaker/config.py @@ -0,0 +1,64 @@ +from dataclasses import asdict, dataclass, field +from typing import Dict, List + +from .utils.coqpit import MISSING +from .utils.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig + + +@dataclass +class SpeakerEncoderConfig(BaseTrainingConfig): + """Defines parameters for Speaker Encoder model.""" + + model: str = "speaker_encoder" + audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) + datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + # model params + model_params: Dict = field( + default_factory=lambda: { + "model_name": "lstm", + "input_dim": 80, + "proj_dim": 256, + "lstm_dim": 768, + "num_lstm_layers": 3, + "use_lstm_with_projection": True, + } + ) + + audio_augmentation: Dict = field(default_factory=lambda: {}) + + storage: Dict = field( + default_factory=lambda: { + "sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 15, # the size of the in-memory storage with respect to a single batch + } + ) + + # training params + max_train_step: int = 1000000 # end training when number of training steps reaches this value. + loss: str = "angleproto" + grad_clip: float = 3.0 + lr: float = 0.0001 + lr_decay: bool = False + warmup_steps: int = 4000 + wd: float = 1e-6 + + # logging params + tb_model_param_stats: bool = False + steps_plot_stats: int = 10 + checkpoint: bool = True + save_step: int = 1000 + print_step: int = 20 + + # data loader + num_speakers_in_batch: int = MISSING + num_utters_per_speaker: int = MISSING + num_loader_workers: int = MISSING + skip_speakers: bool = False + voice_len: float = 1.6 + + def check_values(self): + super().check_values() + c = asdict(self) + assert ( + c["model_params"]["input_dim"] == self.audio.num_mels + ), " [!] model input dimendion must be equal to melspectrogram dimension." diff --git a/speaker/infer.py b/speaker/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..b69b2ee6d0c1f00492e50fc11411cf6e245a18e8 --- /dev/null +++ b/speaker/infer.py @@ -0,0 +1,108 @@ +import re +import json +import fsspec +import torch +import numpy as np +import argparse + +from argparse import RawTextHelpFormatter +from .models.lstm import LSTMSpeakerEncoder +from .config import SpeakerEncoderConfig +from .utils.audio import AudioProcessor + + +def read_json(json_path): + config_dict = {} + try: + with fsspec.open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + except json.decoder.JSONDecodeError: + # backwards compat. 
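+        # older Coqui/Mozilla-TTS configs may contain //-style comments and escaped
+        # newlines, which json.load rejects, so fall back to the lenient parser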
+ data = read_json_with_comments(json_path) + config_dict.update(data) + return config_dict + + +def read_json_with_comments(json_path): + """for backward compat.""" + # fallback to json + with fsspec.open(json_path, "r", encoding="utf-8") as f: + input_str = f.read() + # handle comments + input_str = re.sub(r"\\\n", "", input_str) + input_str = re.sub(r"//.*\n", "\n", input_str) + data = json.loads(input_str) + return data + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each wav file in a dataset.""", + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("model_path", type=str, help="Path to model checkpoint file.") + parser.add_argument( + "config_path", + type=str, + help="Path to model config file.", + ) + + parser.add_argument("-s", "--source", help="input wave", dest="source") + parser.add_argument( + "-t", "--target", help="output 256d speaker embeddimg", dest="target" + ) + + parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) + parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + + args = parser.parse_args() + source_file = args.source + target_file = args.target + + # config + config_dict = read_json(args.config_path) + # print(config_dict) + + # model + config = SpeakerEncoderConfig(config_dict) + config.from_dict(config_dict) + + speaker_encoder = LSTMSpeakerEncoder( + config.model_params["input_dim"], + config.model_params["proj_dim"], + config.model_params["lstm_dim"], + config.model_params["num_lstm_layers"], + ) + + speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda) + + # preprocess + speaker_encoder_ap = AudioProcessor(**config.audio) + # normalize the input audio level and trim silences + speaker_encoder_ap.do_sound_norm = True + speaker_encoder_ap.do_trim_silence = True + + # compute speaker embeddings + + # extract the embedding + waveform = speaker_encoder_ap.load_wav( + source_file, sr=speaker_encoder_ap.sample_rate + ) + spec = speaker_encoder_ap.melspectrogram(waveform) + spec = torch.from_numpy(spec.T) + if args.use_cuda: + spec = spec.cuda() + spec = spec.unsqueeze(0) + embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy() + embed = embed.squeeze() + # print(embed) + # print(embed.size) + np.save(target_file, embed, allow_pickle=False) + + + if hasattr(speaker_encoder, 'module'): + state_dict = speaker_encoder.module.state_dict() + else: + state_dict = speaker_encoder.state_dict() + torch.save({'model': state_dict}, "model_small.pth") diff --git a/speaker/models/__init__.py b/speaker/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speaker/models/lstm.py b/speaker/models/lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..45e8ccefb76f5b0200f7f2d8392c87624abb4965 --- /dev/null +++ b/speaker/models/lstm.py @@ -0,0 +1,131 @@ +import numpy as np +import torch +from torch import nn + +from ..utils.io import load_fsspec + + +class LSTMWithProjection(nn.Module): + def __init__(self, input_size, hidden_size, proj_size): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.proj_size = proj_size + self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) + self.linear = nn.Linear(hidden_size, proj_size, bias=False) + + def forward(self, x): + self.lstm.flatten_parameters() + o, (_, _) = self.lstm(x) + return self.linear(o) + + 
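+# Plain stacked-LSTM variant: no per-layer projection; the final hidden state is
+# mapped to the embedding size by a ReLU-activated linear layer.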
+class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) + + +class LSTMSpeakerEncoder(nn.Module): + def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): + super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection + layers = [] + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) + + self._init_layers() + + def _init_layers(self): + for name, param in self.layers.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0.0) + elif "weight" in name: + nn.init.xavier_normal_(param) + + def forward(self, x): + # TODO: implement state passing for lstms + d = self.layers(x) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d + + @torch.no_grad() + def inference(self, x): + d = self.layers.forward(x) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d + + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + max_len = x.shape[1] + + if max_len < num_frames: + num_frames = max_len + + offsets = np.linspace(0, max_len - num_frames, num=num_eval) + + frames_batch = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset + num_frames) + frames = x[:, offset:end_offset] + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.inference(frames_batch) + + if return_mean: + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + + return embeddings + + def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): + """ + Generate embeddings for a batch of utterances + x: BxTxD + """ + num_overlap = num_frames * overlap + max_len = x.shape[1] + embed = None + num_iters = seq_lens / (num_frames - num_overlap) + cur_iter = 0 + for offset in range(0, max_len, num_frames - num_overlap): + cur_iter += 1 + end_offset = min(x.shape[1], offset + num_frames) + frames = x[:, offset:end_offset] + if embed is None: + embed = self.inference(frames) + else: + embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :]) + return embed / num_iters + + # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if use_cuda: + self.cuda() + if eval: + self.eval() + assert not self.training diff --git a/speaker/models/resnet.py b/speaker/models/resnet.py new file mode 100644 index 
0000000000000000000000000000000000000000..fcc850d7b87e03c2490e6b88232d9a0f668586ad --- /dev/null +++ b/speaker/models/resnet.py @@ -0,0 +1,212 @@ +import numpy as np +import torch +from torch import nn + +from TTS.utils.io import load_fsspec + + +class SELayer(nn.Module): + def __init__(self, channel, reduction=8): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid(), + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + + +class SEBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): + super(SEBasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.se = SELayer(planes, reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + return out + + +class ResNetSpeakerEncoder(nn.Module): + """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153 + Adapted from: https://github.com/clovaai/voxceleb_trainer + """ + + # pylint: disable=W0102 + def __init__( + self, + input_dim=64, + proj_dim=512, + layers=[3, 4, 6, 3], + num_filters=[32, 64, 128, 256], + encoder_type="ASP", + log_input=False, + ): + super(ResNetSpeakerEncoder, self).__init__() + + self.encoder_type = encoder_type + self.input_dim = input_dim + self.log_input = log_input + self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) + self.relu = nn.ReLU(inplace=True) + self.bn1 = nn.BatchNorm2d(num_filters[0]) + + self.inplanes = num_filters[0] + self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0]) + self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2)) + self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2)) + self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) + + self.instancenorm = nn.InstanceNorm1d(input_dim) + + outmap_size = int(self.input_dim / 8) + + self.attention = nn.Sequential( + nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), + nn.ReLU(), + nn.BatchNorm1d(128), + nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), + nn.Softmax(dim=2), + ) + + if self.encoder_type == "SAP": + out_dim = num_filters[3] * outmap_size + elif self.encoder_type == "ASP": + out_dim = num_filters[3] * outmap_size * 2 + else: + raise ValueError("Undefined encoder") + + self.fc = nn.Linear(out_dim, proj_dim) + + self._init_layers() + + def _init_layers(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def create_layer(self, block, 
planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + # pylint: disable=R0201 + def new_parameter(self, *size): + out = nn.Parameter(torch.FloatTensor(*size)) + nn.init.xavier_normal_(out) + return out + + def forward(self, x, l2_norm=False): + x = x.transpose(1, 2) + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if self.log_input: + x = (x + 1e-6).log() + x = self.instancenorm(x).unsqueeze(1) + + x = self.conv1(x) + x = self.relu(x) + x = self.bn1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = x.reshape(x.size()[0], -1, x.size()[-1]) + + w = self.attention(x) + + if self.encoder_type == "SAP": + x = torch.sum(x * w, dim=2) + elif self.encoder_type == "ASP": + mu = torch.sum(x * w, dim=2) + sg = torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) + x = torch.cat((mu, sg), 1) + + x = x.view(x.size()[0], -1) + x = self.fc(x) + + if l2_norm: + x = torch.nn.functional.normalize(x, p=2, dim=1) + return x + + @torch.no_grad() + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + max_len = x.shape[1] + + if max_len < num_frames: + num_frames = max_len + + offsets = np.linspace(0, max_len - num_frames, num=num_eval) + + frames_batch = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset + num_frames) + frames = x[:, offset:end_offset] + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.forward(frames_batch, l2_norm=True) + + if return_mean: + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + + return embeddings + + def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if use_cuda: + self.cuda() + if eval: + self.eval() + assert not self.training diff --git a/speaker/umap.png b/speaker/umap.png new file mode 100644 index 0000000000000000000000000000000000000000..ca8aefeac8cbe616983b35e968c9c9133eb41ede Binary files /dev/null and b/speaker/umap.png differ diff --git a/speaker/utils/__init__.py b/speaker/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speaker/utils/audio.py b/speaker/utils/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c9627e93cf7ba864532144bb4522cf575d3d2b --- /dev/null +++ b/speaker/utils/audio.py @@ -0,0 +1,822 @@ +from typing import Dict, Tuple + +import librosa +import numpy as np +import pyworld as pw +import scipy.io.wavfile +import scipy.signal +import soundfile as sf +import torch +from torch import nn + +class StandardScaler: + """StandardScaler for mean-scale normalization with the given mean and scale values.""" + + def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None: + self.mean_ = mean + self.scale_ = scale + + def 
set_stats(self, mean, scale): + self.mean_ = mean + self.scale_ = scale + + def reset_stats(self): + delattr(self, "mean_") + delattr(self, "scale_") + + def transform(self, X): + X = np.asarray(X) + X -= self.mean_ + X /= self.scale_ + return X + + def inverse_transform(self, X): + X = np.asarray(X) + X *= self.scale_ + X += self.mean_ + return X + +class TorchSTFT(nn.Module): # pylint: disable=abstract-method + """Some of the audio processing funtions using Torch for faster batch processing. + + TODO: Merge this with audio.py + """ + + def __init__( + self, + n_fft, + hop_length, + win_length, + pad_wav=False, + window="hann_window", + sample_rate=None, + mel_fmin=0, + mel_fmax=None, + n_mels=80, + use_mel=False, + do_amp_to_db=False, + spec_gain=1.0, + ): + super().__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.pad_wav = pad_wav + self.sample_rate = sample_rate + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.n_mels = n_mels + self.use_mel = use_mel + self.do_amp_to_db = do_amp_to_db + self.spec_gain = spec_gain + self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) + self.mel_basis = None + if use_mel: + self._build_mel_basis() + + def __call__(self, x): + """Compute spectrogram frames by torch based stft. + + Args: + x (Tensor): input waveform + + Returns: + Tensor: spectrogram frames. + + Shapes: + x: [B x T] or [:math:`[B, 1, T]`] + """ + if x.ndim == 2: + x = x.unsqueeze(1) + if self.pad_wav: + padding = int((self.n_fft - self.hop_length) / 2) + x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") + # B x D x T x 2 + o = torch.stft( + x.squeeze(1), + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=False, + onesided=True, + return_complex=False, + ) + M = o[:, :, :, 0] + P = o[:, :, :, 1] + S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) + if self.use_mel: + S = torch.matmul(self.mel_basis.to(x), S) + if self.do_amp_to_db: + S = self._amp_to_db(S, spec_gain=self.spec_gain) + return S + + def _build_mel_basis(self): + mel_basis = librosa.filters.mel( + sr=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax + ) + self.mel_basis = torch.from_numpy(mel_basis).float() + + @staticmethod + def _amp_to_db(x, spec_gain=1.0): + return torch.log(torch.clamp(x, min=1e-5) * spec_gain) + + @staticmethod + def _db_to_amp(x, spec_gain=1.0): + return torch.exp(x) / spec_gain + + +# pylint: disable=too-many-public-methods +class AudioProcessor(object): + """Audio Processor for TTS used by all the data pipelines. + + Note: + All the class arguments are set to default values to enable a flexible initialization + of the class with the model config. They are not meaningful for all the arguments. + + Args: + sample_rate (int, optional): + target audio sampling rate. Defaults to None. + + resample (bool, optional): + enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False. + + num_mels (int, optional): + number of melspectrogram dimensions. Defaults to None. + + log_func (int, optional): + log exponent used for converting spectrogram aplitude to DB. + + min_level_db (int, optional): + minimum db threshold for the computed melspectrograms. Defaults to None. + + frame_shift_ms (int, optional): + milliseconds of frames between STFT columns. Defaults to None. 
+ + frame_length_ms (int, optional): + milliseconds of STFT window length. Defaults to None. + + hop_length (int, optional): + number of audio samples between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None. + + win_length (int, optional): + STFT window length in samples. Used if ```frame_length_ms``` is None. Defaults to None. + + ref_level_db (int, optional): + reference dB level to avoid background noise. In general, levels below 20 dB correspond to air noise. Defaults to None. + + fft_size (int, optional): + FFT window size for STFT. Defaults to 1024. + + power (int, optional): + Exponent value applied to the spectrogram before GriffinLim. Defaults to None. + + preemphasis (float, optional): + Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0. + + signal_norm (bool, optional): + enable/disable signal normalization. Defaults to None. + + symmetric_norm (bool, optional): + enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else [0, k]. Defaults to None. + + max_norm (float, optional): + ```k``` defining the normalization range. Defaults to None. + + mel_fmin (int, optional): + minimum filter frequency for computing melspectrograms. Defaults to None. + + mel_fmax (int, optional): + maximum filter frequency for computing melspectrograms. Defaults to None. + + spec_gain (int, optional): + gain applied when converting amplitude to DB. Defaults to 20. + + stft_pad_mode (str, optional): + Padding mode for STFT. Defaults to 'reflect'. + + clip_norm (bool, optional): + enable/disable clipping the out-of-range values in the normalized audio signal. Defaults to True. + + griffin_lim_iters (int, optional): + Number of GriffinLim iterations. Defaults to None. + + do_trim_silence (bool, optional): + enable/disable silence trimming when loading the audio signal. Defaults to False. + + trim_db (int, optional): + DB threshold used for silence trimming. Defaults to 60. + + do_sound_norm (bool, optional): + enable/disable normalizing the input audio level (distinct from spectrogram ```signal_norm```). Defaults to False. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True. + + do_amp_to_db_mel (bool, optional): + enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + + stats_path (str, optional): + Path to the computed stats file. Defaults to None. + + verbose (bool, optional): + enable/disable logging. Defaults to True. 
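+
+        Example (illustrative values, not the project defaults):
+
+            >>> ap = AudioProcessor(sample_rate=16000, num_mels=80, fft_size=1024,
+            ...                     hop_length=256, win_length=1024, min_level_db=-100,
+            ...                     ref_level_db=20, mel_fmin=0, mel_fmax=8000,
+            ...                     signal_norm=False, verbose=False)
+            >>> wav = ap.load_wav("test.wav", sr=ap.sample_rate)
+            >>> mel = ap.melspectrogram(wav)  # shape: (num_mels, n_frames)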
+ + """ + + def __init__( + self, + sample_rate=None, + resample=False, + num_mels=None, + log_func="np.log10", + min_level_db=None, + frame_shift_ms=None, + frame_length_ms=None, + hop_length=None, + win_length=None, + ref_level_db=None, + fft_size=1024, + power=None, + preemphasis=0.0, + signal_norm=None, + symmetric_norm=None, + max_norm=None, + mel_fmin=None, + mel_fmax=None, + spec_gain=20, + stft_pad_mode="reflect", + clip_norm=True, + griffin_lim_iters=None, + do_trim_silence=False, + trim_db=60, + do_sound_norm=False, + do_amp_to_db_linear=True, + do_amp_to_db_mel=True, + stats_path=None, + verbose=True, + **_, + ): + + # setup class attributed + self.sample_rate = sample_rate + self.resample = resample + self.num_mels = num_mels + self.log_func = log_func + self.min_level_db = min_level_db or 0 + self.frame_shift_ms = frame_shift_ms + self.frame_length_ms = frame_length_ms + self.ref_level_db = ref_level_db + self.fft_size = fft_size + self.power = power + self.preemphasis = preemphasis + self.griffin_lim_iters = griffin_lim_iters + self.signal_norm = signal_norm + self.symmetric_norm = symmetric_norm + self.mel_fmin = mel_fmin or 0 + self.mel_fmax = mel_fmax + self.spec_gain = float(spec_gain) + self.stft_pad_mode = stft_pad_mode + self.max_norm = 1.0 if max_norm is None else float(max_norm) + self.clip_norm = clip_norm + self.do_trim_silence = do_trim_silence + self.trim_db = trim_db + self.do_sound_norm = do_sound_norm + self.do_amp_to_db_linear = do_amp_to_db_linear + self.do_amp_to_db_mel = do_amp_to_db_mel + self.stats_path = stats_path + # setup exp_func for db to amp conversion + if log_func == "np.log": + self.base = np.e + elif log_func == "np.log10": + self.base = 10 + else: + raise ValueError(" [!] unknown `log_func` value.") + # setup stft parameters + if hop_length is None: + # compute stft parameters from given time values + self.hop_length, self.win_length = self._stft_parameters() + else: + # use stft parameters from config file + self.hop_length = hop_length + self.win_length = win_length + assert min_level_db != 0.0, " [!] min_level_db is 0" + assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size" + members = vars(self) + if verbose: + print(" > Setting up Audio Processor...") + for key, value in members.items(): + print(" | > {}:{}".format(key, value)) + # create spectrogram utils + self.mel_basis = self._build_mel_basis() + self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) + # setup scaler + if stats_path and signal_norm: + mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path) + self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + self.signal_norm = True + self.max_norm = None + self.clip_norm = None + self.symmetric_norm = None + + ### setting up the parameters ### + def _build_mel_basis( + self, + ) -> np.ndarray: + """Build melspectrogram basis. + + Returns: + np.ndarray: melspectrogram basis. + """ + if self.mel_fmax is not None: + assert self.mel_fmax <= self.sample_rate // 2 + return librosa.filters.mel( + sr=self.sample_rate, n_fft=self.fft_size, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax + ) + + def _stft_parameters( + self, + ) -> Tuple[int, int]: + """Compute the real STFT parameters from the time values. + + Returns: + Tuple[int, int]: hop length and window length for STFT. + """ + factor = self.frame_length_ms / self.frame_shift_ms + assert (factor).is_integer(), " [!] 
frame_shift_ms should divide frame_length_ms" + hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) + win_length = int(hop_length * factor) + return hop_length, win_length + + ### normalization ### + def normalize(self, S: np.ndarray) -> np.ndarray: + """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]` + + Args: + S (np.ndarray): Spectrogram to normalize. + + Raises: + RuntimeError: Mean and variance is computed from incompatible parameters. + + Returns: + np.ndarray: Normalized spectrogram. + """ + # pylint: disable=no-else-return + S = S.copy() + if self.signal_norm: + # mean-var scaling + if hasattr(self, "mel_scaler"): + if S.shape[0] == self.num_mels: + return self.mel_scaler.transform(S.T).T + elif S.shape[0] == self.fft_size / 2: + return self.linear_scaler.transform(S.T).T + else: + raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.") + # range normalization + S -= self.ref_level_db # discard certain range of DB assuming it is air noise + S_norm = (S - self.min_level_db) / (-self.min_level_db) + if self.symmetric_norm: + S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm + if self.clip_norm: + S_norm = np.clip( + S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + ) + return S_norm + else: + S_norm = self.max_norm * S_norm + if self.clip_norm: + S_norm = np.clip(S_norm, 0, self.max_norm) + return S_norm + else: + return S + + def denormalize(self, S: np.ndarray) -> np.ndarray: + """Denormalize spectrogram values. + + Args: + S (np.ndarray): Spectrogram to denormalize. + + Raises: + RuntimeError: Mean and variance are incompatible. + + Returns: + np.ndarray: Denormalized spectrogram. + """ + # pylint: disable=no-else-return + S_denorm = S.copy() + if self.signal_norm: + # mean-var scaling + if hasattr(self, "mel_scaler"): + if S_denorm.shape[0] == self.num_mels: + return self.mel_scaler.inverse_transform(S_denorm.T).T + elif S_denorm.shape[0] == self.fft_size / 2: + return self.linear_scaler.inverse_transform(S_denorm.T).T + else: + raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.") + if self.symmetric_norm: + if self.clip_norm: + S_denorm = np.clip( + S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + ) + S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db + return S_denorm + self.ref_level_db + else: + if self.clip_norm: + S_denorm = np.clip(S_denorm, 0, self.max_norm) + S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db + return S_denorm + self.ref_level_db + else: + return S_denorm + + ### Mean-STD scaling ### + def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]: + """Loading mean and variance statistics from a `npy` file. + + Args: + stats_path (str): Path to the `npy` file containing + + Returns: + Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to + compute them. 
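To make the range-normalization branch above concrete, here is a standalone sketch of the symmetric `[-max_norm, max_norm]` mapping and its inverse, using the defaults that appear later in this diff (`min_level_db=-100`, `ref_level_db=20`, `max_norm=4`); it deliberately ignores the mean-variance scaler path.

```python
import numpy as np

min_level_db, ref_level_db, max_norm = -100.0, 20.0, 4.0  # defaults from BaseAudioConfig below

def normalize_db(S):
    S = S - ref_level_db                       # drop the assumed air-noise reference level
    S = (S - min_level_db) / (-min_level_db)   # map [min_level_db, 0] dB onto [0, 1]
    return np.clip(2 * max_norm * S - max_norm, -max_norm, max_norm)  # symmetric [-k, k]

def denormalize_db(S_norm):
    S = (S_norm + max_norm) * -min_level_db / (2 * max_norm) + min_level_db
    return S + ref_level_db

S_db = np.array([-80.0, -30.0, 0.0])
assert np.allclose(denormalize_db(normalize_db(S_db)), S_db)  # lossless inside the clip range
```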
+ """ + stats = np.load(stats_path, allow_pickle=True).item() # pylint: disable=unexpected-keyword-arg + mel_mean = stats["mel_mean"] + mel_std = stats["mel_std"] + linear_mean = stats["linear_mean"] + linear_std = stats["linear_std"] + stats_config = stats["audio_config"] + # check all audio parameters used for computing stats + skip_parameters = ["griffin_lim_iters", "stats_path", "do_trim_silence", "ref_level_db", "power"] + for key in stats_config.keys(): + if key in skip_parameters: + continue + if key not in ["sample_rate", "trim_db"]: + assert ( + stats_config[key] == self.__dict__[key] + ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + return mel_mean, mel_std, linear_mean, linear_std, stats_config + + # pylint: disable=attribute-defined-outside-init + def setup_scaler( + self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray + ) -> None: + """Initialize scaler objects used in mean-std normalization. + + Args: + mel_mean (np.ndarray): Mean for melspectrograms. + mel_std (np.ndarray): STD for melspectrograms. + linear_mean (np.ndarray): Mean for full scale spectrograms. + linear_std (np.ndarray): STD for full scale spectrograms. + """ + self.mel_scaler = StandardScaler() + self.mel_scaler.set_stats(mel_mean, mel_std) + self.linear_scaler = StandardScaler() + self.linear_scaler.set_stats(linear_mean, linear_std) + + ### DB and AMP conversion ### + # pylint: disable=no-self-use + def _amp_to_db(self, x: np.ndarray) -> np.ndarray: + """Convert amplitude values to decibels. + + Args: + x (np.ndarray): Amplitude spectrogram. + + Returns: + np.ndarray: Decibels spectrogram. + """ + return self.spec_gain * _log(np.maximum(1e-5, x), self.base) + + # pylint: disable=no-self-use + def _db_to_amp(self, x: np.ndarray) -> np.ndarray: + """Convert decibels spectrogram to amplitude spectrogram. + + Args: + x (np.ndarray): Decibels spectrogram. + + Returns: + np.ndarray: Amplitude spectrogram. + """ + return _exp(x / self.spec_gain, self.base) + + ### Preemphasis ### + def apply_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values. + + Args: + x (np.ndarray): Audio signal. + + Raises: + RuntimeError: Preemphasis coeff is set to 0. + + Returns: + np.ndarray: Decorrelated audio signal. + """ + if self.preemphasis == 0: + raise RuntimeError(" [!] Preemphasis is set 0.0.") + return scipy.signal.lfilter([1, -self.preemphasis], [1], x) + + def apply_inv_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Reverse pre-emphasis.""" + if self.preemphasis == 0: + raise RuntimeError(" [!] Preemphasis is set 0.0.") + return scipy.signal.lfilter([1], [1, -self.preemphasis], x) + + ### SPECTROGRAMs ### + def _linear_to_mel(self, spectrogram: np.ndarray) -> np.ndarray: + """Project a full scale spectrogram to a melspectrogram. + + Args: + spectrogram (np.ndarray): Full scale spectrogram. + + Returns: + np.ndarray: Melspectrogram + """ + return np.dot(self.mel_basis, spectrogram) + + def _mel_to_linear(self, mel_spec: np.ndarray) -> np.ndarray: + """Convert a melspectrogram to full scale spectrogram.""" + return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec)) + + def spectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a spectrogram from a waveform. + + Args: + y (np.ndarray): Waveform. + + Returns: + np.ndarray: Spectrogram. 
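The `_amp_to_db`/`_db_to_amp` pair above is a scaled log/exp: with the default `spec_gain=20` and `log_func="np.log10"`, an amplitude of 0.1 maps to -20 dB, and anything below the 1e-5 floor saturates at -100 dB. A small self-contained check:

```python
import numpy as np

spec_gain = 20.0  # default gain for amplitude <-> dB conversion

amp = np.array([1.0, 0.1, 1e-6])
db = spec_gain * np.log10(np.maximum(1e-5, amp))   # [   0., -20., -100.]
back = np.power(10.0, db / spec_gain)              # [ 1.0,  0.1,  1e-5]  (the floor is not invertible)
print(db, back)
```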
+ """ + if self.preemphasis != 0: + D = self._stft(self.apply_preemphasis(y)) + else: + D = self._stft(y) + if self.do_amp_to_db_linear: + S = self._amp_to_db(np.abs(D)) + else: + S = np.abs(D) + return self.normalize(S).astype(np.float32) + + def melspectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a melspectrogram from a waveform.""" + if self.preemphasis != 0: + D = self._stft(self.apply_preemphasis(y)) + else: + D = self._stft(y) + if self.do_amp_to_db_mel: + S = self._amp_to_db(self._linear_to_mel(np.abs(D))) + else: + S = self._linear_to_mel(np.abs(D)) + return self.normalize(S).astype(np.float32) + + def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray: + """Convert a spectrogram to a waveform using Griffi-Lim vocoder.""" + S = self.denormalize(spectrogram) + S = self._db_to_amp(S) + # Reconstruct phase + if self.preemphasis != 0: + return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) + return self._griffin_lim(S ** self.power) + + def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray: + """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" + D = self.denormalize(mel_spectrogram) + S = self._db_to_amp(D) + S = self._mel_to_linear(S) # Convert back to linear + if self.preemphasis != 0: + return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) + return self._griffin_lim(S ** self.power) + + def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray: + """Convert a full scale linear spectrogram output of a network to a melspectrogram. + + Args: + linear_spec (np.ndarray): Normalized full scale linear spectrogram. + + Returns: + np.ndarray: Normalized melspectrogram. + """ + S = self.denormalize(linear_spec) + S = self._db_to_amp(S) + S = self._linear_to_mel(np.abs(S)) + S = self._amp_to_db(S) + mel = self.normalize(S) + return mel + + ### STFT and ISTFT ### + def _stft(self, y: np.ndarray) -> np.ndarray: + """Librosa STFT wrapper. + + Args: + y (np.ndarray): Audio signal. + + Returns: + np.ndarray: Complex number array. + """ + return librosa.stft( + y=y, + n_fft=self.fft_size, + hop_length=self.hop_length, + win_length=self.win_length, + pad_mode=self.stft_pad_mode, + window="hann", + center=True, + ) + + def _istft(self, y: np.ndarray) -> np.ndarray: + """Librosa iSTFT wrapper.""" + return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length) + + def _griffin_lim(self, S): + angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) + S_complex = np.abs(S).astype(np.complex) + y = self._istft(S_complex * angles) + if not np.isfinite(y).all(): + print(" [!] Waveform is not finite everywhere. Skipping the GL.") + return np.array([0.0]) + for _ in range(self.griffin_lim_iters): + angles = np.exp(1j * np.angle(self._stft(y))) + y = self._istft(S_complex * angles) + return y + + def compute_stft_paddings(self, x, pad_sides=1): + """Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding + (first and final frames)""" + assert pad_sides in (1, 2) + pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] + if pad_sides == 1: + return 0, pad + return pad // 2, pad // 2 + pad % 2 + + def compute_f0(self, x: np.ndarray) -> np.ndarray: + """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram. + + Args: + x (np.ndarray): Waveform. + + Returns: + np.ndarray: Pitch. 
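For readers who have not met Griffin-Lim before, the loop in `_griffin_lim` above amounts to: start from random phase, then alternately return to the time domain and re-estimate phase from the STFT while keeping the target magnitude fixed. A minimal, self-contained version of that idea (window and hop values are illustrative):

```python
import numpy as np
import librosa

def griffin_lim(S_mag, n_fft=1024, hop_length=256, n_iter=60):
    """Reconstruct a waveform from a magnitude spectrogram by iterative phase estimation."""
    angles = np.exp(2j * np.pi * np.random.rand(*S_mag.shape))       # random initial phase
    y = librosa.istft(S_mag * angles, hop_length=hop_length, win_length=n_fft)
    for _ in range(n_iter):
        angles = np.exp(1j * np.angle(librosa.stft(y, n_fft=n_fft, hop_length=hop_length)))
        y = librosa.istft(S_mag * angles, hop_length=hop_length, win_length=n_fft)
    return y

# e.g. wav_hat = griffin_lim(np.abs(librosa.stft(wav, n_fft=1024, hop_length=256)))
```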
+ + Examples: + >>> WAV_FILE = filename = librosa.util.example_audio_file() + >>> from TTS.config import BaseAudioConfig + >>> from TTS.utils.audio import AudioProcessor + >>> conf = BaseAudioConfig(mel_fmax=8000) + >>> ap = AudioProcessor(**conf) + >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] + >>> pitch = ap.compute_f0(wav) + """ + f0, t = pw.dio( + x.astype(np.double), + fs=self.sample_rate, + f0_ceil=self.mel_fmax, + frame_period=1000 * self.hop_length / self.sample_rate, + ) + f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) + # pad = int((self.win_length / self.hop_length) / 2) + # f0 = [0.0] * pad + f0 + [0.0] * pad + # f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0) + # f0 = np.array(f0, dtype=np.float32) + + # f01, _, _ = librosa.pyin( + # x, + # fmin=65 if self.mel_fmin == 0 else self.mel_fmin, + # fmax=self.mel_fmax, + # frame_length=self.win_length, + # sr=self.sample_rate, + # fill_na=0.0, + # ) + + # spec = self.melspectrogram(x) + return f0 + + ### Audio Processing ### + def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int: + """Find the last point without silence at the end of a audio signal. + + Args: + wav (np.ndarray): Audio signal. + threshold_db (int, optional): Silence threshold in decibels. Defaults to -40. + min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8. + + Returns: + int: Last point without silence. + """ + window_length = int(self.sample_rate * min_silence_sec) + hop_length = int(window_length / 4) + threshold = self._db_to_amp(threshold_db) + for x in range(hop_length, len(wav) - window_length, hop_length): + if np.max(wav[x : x + window_length]) < threshold: + return x + hop_length + return len(wav) + + def trim_silence(self, wav): + """Trim silent parts with a threshold and 0.01 sec margin""" + margin = int(self.sample_rate * 0.01) + wav = wav[margin:-margin] + return librosa.effects.trim(wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[ + 0 + ] + + @staticmethod + def sound_norm(x: np.ndarray) -> np.ndarray: + """Normalize the volume of an audio signal. + + Args: + x (np.ndarray): Raw waveform. + + Returns: + np.ndarray: Volume normalized waveform. + """ + return x / abs(x).max() * 0.95 + + ### save and load ### + def load_wav(self, filename: str, sr: int = None) -> np.ndarray: + """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. + + Args: + filename (str): Path to the wav file. + sr (int, optional): Sampling rate for resampling. Defaults to None. + + Returns: + np.ndarray: Loaded waveform. + """ + if self.resample: + x, sr = librosa.load(filename, sr=self.sample_rate) + elif sr is None: + x, sr = sf.read(filename) + assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr) + else: + x, sr = librosa.load(filename, sr=sr) + if self.do_trim_silence: + try: + x = self.trim_silence(x) + except ValueError: + print(f" [!] File cannot be trimmed for silence - {filename}") + if self.do_sound_norm: + x = self.sound_norm(x) + return x + + def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: + """Save a waveform to a file using Scipy. + + Args: + wav (np.ndarray): Waveform to save. + path (str): Path to a output file. + sr (int, optional): Sampling rate used for saving to the file. Defaults to None. 
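`compute_f0` above delegates pitch tracking to PyWORLD's DIO estimator refined by StoneMask, with the frame period tied to the mel hop size. A standalone sketch on a synthetic 200 Hz tone (the sample rate and hop size are assumptions):

```python
import numpy as np
import pyworld as pw

sr, hop = 16000, 256
t = np.arange(sr) / sr                       # one second of audio
x = 0.5 * np.sin(2 * np.pi * 200.0 * t)      # synthetic 200 Hz tone

f0, times = pw.dio(x.astype(np.double), fs=sr, frame_period=1000 * hop / sr)
f0 = pw.stonemask(x.astype(np.double), f0, times, sr)   # refine the coarse DIO estimate
print(np.median(f0[f0 > 0]))                 # expect roughly 200 Hz on voiced frames
```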
+ """ + wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) + scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16)) + + @staticmethod + def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray: + mu = 2 ** qc - 1 + # wav_abs = np.minimum(np.abs(wav), 1.0) + signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu) + # Quantize signal to the specified number of levels. + signal = (signal + 1) / 2 * mu + 0.5 + return np.floor( + signal, + ) + + @staticmethod + def mulaw_decode(wav, qc): + """Recovers waveform from quantized values.""" + mu = 2 ** qc - 1 + x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) + return x + + @staticmethod + def encode_16bits(x): + return np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16) + + @staticmethod + def quantize(x: np.ndarray, bits: int) -> np.ndarray: + """Quantize a waveform to a given number of bits. + + Args: + x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`. + bits (int): Number of quantization bits. + + Returns: + np.ndarray: Quantized waveform. + """ + return (x + 1.0) * (2 ** bits - 1) / 2 + + @staticmethod + def dequantize(x, bits): + """Dequantize a waveform from the given number of bits.""" + return 2 * x / (2 ** bits - 1) - 1 + + +def _log(x, base): + if base == 10: + return np.log10(x) + return np.log(x) + + +def _exp(x, base): + if base == 10: + return np.power(10, x) + return np.exp(x) diff --git a/speaker/utils/coqpit.py b/speaker/utils/coqpit.py new file mode 100644 index 0000000000000000000000000000000000000000..e214c8b8a2045701b15e77c5d66012a64f135429 --- /dev/null +++ b/speaker/utils/coqpit.py @@ -0,0 +1,954 @@ +import argparse +import functools +import json +import operator +import os +from collections.abc import MutableMapping +from dataclasses import MISSING as _MISSING +from dataclasses import Field, asdict, dataclass, fields, is_dataclass, replace +from pathlib import Path +from pprint import pprint +from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union, get_type_hints + +T = TypeVar("T") +MISSING: Any = "???" + + +class _NoDefault(Generic[T]): + pass + + +NoDefaultVar = Union[_NoDefault[T], T] +no_default: NoDefaultVar = _NoDefault() + + +def is_primitive_type(arg_type: Any) -> bool: + """Check if the input type is one of `int, float, str, bool`. + + Args: + arg_type (typing.Any): input type to check. + + Returns: + bool: True if input type is one of `int, float, str, bool`. + """ + try: + return isinstance(arg_type(), (int, float, str, bool)) + except (AttributeError, TypeError): + return False + + +def is_list(arg_type: Any) -> bool: + """Check if the input type is `list` + + Args: + arg_type (typing.Any): input type. + + Returns: + bool: True if input type is `list` + """ + try: + return arg_type is list or arg_type is List or arg_type.__origin__ is list or arg_type.__origin__ is List + except AttributeError: + return False + + +def is_dict(arg_type: Any) -> bool: + """Check if the input type is `dict` + + Args: + arg_type (typing.Any): input type. + + Returns: + bool: True if input type is `dict` + """ + try: + return arg_type is dict or arg_type is Dict or arg_type.__origin__ is dict + except AttributeError: + return False + + +def is_union(arg_type: Any) -> bool: + """Check if the input type is `Union`. + + Args: + arg_type (typing.Any): input type. 
+ + Returns: + bool: True if input type is `Union` + """ + try: + return safe_issubclass(arg_type.__origin__, Union) + except AttributeError: + return False + + +def safe_issubclass(cls, classinfo) -> bool: + """Check if the input type is a subclass of the given class. + + Args: + cls (type): input type. + classinfo (type): parent class. + + Returns: + bool: True if the input type is a subclass of the given class + """ + try: + r = issubclass(cls, classinfo) + except Exception: # pylint: disable=broad-except + return cls is classinfo + else: + return r + + +def _coqpit_json_default(obj: Any) -> Any: + if isinstance(obj, Path): + return str(obj) + raise TypeError(f"Can't encode object of type {type(obj).__name__}") + + +def _default_value(x: Field): + """Return the default value of the input Field. + + Args: + x (Field): input Field. + + Returns: + object: default value of the input Field. + """ + if x.default not in (MISSING, _MISSING): + return x.default + if x.default_factory not in (MISSING, _MISSING): + return x.default_factory() + return x.default + + +def _is_optional_field(field) -> bool: + """Check if the input field is optional. + + Args: + field (Field): input Field to check. + + Returns: + bool: True if the input field is optional. + """ + # return isinstance(field.type, _GenericAlias) and type(None) in getattr(field.type, "__args__") + return type(None) in getattr(field.type, "__args__") + + +def my_get_type_hints( + cls, +): + """Custom `get_type_hints` dealing with https://github.com/python/typing/issues/737 + + Returns: + [dataclass]: dataclass to get the type hints of its fields. + """ + r_dict = {} + for base in cls.__class__.__bases__: + if base == object: + break + r_dict.update(my_get_type_hints(base)) + r_dict.update(get_type_hints(cls)) + return r_dict + + +def _serialize(x): + """Pick the right serialization for the datatype of the given input. + + Args: + x (object): input object. + + Returns: + object: serialized object. + """ + if isinstance(x, Path): + return str(x) + if isinstance(x, dict): + return {k: _serialize(v) for k, v in x.items()} + if isinstance(x, list): + return [_serialize(xi) for xi in x] + if isinstance(x, Serializable) or issubclass(type(x), Serializable): + return x.serialize() + if isinstance(x, type) and issubclass(x, Serializable): + return x.serialize(x) + return x + + +def _deserialize_dict(x: Dict) -> Dict: + """Deserialize dict. + + Args: + x (Dict): value to deserialized. + + Returns: + Dict: deserialized dictionary. + """ + out_dict = {} + for k, v in x.items(): + if v is None: # if {'key':None} + out_dict[k] = None + else: + out_dict[k] = _deserialize(v, type(v)) + return out_dict + + +def _deserialize_list(x: List, field_type: Type) -> List: + """Deserialize values for List typed fields. + + Args: + x (List): value to be deserialized + field_type (Type): field type. + + Raises: + ValueError: Coqpit does not support multi type-hinted lists. + + Returns: + [List]: deserialized list. + """ + field_args = None + if hasattr(field_type, "__args__") and field_type.__args__: + field_args = field_type.__args__ + elif hasattr(field_type, "__parameters__") and field_type.__parameters__: + # bandaid for python 3.6 + field_args = field_type.__parameters__ + if field_args: + if len(field_args) > 1: + raise ValueError(" [!] Coqpit does not support multi-type hinted 'List'") + field_arg = field_args[0] + # if field type is TypeVar set the current type by the value's type. 
+ if isinstance(field_arg, TypeVar): + field_arg = type(x) + return [_deserialize(xi, field_arg) for xi in x] + return x + + +def _deserialize_union(x: Any, field_type: Type) -> Any: + """Deserialize values for Union typed fields + + Args: + x (Any): value to be deserialized. + field_type (Type): field type. + + Returns: + [Any]: desrialized value. + """ + for arg in field_type.__args__: + # stop after first matching type in Union + try: + x = _deserialize(x, arg) + break + except ValueError: + pass + return x + + +def _deserialize_primitive_types(x: Union[int, float, str, bool], field_type: Type) -> Union[int, float, str, bool]: + """Deserialize python primitive types (float, int, str, bool). + It handles `inf` values exclusively and keeps them float against int fields since int does not support inf values. + + Args: + x (Union[int, float, str, bool]): value to be deserialized. + field_type (Type): field type. + + Returns: + Union[int, float, str, bool]: deserialized value. + """ + + if isinstance(x, (str, bool)): + return x + if isinstance(x, (int, float)): + if x == float("inf") or x == float("-inf"): + # if value type is inf return regardless. + return x + x = field_type(x) + return x + # TODO: Raise an error when x does not match the types. + return None + + +def _deserialize(x: Any, field_type: Any) -> Any: + """Pick the right desrialization for the given object and the corresponding field type. + + Args: + x (object): object to be deserialized. + field_type (type): expected type after deserialization. + + Returns: + object: deserialized object + + """ + # pylint: disable=too-many-return-statements + if is_dict(field_type): + return _deserialize_dict(x) + if is_list(field_type): + return _deserialize_list(x, field_type) + if is_union(field_type): + return _deserialize_union(x, field_type) + if issubclass(field_type, Serializable): + return field_type.deserialize_immutable(x) + if is_primitive_type(field_type): + return _deserialize_primitive_types(x, field_type) + raise ValueError(f" [!] 
'{type(x)}' value type of '{x}' does not match '{field_type}' field type.") + + +# Recursive setattr (supports dotted attr names) +def rsetattr(obj, attr, val): + def _setitem(obj, attr, val): + return operator.setitem(obj, int(attr), val) + + pre, _, post = attr.rpartition(".") + setfunc = _setitem if post.isnumeric() else setattr + + return setfunc(rgetattr(obj, pre) if pre else obj, post, val) + + +# Recursive getattr (supports dotted attr names) +def rgetattr(obj, attr, *args): + def _getitem(obj, attr): + return operator.getitem(obj, int(attr), *args) + + def _getattr(obj, attr): + getfunc = _getitem if attr.isnumeric() else getattr + return getfunc(obj, attr, *args) + + return functools.reduce(_getattr, [obj] + attr.split(".")) + + +# Recursive setitem (supports dotted attr names) +def rsetitem(obj, attr, val): + pre, _, post = attr.rpartition(".") + return operator.setitem(rgetitem(obj, pre) if pre else obj, post, val) + + +# Recursive getitem (supports dotted attr names) +def rgetitem(obj, attr, *args): + def _getitem(obj, attr): + return operator.getitem(obj, int(attr) if attr.isnumeric() else attr, *args) + + return functools.reduce(_getitem, [obj] + attr.split(".")) + + +@dataclass +class Serializable: + """Gives serialization ability to any inheriting dataclass.""" + + def __post_init__(self): + self._validate_contracts() + for key, value in self.__dict__.items(): + if value is no_default: + raise TypeError(f"__init__ missing 1 required argument: '{key}'") + + def _validate_contracts(self): + dataclass_fields = fields(self) + + for field in dataclass_fields: + + value = getattr(self, field.name) + + if value is None: + if not _is_optional_field(field): + raise TypeError(f"{field.name} is not optional") + + contract = field.metadata.get("contract", None) + + if contract is not None: + if value is not None and not contract(value): + raise ValueError(f"break the contract for {field.name}, {self.__class__.__name__}") + + def validate(self): + """validate if object can serialize / deserialize correctly.""" + self._validate_contracts() + if self != self.__class__.deserialize( # pylint: disable=no-value-for-parameter + json.loads(json.dumps(self.serialize())) + ): + raise ValueError("could not be deserialized with same value") + + def to_dict(self) -> dict: + """Transform serializable object to dict.""" + cls_fields = fields(self) + o = {} + for cls_field in cls_fields: + o[cls_field.name] = getattr(self, cls_field.name) + return o + + def serialize(self) -> dict: + """Serialize object to be json serializable representation.""" + if not is_dataclass(self): + raise TypeError("need to be decorated as dataclass") + + dataclass_fields = fields(self) + + o = {} + + for field in dataclass_fields: + value = getattr(self, field.name) + value = _serialize(value) + o[field.name] = value + return o + + def deserialize(self, data: dict) -> "Serializable": + """Parse input dictionary and desrialize its fields to a dataclass. + + Returns: + self: deserialized `self`. + """ + if not isinstance(data, dict): + raise ValueError() + data = data.copy() + init_kwargs = {} + for field in fields(self): + # if field.name == 'dataset_config': + if field.name not in data: + if field.name in vars(self): + init_kwargs[field.name] = vars(self)[field.name] + continue + raise ValueError(f' [!] 
Missing required field "{field.name}"') + value = data.get(field.name, _default_value(field)) + if value is None: + init_kwargs[field.name] = value + continue + if value == MISSING: + raise ValueError(f"deserialized with unknown value for {field.name} in {self.__name__}") + value = _deserialize(value, field.type) + init_kwargs[field.name] = value + for k, v in init_kwargs.items(): + setattr(self, k, v) + return self + + @classmethod + def deserialize_immutable(cls, data: dict) -> "Serializable": + """Parse input dictionary and desrialize its fields to a dataclass. + + Returns: + Newly created deserialized object. + """ + if not isinstance(data, dict): + raise ValueError() + data = data.copy() + init_kwargs = {} + for field in fields(cls): + # if field.name == 'dataset_config': + if field.name not in data: + if field.name in vars(cls): + init_kwargs[field.name] = vars(cls)[field.name] + continue + # if not in cls and the default value is not Missing use it + default_value = _default_value(field) + if default_value not in (MISSING, _MISSING): + init_kwargs[field.name] = default_value + continue + raise ValueError(f' [!] Missing required field "{field.name}"') + value = data.get(field.name, _default_value(field)) + if value is None: + init_kwargs[field.name] = value + continue + if value == MISSING: + raise ValueError(f"Deserialized with unknown value for {field.name} in {cls.__name__}") + value = _deserialize(value, field.type) + init_kwargs[field.name] = value + return cls(**init_kwargs) + + +# ---------------------------------------------------------------------------- # +# Argument Parsing from `argparse` # +# ---------------------------------------------------------------------------- # + + +def _get_help(field): + try: + field_help = field.metadata["help"] + except KeyError: + field_help = "" + return field_help + + +def _init_argparse( + parser, + field_name, + field_type, + field_default, + field_default_factory, + field_help, + arg_prefix="", + help_prefix="", + relaxed_parser=False, +): + has_default = False + default = None + if field_default: + has_default = True + default = field_default + elif field_default_factory not in (None, _MISSING): + has_default = True + default = field_default_factory() + + if not has_default and not is_primitive_type(field_type) and not is_list(field_type): + # aggregate types (fields with a Coqpit subclass as type) are not supported without None + return parser + arg_prefix = field_name if arg_prefix == "" else f"{arg_prefix}.{field_name}" + help_prefix = field_help if help_prefix == "" else f"{help_prefix} - {field_help}" + if is_dict(field_type): # pylint: disable=no-else-raise + # NOTE: accept any string in json format as input to dict field. + parser.add_argument( + f"--{arg_prefix}", + dest=arg_prefix, + default=json.dumps(field_default) if field_default else None, + type=json.loads, + ) + elif is_list(field_type): + # TODO: We need a more clear help msg for lists. + if hasattr(field_type, "__args__"): # if the list is hinted + if len(field_type.__args__) > 1 and not relaxed_parser: + raise ValueError(" [!] Coqpit does not support multi-type hinted 'List'") + list_field_type = field_type.__args__[0] + else: + raise ValueError(" [!] Coqpit does not support un-hinted 'List'") + + # TODO: handle list of lists + if is_list(list_field_type) and relaxed_parser: + return parser + + if not has_default or field_default_factory is list: + if not is_primitive_type(list_field_type) and not relaxed_parser: + raise NotImplementedError(" [!] 
Empty list with non primitive inner type is currently not supported.") + + # If the list's default value is None, the user can specify the entire list by passing multiple parameters + parser.add_argument( + f"--{arg_prefix}", + nargs="*", + type=list_field_type, + help=f"Coqpit Field: {help_prefix}", + ) + else: + # If a default value is defined, just enable editing the values from argparse + # TODO: allow inserting a new value/obj to the end of the list. + for idx, fv in enumerate(default): + parser = _init_argparse( + parser, + str(idx), + list_field_type, + fv, + field_default_factory, + field_help="", + help_prefix=f"{help_prefix} - ", + arg_prefix=f"{arg_prefix}", + relaxed_parser=relaxed_parser, + ) + elif is_union(field_type): + # TODO: currently I don't know how to handle Union type on argparse + if not relaxed_parser: + raise NotImplementedError( + " [!] Parsing `Union` field from argparse is not yet implemented. Please create an issue." + ) + elif issubclass(field_type, Serializable): + return default.init_argparse( + parser, arg_prefix=arg_prefix, help_prefix=help_prefix, relaxed_parser=relaxed_parser + ) + elif isinstance(field_type(), bool): + + def parse_bool(x): + if x not in ("true", "false"): + raise ValueError(f' [!] Value for boolean field must be either "true" or "false". Got "{x}".') + return x == "true" + + parser.add_argument( + f"--{arg_prefix}", + type=parse_bool, + default=field_default, + help=f"Coqpit Field: {help_prefix}", + metavar="true/false", + ) + elif is_primitive_type(field_type): + parser.add_argument( + f"--{arg_prefix}", + default=field_default, + type=field_type, + help=f"Coqpit Field: {help_prefix}", + ) + else: + if not relaxed_parser: + raise NotImplementedError(f" [!] '{field_type}' is not supported by arg_parser. Please file a bug report.") + return parser + + +# ---------------------------------------------------------------------------- # +# Main Coqpit Class # +# ---------------------------------------------------------------------------- # + + +@dataclass +class Coqpit(Serializable, MutableMapping): + """Coqpit base class to be inherited by any Coqpit dataclasses. + It overrides Python `dict` interface and provides `dict` compatible API. + It also enables serializing/deserializing a dataclass to/from a json file, plus some semi-dynamic type and value check. + Note that it does not support all datatypes and likely to fail in some cases. + """ + + _initialized = False + + def _is_initialized(self): + """Check if Coqpit is initialized. Useful to prevent running some aux functions + at the initialization when no attribute has been defined.""" + return "_initialized" in vars(self) and self._initialized + + def __post_init__(self): + self._initialized = True + try: + self.check_values() + except AttributeError: + pass + + ## `dict` API functions + + def __iter__(self): + return iter(asdict(self)) + + def __len__(self): + return len(fields(self)) + + def __setitem__(self, arg: str, value: Any): + setattr(self, arg, value) + + def __getitem__(self, arg: str): + """Access class attributes with ``[arg]``.""" + return self.__dict__[arg] + + def __delitem__(self, arg: str): + delattr(self, arg) + + def _keytransform(self, key): # pylint: disable=no-self-use + return key + + ## end `dict` API functions + + def __getattribute__(self, arg: str): # pylint: disable=no-self-use + """Check if the mandatory field is defined when accessing it.""" + value = super().__getattribute__(arg) + if isinstance(value, str) and value == "???": + raise AttributeError(f" [!] 
MISSING field {arg} must be defined.") + return value + + def __contains__(self, arg: str): + return arg in self.to_dict() + + def get(self, key: str, default: Any = None): + if self.has(key): + return asdict(self)[key] + return default + + def items(self): + return asdict(self).items() + + def merge(self, coqpits: Union["Coqpit", List["Coqpit"]]): + """Merge a coqpit instance or a list of coqpit instances to self. + Note that it does not pass the fields and overrides attributes with + the last Coqpit instance in the given List. + TODO: find a way to merge instances with all the class internals. + + Args: + coqpits (Union[Coqpit, List[Coqpit]]): coqpit instance or list of instances to be merged. + """ + + def _merge(coqpit): + self.__dict__.update(coqpit.__dict__) + self.__annotations__.update(coqpit.__annotations__) + self.__dataclass_fields__.update(coqpit.__dataclass_fields__) + + if isinstance(coqpits, list): + for coqpit in coqpits: + _merge(coqpit) + else: + _merge(coqpits) + + def check_values(self): + pass + + def has(self, arg: str) -> bool: + return arg in vars(self) + + def copy(self): + return replace(self) + + def update(self, new: dict, allow_new=False) -> None: + """Update Coqpit fields by the input ```dict```. + + Args: + new (dict): dictionary with new values. + allow_new (bool, optional): allow new fields to add. Defaults to False. + """ + for key, value in new.items(): + if allow_new: + setattr(self, key, value) + else: + if hasattr(self, key): + setattr(self, key, value) + else: + raise KeyError(f" [!] No key - {key}") + + def pprint(self) -> None: + """Print Coqpit fields in a format.""" + pprint(asdict(self)) + + def to_dict(self) -> dict: + # return asdict(self) + return self.serialize() + + def from_dict(self, data: dict) -> None: + self = self.deserialize(data) # pylint: disable=self-cls-assignment + + @classmethod + def new_from_dict(cls: Serializable, data: dict) -> "Coqpit": + return cls.deserialize_immutable(data) + + def to_json(self) -> str: + """Returns a JSON string representation.""" + return json.dumps(asdict(self), indent=4, default=_coqpit_json_default) + + def save_json(self, file_name: str) -> None: + """Save Coqpit to a json file. + + Args: + file_name (str): path to the output json file. + """ + with open(file_name, "w", encoding="utf8") as f: + json.dump(asdict(self), f, indent=4) + + def load_json(self, file_name: str) -> None: + """Load a json file and update matching config fields with type checking. + Non-matching parameters in the json file are ignored. + + Args: + file_name (str): path to the json file. + + Returns: + Coqpit: new Coqpit with updated config fields. + """ + with open(file_name, "r", encoding="utf8") as f: + input_str = f.read() + dump_dict = json.loads(input_str) + # TODO: this looks stupid 💆 + self = self.deserialize(dump_dict) # pylint: disable=self-cls-assignment + self.check_values() + + @classmethod + def init_from_argparse( + cls, args: Optional[Union[argparse.Namespace, List[str]]] = None, arg_prefix: str = "coqpit" + ) -> "Coqpit": + """Create a new Coqpit instance from argparse input. + + Args: + args (namespace or list of str, optional): parsed argparse.Namespace or list of command line parameters. If unspecified will use a newly created parser with ```init_argparse()```. + arg_prefix: prefix to add to CLI parameters. Gets forwarded to ```init_argparse``` when ```args``` is not passed. 
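Since `Coqpit` is the base class for every config dataclass added in this diff, a short usage sketch may help; the `MyConfig` class and its fields are purely illustrative, and the import assumes the repository root is on `sys.path`.

```python
from dataclasses import dataclass

from speaker.utils.coqpit import Coqpit

@dataclass
class MyConfig(Coqpit):            # hypothetical config for illustration
    lr: float = 1e-3
    batch_size: int = 32
    run_name: str = "baseline"

c = MyConfig()
c.update({"lr": 5e-4})                    # dict-style update of known fields
print(c["batch_size"])                    # dict-style access -> 32
c.parse_args(["--coqpit.lr", "0.01"])     # override fields from CLI-style arguments
print(c.to_json())                        # JSON round trip via the Serializable machinery
```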
+ """ + if not args: + # If args was not specified, parse from sys.argv + parser = cls.init_argparse(cls, arg_prefix=arg_prefix) + args = parser.parse_args() # pylint: disable=E1120, E1111 + if isinstance(args, list): + # If a list was passed in (eg. the second result of `parse_known_args`, run that through argparse first to get a parsed Namespace + parser = cls.init_argparse(cls, arg_prefix=arg_prefix) + args = parser.parse_args(args) # pylint: disable=E1120, E1111 + + # Handle list and object attributes with defaults, which can be modified + # directly (eg. --coqpit.list.0.val_a 1), by constructing real objects + # from defaults and passing those to `cls.__init__` + args_with_lists_processed = {} + class_fields = fields(cls) + for field in class_fields: + has_default = False + default = None + field_default = field.default if field.default is not _MISSING else None + field_default_factory = field.default_factory if field.default_factory is not _MISSING else None + if field_default: + has_default = True + default = field_default + elif field_default_factory: + has_default = True + default = field_default_factory() + + if has_default and (not is_primitive_type(field.type) or is_list(field.type)): + args_with_lists_processed[field.name] = default + + args_dict = vars(args) + for k, v in args_dict.items(): + # Remove argparse prefix (eg. "--coqpit." if present) + if k.startswith(f"{arg_prefix}."): + k = k[len(f"{arg_prefix}.") :] + + rsetitem(args_with_lists_processed, k, v) + + return cls(**args_with_lists_processed) + + def parse_args( + self, args: Optional[Union[argparse.Namespace, List[str]]] = None, arg_prefix: str = "coqpit" + ) -> None: + """Update config values from argparse arguments with some meta-programming ✨. + + Args: + args (namespace or list of str, optional): parsed argparse.Namespace or list of command line parameters. If unspecified will use a newly created parser with ```init_argparse()```. + arg_prefix: prefix to add to CLI parameters. Gets forwarded to ```init_argparse``` when ```args``` is not passed. + """ + if not args: + # If args was not specified, parse from sys.argv + parser = self.init_argparse(arg_prefix=arg_prefix) + args = parser.parse_args() + if isinstance(args, list): + # If a list was passed in (eg. the second result of `parse_known_args`, run that through argparse first to get a parsed Namespace + parser = self.init_argparse(arg_prefix=arg_prefix) + args = parser.parse_args(args) + + args_dict = vars(args) + + for k, v in args_dict.items(): + if k.startswith(f"{arg_prefix}."): + k = k[len(f"{arg_prefix}.") :] + try: + rgetattr(self, k) + except (TypeError, AttributeError) as e: + raise Exception(f" [!] '{k}' not exist to override from argparse.") from e + + rsetattr(self, k, v) + + self.check_values() + + def parse_known_args( + self, + args: Optional[Union[argparse.Namespace, List[str]]] = None, + arg_prefix: str = "coqpit", + relaxed_parser=False, + ) -> List[str]: + """Update config values from argparse arguments. Ignore unknown arguments. + This is analog to argparse.ArgumentParser.parse_known_args (vs parse_args). + + Args: + args (namespace or list of str, optional): parsed argparse.Namespace or list of command line parameters. If unspecified will use a newly created parser with ```init_argparse()```. + arg_prefix: prefix to add to CLI parameters. Gets forwarded to ```init_argparse``` when ```args``` is not passed. + relaxed_parser (bool, optional): If True, do not force all the fields to have compatible types with the argparser. 
Defaults to False. + + Returns: + List of unknown parameters. + """ + if not args: + # If args was not specified, parse from sys.argv + parser = self.init_argparse(arg_prefix=arg_prefix, relaxed_parser=relaxed_parser) + args, unknown = parser.parse_known_args() + if isinstance(args, list): + # If a list was passed in (eg. the second result of `parse_known_args`, run that through argparse first to get a parsed Namespace + parser = self.init_argparse(arg_prefix=arg_prefix, relaxed_parser=relaxed_parser) + args, unknown = parser.parse_known_args(args) + + self.parse_args(args) + return unknown + + def init_argparse( + self, + parser: Optional[argparse.ArgumentParser] = None, + arg_prefix="coqpit", + help_prefix="", + relaxed_parser=False, + ) -> argparse.ArgumentParser: + """Pass Coqpit fields as argparse arguments. This allows to edit values through command-line. + + Args: + parser (argparse.ArgumentParser, optional): argparse.ArgumentParser instance. If unspecified a new one will be created. + arg_prefix (str, optional): Prefix to be used for the argument name. Defaults to 'coqpit'. + help_prefix (str, optional): Prefix to be used for the argument description. Defaults to ''. + relaxed_parser (bool, optional): If True, do not force all the fields to have compatible types with the argparser. Defaults to False. + + Returns: + argparse.ArgumentParser: parser instance with the new arguments. + """ + if not parser: + parser = argparse.ArgumentParser() + class_fields = fields(self) + for field in class_fields: + if field.name in vars(self): + # use the current value of the field + # prevent dropping the current value + field_default = vars(self)[field.name] + else: + # use the default value of the field + field_default = field.default if field.default is not _MISSING else None + field_type = field.type + field_default_factory = field.default_factory + field_help = _get_help(field) + _init_argparse( + parser, + field.name, + field_type, + field_default, + field_default_factory, + field_help, + arg_prefix, + help_prefix, + relaxed_parser, + ) + return parser + + +def check_argument( + name, + c, + is_path: bool = False, + prerequest: str = None, + enum_list: list = None, + max_val: float = None, + min_val: float = None, + restricted: bool = False, + alternative: str = None, + allow_none: bool = True, +) -> None: + """Simple type and value checking for Coqpit. + It is intended to be used under ```__post_init__()``` of config dataclasses. + + Args: + name (str): name of the field to be checked. + c (dict): config dictionary. + is_path (bool, optional): if ```True``` check if the path is exist. Defaults to False. + prerequest (list or str, optional): a list of field name that are prerequestedby the target field name. + Defaults to ```[]```. + enum_list (list, optional): list of possible values for the target field. Defaults to None. + max_val (float, optional): maximum possible value for the target field. Defaults to None. + min_val (float, optional): minimum possible value for the target field. Defaults to None. + restricted (bool, optional): if ```True``` the target field has to be defined. Defaults to False. + alternative (str, optional): a field name superceding the target field. Defaults to None. + allow_none (bool, optional): if ```True``` allow the target field to be ```None```. Defaults to False. 
+ + + Example: + >>> num_mels = 5 + >>> check_argument('num_mels', c, restricted=True, min_val=10, max_val=2056) + >>> fft_size = 128 + >>> check_argument('fft_size', c, restricted=True, min_val=128, max_val=4058) + """ + # check if None allowed + if allow_none and c[name] is None: + return + if not allow_none: + assert c[name] is not None, f" [!] None value is not allowed for {name}." + # check if restricted and it it is check if it exists + if isinstance(restricted, bool) and restricted: + assert name in c.keys(), f" [!] {name} not defined in config.json" + # check prerequest fields are defined + if isinstance(prerequest, list): + assert any( + f not in c.keys() for f in prerequest + ), f" [!] prequested fields {prerequest} for {name} are not defined." + else: + assert ( + prerequest is None or prerequest in c.keys() + ), f" [!] prequested fields {prerequest} for {name} are not defined." + # check if the path exists + if is_path: + assert os.path.exists(c[name]), f' [!] path for {name} ("{c[name]}") does not exist.' + # skip the rest if the alternative field is defined. + if alternative in c.keys() and c[alternative] is not None: + return + # check value constraints + if name in c.keys(): + if max_val is not None: + assert c[name] <= max_val, f" [!] {name} is larger than max value {max_val}" + if min_val is not None: + assert c[name] >= min_val, f" [!] {name} is smaller than min value {min_val}" + if enum_list is not None: + assert c[name].lower() in enum_list, f" [!] {name} is not a valid value" diff --git a/speaker/utils/io.py b/speaker/utils/io.py new file mode 100644 index 0000000000000000000000000000000000000000..1d4c07940d872cb6773d388029595aecf67e4408 --- /dev/null +++ b/speaker/utils/io.py @@ -0,0 +1,198 @@ +import datetime +import json +import os +import pickle as pickle_tts +import shutil +from typing import Any, Callable, Dict, Union + +import fsspec +import torch +from .coqpit import Coqpit + + +class RenamingUnpickler(pickle_tts.Unpickler): + """Overload default pickler to solve module renaming problem""" + + def find_class(self, module, name): + return super().find_class(module.replace("mozilla_voice_tts", "TTS"), name) + + +class AttrDict(dict): + """A custom dict which converts dict keys + to class attributes""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__dict__ = self + + +def copy_model_files(config: Coqpit, out_path, new_fields): + """Copy config.json and other model files to training folder and add + new fields. + + Args: + config (Coqpit): Coqpit config defining the training run. + out_path (str): output path to copy the file. + new_fields (dict): new fileds to be added or edited + in the config file. + """ + copy_config_path = os.path.join(out_path, "config.json") + # add extra information fields + config.update(new_fields, allow_new=True) + # TODO: Revert to config.save_json() once Coqpit supports arbitrary paths. 
+ with fsspec.open(copy_config_path, "w", encoding="utf8") as f: + json.dump(config.to_dict(), f, indent=4) + + # copy model stats file if available + if config.audio.stats_path is not None: + copy_stats_path = os.path.join(out_path, "scale_stats.npy") + filesystem = fsspec.get_mapper(copy_stats_path).fs + if not filesystem.exists(copy_stats_path): + with fsspec.open(config.audio.stats_path, "rb") as source_file: + with fsspec.open(copy_stats_path, "wb") as target_file: + shutil.copyfileobj(source_file, target_file) + + +def load_fsspec( + path: str, + map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, + **kwargs, +) -> Any: + """Like torch.load but can load from other locations (e.g. s3:// , gs://). + + Args: + path: Any path or url supported by fsspec. + map_location: torch.device or str. + **kwargs: Keyword arguments forwarded to torch.load. + + Returns: + Object stored in path. + """ + with fsspec.open(path, "rb") as f: + return torch.load(f, map_location=map_location, **kwargs) + + +def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin + try: + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + except ModuleNotFoundError: + pickle_tts.Unpickler = RenamingUnpickler + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) + model.load_state_dict(state["model"]) + if use_cuda: + model.cuda() + if eval: + model.eval() + return model, state + + +def save_fsspec(state: Any, path: str, **kwargs): + """Like torch.save but can save to other locations (e.g. s3:// , gs://). + + Args: + state: State object to save + path: Any path or url supported by fsspec. + **kwargs: Keyword arguments forwarded to torch.save. 
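The fsspec-backed save/load helpers here simply route `torch.save`/`torch.load` through `fsspec.open`, so the same call works for local paths and remote URLs (`s3://`, `gs://`, ...). A minimal local round trip, assuming the helpers above are in scope and using an illustrative temp path:

```python
import os
import tempfile

import torch

state = {"step": 100, "weights": torch.zeros(3)}
path = os.path.join(tempfile.mkdtemp(), "checkpoint_100.pth.tar")

save_fsspec(state, path)                          # torch.save through fsspec.open(..., "wb")
restored = load_fsspec(path, map_location="cpu")  # torch.load through fsspec.open(..., "rb")
assert restored["step"] == 100
```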
+ """ + with fsspec.open(path, "wb") as f: + torch.save(state, f, **kwargs) + + +def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs): + if hasattr(model, "module"): + model_state = model.module.state_dict() + else: + model_state = model.state_dict() + if isinstance(optimizer, list): + optimizer_state = [optim.state_dict() for optim in optimizer] + else: + optimizer_state = optimizer.state_dict() if optimizer is not None else None + + if isinstance(scaler, list): + scaler_state = [s.state_dict() for s in scaler] + else: + scaler_state = scaler.state_dict() if scaler is not None else None + + if isinstance(config, Coqpit): + config = config.to_dict() + + state = { + "config": config, + "model": model_state, + "optimizer": optimizer_state, + "scaler": scaler_state, + "step": current_step, + "epoch": epoch, + "date": datetime.date.today().strftime("%B %d, %Y"), + } + state.update(kwargs) + save_fsspec(state, output_path) + + +def save_checkpoint( + config, + model, + optimizer, + scaler, + current_step, + epoch, + output_folder, + **kwargs, +): + file_name = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = os.path.join(output_folder, file_name) + print("\n > CHECKPOINT : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + **kwargs, + ) + + +def save_best_model( + current_loss, + best_loss, + config, + model, + optimizer, + scaler, + current_step, + epoch, + out_path, + keep_all_best=False, + keep_after=10000, + **kwargs, +): + if current_loss < best_loss: + best_model_name = f"best_model_{current_step}.pth.tar" + checkpoint_path = os.path.join(out_path, best_model_name) + print(" > BEST MODEL : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + model_loss=current_loss, + **kwargs, + ) + fs = fsspec.get_mapper(out_path).fs + # only delete previous if current is saved successfully + if not keep_all_best or (current_step < keep_after): + model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar")) + for model_name in model_names: + if os.path.basename(model_name) != best_model_name: + fs.rm(model_name) + # create a shortcut which always points to the currently best model + shortcut_name = "best_model.pth.tar" + shortcut_path = os.path.join(out_path, shortcut_name) + fs.copy(checkpoint_path, shortcut_path) + best_loss = current_loss + return best_loss diff --git a/speaker/utils/shared_configs.py b/speaker/utils/shared_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..a89d3a91c31679989b60b657fe8ef6ace5f02552 --- /dev/null +++ b/speaker/utils/shared_configs.py @@ -0,0 +1,342 @@ +from dataclasses import asdict, dataclass +from typing import List + +from .coqpit import Coqpit, check_argument + + +@dataclass +class BaseAudioConfig(Coqpit): + """Base config to definge audio processing parameters. It is used to initialize + ```TTS.utils.audio.AudioProcessor.``` + + Args: + fft_size (int): + Number of STFT frequency levels aka.size of the linear spectogram frame. Defaults to 1024. + + win_length (int): + Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match + ```fft_size```. Defaults to 1024. + + hop_length (int): + Number of audio samples between adjacent STFT columns. Defaults to 1024. + + frame_shift_ms (int): + Set ```hop_length``` based on milliseconds and sampling rate. 
+ + frame_length_ms (int): + Set ```win_length``` based on milliseconds and sampling rate. + + stft_pad_mode (str): + Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'. + + sample_rate (int): + Audio sampling rate. Defaults to 22050. + + resample (bool): + Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```. + + preemphasis (float): + Preemphasis coefficient. Defaults to 0.0. + + ref_level_db (int): 20 + Reference Db level to rebase the audio signal and ignore the level below. 20Db is assumed the sound of air. + Defaults to 20. + + do_sound_norm (bool): + Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False. + + log_func (str): + Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'. + + do_trim_silence (bool): + Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True. + + do_amp_to_db_mel (bool, optional): + enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + + trim_db (int): + Silence threshold used for silence trimming. Defaults to 45. + + power (float): + Exponent used for expanding spectrogra levels before running Griffin Lim. It helps to reduce the + artifacts in the synthesized voice. Defaults to 1.5. + + griffin_lim_iters (int): + Number of Griffing Lim iterations. Defaults to 60. + + num_mels (int): + Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80. + + mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices. + It needs to be adjusted for a dataset. Defaults to 0. + + mel_fmax (float): + Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset. + + spec_gain (int): + Gain applied when converting amplitude to DB. Defaults to 20. + + signal_norm (bool): + enable/disable signal normalization. Defaults to True. + + min_level_db (int): + minimum db threshold for the computed melspectrograms. Defaults to -100. + + symmetric_norm (bool): + enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else + [0, k], Defaults to True. + + max_norm (float): + ```k``` defining the normalization range. Defaults to 4.0. + + clip_norm (bool): + enable/disable clipping the our of range values in the normalized audio signal. Defaults to True. + + stats_path (str): + Path to the computed stats file. Defaults to None. 
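As a concrete instance of the parameters documented above, the values below mirror the speaker-encoder `config.json` added later in this diff (16 kHz audio, 80 mels, 8 kHz mel cutoff); `check_values()` runs the field validation defined just below.

```python
conf = BaseAudioConfig(sample_rate=16000, num_mels=80, mel_fmax=8000.0)
conf.check_values()                      # also runs automatically in __post_init__
print(conf.hop_length, conf.win_length)  # 256 1024 (defaults)
```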
+ """ + + # stft parameters + fft_size: int = 1024 + win_length: int = 1024 + hop_length: int = 256 + frame_shift_ms: int = None + frame_length_ms: int = None + stft_pad_mode: str = "reflect" + # audio processing parameters + sample_rate: int = 22050 + resample: bool = False + preemphasis: float = 0.0 + ref_level_db: int = 20 + do_sound_norm: bool = False + log_func: str = "np.log10" + # silence trimming + do_trim_silence: bool = True + trim_db: int = 45 + # griffin-lim params + power: float = 1.5 + griffin_lim_iters: int = 60 + # mel-spec params + num_mels: int = 80 + mel_fmin: float = 0.0 + mel_fmax: float = None + spec_gain: int = 20 + do_amp_to_db_linear: bool = True + do_amp_to_db_mel: bool = True + # normalization params + signal_norm: bool = True + min_level_db: int = -100 + symmetric_norm: bool = True + max_norm: float = 4.0 + clip_norm: bool = True + stats_path: str = None + + def check_values( + self, + ): + """Check config fields""" + c = asdict(self) + check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056) + check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058) + check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000) + check_argument( + "frame_length_ms", + c, + restricted=True, + min_val=10, + max_val=1000, + alternative="win_length", + ) + check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length") + check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1) + check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10) + check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000) + check_argument("power", c, restricted=True, min_val=1, max_val=5) + check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000) + + # normalization parameters + check_argument("signal_norm", c, restricted=True) + check_argument("symmetric_norm", c, restricted=True) + check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000) + check_argument("clip_norm", c, restricted=True) + check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000) + check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True) + check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100) + check_argument("do_trim_silence", c, restricted=True) + check_argument("trim_db", c, restricted=True) + + +@dataclass +class BaseDatasetConfig(Coqpit): + """Base config for TTS datasets. + + Args: + name (str): + Dataset name that defines the preprocessor in use. Defaults to None. + + path (str): + Root path to the dataset files. Defaults to None. + + meta_file_train (str): + Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets. + Defaults to None. + + unused_speakers (List): + List of speakers IDs that are not used at the training. Default None. + + meta_file_val (str): + Name of the dataset meta file that defines the instances used at validation. + + meta_file_attn_mask (str): + Path to the file that lists the attention mask files used with models that require attention masks to + train the duration predictor. 
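A small sketch of how a dataset entry might be declared and round-tripped through the Coqpit serialization; the dataset name and path are placeholders.

```python
dataset = BaseDatasetConfig(name="vctk", path="/data/VCTK", meta_file_train="metadata.csv")
dataset.check_values()

clone = BaseDatasetConfig.new_from_dict(dataset.to_dict())  # serialize -> deserialize_immutable
assert clone == dataset
```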
+ """ + + name: str = "" + path: str = "" + meta_file_train: str = "" + ununsed_speakers: List[str] = None + meta_file_val: str = "" + meta_file_attn_mask: str = "" + + def check_values( + self, + ): + """Check config fields""" + c = asdict(self) + check_argument("name", c, restricted=True) + check_argument("path", c, restricted=True) + check_argument("meta_file_train", c, restricted=True) + check_argument("meta_file_val", c, restricted=False) + check_argument("meta_file_attn_mask", c, restricted=False) + + +@dataclass +class BaseTrainingConfig(Coqpit): + """Base config to define the basic training parameters that are shared + among all the models. + + Args: + model (str): + Name of the model that is used in the training. + + run_name (str): + Name of the experiment. This prefixes the output folder name. Defaults to `coqui_tts`. + + run_description (str): + Short description of the experiment. + + epochs (int): + Number training epochs. Defaults to 10000. + + batch_size (int): + Training batch size. + + eval_batch_size (int): + Validation batch size. + + mixed_precision (bool): + Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however + it may also cause numerical unstability in some cases. + + scheduler_after_epoch (bool): + If true, run the scheduler step after each epoch else run it after each model step. + + run_eval (bool): + Enable / Disable evaluation (validation) run. Defaults to True. + + test_delay_epochs (int): + Number of epochs before starting to use evaluation runs. Initially, models do not generate meaningful + results, hence waiting for a couple of epochs might save some time. + + print_eval (bool): + Enable / Disable console logging for evalutaion steps. If disabled then it only shows the final values at + the end of the evaluation. Default to ```False```. + + print_step (int): + Number of steps required to print the next training log. + + log_dashboard (str): "tensorboard" or "wandb" + Set the experiment tracking tool + + plot_step (int): + Number of steps required to log training on Tensorboard. + + model_param_stats (bool): + Enable / Disable logging internal model stats for model diagnostic. It might be useful for model debugging. + Defaults to ```False```. + + project_name (str): + Name of the project. Defaults to config.model + + wandb_entity (str): + Name of W&B entity/team. Enables collaboration across a team or org. + + log_model_step (int): + Number of steps required to log a checkpoint as W&B artifact + + save_step (int):ipt + Number of steps required to save the next checkpoint. + + checkpoint (bool): + Enable / Disable checkpointing. + + keep_all_best (bool): + Enable / Disable keeping all the saved best models instead of overwriting the previous one. Defaults + to ```False```. + + keep_after (int): + Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults + to 10000. + + num_loader_workers (int): + Number of workers for training time dataloader. + + num_eval_loader_workers (int): + Number of workers for evaluation time dataloader. + + output_path (str): + Path for training output folder, either a local file path or other + URLs supported by both fsspec and tensorboardX, e.g. GCS (gs://) or + S3 (s3://) paths. The nonexist part of the given path is created + automatically. All training artefacts are saved there. 
+ """ + + model: str = None + run_name: str = "coqui_tts" + run_description: str = "" + # training params + epochs: int = 10000 + batch_size: int = None + eval_batch_size: int = None + mixed_precision: bool = False + scheduler_after_epoch: bool = False + # eval params + run_eval: bool = True + test_delay_epochs: int = 0 + print_eval: bool = False + # logging + dashboard_logger: str = "tensorboard" + print_step: int = 25 + plot_step: int = 100 + model_param_stats: bool = False + project_name: str = None + log_model_step: int = None + wandb_entity: str = None + # checkpointing + save_step: int = 10000 + checkpoint: bool = True + keep_all_best: bool = False + keep_after: int = 10000 + # dataloading + num_loader_workers: int = 0 + num_eval_loader_workers: int = 0 + use_noise_augment: bool = False + # paths + output_path: str = None + # distributed + distributed_backend: str = "nccl" + distributed_url: str = "tcp://localhost:54321" diff --git a/speaker_pretrain/.DS_Store b/speaker_pretrain/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/speaker_pretrain/.DS_Store differ diff --git a/speaker_pretrain/README.md b/speaker_pretrain/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b585941e1639b2dee0ef76773c27a1dad4407e83 --- /dev/null +++ b/speaker_pretrain/README.md @@ -0,0 +1,5 @@ +Path for: + + best_model.pth.tar + + config.json diff --git a/speaker_pretrain/config.json b/speaker_pretrain/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e330aabe8aba41a76af1250f90bb35cbe15d3cdc --- /dev/null +++ b/speaker_pretrain/config.json @@ -0,0 +1,104 @@ +{ + "model_name": "lstm", + "run_name": "mueller91", + "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "do_trim_silence": true, // enable trimming of slience of audio as you load it. 
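The speaker-encoder audio block above sets `"preemphasis": 0.98`. For readers unfamiliar with the term, pre-emphasis is the first-order filter y[n] = x[n] - coef * x[n-1] applied before the STFT, and de-emphasis is its inverse applied after reconstruction. The helper names below are illustrative, not code from this repository.

```python
import numpy as np

def preemphasis(wav: np.ndarray, coef: float = 0.98) -> np.ndarray:
    out = np.copy(wav)
    out[1:] -= coef * wav[:-1]          # y[n] = x[n] - coef * x[n-1]
    return out

def deemphasis(wav: np.ndarray, coef: float = 0.98) -> np.ndarray:
    out = np.zeros_like(wav)
    for n in range(len(wav)):           # x[n] = y[n] + coef * x[n-1]
        out[n] = wav[n] + (coef * out[n - 1] if n > 0 else 0.0)
    return out

x = np.random.randn(16000).astype(np.float32)                  # 1 s of noise at 16 kHz
print(np.allclose(deemphasis(preemphasis(x)), x, atol=1e-4))   # True
```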
LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. + }, + "reinit_layers": [], + "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. + "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "voice_len": 2.0, // size of the voice + "num_utters_per_speaker": 10, // + "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. + "print_step": 20, // Number of steps to log traning on console. + "output_path": "../../OutputsMozilla/checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "model": { + "input_dim": 80, + "proj_dim": 256, + "lstm_dim": 768, + "num_lstm_layers": 3, + "use_lstm_with_projection": true + }, + "storage": { + "sample_from_storage_p": 0.9, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 25, // the size of the in-memory storage with respect to a single batch + "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness + }, + "datasets": + [ + { + "name": "vctk_slim", + "path": "../../../audio-datasets/en/VCTK-Corpus/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-other-500", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb1", + "path": "../../../audio-datasets/en/voxceleb1/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb2", + "path": "../../../audio-datasets/en/voxceleb2/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "common_voice", + "path": "../../../audio-datasets/en/MozillaCommonVoice", + "meta_file_train": "train.tsv", + "meta_file_val": "test.tsv" + } + ] +} \ No newline at end of file diff --git a/svc_eva.py b/svc_eva.py new file mode 100644 index 0000000000000000000000000000000000000000..905d34e7e432299d2aa3bf9a500b178569bbd96f --- /dev/null +++ b/svc_eva.py @@ -0,0 +1,20 @@ +import os +import numpy as np + +# average -> ave -> eva :haha + +eva_conf = { + './configs/singers/singer0022.npy': 0, + './configs/singers/singer0030.npy': 0, + './configs/singers/singer0047.npy': 0.5, + 
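Note that `speaker_pretrain/config.json` uses //-style comments, so a plain `json.load()` will reject it. One hedged way to read it from Python is to strip the comments first; `load_json_with_comments` below is an editor's sketch, not a utility shipped in this diff, and it assumes no '//' occurs inside string values (which holds for this particular file).

```python
import json
import re
from pathlib import Path

def load_json_with_comments(path: str) -> dict:
    """Read a JSON file that contains //-style line comments."""
    text = Path(path).read_text(encoding="utf-8")
    text = re.sub(r"//[^\n]*", "", text)       # drop '//' to end of line
    return json.loads(text)

cfg = load_json_with_comments("speaker_pretrain/config.json")
print(cfg["audio"]["sample_rate"], cfg["model"]["proj_dim"])   # 16000 256
```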
'./configs/singers/singer0051.npy': 0.5, +} + +if __name__ == "__main__": + + eva = np.zeros(256) + for k, v in eva_conf.items(): + assert os.path.isfile(k), k + spk = np.load(k) + eva = eva + spk * v + np.save("eva.spk.npy", eva, allow_pickle=False) diff --git a/svc_export.py b/svc_export.py new file mode 100644 index 0000000000000000000000000000000000000000..13dea0c9a8f9aedfe9cfb77d1d1b81fcb5b922bb --- /dev/null +++ b/svc_export.py @@ -0,0 +1,68 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import torch +import argparse +from omegaconf import OmegaConf + +from vits.models import SynthesizerInfer + + +def load_model(checkpoint_path, model): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + saved_state_dict = checkpoint_dict["model_g"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + new_state_dict[k] = v + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + return model + + +def save_pretrain(checkpoint_path, save_path): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + torch.save({ + 'model_g': checkpoint_dict['model_g'], + 'model_d': checkpoint_dict['model_d'], + }, save_path) + + +def save_model(model, checkpoint_path): + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save({'model_g': state_dict}, checkpoint_path) + + +def main(args): + hp = OmegaConf.load(args.config) + model = SynthesizerInfer( + hp.data.filter_length // 2 + 1, + hp.data.segment_size // hp.data.hop_length, + hp) + + # save_pretrain(args.checkpoint_path, "sovits5.0.pretrain.pth") + load_model(args.checkpoint_path, model) + save_model(model, "sovits5.0.pth") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, required=True, + help="yaml file for config. 
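svc_eva.py above builds a mixed speaker by summing weighted 256-dim embeddings; if the weights do not sum to 1, the resulting `eva.spk.npy` also changes overall scale. A hedged variant that renormalizes the weights is sketched below (`mix_speakers` is an illustrative name, not part of the diff); the output is used the same way, for example passed to svc_inference.py via `--spk eva.spk.npy`.

```python
import os
import numpy as np

def mix_speakers(weighted_npy: dict) -> np.ndarray:
    """Weighted average of 256-dim speaker embeddings, weights renormalized."""
    total = sum(weighted_npy.values())
    assert total > 0, "at least one speaker needs a positive weight"
    mix = np.zeros(256, dtype=np.float32)
    for path, weight in weighted_npy.items():
        assert os.path.isfile(path), path
        mix += np.load(path).astype(np.float32) * (weight / total)
    return mix

mix = mix_speakers({
    "./configs/singers/singer0047.npy": 0.5,
    "./configs/singers/singer0051.npy": 0.5,
})
np.save("eva.spk.npy", mix, allow_pickle=False)
```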
will use hp_str from checkpoint if not given.") + parser.add_argument('-p', '--checkpoint_path', type=str, required=True, + help="path of checkpoint pt file for evaluation") + args = parser.parse_args() + + main(args) diff --git a/svc_inference.py b/svc_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..6e8560eab01bcccbf9118780225a520f9107001f --- /dev/null +++ b/svc_inference.py @@ -0,0 +1,241 @@ +import logging +import sys,os +from pathlib import Path + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import torch +import argparse +import numpy as np + +from omegaconf import OmegaConf +from scipy.io.wavfile import write +from vits.models import SynthesizerInfer +from pitch import load_csv_pitch +from feature_retrieval import IRetrieval, DummyRetrieval, FaissIndexRetrieval, load_retrieve_index + +logger = logging.getLogger(__name__) + + +def get_speaker_name_from_path(speaker_path: Path) -> str: + suffixes = "".join(speaker_path.suffixes) + filename = speaker_path.name + return filename.rstrip(suffixes) + + +def create_retrival(cli_args) -> IRetrieval: + if not cli_args.enable_retrieval: + logger.info("infer without retrival") + return DummyRetrieval() + else: + logger.info("load index retrival model") + + speaker_name = get_speaker_name_from_path(Path(args.spk)) + base_path = Path(".").absolute() / "data_svc" / "indexes" / speaker_name + + if cli_args.hubert_index_path: + hubert_index_filepath = cli_args.hubert_index_path + else: + index_name = f"{cli_args.retrieval_index_prefix}hubert.index" + hubert_index_filepath = base_path / index_name + + if cli_args.whisper_index_path: + whisper_index_filepath = cli_args.whisper_index_path + else: + index_name = f"{cli_args.retrieval_index_prefix}whisper.index" + whisper_index_filepath = base_path / index_name + + return FaissIndexRetrieval( + hubert_index=load_retrieve_index( + filepath=hubert_index_filepath, + ratio=cli_args.retrieval_ratio, + n_nearest_vectors=cli_args.n_retrieval_vectors + ), + whisper_index=load_retrieve_index( + filepath=whisper_index_filepath, + ratio=cli_args.retrieval_ratio, + n_nearest_vectors=cli_args.n_retrieval_vectors + ), + ) + + +def load_svc_model(checkpoint_path, model): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + saved_state_dict = checkpoint_dict["model_g"] + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + print("%s is not in the checkpoint" % k) + new_state_dict[k] = v + model.load_state_dict(new_state_dict) + return model + + +def svc_infer(model, retrieval: IRetrieval, spk, pit, ppg, vec, hp, device): + len_pit = pit.size()[0] + len_vec = vec.size()[0] + len_ppg = ppg.size()[0] + len_min = min(len_pit, len_vec) + len_min = min(len_min, len_ppg) + pit = pit[:len_min] + vec = vec[:len_min, :] + ppg = ppg[:len_min, :] + + with torch.no_grad(): + spk = spk.unsqueeze(0).to(device) + source = pit.unsqueeze(0).to(device) + source = model.pitch2source(source) + pitwav = model.source2wav(source) + write("svc_out_pit.wav", hp.data.sampling_rate, pitwav) + + hop_size = hp.data.hop_length + all_frame = len_min + hop_frame = 10 + out_chunk = 2500 # 25 S + out_index = 0 + out_audio = [] + + while (out_index < all_frame): + + if (out_index == 0): # start frame + cut_s = 0 + cut_s_out = 0 + else: + cut_s = out_index - hop_frame + cut_s_out = hop_frame * hop_size + + if (out_index + out_chunk + hop_frame > 
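`load_svc_model` above silently keeps the randomly initialized weights for any key missing from the checkpoint. When debugging a half-loaded model it can help to surface the mismatches explicitly; `load_state_dict_report` below is an editor's sketch of that idea, not a replacement used by the project.

```python
import torch

def load_state_dict_report(model: torch.nn.Module, checkpoint_path: str) -> None:
    """Load 'model_g' weights and print key mismatches instead of hiding them."""
    saved = torch.load(checkpoint_path, map_location="cpu")["model_g"]
    current = model.state_dict()
    missing = [k for k in current if k not in saved]
    unexpected = [k for k in saved if k not in current]
    # keep the randomly initialized weights only for genuinely missing keys
    model.load_state_dict({k: saved.get(k, v) for k, v in current.items()})
    if missing:
        print("not found in checkpoint:", missing)
    if unexpected:
        print("checkpoint keys left unused:", unexpected)
```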
all_frame): # end frame + cut_e = all_frame + cut_e_out = -1 + else: + cut_e = out_index + out_chunk + hop_frame + cut_e_out = -1 * hop_frame * hop_size + + sub_ppg = retrieval.retriv_whisper(ppg[cut_s:cut_e, :]) + sub_vec = retrieval.retriv_hubert(vec[cut_s:cut_e, :]) + sub_ppg = sub_ppg.unsqueeze(0).to(device) + sub_vec = sub_vec.unsqueeze(0).to(device) + sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device) + sub_len = torch.LongTensor([cut_e - cut_s]).to(device) + sub_har = source[:, :, cut_s * + hop_size:cut_e * hop_size].to(device) + sub_out = model.inference( + sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har) + sub_out = sub_out[0, 0].data.cpu().detach().numpy() + + sub_out = sub_out[cut_s_out:cut_e_out] + out_audio.extend(sub_out) + out_index = out_index + out_chunk + + out_audio = np.asarray(out_audio) + return out_audio + + +def main(args): + if (args.ppg == None): + args.ppg = "svc_tmp.ppg.npy" + print( + f"Auto run : python whisper/inference.py -w {args.wave} -p {args.ppg}") + os.system(f"python whisper/inference.py -w {args.wave} -p {args.ppg}") + + if (args.vec == None): + args.vec = "svc_tmp.vec.npy" + print( + f"Auto run : python hubert/inference.py -w {args.wave} -v {args.vec}") + os.system(f"python hubert/inference.py -w {args.wave} -v {args.vec}") + + if (args.pit == None): + args.pit = "svc_tmp.pit.csv" + print( + f"Auto run : python pitch/inference.py -w {args.wave} -p {args.pit}") + os.system(f"python pitch/inference.py -w {args.wave} -p {args.pit}") + + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + hp = OmegaConf.load(args.config) + model = SynthesizerInfer( + hp.data.filter_length // 2 + 1, + hp.data.segment_size // hp.data.hop_length, + hp) + load_svc_model(args.model, model) + retrieval = create_retrival(args) + model.eval() + model.to(device) + + spk = np.load(args.spk) + spk = torch.FloatTensor(spk) + + ppg = np.load(args.ppg) + ppg = np.repeat(ppg, 2, 0) # 320 PPG -> 160 * 2 + ppg = torch.FloatTensor(ppg) + # ppg = torch.zeros_like(ppg) + + vec = np.load(args.vec) + vec = np.repeat(vec, 2, 0) # 320 PPG -> 160 * 2 + vec = torch.FloatTensor(vec) + # vec = torch.zeros_like(vec) + + pit = load_csv_pitch(args.pit) + print("pitch shift: ", args.shift) + if (args.shift == 0): + pass + else: + pit = np.array(pit) + source = pit[pit > 0] + source_ave = source.mean() + source_min = source.min() + source_max = source.max() + print(f"source pitch statics: mean={source_ave:0.1f}, \ + min={source_min:0.1f}, max={source_max:0.1f}") + shift = args.shift + shift = 2 ** (shift / 12) + pit = pit * shift + pit = torch.FloatTensor(pit) + + out_audio = svc_infer(model, retrieval, spk, pit, ppg, vec, hp, device) + write("svc_out.wav", hp.data.sampling_rate, out_audio) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, required=True, + help="yaml file for config.") + parser.add_argument('--model', type=str, required=True, + help="path of model for evaluation") + parser.add_argument('--wave', type=str, required=True, + help="Path of raw audio.") + parser.add_argument('--spk', type=str, required=True, + help="Path of speaker.") + parser.add_argument('--ppg', type=str, + help="Path of content vector.") + parser.add_argument('--vec', type=str, + help="Path of hubert vector.") + parser.add_argument('--pit', type=str, + help="Path of pitch csv file.") + parser.add_argument('--shift', type=int, 
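The `--shift` argument being defined here is turned into a frequency ratio with `2 ** (shift / 12)` in `main()` above, so +12 doubles F0 and -12 halves it. A tiny worked example (`shift_f0` is an illustrative helper, not project code):

```python
import numpy as np

def shift_f0(pitch_hz: np.ndarray, semitones: int) -> np.ndarray:
    """Scale voiced F0 values by 2**(semitones/12); zeros stay unvoiced."""
    return np.where(pitch_hz > 0, pitch_hz * 2.0 ** (semitones / 12.0), 0.0)

f0 = np.array([0.0, 220.0, 246.9, 0.0, 261.6])   # Hz, 0 marks unvoiced frames
print(shift_f0(f0, 12))          # one octave up: [0., 440., 493.8, 0., 523.2]
print(round(2 ** (7 / 12), 3))   # +7 semitones (a fifth) is a ratio of ~1.498
```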
default=0, + help="Pitch shift key.") + + parser.add_argument('--enable-retrieval', action="store_true", + help="Enable index feature retrieval") + parser.add_argument('--retrieval-index-prefix', default='', + help='retrieval index file prefix. Will load file %prefix%hubert.index/%prefix%whisper.index') + parser.add_argument('--retrieval-ratio', type=float, default=.5, + help="ratio of feature retrieval effect. Must be in range 0..1") + parser.add_argument('--n-retrieval-vectors', type=int, default=3, + help="get n nearest vectors from retrieval index. Works stably in range 1..3") + parser.add_argument('--hubert-index-path', required=False, + help='path to hubert index file. Default data_svc/indexes/speaker.../%prefix%hubert.index') + parser.add_argument('--whisper-index-path', required=False, + help='path to whisper index file. Default data_svc/indexes/speaker.../%prefix%whisper.index') + + parser.add_argument('--debug', action="store_true") + args = parser.parse_args() + + main(args) diff --git a/svc_inference_batch.py b/svc_inference_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..f86c19ad55216f8f882f274bff61c76005edf1ad --- /dev/null +++ b/svc_inference_batch.py @@ -0,0 +1,43 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import tqdm +import torch +import argparse + +from whisper.inference import load_model, pred_ppg + +# How to use +# python svc_inference_batch.py --config configs/base.yaml --model vits_pretrain/sovits5.0.pth --wave test_waves/ --spk configs/singers/singer0047.npy + +out_path = "./_svc_out" +os.makedirs(out_path, exist_ok=True) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, required=True, + help="yaml file for config.") + parser.add_argument('--model', type=str, required=True, + help="path of model for evaluation") + parser.add_argument('--wave', type=str, required=True, + help="Path of raw audio.") + parser.add_argument('--spk', type=str, required=True, + help="Path of speaker.") + parser.add_argument('--shift', type=int, default=0, + help="Pitch shift key.") + args = parser.parse_args() + wave_path = args.wave + assert os.path.isdir(wave_path), f"{wave_path} is not folder" + waves = [file for file in os.listdir(wave_path) if file.endswith(".wav")] + for file in waves: + print(file) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"), device=device) + for file in tqdm.tqdm(waves, desc="whisper"): + pred_ppg(whisper, f"{wave_path}/{file}", f"{out_path}/{file}.ppg.npy", device=device) + del whisper + + for file in tqdm.tqdm(waves, desc="svc"): + os.system( + f"python svc_inference.py --config {args.config} --model {args.model} --wave {wave_path}/{file} --ppg {out_path}/{file}.ppg.npy --spk {args.spk} --shift {args.shift}") + os.system(f"mv svc_out.wav {out_path}/{file}") + os.system(f"rm {out_path}/{file}.ppg.npy") diff --git a/svc_inference_post.py b/svc_inference_post.py new file mode 100644 index 0000000000000000000000000000000000000000..4d0bc24ffde5100dbe60889b844e1223d020f7b7 --- /dev/null +++ b/svc_inference_post.py @@ -0,0 +1,51 @@ +import sys, os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import torch +import librosa +import argparse +import numpy as np +from scipy.io.wavfile import write +from vad.utils import init_jit_model, get_speech_timestamps + + +def load_audio(file: str, sr: int = 16000): + x, sr = librosa.load(file, 
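svc_inference_batch.py above shells out with `os.system` and f-strings, which is fragile for file names containing spaces or quotes. A hedged alternative sketch using `subprocess.run` with argument lists is shown below; note it is slower than the shipped script because each call re-extracts PPG, HuBERT and pitch features instead of caching the whisper model in-process (`run_batch` is an illustrative name).

```python
import shutil
import subprocess
from pathlib import Path

def run_batch(config: str, model: str, wave_dir: str, spk: str,
              shift: int = 0, out_dir: str = "./_svc_out") -> None:
    out = Path(out_dir)
    out.mkdir(exist_ok=True)
    for wav in sorted(Path(wave_dir).glob("*.wav")):
        subprocess.run(
            ["python", "svc_inference.py", "--config", config, "--model", model,
             "--wave", str(wav), "--spk", spk, "--shift", str(shift)],
            check=True)                       # raise and stop on the first failure
        shutil.move("svc_out.wav", out / wav.name)

# run_batch("configs/base.yaml", "vits_pretrain/sovits5.0.pth",
#           "test_waves/", "configs/singers/singer0047.npy")
```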
sr=sr) + return x + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument('--ref', type=str, required=True, + help="Path of ref audio.") + parser.add_argument('--svc', type=str, required=True, + help="Path of svc audio.") + parser.add_argument('--out', type=str, required=True, + help="Path of out audio.") + + args = parser.parse_args() + print("svc in wave :", args.ref) + print("svc out wave :", args.svc) + print("svc post wave :", args.out) + + model = init_jit_model(os.path.join('vad/assets', 'silero_vad.jit')) + model.eval() + + ref_wave = load_audio(args.ref, sr=16000) + tmp_wave = torch.from_numpy(ref_wave).squeeze(0) + tag_wave = get_speech_timestamps( + tmp_wave, model, threshold=0.2, sampling_rate=16000) + + ref_wave[:] = 0 + for tag in tag_wave: + ref_wave[tag["start"]:tag["end"]] = 1 + + ref_wave = np.repeat(ref_wave, 2, -1) + svc_wave = load_audio(args.svc, sr=32000) + + min_len = min(len(ref_wave), len(svc_wave)) + ref_wave = ref_wave[:min_len] + svc_wave = svc_wave[:min_len] + svc_wave[ref_wave == 0] = 0 + + write(args.out, 32000, svc_wave) diff --git a/svc_inference_shift.py b/svc_inference_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..6aa74a90ebdbca5d4ed9b3185aca4b8494a35b1c --- /dev/null +++ b/svc_inference_shift.py @@ -0,0 +1,102 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import torch +import argparse +import numpy as np + +from omegaconf import OmegaConf +from scipy.io.wavfile import write +from pitch import load_csv_pitch +from vits.models import SynthesizerInfer +from svc_inference import load_svc_model, svc_infer + + +def main(args): + if (args.ppg == None): + args.ppg = "svc_tmp.ppg.npy" + print( + f"Auto run : python whisper/inference.py -w {args.wave} -p {args.ppg}") + os.system(f"python whisper/inference.py -w {args.wave} -p {args.ppg}") + + if (args.vec == None): + args.vec = "svc_tmp.vec.npy" + print( + f"Auto run : python hubert/inference.py -w {args.wave} -v {args.vec}") + os.system(f"python hubert/inference.py -w {args.wave} -v {args.vec}") + + if (args.pit == None): + args.pit = "svc_tmp.pit.csv" + print( + f"Auto run : python pitch/inference.py -w {args.wave} -p {args.pit}") + os.system(f"python pitch/inference.py -w {args.wave} -p {args.pit}") + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + hp = OmegaConf.load(args.config) + model = SynthesizerInfer( + hp.data.filter_length // 2 + 1, + hp.data.segment_size // hp.data.hop_length, + hp) + load_svc_model(args.model, model) + model.eval() + model.to(device) + + spk = np.load(args.spk) + spk = torch.FloatTensor(spk) + + ppg = np.load(args.ppg) + ppg = np.repeat(ppg, 2, 0) + ppg = torch.FloatTensor(ppg) + + vec = np.load(args.vec) + vec = np.repeat(vec, 2, 0) + vec = torch.FloatTensor(vec) + + pit = load_csv_pitch(args.pit) + + shift_l = args.shift_l + shift_r = args.shift_r + + print(f"pitch shift: [{shift_l}, {shift_r}]") + + for shift in range(shift_l, shift_r + 1): + print(shift) + tmp = np.array(pit) + tmp = tmp * (2 ** (shift / 12)) + tmp = torch.FloatTensor(tmp) + + out_audio = svc_infer(model, spk, tmp, ppg, vec, hp, device) + write(os.path.join("./_svc_out", f"svc_out_{shift}.wav"), + hp.data.sampling_rate, out_audio) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, required=True, + help="yaml file for config.") + parser.add_argument('--model', type=str, required=True, + help="path of model for evaluation") + 
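svc_inference_post.py above hard-codes `np.repeat(ref_wave, 2, -1)` because the VAD reference is loaded at 16 kHz while the converted audio is at 32 kHz. A small hedged generalization for other integer sample-rate ratios (`upsample_mask` is illustrative, not project code):

```python
import numpy as np

def upsample_mask(mask: np.ndarray, ref_sr: int, target_sr: int) -> np.ndarray:
    """Stretch a per-sample 0/1 speech mask from ref_sr to target_sr."""
    assert target_sr % ref_sr == 0, "this sketch only handles integer ratios"
    return np.repeat(mask, target_sr // ref_sr)

mask_16k = np.array([0, 1, 1, 0], dtype=np.float32)    # 4 samples @ 16 kHz
mask_32k = upsample_mask(mask_16k, 16000, 32000)       # 8 samples @ 32 kHz
print(mask_32k)                                        # [0. 0. 1. 1. 1. 1. 0. 0.]
```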
parser.add_argument('--wave', type=str, required=True, + help="Path of raw audio.") + parser.add_argument('--spk', type=str, required=True, + help="Path of speaker.") + parser.add_argument('--ppg', type=str, + help="Path of content vector.") + parser.add_argument('--vec', type=str, + help="Path of hubert vector.") + parser.add_argument('--pit', type=str, + help="Path of pitch csv file.") + parser.add_argument('--shift_l', type=int, default=0, + help="Pitch shift key for [shift_l, shift_r]") + parser.add_argument('--shift_r', type=int, default=0, + help="Pitch shift key for [shift_l, shift_r]") + args = parser.parse_args() + + assert args.shift_l >= -12 + assert args.shift_r >= -12 + assert args.shift_l <= 12 + assert args.shift_r <= 12 + assert args.shift_l <= args.shift_r + + os.makedirs("./_svc_out", exist_ok=True) + + main(args) diff --git a/svc_merge.py b/svc_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..d84f6c1ddc0461530a590e7b815bfa69bf6366e4 --- /dev/null +++ b/svc_merge.py @@ -0,0 +1,58 @@ +import os +import torch +import argparse +import collections + + +def load_model(checkpoint_path): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + saved_state_dict = checkpoint_dict["model_g"] + return saved_state_dict + + +def save_model(state_dict, checkpoint_path): + torch.save({'model_g': state_dict}, checkpoint_path) + + +def average_model(model_list): + model_keys = list(model_list[0].keys()) + model_average = collections.OrderedDict() + for key in model_keys: + key_sum = 0 + for i in range(len(model_list)): + key_sum = (key_sum + model_list[i][key]) + model_average[key] = torch.div(key_sum, float(len(model_list))) + return model_average +# ss_list = [] +# ss_list.append(s1) +# ss_list.append(s2) +# ss_merge = average_model(ss_list) + + +def merge_model(model1, model2, rate): + model_keys = model1.keys() + model_merge = collections.OrderedDict() + for key in model_keys: + key_merge = rate * model1[key] + (1 - rate) * model2[key] + model_merge[key] = key_merge + return model_merge + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-m1', '--model1', type=str, required=True) + parser.add_argument('-m2', '--model2', type=str, required=True) + parser.add_argument('-r1', '--rate', type=float, required=True) + args = parser.parse_args() + + print(args.model1) + print(args.model2) + print(args.rate) + + assert args.rate > 0 and args.rate < 1, f"{args.rate} should be in range (0, 1)" + s1 = load_model(args.model1) + s2 = load_model(args.model2) + + merge = merge_model(s1, s2, args.rate) + save_model(merge, "sovits5.0_merge.pth") diff --git a/svc_preprocessing.py b/svc_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae5c2be49f987aba20d6d17cb7c5a8fa22ed878 --- /dev/null +++ b/svc_preprocessing.py @@ -0,0 +1,34 @@ +import os +import torch +import argparse +import subprocess + +assert torch.cuda.is_available(), "\033[31m You need GPU to Train! 
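One thing worth flagging: the per-shift loop in svc_inference_shift.py calls `svc_infer(model, spk, tmp, ppg, vec, hp, device)`, while `svc_infer` as defined in svc_inference.py takes a retrieval object as its second argument. If that call errors for you, a no-op retrieval can be passed explicitly; the sketch below only illustrates the interface `svc_infer` relies on (the project's own `DummyRetrieval` in `feature_retrieval` may be implemented differently).

```python
import torch

class NoOpRetrieval:
    """Stand-in matching the interface svc_infer uses (illustrative only;
    the project ships DummyRetrieval in the feature_retrieval package)."""

    def retriv_whisper(self, ppg: torch.Tensor) -> torch.Tensor:
        return ppg                      # leave whisper PPG features untouched

    def retriv_hubert(self, vec: torch.Tensor) -> torch.Tensor:
        return vec                      # leave HuBERT features untouched

# svc_inference.py defines:
#   svc_infer(model, retrieval, spk, pit, ppg, vec, hp, device)
# so the shift loop would pass a retrieval object explicitly, e.g.
#   out_audio = svc_infer(model, NoOpRetrieval(), spk, tmp, ppg, vec, hp, device)
```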
\033[0m" +print("CPU Count is :", os.cpu_count()) + +parser = argparse.ArgumentParser() +parser.add_argument("-t", type=int, default=0, help="thread count") +args = parser.parse_args() + + +commands = [ + "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 -t 0", + "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 -t 0", + "python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch", + "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper", + "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert", + "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker -t 0", + "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer", + "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs -t 0", + "python prepare/preprocess_train.py", + "python prepare/preprocess_zzz.py", +] + + +for command in commands: + print(f"Command: {command}") + + process = subprocess.Popen(command, shell=True) + outcode = process.wait() + if (outcode): + break diff --git a/svc_train_retrieval.py b/svc_train_retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..4b8e293533ad5f29792e687db6fcc9e7ef9834ff --- /dev/null +++ b/svc_train_retrieval.py @@ -0,0 +1,114 @@ +import argparse +import logging +import multiprocessing +from functools import partial +from pathlib import Path + +import faiss + +from feature_retrieval import ( + train_index, + FaissIVFFlatTrainableFeatureIndexBuilder, + OnConditionFeatureTransform, + MinibatchKmeansFeatureTransform, + DummyFeatureTransform, +) + +logger = logging.getLogger(__name__) + + +def get_speaker_list(base_path: Path): + speakers_path = base_path / "waves-16k" + if not speakers_path.exists(): + raise FileNotFoundError(f"path {speakers_path} does not exists") + return [speaker_dir.name for speaker_dir in speakers_path.iterdir() if speaker_dir.is_dir()] + + +def create_indexes_path(base_path: Path) -> Path: + indexes_path = base_path / "indexes" + logger.info("create indexes folder %s", indexes_path) + indexes_path.mkdir(exist_ok=True) + return indexes_path + + +def create_index( + feature_name: str, + prefix: str, + speaker: str, + base_path: Path, + indexes_path: Path, + compress_features_after: int, + n_clusters: int, + n_parallel: int, + train_batch_size: int = 8192, +) -> None: + features_path = base_path / feature_name / speaker + if not features_path.exists(): + raise ValueError(f'features not found by path {features_path}') + index_path = indexes_path / speaker + index_path.mkdir(exist_ok=True) + index_filename = f"{prefix}{feature_name}.index" + index_filepath = index_path / index_filename + logger.debug('index will be save to %s', index_filepath) + + builder = FaissIVFFlatTrainableFeatureIndexBuilder(train_batch_size, distance=faiss.METRIC_L2) + transform = OnConditionFeatureTransform( + condition=lambda matrix: matrix.shape[0] > compress_features_after, + on_condition=MinibatchKmeansFeatureTransform(n_clusters, n_parallel), + otherwise=DummyFeatureTransform() + ) + train_index(features_path, index_filepath, builder, transform) + + +def main() -> None: + arg_parser = argparse.ArgumentParser("crate faiss indexes for feature retrieval") + arg_parser.add_argument("--debug", action="store_true") + arg_parser.add_argument("--prefix", default='', help="add prefix to index filename") + arg_parser.add_argument('--speakers', nargs="+", + help="speaker names to create an index. 
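The preprocessing pipeline above writes everything under `data_svc/`. As a quick sanity check after it finishes, the hedged sketch below just verifies that each expected folder exists and is non-empty (`check_preprocessing` is an illustrative helper; the folder names are taken from the commands above):

```python
from pathlib import Path

EXPECTED = ["waves-16k", "waves-32k", "pitch", "whisper",
            "hubert", "speaker", "singer", "specs"]

def check_preprocessing(base: str = "data_svc") -> bool:
    ok = True
    for name in EXPECTED:
        folder = Path(base) / name
        count = sum(1 for _ in folder.rglob("*")) if folder.is_dir() else 0
        print(f"{folder}: {count} entries")
        ok = ok and count > 0
    return ok

if __name__ == "__main__":
    print("all outputs present:", check_preprocessing())
```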
By default all speakers are from data_svc") + arg_parser.add_argument("--compress-features-after", type=int, default=200_000, + help="If the number of features is greater than the value compress " + "feature vectors using MiniBatchKMeans.") + arg_parser.add_argument("--n-clusters", type=int, default=10_000, + help="Number of centroids to which features will be compressed") + + arg_parser.add_argument("--n-parallel", type=int, default=multiprocessing.cpu_count()-1, + help="Nuber of parallel job of MinibatchKmeans. Default is cpus-1") + args = arg_parser.parse_args() + + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + base_path = Path(".").absolute() / "data_svc" + if args.speakers: + speakers = args.speakers + else: + speakers = get_speaker_list(base_path) + + logger.info("got %s speakers: %s", len(speakers), speakers) + indexes_path = create_indexes_path(base_path) + + create_index_func = partial( + create_index, + prefix=args.prefix, + base_path=base_path, + indexes_path=indexes_path, + compress_features_after=args.compress_features_after, + n_clusters=args.n_clusters, + n_parallel=args.n_parallel, + ) + + for speaker in speakers: + logger.info("create hubert index for speaker %s", speaker) + create_index_func(feature_name="hubert", speaker=speaker) + + logger.info("create whisper index for speaker %s", speaker) + create_index_func(feature_name="whisper", speaker=speaker) + + logger.info("done!") + + +if __name__ == '__main__': + main() diff --git a/svc_trainer.py b/svc_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a42186d379936b67d5dcd7c3049cc5185b24e454 --- /dev/null +++ b/svc_trainer.py @@ -0,0 +1,43 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import argparse +import torch +import torch.multiprocessing as mp +from omegaconf import OmegaConf + +from vits_extend.train import train + +torch.backends.cudnn.benchmark = True + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, required=True, + help="yaml file for configuration") + parser.add_argument('-p', '--checkpoint_path', type=str, default=None, + help="path of checkpoint pt file to resume training") + parser.add_argument('-n', '--name', type=str, required=True, + help="name of the model for logging, saving checkpoint") + args = parser.parse_args() + + hp = OmegaConf.load(args.config) + with open(args.config, 'r') as f: + hp_str = ''.join(f.readlines()) + + assert hp.data.hop_length == 320, \ + 'hp.data.hop_length must be equal to 320, got %d' % hp.data.hop_length + + args.num_gpus = 0 + torch.manual_seed(hp.train.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(hp.train.seed) + args.num_gpus = torch.cuda.device_count() + print('Batch size per GPU :', hp.train.batch_size) + + if args.num_gpus > 1: + mp.spawn(train, nprocs=args.num_gpus, + args=(args, args.checkpoint_path, hp, hp_str,)) + else: + train(0, args, args.checkpoint_path, hp, hp_str) + else: + print('No GPU find!') diff --git a/test.wav b/test.wav new file mode 100644 index 0000000000000000000000000000000000000000..290a4c3dde752c39de9575dbbd7b194a0c09f274 --- /dev/null +++ b/test.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b9d96b70f6ec6a72410bff26b71004010c203c8010fd6fb0c60c4acd53fd2ec +size 4849732 diff --git a/vad/LICENSE b/vad/LICENSE new file mode 100644 index 
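svc_train_retrieval.py builds trainable IVF-Flat indexes per speaker and per feature; at its core, retrieval is nearest-neighbour search over stored feature vectors followed by blending with the query. The minimal illustration below uses an exact `faiss.IndexFlatL2` and made-up dimensions, so it shows the idea rather than the project's index builder or its exact blending.

```python
import faiss
import numpy as np

dim = 256                                                 # illustrative feature size
stored = np.random.rand(10_000, dim).astype("float32")    # per-speaker training features

index = faiss.IndexFlatL2(dim)                            # exact L2 search, no training step
index.add(stored)

query = np.random.rand(4, dim).astype("float32")          # 4 frames to convert
_, ids = index.search(query, 3)                           # 3 nearest stored vectors each

ratio = 0.5                                               # cf. --retrieval-ratio
blended = (1 - ratio) * query + ratio * stored[ids].mean(axis=1)
print(blended.shape)                                      # (4, 256)
```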
0000000000000000000000000000000000000000..0bf5e90cac691b999d4a35044f97167d7bbbf0b9 --- /dev/null +++ b/vad/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020-present Silero Team + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/vad/assets/silero_vad.jit b/vad/assets/silero_vad.jit new file mode 100644 index 0000000000000000000000000000000000000000..2a0958e90969784c90489cea36ab538a7b44384b --- /dev/null +++ b/vad/assets/silero_vad.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99033608562094bbb44e2363198cd47647a668f846c4c9a9edde68b4800b5fd4 +size 1439299 diff --git a/vad/utils.py b/vad/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..faf56bddcd5012e5a9877a2ba5bf3fd6db2d2df3 --- /dev/null +++ b/vad/utils.py @@ -0,0 +1,533 @@ +import torch +import torchaudio +from typing import Callable, List +import torch.nn.functional as F +import warnings + +languages = ['ru', 'en', 'de', 'es'] + + +class OnnxWrapper(): + + def __init__(self, path, force_onnx_cpu=False): + import numpy as np + global np + import onnxruntime + + opts = onnxruntime.SessionOptions() + opts.inter_op_num_threads = 1 + opts.intra_op_num_threads = 1 + + if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers(): + self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts) + else: + self.session = onnxruntime.InferenceSession(path, sess_options=opts) + + self.reset_states() + self.sample_rates = [8000, 16000] + + def _validate_input(self, x, sr: int): + if x.dim() == 1: + x = x.unsqueeze(0) + if x.dim() > 2: + raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}") + + if sr != 16000 and (sr % 16000 == 0): + step = sr // 16000 + x = x[:,::step] + sr = 16000 + + if sr not in self.sample_rates: + raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)") + + if sr / x.shape[1] > 31.25: + raise ValueError("Input audio chunk is too short") + + return x, sr + + def reset_states(self, batch_size=1): + self._h = np.zeros((2, batch_size, 64)).astype('float32') + self._c = np.zeros((2, batch_size, 64)).astype('float32') + self._last_sr = 0 + self._last_batch_size = 0 + + def __call__(self, x, sr: int): + + x, sr = self._validate_input(x, sr) + batch_size = x.shape[0] + + if not self._last_batch_size: + self.reset_states(batch_size) + if (self._last_sr) and (self._last_sr != sr): + self.reset_states(batch_size) + if (self._last_batch_size) and 
(self._last_batch_size != batch_size): + self.reset_states(batch_size) + + if sr in [8000, 16000]: + ort_inputs = {'input': x.numpy(), 'h': self._h, 'c': self._c, 'sr': np.array(sr, dtype='int64')} + ort_outs = self.session.run(None, ort_inputs) + out, self._h, self._c = ort_outs + else: + raise ValueError() + + self._last_sr = sr + self._last_batch_size = batch_size + + out = torch.tensor(out) + return out + + def audio_forward(self, x, sr: int, num_samples: int = 512): + outs = [] + x, sr = self._validate_input(x, sr) + + if x.shape[1] % num_samples: + pad_num = num_samples - (x.shape[1] % num_samples) + x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0) + + self.reset_states(x.shape[0]) + for i in range(0, x.shape[1], num_samples): + wavs_batch = x[:, i:i+num_samples] + out_chunk = self.__call__(wavs_batch, sr) + outs.append(out_chunk) + + stacked = torch.cat(outs, dim=1) + return stacked.cpu() + + +class Validator(): + def __init__(self, url, force_onnx_cpu): + self.onnx = True if url.endswith('.onnx') else False + torch.hub.download_url_to_file(url, 'inf.model') + if self.onnx: + import onnxruntime + if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers(): + self.model = onnxruntime.InferenceSession('inf.model', providers=['CPUExecutionProvider']) + else: + self.model = onnxruntime.InferenceSession('inf.model') + else: + self.model = init_jit_model(model_path='inf.model') + + def __call__(self, inputs: torch.Tensor): + with torch.no_grad(): + if self.onnx: + ort_inputs = {'input': inputs.cpu().numpy()} + outs = self.model.run(None, ort_inputs) + outs = [torch.Tensor(x) for x in outs] + else: + outs = self.model(inputs) + + return outs + + +def read_audio(path: str, + sampling_rate: int = 16000): + + wav, sr = torchaudio.load(path) + + if wav.size(0) > 1: + wav = wav.mean(dim=0, keepdim=True) + + if sr != sampling_rate: + transform = torchaudio.transforms.Resample(orig_freq=sr, + new_freq=sampling_rate) + wav = transform(wav) + sr = sampling_rate + + assert sr == sampling_rate + return wav.squeeze(0) + + +def save_audio(path: str, + tensor: torch.Tensor, + sampling_rate: int = 16000): + torchaudio.save(path, tensor.unsqueeze(0), sampling_rate, bits_per_sample=16) + + +def init_jit_model(model_path: str, + device=torch.device('cpu')): + torch.set_grad_enabled(False) + model = torch.jit.load(model_path, map_location=device) + model.eval() + return model + + +def make_visualization(probs, step): + import pandas as pd + pd.DataFrame({'probs': probs}, + index=[x * step for x in range(len(probs))]).plot(figsize=(16, 8), + kind='area', ylim=[0, 1.05], xlim=[0, len(probs) * step], + xlabel='seconds', + ylabel='speech probability', + colormap='tab20') + + +def get_speech_timestamps(audio: torch.Tensor, + model, + threshold: float = 0.5, + sampling_rate: int = 16000, + min_speech_duration_ms: int = 250, + max_speech_duration_s: float = float('inf'), + min_silence_duration_ms: int = 100, + window_size_samples: int = 512, + speech_pad_ms: int = 30, + return_seconds: bool = False, + visualize_probs: bool = False, + progress_tracking_callback: Callable[[float], None] = None): + + """ + This method is used for splitting long audios into speech chunks using silero VAD + + Parameters + ---------- + audio: torch.Tensor, one dimensional + One dimensional float torch.Tensor, other types are casted to torch if possible + + model: preloaded .jit silero VAD model + + threshold: float (default - 0.5) + Speech threshold. 
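The helpers defined above (`init_jit_model`, `read_audio`, `get_speech_timestamps`) are enough to reproduce what svc_inference_post.py does with the bundled silero model. A short hedged usage sketch, using the `silero_vad.jit` and `test.wav` files added in this diff (`collect_chunks` is defined further down this same file):

```python
from vad.utils import (collect_chunks, get_speech_timestamps,
                       init_jit_model, read_audio, save_audio)

model = init_jit_model("vad/assets/silero_vad.jit")      # JIT model added in this diff
wav = read_audio("test.wav", sampling_rate=16000)        # mono, resampled to 16 kHz

# list of {'start': ..., 'end': ...} dicts, in samples at 16 kHz
speech = get_speech_timestamps(wav, model, threshold=0.2, sampling_rate=16000)
print(f"{len(speech)} speech segments found")

if speech:
    # keep only the detected speech, which is what collect_chunks is for
    save_audio("test_speech_only.wav", collect_chunks(speech, wav), sampling_rate=16000)
```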
Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. + It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. + + sampling_rate: int (default - 16000) + Currently silero VAD models support 8000 and 16000 sample rates + + min_speech_duration_ms: int (default - 250 milliseconds) + Final speech chunks shorter min_speech_duration_ms are thrown out + + max_speech_duration_s: int (default - inf) + Maximum duration of speech chunks in seconds + Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent agressive cutting. + Otherwise, they will be split aggressively just before max_speech_duration_s. + + min_silence_duration_ms: int (default - 100 milliseconds) + In the end of each speech chunk wait for min_silence_duration_ms before separating it + + window_size_samples: int (default - 1536 samples) + Audio chunks of window_size_samples size are fed to the silero VAD model. + WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples for 8000 sample rate. + Values other than these may affect model perfomance!! + + speech_pad_ms: int (default - 30 milliseconds) + Final speech chunks are padded by speech_pad_ms each side + + return_seconds: bool (default - False) + whether return timestamps in seconds (default - samples) + + visualize_probs: bool (default - False) + whether draw prob hist or not + + progress_tracking_callback: Callable[[float], None] (default - None) + callback function taking progress in percents as an argument + + Returns + ---------- + speeches: list of dicts + list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds) + """ + + if not torch.is_tensor(audio): + try: + audio = torch.Tensor(audio) + except: + raise TypeError("Audio cannot be casted to tensor. Cast it manually") + + if len(audio.shape) > 1: + for i in range(len(audio.shape)): # trying to squeeze empty dimensions + audio = audio.squeeze(0) + if len(audio.shape) > 1: + raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?") + + if sampling_rate > 16000 and (sampling_rate % 16000 == 0): + step = sampling_rate // 16000 + sampling_rate = 16000 + audio = audio[::step] + warnings.warn('Sampling rate is a multiply of 16000, casting to 16000 manually!') + else: + step = 1 + + if sampling_rate == 8000 and window_size_samples > 768: + warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 768 for 8000 sample rate!') + if window_size_samples not in [256, 512, 768, 1024, 1536]: + warnings.warn('Unusual window_size_samples! 
Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate') + + model.reset_states() + min_speech_samples = sampling_rate * min_speech_duration_ms / 1000 + speech_pad_samples = sampling_rate * speech_pad_ms / 1000 + max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples + min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 + min_silence_samples_at_max_speech = sampling_rate * 98 / 1000 + + audio_length_samples = len(audio) + + speech_probs = [] + for current_start_sample in range(0, audio_length_samples, window_size_samples): + chunk = audio[current_start_sample: current_start_sample + window_size_samples] + if len(chunk) < window_size_samples: + chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk)))) + speech_prob = model(chunk, sampling_rate).item() + speech_probs.append(speech_prob) + # caculate progress and seng it to callback function + progress = current_start_sample + window_size_samples + if progress > audio_length_samples: + progress = audio_length_samples + progress_percent = (progress / audio_length_samples) * 100 + if progress_tracking_callback: + progress_tracking_callback(progress_percent) + + triggered = False + speeches = [] + current_speech = {} + neg_threshold = threshold - 0.15 + temp_end = 0 # to save potential segment end (and tolerate some silence) + prev_end = next_start = 0 # to save potential segment limits in case of maximum segment size reached + + for i, speech_prob in enumerate(speech_probs): + if (speech_prob >= threshold) and temp_end: + temp_end = 0 + if next_start < prev_end: + next_start = window_size_samples * i + + if (speech_prob >= threshold) and not triggered: + triggered = True + current_speech['start'] = window_size_samples * i + continue + + if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples: + if prev_end: + current_speech['end'] = prev_end + speeches.append(current_speech) + current_speech = {} + if next_start < prev_end: # previously reached silence (< neg_thres) and is still not speech (< thres) + triggered = False + else: + current_speech['start'] = next_start + prev_end = next_start = temp_end = 0 + else: + current_speech['end'] = window_size_samples * i + speeches.append(current_speech) + current_speech = {} + prev_end = next_start = temp_end = 0 + triggered = False + continue + + if (speech_prob < neg_threshold) and triggered: + if not temp_end: + temp_end = window_size_samples * i + if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech : # condition to avoid cutting in very short silence + prev_end = temp_end + if (window_size_samples * i) - temp_end < min_silence_samples: + continue + else: + current_speech['end'] = temp_end + if (current_speech['end'] - current_speech['start']) > min_speech_samples: + speeches.append(current_speech) + current_speech = {} + prev_end = next_start = temp_end = 0 + triggered = False + continue + + if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples: + current_speech['end'] = audio_length_samples + speeches.append(current_speech) + + for i, speech in enumerate(speeches): + if i == 0: + speech['start'] = int(max(0, speech['start'] - speech_pad_samples)) + if i != len(speeches) - 1: + silence_duration = speeches[i+1]['start'] - speech['end'] + if silence_duration < 2 * speech_pad_samples: + speech['end'] += int(silence_duration // 2) + 
speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - silence_duration // 2)) + else: + speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples)) + speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - speech_pad_samples)) + else: + speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples)) + + if return_seconds: + for speech_dict in speeches: + speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1) + speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1) + elif step > 1: + for speech_dict in speeches: + speech_dict['start'] *= step + speech_dict['end'] *= step + + if visualize_probs: + make_visualization(speech_probs, window_size_samples / sampling_rate) + + return speeches + + +def get_number_ts(wav: torch.Tensor, + model, + model_stride=8, + hop_length=160, + sample_rate=16000): + wav = torch.unsqueeze(wav, dim=0) + perframe_logits = model(wav)[0] + perframe_preds = torch.argmax(torch.softmax(perframe_logits, dim=1), dim=1).squeeze() # (1, num_frames_strided) + extended_preds = [] + for i in perframe_preds: + extended_preds.extend([i.item()] * model_stride) + # len(extended_preds) is *num_frames_real*; for each frame of audio we know if it has a number in it. + triggered = False + timings = [] + cur_timing = {} + for i, pred in enumerate(extended_preds): + if pred == 1: + if not triggered: + cur_timing['start'] = int((i * hop_length) / (sample_rate / 1000)) + triggered = True + elif pred == 0: + if triggered: + cur_timing['end'] = int((i * hop_length) / (sample_rate / 1000)) + timings.append(cur_timing) + cur_timing = {} + triggered = False + if cur_timing: + cur_timing['end'] = int(len(wav) / (sample_rate / 1000)) + timings.append(cur_timing) + return timings + + +def get_language(wav: torch.Tensor, + model): + wav = torch.unsqueeze(wav, dim=0) + lang_logits = model(wav)[2] + lang_pred = torch.argmax(torch.softmax(lang_logits, dim=1), dim=1).item() # from 0 to len(languages) - 1 + assert lang_pred < len(languages) + return languages[lang_pred] + + +def get_language_and_group(wav: torch.Tensor, + model, + lang_dict: dict, + lang_group_dict: dict, + top_n=1): + wav = torch.unsqueeze(wav, dim=0) + lang_logits, lang_group_logits = model(wav) + + softm = torch.softmax(lang_logits, dim=1).squeeze() + softm_group = torch.softmax(lang_group_logits, dim=1).squeeze() + + srtd = torch.argsort(softm, descending=True) + srtd_group = torch.argsort(softm_group, descending=True) + + outs = [] + outs_group = [] + for i in range(top_n): + prob = round(softm[srtd[i]].item(), 2) + prob_group = round(softm_group[srtd_group[i]].item(), 2) + outs.append((lang_dict[str(srtd[i].item())], prob)) + outs_group.append((lang_group_dict[str(srtd_group[i].item())], prob_group)) + + return outs, outs_group + + +class VADIterator: + def __init__(self, + model, + threshold: float = 0.5, + sampling_rate: int = 16000, + min_silence_duration_ms: int = 100, + speech_pad_ms: int = 30 + ): + + """ + Class for stream imitation + + Parameters + ---------- + model: preloaded .jit silero VAD model + + threshold: float (default - 0.5) + Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. + It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. 
+ + sampling_rate: int (default - 16000) + Currently silero VAD models support 8000 and 16000 sample rates + + min_silence_duration_ms: int (default - 100 milliseconds) + In the end of each speech chunk wait for min_silence_duration_ms before separating it + + speech_pad_ms: int (default - 30 milliseconds) + Final speech chunks are padded by speech_pad_ms each side + """ + + self.model = model + self.threshold = threshold + self.sampling_rate = sampling_rate + + if sampling_rate not in [8000, 16000]: + raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]') + + self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 + self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000 + self.reset_states() + + def reset_states(self): + + self.model.reset_states() + self.triggered = False + self.temp_end = 0 + self.current_sample = 0 + + def __call__(self, x, return_seconds=False): + """ + x: torch.Tensor + audio chunk (see examples in repo) + + return_seconds: bool (default - False) + whether return timestamps in seconds (default - samples) + """ + + if not torch.is_tensor(x): + try: + x = torch.Tensor(x) + except: + raise TypeError("Audio cannot be casted to tensor. Cast it manually") + + window_size_samples = len(x[0]) if x.dim() == 2 else len(x) + self.current_sample += window_size_samples + + speech_prob = self.model(x, self.sampling_rate).item() + + if (speech_prob >= self.threshold) and self.temp_end: + self.temp_end = 0 + + if (speech_prob >= self.threshold) and not self.triggered: + self.triggered = True + speech_start = self.current_sample - self.speech_pad_samples + return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)} + + if (speech_prob < self.threshold - 0.15) and self.triggered: + if not self.temp_end: + self.temp_end = self.current_sample + if self.current_sample - self.temp_end < self.min_silence_samples: + return None + else: + speech_end = self.temp_end + self.speech_pad_samples + self.temp_end = 0 + self.triggered = False + return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)} + + return None + + +def collect_chunks(tss: List[dict], + wav: torch.Tensor): + chunks = [] + for i in tss: + chunks.append(wav[i['start']: i['end']]) + return torch.cat(chunks) + + +def drop_chunks(tss: List[dict], + wav: torch.Tensor): + chunks = [] + cur_start = 0 + for i in tss: + chunks.append((wav[cur_start: i['start']])) + cur_start = i['end'] + return torch.cat(chunks) diff --git a/vits/LICENSE b/vits/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6a6c3181fcdc4e20901a6ecbee5a406b78a5b560 --- /dev/null +++ b/vits/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Jaehyeon Kim + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
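`VADIterator` above is the streaming counterpart of `get_speech_timestamps`: it is fed fixed-size chunks and emits `{'start': ...}` / `{'end': ...}` events. A hedged sketch that simulates a stream by slicing a pre-loaded tensor into 512-sample windows:

```python
from vad.utils import VADIterator, init_jit_model, read_audio

model = init_jit_model("vad/assets/silero_vad.jit")
vad = VADIterator(model, threshold=0.5, sampling_rate=16000)

wav = read_audio("test.wav", sampling_rate=16000)
window = 512                                  # samples per simulated stream chunk

events = []
for start in range(0, len(wav) - window + 1, window):
    event = vad(wav[start:start + window], return_seconds=True)
    if event is not None:                     # either {'start': s} or {'end': s}
        events.append(event)
vad.reset_states()
print(events[:4])
```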
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vits/__init__.py b/vits/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vits/attentions.py b/vits/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..26624519b01497cc402dea5f860cf5022d1e7c89 --- /dev/null +++ b/vits/attentions.py @@ -0,0 +1,416 @@ +import copy +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from vits import commons +from vits.modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=4, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + 
MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." 
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. 
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/vits/commons.py b/vits/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..045a538d5a3ef8033eca70639a894346b11d5f61 --- /dev/null +++ b/vits/commons.py @@ -0,0 +1,187 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def slice_pitch_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, idx_str:idx_end] + return ret + + +def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size) + return ret, ret_pitch, ids_str + + +def rand_spec_segments(x, x_lengths=None, 
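As a quick sanity check of the relative-position attention `Encoder` above, a hedged sketch with illustrative sizes (192/768/2/6 are placeholders, not necessarily the project's configured values):

```python
import torch

from vits import commons
from vits.attentions import Encoder

enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=6, kernel_size=3, p_dropout=0.1)

x = torch.randn(2, 192, 100)                      # [batch, hidden_channels, frames]
x_lengths = torch.tensor([100, 80])
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)

y = enc(x * x_mask, x_mask)                       # padded frames stay zeroed
print(y.shape)                                    # torch.Size([2, 192, 100])
```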
segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 
0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/vits/data_utils.py b/vits/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bb9c6635f7287ffa7307893b210680a65754c898 --- /dev/null +++ b/vits/data_utils.py @@ -0,0 +1,325 @@ +import os +import numpy as np +import random +import torch +import torch.utils.data + + +from vits.utils import load_wav_to_torch + + +def load_filepaths(filename, split="|"): + with open(filename, encoding='utf-8') as f: + filepaths = [line.strip().split(split) for line in f] + return filepaths + + +class TextAudioSpeakerSet(torch.utils.data.Dataset): + def __init__(self, filename, hparams): + self.items = load_filepaths(filename) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.segment_size = hparams.segment_size + self.hop_length = hparams.hop_length + self._filter() + print(f'----------{len(self.items)}----------') + + def _filter(self): + lengths = [] + items_new = [] + items_min = int(self.segment_size / self.hop_length * 4) # 1 S + items_max = int(self.segment_size / self.hop_length * 16) # 4 S + for wavpath, spec, pitch, vec, ppg, spk in self.items: + if not os.path.isfile(wavpath): + continue + if not os.path.isfile(spec): + continue + if not os.path.isfile(pitch): + continue + if not os.path.isfile(vec): + continue + if not os.path.isfile(ppg): + continue + if not os.path.isfile(spk): + continue + temp = np.load(pitch) + usel = int(temp.shape[0] - 1) # useful length + if (usel < items_min): + continue + if (usel >= items_max): + usel = items_max + items_new.append([wavpath, spec, pitch, vec, ppg, spk, usel]) + lengths.append(usel) + self.items = items_new + self.lengths = lengths + + def read_wav(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + assert sampling_rate == self.sampling_rate, f"error: this sample rate of {filename} is {sampling_rate}" + audio_norm = audio / self.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + return audio_norm + + def __getitem__(self, index): + return self.my_getitem(index) + + def __len__(self): + return len(self.items) + + def my_getitem(self, idx): + item = self.items[idx] + # print(item) + wav = item[0] + spe = item[1] + pit = item[2] + vec = item[3] + ppg 
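The slicing helpers above are what the training step uses to cut short, frame-aligned windows of latent features and F0 before running the decoder; a small sketch with illustrative shapes:

```python
import torch

from vits import commons

z = torch.randn(4, 192, 400)             # [batch, channels, frames] latent
pit = torch.rand(4, 400) * 300 + 100     # frame-aligned F0 track (Hz)
lengths = torch.tensor([400, 360, 320, 280])

z_slice, pit_slice, ids = commons.rand_slice_segments_with_pitch(
    z, pit, lengths, segment_size=64)
print(z_slice.shape, pit_slice.shape)    # torch.Size([4, 192, 64]) torch.Size([4, 64])
```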
= item[4] + spk = item[5] + use = item[6] + + wav = self.read_wav(wav) + spe = torch.load(spe) + + pit = np.load(pit) + vec = np.load(vec) + vec = np.repeat(vec, 2, 0) # 320 PPG -> 160 * 2 + ppg = np.load(ppg) + ppg = np.repeat(ppg, 2, 0) # 320 PPG -> 160 * 2 + spk = np.load(spk) + + pit = torch.FloatTensor(pit) + vec = torch.FloatTensor(vec) + ppg = torch.FloatTensor(ppg) + spk = torch.FloatTensor(spk) + + len_pit = pit.size()[0] + len_vec = vec.size()[0] - 2 # for safe + len_ppg = ppg.size()[0] - 2 # for safe + len_min = min(len_pit, len_vec) + len_min = min(len_min, len_ppg) + len_wav = len_min * self.hop_length + + pit = pit[:len_min] + vec = vec[:len_min, :] + ppg = ppg[:len_min, :] + spe = spe[:, :len_min] + wav = wav[:, :len_wav] + if len_min > use: + max_frame_start = ppg.size(0) - use - 1 + frame_start = random.randint(0, max_frame_start) + frame_end = frame_start + use + + pit = pit[frame_start:frame_end] + vec = vec[frame_start:frame_end, :] + ppg = ppg[frame_start:frame_end, :] + spe = spe[:, frame_start:frame_end] + + wav_start = frame_start * self.hop_length + wav_end = frame_end * self.hop_length + wav = wav[:, wav_start:wav_end] + # print(spe.shape) + # print(wav.shape) + # print(ppg.shape) + # print(pit.shape) + # print(spk.shape) + return spe, wav, ppg, vec, pit, spk + + +class TextAudioSpeakerCollate: + """Zero-pads model inputs and targets""" + + def __call__(self, batch): + # Right zero-pad all one-hot text sequences to max input length + # mel: [freq, length] + # wav: [1, length] + # ppg: [len, 1024] + # pit: [len] + # spk: [256] + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spe_len = max([x[0].size(1) for x in batch]) + max_wav_len = max([x[1].size(1) for x in batch]) + spe_lengths = torch.LongTensor(len(batch)) + wav_lengths = torch.LongTensor(len(batch)) + spe_padded = torch.FloatTensor( + len(batch), batch[0][0].size(0), max_spe_len) + wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) + spe_padded.zero_() + wav_padded.zero_() + + max_ppg_len = max([x[2].size(0) for x in batch]) + ppg_lengths = torch.FloatTensor(len(batch)) + ppg_padded = torch.FloatTensor( + len(batch), max_ppg_len, batch[0][2].size(1)) + vec_padded = torch.FloatTensor( + len(batch), max_ppg_len, batch[0][3].size(1)) + pit_padded = torch.FloatTensor(len(batch), max_ppg_len) + ppg_padded.zero_() + vec_padded.zero_() + pit_padded.zero_() + spk = torch.FloatTensor(len(batch), batch[0][5].size(0)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spe = row[0] + spe_padded[i, :, : spe.size(1)] = spe + spe_lengths[i] = spe.size(1) + + wav = row[1] + wav_padded[i, :, : wav.size(1)] = wav + wav_lengths[i] = wav.size(1) + + ppg = row[2] + ppg_padded[i, : ppg.size(0), :] = ppg + ppg_lengths[i] = ppg.size(0) + + vec = row[3] + vec_padded[i, : vec.size(0), :] = vec + + pit = row[4] + pit_padded[i, : pit.size(0)] = pit + + spk[i] = row[5] + # print(ppg_padded.shape) + # print(ppg_lengths.shape) + # print(pit_padded.shape) + # print(spk.shape) + # print(spe_padded.shape) + # print(spe_lengths.shape) + # print(wav_padded.shape) + # print(wav_lengths.shape) + return ( + ppg_padded, + ppg_lengths, + vec_padded, + pit_padded, + spk, + spe_padded, + spe_lengths, + wav_padded, + wav_lengths, + ) + + +class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): + """ + Maintain similar input lengths in a batch. + Length groups are specified by boundaries. 
+ Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. + It removes samples which are not included in the boundaries. + Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. + """ + + def __init__( + self, + dataset, + batch_size, + boundaries, + num_replicas=None, + rank=None, + shuffle=True, + ): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + self.lengths = dataset.lengths + self.batch_size = batch_size + self.boundaries = boundaries + + self.buckets, self.num_samples_per_bucket = self._create_buckets() + self.total_size = sum(self.num_samples_per_bucket) + self.num_samples = self.total_size // self.num_replicas + + def _create_buckets(self): + buckets = [[] for _ in range(len(self.boundaries) - 1)] + for i in range(len(self.lengths)): + length = self.lengths[i] + idx_bucket = self._bisect(length) + if idx_bucket != -1: + buckets[idx_bucket].append(i) + + for i in range(len(buckets) - 1, 0, -1): + if len(buckets[i]) == 0: + buckets.pop(i) + self.boundaries.pop(i + 1) + + num_samples_per_bucket = [] + for i in range(len(buckets)): + len_bucket = len(buckets[i]) + total_batch_size = self.num_replicas * self.batch_size + rem = ( + total_batch_size - (len_bucket % total_batch_size) + ) % total_batch_size + num_samples_per_bucket.append(len_bucket + rem) + return buckets, num_samples_per_bucket + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + if self.shuffle: + for bucket in self.buckets: + indices.append(torch.randperm( + len(bucket), generator=g).tolist()) + else: + for bucket in self.buckets: + indices.append(list(range(len(bucket)))) + + batches = [] + for i in range(len(self.buckets)): + bucket = self.buckets[i] + len_bucket = len(bucket) + if (len_bucket == 0): + continue + ids_bucket = indices[i] + num_samples_bucket = self.num_samples_per_bucket[i] + + # add extra samples to make it evenly divisible + rem = num_samples_bucket - len_bucket + ids_bucket = ( + ids_bucket + + ids_bucket * (rem // len_bucket) + + ids_bucket[: (rem % len_bucket)] + ) + + # subsample + ids_bucket = ids_bucket[self.rank:: self.num_replicas] + + # batching + for j in range(len(ids_bucket) // self.batch_size): + batch = [ + bucket[idx] + for idx in ids_bucket[ + j * self.batch_size: (j + 1) * self.batch_size + ] + ] + batches.append(batch) + + if self.shuffle: + batch_ids = torch.randperm(len(batches), generator=g).tolist() + batches = [batches[i] for i in batch_ids] + self.batches = batches + + assert len(self.batches) * self.batch_size == self.num_samples + return iter(self.batches) + + def _bisect(self, x, lo=0, hi=None): + if hi is None: + hi = len(self.boundaries) - 1 + + if hi > lo: + mid = (hi + lo) // 2 + if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: + return mid + elif x <= self.boundaries[mid]: + return self._bisect(x, lo, mid) + else: + return self._bisect(x, mid + 1, hi) + else: + return -1 + + def __len__(self): + return self.num_samples // self.batch_size diff --git a/vits/losses.py b/vits/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..9244de65482650fedea4de6e3cc26d4700ee76e9 --- /dev/null +++ b/vits/losses.py @@ -0,0 +1,79 @@ +import torch + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += 
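A hedged sketch of how the dataset, collate function, and bucket sampler above are typically wired into a `DataLoader`. The hyper-parameter values, bucket boundaries, and filelist path are assumptions, not the project's real configuration:

```python
import torch
from types import SimpleNamespace

from vits.data_utils import (TextAudioSpeakerSet, TextAudioSpeakerCollate,
                             DistributedBucketSampler)

# Hypothetical hyper-parameters; real values come from the project config.
hps = SimpleNamespace(max_wav_value=32768.0, sampling_rate=32000,
                      segment_size=8000, hop_length=320)
dataset = TextAudioSpeakerSet("files/train.txt", hps)   # hypothetical filelist path

sampler = DistributedBucketSampler(dataset, batch_size=8,
                                   boundaries=[150, 300, 450, 600],   # length buckets in frames
                                   num_replicas=1, rank=0, shuffle=True)
loader = torch.utils.data.DataLoader(dataset,
                                     collate_fn=TextAudioSpeakerCollate(),
                                     batch_sampler=sampler,
                                     num_workers=4)
```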
torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + dr = dr.float() + dg = dg.float() + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg**2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + +def kl_loss(z_p, logs_q, m_p, logs_p, total_logdet, z_mask): + """ + z_p, logs_q: [b, h, t_t] + m_p, logs_p: [b, h, t_t] + total_logdet: [b] - total_logdet summed over each batch + """ + z_p = z_p.float() + logs_q = logs_q.float() + m_p = m_p.float() + logs_p = logs_p.float() + z_mask = z_mask.float() + + kl = logs_p - logs_q - 0.5 + kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) + kl = torch.sum(kl * z_mask) + # add total_logdet (Negative LL) + kl -= torch.sum(total_logdet) + l = kl / torch.sum(z_mask) + return l + + +def kl_loss_back(z_p, logs_q, m_p, logs_p, z_mask): + """ + z_p, logs_q: [b, h, t_t] + m_p, logs_p: [b, h, t_t] + """ + z_p = z_p.float() + logs_q = logs_q.float() + m_p = m_p.float() + logs_p = logs_p.float() + z_mask = z_mask.float() + + kl = logs_p - logs_q - 0.5 + kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) + kl = torch.sum(kl * z_mask) + l = kl / torch.sum(z_mask) + return l diff --git a/vits/models.py b/vits/models.py new file mode 100644 index 0000000000000000000000000000000000000000..49c74ded38ee5e3731d563b3c2cbdb2bb821a5ac --- /dev/null +++ b/vits/models.py @@ -0,0 +1,256 @@ + +import torch + +from torch import nn +from torch.nn import functional as F +from vits import attentions +from vits import commons +from vits import modules +from vits.utils import f0_to_coarse +from vits_decoder.generator import Generator +from vits.modules_grl import SpeakerClassifier + + +class TextEncoder(nn.Module): + def __init__(self, + in_channels, + vec_channels, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout): + super().__init__() + self.out_channels = out_channels + self.pre = nn.Conv1d(in_channels, hidden_channels, kernel_size=5, padding=2) + self.hub = nn.Conv1d(vec_channels, hidden_channels, kernel_size=5, padding=2) + self.pit = nn.Embedding(256, hidden_channels) + self.enc = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, v, f0): + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + v = torch.transpose(v, 1, -1) # [b, h, t] + v = self.hub(v) * x_mask + x = x + v + self.pit(f0).transpose(1, 2) + x = self.enc(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask, x + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.flows = nn.ModuleList() + for i in range(n_flows): + 
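The least-squares GAN objectives above can be sanity-checked on toy discriminator outputs; the expected values follow directly from the formulas:

```python
import torch

from vits.losses import discriminator_loss, generator_loss

# One discriminator scale: real samples scored near 1, generated samples near 0.
d_real = [torch.full((2, 10), 0.9)]
d_fake = [torch.full((2, 10), 0.1)]

loss_d, r_losses, g_losses = discriminator_loss(d_real, d_fake)
print(loss_d)        # ~ (1 - 0.9)^2 + 0.1^2 = 0.02

loss_g, gen_losses = generator_loss(d_fake)
print(loss_g)        # ~ (1 - 0.1)^2 = 0.81
```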
self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + total_logdet = 0 + for flow in self.flows: + x, log_det = flow(x, x_mask, g=g, reverse=reverse) + total_logdet += log_det + return x, total_logdet + else: + total_logdet = 0 + for flow in reversed(self.flows): + x, log_det = flow(x, x_mask, g=g, reverse=reverse) + total_logdet += log_det + return x, total_logdet + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.out_channels = out_channels + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class SynthesizerTrn(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + hp + ): + super().__init__() + self.segment_size = segment_size + self.emb_g = nn.Linear(hp.vits.spk_dim, hp.vits.gin_channels) + self.enc_p = TextEncoder( + hp.vits.ppg_dim, + hp.vits.vec_dim, + hp.vits.inter_channels, + hp.vits.hidden_channels, + hp.vits.filter_channels, + 2, + 6, + 3, + 0.1, + ) + self.speaker_classifier = SpeakerClassifier( + hp.vits.hidden_channels, + hp.vits.spk_dim, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + hp.vits.inter_channels, + hp.vits.hidden_channels, + 5, + 1, + 16, + gin_channels=hp.vits.gin_channels, + ) + self.flow = ResidualCouplingBlock( + hp.vits.inter_channels, + hp.vits.hidden_channels, + 5, + 1, + 4, + gin_channels=hp.vits.spk_dim + ) + self.dec = Generator(hp=hp) + + def forward(self, ppg, vec, pit, spec, spk, ppg_l, spec_l): + ppg = ppg + torch.randn_like(ppg) * 1 # Perturbation + vec = vec + torch.randn_like(vec) * 2 # Perturbation + g = self.emb_g(F.normalize(spk)).unsqueeze(-1) + z_p, m_p, logs_p, ppg_mask, x = self.enc_p( + ppg, ppg_l, vec, f0=f0_to_coarse(pit)) + z_q, m_q, logs_q, spec_mask = self.enc_q(spec, spec_l, g=g) + + z_slice, pit_slice, ids_slice = commons.rand_slice_segments_with_pitch( + z_q, pit, spec_l, self.segment_size) + audio = self.dec(spk, z_slice, pit_slice) + + # SNAC to flow + z_f, logdet_f = self.flow(z_q, spec_mask, g=spk) + z_r, logdet_r = self.flow(z_p, spec_mask, g=spk, reverse=True) + # speaker + spk_preds = self.speaker_classifier(x) + return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds + + def infer(self, ppg, vec, pit, spk, ppg_l): + ppg = ppg + torch.randn_like(ppg) * 0.0001 # Perturbation + z_p, m_p, logs_p, ppg_mask, x = self.enc_p( + ppg, ppg_l, vec, f0=f0_to_coarse(pit)) + z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True) + o = 
self.dec(spk, z * ppg_mask, f0=pit) + return o + + +class SynthesizerInfer(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + hp + ): + super().__init__() + self.segment_size = segment_size + self.enc_p = TextEncoder( + hp.vits.ppg_dim, + hp.vits.vec_dim, + hp.vits.inter_channels, + hp.vits.hidden_channels, + hp.vits.filter_channels, + 2, + 6, + 3, + 0.1, + ) + self.flow = ResidualCouplingBlock( + hp.vits.inter_channels, + hp.vits.hidden_channels, + 5, + 1, + 4, + gin_channels=hp.vits.spk_dim + ) + self.dec = Generator(hp=hp) + + def remove_weight_norm(self): + self.flow.remove_weight_norm() + self.dec.remove_weight_norm() + + def pitch2source(self, f0): + return self.dec.pitch2source(f0) + + def source2wav(self, source): + return self.dec.source2wav(source) + + def inference(self, ppg, vec, pit, spk, ppg_l, source): + z_p, m_p, logs_p, ppg_mask, x = self.enc_p( + ppg, ppg_l, vec, f0=f0_to_coarse(pit)) + z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True) + o = self.dec.inference(spk, z * ppg_mask, source) + return o diff --git a/vits/modules.py b/vits/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..0a0e545add587fd5f18437ef4756f34ae23b0e08 --- /dev/null +++ b/vits/modules.py @@ -0,0 +1,324 @@ +import torch +from torch import nn +from torch.nn import functional as F +from vits import commons + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + 
n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels:, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + # no use gin_channels + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + ) + self.post = nn.Conv1d( + hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + # SNAC Speaker-normalized Affine Coupling Layer + self.snac = nn.Conv1d(gin_channels, 2 * self.half_channels, 1) + + def forward(self, x, x_mask, g=None, reverse=False): + speaker = self.snac(g.unsqueeze(-1)) + speaker_m, speaker_v = speaker.chunk(2, dim=1) # (B, half_channels, 1) + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + # x0 norm + x0_norm = (x0 - speaker_m) * torch.exp(-speaker_v) * x_mask + h = self.pre(x0_norm) * x_mask + # don't use global condition + h = self.enc(h, x_mask) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + # x1 norm before affine xform + x1_norm = (x1 - 
speaker_m) * torch.exp(-speaker_v) * x_mask + x1 = (m + x1_norm * torch.exp(logs)) * x_mask + x = torch.cat([x0, x1], 1) + # speaker var to logdet + logdet = torch.sum(logs * x_mask, [1, 2]) - torch.sum( + speaker_v.expand(-1, -1, logs.size(-1)) * x_mask, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + # x1 denorm before output + x1 = (speaker_m + x1 * torch.exp(speaker_v)) * x_mask + x = torch.cat([x0, x1], 1) + # speaker var to logdet + logdet = torch.sum(-logs * x_mask, [1, 2]) + torch.sum( + speaker_v.expand(-1, -1, logs.size(-1)) * x_mask, [1, 2]) + return x, logdet + + def remove_weight_norm(self): + self.enc.remove_weight_norm() diff --git a/vits/modules_grl.py b/vits/modules_grl.py new file mode 100644 index 0000000000000000000000000000000000000000..3c8510725210f5f31b3677f2e8f30c3b6c215f0f --- /dev/null +++ b/vits/modules_grl.py @@ -0,0 +1,62 @@ +# Adapted from https://github.com/ubisoft/ubisoft-laforge-daft-exprt Apache License Version 2.0 +# Unsupervised Domain Adaptation by Backpropagation + +import torch +import torch.nn as nn + +from torch.autograd import Function +from torch.nn.utils import weight_norm + + +class GradientReversalFunction(Function): + @staticmethod + def forward(ctx, x, lambda_): + ctx.lambda_ = lambda_ + return x.clone() + + @staticmethod + def backward(ctx, grads): + lambda_ = ctx.lambda_ + lambda_ = grads.new_tensor(lambda_) + dx = -lambda_ * grads + return dx, None + + +class GradientReversal(torch.nn.Module): + ''' Gradient Reversal Layer + Y. Ganin, V. Lempitsky, + "Unsupervised Domain Adaptation by Backpropagation", + in ICML, 2015. + Forward pass is the identity function + In the backward pass, upstream gradients are multiplied by -lambda (i.e. gradient are reversed) + ''' + + def __init__(self, lambda_reversal=1): + super(GradientReversal, self).__init__() + self.lambda_ = lambda_reversal + + def forward(self, x): + return GradientReversalFunction.apply(x, self.lambda_) + + +class SpeakerClassifier(nn.Module): + + def __init__(self, embed_dim, spk_dim): + super(SpeakerClassifier, self).__init__() + self.classifier = nn.Sequential( + GradientReversal(lambda_reversal=1), + weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)), + nn.ReLU(), + weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)), + nn.ReLU(), + weight_norm(nn.Conv1d(embed_dim, spk_dim, kernel_size=5, padding=2)) + ) + + def forward(self, x): + ''' Forward function of Speaker Classifier: + x = (B, embed_dim, len) + ''' + # pass through classifier + outputs = self.classifier(x) # (B, nb_speakers) + outputs = torch.mean(outputs, dim=-1) + return outputs diff --git a/vits/spectrogram.py b/vits/spectrogram.py new file mode 100644 index 0000000000000000000000000000000000000000..67b54b1757f977f840ba97e0ad28b241fceeecd7 --- /dev/null +++ b/vits/spectrogram.py @@ -0,0 +1,140 @@ +import torch +import torch.utils.data + +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) 
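A tiny check of the gradient-reversal layer used by `SpeakerClassifier` above: the forward pass is the identity, while the backward pass negates (and scales by lambda) the incoming gradients, which is what discourages the encoder from leaking timbre:

```python
import torch

from vits.modules_grl import GradientReversal

grl = GradientReversal(lambda_reversal=1)
x = torch.ones(3, requires_grad=True)
y = grl(x).sum()      # forward is the identity
y.backward()
print(x.grad)         # tensor([-1., -1., -1.]) instead of [1., 1., 1.]
```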
+ return output + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + "_" + str(spec.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=spec.dtype, device=spec.device + ) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=y.dtype, device=y.device + ) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/vits/utils.py b/vits/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f2ae3a16ecd0112c41422a0696a09740a3f2f6a3 --- /dev/null +++ b/vits/utils.py @@ -0,0 +1,33 @@ +import torch +import numpy as np +from scipy.io.wavfile import read + +MATPLOTLIB_FLAG = False + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +f0_bin = 256 +f0_max = 1100.0 +f0_min = 50.0 +f0_mel_min = 1127 * np.log(1 + f0_min / 700) +f0_mel_max = 1127 * np.log(1 + f0_max / 700) + + +def f0_to_coarse(f0): + 
is_torch = isinstance(f0, torch.Tensor) + f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * \ + np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * \ + (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 + + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + f0_coarse = ( + f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) + assert f0_coarse.max() <= 255 and f0_coarse.min( + ) >= 1, (f0_coarse.max(), f0_coarse.min()) + return f0_coarse diff --git a/vits_decoder/LICENSE.txt b/vits_decoder/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9663595cc28938f88d6299acd3ba791542e4c0c --- /dev/null +++ b/vits_decoder/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 NVIDIA CORPORATION. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/vits_decoder/__init__.py b/vits_decoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..986a0cfe522626f45f6c2d4dede44374c86bbe71 --- /dev/null +++ b/vits_decoder/__init__.py @@ -0,0 +1 @@ +from .alias.act import SnakeAlias \ No newline at end of file diff --git a/vits_decoder/alias/LICENSE-alias.txt b/vits_decoder/alias/LICENSE-alias.txt new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/vits_decoder/alias/LICENSE-alias.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
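`f0_to_coarse` above maps Hz values onto a 256-bin mel-scaled index for the pitch embedding, with unvoiced frames collapsing to bin 1; a short sketch with illustrative F0 values:

```python
import torch

from vits.utils import f0_to_coarse

f0 = torch.tensor([0.0, 110.0, 220.0, 440.0, 880.0])   # Hz, 0 = unvoiced
coarse = f0_to_coarse(f0)
print(coarse)   # unvoiced frames map to bin 1, voiced frames to bins in [1, 255]
```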
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/vits_decoder/alias/LICENSE-snake.txt b/vits_decoder/alias/LICENSE-snake.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c28182ace9ed5b2d9c8ee4b0e003d1f6f10c757 --- /dev/null +++ b/vits_decoder/alias/LICENSE-snake.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Edward Dixon + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vits_decoder/alias/__init__.py b/vits_decoder/alias/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a2318b63198250856809c0cb46210a4147b829bc --- /dev/null +++ b/vits_decoder/alias/__init__.py @@ -0,0 +1,6 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +from .filter import * +from .resample import * +from .act import * \ No newline at end of file diff --git a/vits_decoder/alias/act.py b/vits_decoder/alias/act.py new file mode 100644 index 0000000000000000000000000000000000000000..308344fb6ccbc39317c584a3ee1fb2f29084678e --- /dev/null +++ b/vits_decoder/alias/act.py @@ -0,0 +1,129 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import sin, pow +from torch.nn import Parameter +from .resample import UpSample1d, DownSample1d + + +class Activation1d(nn.Module): + def __init__(self, + activation, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = activation + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x + + +class SnakeBeta(nn.Module): + ''' + A modified Snake function which uses separate parameters for the magnitude of the periodic components + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + References: + - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snakebeta(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + alpha is initialized to 1 by default, higher values = higher-frequency. + beta is initialized to 1 by default, higher values = higher-magnitude. + alpha will be trained along with the rest of your model. + ''' + super(SnakeBeta, self).__init__() + self.in_features = in_features + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + self.beta = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + self.beta = Parameter(torch.ones(in_features) * alpha) + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + SnakeBeta = x + 1/b * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze( + 0).unsqueeze(-1) # line up with x to [B, C, T] + beta = self.beta.unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + return x + + +class Mish(nn.Module): + """ + Mish activation function is proposed in "Mish: A Self + Regularized Non-Monotonic Neural Activation Function" + paper, https://arxiv.org/abs/1908.08681. 
+ """ + + def __init__(self): + super().__init__() + + def forward(self, x): + return x * torch.tanh(F.softplus(x)) + + +class SnakeAlias(nn.Module): + def __init__(self, + channels, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = SnakeBeta(channels, alpha_logscale=True) + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x \ No newline at end of file diff --git a/vits_decoder/alias/filter.py b/vits_decoder/alias/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..7ad6ea87c1f10ddd94c544037791d7a4634d5ae1 --- /dev/null +++ b/vits_decoder/alias/filter.py @@ -0,0 +1,95 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + +if 'sinc' in dir(torch): + sinc = torch.sinc +else: + # This code is adopted from adefossez's julius.core.sinc under the MIT License + # https://adefossez.github.io/julius/julius/core.html + # LICENSE is in incl_licenses directory. + def sinc(x: torch.Tensor): + """ + Implementation of sinc, i.e. sin(pi * x) / (pi * x) + __Warning__: Different to julius.sinc, the input is multiplied by `pi`! + """ + return torch.where(x == 0, + torch.tensor(1., device=x.device, dtype=x.dtype), + torch.sin(math.pi * x) / math.pi / x) + + +# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License +# https://adefossez.github.io/julius/julius/lowpass.html +# LICENSE is in incl_licenses directory. +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] + even = (kernel_size % 2 == 0) + half_size = kernel_size // 2 + + #For kaiser window + delta_f = 4 * half_width + A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 + if A > 50.: + beta = 0.1102 * (A - 8.7) + elif A >= 21.: + beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) + else: + beta = 0. + window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) + + # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio + if even: + time = (torch.arange(-half_size, half_size) + 0.5) + else: + time = torch.arange(kernel_size) - half_size + if cutoff == 0: + filter_ = torch.zeros_like(time) + else: + filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) + # Normalize filter to have sum = 1, otherwise we will have a small leakage + # of the constant component in the input signal. + filter_ /= filter_.sum() + filter = filter_.view(1, 1, kernel_size) + + return filter + + +class LowPassFilter1d(nn.Module): + def __init__(self, + cutoff=0.5, + half_width=0.6, + stride: int = 1, + padding: bool = True, + padding_mode: str = 'replicate', + kernel_size: int = 12): + # kernel_size should be even number for stylegan3 setup, + # in this implementation, odd number is also possible. 
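`SnakeAlias` above chains a 2x band-limited upsampler, the SnakeBeta periodic activation (x + 1/b * sin^2(a*x) with per-channel a and b), and a matching low-pass downsampler, so the harmonics created by the nonlinearity are generated at a higher rate and filtered out before returning to the original rate. A short usage sketch; the import path follows the files added in this diff and the tensor sizes are arbitrary:

```python
import torch
from vits_decoder.alias.act import SnakeAlias  # path as added in this diff

act = SnakeAlias(channels=64)   # 2x upsample -> SnakeBeta -> 2x low-pass downsample
x = torch.randn(3, 64, 400)     # [B, C, T]
y = act(x)
print(y.shape)                  # torch.Size([3, 64, 400]); the temporal length is preserved
```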
+ super().__init__() + if cutoff < -0.: + raise ValueError("Minimum cutoff must be larger than zero.") + if cutoff > 0.5: + raise ValueError("A cutoff above 0.5 does not make sense.") + self.kernel_size = kernel_size + self.even = (kernel_size % 2 == 0) + self.pad_left = kernel_size // 2 - int(self.even) + self.pad_right = kernel_size // 2 + self.stride = stride + self.padding = padding + self.padding_mode = padding_mode + filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) + self.register_buffer("filter", filter) + + #input [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + if self.padding: + x = F.pad(x, (self.pad_left, self.pad_right), + mode=self.padding_mode) + out = F.conv1d(x, self.filter.expand(C, -1, -1), + stride=self.stride, groups=C) + + return out \ No newline at end of file diff --git a/vits_decoder/alias/resample.py b/vits_decoder/alias/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..750e6c3402cc5ac939c4b9d075246562e0e1d1a7 --- /dev/null +++ b/vits_decoder/alias/resample.py @@ -0,0 +1,49 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch.nn as nn +from torch.nn import functional as F +from .filter import LowPassFilter1d +from .filter import kaiser_sinc_filter1d + + +class UpSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.stride = ratio + self.pad = self.kernel_size // ratio - 1 + self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + kernel_size=self.kernel_size) + self.register_buffer("filter", filter) + + # x: [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + x = F.pad(x, (self.pad, self.pad), mode='replicate') + x = self.ratio * F.conv_transpose1d( + x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) + x = x[..., self.pad_left:-self.pad_right] + + return x + + +class DownSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + stride=ratio, + kernel_size=self.kernel_size) + + def forward(self, x): + xx = self.lowpass(x) + + return xx \ No newline at end of file diff --git a/vits_decoder/bigv.py b/vits_decoder/bigv.py new file mode 100644 index 0000000000000000000000000000000000000000..029362c34b2c850cc2d59eea4410f77380d84bbe --- /dev/null +++ b/vits_decoder/bigv.py @@ -0,0 +1,64 @@ +import torch +import torch.nn as nn + +from torch.nn import Conv1d +from torch.nn.utils import weight_norm, remove_weight_norm +from .alias.act import SnakeAlias + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +class AMPBlock(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(AMPBlock, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + 
padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + # total number of conv layers + self.num_layers = len(self.convs1) + len(self.convs2) + + # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + SnakeAlias(channels) for _ in range(self.num_layers) + ]) + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) \ No newline at end of file diff --git a/vits_decoder/discriminator.py b/vits_decoder/discriminator.py new file mode 100644 index 0000000000000000000000000000000000000000..764c0ca806b707e4f36ca2abb64ce79971358dd9 --- /dev/null +++ b/vits_decoder/discriminator.py @@ -0,0 +1,39 @@ +import torch +import torch.nn as nn + +from omegaconf import OmegaConf +from .msd import ScaleDiscriminator +from .mpd import MultiPeriodDiscriminator +from .mrd import MultiResolutionDiscriminator + + +class Discriminator(nn.Module): + def __init__(self, hp): + super(Discriminator, self).__init__() + self.MRD = MultiResolutionDiscriminator(hp) + self.MPD = MultiPeriodDiscriminator(hp) + self.MSD = ScaleDiscriminator() + + def forward(self, x): + r = self.MRD(x) + p = self.MPD(x) + s = self.MSD(x) + return r + p + s + + +if __name__ == '__main__': + hp = OmegaConf.load('../config/base.yaml') + model = Discriminator(hp) + + x = torch.randn(3, 1, 16384) + print(x.shape) + + output = model(x) + for features, score in output: + for feat in features: + print(feat.shape) + print(score.shape) + + pytorch_total_params = sum(p.numel() + for p in model.parameters() if p.requires_grad) + print(pytorch_total_params) diff --git a/vits_decoder/generator.py b/vits_decoder/generator.py new file mode 100644 index 0000000000000000000000000000000000000000..787302bd496ee0545d9699b1cff1835a243cd62b --- /dev/null +++ b/vits_decoder/generator.py @@ -0,0 +1,200 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +from torch.nn import Conv1d +from torch.nn import ConvTranspose1d +from torch.nn.utils import weight_norm +from torch.nn.utils import remove_weight_norm + +from .nsf import SourceModuleHnNSF +from .bigv import init_weights, AMPBlock, SnakeAlias + + +class SpeakerAdapter(nn.Module): + + def __init__(self, + speaker_dim, + adapter_dim, + epsilon=1e-5 + ): + super(SpeakerAdapter, self).__init__() + self.speaker_dim = speaker_dim + self.adapter_dim = adapter_dim + self.epsilon = epsilon + self.W_scale = nn.Linear(self.speaker_dim, self.adapter_dim) + self.W_bias = nn.Linear(self.speaker_dim, self.adapter_dim) + self.reset_parameters() + + def 
reset_parameters(self): + torch.nn.init.constant_(self.W_scale.weight, 0.0) + torch.nn.init.constant_(self.W_scale.bias, 1.0) + torch.nn.init.constant_(self.W_bias.weight, 0.0) + torch.nn.init.constant_(self.W_bias.bias, 0.0) + + def forward(self, x, speaker_embedding): + x = x.transpose(1, -1) + mean = x.mean(dim=-1, keepdim=True) + var = ((x - mean) ** 2).mean(dim=-1, keepdim=True) + std = (var + self.epsilon).sqrt() + y = (x - mean) / std + scale = self.W_scale(speaker_embedding) + bias = self.W_bias(speaker_embedding) + y *= scale.unsqueeze(1) + y += bias.unsqueeze(1) + y = y.transpose(1, -1) + return y + + +class Generator(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, hp): + super(Generator, self).__init__() + self.hp = hp + self.num_kernels = len(hp.gen.resblock_kernel_sizes) + self.num_upsamples = len(hp.gen.upsample_rates) + # speaker adaper, 256 should change by what speaker encoder you use + self.adapter = SpeakerAdapter(hp.vits.spk_dim, hp.gen.upsample_input) + # pre conv + self.conv_pre = Conv1d(hp.gen.upsample_input, + hp.gen.upsample_initial_channel, 7, 1, padding=3) + # nsf + self.f0_upsamp = torch.nn.Upsample( + scale_factor=np.prod(hp.gen.upsample_rates)) + self.m_source = SourceModuleHnNSF(sampling_rate=hp.data.sampling_rate) + self.noise_convs = nn.ModuleList() + # transposed conv-based upsamplers. does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(hp.gen.upsample_rates, hp.gen.upsample_kernel_sizes)): + # print(f'ups: {i} {k}, {u}, {(k - u) // 2}') + # base + self.ups.append( + weight_norm( + ConvTranspose1d( + hp.gen.upsample_initial_channel // (2 ** i), + hp.gen.upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2) + ) + ) + # nsf + if i + 1 < len(hp.gen.upsample_rates): + stride_f0 = np.prod(hp.gen.upsample_rates[i + 1:]) + stride_f0 = int(stride_f0) + self.noise_convs.append( + Conv1d( + 1, + hp.gen.upsample_initial_channel // (2 ** (i + 1)), + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append( + Conv1d(1, hp.gen.upsample_initial_channel // + (2 ** (i + 1)), kernel_size=1) + ) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = hp.gen.upsample_initial_channel // (2 ** (i + 1)) + for k, d in zip(hp.gen.resblock_kernel_sizes, hp.gen.resblock_dilation_sizes): + self.resblocks.append(AMPBlock(ch, k, d)) + + # post conv + self.activation_post = SnakeAlias(ch) + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + # weight initialization + self.ups.apply(init_weights) + + def forward(self, spk, x, f0): + # Perturbation + x = x + torch.randn_like(x) + # adapter + x = self.adapter(x, spk) + x = self.conv_pre(x) + x = x * torch.tanh(F.softplus(x)) + # nsf + f0 = f0[:, None] + f0 = self.f0_upsamp(f0).transpose(1, 2) + har_source = self.m_source(f0) + har_source = har_source.transpose(1, 2) + + for i in range(self.num_upsamples): + # upsampling + x = self.ups[i](x) + # nsf + x_source = self.noise_convs[i](har_source) + x = x + x_source + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + 
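In `Generator.forward` above, the content latent is perturbed with Gaussian noise, normalized and re-scaled per speaker by `SpeakerAdapter`, and then upsampled stage by stage while an NSF harmonic source built from the upsampled F0 is injected at each stage. `SpeakerAdapter` itself is a speaker-conditional layer norm; a functional restatement of that single step, assuming only the shapes used above:

```python
import torch
import torch.nn as nn

def speaker_adapter(x: torch.Tensor, spk: torch.Tensor,
                    w_scale: nn.Linear, w_bias: nn.Linear, eps: float = 1e-5) -> torch.Tensor:
    """Speaker-conditional layer norm, mirroring SpeakerAdapter.forward above.

    x:   [B, C, T] content features
    spk: [B, S]    speaker embedding
    w_scale, w_bias: nn.Linear(S, C); with the init above (scale weight 0 / bias 1, bias layer 0)
    the module starts as a plain layer norm and learns the speaker-dependent shift and scale.
    """
    x = x.transpose(1, -1)                               # [B, T, C]
    mean = x.mean(dim=-1, keepdim=True)
    var = ((x - mean) ** 2).mean(dim=-1, keepdim=True)
    y = (x - mean) / (var + eps).sqrt()                  # normalize each frame over channels
    y = y * w_scale(spk).unsqueeze(1) + w_bias(spk).unsqueeze(1)
    return y.transpose(1, -1)                            # back to [B, C, T]
```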
+ def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + def eval(self, inference=False): + super(Generator, self).eval() + # don't remove weight norm while validation in training loop + if inference: + self.remove_weight_norm() + + def pitch2source(self, f0): + f0 = f0[:, None] + f0 = self.f0_upsamp(f0).transpose(1, 2) # [1,len,1] + har_source = self.m_source(f0) + har_source = har_source.transpose(1, 2) # [1,1,len] + return har_source + + def source2wav(self, audio): + MAX_WAV_VALUE = 32768.0 + audio = audio.squeeze() + audio = MAX_WAV_VALUE * audio + audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1) + audio = audio.short() + return audio.cpu().detach().numpy() + + def inference(self, spk, x, har_source): + # adapter + x = self.adapter(x, spk) + x = self.conv_pre(x) + x = x * torch.tanh(F.softplus(x)) + + for i in range(self.num_upsamples): + # upsampling + x = self.ups[i](x) + # nsf + x_source = self.noise_convs[i](har_source) + x = x + x_source + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x diff --git a/vits_decoder/med.py b/vits_decoder/med.py new file mode 100644 index 0000000000000000000000000000000000000000..77554d3c07b98328c0cc5c9b0b8301c22568f55c --- /dev/null +++ b/vits_decoder/med.py @@ -0,0 +1,65 @@ +import torch +import torchaudio +import typing as T + + +class MelspecDiscriminator(torch.nn.Module): + """mel spectrogram (frequency domain) discriminator""" + + def __init__(self) -> None: + super().__init__() + self.SAMPLE_RATE = 48000 + # mel filterbank transform + self._melspec = torchaudio.transforms.MelSpectrogram( + sample_rate=self.SAMPLE_RATE, + n_fft=2048, + win_length=int(0.025 * self.SAMPLE_RATE), + hop_length=int(0.010 * self.SAMPLE_RATE), + n_mels=128, + power=1, + ) + + # time-frequency 2D convolutions + kernel_sizes = [(7, 7), (4, 4), (4, 4), (4, 4)] + strides = [(1, 2), (1, 2), (1, 2), (1, 2)] + self._convs = torch.nn.ModuleList( + [ + torch.nn.Sequential( + torch.nn.Conv2d( + in_channels=1 if i == 0 else 32, + out_channels=64, + kernel_size=k, + stride=s, + padding=(1, 2), + bias=False, + ), + torch.nn.BatchNorm2d(num_features=64), + torch.nn.GLU(dim=1), + ) + for i, (k, s) in enumerate(zip(kernel_sizes, strides)) + ] + ) + + # output adversarial projection + self._postnet = torch.nn.Conv2d( + in_channels=32, + out_channels=1, + kernel_size=(15, 3), + stride=(1, 2), + ) + + def forward(self, x: torch.Tensor) -> T.Tuple[torch.Tensor, T.List[torch.Tensor]]: + # apply the log-scale mel spectrogram transform + x = torch.log(self._melspec(x) + 1e-5) + + # compute hidden layers and feature maps + f = [] + for c in self._convs: + x = c(x) + f.append(x) + + # apply the output projection and global average pooling + x = self._postnet(x) + x = x.mean(dim=[-2, -1]) + + return [(f, x)] diff --git a/vits_decoder/mpd.py b/vits_decoder/mpd.py new file mode 100644 index 0000000000000000000000000000000000000000..2dc63e859dd2920f9d02b285ebc4dae8cf318d6a --- /dev/null +++ b/vits_decoder/mpd.py @@ -0,0 +1,61 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm, spectral_norm + +class DiscriminatorP(nn.Module): + def __init__(self, hp, period): + super(DiscriminatorP, 
self).__init__() + + self.LRELU_SLOPE = hp.mpd.lReLU_slope + self.period = period + + kernel_size = hp.mpd.kernel_size + stride = hp.mpd.stride + norm_f = weight_norm if hp.mpd.use_spectral_norm == False else spectral_norm + + self.convs = nn.ModuleList([ + norm_f(nn.Conv2d(1, 64, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), + norm_f(nn.Conv2d(64, 128, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), + norm_f(nn.Conv2d(128, 256, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), + norm_f(nn.Conv2d(256, 512, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), + norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), 1, padding=(kernel_size // 2, 0))), + ]) + self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, self.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return fmap, x + + +class MultiPeriodDiscriminator(nn.Module): + def __init__(self, hp): + super(MultiPeriodDiscriminator, self).__init__() + + self.discriminators = nn.ModuleList( + [DiscriminatorP(hp, period) for period in hp.mpd.periods] + ) + + def forward(self, x): + ret = list() + for disc in self.discriminators: + ret.append(disc(x)) + + return ret # [(feat, score), (feat, score), (feat, score), (feat, score), (feat, score)] diff --git a/vits_decoder/mrd.py b/vits_decoder/mrd.py new file mode 100644 index 0000000000000000000000000000000000000000..da6db1a416366603d2e65b400d66c44262e2baef --- /dev/null +++ b/vits_decoder/mrd.py @@ -0,0 +1,62 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm, spectral_norm + +class DiscriminatorR(torch.nn.Module): + def __init__(self, hp, resolution): + super(DiscriminatorR, self).__init__() + + self.resolution = resolution + self.LRELU_SLOPE = hp.mpd.lReLU_slope + + norm_f = weight_norm if hp.mrd.use_spectral_norm == False else spectral_norm + + self.convs = nn.ModuleList([ + norm_f(nn.Conv2d(1, 32, (3, 9), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, (3, 3), padding=(1, 1))), + ]) + self.conv_post = norm_f(nn.Conv2d(32, 1, (3, 3), padding=(1, 1))) + + def forward(self, x): + fmap = [] + + x = self.spectrogram(x) + x = x.unsqueeze(1) + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, self.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return fmap, x + + def spectrogram(self, x): + n_fft, hop_length, win_length = self.resolution + x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') + x = x.squeeze(1) + x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=False) #[B, F, TT, 2] + mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] + + return mag + + +class MultiResolutionDiscriminator(torch.nn.Module): + def __init__(self, hp): + super(MultiResolutionDiscriminator, self).__init__() + self.resolutions = eval(hp.mrd.resolutions) + self.discriminators = 
nn.ModuleList( + [DiscriminatorR(hp, resolution) for resolution in self.resolutions] + ) + + def forward(self, x): + ret = list() + for disc in self.discriminators: + ret.append(disc(x)) + + return ret # [(feat, score), (feat, score), (feat, score)] diff --git a/vits_decoder/msd.py b/vits_decoder/msd.py new file mode 100644 index 0000000000000000000000000000000000000000..9e254fa3f1b53368332751a3e7235e93297c44c3 --- /dev/null +++ b/vits_decoder/msd.py @@ -0,0 +1,29 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm + + +class ScaleDiscriminator(torch.nn.Module): + def __init__(self): + super(ScaleDiscriminator, self).__init__() + self.convs = nn.ModuleList([ + weight_norm(nn.Conv1d(1, 16, 15, 1, padding=7)), + weight_norm(nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)), + weight_norm(nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)), + weight_norm(nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + weight_norm(nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + weight_norm(nn.Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = weight_norm(nn.Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, 0.1) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + return [(fmap, x)] diff --git a/vits_decoder/nsf.py b/vits_decoder/nsf.py new file mode 100644 index 0000000000000000000000000000000000000000..1e9e6c7e344eb616a7ca427da1a02a2c2093c942 --- /dev/null +++ b/vits_decoder/nsf.py @@ -0,0 +1,394 @@ +import torch +import numpy as np +import sys +import torch.nn.functional as torch_nn_func + + +class PulseGen(torch.nn.Module): + """Definition of Pulse train generator + + There are many ways to implement pulse generator. + Here, PulseGen is based on SinGen. 
For a perfect + """ + + def __init__(self, samp_rate, pulse_amp=0.1, noise_std=0.003, voiced_threshold=0): + super(PulseGen, self).__init__() + self.pulse_amp = pulse_amp + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + self.noise_std = noise_std + self.l_sinegen = SineGen( + self.sampling_rate, + harmonic_num=0, + sine_amp=self.pulse_amp, + noise_std=0, + voiced_threshold=self.voiced_threshold, + flag_for_pulse=True, + ) + + def forward(self, f0): + """Pulse train generator + pulse_train, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output pulse_train: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + + Note: self.l_sine doesn't make sure that the initial phase of + a voiced segment is np.pi, the first pulse in a voiced segment + may not be at the first time step within a voiced segment + """ + with torch.no_grad(): + sine_wav, uv, noise = self.l_sinegen(f0) + + # sine without additive noise + pure_sine = sine_wav - noise + + # step t corresponds to a pulse if + # sine[t] > sine[t+1] & sine[t] > sine[t-1] + # & sine[t-1], sine[t+1], and sine[t] are voiced + # or + # sine[t] is voiced, sine[t-1] is unvoiced + # we use torch.roll to simulate sine[t+1] and sine[t-1] + sine_1 = torch.roll(pure_sine, shifts=1, dims=1) + uv_1 = torch.roll(uv, shifts=1, dims=1) + uv_1[:, 0, :] = 0 + sine_2 = torch.roll(pure_sine, shifts=-1, dims=1) + uv_2 = torch.roll(uv, shifts=-1, dims=1) + uv_2[:, -1, :] = 0 + + loc = (pure_sine > sine_1) * (pure_sine > sine_2) \ + * (uv_1 > 0) * (uv_2 > 0) * (uv > 0) \ + + (uv_1 < 1) * (uv > 0) + + # pulse train without noise + pulse_train = pure_sine * loc + + # additive noise to pulse train + # note that noise from sinegen is zero in voiced regions + pulse_noise = torch.randn_like(pure_sine) * self.noise_std + + # with additive noise on pulse, and unvoiced regions + pulse_train += pulse_noise * loc + pulse_noise * (1 - uv) + return pulse_train, sine_wav, uv, pulse_noise + + +class SignalsConv1d(torch.nn.Module): + """Filtering input signal with time invariant filter + Note: FIRFilter conducted filtering given fixed FIR weight + SignalsConv1d convolves two signals + Note: this is based on torch.nn.functional.conv1d + + """ + + def __init__(self): + super(SignalsConv1d, self).__init__() + + def forward(self, signal, system_ir): + """output = forward(signal, system_ir) + + signal: (batchsize, length1, dim) + system_ir: (length2, dim) + + output: (batchsize, length1, dim) + """ + if signal.shape[-1] != system_ir.shape[-1]: + print("Error: SignalsConv1d expects shape:") + print("signal (batchsize, length1, dim)") + print("system_id (batchsize, length2, dim)") + print("But received signal: {:s}".format(str(signal.shape))) + print(" system_ir: {:s}".format(str(system_ir.shape))) + sys.exit(1) + padding_length = system_ir.shape[0] - 1 + groups = signal.shape[-1] + + # pad signal on the left + signal_pad = torch_nn_func.pad(signal.permute(0, 2, 1), (padding_length, 0)) + # prepare system impulse response as (dim, 1, length2) + # also flip the impulse response + ir = torch.flip(system_ir.unsqueeze(1).permute(2, 1, 0), dims=[2]) + # convolute + output = torch_nn_func.conv1d(signal_pad, ir, groups=groups) + return output.permute(0, 2, 1) + + +class CyclicNoiseGen_v1(torch.nn.Module): + """CyclicnoiseGen_v1 + Cyclic noise with a single parameter of beta. 
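`SignalsConv1d` above implements time-invariant FIR filtering as a causal grouped convolution (it left-pads the signal and flips the impulse response), and `CyclicNoiseGen_v1` uses it to convolve the pulse train with an exponentially decaying noise burst. A small self-contained check of the helper; the values are illustrative only:

```python
import torch
from vits_decoder.nsf import SignalsConv1d  # module added in this diff

conv = SignalsConv1d()
signal = torch.zeros(1, 100, 1)                          # (B, T, D)
signal[0, 10, 0] = 1.0                                   # a single unit pulse at t = 10
ir = torch.exp(-torch.arange(20.0) / 5.0).unsqueeze(-1)  # decaying impulse response, (L, D)
out = conv(signal, ir)                                   # (1, 100, 1)
assert torch.allclose(out[0, 10:30, 0], ir[:, 0])        # the response is copied starting at t = 10
```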
+ Pytorch v1 implementation assumes f_t is also fixed + """ + + def __init__(self, samp_rate, noise_std=0.003, voiced_threshold=0): + super(CyclicNoiseGen_v1, self).__init__() + self.samp_rate = samp_rate + self.noise_std = noise_std + self.voiced_threshold = voiced_threshold + + self.l_pulse = PulseGen( + samp_rate, + pulse_amp=1.0, + noise_std=noise_std, + voiced_threshold=voiced_threshold, + ) + self.l_conv = SignalsConv1d() + + def noise_decay(self, beta, f0mean): + """decayed_noise = noise_decay(beta, f0mean) + decayed_noise = n[t]exp(-t * f_mean / beta / samp_rate) + + beta: (dim=1) or (batchsize=1, 1, dim=1) + f0mean (batchsize=1, 1, dim=1) + + decayed_noise (batchsize=1, length, dim=1) + """ + with torch.no_grad(): + # exp(-1.0 n / T) < 0.01 => n > -log(0.01)*T = 4.60*T + # truncate the noise when decayed by -40 dB + length = 4.6 * self.samp_rate / f0mean + length = length.int() + time_idx = torch.arange(0, length, device=beta.device) + time_idx = time_idx.unsqueeze(0).unsqueeze(2) + time_idx = time_idx.repeat(beta.shape[0], 1, beta.shape[2]) + + noise = torch.randn(time_idx.shape, device=beta.device) + + # due to Pytorch implementation, use f0_mean as the f0 factor + decay = torch.exp(-time_idx * f0mean / beta / self.samp_rate) + return noise * self.noise_std * decay + + def forward(self, f0s, beta): + """Producde cyclic-noise""" + # pulse train + pulse_train, sine_wav, uv, noise = self.l_pulse(f0s) + pure_pulse = pulse_train - noise + + # decayed_noise (length, dim=1) + if (uv < 1).all(): + # all unvoiced + cyc_noise = torch.zeros_like(sine_wav) + else: + f0mean = f0s[uv > 0].mean() + + decayed_noise = self.noise_decay(beta, f0mean)[0, :, :] + # convolute + cyc_noise = self.l_conv(pure_pulse, decayed_noise) + + # add noise in invoiced segments + cyc_noise = cyc_noise + noise * (1.0 - uv) + return cyc_noise, pulse_train, sine_wav, uv, noise + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + self.flag_for_pulse = flag_for_pulse + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def _f02sine(self, f0_values): + """f0_values: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. 
The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand( + f0_values.shape[0], f0_values.shape[2], device=f0_values.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + if not self.flag_for_pulse: + # for normal case + + # To prevent torch.cumsum numerical overflow, + # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. + # Buffer tmp_over_one_idx indicates the time step to add -1. + # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + else: + # If necessary, make sure that the first time step of every + # voiced segments is sin(pi) or cos(0) + # This is used for pulse-train generation + + # identify the last time step in unvoiced segments + uv = self._f02uv(f0_values) + uv_1 = torch.roll(uv, shifts=-1, dims=1) + uv_1[:, -1, :] = 1 + u_loc = (uv < 1) * (uv_1 > 0) + + # get the instantanouse phase + tmp_cumsum = torch.cumsum(rad_values, dim=1) + # different batch needs to be processed differently + for idx in range(f0_values.shape[0]): + temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] + temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] + # stores the accumulation of i.phase within + # each voiced segments + tmp_cumsum[idx, :, :] = 0 + tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum + + # rad_values - tmp_cumsum: remove the accumulation of i.phase + # within the previous voiced segment. + i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) + + # get the sines + sines = torch.cos(i_phase * 2 * np.pi) + return sines + + def forward(self, f0): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) + + # generate sine waveforms + sine_waves = self._f02sine(f0_buf) * self.sine_amp + + # generate uv signal + # uv = torch.ones(f0.shape) + # uv = uv * (f0 > self.voiced_threshold) + uv = self._f02uv(f0) + + # noise: for unvoiced should be similar to sine_amp + # std = self.sine_amp/3 -> max value ~ self.sine_amp + # . 
for voiced regions is self.noise_std + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + # first: set the unvoiced part to 0 by uv + # then: additive noise + sine_waves = sine_waves * uv + noise + return sine_waves + + +class SourceModuleCycNoise_v1(torch.nn.Module): + """SourceModuleCycNoise_v1 + SourceModule(sampling_rate, noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + + noise_std: std of Gaussian noise (default: 0.003) + voiced_threshold: threshold to set U/V given F0 (default: 0) + + cyc, noise, uv = SourceModuleCycNoise_v1(F0_upsampled, beta) + F0_upsampled (batchsize, length, 1) + beta (1) + cyc (batchsize, length, 1) + noise (batchsize, length, 1) + uv (batchsize, length, 1) + """ + + def __init__(self, sampling_rate, noise_std=0.003, voiced_threshod=0): + super(SourceModuleCycNoise_v1, self).__init__() + self.sampling_rate = sampling_rate + self.noise_std = noise_std + self.l_cyc_gen = CyclicNoiseGen_v1(sampling_rate, noise_std, voiced_threshod) + + def forward(self, f0_upsamped, beta): + """ + cyc, noise, uv = SourceModuleCycNoise_v1(F0, beta) + F0_upsampled (batchsize, length, 1) + beta (1) + cyc (batchsize, length, 1) + noise (batchsize, length, 1) + uv (batchsize, length, 1) + """ + # source for harmonic branch + cyc, pulse, sine, uv, add_noi = self.l_cyc_gen(f0_upsamped, beta) + + # source for noise branch, in the same shape as uv + noise = torch.randn_like(uv) * self.noise_std / 3 + return cyc, noise, uv + + +class SourceModuleHnNSF(torch.nn.Module): + def __init__( + self, + sampling_rate=32000, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + ): + super(SourceModuleHnNSF, self).__init__() + harmonic_num = 10 + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_tanh = torch.nn.Tanh() + self.register_buffer('merge_w', torch.FloatTensor([[ + 0.2942, -0.2243, 0.0033, -0.0056, -0.0020, -0.0046, + 0.0221, -0.0083, -0.0241, -0.0036, -0.0581]])) + self.register_buffer('merge_b', torch.FloatTensor([0.0008])) + + def forward(self, x): + """ + Sine_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + """ + # source for harmonic branch + sine_wavs = self.l_sin_gen(x) + sine_wavs = torch_nn_func.linear( + sine_wavs, self.merge_w) + self.merge_b + sine_merge = self.l_tanh(sine_wavs) + return sine_merge diff --git a/vits_extend/__init__.py b/vits_extend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vits_extend/dataloader.py b/vits_extend/dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..5f26fe0e15f719b6594110799f3863e720377150 --- /dev/null +++ b/vits_extend/dataloader.py @@ -0,0 +1,38 @@ +from torch.utils.data import DataLoader +from vits.data_utils import DistributedBucketSampler +from vits.data_utils import TextAudioSpeakerCollate +from vits.data_utils import TextAudioSpeakerSet + + +def create_dataloader_train(hps, n_gpus, rank): + collate_fn = TextAudioSpeakerCollate() + train_dataset = TextAudioSpeakerSet(hps.data.training_files, hps.data) + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size, + [150, 300, 450], + num_replicas=n_gpus, + rank=rank, + 
shuffle=True) + train_loader = DataLoader( + train_dataset, + num_workers=4, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler) + return train_loader + + +def create_dataloader_eval(hps): + collate_fn = TextAudioSpeakerCollate() + eval_dataset = TextAudioSpeakerSet(hps.data.validation_files, hps.data) + eval_loader = DataLoader( + eval_dataset, + num_workers=2, + shuffle=False, + batch_size=hps.train.batch_size, + pin_memory=True, + drop_last=False, + collate_fn=collate_fn) + return eval_loader diff --git a/vits_extend/plotting.py b/vits_extend/plotting.py new file mode 100644 index 0000000000000000000000000000000000000000..89ff909af85f1ab8788a8047abe8434844a8e16c --- /dev/null +++ b/vits_extend/plotting.py @@ -0,0 +1,49 @@ +import logging +mpl_logger = logging.getLogger('matplotlib') # must before import matplotlib +mpl_logger.setLevel(logging.WARNING) +import matplotlib +matplotlib.use("Agg") + +import numpy as np +import matplotlib.pylab as plt + + +def save_figure_to_numpy(fig): + # save it to a numpy array. + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + data = np.transpose(data, (2, 0, 1)) + return data + + +def plot_waveform_to_numpy(waveform): + fig, ax = plt.subplots(figsize=(12, 4)) + ax.plot() + ax.plot(range(len(waveform)), waveform, + linewidth=0.1, alpha=0.7, color='blue') + + plt.xlabel("Samples") + plt.ylabel("Amplitude") + plt.ylim(-1, 1) + plt.tight_layout() + + fig.canvas.draw() + data = save_figure_to_numpy(fig) + plt.close() + + return data + + +def plot_spectrogram_to_numpy(spectrogram): + fig, ax = plt.subplots(figsize=(12, 4)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = save_figure_to_numpy(fig) + plt.close() + return data diff --git a/vits_extend/stft.py b/vits_extend/stft.py new file mode 100644 index 0000000000000000000000000000000000000000..9510305ffa19528c80380f1e30bb71e38e9fbcf8 --- /dev/null +++ b/vits_extend/stft.py @@ -0,0 +1,104 @@ +# MIT License +# +# Copyright (c) 2020 Jungil Kong +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
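`create_dataloader_train` above pairs `TextAudioSpeakerSet` with a `DistributedBucketSampler` (bucket boundaries 150/300/450 frames), so each batch groups utterances of similar length and padding waste stays low; shuffling is handled by the sampler, which is why the `DataLoader` itself uses `shuffle=False`. A hedged usage sketch; the config path is hypothetical, and the nine-field batch layout matches the training loop later in this diff:

```python
from omegaconf import OmegaConf
from vits_extend.dataloader import create_dataloader_train, create_dataloader_eval

# Hypothetical config path; the project loads its hyperparameters elsewhere.
hps = OmegaConf.load("configs/base.yaml")
train_loader = create_dataloader_train(hps, n_gpus=1, rank=0)
eval_loader = create_dataloader_eval(hps)

for ppg, ppg_l, vec, pit, spk, spec, spec_l, audio, audio_l in train_loader:
    break  # one bucketed, padded batch per iteration
```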
+ +import math +import os +import random +import torch +import torch.utils.data +import numpy as np +from librosa.util import normalize +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + + +class TacotronSTFT(torch.nn.Module): + def __init__(self, filter_length=512, hop_length=160, win_length=512, + n_mel_channels=80, sampling_rate=16000, mel_fmin=0.0, + mel_fmax=None, center=False, device='cpu'): + super(TacotronSTFT, self).__init__() + self.n_mel_channels = n_mel_channels + self.sampling_rate = sampling_rate + self.n_fft = filter_length + self.hop_size = hop_length + self.win_size = win_length + self.fmin = mel_fmin + self.fmax = mel_fmax + self.center = center + + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax) + + mel_basis = torch.from_numpy(mel).float().to(device) + hann_window = torch.hann_window(win_length).to(device) + + self.register_buffer('mel_basis', mel_basis) + self.register_buffer('hann_window', hann_window) + + def linear_spectrogram(self, y): + assert (torch.min(y.data) >= -1) + assert (torch.max(y.data) <= 1) + + y = torch.nn.functional.pad(y.unsqueeze(1), + (int((self.n_fft - self.hop_size) / 2), int((self.n_fft - self.hop_size) / 2)), + mode='reflect') + y = y.squeeze(1) + spec = torch.stft(y, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window, + center=self.center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + spec = torch.norm(spec, p=2, dim=-1) + + return spec + + def mel_spectrogram(self, y): + """Computes mel-spectrograms from a batch of waves + PARAMS + ------ + y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] + + RETURNS + ------- + mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) + """ + assert(torch.min(y.data) >= -1) + assert(torch.max(y.data) <= 1) + + y = torch.nn.functional.pad(y.unsqueeze(1), + (int((self.n_fft - self.hop_size) / 2), int((self.n_fft - self.hop_size) / 2)), + mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window, + center=self.center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) + + spec = torch.matmul(self.mel_basis, spec) + spec = self.spectral_normalize_torch(spec) + + return spec + + def spectral_normalize_torch(self, magnitudes): + output = self.dynamic_range_compression_torch(magnitudes) + return output + + def dynamic_range_compression_torch(self, x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) diff --git a/vits_extend/stft_loss.py b/vits_extend/stft_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..ed672b0000b993067668413f2dc6562ae8febdeb --- /dev/null +++ b/vits_extend/stft_loss.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""STFT-based Loss modules.""" + +import torch +import torch.nn.functional as F + + +def stft(x, fft_size, hop_size, win_length, window): + """Perform STFT and convert to magnitude spectrogram. + Args: + x (Tensor): Input signal tensor (B, T). + fft_size (int): FFT size. + hop_size (int): Hop size. + win_length (int): Window length. + window (str): Window function type. + Returns: + Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 
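`TacotronSTFT.mel_spectrogram` above follows the usual HiFi-GAN recipe: reflect-pad by (n_fft - hop)/2, take a non-centered Hann-window STFT, project the magnitudes through a fixed librosa mel filterbank, and apply log dynamic-range compression. A short usage sketch with the defaults from the signature above (16 kHz, 80 mels, hop 160):

```python
import torch
from vits_extend.stft import TacotronSTFT  # module added in this diff

stft = TacotronSTFT()                    # defaults: filter_length=512, hop_length=160, 80 mels, 16 kHz
wav = torch.rand(2, 16000) * 2.0 - 1.0   # one second per item, scaled into [-1, 1]
mel = stft.mel_spectrogram(wav)          # [2, 80, 100] log-compressed mel frames
lin = stft.linear_spectrogram(wav)       # [2, 257, 100] linear magnitudes
```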
+ """ + x_stft = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=False) + real = x_stft[..., 0] + imag = x_stft[..., 1] + + # NOTE(kan-bayashi): clamp is needed to avoid nan or inf + return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) + + +class SpectralConvergengeLoss(torch.nn.Module): + """Spectral convergence loss module.""" + + def __init__(self): + """Initilize spectral convergence loss module.""" + super(SpectralConvergengeLoss, self).__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Spectral convergence loss value. + """ + return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") + + +class LogSTFTMagnitudeLoss(torch.nn.Module): + """Log STFT magnitude loss module.""" + + def __init__(self): + """Initilize los STFT magnitude loss module.""" + super(LogSTFTMagnitudeLoss, self).__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Log STFT magnitude loss value. + """ + return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) + + +class STFTLoss(torch.nn.Module): + """STFT loss module.""" + + def __init__(self, device, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): + """Initialize STFT loss module.""" + super(STFTLoss, self).__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + self.window = getattr(torch, window)(win_length).to(device) + self.spectral_convergenge_loss = SpectralConvergengeLoss() + self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() + + def forward(self, x, y): + """Calculate forward propagation. + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. + """ + x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) + y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) + sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) + mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) + + return sc_loss, mag_loss + + +class MultiResolutionSTFTLoss(torch.nn.Module): + """Multi resolution STFT loss module.""" + + def __init__(self, + device, + resolutions, + window="hann_window"): + """Initialize Multi resolution STFT loss module. + Args: + resolutions (list): List of (FFT size, hop size, window length). + window (str): Window function type. + """ + super(MultiResolutionSTFTLoss, self).__init__() + self.stft_losses = torch.nn.ModuleList() + for fs, ss, wl in resolutions: + self.stft_losses += [STFTLoss(device, fs, ss, wl, window)] + + def forward(self, x, y): + """Calculate forward propagation. + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. 
+ """ + sc_loss = 0.0 + mag_loss = 0.0 + for f in self.stft_losses: + sc_l, mag_l = f(x, y) + sc_loss += sc_l + mag_loss += mag_l + + sc_loss /= len(self.stft_losses) + mag_loss /= len(self.stft_losses) + + return sc_loss, mag_loss diff --git a/vits_extend/train.py b/vits_extend/train.py new file mode 100644 index 0000000000000000000000000000000000000000..9a93c294476e70262fa3e399c74c60698157b13c --- /dev/null +++ b/vits_extend/train.py @@ -0,0 +1,312 @@ +import os +import time +import logging +import math +import tqdm +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.distributed import init_process_group +from torch.nn.parallel import DistributedDataParallel + +from vits_extend.dataloader import create_dataloader_train +from vits_extend.dataloader import create_dataloader_eval +from vits_extend.writer import MyWriter +from vits_extend.stft import TacotronSTFT +from vits_extend.stft_loss import MultiResolutionSTFTLoss +from vits_extend.validation import validate +from vits_decoder.discriminator import Discriminator +from vits.models import SynthesizerTrn +from vits import commons +from vits.losses import kl_loss +from vits.commons import clip_grad_value_ + + +def load_part(model, saved_state_dict): + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + if k.startswith('TODO'): + new_state_dict[k] = v + else: + new_state_dict[k] = saved_state_dict[k] + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + return model + + +def load_model(model, saved_state_dict): + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + print("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + return model + + +def train(rank, args, chkpt_path, hp, hp_str): + + if args.num_gpus > 1: + init_process_group(backend=hp.dist_config.dist_backend, init_method=hp.dist_config.dist_url, + world_size=hp.dist_config.world_size * args.num_gpus, rank=rank) + + torch.cuda.manual_seed(hp.train.seed) + device = torch.device('cuda:{:d}'.format(rank)) + + model_g = SynthesizerTrn( + hp.data.filter_length // 2 + 1, + hp.data.segment_size // hp.data.hop_length, + hp).to(device) + model_d = Discriminator(hp).to(device) + + optim_g = torch.optim.AdamW(model_g.parameters(), + lr=hp.train.learning_rate, betas=hp.train.betas, eps=hp.train.eps) + optim_d = torch.optim.AdamW(model_d.parameters(), + lr=(hp.train.learning_rate / hp.train.accum_step), betas=hp.train.betas, eps=hp.train.eps) + + init_epoch = 1 + step = 0 + + stft = TacotronSTFT(filter_length=hp.data.filter_length, + hop_length=hp.data.hop_length, + win_length=hp.data.win_length, + n_mel_channels=hp.data.mel_channels, + sampling_rate=hp.data.sampling_rate, + mel_fmin=hp.data.mel_fmin, + mel_fmax=hp.data.mel_fmax, + center=False, + device=device) + # define logger, writer, valloader, stft at rank_zero + if rank == 0: + pth_dir = os.path.join(hp.log.pth_dir, args.name) + log_dir = os.path.join(hp.log.log_dir, args.name) + os.makedirs(pth_dir, exist_ok=True) + os.makedirs(log_dir, exist_ok=True) + + logging.basicConfig( + level=logging.INFO, + 
format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(os.path.join(log_dir, '%s-%d.log' % (args.name, time.time()))), + logging.StreamHandler() + ] + ) + logger = logging.getLogger() + writer = MyWriter(hp, log_dir) + valloader = create_dataloader_eval(hp) + + if os.path.isfile(hp.train.pretrain): + if rank == 0: + logger.info("Start from 32k pretrain model: %s" % hp.train.pretrain) + checkpoint = torch.load(hp.train.pretrain, map_location='cpu') + load_model(model_g, checkpoint['model_g']) + load_model(model_d, checkpoint['model_d']) + + if chkpt_path is not None: + if rank == 0: + logger.info("Resuming from checkpoint: %s" % chkpt_path) + checkpoint = torch.load(chkpt_path, map_location='cpu') + load_model(model_g, checkpoint['model_g']) + load_model(model_d, checkpoint['model_d']) + optim_g.load_state_dict(checkpoint['optim_g']) + optim_d.load_state_dict(checkpoint['optim_d']) + init_epoch = checkpoint['epoch'] + step = checkpoint['step'] + + if rank == 0: + if hp_str != checkpoint['hp_str']: + logger.warning("New hparams is different from checkpoint. Will use new.") + else: + if rank == 0: + logger.info("Starting new training run.") + + if args.num_gpus > 1: + model_g = DistributedDataParallel(model_g, device_ids=[rank]) + model_d = DistributedDataParallel(model_d, device_ids=[rank]) + + # this accelerates training when the size of minibatch is always consistent. + # if not consistent, it'll horribly slow down. + torch.backends.cudnn.benchmark = True + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hp.train.lr_decay, last_epoch=init_epoch-2) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hp.train.lr_decay, last_epoch=init_epoch-2) + + stft_criterion = MultiResolutionSTFTLoss(device, eval(hp.mrd.resolutions)) + spkc_criterion = nn.CosineEmbeddingLoss() + + trainloader = create_dataloader_train(hp, args.num_gpus, rank) + + for epoch in range(init_epoch, hp.train.epochs): + + trainloader.batch_sampler.set_epoch(epoch) + + if rank == 0 and epoch % hp.log.eval_interval == 0: + with torch.no_grad(): + validate(hp, args, model_g, model_d, valloader, stft, writer, step, device) + + if rank == 0: + loader = tqdm.tqdm(trainloader, desc='Loading train data') + else: + loader = trainloader + + model_g.train() + model_d.train() + + for ppg, ppg_l, vec, pit, spk, spec, spec_l, audio, audio_l in loader: + + ppg = ppg.to(device) + vec = vec.to(device) + pit = pit.to(device) + spk = spk.to(device) + spec = spec.to(device) + audio = audio.to(device) + ppg_l = ppg_l.to(device) + spec_l = spec_l.to(device) + audio_l = audio_l.to(device) + + # generator + fake_audio, ids_slice, z_mask, \ + (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds = model_g( + ppg, vec, pit, spec, spk, ppg_l, spec_l) + + audio = commons.slice_segments( + audio, ids_slice * hp.data.hop_length, hp.data.segment_size) # slice + # Spk Loss + spk_loss = spkc_criterion(spk, spk_preds, torch.Tensor(spk_preds.size(0)) + .to(device).fill_(1.0)) + # Mel Loss + mel_fake = stft.mel_spectrogram(fake_audio.squeeze(1)) + mel_real = stft.mel_spectrogram(audio.squeeze(1)) + mel_loss = F.l1_loss(mel_fake, mel_real) * hp.train.c_mel + + # Multi-Resolution STFT Loss + sc_loss, mag_loss = stft_criterion(fake_audio.squeeze(1), audio.squeeze(1)) + stft_loss = (sc_loss + mag_loss) * hp.train.c_stft + + # Generator Loss + disc_fake = model_d(fake_audio) + score_loss = 0.0 + for (_, score_fake) in disc_fake: + score_loss += 
torch.mean(torch.pow(score_fake - 1.0, 2)) + score_loss = score_loss / len(disc_fake) + + # Feature Loss + disc_real = model_d(audio) + feat_loss = 0.0 + for (feat_fake, _), (feat_real, _) in zip(disc_fake, disc_real): + for fake, real in zip(feat_fake, feat_real): + feat_loss += torch.mean(torch.abs(fake - real)) + feat_loss = feat_loss / len(disc_fake) + feat_loss = feat_loss * 2 + + # Kl Loss + loss_kl_f = kl_loss(z_f, logs_q, m_p, logs_p, logdet_f, z_mask) * hp.train.c_kl + loss_kl_r = kl_loss(z_r, logs_p, m_q, logs_q, logdet_r, z_mask) * hp.train.c_kl + + # Loss + loss_g = score_loss + feat_loss + mel_loss + stft_loss + loss_kl_f + loss_kl_r * 0.5 + spk_loss * 2 + loss_g.backward() + + if ((step + 1) % hp.train.accum_step == 0) or (step + 1 == len(loader)): + # accumulate gradients for accum steps + for param in model_g.parameters(): + param.grad /= hp.train.accum_step + clip_grad_value_(model_g.parameters(), None) + # update model + optim_g.step() + optim_g.zero_grad() + + # discriminator + optim_d.zero_grad() + disc_fake = model_d(fake_audio.detach()) + disc_real = model_d(audio) + + loss_d = 0.0 + for (_, score_fake), (_, score_real) in zip(disc_fake, disc_real): + loss_d += torch.mean(torch.pow(score_real - 1.0, 2)) + loss_d += torch.mean(torch.pow(score_fake, 2)) + loss_d = loss_d / len(disc_fake) + + loss_d.backward() + clip_grad_value_(model_d.parameters(), None) + optim_d.step() + + step += 1 + # logging + loss_g = loss_g.item() + loss_d = loss_d.item() + loss_s = stft_loss.item() + loss_m = mel_loss.item() + loss_k = loss_kl_f.item() + loss_r = loss_kl_r.item() + loss_i = spk_loss.item() + + if rank == 0 and step % hp.log.info_interval == 0: + writer.log_training( + loss_g, loss_d, loss_m, loss_s, loss_k, loss_r, score_loss.item(), step) + logger.info("epoch %d | g %.04f m %.04f s %.04f d %.04f k %.04f r %.04f i %.04f | step %d" % ( + epoch, loss_g, loss_m, loss_s, loss_d, loss_k, loss_r, loss_i, step)) + + if rank == 0 and epoch % hp.log.save_interval == 0: + save_path = os.path.join(pth_dir, '%s_%04d.pt' + % (args.name, epoch)) + torch.save({ + 'model_g': (model_g.module if args.num_gpus > 1 else model_g).state_dict(), + 'model_d': (model_d.module if args.num_gpus > 1 else model_d).state_dict(), + 'optim_g': optim_g.state_dict(), + 'optim_d': optim_d.state_dict(), + 'step': step, + 'epoch': epoch, + 'hp_str': hp_str, + }, save_path) + logger.info("Saved checkpoint to: %s" % save_path) + + if rank == 0: + def clean_checkpoints(path_to_models=f'{pth_dir}', n_ckpts_to_keep=hp.log.keep_ckpts, sort_by_time=True): + """Freeing up space by deleting saved ckpts + Arguments: + path_to_models -- Path to the model directory + n_ckpts_to_keep -- Number of ckpts to keep, excluding sovits5.0_0.pth + If n_ckpts_to_keep == 0, do not delete any ckpts + sort_by_time -- True -> chronologically delete ckpts + False -> lexicographically delete ckpts + """ + assert isinstance(n_ckpts_to_keep, int) and n_ckpts_to_keep >= 0 + ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))] + name_key = (lambda _f: int(re.compile(f'{args.name}_(\d+)\.pt').match(_f).group(1))) + time_key = (lambda _f: os.path.getmtime(os.path.join(path_to_models, _f))) + sort_key = time_key if sort_by_time else name_key + x_sorted = lambda _x: sorted( + [f for f in ckpts_files if f.startswith(_x) and not f.endswith('sovits5.0_0.pth')], key=sort_key) + if n_ckpts_to_keep == 0: + to_del = [] + else: + to_del = [os.path.join(path_to_models, fn) for fn in 
x_sorted(f'{args.name}')[:-n_ckpts_to_keep]] + del_info = lambda fn: logger.info(f"Free up space by deleting ckpt {fn}") + del_routine = lambda x: [os.remove(x), del_info(x)] + rs = [del_routine(fn) for fn in to_del] + + clean_checkpoints() + + os.makedirs(f'{pth_dir}', exist_ok=True) + keep_ckpts = getattr(hp.log, 'keep_ckpts', 0) + if keep_ckpts > 0: + clean_checkpoints(path_to_models=f'{pth_dir}', n_ckpts_to_keep=hp.log.keep_ckpts, sort_by_time=True) + + scheduler_g.step() + scheduler_d.step() diff --git a/vits_extend/validation.py b/vits_extend/validation.py new file mode 100644 index 0000000000000000000000000000000000000000..acf93a1bb428b25386e8365bac19d7cfe22759d7 --- /dev/null +++ b/vits_extend/validation.py @@ -0,0 +1,48 @@ +import tqdm +import torch +import torch.nn.functional as F + + +def validate(hp, args, generator, discriminator, valloader, stft, writer, step, device): + generator.eval() + discriminator.eval() + torch.backends.cudnn.benchmark = False + + loader = tqdm.tqdm(valloader, desc='Validation loop') + mel_loss = 0.0 + for idx, (ppg, ppg_l, vec, pit, spk, spec, spec_l, audio, audio_l) in enumerate(loader): + ppg = ppg.to(device) + vec = vec.to(device) + pit = pit.to(device) + spk = spk.to(device) + ppg_l = ppg_l.to(device) + audio = audio.to(device) + + if hasattr(generator, 'module'): + fake_audio = generator.module.infer(ppg, vec, pit, spk, ppg_l)[ + :, :, :audio.size(2)] + else: + fake_audio = generator.infer(ppg, vec, pit, spk, ppg_l)[ + :, :, :audio.size(2)] + + mel_fake = stft.mel_spectrogram(fake_audio.squeeze(1)) + mel_real = stft.mel_spectrogram(audio.squeeze(1)) + + mel_loss += F.l1_loss(mel_fake, mel_real).item() + + if idx < hp.log.num_audio: + spec_fake = stft.linear_spectrogram(fake_audio.squeeze(1)) + spec_real = stft.linear_spectrogram(audio.squeeze(1)) + + audio = audio[0][0].cpu().detach().numpy() + fake_audio = fake_audio[0][0].cpu().detach().numpy() + spec_fake = spec_fake[0].cpu().detach().numpy() + spec_real = spec_real[0].cpu().detach().numpy() + writer.log_fig_audio( + audio, fake_audio, spec_fake, spec_real, idx, step) + + mel_loss = mel_loss / len(valloader.dataset) + + writer.log_validation(mel_loss, generator, discriminator, step) + + torch.backends.cudnn.benchmark = True diff --git a/vits_extend/writer.py b/vits_extend/writer.py new file mode 100644 index 0000000000000000000000000000000000000000..386682bfc4467ee027efdca6d2bdbbe50d574895 --- /dev/null +++ b/vits_extend/writer.py @@ -0,0 +1,39 @@ +from torch.utils.tensorboard import SummaryWriter +import numpy as np +import librosa + +from .plotting import plot_waveform_to_numpy, plot_spectrogram_to_numpy + +class MyWriter(SummaryWriter): + def __init__(self, hp, logdir): + super(MyWriter, self).__init__(logdir) + self.sample_rate = hp.data.sampling_rate + + def log_training(self, g_loss, d_loss, mel_loss, stft_loss, k_loss, r_loss, score_loss, step): + self.add_scalar('train/g_loss', g_loss, step) + self.add_scalar('train/d_loss', d_loss, step) + + self.add_scalar('train/score_loss', score_loss, step) + self.add_scalar('train/stft_loss', stft_loss, step) + self.add_scalar('train/mel_loss', mel_loss, step) + self.add_scalar('train/kl_f_loss', k_loss, step) + self.add_scalar('train/kl_r_loss', r_loss, step) + + def log_validation(self, mel_loss, generator, discriminator, step): + self.add_scalar('validation/mel_loss', mel_loss, step) + + def log_fig_audio(self, real, fake, spec_fake, spec_real, idx, step): + if idx == 0: + spec_fake = librosa.amplitude_to_db(spec_fake, ref=np.max,top_db=80.) 
+ spec_real = librosa.amplitude_to_db(spec_real, ref=np.max,top_db=80.) + self.add_image(f'spec_fake/{step}', plot_spectrogram_to_numpy(spec_fake), step) + self.add_image(f'wave_fake/{step}', plot_waveform_to_numpy(fake), step) + self.add_image(f'spec_real/{step}', plot_spectrogram_to_numpy(spec_real), step) + self.add_image(f'wave_real/{step}', plot_waveform_to_numpy(real), step) + + self.add_audio(f'fake/{step}', fake, step, self.sample_rate) + self.add_audio(f'real/{step}', real, step, self.sample_rate) + + def log_histogram(self, model, step): + for tag, value in model.named_parameters(): + self.add_histogram(tag.replace('.', '/'), value.cpu().detach().numpy(), step) diff --git a/vits_pretrain/.DS_Store b/vits_pretrain/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/vits_pretrain/.DS_Store differ diff --git a/vits_pretrain/README.md b/vits_pretrain/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2be30a36a0e1e4718afca2cd53a9c69e5edc7df1 --- /dev/null +++ b/vits_pretrain/README.md @@ -0,0 +1,3 @@ +Path for: + + sovits5.0_bigvgan_mix_v2.pth diff --git a/whisper/LICENSE b/whisper/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d25552598bb9c5400612159ed4bab92ce12a5ce5 --- /dev/null +++ b/whisper/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 OpenAI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/whisper/README.md b/whisper/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9ea3a38e58aa56be82a79e31461849083917babb --- /dev/null +++ b/whisper/README.md @@ -0,0 +1,147 @@ +# Whisper + +[[Blog]](https://openai.com/blog/whisper) +[[Paper]](https://arxiv.org/abs/2212.04356) +[[Model card]](https://github.com/openai/whisper/blob/main/model-card.md) +[[Colab example]](https://colab.research.google.com/github/openai/whisper/blob/master/notebooks/LibriSpeech.ipynb) + +Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. + + +## Approach + +![Approach](https://raw.githubusercontent.com/openai/whisper/main/approach.png) + +A Transformer sequence-to-sequence model is trained on various speech processing tasks, including multilingual speech recognition, speech translation, spoken language identification, and voice activity detection. 
These tasks are jointly represented as a sequence of tokens to be predicted by the decoder, allowing a single model to replace many stages of a traditional speech-processing pipeline. The multitask training format uses a set of special tokens that serve as task specifiers or classification targets. + + +## Setup + +We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.10 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. You can download and install (or update to) the latest release of Whisper with the following command: + + pip install -U openai-whisper + +Alternatively, the following command will pull and install the latest commit from this repository, along with its Python dependencies: + + pip install git+https://github.com/openai/whisper.git + +To update the package to the latest version of this repository, please run: + + pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git + +It also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers: + +```bash +# on Ubuntu or Debian +sudo apt update && sudo apt install ffmpeg + +# on Arch Linux +sudo pacman -S ffmpeg + +# on MacOS using Homebrew (https://brew.sh/) +brew install ffmpeg + +# on Windows using Chocolatey (https://chocolatey.org/) +choco install ffmpeg + +# on Windows using Scoop (https://scoop.sh/) +scoop install ffmpeg +``` + +You may need [`rust`](http://rust-lang.org) installed as well, in case [tokenizers](https://pypi.org/project/tokenizers/) does not provide a pre-built wheel for your platform. If you see installation errors during the `pip install` command above, please follow the [Getting started page](https://www.rust-lang.org/learn/get-started) to install Rust development environment. Additionally, you may need to configure the `PATH` environment variable, e.g. `export PATH="$HOME/.cargo/bin:$PATH"`. If the installation fails with `No module named 'setuptools_rust'`, you need to install `setuptools_rust`, e.g. by running: + +```bash +pip install setuptools-rust +``` + + +## Available models and languages + +There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and relative speed. + + +| Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed | +|:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:| +| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x | +| base | 74 M | `base.en` | `base` | ~1 GB | ~16x | +| small | 244 M | `small.en` | `small` | ~2 GB | ~6x | +| medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x | +| large | 1550 M | N/A | `large` | ~10 GB | 1x | + +The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models. + +Whisper's performance varies widely depending on the language. 
The figure below shows a WER (Word Error Rate) breakdown by languages of the Fleurs dataset using the `large-v2` model. More WER and BLEU scores corresponding to the other models and datasets can be found in Appendix D in [the paper](https://arxiv.org/abs/2212.04356). The smaller, the better. + +![WER breakdown by language](https://raw.githubusercontent.com/openai/whisper/main/language-breakdown.svg) + + + +## Command-line usage + +The following command will transcribe speech in audio files, using the `medium` model: + + whisper audio.flac audio.mp3 audio.wav --model medium + +The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option: + + whisper japanese.wav --language Japanese + +Adding `--task translate` will translate the speech into English: + + whisper japanese.wav --language Japanese --task translate + +Run the following to view all available options: + + whisper --help + +See [tokenizer.py](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py) for the list of all available languages. + + +## Python usage + +Transcription can also be performed within Python: + +```python +import whisper + +model = whisper.load_model("base") +result = model.transcribe("audio.mp3") +print(result["text"]) +``` + +Internally, the `transcribe()` method reads the entire file and processes the audio with a sliding 30-second window, performing autoregressive sequence-to-sequence predictions on each window. + +Below is an example usage of `whisper.detect_language()` and `whisper.decode()` which provide lower-level access to the model. + +```python +import whisper + +model = whisper.load_model("base") + +# load audio and pad/trim it to fit 30 seconds +audio = whisper.load_audio("audio.mp3") +audio = whisper.pad_or_trim(audio) + +# make log-Mel spectrogram and move to the same device as the model +mel = whisper.log_mel_spectrogram(audio).to(model.device) + +# detect the spoken language +_, probs = model.detect_language(mel) +print(f"Detected language: {max(probs, key=probs.get)}") + +# decode the audio +options = whisper.DecodingOptions() +result = whisper.decode(model, mel, options) + +# print the recognized text +print(result.text) +``` + +## More examples + +Please use the [🙌 Show and tell](https://github.com/openai/whisper/discussions/categories/show-and-tell) category in Discussions for sharing more example usages of Whisper and third-party extensions such as web demos, integrations with other tools, ports for different platforms, etc. + + +## License + +Whisper's code and model weights are released under the MIT License. See [LICENSE](https://github.com/openai/whisper/blob/main/LICENSE) for further details. 
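+
+## Use in this repository
+
+This fork keeps only the Whisper encoder and uses it as a content (PPG) extractor for singing voice conversion; `whisper/inference.py` deletes the decoder (and trims part of the encoder) when loading the checkpoint. The snippet below is a simplified, illustrative sketch of that script, assuming `whisper_pretrain/large-v2.pt` has been downloaded as described in the project setup; the fp16 cast, encoder-block trimming, and 15-second chunking loop are omitted here, and the wav path is only an example.
+
+```python
+import torch
+
+from whisper.audio import load_audio, log_mel_spectrogram
+from whisper.model import ModelDimensions, Whisper
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# rebuild the model from the checkpoint dims and keep only the encoder
+checkpoint = torch.load("whisper_pretrain/large-v2.pt", map_location="cpu")
+model = Whisper(ModelDimensions(**checkpoint["dims"]))
+del model.decoder
+model.load_state_dict(checkpoint["model_state_dict"], strict=False)
+model = model.eval().to(device)
+
+# one 15-second chunk at 16 kHz -> log-Mel -> encoder output used as PPG features
+audio = load_audio("some_singing.wav")          # illustrative path
+mel = log_mel_spectrogram(audio[: 15 * 16000]).to(device)
+with torch.no_grad():
+    ppg = model.encoder(mel.unsqueeze(0)).squeeze(0)   # (n_frames, n_audio_state)
+```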
\ No newline at end of file diff --git a/whisper/__init__.py b/whisper/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/whisper/audio.py b/whisper/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..2dfe105adda10dfe78179edb5e39cc6d3bde39f9 --- /dev/null +++ b/whisper/audio.py @@ -0,0 +1,100 @@ +import os +from functools import lru_cache +from typing import Union + +import librosa +import numpy as np +import torch +import torch.nn.functional as F + +from .utils import exact_div + +from librosa.filters import mel as librosa_mel_fn + +# hard-coded audio hyperparameters +SAMPLE_RATE = 16000 +N_FFT = 400 +N_MELS = 80 +HOP_LENGTH = 160 +CHUNK_LENGTH = 30 +N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000: number of samples in a chunk +N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000: number of frames in a mel spectrogram input + + +def load_audio(file: str, sr: int = SAMPLE_RATE): + x, sr = librosa.load(file, sr=sr) + return x + + +def pad_or_trim(array, length_max: int = N_SAMPLES, length_min: int = N_SAMPLES // 2, *, axis: int = -1): + """ + Pad or trim the audio array to N_SAMPLES, as expected by the encoder. + """ + if torch.is_tensor(array): + if array.shape[axis] > length_max: + array = array.index_select(dim=axis, index=torch.arange(length_max, device=array.device)) + + if array.shape[axis] < length_min: + pad_widths = [(0, 0)] * array.ndim + pad_widths[axis] = (0, length_min - array.shape[axis]) + array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes]) + else: + if array.shape[axis] > length_max: + array = array.take(indices=range(length_max), axis=axis) + + if array.shape[axis] < length_min: + pad_widths = [(0, 0)] * array.ndim + pad_widths[axis] = (0, length_min - array.shape[axis]) + array = np.pad(array, pad_widths) + + return array + + +@lru_cache(maxsize=None) +def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor: + """ + load the mel filterbank matrix for projecting STFT into a Mel spectrogram. 
+ Allows decoupling librosa dependency; saved using: + + np.savez_compressed( + "mel_filters.npz", + mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), + ) + """ + assert n_mels == 80, f"Unsupported n_mels: {n_mels}" + return torch.from_numpy(librosa_mel_fn(sr=SAMPLE_RATE,n_fft=N_FFT,n_mels=n_mels)).to(device) + + +def log_mel_spectrogram(audio: Union[str, np.ndarray, torch.Tensor], n_mels: int = N_MELS): + """ + Compute the log-Mel spectrogram of + + Parameters + ---------- + audio: Union[str, np.ndarray, torch.Tensor], shape = (*) + The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz + + n_mels: int + The number of Mel-frequency filters, only 80 is supported + + Returns + ------- + torch.Tensor, shape = (80, n_frames) + A Tensor that contains the Mel spectrogram + """ + if not torch.is_tensor(audio): + if isinstance(audio, str): + audio = load_audio(audio) + audio = torch.from_numpy(audio) + + window = torch.hann_window(N_FFT).to(audio.device) + stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True) + magnitudes = stft[..., :-1].abs() ** 2 + + filters = mel_filters(audio.device, n_mels) + mel_spec = filters @ magnitudes + + log_spec = torch.clamp(mel_spec, min=1e-10).log10() + log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + return log_spec diff --git a/whisper/decoding.py b/whisper/decoding.py new file mode 100644 index 0000000000000000000000000000000000000000..603546d4c9ff67514d2567576935b974fe373bef --- /dev/null +++ b/whisper/decoding.py @@ -0,0 +1,712 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE_CHECKING + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor +from torch.distributions import Categorical + +from .audio import CHUNK_LENGTH +from .tokenizer import Tokenizer, get_tokenizer +from .utils import compression_ratio + +if TYPE_CHECKING: + from .model import Whisper + + +@torch.no_grad() +def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None) -> Tuple[Tensor, List[dict]]: + """ + Detect the spoken language in the audio, and return them as list of strings, along with the ids + of the most probable language tokens and the probability distribution over all language tokens. + This is performed outside the main decode loop in order to not interfere with kv-caching. + + Returns + ------- + language_tokens : Tensor, shape = (n_audio,) + ids of the most probable language tokens, which appears after the startoftranscript token. + language_probs : List[Dict[str, float]], length = n_audio + list of dictionaries containing the probability distribution over all languages. 
+ """ + if tokenizer is None: + tokenizer = get_tokenizer(model.is_multilingual) + if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence: + raise ValueError(f"This model doesn't have language tokens so it can't perform lang id") + + single = mel.ndim == 2 + if single: + mel = mel.unsqueeze(0) + + # skip encoder forward pass if already-encoded audio features were given + if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state): + mel = model.encoder(mel) + + # forward pass using a single token, startoftranscript + n_audio = mel.shape[0] + x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device) # [n_audio, 1] + logits = model.logits(x, mel)[:, 0] + + # collect detected languages; suppress all non-language tokens + mask = torch.ones(logits.shape[-1], dtype=torch.bool) + mask[list(tokenizer.all_language_tokens)] = False + logits[:, mask] = -np.inf + language_tokens = logits.argmax(dim=-1) + language_token_probs = logits.softmax(dim=-1).cpu() + language_probs = [ + { + c: language_token_probs[i, j].item() + for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes) + } + for i in range(n_audio) + ] + + if single: + language_tokens = language_tokens[0] + language_probs = language_probs[0] + + return language_tokens, language_probs + + +@dataclass(frozen=True) +class DecodingOptions: + task: str = "transcribe" # whether to perform X->X "transcribe" or X->English "translate" + language: Optional[str] = None # language that the audio is in; uses detected language if None + + # sampling-related options + temperature: float = 0.0 + sample_len: Optional[int] = None # maximum number of tokens to sample + best_of: Optional[int] = None # number of independent samples to collect, when t > 0 + beam_size: Optional[int] = None # number of beams in beam search, when t == 0 + patience: Optional[float] = None # patience in beam search (https://arxiv.org/abs/2204.05424) + + # options for ranking generations (either beams or best-of-N samples) + length_penalty: Optional[float] = None # "alpha" in Google NMT, None defaults to length norm + + # prompt, prefix, and token suppression + prompt: Optional[Union[str, List[int]]] = None # text or tokens for the previous context + prefix: Optional[Union[str, List[int]]] = None # text or tokens to prefix the current context + suppress_blank: bool = True # this will suppress blank outputs + + # list of tokens ids (or comma-separated token ids) to suppress + # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()` + suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1" + + # timestamp sampling options + without_timestamps: bool = False # use <|notimestamps|> to sample text tokens only + max_initial_timestamp: Optional[float] = 1.0 # the initial timestamp cannot be later than this + + # implementation details + fp16: bool = True # use fp16 for most of the calculation + + +@dataclass(frozen=True) +class DecodingResult: + audio_features: Tensor + language: str + language_probs: Optional[Dict[str, float]] = None + tokens: List[int] = field(default_factory=list) + text: str = "" + avg_logprob: float = np.nan + no_speech_prob: float = np.nan + temperature: float = np.nan + compression_ratio: float = np.nan + + +class Inference: + def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor: + """Perform a forward pass on the decoder and return per-token logits""" + raise NotImplementedError + + def rearrange_kv_cache(self, source_indices) -> None: + """Update the key-value 
cache according to the updated beams""" + raise NotImplementedError + + def cleanup_caching(self) -> None: + """Clean up any resources or hooks after decoding is finished""" + pass + + +class PyTorchInference(Inference): + def __init__(self, model: "Whisper", initial_token_length: int): + self.model: "Whisper" = model + self.initial_token_length = initial_token_length + self.kv_cache = {} + self.hooks = [] + + def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor: + if not self.kv_cache: + self.kv_cache, self.hooks = self.model.install_kv_cache_hooks() + + if tokens.shape[-1] > self.initial_token_length: + # only need to use the last token except in the first forward pass + tokens = tokens[:, -1:] + + return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache) + + def cleanup_caching(self): + for hook in self.hooks: + hook.remove() + + self.kv_cache = {} + self.hooks = [] + + def rearrange_kv_cache(self, source_indices): + for module, tensor in self.kv_cache.items(): + # update the key/value cache to contain the selected sequences + self.kv_cache[module] = tensor[source_indices].detach() + + +class SequenceRanker: + def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]) -> List[int]: + """ + Given a list of groups of samples and their cumulative log probabilities, + return the indices of the samples in each group to select as the final result + """ + raise NotImplementedError + + +class MaximumLikelihoodRanker(SequenceRanker): + """ + Select the sample with the highest log probabilities, penalized using either + a simple length normalization or Google NMT paper's length penalty + """ + + def __init__(self, length_penalty: Optional[float]): + self.length_penalty = length_penalty + + def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]): + def scores(logprobs, lengths): + result = [] + for logprob, length in zip(logprobs, lengths): + if self.length_penalty is None: + penalty = length + else: + # from the Google NMT paper + penalty = ((5 + length) / 6) ** self.length_penalty + result.append(logprob / penalty) + return result + + # get the sequence with the highest score + lengths = [[len(t) for t in s] for s in tokens] + return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)] + + +class TokenDecoder: + def reset(self): + """Initialize any stateful variables for decoding a new sequence""" + + def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]: + """Specify how to select the next token, based on the current trace and logits + + Parameters + ---------- + tokens : Tensor, shape = (n_batch, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence tokens + + logits : Tensor, shape = (n_batch, vocab_size) + per-token logits of the probability distribution at the current step + + sum_logprobs : Tensor, shape = (n_batch) + cumulative log probabilities for each sequence + + Returns + ------- + tokens : Tensor, shape = (n_batch, current_sequence_length + 1) + the tokens, appended with the selected next token + + completed : bool + True if all sequences has reached the end of text + + """ + raise NotImplementedError + + def finalize( + self, tokens: Tensor, sum_logprobs: Tensor + ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]: + """Finalize search and return the final candidate sequences + + Parameters + ---------- + tokens : Tensor, shape = (n_audio, n_group, current_sequence_length) + all tokens in the context so far, 
including the prefix and sot_sequence + + sum_logprobs : Tensor, shape = (n_audio, n_group) + cumulative log probabilities for each sequence + + Returns + ------- + tokens : Sequence[Sequence[Tensor]], length = n_audio + sequence of Tensors containing candidate token sequences, for each audio input + + sum_logprobs : List[List[float]], length = n_audio + sequence of cumulative log probabilities corresponding to the above + + """ + raise NotImplementedError + + +class GreedyDecoder(TokenDecoder): + def __init__(self, temperature: float, eot: int): + self.temperature = temperature + self.eot = eot + + def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]: + temperature = self.temperature + if temperature == 0: + next_tokens = logits.argmax(dim=-1) + else: + next_tokens = Categorical(logits=logits / temperature).sample() + + logprobs = F.log_softmax(logits.float(), dim=-1) + current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens] + sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot) + + next_tokens[tokens[:, -1] == self.eot] = self.eot + tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1) + + completed = (tokens[:, -1] == self.eot).all() + return tokens, completed + + def finalize(self, tokens: Tensor, sum_logprobs: Tensor): + # make sure each sequence has at least one EOT token at the end + tokens = F.pad(tokens, (0, 1), value=self.eot) + return tokens, sum_logprobs.tolist() + + +class BeamSearchDecoder(TokenDecoder): + def __init__(self, beam_size: int, eot: int, inference: Inference, patience: Optional[float] = None): + self.beam_size = beam_size + self.eot = eot + self.inference = inference + self.patience = patience or 1.0 + self.max_candidates: int = round(beam_size * self.patience) + self.finished_sequences = None + + assert self.max_candidates > 0, f"Invalid beam size ({beam_size}) or patience ({patience})" + + def reset(self): + self.finished_sequences = None + + def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]: + if tokens.shape[0] % self.beam_size != 0: + raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0") + + n_audio = tokens.shape[0] // self.beam_size + if self.finished_sequences is None: # for the first update + self.finished_sequences = [{} for _ in range(n_audio)] + + logprobs = F.log_softmax(logits.float(), dim=-1) + next_tokens, source_indices, finished_sequences = [], [], [] + for i in range(n_audio): + scores, sources, finished = {}, {}, {} + + # STEP 1: calculate the cumulative log probabilities for possible candidates + for j in range(self.beam_size): + idx = i * self.beam_size + j + prefix = tokens[idx].tolist() + for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)): + new_logprob = (sum_logprobs[idx] + logprob).item() + sequence = tuple(prefix + [token.item()]) + scores[sequence] = new_logprob + sources[sequence] = idx + + # STEP 2: rank the candidates and keep the top beam_size sequences for each audio + saved = 0 + for sequence in sorted(scores, key=scores.get, reverse=True): + if sequence[-1] == self.eot: + finished[sequence] = scores[sequence] + else: + sum_logprobs[len(next_tokens)] = scores[sequence] + next_tokens.append(sequence) + source_indices.append(sources[sequence]) + + saved += 1 + if saved == self.beam_size: + break + + finished_sequences.append(finished) + + tokens = torch.tensor(next_tokens, device=tokens.device) + self.inference.rearrange_kv_cache(source_indices) + + # add newly finished sequences to 
self.finished_sequences + assert len(self.finished_sequences) == len(finished_sequences) + for previously_finished, newly_finished in zip(self.finished_sequences, finished_sequences): + for seq in sorted(newly_finished, key=newly_finished.get, reverse=True): + if len(previously_finished) >= self.max_candidates: + break # the candidate list is full + previously_finished[seq] = newly_finished[seq] + + # mark as completed if all audio has enough number of samples + completed = all( + len(sequences) >= self.max_candidates for sequences in self.finished_sequences + ) + return tokens, completed + + def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor): + # collect all finished sequences, including patience, and add unfinished ones if not enough + sum_logprobs = sum_logprobs.cpu() + for i, sequences in enumerate(self.finished_sequences): + if len(sequences) < self.beam_size: # when not enough sequences are finished + for j in list(np.argsort(sum_logprobs[i]))[::-1]: + sequence = preceding_tokens[i, j].tolist() + [self.eot] + sequences[tuple(sequence)] = sum_logprobs[i][j].item() + if len(sequences) >= self.beam_size: + break + + tokens: List[List[Tensor]] = [ + [torch.tensor(seq) for seq in sequences.keys()] for sequences in self.finished_sequences + ] + sum_logprobs: List[List[float]] = [ + list(sequences.values()) for sequences in self.finished_sequences + ] + return tokens, sum_logprobs + + +class LogitFilter: + def apply(self, logits: Tensor, tokens: Tensor) -> None: + """Apply any filtering or masking to logits in-place + + Parameters + ---------- + logits : Tensor, shape = (n_batch, vocab_size) + per-token logits of the probability distribution at the current step + + tokens : Tensor, shape = (n_batch, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence tokens + + """ + raise NotImplementedError + + +class SuppressBlank(LogitFilter): + def __init__(self, tokenizer: Tokenizer, sample_begin: int): + self.tokenizer = tokenizer + self.sample_begin = sample_begin + + def apply(self, logits: Tensor, tokens: Tensor): + if tokens.shape[1] == self.sample_begin: + logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf + + +class SuppressTokens(LogitFilter): + def __init__(self, suppress_tokens: Sequence[int]): + self.suppress_tokens = list(suppress_tokens) + + def apply(self, logits: Tensor, tokens: Tensor): + logits[:, self.suppress_tokens] = -np.inf + + +class ApplyTimestampRules(LogitFilter): + def __init__( + self, tokenizer: Tokenizer, sample_begin: int, max_initial_timestamp_index: Optional[int] + ): + self.tokenizer = tokenizer + self.sample_begin = sample_begin + self.max_initial_timestamp_index = max_initial_timestamp_index + + def apply(self, logits: Tensor, tokens: Tensor): + # suppress <|notimestamps|> which is handled by without_timestamps + if self.tokenizer.no_timestamps is not None: + logits[:, self.tokenizer.no_timestamps] = -np.inf + + # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly + for k in range(tokens.shape[0]): + seq = [t for t in tokens[k, self.sample_begin :].tolist()] + last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin + penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin + + if last_was_timestamp: + if penultimate_was_timestamp: # has to be non-timestamp + logits[k, self.tokenizer.timestamp_begin :] = -np.inf + else: # cannot be normal text tokens + logits[k, : self.tokenizer.eot] = -np.inf + 
+ if tokens.shape[1] == self.sample_begin: + # suppress generating non-timestamp tokens at the beginning + logits[:, : self.tokenizer.timestamp_begin] = -np.inf + + # apply the `max_initial_timestamp` option + if self.max_initial_timestamp_index is not None: + last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index + logits[:, last_allowed + 1 :] = -np.inf + + # if sum of probability over timestamps is above any other token, sample timestamp + logprobs = F.log_softmax(logits.float(), dim=-1) + for k in range(tokens.shape[0]): + timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(dim=-1) + max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max() + if timestamp_logprob > max_text_token_logprob: + logits[k, : self.tokenizer.timestamp_begin] = -np.inf + + +class DecodingTask: + inference: Inference + sequence_ranker: SequenceRanker + decoder: TokenDecoder + logit_filters: List[LogitFilter] + + def __init__(self, model: "Whisper", options: DecodingOptions): + self.model = model + + language = options.language or "en" + tokenizer = get_tokenizer(model.is_multilingual, language=language, task=options.task) + self.tokenizer: Tokenizer = tokenizer + self.options: DecodingOptions = self._verify_options(options) + + self.n_group: int = options.beam_size or options.best_of or 1 + self.n_ctx: int = model.dims.n_text_ctx + self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2 + + self.sot_sequence: Tuple[int] = tokenizer.sot_sequence + if self.options.without_timestamps: + self.sot_sequence = tokenizer.sot_sequence_including_notimestamps + + self.initial_tokens: Tuple[int] = self._get_initial_tokens() + self.sample_begin: int = len(self.initial_tokens) + self.sot_index: int = self.initial_tokens.index(tokenizer.sot) + + # inference: implements the forward pass through the decoder, including kv caching + self.inference = PyTorchInference(model, len(self.initial_tokens)) + + # sequence ranker: implements how to rank a group of sampled sequences + self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty) + + # decoder: implements how to select the next tokens, given the autoregressive distribution + if options.beam_size is not None: + self.decoder = BeamSearchDecoder( + options.beam_size, tokenizer.eot, self.inference, options.patience + ) + else: + self.decoder = GreedyDecoder(options.temperature, tokenizer.eot) + + # logit filters: applies various rules to suppress or penalize certain tokens + self.logit_filters = [] + if self.options.suppress_blank: + self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin)) + if self.options.suppress_tokens: + self.logit_filters.append(SuppressTokens(self._get_suppress_tokens())) + if not options.without_timestamps: + precision = CHUNK_LENGTH / model.dims.n_audio_ctx # usually 0.02 seconds + max_initial_timestamp_index = None + if options.max_initial_timestamp: + max_initial_timestamp_index = round(self.options.max_initial_timestamp / precision) + self.logit_filters.append( + ApplyTimestampRules(tokenizer, self.sample_begin, max_initial_timestamp_index) + ) + + def _verify_options(self, options: DecodingOptions) -> DecodingOptions: + if options.beam_size is not None and options.best_of is not None: + raise ValueError("beam_size and best_of can't be given together") + if options.temperature == 0: + if options.best_of is not None: + raise ValueError("best_of with greedy sampling (T=0) is not compatible") + if options.patience is not None and 
options.beam_size is None:
+            raise ValueError("patience requires beam_size to be given")
+        if options.length_penalty is not None and not (0 <= options.length_penalty <= 1):
+            raise ValueError("length_penalty (alpha) should be a value between 0 and 1")
+
+        return options
+
+    def _get_initial_tokens(self) -> Tuple[int]:
+        tokens = list(self.sot_sequence)
+        prefix = self.options.prefix
+        prompt = self.options.prompt
+
+        if prefix:
+            prefix_tokens = (
+                self.tokenizer.encode(" " + prefix.strip()) if isinstance(prefix, str) else prefix
+            )
+            if self.sample_len is not None:
+                max_prefix_len = self.n_ctx // 2 - self.sample_len
+                prefix_tokens = prefix_tokens[-max_prefix_len:]
+            tokens = tokens + prefix_tokens
+
+        if prompt:
+            prompt_tokens = (
+                self.tokenizer.encode(" " + prompt.strip()) if isinstance(prompt, str) else prompt
+            )
+            tokens = [self.tokenizer.sot_prev] + prompt_tokens[-(self.n_ctx // 2 - 1) :] + tokens
+
+        return tuple(tokens)
+
+    def _get_suppress_tokens(self) -> Tuple[int]:
+        suppress_tokens = self.options.suppress_tokens
+
+        if isinstance(suppress_tokens, str):
+            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]
+
+        if -1 in suppress_tokens:
+            suppress_tokens = [t for t in suppress_tokens if t >= 0]
+            suppress_tokens.extend(self.tokenizer.non_speech_tokens)
+        elif suppress_tokens is None or len(suppress_tokens) == 0:
+            suppress_tokens = []  # interpret empty string as an empty list
+        else:
+            assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"
+
+        suppress_tokens.extend(
+            [self.tokenizer.sot, self.tokenizer.sot_prev, self.tokenizer.sot_lm]
+        )
+        if self.tokenizer.no_speech is not None:
+            # no-speech probability is collected separately
+            suppress_tokens.append(self.tokenizer.no_speech)
+
+        return tuple(sorted(set(suppress_tokens)))
+
+    def _get_audio_features(self, mel: Tensor):
+        if self.options.fp16:
+            mel = mel.half()
+
+        if mel.shape[-2:] == (self.model.dims.n_audio_ctx, self.model.dims.n_audio_state):
+            # encoded audio features are given; skip audio encoding
+            audio_features = mel
+        else:
+            audio_features = self.model.encoder(mel)
+
+        if audio_features.dtype != (torch.float16 if self.options.fp16 else torch.float32):
+            raise TypeError(f"audio_features has an incorrect dtype: {audio_features.dtype}")
+
+        return audio_features
+
+    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
+        languages = [self.options.language] * audio_features.shape[0]
+        lang_probs = None
+
+        if self.options.language is None or self.options.task == "lang_id":
+            lang_tokens, lang_probs = self.model.detect_language(audio_features, self.tokenizer)
+            languages = [max(probs, key=probs.get) for probs in lang_probs]
+            if self.options.language is None:
+                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens
+
+        return languages, lang_probs
+
+    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
+        assert audio_features.shape[0] == tokens.shape[0]
+        n_batch = tokens.shape[0]
+        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
+        no_speech_probs = [np.nan] * n_batch
+
+        try:
+            for i in range(self.sample_len):
+                logits = self.inference.logits(tokens, audio_features)
+
+                if i == 0 and self.tokenizer.no_speech is not None:  # save no_speech_probs
+                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
+                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()
+
+                # now we
need to consider the logits at the last token only + logits = logits[:, -1] + + # apply the logit filters, e.g. for suppressing or applying penalty to + for logit_filter in self.logit_filters: + logit_filter.apply(logits, tokens) + + # expand the tokens tensor with the selected next tokens + tokens, completed = self.decoder.update(tokens, logits, sum_logprobs) + + if completed or tokens.shape[-1] > self.n_ctx: + break + finally: + self.inference.cleanup_caching() + + return tokens, sum_logprobs, no_speech_probs + + @torch.no_grad() + def run(self, mel: Tensor) -> List[DecodingResult]: + self.decoder.reset() + tokenizer: Tokenizer = self.tokenizer + n_audio: int = mel.shape[0] + + audio_features: Tensor = self._get_audio_features(mel) # encoder forward pass + tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1) + + # detect language if requested, overwriting the language token + languages, language_probs = self._detect_language(audio_features, tokens) + if self.options.task == "lang_id": + return [ + DecodingResult(audio_features=features, language=language, language_probs=probs) + for features, language, probs in zip(audio_features, languages, language_probs) + ] + + # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling + audio_features = audio_features.repeat_interleave(self.n_group, dim=0) + tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device) + + # call the main sampling loop + tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens) + + # reshape the tensors to have (n_audio, n_group) as the first two dimensions + audio_features = audio_features[:: self.n_group] + no_speech_probs = no_speech_probs[:: self.n_group] + assert audio_features.shape[0] == len(no_speech_probs) == n_audio + + tokens = tokens.reshape(n_audio, self.n_group, -1) + sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group) + + # get the final candidates for each group, and slice between the first sampled token and EOT + tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs) + tokens: List[List[Tensor]] = [ + [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] for s in tokens + ] + + # select the top-ranked sample in each group + selected = self.sequence_ranker.rank(tokens, sum_logprobs) + tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)] + texts: List[str] = [tokenizer.decode(t).strip() for t in tokens] + + sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)] + avg_logprobs: List[float] = [lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)] + + fields = (texts, languages, tokens, audio_features, avg_logprobs, no_speech_probs) + if len(set(map(len, fields))) != 1: + raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}") + + return [ + DecodingResult( + audio_features=features, + language=language, + tokens=tokens, + text=text, + avg_logprob=avg_logprob, + no_speech_prob=no_speech_prob, + temperature=self.options.temperature, + compression_ratio=compression_ratio(text), + ) + for text, language, tokens, features, avg_logprob, no_speech_prob in zip(*fields) + ] + + +@torch.no_grad() +def decode(model: "Whisper", mel: Tensor, options: DecodingOptions = DecodingOptions()) -> Union[DecodingResult, List[DecodingResult]]: + """ + Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s). 
+ + Parameters + ---------- + model: Whisper + the Whisper model instance + + mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000) + A tensor containing the Mel spectrogram(s) + + options: DecodingOptions + A dataclass that contains all necessary options for decoding 30-second segments + + Returns + ------- + result: Union[DecodingResult, List[DecodingResult]] + The result(s) of decoding contained in `DecodingResult` dataclass instance(s) + """ + single = mel.ndim == 2 + if single: + mel = mel.unsqueeze(0) + result = DecodingTask(model, options).run(mel) + + if single: + result = result[0] + + return result diff --git a/whisper/inference.py b/whisper/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..16174c1499d49eccce6a212524309138bfe730b8 --- /dev/null +++ b/whisper/inference.py @@ -0,0 +1,78 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import numpy as np +import argparse +import torch + +from whisper.model import Whisper, ModelDimensions +from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram + + +def load_model(path, device) -> Whisper: + checkpoint = torch.load(path, map_location="cpu") + dims = ModelDimensions(**checkpoint["dims"]) + # print(dims) + model = Whisper(dims) + del model.decoder + cut = len(model.encoder.blocks) // 4 + cut = -1 * cut + del model.encoder.blocks[cut:] + model.load_state_dict(checkpoint["model_state_dict"], strict=False) + model.eval() + if not (device == "cpu"): + model.half() + model.to(device) + # torch.save({ + # 'dims': checkpoint["dims"], + # 'model_state_dict': model.state_dict(), + # }, "large-v2.pt") + return model + + +def pred_ppg(whisper: Whisper, wavPath, ppgPath, device): + audio = load_audio(wavPath) + audln = audio.shape[0] + ppg_a = [] + idx_s = 0 + while (idx_s + 15 * 16000 < audln): + short = audio[idx_s:idx_s + 15 * 16000] + idx_s = idx_s + 15 * 16000 + ppgln = 15 * 16000 // 320 + # short = pad_or_trim(short) + mel = log_mel_spectrogram(short).to(device) + if not (device == "cpu"): + mel = mel.half() + with torch.no_grad(): + mel = mel + torch.randn_like(mel) * 0.1 + ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() + ppg = ppg[:ppgln,] # [length, dim=1024] + ppg_a.extend(ppg) + if (idx_s < audln): + short = audio[idx_s:audln] + ppgln = (audln - idx_s) // 320 + # short = pad_or_trim(short) + mel = log_mel_spectrogram(short).to(device) + if not (device == "cpu"): + mel = mel.half() + with torch.no_grad(): + mel = mel + torch.randn_like(mel) * 0.1 + ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() + ppg = ppg[:ppgln,] # [length, dim=1024] + ppg_a.extend(ppg) + np.save(ppgPath, ppg_a, allow_pickle=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) + parser.add_argument("-p", "--ppg", help="ppg", dest="ppg", required=True) + args = parser.parse_args() + print(args.wav) + print(args.ppg) + + wavPath = args.wav + ppgPath = args.ppg + + device = "cuda" if torch.cuda.is_available() else "cpu" + whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"), device) + pred_ppg(whisper, wavPath, ppgPath, device) diff --git a/whisper/model.py b/whisper/model.py new file mode 100644 index 0000000000000000000000000000000000000000..78d6d135ee8df21e4deb0fffa5d15e5631f960c5 --- /dev/null +++ b/whisper/model.py @@ -0,0 +1,270 @@ +from dataclasses import dataclass +from typing import Dict +from 
typing import Iterable, Optional + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor +from torch import nn + +from .decoding import detect_language as detect_language_function, decode as decode_function + + +@dataclass +class ModelDimensions: + n_mels: int + n_audio_ctx: int + n_audio_state: int + n_audio_head: int + n_audio_layer: int + n_vocab: int + n_text_ctx: int + n_text_state: int + n_text_head: int + n_text_layer: int + + +class LayerNorm(nn.LayerNorm): + def forward(self, x: Tensor) -> Tensor: + # return super().forward(x.float()).type(x.dtype) sovits5.0 + return super().forward(x).type(x.dtype) + + +class Linear(nn.Linear): + def forward(self, x: Tensor) -> Tensor: + return F.linear( + x, self.weight.to(x.dtype), None if self.bias is None else self.bias.to(x.dtype) + ) + + +class Conv1d(nn.Conv1d): + def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor: + return super()._conv_forward( + x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype) + ) + + +def sinusoids(length, channels, max_timescale=10000): + """Returns sinusoids for positional embedding""" + assert channels % 2 == 0 + log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2)) + scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] + return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1) + + +class MultiHeadAttention(nn.Module): + def __init__(self, n_state: int, n_head: int): + super().__init__() + self.n_head = n_head + self.query = Linear(n_state, n_state) + self.key = Linear(n_state, n_state, bias=False) + self.value = Linear(n_state, n_state) + self.out = Linear(n_state, n_state) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. + k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. 
+ k = kv_cache[self.key] + v = kv_cache[self.value] + + wv, qk = self.qkv_attention(q, k, v, mask) + return self.out(wv), qk + + def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None): + n_batch, n_ctx, n_state = q.shape + scale = (n_state // self.n_head) ** -0.25 + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale + v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + + qk = q @ k + if mask is not None: + qk = qk + mask[:n_ctx, :n_ctx] + qk = qk.float() + + w = F.softmax(qk, dim=-1).to(q.dtype) + return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach() + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): + super().__init__() + + self.attn = MultiHeadAttention(n_state, n_head) + self.attn_ln = LayerNorm(n_state) + + self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None + self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None + + n_mlp = n_state * 4 + self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)) + self.mlp_ln = LayerNorm(n_state) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0] + if self.cross_attn: + x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0] + x = x + self.mlp(self.mlp_ln(x)) + return x + + +class AudioEncoder(nn.Module): + def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): + super().__init__() + self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1) + self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1) + self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) + + self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( + [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)] + ) + self.ln_post = LayerNorm(n_state) + + def forward(self, x: Tensor): + """ + x : torch.Tensor, shape = (batch_size, n_mels, n_ctx) + the mel spectrogram of the audio + """ + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = x.permute(0, 2, 1) + + len_x = x.shape[1] + len_e = self.positional_embedding.shape[0] + assert len_x <= len_e, "incorrect audio shape" + pos_e = self.positional_embedding[:len_x, :] + x = (x + pos_e).to(x.dtype) + + for block in self.blocks: + x = block(x) + + x = self.ln_post(x) + return x + + +class TextDecoder(nn.Module): + def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): + super().__init__() + + self.token_embedding = nn.Embedding(n_vocab, n_state) + self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state)) + + self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( + [ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)] + ) + self.ln = LayerNorm(n_state) + + mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1) + self.register_buffer("mask", mask, persistent=False) + + def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None): + """ + x : torch.LongTensor, shape = (batch_size, <= n_ctx) + the text tokens + xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx) + the encoded audio features to be attended on + """ + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 
+ x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]] + x = x.to(xa.dtype) + + for block in self.blocks: + x = block(x, xa, mask=self.mask, kv_cache=kv_cache) + + x = self.ln(x) + logits = (x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)).float() + + return logits + + +class Whisper(nn.Module): + def __init__(self, dims: ModelDimensions): + super().__init__() + self.dims = dims + self.encoder = AudioEncoder( + self.dims.n_mels, + self.dims.n_audio_ctx, + self.dims.n_audio_state, + self.dims.n_audio_head, + self.dims.n_audio_layer, + ) + self.decoder = TextDecoder( + self.dims.n_vocab, + self.dims.n_text_ctx, + self.dims.n_text_state, + self.dims.n_text_head, + self.dims.n_text_layer, + ) + + def embed_audio(self, mel: torch.Tensor): + return self.encoder(mel) + + def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor): + return self.decoder(tokens, audio_features) + + def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]: + return self.decoder(tokens, self.encoder(mel)) + + @property + def device(self): + return next(self.parameters()).device + + @property + def is_multilingual(self): + return self.dims.n_vocab == 51865 + + def install_kv_cache_hooks(self, cache: Optional[dict] = None): + """ + The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value + tensors calculated for the previous positions. This method returns a dictionary that stores + all caches, and the necessary hooks for the key and value projection modules that save the + intermediate tensors to be reused during later calculations. + + Returns + ------- + cache : Dict[nn.Module, torch.Tensor] + A dictionary object mapping the key/value projection modules to its cache + hooks : List[RemovableHandle] + List of PyTorch RemovableHandle objects to stop the hooks to be called + """ + cache = {**cache} if cache is not None else {} + hooks = [] + + def save_to_cache(module, _, output): + if module not in cache or output.shape[1] > self.decoder.positional_embedding.shape[0]: + cache[module] = output # save as-is, for the first token or cross attention + else: + cache[module] = torch.cat([cache[module], output], dim=1).detach() + return cache[module] + + def install_hooks(layer: nn.Module): + if isinstance(layer, MultiHeadAttention): + hooks.append(layer.key.register_forward_hook(save_to_cache)) + hooks.append(layer.value.register_forward_hook(save_to_cache)) + + self.decoder.apply(install_hooks) + return cache, hooks + + detect_language = detect_language_function + decode = decode_function diff --git a/whisper/tokenizer.py b/whisper/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..a27cb359ee891590d3f793624f9f8ec768a26cc3 --- /dev/null +++ b/whisper/tokenizer.py @@ -0,0 +1,331 @@ +import os +from dataclasses import dataclass +from functools import lru_cache +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from transformers import GPT2TokenizerFast + +LANGUAGES = { + "en": "english", + "zh": "chinese", + "de": "german", + "es": "spanish", + "ru": "russian", + "ko": "korean", + "fr": "french", + "ja": "japanese", + "pt": "portuguese", + "tr": "turkish", + "pl": "polish", + "ca": "catalan", + "nl": "dutch", + "ar": "arabic", + "sv": "swedish", + "it": "italian", + "id": "indonesian", + "hi": "hindi", + "fi": "finnish", + "vi": "vietnamese", + "he": "hebrew", + "uk": "ukrainian", + "el": "greek", + "ms": "malay", + "cs": "czech", + "ro": 
"romanian", + "da": "danish", + "hu": "hungarian", + "ta": "tamil", + "no": "norwegian", + "th": "thai", + "ur": "urdu", + "hr": "croatian", + "bg": "bulgarian", + "lt": "lithuanian", + "la": "latin", + "mi": "maori", + "ml": "malayalam", + "cy": "welsh", + "sk": "slovak", + "te": "telugu", + "fa": "persian", + "lv": "latvian", + "bn": "bengali", + "sr": "serbian", + "az": "azerbaijani", + "sl": "slovenian", + "kn": "kannada", + "et": "estonian", + "mk": "macedonian", + "br": "breton", + "eu": "basque", + "is": "icelandic", + "hy": "armenian", + "ne": "nepali", + "mn": "mongolian", + "bs": "bosnian", + "kk": "kazakh", + "sq": "albanian", + "sw": "swahili", + "gl": "galician", + "mr": "marathi", + "pa": "punjabi", + "si": "sinhala", + "km": "khmer", + "sn": "shona", + "yo": "yoruba", + "so": "somali", + "af": "afrikaans", + "oc": "occitan", + "ka": "georgian", + "be": "belarusian", + "tg": "tajik", + "sd": "sindhi", + "gu": "gujarati", + "am": "amharic", + "yi": "yiddish", + "lo": "lao", + "uz": "uzbek", + "fo": "faroese", + "ht": "haitian creole", + "ps": "pashto", + "tk": "turkmen", + "nn": "nynorsk", + "mt": "maltese", + "sa": "sanskrit", + "lb": "luxembourgish", + "my": "myanmar", + "bo": "tibetan", + "tl": "tagalog", + "mg": "malagasy", + "as": "assamese", + "tt": "tatar", + "haw": "hawaiian", + "ln": "lingala", + "ha": "hausa", + "ba": "bashkir", + "jw": "javanese", + "su": "sundanese", +} + +# language code lookup by name, with a few language aliases +TO_LANGUAGE_CODE = { + **{language: code for code, language in LANGUAGES.items()}, + "burmese": "my", + "valencian": "ca", + "flemish": "nl", + "haitian": "ht", + "letzeburgesch": "lb", + "pushto": "ps", + "panjabi": "pa", + "moldavian": "ro", + "moldovan": "ro", + "sinhalese": "si", + "castilian": "es", +} + + +@dataclass(frozen=True) +class Tokenizer: + """A thin wrapper around `GPT2TokenizerFast` providing quick access to special tokens""" + + tokenizer: "GPT2TokenizerFast" + language: Optional[str] + sot_sequence: Tuple[int] + + def encode(self, text, **kwargs): + return self.tokenizer.encode(text, **kwargs) + + def decode(self, token_ids: Union[int, List[int], np.ndarray, torch.Tensor], **kwargs): + return self.tokenizer.decode(token_ids, **kwargs) + + def decode_with_timestamps(self, tokens) -> str: + """ + Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. + This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>". 
+ """ + outputs = [[]] + for token in tokens: + if token >= self.timestamp_begin: + timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" + outputs.append(timestamp) + outputs.append([]) + else: + outputs[-1].append(token) + outputs = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] + return "".join(outputs) + + @property + @lru_cache() + def eot(self) -> int: + return self.tokenizer.eos_token_id + + @property + @lru_cache() + def sot(self) -> int: + return self._get_single_token_id("<|startoftranscript|>") + + @property + @lru_cache() + def sot_lm(self) -> int: + return self._get_single_token_id("<|startoflm|>") + + @property + @lru_cache() + def sot_prev(self) -> int: + return self._get_single_token_id("<|startofprev|>") + + @property + @lru_cache() + def no_speech(self) -> int: + return self._get_single_token_id("<|nospeech|>") + + @property + @lru_cache() + def no_timestamps(self) -> int: + return self._get_single_token_id("<|notimestamps|>") + + @property + @lru_cache() + def timestamp_begin(self) -> int: + return self.tokenizer.all_special_ids[-1] + 1 + + @property + @lru_cache() + def language_token(self) -> int: + """Returns the token id corresponding to the value of the `language` field""" + if self.language is None: + raise ValueError(f"This tokenizer does not have language token configured") + + additional_tokens = dict( + zip( + self.tokenizer.additional_special_tokens, + self.tokenizer.additional_special_tokens_ids, + ) + ) + candidate = f"<|{self.language}|>" + if candidate in additional_tokens: + return additional_tokens[candidate] + + raise KeyError(f"Language {self.language} not found in tokenizer.") + + @property + @lru_cache() + def all_language_tokens(self) -> Tuple[int]: + result = [] + for token, token_id in zip( + self.tokenizer.additional_special_tokens, + self.tokenizer.additional_special_tokens_ids, + ): + if token.strip("<|>") in LANGUAGES: + result.append(token_id) + return tuple(result) + + @property + @lru_cache() + def all_language_codes(self) -> Tuple[str]: + return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens) + + @property + @lru_cache() + def sot_sequence_including_notimestamps(self) -> Tuple[int]: + return tuple(list(self.sot_sequence) + [self.no_timestamps]) + + @property + @lru_cache() + def non_speech_tokens(self) -> Tuple[int]: + """ + Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech + annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. + + - ♪♪♪ + - ( SPEAKING FOREIGN LANGUAGE ) + - [DAVID] Hey there, + + keeping basic punctuations like commas, periods, question marks, exclamation points, etc. + """ + symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』") + symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() + + # symbols that may be a single token or multiple tokens depending on the tokenizer. + # In case they're multiple tokens, suppress the first token, which is safe because: + # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress + # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. 
+ miscellaneous = set("♩♪♫♬♭♮♯") + assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) + + # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word + result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]} + for symbol in symbols + list(miscellaneous): + for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]: + if len(tokens) == 1 or symbol in miscellaneous: + result.add(tokens[0]) + + return tuple(sorted(result)) + + def _get_single_token_id(self, text) -> int: + tokens = self.tokenizer.encode(text) + assert len(tokens) == 1, f"{text} is not encoded as a single token" + return tokens[0] + + +@lru_cache(maxsize=None) +def build_tokenizer(name: str = "gpt2"): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + path = os.path.join(os.path.dirname(__file__), "assets", name) + tokenizer = GPT2TokenizerFast.from_pretrained(path) + + specials = [ + "<|startoftranscript|>", + *[f"<|{lang}|>" for lang in LANGUAGES.keys()], + "<|translate|>", + "<|transcribe|>", + "<|startoflm|>", + "<|startofprev|>", + "<|nospeech|>", + "<|notimestamps|>", + ] + + tokenizer.add_special_tokens(dict(additional_special_tokens=specials)) + return tokenizer + + +@lru_cache(maxsize=None) +def get_tokenizer( + multilingual: bool, + *, + task: Optional[str] = None, # Literal["transcribe", "translate", None] + language: Optional[str] = None, +) -> Tokenizer: + if language is not None: + language = language.lower() + if language not in LANGUAGES: + if language in TO_LANGUAGE_CODE: + language = TO_LANGUAGE_CODE[language] + else: + raise ValueError(f"Unsupported language: {language}") + + if multilingual: + tokenizer_name = "multilingual" + task = task or "transcribe" + language = language or "en" + else: + tokenizer_name = "gpt2" + task = None + language = None + + tokenizer = build_tokenizer(name=tokenizer_name) + all_special_ids: List[int] = tokenizer.all_special_ids + sot: int = all_special_ids[1] + translate: int = all_special_ids[-6] + transcribe: int = all_special_ids[-5] + + langs = tuple(LANGUAGES.keys()) + sot_sequence = [sot] + if language is not None: + sot_sequence.append(sot + 1 + langs.index(language)) + if task is not None: + sot_sequence.append(transcribe if task == "transcribe" else translate) + + return Tokenizer(tokenizer=tokenizer, language=language, sot_sequence=tuple(sot_sequence)) diff --git a/whisper/utils.py b/whisper/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5dacc173c40bcd6e999d728862e29a968000b12e --- /dev/null +++ b/whisper/utils.py @@ -0,0 +1,163 @@ +import json +import os +import sys +import zlib +from typing import Callable, TextIO + +system_encoding = sys.getdefaultencoding() + +if system_encoding != "utf-8": + def make_safe(string): + # replaces any character not representable using the system default encoding with an '?', + # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729). 
+ return string.encode(system_encoding, errors="replace").decode(system_encoding) +else: + def make_safe(string): + # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding + return string + + +def exact_div(x, y): + assert x % y == 0 + return x // y + + +def str2bool(string): + str2val = {"True": True, "False": False} + if string in str2val: + return str2val[string] + else: + raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}") + + +def optional_int(string): + return None if string == "None" else int(string) + + +def optional_float(string): + return None if string == "None" else float(string) + + +def compression_ratio(text) -> float: + text_bytes = text.encode("utf-8") + return len(text_bytes) / len(zlib.compress(text_bytes)) + + +def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'): + assert seconds >= 0, "non-negative timestamp expected" + milliseconds = round(seconds * 1000.0) + + hours = milliseconds // 3_600_000 + milliseconds -= hours * 3_600_000 + + minutes = milliseconds // 60_000 + milliseconds -= minutes * 60_000 + + seconds = milliseconds // 1_000 + milliseconds -= seconds * 1_000 + + hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else "" + return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}" + + +class ResultWriter: + extension: str + + def __init__(self, output_dir: str): + self.output_dir = output_dir + + def __call__(self, result: dict, audio_path: str): + audio_basename = os.path.basename(audio_path) + output_path = os.path.join(self.output_dir, audio_basename + "." + self.extension) + + with open(output_path, "w", encoding="utf-8") as f: + self.write_result(result, file=f) + + def write_result(self, result: dict, file: TextIO): + raise NotImplementedError + + +class WriteTXT(ResultWriter): + extension: str = "txt" + + def write_result(self, result: dict, file: TextIO): + for segment in result["segments"]: + print(segment['text'].strip(), file=file, flush=True) + + +class WriteVTT(ResultWriter): + extension: str = "vtt" + + def write_result(self, result: dict, file: TextIO): + print("WEBVTT\n", file=file) + for segment in result["segments"]: + print( + f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n" + f"{segment['text'].strip().replace('-->', '->')}\n", + file=file, + flush=True, + ) + + +class WriteSRT(ResultWriter): + extension: str = "srt" + + def write_result(self, result: dict, file: TextIO): + for i, segment in enumerate(result["segments"], start=1): + # write srt lines + print( + f"{i}\n" + f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> " + f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n" + f"{segment['text'].strip().replace('-->', '->')}\n", + file=file, + flush=True, + ) + + +class WriteTSV(ResultWriter): + """ + Write a transcript to a file in TSV (tab-separated values) format containing lines like: + \t\t + + Using integer milliseconds as start and end times means there's no chance of interference from + an environment setting a language encoding that causes the decimal in a floating point number + to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++. 
+ """ + extension: str = "tsv" + + def write_result(self, result: dict, file: TextIO): + print("start", "end", "text", sep="\t", file=file) + for segment in result["segments"]: + print(round(1000 * segment['start']), file=file, end="\t") + print(round(1000 * segment['end']), file=file, end="\t") + print(segment['text'].strip().replace("\t", " "), file=file, flush=True) + + +class WriteJSON(ResultWriter): + extension: str = "json" + + def write_result(self, result: dict, file: TextIO): + json.dump(result, file) + + +def get_writer(output_format: str, output_dir: str) -> Callable[[dict, TextIO], None]: + writers = { + "txt": WriteTXT, + "vtt": WriteVTT, + "srt": WriteSRT, + "tsv": WriteTSV, + "json": WriteJSON, + } + + if output_format == "all": + all_writers = [writer(output_dir) for writer in writers.values()] + + def write_all(result: dict, file: TextIO): + for writer in all_writers: + writer(result, file) + + return write_all + + return writers[output_format](output_dir) + diff --git a/whisper_pretrain/.DS_Store b/whisper_pretrain/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/whisper_pretrain/.DS_Store differ diff --git a/whisper_pretrain/README.md b/whisper_pretrain/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f615cae0154333d0d3c778d83fc5263d30990b54 --- /dev/null +++ b/whisper_pretrain/README.md @@ -0,0 +1,3 @@ +Path for: + + large-v2.pt