Orpheus-3B Anime Speech Finetune (10 Voices)

This repository contains a text-to-speech (TTS) model fine-tuned from canopylabs/orpheus-3b-0.1-ft. It has been specifically trained to generate anime-style speech using 10 distinct voices from the ShoukanLabs/AniSpeech dataset.

Model Description

  • Base Model: canopylabs/orpheus-3b-0.1-ft
  • Fine-tuning Dataset: ShoukanLabs/AniSpeech (Subset of 10 voices)
  • Architecture: Orpheus-3B
  • Language(s): Primarily trained on English audio; performance on other languages may vary.
  • Purpose: Generating expressive anime character voices from text prompts.

Voices Available & Audio Samples

The model was fine-tuned on the following 10 voice IDs from the AniSpeech dataset. You can select a voice by prefixing your text with the numeric part of its ID (a short example follows the list).

(Sample Text: "Rain tapped the tin roof as Mira whispered secrets to the dusk. Shadows danced between the lantern’s glow, weaving memories of laughter and loss.")

  • voice-16
  • voice-107
  • voice-125
  • voice-145
  • voice-163
  • voice-179
  • voice-180
  • voice-183
  • voice-185
  • voice-187
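To pick a voice, prefix the text with the numeric part of its ID, mirroring how the script in the Usage section builds its prompts. A minimal sketch (the voice ID and text here are only examples):

text = "Rain tapped the tin roof as Mira whispered secrets to the dusk."
voice_id = "16"  # numeric part of "voice-16"
prompt = f"{voice_id}: {text}"  # -> "16: Rain tapped the tin roof..."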

Usage

First, install the necessary libraries:

pip install torch transformers scipy tqdm unsloth snac

Save the following code as a Python file (e.g., generate_speech.py) and run it. The script generates audio for the sample prompt with each of the available voices and saves the results to the outputs-animespeech-ft directory.

import torch
from unsloth import FastLanguageModel
from snac import SNAC
from scipy.io.wavfile import write as write_wav
import os
from tqdm import tqdm

MODEL_NAME = "taresh18/orpheus-3B-animespeech-ft"
SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = False
DTYPE = None
DEVICE = "cuda"
OUTPUT_DIR = "outputs-animespeech-ft"

PROMPTS = [
    "Rain tapped the tin roof as Mira whispered secrets to the dusk. Shadows danced between the lantern’s glow, weaving memories of laughter and loss.",
]
VOICES = ["107", "125", "145", "16", "163", "179", "180", "183", "185", "187"]

# Special token IDs
START_TOKEN_ID = 128259
END_TOKENS_IDS = [128009, 128260]
PAD_TOKEN_ID = 128263
CROP_START_TOKEN_ID = 128257
REMOVE_TOKEN_ID = 128258
AUDIO_CODE_OFFSET = 128266


def redistribute_codes(code_list, device):
    """Redistributes the flat token list into the three SNAC codebook layers on the specified device."""
    layer_1, layer_2, layer_3 = [], [], []
    num_frames = len(code_list) // 7
    for i in range(num_frames):
        base_idx = 7 * i
        if base_idx + 6 >= len(code_list):
            break
        # Each 7-code frame maps onto the SNAC hierarchy: 1 code for layer 1,
        # 2 codes for layer 2, and 4 codes for layer 3. Each position carries
        # an offset that is a multiple of 4096, which is subtracted here.
        layer_1.append(code_list[base_idx])
        layer_2.append(code_list[base_idx + 1] - 4096)
        layer_3.append(code_list[base_idx + 2] - (2 * 4096))
        layer_3.append(code_list[base_idx + 3] - (3 * 4096))
        layer_2.append(code_list[base_idx + 4] - (4 * 4096))
        layer_3.append(code_list[base_idx + 5] - (5 * 4096))
        layer_3.append(code_list[base_idx + 6] - (6 * 4096))

    codes = [torch.tensor(layer_1, dtype=torch.long, device=device).unsqueeze(0),
             torch.tensor(layer_2, dtype=torch.long, device=device).unsqueeze(0),
             torch.tensor(layer_3, dtype=torch.long, device=device).unsqueeze(0)]
    return codes


def load_models():
    """Loads the language model and the SNAC vocoder."""
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=DTYPE,
        load_in_4bit=LOAD_IN_4BIT,
    )
    FastLanguageModel.for_inference(model)

    snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME)
    snac_model.to(DEVICE)
    snac_model.eval()
    print("Models loaded.")
    return model, tokenizer, snac_model

def generate_audio_from_prompts(model, tokenizer, snac_model, prompts, chosen_voice):
    """Generates audio tensors from text prompts."""
    prompts_with_voice = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]
    all_input_ids = [tokenizer(p, return_tensors="pt").input_ids for p in prompts_with_voice]

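    # Wrap each tokenized prompt with the model's special start and end tokens.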
    start_token = torch.tensor([[START_TOKEN_ID]], dtype=torch.int64)
    end_tokens = torch.tensor([END_TOKENS_IDS], dtype=torch.int64)

    all_modified_input_ids = [torch.cat([start_token, ids, end_tokens], dim=1) for ids in all_input_ids]

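    # Left-pad all sequences to the same length and build matching attention masks (0 for padding, 1 for real tokens).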
    max_length = max([mod_ids.shape[1] for mod_ids in all_modified_input_ids])
    all_padded_tensors = []
    all_attention_masks = []
    for mod_ids in all_modified_input_ids:
        padding_length = max_length - mod_ids.shape[1]
        padding_tensor = torch.full((1, padding_length), PAD_TOKEN_ID, dtype=torch.int64)
        padded_tensor = torch.cat([padding_tensor, mod_ids], dim=1)
        mask_padding = torch.zeros((1, padding_length), dtype=torch.int64)
        mask_real = torch.ones((1, mod_ids.shape[1]), dtype=torch.int64)
        attention_mask = torch.cat([mask_padding, mask_real], dim=1)
        all_padded_tensors.append(padded_tensor)
        all_attention_masks.append(attention_mask)

    batch_input_ids = torch.cat(all_padded_tensors, dim=0).to(DEVICE)
    batch_attention_mask = torch.cat(all_attention_masks, dim=0).to(DEVICE)

    print("Generating tokens...")
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            max_new_tokens=1200,
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=REMOVE_TOKEN_ID,
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else PAD_TOKEN_ID,
            use_cache=True
        )
    generated_ids = generated_ids.to("cpu")
    print("Token generation complete.")

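    # Keep only the tokens after the last CROP_START_TOKEN_ID marker in each sequence;
    # if the marker is absent, fall back to slicing off the original prompt length.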
    token_indices = (generated_ids == CROP_START_TOKEN_ID).nonzero(as_tuple=True)
    cropped_tensors = []
    if len(token_indices[0]) > 0:
        for i in range(generated_ids.shape[0]):
            seq_indices = token_indices[1][token_indices[0] == i]
            if len(seq_indices) > 0:
                last_occurrence_idx = seq_indices[-1].item()
                cropped_tensors.append(generated_ids[i, last_occurrence_idx + 1:].unsqueeze(0))
            else:
                cropped_tensors.append(generated_ids[i, batch_input_ids.shape[1]:].unsqueeze(0))
    else:
         cropped_tensors = [generated_ids[i, batch_input_ids.shape[1]:].unsqueeze(0) for i in range(generated_ids.shape[0])]


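    # Filter out REMOVE_TOKEN_ID (used as the EOS token during generation) from each cropped sequence.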
    processed_rows = []
    for row_tensor in cropped_tensors:
         if row_tensor.numel() > 0:
            row_1d = row_tensor.squeeze(0)
            mask = row_1d != REMOVE_TOKEN_ID
            processed_rows.append(row_1d[mask])
         else:
            processed_rows.append(row_tensor.squeeze(0))

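    # Trim each sequence to a whole number of 7-token SNAC frames and remove the audio-code offset.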
    code_lists = []
    for row in processed_rows:
        if row.numel() >= 7:
            row_length = row.size(0)
            new_length = (row_length // 7) * 7
            trimmed_row = row[:new_length]
            adjusted_code_list = [(t.item() - AUDIO_CODE_OFFSET) for t in trimmed_row]
            code_lists.append(adjusted_code_list)
        else:
            code_lists.append([])

    print("Decoding audio with SNAC...")
    all_audio_samples = []
    for i, code_list in enumerate(code_lists):
        if code_list:
            codes_for_snac = redistribute_codes(code_list, DEVICE)
            with torch.no_grad():
                audio_hat = snac_model.decode(codes_for_snac)
            all_audio_samples.append(audio_hat.detach().cpu())
        else:
            all_audio_samples.append(torch.tensor([[]]))

    return all_audio_samples


def main():
    model, tokenizer, snac_model = load_models()

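    # Generate audio for each voice and save the results as 24 kHz WAV files.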
    for voice in tqdm(VOICES):
        my_samples = generate_audio_from_prompts(model, tokenizer, snac_model, PROMPTS, voice)

        if len(PROMPTS) != len(my_samples):
            print("Error: Mismatch between number of prompts and generated samples.")
        else:
            os.makedirs(OUTPUT_DIR, exist_ok=True)

            for i, samples in enumerate(my_samples):
                if samples.numel() > 0:
                    audio_data = samples.squeeze().numpy()
                    if audio_data.ndim == 0:
                        audio_data = audio_data.reshape(1)
                    output_filename = os.path.join(OUTPUT_DIR, f"voice_{voice}_{i}.wav")
                    write_wav(output_filename, 24000, audio_data)
                    print(f"Saved audio to: {output_filename}")
                else:
                    print(f"Skipping save for sample {i} as no audio data was generated.")


if __name__ == "__main__":
    main()
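After the script finishes, each clip is written as a 24 kHz WAV file in outputs-animespeech-ft, named voice_<voice id>_<prompt index>.wav. A minimal sketch for sanity-checking one of the generated files (the filename below is only an example; point it at a file your run actually produced):

import os
from scipy.io.wavfile import read as read_wav

sample_path = os.path.join("outputs-animespeech-ft", "voice_16_0.wav")  # example output filename
sample_rate, audio = read_wav(sample_path)
print(sample_rate, audio.shape)  # expect a 24000 Hz sample rate and a 1-D waveform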