---
base_model: unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
tags:
- transformers
- unsloth
- llama
- text-to-speech
- tts
- audio
- speech
- anime
- english
- orpheus
- snac
license: apache-2.0
pipeline_tag: text-to-speech
language:
- en
datasets:
- ShoukanLabs/AniSpeech
widget:
- text: "Rain tapped the tin roof as Mira whispered secrets to the dusk. Shadows danced between the lantern’s glow, weaving memories of laughter and loss."
  output:
    url: "https://cdn-uploads.huggingface.co/production/uploads/67c2f8504369cf18d0c356c3/4npjSAGONHwPwNtYAfyF-.wav"
  voice: "16"
---

# Orpheus-3B Anime Speech Finetune (10 Voices)

This repository contains a text-to-speech (TTS) model fine-tuned from `canopylabs/orpheus-3b-0.1-ft`. It was trained to generate anime-style speech using 10 distinct voices from the `ShoukanLabs/AniSpeech` dataset.

## Model Description

* **Base Model:** [canopylabs/orpheus-3b-0.1-ft](https://huggingface.co/canopylabs/orpheus-3b-0.1-ft)
* **Fine-tuning Dataset:** [ShoukanLabs/AniSpeech](https://huggingface.co/datasets/ShoukanLabs/AniSpeech) (subset of 10 voices)
* **Architecture:** Orpheus-3B
* **Language(s):** Trained primarily on English audio; performance on other languages may vary.
* **Purpose:** Generating expressive anime character voices from text prompts.

## Voices Available & Audio Samples

The model was fine-tuned on the following 10 voice IDs from the AniSpeech dataset. You select a voice by providing its ID at the start of the prompt (see the sketch after the list).

*(Sample text: "Rain tapped the tin roof as Mira whispered secrets to the dusk. Shadows danced between the lantern’s glow, weaving memories of laughter and loss.")*

* **voice-16**
* **voice-107**
* **voice-125**
* **voice-145**
* **voice-163**
* **voice-179**
* **voice-180**
* **voice-183**
* **voice-185**
* **voice-187**
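The generation script in the Usage section selects the voice purely through the text prompt, by prepending the voice ID and a colon. A minimal sketch of that prompt format (the input text here is a placeholder):

```python
# Prompt format used by the generation script below: "<voice_id>: <text>".
voice_id = "16"                  # any ID from the list above
text = "Hello there!"            # placeholder input text
prompt = f"{voice_id}: {text}"   # -> "16: Hello there!"
```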
## Usage

First, install the necessary libraries:

```
pip install torch transformers scipy tqdm unsloth snac
```

Save the following code as a Python file (e.g., `generate_speech.py`) and run it. The script generates audio for each of the specified prompts with every available voice.

```python
import torch
from unsloth import FastLanguageModel
from snac import SNAC
from scipy.io.wavfile import write as write_wav
import os
from tqdm import tqdm

MODEL_NAME = "taresh18/orpheus-3B-animespeech-ft"
SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = False  # set to True to reduce VRAM usage
DTYPE = None          # None lets unsloth pick an appropriate dtype
DEVICE = "cuda"
OUTPUT_DIR = "outputs-animespeech-ft"

PROMPTS = [
    "Rain tapped the tin roof as Mira whispered secrets to the dusk. Shadows danced between the lantern’s glow, weaving memories of laughter and loss.",
]

VOICES = ["107", "125", "145", "16", "163", "179", "180", "183", "185", "187"]

# Special token IDs
START_TOKEN_ID = 128259            # prepended before the tokenized prompt
END_TOKENS_IDS = [128009, 128260]  # appended after the tokenized prompt
PAD_TOKEN_ID = 128263              # used to left-pad batched inputs
CROP_START_TOKEN_ID = 128257       # marks the start of the audio token stream
REMOVE_TOKEN_ID = 128258           # end-of-speech token, stripped before decoding
AUDIO_CODE_OFFSET = 128266         # audio codes begin at this vocabulary offset
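
# The model emits audio as flat frames of 7 SNAC codes. Judging by the
# arithmetic in redistribute_codes below, slot n of each frame is offset by
# n * 4096 so the seven sub-codebooks occupy disjoint vocabulary ranges, and
# each frame maps onto SNAC's three layers as 1 + 2 + 4 codes. The helper
# undoes this interleaving. (Inferred from this script, not from official
# Orpheus documentation.)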
def redistribute_codes(code_list, device):
    """Redistributes a flat token list into the three SNAC layers on the specified device."""
    layer_1, layer_2, layer_3 = [], [], []
    num_frames = len(code_list) // 7
    for i in range(num_frames):
        base_idx = 7 * i
        if base_idx + 6 >= len(code_list):
            break
        layer_1.append(code_list[base_idx])
        layer_2.append(code_list[base_idx + 1] - 4096)
        layer_3.append(code_list[base_idx + 2] - (2 * 4096))
        layer_3.append(code_list[base_idx + 3] - (3 * 4096))
        layer_2.append(code_list[base_idx + 4] - (4 * 4096))
        layer_3.append(code_list[base_idx + 5] - (5 * 4096))
        layer_3.append(code_list[base_idx + 6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1, dtype=torch.long, device=device).unsqueeze(0),
        torch.tensor(layer_2, dtype=torch.long, device=device).unsqueeze(0),
        torch.tensor(layer_3, dtype=torch.long, device=device).unsqueeze(0),
    ]
    return codes


def load_models():
    """Loads the language model and the SNAC vocoder."""
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=DTYPE,
        load_in_4bit=LOAD_IN_4BIT,
    )
    FastLanguageModel.for_inference(model)

    snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME)
    snac_model.to(DEVICE)
    snac_model.eval()
    print("Models loaded.")
    return model, tokenizer, snac_model


def generate_audio_from_prompts(model, tokenizer, snac_model, prompts, chosen_voice):
    """Generates audio tensors from text prompts."""
    # Prefix each prompt with the chosen voice ID.
    prompts_with_voice = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]
    all_input_ids = [tokenizer(p, return_tensors="pt").input_ids for p in prompts_with_voice]

    # Wrap each prompt in the start/end special tokens.
    start_token = torch.tensor([[START_TOKEN_ID]], dtype=torch.int64)
    end_tokens = torch.tensor([END_TOKENS_IDS], dtype=torch.int64)
    all_modified_input_ids = [torch.cat([start_token, ids, end_tokens], dim=1) for ids in all_input_ids]

    # Left-pad all sequences to the same length so they can be batched.
    max_length = max(mod_ids.shape[1] for mod_ids in all_modified_input_ids)
    all_padded_tensors = []
    all_attention_masks = []
    for mod_ids in all_modified_input_ids:
        padding_length = max_length - mod_ids.shape[1]
        padding_tensor = torch.full((1, padding_length), PAD_TOKEN_ID, dtype=torch.int64)
        padded_tensor = torch.cat([padding_tensor, mod_ids], dim=1)
        mask_padding = torch.zeros((1, padding_length), dtype=torch.int64)
        mask_real = torch.ones((1, mod_ids.shape[1]), dtype=torch.int64)
        attention_mask = torch.cat([mask_padding, mask_real], dim=1)
        all_padded_tensors.append(padded_tensor)
        all_attention_masks.append(attention_mask)

    batch_input_ids = torch.cat(all_padded_tensors, dim=0).to(DEVICE)
    batch_attention_mask = torch.cat(all_attention_masks, dim=0).to(DEVICE)

    print("Generating tokens...")
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            max_new_tokens=1200,
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=REMOVE_TOKEN_ID,
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else PAD_TOKEN_ID,
            use_cache=True,
        )
    generated_ids = generated_ids.to("cpu")
    print("Token generation complete.")

    # Crop each sequence to the tokens after the last audio-start marker;
    # fall back to everything after the prompt if the marker is absent.
    token_indices = (generated_ids == CROP_START_TOKEN_ID).nonzero(as_tuple=True)
    cropped_tensors = []
    if len(token_indices[0]) > 0:
        for i in range(generated_ids.shape[0]):
            seq_indices = token_indices[1][token_indices[0] == i]
            if len(seq_indices) > 0:
                last_occurrence_idx = seq_indices[-1].item()
                cropped_tensors.append(generated_ids[i, last_occurrence_idx + 1:].unsqueeze(0))
            else:
                cropped_tensors.append(generated_ids[i, batch_input_ids.shape[1]:].unsqueeze(0))
    else:
        cropped_tensors = [generated_ids[i, batch_input_ids.shape[1]:].unsqueeze(0)
                           for i in range(generated_ids.shape[0])]

    # Strip end-of-speech tokens.
    processed_rows = []
    for row_tensor in cropped_tensors:
        if row_tensor.numel() > 0:
            row_1d = row_tensor.squeeze(0)
            mask = row_1d != REMOVE_TOKEN_ID
            processed_rows.append(row_1d[mask])
        else:
            processed_rows.append(row_tensor.squeeze(0))

    # Trim to a whole number of 7-token frames and shift into SNAC code space.
    code_lists = []
    for row in processed_rows:
        if row.numel() >= 7:
            new_length = (row.size(0) // 7) * 7
            trimmed_row = row[:new_length]
            code_lists.append([(t.item() - AUDIO_CODE_OFFSET) for t in trimmed_row])
        else:
            code_lists.append([])

    print("Decoding audio with SNAC...")
    all_audio_samples = []
    for code_list in code_lists:
        if code_list:
            codes_for_snac = redistribute_codes(code_list, DEVICE)
            with torch.no_grad():
                audio_hat = snac_model.decode(codes_for_snac)
            all_audio_samples.append(audio_hat.detach().cpu())
        else:
            all_audio_samples.append(torch.tensor([[]]))
    return all_audio_samples


def main():
    model, tokenizer, snac_model = load_models()
    for voice in tqdm(VOICES):
        my_samples = generate_audio_from_prompts(model, tokenizer, snac_model, PROMPTS, voice)
        if len(PROMPTS) != len(my_samples):
            print("Error: Mismatch between number of prompts and generated samples.")
            continue
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        for i, samples in enumerate(my_samples):
            if samples.numel() > 0:
                audio_data = samples.squeeze().numpy()
                if audio_data.ndim == 0:
                    audio_data = audio_data.reshape(1)
                output_filename = os.path.join(OUTPUT_DIR, f"voice_{voice}_{i}.wav")
                write_wav(output_filename, 24000, audio_data)
                print(f"Saved audio to: {output_filename}")
            else:
                print(f"Skipping save for sample {i} as no audio data was generated.")


if __name__ == "__main__":
    main()
```
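The script writes one file per voice/prompt pair to `outputs-animespeech-ft/` as `voice_<id>_<i>.wav`, at 24 kHz to match the `snac_24khz` vocoder. If you are short on GPU memory, setting `LOAD_IN_4BIT = True` loads the language model in 4-bit, possibly at a small cost in output quality.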