ACE-Step

Runtime error

App Files Community

Sayoyo committed on May 13

Commit

96ec844

1 Parent(s): a8bbbf9

[feat] lora support

Browse files

Files changed (45) hide show

app.py +1 -0
config/zh_rap_lora_config.json +15 -0
data_sampler.py +14 -7
examples/{input_params → default/input_params}/output_20250426071706_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426071812_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426072346_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426072508_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426073829_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426074037_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426074214_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426074413_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426075107_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426075537_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426075843_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426080234_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426080407_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426080601_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426081134_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426092025_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426093007_0_input_params.json +0 -0
examples/{input_params → default/input_params}/output_20250426093146_0_input_params.json +0 -0
examples/input_params/output_20250426091716_0_input_params.json +0 -25
examples/zh_rap_lora/input_params/output_20250512101839_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512114703_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512115409_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512120348_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512143242_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512145057_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512152217_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512153616_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512154907_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512160830_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512161832_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512164224_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512171227_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512171809_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250512172941_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250513044511_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250513050200_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250513055451_0_input_params.json +45 -0
examples/zh_rap_lora/input_params/output_20250513060150_0_input_params.json +45 -0
pipeline_ace_step.py +37 -3
requirements.txt +5 -2
test.json +1 -0
ui/components.py +443 -67

app.py CHANGED Viewed

@@ -34,6 +34,7 @@ def main(args):
     demo = create_main_demo_ui(
         text2music_process_func=model_demo.__call__,
         sample_data_func=data_sampler.sample,
     )
     demo.queue(default_concurrency_limit=8).launch(

     demo = create_main_demo_ui(
         text2music_process_func=model_demo.__call__,
         sample_data_func=data_sampler.sample,
+        load_data_func=data_sampler.load_json,
     )
     demo.queue(default_concurrency_limit=8).launch(

config/zh_rap_lora_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "r": 256,
+    "lora_alpha": 32,
+    "target_modules": [
+        "speaker_embedder",
+        "linear_q",
+        "linear_k",
+        "linear_v",
+        "to_q",
+        "to_k",
+        "to_v",
+        "to_out.0"
+    ],
+    "use_rslora": true
+}

data_sampler.py CHANGED Viewed

@@ -3,21 +3,28 @@ from pathlib import Path
 import random
-DEFAULT_ROOT_DIR = "examples/input_params"
 class DataSampler:
     def __init__(self, root_dir=DEFAULT_ROOT_DIR):
         self.root_dir = root_dir
-        # glob
         self.input_params_files = list(Path(self.root_dir).glob("*.json"))
     def load_json(self, file_path):
         with open(file_path, "r", encoding="utf-8") as f:
             return json.load(f)
-    def sample(self):
-        json_path = random.choice(self.input_params_files)
-        json_data = self.load_json(json_path)
         return json_data

 import random
+DEFAULT_ROOT_DIR = "examples/default/input_params"
+ZH_RAP_LORA_ROOT_DIR = "examples/zh_rap_lora/input_params"
 class DataSampler:
     def __init__(self, root_dir=DEFAULT_ROOT_DIR):
         self.root_dir = root_dir
         self.input_params_files = list(Path(self.root_dir).glob("*.json"))
+        self.zh_rap_lora_input_params_files = list(Path(ZH_RAP_LORA_ROOT_DIR).glob("*.json"))
+        self.zh_rap_lora_input_params_files += list(Path(ZH_RAP_LORA_ROOT_DIR).glob("*.json"))
     def load_json(self, file_path):
         with open(file_path, "r", encoding="utf-8") as f:
             return json.load(f)
+    def sample(self, lora_name_or_path=None):
+        if lora_name_or_path is None or lora_name_or_path == "none":
+            json_path = random.choice(self.input_params_files)
+            json_data = self.load_json(json_path)
+        else:
+            json_path = random.choice(self.zh_rap_lora_input_params_files)
+            json_data = self.load_json(json_path)
+            # Update the lora_name in the json_data
+            json_data["lora_name_or_path"] = lora_name_or_path
         return json_data

examples/{input_params → default/input_params}/output_20250426071706_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426071812_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426072346_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426072508_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426073829_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426074037_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426074214_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426074413_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426075107_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426075537_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426075843_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426080234_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426080407_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426080601_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426081134_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426092025_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426093007_0_input_params.json RENAMED Viewed

File without changes

examples/{input_params → default/input_params}/output_20250426093146_0_input_params.json RENAMED Viewed

File without changes

examples/input_params/output_20250426091716_0_input_params.json DELETED Viewed

@@ -1,25 +0,0 @@
-{
-    "prompt": "anime, cute female vocals, kawaii pop, j-pop, childish, piano, guitar, synthesizer, fast, happy, cheerful, lighthearted",
-    "lyrics": "[Chorus]\nねぇ、顔が赤いよ？\nどうしたの？ 熱があるの？\nそれとも怒ってるの？\nねぇ、言ってよ！\n\nどうしてそんな目で見るの？\n私、悪いことした？\n何か間違えたの？\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge]\n目を閉じて、くるっと背を向けて、\n何も見なかったフリするから、\n怒らないで… 許してよ…\n\n[Chorus]\nねぇ、顔が赤いよ？\nどうしたの？ 熱があるの？\nそれとも怒ってるの？\nねぇ、言ってよ！\n\nどうしてそんな目で見るの？\n私、悪いことした？\n何か間違えたの？\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge 2]\n待って、もし私が悪いなら、\nごめんなさいって言うから、\nアイスクリームあげるから、\nもう怒らないで？\n\nOoooh… 言ってよ！",
-    "audio_duration": 160,
-    "infer_step": 60,
-    "guidance_scale": 15,
-    "scheduler_type": "euler",
-    "cfg_type": "apg",
-    "omega_scale": 10,
-    "guidance_interval": 0.5,
-    "guidance_interval_decay": 0,
-    "min_guidance_scale": 3,
-    "use_erg_tag": true,
-    "use_erg_lyric": true,
-    "use_erg_diffusion": true,
-    "oss_steps": [],
-    "timecosts": {
-        "preprocess": 0.0282442569732666,
-        "diffusion": 12.104875326156616,
-        "latent2audio": 1.587641954421997
-    },
-    "actual_seeds": [
-        4028738662
-    ]
-}

examples/zh_rap_lora/input_params/output_20250512101839_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "Rap, adult, male, spoken word, singing, bright, energetic, clear",
+    "lyrics": "[Intro]\n他们说我来自阴影里\n说我的肤色是原罪的印记\n\n[Verse]\n眼神像刀子刮过 穿透我的皮肤\n带着审判和偏见 让我无处可逃处\n你没听过我的故事 没走过我的路\n凭什么就下一个判决 把我划出你的版图\n你说我威胁到你 抢走了你的机会\n可你可知我付出的 是你不敢想象的血泪\n被贴上标签 被区别对待\n呼吸都是错的 只因我生来就不一样态\n\n[Chorus]\n看不见的墙 把我阻隔在外面\n听不见的声音 屏蔽了我的呼唤\n他们制造偏见 他们散播谎言\n只因为我的存在 让他们觉得不安\n\n[Verse]\n每一次努力争取 都会被审视被放大\n每一个细微的错误 都变成攻击的靶\n他们选择性失明 看不见我的汗水\n只看见他们想看的 带着恶意的定位\n系统性的歧视 像一张无形的网\n把我困在原地 无法自由地翱翔\n他们在享受特权 却指责我的贫困\n嘲笑我的口音 我的名字 我的出身\n\n[Chorus]\n看不见的墙 把我阻隔在外面\n听不见的声音 屏蔽了我的呼唤\n他们制造偏见 他们散播谎言\n只因为我的存在 让他们觉得不安\n\n[Bridge]\n我不想寻求同情 只想被公平对待\n不想被定义被束缚 有选择自己未来的权利\n什么时候 才能放下心中的成见\n看到真正的我 而不是你脑海里的画面\n\n[Outro]\n画面... 不安...\n偏见... 歧视...\n什么时候能停止...",
+    "audio_duration": 134.64,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.3,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.032018184661865234,
+        "diffusion": 13.275121927261353,
+        "latent2audio": 1.291429042816162
+    },
+    "actual_seeds": [
+        3826585269
+    ],
+    "retake_seeds": [
+        2907904223
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512101839_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512114703_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "Chorus Hook, Melodic Rap, Ambient Synth Pads, adult, rap, Very Fast, Storytelling, Chinese Rap, male, spoken word, bright, energetic, Melodic Flow, clear, clarity, 130 bpm",
+    "lyrics": "[Intro]\n舌 头 打 结 了... 快 念 快 念...\n\n[Verse 1]\n这 个 赌 鬼 蹲 在 柜 台 啃 着 苦 瓜 干 快 很 干\n赌 桌 堆 满 骨 牌 古 怪 股 票 和 五 块 钢 镚 儿 钢 镚\n他 甩 出 扑 克 牌 啪 啪 啪 拍 扁 螃 蟹 壳 哦 壳 扁\n又 摸 摸 麻 将 摸 出 幺 鸡 摸 出 发 财 摸 出 一 条 蛇 蛇 蛇\n庄 家 咳 嗽 咳 破 锣 嗓 子 喊 开 开 开 快 开 开\n赌 鬼 咕 嘟 咕 嘟 灌 咖 啡 灌 到 筷 子 戳 穿 碗 快 戳 穿\n空 气 里 飘 着 锅 巴 味 混 合 隔 夜 的 酸 奶 罐 哦 酸\n输 光 裤 带 还 想 翻 盘 翻 成 煎 饼 摊 老 板 快 翻 盘\n\n[Chorus]\n赌 鬼 赌 鬼 哦 赌 鬼 赌 鬼 快 很 快\n舌 头 打 结 着 念 这 段 哦 这 段 绕 口 令 牌\n若 念 错 一 字 就 罚 你 哦 罚 你 吞 十 斤 海 带\n赌 场 规 矩 就 是 绕 晕 你 哦 绕 晕 你 快 很 快\n\n[Verse 2]\n他 掏 出 铜 板 抠 出 口 袋 最 后 一 颗 快 很 颗\n庄 家 哗 啦 哗 啦 摇 骰 子 摇 出 三 点 又 三 点 哦 三 点\n赌 鬼 急 得 咬 牙 切 齿 咬 到 舌 头 打 蝴 蝶 结 快 打 结\n还 想 押 上 祖 传 的 拖 鞋 拖 把 铁 锅 和 半 包 盐 盐 盐\n突 然 警 笛 嘀 嘟 嘀 嘟 吓 得 他 钻 进 垃 圾 罐 哦 垃 圾\n警 察 咔 嚓 咔 嚓 拍 照 拍 到 他 头 顶 菠 菜 叶 快 拍 照\n最 后 赌 鬼 蹲 监 狱 天 天 背 这 首 绕 口 令 哦 背 不 完\n若 背 错 一 句 就 加 刑 十 年 再 加 十 年 快 加 刑\n\n[Outro]\n舌 头 打 结 了... 赌 鬼 哭 了 哦...\n这 首 歌... 绕 死 人 了 哦...",
+    "audio_duration": 186.59997916666666,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.7,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.03011012077331543,
+        "diffusion": 21.696259260177612,
+        "latent2audio": 1.7648537158966064
+    },
+    "actual_seeds": [
+        3776541388
+    ],
+    "retake_seeds": [
+        4274500599
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512114703_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512115409_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "electronic, hip-hop, rap, synthesizer, drums, vocals, fast, energetic, modern, uplifting, young adult, male, spoken word, singing, bright, energetic, clear, 140 bpm, female",
+    "lyrics": "[Verse 1]\n红鲤鱼绿鲤鱼，驴在河里追鲤鱼，\n驴追鲤鱼鱼躲驴，气得驴子直喘气。\n扁担长板凳宽，扁担绑在板凳边，\n扁担要绑板凳不让绑，扁担偏要绑上板凳面！\n\n[Chorus]\n绕口令，练嘴皮，\n说快说慢别迟疑，\n红鲤鱼驴扁担板凳，\n一口气念完算你赢！\n\n[Verse 2]\n四是四十是十，十四是十四四十是四十，\n谁说四十是十四，舌头打结别放肆。\n黑化肥会挥发，灰化肥也发黑，\n化肥混一起，黑灰不分嘴发废！\n\n[Chorus]\n绕口令，练嘴皮，\n说快说慢别迟疑，\n四十十四化肥灰，\n念错罚你唱十回！\n\n[Bridge]\n坡上立着一只鹅，坡下流着一条河，\n鹅要过河河渡鹅，河要渡鹅鹅笑河——\n到底谁更啰嗦？！\n\n[Outro]\n嘴皮子功夫别小瞧，\n绕口令rap我最飙，\n下次挑战准备好，\n舌头打结别求饶！",
+    "audio_duration": 123.2,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.7,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.026150941848754883,
+        "diffusion": 12.212433099746704,
+        "latent2audio": 1.1857895851135254
+    },
+    "actual_seeds": [
+        1415752189
+    ],
+    "retake_seeds": [
+        685932970
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512115409_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512120348_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "singing, bright, slightly nasal, energetic, spoken word, young adult, male, rap music",
+    "lyrics": "[Intro]\nYo, check it—speed demon, lyrical heat, uh!\nRatatat like a drum when the beat bumps, uh!\n\n[Verse 1]\nRapatapa tap tap, flash like a snap,\nRap tap tap, I don’t chat, I clap clap clap!\nFingers snap, flow don’t slack, rapataptaptap,\nSpit it fast, hit the gas, rap tap tap rap!\n\n[Pre-Chorus]\nBoom-bap, zoom past, leave ’em flat,\nRap taptaprapataptaptap—where ya at?\n\n[Chorus]\nRapatapa tap tap, yeah, I go brrrr,\nRap tap tap, make the crowd stir!\nRapataptaptap, no lag, just spit,\nRap taptaprapataptaptap—I’m lit!\n\n[Verse 2]\nTongue-twist, quick wrist, rapatapa boom,\nTap tap rap, leave ya stuck like glue-gum!\nNo slow-mo, turbo, rapataptaptap,\nRap tap rap, yeah, I clap clap clap!\n\n[Outro]\nRapatapa—TAP! Mic drop—that’s that.",
+    "audio_duration": 60,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.018491744995117188,
+        "diffusion": 8.084580898284912,
+        "latent2audio": 0.5694489479064941
+    },
+    "actual_seeds": [
+        721655639
+    ],
+    "retake_seeds": [
+        1603201617
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512120348_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512143242_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "ACE-Step/ACE-Step-v1-chinese-rap-LoRA",
+    "task": "text2music",
+    "prompt": "G-Funk, Hip Hop, Rap, Female Vocals, Melodic Rap, Summer, Laid-back Groove, Smooth Rhythm, Synthesizer Lead, Heavy Bassline, Groovy, West Coast Hip Hop",
+    "lyrics": "(Intro)\nOh yeah... \n\n(Verse 1)\n阳光下，沙滩排球场，一个身影跳跃\n小麦色，运动背心，闪耀活力四射\n她跳起扣杀，动作利落又巧妙\n汗水浸湿发梢，笑容比阳光更美好\n摇摆的节奏，是她的背景配乐\n每一次移动，都踩在鼓点上那么和谐\n我不由自主地停下脚步\n目光被她紧紧锁住\n\n(Chorus)\n沙滩排球女孩， 摇摆节拍下的身材\n无忧无虑的笑容，把我的心都填满\n想走上前去搭讪，嫌自己笨拙呆板\n这青春的气息，耀眼，灿烂！\n\n(Verse 3)\n她和队友击掌庆祝，笑声清脆悦耳\n拿起毛巾擦汗，不经意间瞥我一眼\n鼓起勇气走上前，假装问问时间\n她友好地回答，笑容灿烂没有敷衍\n聊了几句，发现彼此爱这摇摆音乐\n她眼中也闪过惊喜和亲切\n这共同点，让气氛变得融洽又热烈！\n夏天的故事，就这样开始了感觉真切！\n\n(Chorus)\n沙滩排球女孩， 摇摆节拍下的身材\n无忧无虑的笑容，把我的心都填满\n不再犹豫和等待，勇敢把脚步迈开\n这夏天的感觉，心跳，不断！",
+    "audio_duration": 93.93038,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.03020024299621582,
+        "diffusion": 9.942127704620361,
+        "latent2audio": 0.9470341205596924
+    },
+    "actual_seeds": [
+        3826585299
+    ],
+    "retake_seeds": [
+        2519711205
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512143242_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512145057_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "lyrical rap, young adult, female, rap flow, spoken word, ad-libs, bright, energetic, eat, Fast, Engaging, Energetic",
+    "lyrics": "[Intro]\n扁擔寬 板凳長 扁擔想綁在板凳上\n扁擔寬 板凳長 扁擔想綁在板凳上\n\n[Verse]\n倫敦 瑪莉蓮 買了 件 旗袍 送 媽媽\n莫斯科 的 夫司基 愛上 牛肉 麵 疙瘩\n各種 顏色 的 皮膚 各種 顏色 的 頭髮\n嘴裡念的 說的 開始 流行 中國話 (中國話)\n\n[Bridge]\n多少年 我們 苦練 英文 發音 和 文法 (yeah)\n這幾年 換他們 捲著 舌頭 學 平上去入 的 變化\n平平 仄仄 平平 仄\n好聰明 的 中國人 好優美 的 中國話\n\n[Verse]\n扁擔寬 板凳長 扁擔想綁在板凳上\n板凳不讓扁擔綁在板凳上 扁擔偏要綁在板凳上\n板凳偏偏不讓扁擔綁在那板凳上\n到底扁擔寬 還是板凳長？\n\n[Verse]\n哥哥弟弟坡前坐\n坡上臥著一隻鵝 坡下流著一條河\n哥哥說 寬寬的河 弟弟說 白白的鵝\n鵝要過河 河要渡鵝\n不知是那鵝過河 還是河渡鵝\n\n[Chorus]\n全世界都在學中國話\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 讓世界都認真聽話\n\n[Verse]\n紐約蘇珊娜開了間禪風 lounge bar\n柏林來的沃夫岡拿胡琴配著電吉他\n各種顏色的皮膚 各種顏色的頭髮\n嘴裡念的 說的 開始流行中國話 (中國話)\n\n[Bridge]\n多少年我們苦練英文發音和文法 (yeah)\n這幾年換他們捲著舌頭學平上去入的變化\n仄仄平平仄仄平\n好聰明的中國人 好優美的中國話\n\n[Verse]\n有個小孩叫小杜 上街打醋又買布\n買了布 打了醋 回頭看見鷹抓兔\n放下布 擱下醋 上前去追鷹和兔\n飛了鷹 跑了兔 灑了醋 濕了布\n\n[Verse]\n嘴說腿 腿說嘴\n嘴說腿 愛跑腿\n腿說嘴 愛賣嘴\n光動嘴 不動腿\n光動腿 不動嘴\n不如不長腿和嘴\n到底是那嘴說腿 還是腿說嘴？\n\n[Chorus]\n全世界都在學中國話\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 讓世界都認真聽話\n\n[outro]\n全世界都在學中國話 (在學中國話)\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 (讓他) 讓世界 (認真) 都認真聽話",
+    "audio_duration": 239.8355625,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.04363536834716797,
+        "diffusion": 18.706920385360718,
+        "latent2audio": 2.1645781993865967
+    },
+    "actual_seeds": [
+        2364345905
+    ],
+    "retake_seeds": [
+        2100914041
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512145057_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512152217_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "articulate, spoken word, young adult, warm, rap music, male, clear, street, dark, rap flow, hardcore rap",
+    "lyrics": "[verse]\n球场 的 橡胶味 弥漫 隔壁 是 健身房\n场 边上 的 老教练 战术 有 三套\n教 交叉 运球 的 大叔 会 欧洲步 耍 背后 传\n硬 身板 对抗 最 擅长 还 会 急停跳 后仰 投\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[chorus]\n看什么 看什么\n变速 突破 心 自在\n看什么 看什么\n假动作 晃 开 防守 来\n看什么 看什么\n每日 训练 绑 沙袋\n空中拉杆 莫 奇怪\n唰唰 入袋\n\n[verse]\n一个 试探 步后 一记 左 变向 右 变向\n一句 挑衅 我 的 人 别 嚣张\n一再 重演 一颗 我 不 投 的 球\n悬在 篮筐 上 它 一直 在 摇晃\n\n[chorus]\n看什么 看什么\n我 激活 小宇宙 来\n看什么 看什么\n菜鸟 新人 的 名号\n看什么 看什么\n已 被 我 一球 击倒\n\n[chorus]\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n篮球 之 人 切记 勇者 无惧\n是 谁 在 玩 花式 引爆 空气\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n如果 我 有 滞空 逆天 补扣\n为人 热血 不怂 一生 傲骨 吼\n\n[verse]\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[outro]\n快 秀出 指尖 转球 砰\n快 秀出 指尖 转球 砰\n如果 我 有 滞空 吼\n为人 热血 不怂 一生 傲骨 吼\n快 秀出 指尖 转球 砰\n我 用 背传 助攻 吼\n压哨 的 三分 球",
+    "audio_duration": 239.8355625,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.05357813835144043,
+        "diffusion": 25.644447326660156,
+        "latent2audio": 2.1787476539611816
+    },
+    "actual_seeds": [
+        3246571430
+    ],
+    "retake_seeds": [
+        1352325167
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512152217_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512153616_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "articulate, spoken word, young adult, warm, rap music, male, clear, street, dark, rap flow, hardcore rap, fast",
+    "lyrics": "[verse]\n球场 的 橡胶味 弥漫 隔壁 是 健身房\n场 边上 的 老教练 战术 有 三套\n教 交叉 运球 的 大叔 会 欧洲步 耍 背后 传\n硬 身板 对抗 最 擅长 还 会 急停跳 后仰 投\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[chorus]\n看什么 看什么\n变速 突破 心 自在\n看什么 看什么\n假动作 晃 开 防守 来\n看什么 看什么\n每日 训练 绑 沙袋\n空中拉杆 莫 奇怪\n唰唰 入袋\n\n[verse]\n一个 试探 步后 一记 左 变向 右 变向\n一句 挑衅 我 的 人 别 嚣张\n一再 重演 一颗 我 不 投 的 球\n悬在 篮筐 上 它 一直 在 摇晃\n\n[chorus]\n看什么 看什么\n我 激活 小宇宙 来\n看什么 看什么\n菜鸟 新人 的 名号\n看什么 看什么\n已 被 我 一球 击倒\n\n[chorus]\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n篮球 之 人 切记 勇者 无惧\n是 谁 在 玩 花式 引爆 空气\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n如果 我 有 滞空 逆天 补扣\n为人 热血 不怂 一生 傲骨 吼\n\n[verse]\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[outro]\n快 秀出 指尖 转球 砰\n快 秀出 指尖 转球 砰\n如果 我 有 滞空 吼\n为人 热血 不怂 一生 傲骨 吼\n快 秀出 指尖 转球 砰\n我 用 背传 助攻 吼\n压哨 的 三分 球",
+    "audio_duration": 183.23,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.046170711517333984,
+        "diffusion": 14.21678113937378,
+        "latent2audio": 2.685957193374634
+    },
+    "actual_seeds": [
+        3072005931
+    ],
+    "retake_seeds": [
+        562842491
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512153616_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512154907_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "articulate, spoken word, young adult, rap music, female, clear, energetic, warm",
+    "lyrics": "[Intro]\n\"System booting... 语言 模型 loading...\"\n\n[Verse 1]\n硅谷 那个 coder 调试 neural network\n北京 的 极客 训练 A I 写 report\n不同 架构 的 chip 不同 算法 的 war\n屏幕上 跑的 全是 machine learning (learning)\n\n[Bridge]\n多少年 我们 chase 摩尔 定律 的 trend (yeah)\n这两年 换他们 study 中文 N L P\nConvolution L S T M\n好烧脑 的 backprop 好暴力 的 big data\n\n[Verse 2]\nPython 强 say加加 刚 Python 调用 C++ 的 A P I\nsay加加 嫌 Python 太 slow Python 笑 C++ 太 hardcore\nL L V M 默默 generate 中间 code\n到底 interpreter 还是 compiler 屌？\n\n[Verse 3]\nP M 和 engineer\n白板 画满 flow chart 服务器 闪着 red light\nP M 说 add feature engineer 说 no way\n需求 变更 code 重构\n不知 是 P M 太 fly 还是 deadline 太 high\n\n[Chorus]\n全世界 都在 train neural network\nTransformer 的 paper 越来越 难 go through\n全世界 都在 tune 超参数\n我们 写的 bug 让 G P U 都 say no\n\n[Verse 4]\n柏林 hackathon demo blockchain contract\n上海 的 dev 用 federated learning 破 data wall\n各种 语言 的 error 各种 框架 的 doc\nterminal 里 滚的 全是 dependency 冲突\n\n[Bridge]\n曾以为 English 才是 coding 的 language (yeah)\n直到见 G P T 用 文言文 generate 正则 expression\nGradient explode\n好硬核 的 prompt 好头秃 的 debug road\n\n[Verse 5]\n有个 bug 叫 quantum\n测试 环境 run perfect 上线 立即就 crash\n查 log 看 monitor 发现是 thread 不同步\n改 sync 加 lock 慢 deadlock 更难办\n量子 computer 也解不开 这 chaos chain\n\n[Verse 6]\n你说 996 我说 007\n你说 福报 我说 burnout\nProduct 要 agile Boss 要 KPI\nCode 要 elegant deadline 是 tomorrow\n不如 直接 script 自动 submit 离职信\n\n[Outro]\n\"Warning: 内存 leak...core dumping...\"\n全世界 都在 train neural network (neural network)\nLoss 还没 converge 天已经亮\n全世界 都在 tune 超参数\n我们 写的 code (让它) 让 world (reboot) 都 reboot 无效",
+    "audio_duration": 179.12,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.062120914459228516,
+        "diffusion": 13.499217987060547,
+        "latent2audio": 1.6430137157440186
+    },
+    "actual_seeds": [
+        1637990575
+    ],
+    "retake_seeds": [
+        101283039
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512154907_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512160830_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "articulate, spoken word, young adult, rap music, male, clear, energetic, warm, relaxed, breathy, night club",
+    "lyrics": "[verse]\n这 这 谁 又 在 派 对 喝 多\n我 的 脑 袋\n像 被 驴 踢 过\n不 对 劲\n舌 头 打 结 不 会 说\n你 来 挑 战 我 就 跪\n开 局 直 接 崩 溃\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草！\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来！\n\n[verse]\n这 这 谁 又 在 派 对 丢 人\n我 的 世 界\n已 经 彻 底 崩 溃\n没 有 完 美\n只 有 翻 车 现 场\n以 及 观 众 的 嘲 讽\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草！\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来！",
+    "audio_duration": 169.12,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.041605472564697266,
+        "diffusion": 14.009192705154419,
+        "latent2audio": 1.55946946144104
+    },
+    "actual_seeds": [
+        547563805
+    ],
+    "retake_seeds": [
+        2702917060
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512160830_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512161832_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "articulate, spoken word, young adult, rap music, male, clear, energetic, warm, relaxed, breathy, night club, auto-tune, mumble rap, trap",
+    "lyrics": "[verse]\n这 这 谁 又 在 派 对 喝 多\n我 的 脑 袋\n像 被 驴 踢 过\n不 对 劲\n舌 头 打 结 不 会 说\n你 来 挑 战 我 就 跪\n开 局 直 接 崩 溃\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草！\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来！\n\n[verse]\n这 这 谁 又 在 派 对 丢 人\n我 的 世 界\n已 经 彻 底 崩 溃\n没 有 完 美\n只 有 翻 车 现 场\n以 及 观 众 的 嘲 讽\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草！\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来！",
+    "audio_duration": 169.12,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.04321885108947754,
+        "diffusion": 14.026689767837524,
+        "latent2audio": 1.5587565898895264
+    },
+    "actual_seeds": [
+        1905941472
+    ],
+    "retake_seeds": [
+        3018484796
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512161832_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512164224_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "四川话, spoken word, male, Tempo - Fast, Elements - Chorus Hook, Subgenre-Satirical Hip Hop, Rap, Chinese-language music, energetic, slightly nasal, Instrument - Live Bass Guitar, adult, Vocals - Syncopated Flow, Genre - Hip-Hop, rapping, bright",
+    "lyrics": "[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[verse]\n老子 在 弄堂 斜起 走 想 拦路 的 先 报 名号\n我 早看透 你们 手抖 脚软\n只敢 网上 吠 现实 怂成 猫\n看 你们 混的 真 可怜 整天 蹲在 网吧 蹭 烟\n钱 赚不到 架 不敢打 还 学人 摆 大哥 脸\n\n[verse]\n叫 我 沪上 老 克勒 不是 拉菲 我 不 碰杯\n规矩 我 懒得 讲 太多 钞票 直接 拍 你 脸上 飞\n老子 耐心 差 门槛 高 你 找茬 等于 自 寻 烦恼\n要么 跪 要么 爬 最后 警告 只 说 一 遭\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[verse]\n古巴 雪茄 在 指间 绕 代表 魔都 格调 必须 顶\nOG 在 你 够不到 的 高度 My bro 永远 在 顶层 盯\nCheck my vibe 不靠 大 金劳 留声机 放 周璇 和 白光\n爹妈 太 宠你 养出 巨婴 症 早晚 社会 教你 做人 经\n\n[verse]\n玩 说唱 小囡 太 年轻 要 比 flow 先去 练 气功\n廿年 磨 枪 才 亮 锋芒 我 三十六 招 收 你 入 瓮\n老子 存在 就是 打假 标\n多少 人 眼红 又 不敢 挑\n键盘 侠 的 狠话 像 棉花 糖\n见 真人 秒变 Hello Kitty 叫\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗",
+    "audio_duration": 135.92,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.038518667221069336,
+        "diffusion": 16.47420620918274,
+        "latent2audio": 2.5094873905181885
+    },
+    "actual_seeds": [
+        2159904788
+    ],
+    "retake_seeds": [
+        2403013980
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512164224_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512171227_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "ACE-Step/ACE-Step-v1-chinese-rap-LoRA",
+    "task": "text2music",
+    "prompt": "Rap, Chinese Rap, J-Pop, Anime, kawaii pop, EDM, Aggressive, Intense, Crisp Snare, Super Fast, Clear",
+    "lyrics": "(Intro)\nLet's drift away...\n\n(Verse 1)\n现实是灰色的格子间，重复的工作，枯燥的报表 \n敲打着键盘，眼神却放空，意识早已挣脱了肉体的镣铐\n飘向窗外，飞过拥挤的街道，穿过云层，到达想象的群岛\n那里色彩斑斓，形状奇异，逻辑失效，一切都随心所欲地飘摇\n迷幻的鼓点，像心跳的变奏，忽快忽慢，难以预料\n抽象的采样，扭曲的人声，构建一个超现实的音景环绕\n我变成一只鸟，一条鱼，一束光，自由地变换形态和奔跑\n在这白日梦里，我无所不能，摆脱了所有现实的烦恼， feeling the afterglow\n\n(Chorus)\n意识漫游，逃离乏味的轨道 \n迷幻嘻哈的节拍，是白日梦的引导 \n抽象的世界，逻辑被重新构造\nMind wandering free, where reality starts to fade slow\n\n(Verse 2)\n会议室里老板在讲话，声音模糊，像隔着水听不清道\n我的思绪，早已潜入深海，与发光的水母一起舞蹈\n或者飞向外太空，在星云间穿梭，探索未知的星球和轨道\n现实的规则，在这里被打破，物理定律也失去效劳\n白日梦是我的避难所，是精神的氧气罩\n在乏味的现实里，为我注入一点色彩和奇妙\n虽然短暂，虽然虚幻，但它让我能够喘息，重新把能量找到\n然后回到现实，继续扮演那个，循规蹈矩的角色，把梦藏好， keep the dream aglow\n\n(Chorus)\n意识漫游，逃离乏味的轨道\n迷幻嘻哈的节拍，是白日梦的引导\n抽象的世界，逻辑被重新构造\nMind wandering free, where reality starts to fade slow\n",
+    "audio_duration": 153.7148,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.04823446273803711,
+        "diffusion": 13.158645629882812,
+        "latent2audio": 1.493880033493042
+    },
+    "actual_seeds": [
+        2945962357
+    ],
+    "retake_seeds": [
+        2676242300
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0.7,
+    "guidance_scale_lyric": 1.5,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512171227_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512171809_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora",
+    "task": "text2music",
+    "prompt": "J-Pop, Anime, kawaii future bass, Femal vocals, EDM, Boombap, Aggressive, Intense, Crisp Snare, Super Fast, Rap",
+    "lyrics": "[Intro]\nYo, 这是来自深渊的怒吼\n\n[Verse]\n指尖飞快刷新，屏幕又亮起\n渴望那点赞，像致命的氧气\n精心修饰的脸庞，完美到诡异\n背后隐藏的疲惫，谁又会在意\n光鲜亮丽的橱窗，贩卖着焦虑\n每个人都在表演，戴着虚伪面具\n比较的游戏，让人逐渐窒息\n迷失在数据洪流，找不到自己\n\n[Chorus]\n这流量的时代，真假早已分不清\n盲目追随潮流，丢掉了初心\n为了那点虚荣，灵魂在沉沦\n看不见的锁链，捆绑每个灵魂\n\n[Verse]\n滤镜下的生活，美得不切实际\n营造虚假繁荣，掩盖内心空虚\n他人的光环下，显得自己多余\n嫉妒和自卑，交织成悲剧\n\n[Chorus]\n朋友圈里炫耀，现实中却叹气\n刷着别人的故事，忘记了呼吸\n算法推荐着你，想看的一切东西\n不知不觉间，你已不再是你\n他们说这是进步，我看是种病\n精神鸦片侵蚀，慢慢要了你的命\n\n[Bridge]\n屏幕亮了又暗，一天又过去\n究竟得到了什么，还是失去了自己\n那真实的连接，在何处寻觅\n困在这迷宫里，找不到出口的轨迹\n\n[Outro]\n我想挣脱，我想呼吸\n这虚拟的繁华，让我喘不过气\n谁能告诉我，这到底有什么意义\n一切都像泡沫，一触就破裂没余地",
+    "audio_duration": 119.44348,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.04764962196350098,
+        "diffusion": 10.94297981262207,
+        "latent2audio": 1.1815783977508545
+    },
+    "actual_seeds": [
+        3826585273
+    ],
+    "retake_seeds": [
+        2527594022
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512171809_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250512172941_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
+    "task": "text2music",
+    "prompt": "Hip Hop, Hi-hat Rolls, spoken word, Melodic Flow, articulate, Female Rap, 120 BPM, clear, warm, female, melodic Rap, adult, super fast",
+    "lyrics": "[Verse 1]\n打南边来了个喇嘛，手里提拉着五斤鳎目，\n打北边来了个哑巴，腰里别着个喇叭。\n喇嘛想换哑巴的喇叭，哑巴摇头不说话，\n鳎目一甩像道闪电，喇叭一响震天涯！\n\n[Chorus]\n丁丁当当，乒乓乓乓，\n话赶话，舌绕梁，\n东边的钉，西边的墙，\n绕不完的弯，唱不完的慌！\n\n[Verse 2]\n墙上一根钉，钉下绳摇晃，\n绳吊着瓶，瓶碰碎了光。\n灯骂瓶，瓶怪绳，绳怨钉，\n稀里哗啦，一场荒唐！\n\n[Chorus]\n丁丁当当，乒乓乓乓，\n话赶话，舌绕梁，\n东边的钉，西边的墙，\n绕不完的弯，唱不完的慌！\n\n[Verse 3]\n板凳宽，扁担长，\n一个偏要绑，一个偏不让。\n青龙洞里龙翻身，\n千年大梦变稻香！\n\n[Bridge]\n麻婆婆的狗，咬破麻叉口，\n麻线穿针眼，补丁也风流。\n左一句，右一句，\n舌头打结心自由！\n\n[Chorus]\n丁丁当当，乒乓乓乓，\n话赶话，舌绕梁，\n东边的钉，西边的墙，\n绕不完的弯，唱不完的慌！",
+    "audio_duration": 214.12,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.031190156936645508,
+        "diffusion": 20.130417823791504,
+        "latent2audio": 1.9650826454162598
+    },
+    "actual_seeds": [
+        1946426111
+    ],
+    "retake_seeds": [
+        331383387
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250512172941_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250513044511_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k",
+    "task": "text2music",
+    "prompt": "东北话, spoken word, male, Tempo - Fast, Elements - Chorus Hook, Subgenre-Satirical Hip Hop, Rap, Chinese-language music, energetic, slightly nasal, Instrument - Live Bass Guitar, adult, Vocals - Syncopated Flow, Genre - Hip-Hop, rapping, bright",
+    "lyrics": "[verse]\n挣着 憋屈的 工资 还得 装乐呵\n猫着 怂样儿 还搁 朋友圈 嘚瑟\n扛着 傻逼的 指标 没人 搭把手\n这儿 不是 托儿所 少整 那出儿 哭唧尿嚎\n\n俺们 就像 一条条 老板的 裤衩子\n陪着 笑脸 接他 每一回 突突\n哎呦 老板 今儿个 穿我呗\n他 撅个腚 眼角 瞟你 那熊样\n\n[chorus]\n他们 骂我 打工仔 太多人 没睡醒\n寻思 抠搜 老板 一天天 穷折腾\n不想 俺的 人生 烂在 这嘎达\n不想 俺的 将来 折在 这破棚\n\n老子 不想 上班 老子 是外星人\n你都 把俺 骂急眼了 俺还 这么淡定\n现实 才是 梦 啥时候 能醒啊\n那 糟践人的 答案 在西北风 里飘\n\n[verse]\n瞅见 二愣子 同事 给老板 舔腚沟子\n瞅见 浪蹄子 女同事 在老板 胯骨轴 扭搭\n瞅见 白瞎的 光阴 耗在 没亮儿的 道儿\n瞅见 公交车上 一帮 僵尸 吐酸水\n\n瞅见 俺的 命 定在 苦逼的 坑里\n瞅见 俺的 爱情 被轮了 成了 老处女\n瞅见 好事儿 全归 高富帅\n还有 那些 臭不要脸 扭腚的 货色\n\n[chorus]（重复）\n他们 骂我 打工仔 太多人 没睡醒...\n\n[bridge]\n加班 没补助 俺认了\n欠薪 揍员工 把俺 当牲口\n去你妈 的小姘头\n\n[verse]\n破逼 管理制度 净整 娱乐八卦\n撸管式 管理 也就 你自己 嗨\n出点儿 屁事儿 就往 下属 脑瓜子 扣\n挣俩 钢镚儿 立马 牛逼 不分 公母\n\n你挖个 大坑 把俺们 往里 踹\n说这 叫梦想 你当年 多能耐\n俺们 就当 听传销 洗脑课\n可怜 连骗人 你都 就会 这一套\n\n[outro]\n老子 不想 上班\n老子 不想 上班\n老子 不想 上班",
+    "audio_duration": 135.92,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.06204533576965332,
+        "diffusion": 35.75483560562134,
+        "latent2audio": 1.5193355083465576
+    },
+    "actual_seeds": [
+        4176354214
+    ],
+    "retake_seeds": [
+        601086915
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250513044511_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250513050200_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k",
+    "task": "text2music",
+    "prompt": "Rap, J-Pop, Anime, kawaii pop, EDM, Aggressive, Intense, Crisp Snare, Super Fast, Clear",
+    "lyrics": "[Intro]\nNya.\n\n[Verse]\n我 在 五 点 二 十 早 起，十 三 点 十 四 弹 会儿 琴\n习 惯 了 坐 班，习惯了 隔夜 的  剩 饭，\n习 惯 了 没有 你\n\n[Verse]\n怕 你 想 不 开，拦 在 你 的 面 前\n那 时 候 摔 得 差 点 住 院\n东 京 的 春 天 莺 莺 燕 燕\n我 说 想 不 想 来 跟 我 玩 音乐\n\n[Verse]\n带 着 我 的 朋 友 守 在 你 的 门 口\n弹 着 我 的 钢 琴 当 伴 奏\n等 你 放 学 后，陪 你   K   T   V\n端 着 我 的 红 茶 跟 你 碰 杯\n\n[Pre-Chorus]\n忽然间现实淹没了远方\n万家灯火，盖住月光\n奔走，忍受，变成了人偶\n别再对我伸出你的 双 手，会 受 伤\n\n[Chorus]\n明明都向前走，方向却渐渐不同\n时间让你我越走越近，却越来越陌生\n春 天 在 滂 沱 的 大 雨 里 飘 落\n得 了 心 太 高 脸 太 薄 的病\n\n[Bridge]\n我越难过，春日影越顶\n眼泪晃得我看不清\n埋葬了懦弱还有矫情\n却还是会在半夜摸眼睛\n\n青春期大部分时间在工 作\n用微笑换来余额几个零\n戴上了面具也明白了生活\n拼的是数字和脸更是命\n\n[Verse]\n我在五点二十早起，十三点十四弹会琴\n早上要做饭，回家时满地的瓶罐\n\n师 徒 二 人 站 在 我 的 面 前\n台 词 很 熟 练，照 着 就 念\n\n背 后 的 小 睦 扭 扭 捏 捏\n我 说 我 还 有 点 事 要 不 改 天 见\n\n然 后 你 的 双手 握 住 我 的 袖 口\n开 始 哭 着 求 我 不 要 走\n\n[Verse]\n我在下班后，忙活柴米油\n你和你的姐妹住着高楼\n\n苦 来 兮 苦，早 就 没 了\n现 实 扬 鞭，赶 着 我 向 前\n没有时间跟你分辨什么对与错\n\n[Bridge]\n没有什么对错，没有罪过\n谁不曾天真，是我太早看破\n生活一片狼藉，却又不想放弃\n一 边 聚 光 灯 下 绽 放，一 边 坠 落\n故作坚强，筑起心的墙\n越是委屈的伤口，越要藏\nLet it all out， it’s all right\n\n[Outro]\n俺 是 东 京 嘞，东 京 打 工 妹\n\n从虎之门带你转到浅草\n再从新宿转到竹桥\n\n俺 是 东 京 嘞，东 京 打 工 妹\n\n带 你 转 羽田 成田 蒲田 神田\n做 你 嘞 小 甜 甜！\n\n俺 是 东 京 嘞，东 京 打 工 妹\n带 你 转 赤 坂，带 你 转 霞 关\n恁 咋 不 早 说，今 天 不 管 饭\n",
+    "audio_duration": 147.62212,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.052134037017822266,
+        "diffusion": 17.909283876419067,
+        "latent2audio": 1.4904146194458008
+    },
+    "actual_seeds": [
+        2945962357
+    ],
+    "retake_seeds": [
+        2252292438
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0.7,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250513050200_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250513055451_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k",
+    "task": "text2music",
+    "prompt": "Rap, adult, male, spoken word, rapping, clear, warm, articulate, Lo-Fi Hip Hop, 100-120 BPM, Keyboard Chords, Male Rap, Lazy Rhythm, Melancholy, Rap",
+    "lyrics": "[Intro]\n夜色 很 淡 像 褪色 的 照片  \n但 记忆 却 像 刀锋 一样 锐利  \n\n[Verse 1]\n你 说过 的 甜言蜜语 现在 听来 像 最 恶毒 的 咒骂  \n你 刺进 我 心里 的 刀 现在 还 在 滴血 未 干 哪  \n慵懒 的 旋律 像 我 的 脚步 拖着 沉重 的 躯壳  \n脑海 里 循环 播放 那 画面 快 把 我 逼疯 了  \n键盘 和弦 低沉 又 忧伤 弹奏 着 我 的 绝望  \n我 曾经 的 信任 像 玻璃 一样 被 你 狠狠 地 摔 在 地上  \n不想 振作 不想 原谅 只 想 让 这 一切 都 停止  \n可 心底 有 个 声音 嘶吼 着 要 你 付出 该 有 的 代价  \n\n[Chorus]\n背叛 像 毒药 渗透 我 的 血液  \n复仇 的 火焰 在 我 眼中 燃起  \n哪怕 遍体鳞伤 哪怕 万劫不复  \n我 也 要 亲手 撕碎 你 的 幸福  \n这 是 我 的 哀歌 也 是 我 的 战书  \n键盘 的 音符 每 一下 都 带着 恨意 和 痛苦  \n\n[Verse 2]\n曾经 的 兄弟 现在 面目全非 像 个 陌生人  \n你 的 自私 像 癌细胞 一点点 吞噬 我 的 纯真  \n我 学着 你 的 样子 把 心 锁 起来 不再 轻易 相信  \n让 懒散 的 节奏 包裹 我 给 自己 一点 喘息  \n键盘 的 音色 变得 更加 阴冷 像 秋天 的 雨滴  \n冲刷 掉 所有 温情 只 剩下 彻骨 的 寒意  \n我 不会 大喊大叫 只是 默默 地 计划  \n每 一步 都 走向 让 你 后悔 的 那 一 刹那  \n\n[Chorus]\n背叛 像 毒药 渗透 我 的 血液  \n复仇 的 火焰 在 我 眼中 燃起  \n哪怕 遍体鳞伤 哪怕 万劫不复  \n我 也 要 亲手 撕碎 你 的 幸福  \n这 是 我 的 哀歌 也 是 我 的 战书  \n键盘 的 音符 每 一下 都 带着 恨意 和 痛苦  \n\n[Bridge]\n也许 复仇 不能 带来 平静  \n也许 只 会 让 我 更 堕落  \n但 如果 不 这样 做  \n我 连 活下去 的 勇气 都 没有  \n\n[Outro]\n复仇 复仇 复仇  \n直到 最后 一刻  \n懒散 地 复仇 着  ",
+    "audio_duration": 202.64,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.65,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.036400794982910156,
+        "diffusion": 23.055809259414673,
+        "latent2audio": 1.8787360191345215
+    },
+    "actual_seeds": [
+        3900061002
+    ],
+    "retake_seeds": [
+        3037373819
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250513055451_0.wav"
+}

examples/zh_rap_lora/input_params/output_20250513060150_0_input_params.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k",
+    "task": "text2music",
+    "prompt": "Orchestra, Symphony, Sonata, Opera, Concerto, Rap, Beat, DJ, MC, StreetCulture",
+    "lyrics": "[verse1]\n羊皮卷轴 墨香飘 莫扎特 熬 安魂曲 通宵  \n和弦齿轮 咔哒转 比 瑞士 手表 更 精密 律动  \n八轨磁带 玩叠叠乐 披头士 炸 录音棚 天花板  \nAI 卷起 新风暴 像 灭霸 打响指 般 简单  \n\n[chorus]\n琴弦 到 代码 进化论 狂飙（skr）  \n象牙塔 被 鼠标 点爆 像 泡泡（boom）  \n灵感 加 算法 等于 王炸 大招  \n人类 心跳 才是 终极 混音 调料  \n\n[verse2]\n春之祭 召唤 百人 乐团 才够 燥  \n合成器 极客 玩电焊 焊出 赛博 神庙  \nDAW 解放 双手 钢琴卷帘 变 乐高  \n音色库 开挂 像 吃 金币 的 马里奥  \n\nAI 拆解 爵士乐 黑话 像 庖丁 解牛  \nCityPop 复古 滤镜 直接 参数 调油  \n神经网络 偷师 贝多芬 半夜 翻墙头  \n音乐 基因库 被 改写成 超频 万花筒  \n\n[chorus]  \n琴弦 到 代码 进化论 狂飙（skr）  \n象牙塔 被 鼠标 点爆 像 泡泡（boom）  \n灵感 加 算法 等于 王炸 大招  \n人类 心跳 才是 终极 混音 调料  \n\n[verse3]  \n电子琴 被 吐槽 塑料 味 超标  \n卧室 制作人 用 鼠标 单挑 整个 乐团 编制  \nAI 伴奏 刚上线 就被 键盘侠 集火  \n却 忘了 电吉他 曾被 说 是 魔鬼 的 副歌  \n\n现在 我 指尖 蹦迪 在 数据 炼丹炉  \n提示词 召唤 莫扎特 跨次元 碰杯 珍珠奶茶  \n当 比特 海洋 淹没 所有 物理 琴柱  \n最后 的 音轨 永远 连着 心脏 的 跳针  \n\n[bridge]  \n鹅毛笔 蘸着 银河 当 墨汁（绝了）  \n音浪 在 元宇宙 开 分店（疯了）  \n技术 迷雾 散成 像素 烟花  \n而 我们 始终 带着 老派 的 心跳 混搭  \n\n[chorus]  \n琴弦 到 代码 进化论 狂飙（skr）  \n象牙塔 被 鼠标 点爆 像 泡泡（boom）  \n灵感 加 算法 等于 王炸 大招  \n人类 心跳 才是 终极 混音 调料  \n\n[outro]  \n从 蒸汽 到 硅基 浪潮 我 冲浪（yo）  \n用 脑洞 接住 每个 技术 暴击（叮）  \n当 所有 设备 没电 的 凌晨 三点钟  \n最 原始 的 旋律 在 胸腔 敲击 成 龙卷风  ",
+    "audio_duration": 172.64,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.65,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": false,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 3.648996353149414,
+        "diffusion": 16.44967818260193,
+        "latent2audio": 1.614703893661499
+    },
+    "actual_seeds": [
+        1198023141
+    ],
+    "retake_seeds": [
+        3389016134
+    ],
+    "retake_variance": 0.5,
+    "guidance_scale_text": 0,
+    "guidance_scale_lyric": 0,
+    "repaint_start": 0,
+    "repaint_end": 0,
+    "edit_n_min": 0.0,
+    "edit_n_max": 1.0,
+    "edit_n_avg": 1,
+    "src_audio_path": null,
+    "edit_target_prompt": null,
+    "edit_target_lyrics": null,
+    "audio2audio_enable": false,
+    "ref_audio_strength": 0.5,
+    "ref_audio_input": null,
+    "audio_path": "./outputs/output_20250513060150_0.wav"
+}

pipeline_ace_step.py CHANGED Viewed

@@ -9,7 +9,7 @@ from loguru import logger
 from tqdm import tqdm
 import json
 import math
-from huggingface_hub import hf_hub_download
 # from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
@@ -63,7 +63,6 @@ class ACEStepPipeline:
             else:
                 checkpoint_dir = os.path.join(persistent_storage_path, "checkpoints")
         ensure_directory_exists(checkpoint_dir)
         self.checkpoint_dir = checkpoint_dir
         device = torch.device(f"cuda:{device_id}") if torch.cuda.is_available() else torch.device("cpu")
         if device.type == "cpu" and torch.backends.mps.is_available():
@@ -74,6 +73,22 @@ class ACEStepPipeline:
         self.device = device
         self.loaded = False
         self.torch_compile = torch_compile
     def load_checkpoint(self, checkpoint_dir=None):
         device = self.device
@@ -976,6 +991,10 @@ class ACEStepPipeline:
         oss_steps: str = None,
         guidance_scale_text: float = 0.0,
         guidance_scale_lyric: float = 0.0,
         retake_seeds: list = None,
         retake_variance: float = 0.5,
         task: str = "text2music",
@@ -1000,7 +1019,7 @@ class ACEStepPipeline:
             self.load_checkpoint(self.checkpoint_dir)
             load_model_cost = time.time() - start_time
             logger.info(f"Model loaded in {load_model_cost:.2f} seconds.")
         start_time = time.time()
         random_generators, actual_seeds = self.set_seeds(batch_size, manual_seeds)
@@ -1053,6 +1072,14 @@ class ACEStepPipeline:
             assert os.path.exists(src_audio_path), f"src_audio_path {src_audio_path} does not exist"
             src_latents = self.infer_latents(src_audio_path)
         if task == "edit":
             texts = [edit_target_prompt]
             target_encoder_text_hidden_states, target_text_attention_mask = self.get_text_embeddings(texts, self.device)
@@ -1117,6 +1144,9 @@ class ACEStepPipeline:
                 repaint_start=repaint_start,
                 repaint_end=repaint_end,
                 src_latents=src_latents,
             )
         end_time = time.time()
@@ -1139,6 +1169,7 @@ class ACEStepPipeline:
         }
         input_params_json = {
             "task": task,
             "prompt": prompt if task != "edit" else edit_target_prompt,
             "lyrics": lyrics if task != "edit" else edit_target_lyrics,
@@ -1169,6 +1200,9 @@ class ACEStepPipeline:
             "src_audio_path": src_audio_path,
             "edit_target_prompt": edit_target_prompt,
             "edit_target_lyrics": edit_target_lyrics,
         }
         # save input_params_json
         for output_audio_path in output_paths:

 from tqdm import tqdm
 import json
 import math
+from huggingface_hub import hf_hub_download, snapshot_download
 # from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
             else:
                 checkpoint_dir = os.path.join(persistent_storage_path, "checkpoints")
         ensure_directory_exists(checkpoint_dir)
         self.checkpoint_dir = checkpoint_dir
         device = torch.device(f"cuda:{device_id}") if torch.cuda.is_available() else torch.device("cpu")
         if device.type == "cpu" and torch.backends.mps.is_available():
         self.device = device
         self.loaded = False
         self.torch_compile = torch_compile
+        self.lora_path = "none"
+    def load_lora(self, lora_name_or_path):
+        if lora_name_or_path != self.lora_path and lora_name_or_path != "none":
+            if not os.path.exists(lora_name_or_path):
+                lora_download_path = snapshot_download(lora_name_or_path, cache_dir=self.checkpoint_dir)
+            else:
+                lora_download_path = lora_name_or_path
+            if self.lora_path != "none":
+                self.ace_step_transformer.unload_lora()
+            self.ace_step_transformer.load_lora_adapter(os.path.join(lora_download_path, "pytorch_lora_weights.safetensors"), adapter_name="zh_rap_lora", with_alpha=True)
+            logger.info(f"Loading lora weights from: {lora_name_or_path} download path is: {lora_download_path}")
+            self.lora_path = lora_name_or_path
+        elif self.lora_path != "none" and lora_name_or_path == "none":
+            logger.info("No lora weights to load.")
+            self.ace_step_transformer.unload_lora()
     def load_checkpoint(self, checkpoint_dir=None):
         device = self.device
         oss_steps: str = None,
         guidance_scale_text: float = 0.0,
         guidance_scale_lyric: float = 0.0,
+        audio2audio_enable: bool = False,
+        ref_audio_strength: float = 0.5,
+        ref_audio_input: str = None,
+        lora_name_or_path: str = "none",
         retake_seeds: list = None,
         retake_variance: float = 0.5,
         task: str = "text2music",
             self.load_checkpoint(self.checkpoint_dir)
             load_model_cost = time.time() - start_time
             logger.info(f"Model loaded in {load_model_cost:.2f} seconds.")
+        self.load_lora(lora_name_or_path)
         start_time = time.time()
         random_generators, actual_seeds = self.set_seeds(batch_size, manual_seeds)
             assert os.path.exists(src_audio_path), f"src_audio_path {src_audio_path} does not exist"
             src_latents = self.infer_latents(src_audio_path)
+        ref_latents = None
+        if ref_audio_input is not None and audio2audio_enable:
+            assert ref_audio_input is not None, "ref_audio_input is required for audio2audio task"
+            assert os.path.exists(
+                ref_audio_input
+            ), f"ref_audio_input {ref_audio_input} does not exist"
+            ref_latents = self.infer_latents(ref_audio_input)
         if task == "edit":
             texts = [edit_target_prompt]
             target_encoder_text_hidden_states, target_text_attention_mask = self.get_text_embeddings(texts, self.device)
                 repaint_start=repaint_start,
                 repaint_end=repaint_end,
                 src_latents=src_latents,
+                audio2audio_enable=audio2audio_enable,
+                ref_audio_strength=ref_audio_strength,
+                ref_latents=ref_latents,
             )
         end_time = time.time()
         }
         input_params_json = {
+            "lora_name_or_path": lora_name_or_path,
             "task": task,
             "prompt": prompt if task != "edit" else edit_target_prompt,
             "lyrics": lyrics if task != "edit" else edit_target_lyrics,
             "src_audio_path": src_audio_path,
             "edit_target_prompt": edit_target_prompt,
             "edit_target_lyrics": edit_target_lyrics,
+            "audio2audio_enable": audio2audio_enable,
+            "ref_audio_strength": ref_audio_strength,
+            "ref_audio_input": ref_audio_input,
         }
         # save input_params_json
         for output_audio_path in output_paths:

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 datasets==3.4.1
 diffusers==0.32.2
-gradio==5.23.3
 librosa==0.11.0
 loguru==0.7.3
 matplotlib==3.10.1
@@ -11,7 +11,7 @@ soundfile==0.13.1
 torch
 torchaudio
 torchvision
-tqdm==4.67.1
 transformers==4.50.0
 py3langid==0.3.0
 hangul-romanize==0.1.0
@@ -20,3 +20,6 @@ spacy==3.8.4
 accelerate==1.6.0
 cutlet
 fugashi[unidic-lite]

 datasets==3.4.1
 diffusers==0.32.2
+gradio
 librosa==0.11.0
 loguru==0.7.3
 matplotlib==3.10.1
 torch
 torchaudio
 torchvision
+tqdm
 transformers==4.50.0
 py3langid==0.3.0
 hangul-romanize==0.1.0
 accelerate==1.6.0
 cutlet
 fugashi[unidic-lite]
+peft
+tensorboard
+tensorboardX

test.json ADDED Viewed

	@@ -0,0 +1 @@

+ {'id': 'gen-1746104947-KKBqwgZ992wxV5m4PZga', 'provider': 'Google', 'model': 'google/gemini-2.5-flash-preview', 'object': 'chat.completion', 'created': 1746104947, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'native_finish_reason': 'STOP', 'index': 0, 'message': {'role': 'assistant', 'content': '```json\n[\n {\n "id": "1",\n "lyrics": "[Intro]\\nYo\\nCheck the mic, one two\\n你们欠我的总要还\\n[Chorus]\\n欠我的总要还\\n别想着逃跑\\n我来了带着我的flow\\n就像一场风暴\\n别再假装不知道\\n你们的演技太糟\\n今天我就让你知道\\n什么叫做报应到\\n[Verse 1]\\n还记得那天吗\\n我的付出你们全抛下\\n踩着我的头\\n往上爬\\n现在轮到我\\n来收账啦\\n那些虚伪的嘴脸\\n还在对我笑\\n却不知道报应\\n已经在敲门了\\n每一个背叛\\n都刻在我的心上\\n今天就是你们\\n付出代价的时候\\n[Pre-Chorus]\\n别求饶\\n别哭叫\\n这一切都是你们\\n自找的\\n[Chorus]\\n欠我的总要还\\n别想着逃跑\\n我来了带着我的flow\\n就像一场风暴\\n别再假装不知道\\n你们的演技太糟\\n今天我就让你知道\\n什么叫做报应到\\n[Outro]\\nHahaha\\nPayback time\\n你们跑不掉的\\n永远",\n "tags": "Rap, Hip Hop, Boombap, Ambient Synth Pads, Humorous, Revenge"\n },\n {\n "id": "120",\n "lyrics": "[Intro]\\nYeah\\nLet\'s take it back\\n回望过去的光景\\n[Verse 1]\\n曾经的街道\\n熟悉的味道\\n那些画面\\n在脑海里环绕\\n年轻的我们\\n充满着梦想\\n以为世界\\n就在我们的手掌\\n那段时光\\n单纯又美好\\n虽然 sometimes 很苦\\n但是我们都在笑\\n那些面孔\\n有些已不再联系\\n但他们的故事\\n依然在我记忆里\\n[Chorus]\\n回望过去\\n就像一部电影\\n有好有坏\\n有哭也有笑声\\n那些经历\\n塑造了今天的我\\n感谢一切\\n让我变得成熟\\n[Verse 2]\\n记得第一次心碎\\n在雨中流泪\\n记得第一次成功\\n那种喜悦滋味\\n记得那些争吵\\n也记得和解的拥抱\\n每一次跌倒\\n都教会我如何 STAND TALL\\n那些友情\\n那些爱情\\n都是我生命中\\n最珍贵的风景\\n[Bridge]\\n时间流逝\\n我们都在改变\\n但有些东西\\n永远不会变\\n初心还在\\n梦想还在\\n带着过去的经验\\n走向未来\\n[Chorus]\\n回望过去\\n就像一部电影\\n有好有坏\\n有哭也有笑声\\n那些经历\\n塑造了今天的我\\n感谢一切\\n让我变得成熟\\n[Outro]\\nLooking back\\nNever forget\\n那些日子\\n永远在我心里\\nYeah",\n "tags": "Rap, Hip Hop, Jazz Hop, Fast Tempo, Synth Lead, Traditional Instrument Samples, Storytelling, Looking Back"\n },\n {\n "id": "210",\n "lyrics": "[Intro]\\nYo\\nLet\'s go\\nParty started!\\n[Verse 1]\\n节奏快到爆炸\\n我的 Flow 像火箭一样发射\\n麦克风在我手里\\n就是我的武器\\n每一个字都像子弹\\n精准地击中你\\n从不放慢速度\\n只有一路狂飙\\n派对动物在欢呼\\n气氛越来越高潮\\n汗水湿透了衣裳\\n精力无限释放\\n今晚不回家\\n玩到天亮\\n[Chorus]\\n派对时间到了\\n跟着我的节奏摇摆\\n忘记所有烦恼\\n今晚我们主宰舞台\\n音乐声震耳欲聋\\n点燃激情和冲动\\n我们 unstoppable\\n像 electric 
guitar riff 的作用\\n[Verse 2]\\n别傻站着\\n加入我们的人潮\\n尽情扭动\\n释放你的 SIGNAL\\n不用在意别人眼光\\n做最真实的自己\\n这个夜晚\\n属于你的胜利\\n我的歌声在你耳边\\n就像电流穿梭\\n让你感受到力量\\n让你全身都着火\\n[Chorus]\\n派对时间到了\\n跟着我的节奏摇摆\\n忘记所有烦恼\\n今晚我们主宰舞台\\n音乐声震耳欲聋\\n点燃激情和冲动\\n我们 unstoppable\\n像 electric guitar riff 的作用\\n[Bridge]\\n从黑夜到黎明\\n我们的能量不会停\\n一直在前进\\n unstoppable 势不可挡\\n[Outro]\\nYeah\\nThat\'s right\\nParty never ends\\nLet\'s get it!",\n "tags": "Rap, Hip Hop, Fast Tempo, Chopper Flow, Sampled Vocal Hook, Electric Guitar Riff, Motivational, Chill, Relaxed, Party Scene"\n }\n]\n```', 'refusal': None, 'reasoning': None}}], 'usage': {'prompt_tokens': 1212, 'completion_tokens': 1121, 'total_tokens': 2333}}

ui/components.py CHANGED Viewed

@@ -1,5 +1,14 @@
 import gradio as gr
 import librosa
 TAG_DEFAULT = "funk, pop, soul, rock, melodic, guitar, drums, bass, keyboard, percussion, 105 BPM, energetic, upbeat, groovy, vibrant, dynamic"
@@ -40,7 +49,25 @@ Catch the tune and hold it tight
 In this moment we take flight
 """
 def create_output_ui(task_name="Text2Music"):
@@ -63,46 +90,229 @@ def create_text2music_ui(
     gr,
     text2music_process_func,
     sample_data_func=None,
 ):
     with gr.Row():
         with gr.Column():
             with gr.Row(equal_height=True):
                 # add markdown, tags and lyrics examples are from ai music generation community
-                audio_duration = gr.Slider(-1, 240.0, step=0.00001, value=-1, label="Audio Duration", interactive=True, info="-1 means random duration (30 ~ 240).", scale=9)
-                sample_bnt = gr.Button("Sample", variant="primary", scale=1)
-            prompt = gr.Textbox(lines=2, label="Tags", max_lines=4, value=TAG_DEFAULT, info="Support tags, descriptions, and scene. Use commas to separate different tags.\ntags and lyrics examples are from ai music generation community")
-            lyrics = gr.Textbox(lines=9, label="Lyrics", max_lines=13, value=LYRIC_DEFAULT, info="Support lyric structure tags like [verse], [chorus], and [bridge] to separate different parts of the lyrics.\nUse [instrumental] or [inst] to generate instrumental music. Not support genre structure tag in lyrics")
             with gr.Accordion("Basic Settings", open=False):
-                infer_step = gr.Slider(minimum=1, maximum=60, step=1, value=27, label="Infer Steps", interactive=True)
-                guidance_scale = gr.Slider(minimum=0.0, maximum=200.0, step=0.1, value=15.0, label="Guidance Scale", interactive=True, info="When guidance_scale_lyric > 1 and guidance_scale_text > 1, the guidance scale will not be applied.")
-                guidance_scale_text = gr.Slider(minimum=0.0, maximum=10.0, step=0.1, value=0.0, label="Guidance Scale Text", interactive=True, info="Guidance scale for text condition. It can only apply to cfg. set guidance_scale_text=5.0, guidance_scale_lyric=1.5 for start")
-                guidance_scale_lyric = gr.Slider(minimum=0.0, maximum=10.0, step=0.1, value=0.0, label="Guidance Scale Lyric", interactive=True)
-                manual_seeds = gr.Textbox(label="manual seeds (default None)", placeholder="1,2,3,4", value=None, info="Seed for the generation")
             with gr.Accordion("Advanced Settings", open=False):
-                scheduler_type = gr.Radio(["euler", "heun"], value="euler", label="Scheduler Type", elem_id="scheduler_type", info="Scheduler type for the generation. euler is recommended. heun will take more time.")
-                cfg_type = gr.Radio(["cfg", "apg", "cfg_star"], value="apg", label="CFG Type", elem_id="cfg_type", info="CFG type for the generation. apg is recommended. cfg and cfg_star are almost the same.")
-                use_erg_tag = gr.Checkbox(label="use ERG for tag", value=True, info="Use Entropy Rectifying Guidance for tag. It will multiple a temperature to the attention to make a weaker tag condition and make better diversity.")
-                use_erg_lyric = gr.Checkbox(label="use ERG for lyric", value=True, info="The same but apply to lyric encoder's attention.")
-                use_erg_diffusion = gr.Checkbox(label="use ERG for diffusion", value=True, info="The same but apply to diffusion model's attention.")
-                omega_scale = gr.Slider(minimum=-100.0, maximum=100.0, step=0.1, value=10.0, label="Granularity Scale", interactive=True, info="Granularity scale for the generation. Higher values can reduce artifacts")
-                guidance_interval = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.5, label="Guidance Interval", interactive=True, info="Guidance interval for the generation. 0.5 means only apply guidance in the middle steps (0.25 * infer_steps to 0.75 * infer_steps)")
-                guidance_interval_decay = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.0, label="Guidance Interval Decay", interactive=True, info="Guidance interval decay for the generation. Guidance scale will decay from guidance_scale to min_guidance_scale in the interval. 0.0 means no decay.")
-                min_guidance_scale = gr.Slider(minimum=0.0, maximum=200.0, step=0.1, value=3.0, label="Min Guidance Scale", interactive=True, info="Min guidance scale for guidance interval decay's end scale")
-                oss_steps = gr.Textbox(label="OSS Steps", placeholder="16, 29, 52, 96, 129, 158, 172, 183, 189, 200", value=None, info="Optimal Steps for the generation. But not test well")
             text2music_bnt = gr.Button("Generate", variant="primary")
         with gr.Column():
             outputs, input_params_json = create_output_ui()
             with gr.Tab("retake"):
-                retake_variance = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.2, label="variance")
-                retake_seeds = gr.Textbox(label="retake seeds (default None)", placeholder="", value=None)
                 retake_bnt = gr.Button("Retake", variant="primary")
                 retake_outputs, retake_input_params_json = create_output_ui("Retake")
@@ -124,13 +334,22 @@ def create_text2music_ui(
                         json_data["use_erg_lyric"],
                         json_data["use_erg_diffusion"],
                         ", ".join(map(str, json_data["oss_steps"])),
-                        json_data["guidance_scale_text"] if "guidance_scale_text" in json_data else 0.0,
-                        json_data["guidance_scale_lyric"] if "guidance_scale_lyric" in json_data else 0.0,
                         retake_seeds=retake_seeds,
                         retake_variance=retake_variance,
                         task="retake",
                     )
                 retake_bnt.click(
                     fn=retake_process_func,
                     inputs=[
@@ -141,15 +360,46 @@ def create_text2music_ui(
                     outputs=retake_outputs + [retake_input_params_json],
                 )
             with gr.Tab("repainting"):
-                retake_variance = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.2, label="variance")
-                retake_seeds = gr.Textbox(label="repaint seeds (default None)", placeholder="", value=None)
-                repaint_start = gr.Slider(minimum=0.0, maximum=240.0, step=0.01, value=0.0, label="Repaint Start Time", interactive=True)
-                repaint_end = gr.Slider(minimum=0.0, maximum=240.0, step=0.01, value=30.0, label="Repaint End Time", interactive=True)
-                repaint_source = gr.Radio(["text2music", "last_repaint", "upload"], value="text2music", label="Repaint Source", elem_id="repaint_source")
-                repaint_source_audio_upload = gr.Audio(label="Upload Audio", type="filepath", visible=False, elem_id="repaint_source_audio_upload")
                 repaint_source.change(
-                    fn=lambda x: gr.update(visible=x == "upload", elem_id="repaint_source_audio_upload"),
                     inputs=[repaint_source],
                     outputs=[repaint_source_audio_upload],
                 )
@@ -187,9 +437,7 @@ def create_text2music_ui(
                     if repaint_source == "upload":
                         src_audio_path = repaint_source_audio_upload
                         audio_duration = librosa.get_duration(filename=src_audio_path)
-                        json_data = {
-                            "audio_duration": audio_duration
-                        }
                     elif repaint_source == "text2music":
                         json_data = text2music_json_data
                         src_audio_path = json_data["audio_path"]
@@ -222,6 +470,7 @@ def create_text2music_ui(
                         repaint_start=repaint_start,
                         repaint_end=repaint_end,
                         src_audio_path=src_audio_path,
                     )
                 repaint_bnt.click(
@@ -258,11 +507,33 @@ def create_text2music_ui(
             with gr.Tab("edit"):
                 edit_prompt = gr.Textbox(lines=2, label="Edit Tags", max_lines=4)
                 edit_lyrics = gr.Textbox(lines=9, label="Edit Lyrics", max_lines=13)
-                retake_seeds = gr.Textbox(label="edit seeds (default None)", placeholder="", value=None)
-                edit_type = gr.Radio(["only_lyrics", "remix"], value="only_lyrics", label="Edit Type", elem_id="edit_type", info="`only_lyrics` will keep the whole song the same except lyrics difference. Make your diffrence smaller, e.g. one lyrc line change.\nremix can change the song melody and genre")
-                edit_n_min = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.6, label="edit_n_min", interactive=True)
-                edit_n_max = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=1.0, label="edit_n_max", interactive=True)
                 def edit_type_change_func(edit_type):
                     if edit_type == "only_lyrics":
@@ -276,13 +547,26 @@ def create_text2music_ui(
                 edit_type.change(
                     edit_type_change_func,
                     inputs=[edit_type],
-                    outputs=[edit_n_min, edit_n_max]
                 )
-                edit_source = gr.Radio(["text2music", "last_edit", "upload"], value="text2music", label="Edit Source", elem_id="edit_source")
-                edit_source_audio_upload = gr.Audio(label="Upload Audio", type="filepath", visible=False, elem_id="edit_source_audio_upload")
                 edit_source.change(
-                    fn=lambda x: gr.update(visible=x == "upload", elem_id="edit_source_audio_upload"),
                     inputs=[edit_source],
                     outputs=[edit_source_audio_upload],
                 )
@@ -321,9 +605,7 @@ def create_text2music_ui(
                     if edit_source == "upload":
                         src_audio_path = edit_source_audio_upload
                         audio_duration = librosa.get_duration(filename=src_audio_path)
-                        json_data = {
-                            "audio_duration": audio_duration
-                        }
                     elif edit_source == "text2music":
                         json_data = text2music_json_data
                         src_audio_path = json_data["audio_path"]
@@ -362,6 +644,7 @@ def create_text2music_ui(
                         edit_n_min=edit_n_min,
                         edit_n_max=edit_n_max,
                         retake_seeds=retake_seeds,
                     )
                 edit_bnt.click(
@@ -397,14 +680,43 @@ def create_text2music_ui(
                     outputs=edit_outputs + [edit_input_params_json],
                 )
             with gr.Tab("extend"):
-                extend_seeds = gr.Textbox(label="extend seeds (default None)", placeholder="", value=None)
-                left_extend_length = gr.Slider(minimum=0.0, maximum=240.0, step=0.01, value=0.0, label="Left Extend Length", interactive=True)
-                right_extend_length = gr.Slider(minimum=0.0, maximum=240.0, step=0.01, value=30.0, label="Right Extend Length", interactive=True)
-                extend_source = gr.Radio(["text2music", "last_extend", "upload"], value="text2music", label="Extend Source", elem_id="extend_source")
-                extend_source_audio_upload = gr.Audio(label="Upload Audio", type="filepath", visible=False, elem_id="extend_source_audio_upload")
                 extend_source.change(
-                    fn=lambda x: gr.update(visible=x == "upload", elem_id="extend_source_audio_upload"),
                     inputs=[extend_source],
                     outputs=[extend_source_audio_upload],
                 )
@@ -442,9 +754,7 @@ def create_text2music_ui(
                         src_audio_path = extend_source_audio_upload
                         # get audio duration
                         audio_duration = librosa.get_duration(filename=src_audio_path)
-                        json_data = {
-                            "audio_duration": audio_duration
-                        }
                     elif extend_source == "text2music":
                         json_data = text2music_json_data
                         src_audio_path = json_data["audio_path"]
@@ -479,6 +789,7 @@ def create_text2music_ui(
                         repaint_start=repaint_start,
                         repaint_end=repaint_end,
                         src_audio_path=src_audio_path,
                     )
                 extend_bnt.click(
@@ -512,8 +823,7 @@ def create_text2music_ui(
                     outputs=extend_outputs + [extend_input_params_json],
                 )
-        def sample_data():
-            json_data = sample_data_func()
             return (
                 json_data["audio_duration"],
                 json_data["prompt"],
@@ -531,12 +841,73 @@ def create_text2music_ui(
                 json_data["use_erg_lyric"],
                 json_data["use_erg_diffusion"],
                 ", ".join(map(str, json_data["oss_steps"])),
-                json_data["guidance_scale_text"] if "guidance_scale_text" in json_data else 0.0,
-                json_data["guidance_scale_lyric"] if "guidance_scale_lyric" in json_data else 0.0,
             )
         sample_bnt.click(
             sample_data,
             outputs=[
                 audio_duration,
                 prompt,
@@ -556,6 +927,9 @@ def create_text2music_ui(
                 oss_steps,
                 guidance_scale_text,
                 guidance_scale_lyric,
             ],
         )
@@ -580,13 +954,19 @@ def create_text2music_ui(
             oss_steps,
             guidance_scale_text,
             guidance_scale_lyric,
-        ], outputs=outputs + [input_params_json]
     )
 def create_main_demo_ui(
     text2music_process_func=dump_func,
     sample_data_func=dump_func,
 ):
     with gr.Blocks(
         title="ACE-Step Model 1.0 DEMO",
@@ -594,18 +974,14 @@ def create_main_demo_ui(
         gr.Markdown(
             """
             <h1 style="text-align: center;">ACE-Step: A Step Towards Music Generation Foundation Model</h1>
-            <p>
-                <a href="https://ace-step.github.io/">Project</a> |
-                <a href="https://huggingface.co/ACE-Step/ACE-Step-v1-3.5B">Checkpoints</a> |
-                <a href="https://discord.gg/rjAZz2xBdG">Discord</a>
-            </p>
-        """)
         with gr.Tab("text2music"):
             create_text2music_ui(
                 gr=gr,
                 text2music_process_func=text2music_process_func,
                 sample_data_func=sample_data_func,
             )
     return demo

+"""
+ACE-Step: A Step Towards Music Generation Foundation Model
+https://github.com/ace-step/ACE-Step
+Apache 2.0 License
+"""
 import gradio as gr
 import librosa
+import os
 TAG_DEFAULT = "funk, pop, soul, rock, melodic, guitar, drums, bass, keyboard, percussion, 105 BPM, energetic, upbeat, groovy, vibrant, dynamic"
 In this moment we take flight
 """
+# Genre presets: each key is a preset display name and each value is the
+# comma-separated tag string written into the "Tags" textbox when that preset
+# is picked. The keys, in insertion order, are listed after "Custom" in the
+# preset dropdown, so reordering entries reorders the dropdown.
+GENRE_PRESETS = {
+    "Modern Pop": "pop, synth, drums, guitar, 120 bpm, upbeat, catchy, vibrant, female vocals, polished vocals",
+    "Rock": "rock, electric guitar, drums, bass, 130 bpm, energetic, rebellious, gritty, male vocals, raw vocals",
+    "Hip Hop": "hip hop, 808 bass, hi-hats, synth, 90 bpm, bold, urban, intense, male vocals, rhythmic vocals",
+    "Country": "country, acoustic guitar, steel guitar, fiddle, 100 bpm, heartfelt, rustic, warm, male vocals, twangy vocals",
+    "EDM": "edm, synth, bass, kick drum, 128 bpm, euphoric, pulsating, energetic, instrumental",
+    "Reggae": "reggae, guitar, bass, drums, 80 bpm, chill, soulful, positive, male vocals, smooth vocals",
+    "Classical": "classical, orchestral, strings, piano, 60 bpm, elegant, emotive, timeless, instrumental",
+    "Jazz": "jazz, saxophone, piano, double bass, 110 bpm, smooth, improvisational, soulful, male vocals, crooning vocals",
+    "Metal": "metal, electric guitar, double kick drum, bass, 160 bpm, aggressive, intense, heavy, male vocals, screamed vocals",
+    "R&B": "r&b, synth, bass, drums, 85 bpm, sultry, groovy, romantic, female vocals, silky vocals"
+}
+# Callback for the preset dropdown: translates the selected preset name into its tag string.
+def update_tags_from_preset(preset_name):
+    """Return the tag string for the selected genre preset.
+
+    "Custom" yields an empty string, as does any name that is not a key
+    of GENRE_PRESETS.
+    """
+    if preset_name != "Custom":
+        # Unknown preset names fall back to an empty tag string.
+        return GENRE_PRESETS.get(preset_name, "")
+    return ""
 def create_output_ui(task_name="Text2Music"):
     gr,
     text2music_process_func,
     sample_data_func=None,
+    load_data_func=None,
 ):
+    with gr.Row(equal_height=True):
+        curr_file_dir = os.path.dirname(__file__)
+        output_file_dir = os.path.join(curr_file_dir, "..", "..", "outputs")
+        json_files = [f for f in os.listdir(output_file_dir) if f.endswith('.json')]
+        json_files.sort(reverse=True, key=lambda x: int(x.split('_')[1]))
+        output_files = gr.Dropdown(choices=json_files, label="Select previous generated input params", scale=9, interactive=True)
+        load_bnt = gr.Button("Load", variant="primary", scale=1)
     with gr.Row():
         with gr.Column():
             with gr.Row(equal_height=True):
                 # add markdown, tags and lyrics examples are from ai music generation community
+                audio_duration = gr.Slider(
+                    -1,
+                    240.0,
+                    step=0.00001,
+                    value=-1,
+                    label="Audio Duration",
+                    interactive=True,
+                    info="-1 means random duration (30 ~ 240).",
+                    scale=9,
+                )
+                sample_bnt = gr.Button("Sample", variant="secondary", scale=1)
+            # audio2audio
+            with gr.Row(equal_height=True):
+                audio2audio_enable = gr.Checkbox(label="Enable Audio2Audio", value=False, info="Check to enable Audio-to-Audio generation using a reference audio.", elem_id="audio2audio_checkbox")
+                lora_name_or_path = gr.Dropdown(
+                    label="Lora Name or Path",
+                    choices=["ACE-Step/ACE-Step-v1-chinese-rap-LoRA", "none"],
+                    value="none",
+                    allow_custom_value=True,
+                )
+            ref_audio_input = gr.Audio(type="filepath", label="Reference Audio (for Audio2Audio)", visible=False, elem_id="ref_audio_input", show_download_button=True)
+            ref_audio_strength = gr.Slider(
+                label="Refer audio strength",
+                minimum=0.0,
+                maximum=1.0,
+                step=0.01,
+                value=0.5,
+                elem_id="ref_audio_strength",
+                visible=False,
+                interactive=True,
+            )
+            def toggle_ref_audio_visibility(is_checked):
+                """Build visibility updates for the reference-audio widgets.
+
+                Returns a pair of gr.update objects, one for the reference
+                audio input and one for the strength slider, both with
+                visible set to the checkbox state.
+                """
+                audio_input_update = gr.update(
+                    visible=is_checked, elem_id="ref_audio_input"
+                )
+                strength_update = gr.update(
+                    visible=is_checked, elem_id="ref_audio_strength"
+                )
+                return (audio_input_update, strength_update)
+            audio2audio_enable.change(
+                fn=toggle_ref_audio_visibility,
+                inputs=[audio2audio_enable],
+                outputs=[ref_audio_input, ref_audio_strength],
+            )
+            with gr.Column(scale=2):
+                with gr.Group():
+                    gr.Markdown("""<center>Support tags, descriptions, and scene. Use commas to separate different tags.<br>Tags and lyrics examples are from AI music generation community.</center>""")
+                    with gr.Row():
+                        genre_preset = gr.Dropdown(
+                            choices=["Custom"] + list(GENRE_PRESETS.keys()),
+                            value="Custom",
+                            label="Preset",
+                            scale=1,
+                        )
+                        prompt = gr.Textbox(
+                            lines=1,
+                            label="Tags",
+                            max_lines=4,
+                            value=TAG_DEFAULT,
+                            scale=9,
+                        )
+            # Add the change event for the preset dropdown
+            genre_preset.change(
+                fn=update_tags_from_preset,
+                inputs=[genre_preset],
+                outputs=[prompt]
+            )
+            with gr.Group():
+                gr.Markdown("""<center>Support lyric structure tags like [verse], [chorus], and [bridge] to separate different parts of the lyrics.<br>Use [instrumental] or [inst] to generate instrumental music. Not support genre structure tag in lyrics</center>""")
+                lyrics = gr.Textbox(
+                    lines=9,
+                    label="Lyrics",
+                    max_lines=13,
+                    value=LYRIC_DEFAULT,
+                )
             with gr.Accordion("Basic Settings", open=False):
+                infer_step = gr.Slider(
+                    minimum=1,
+                    maximum=200,
+                    step=1,
+                    value=60,
+                    label="Infer Steps",
+                    interactive=True,
+                )
+                guidance_scale = gr.Slider(
+                    minimum=0.0,
+                    maximum=30.0,
+                    step=0.1,
+                    value=15.0,
+                    label="Guidance Scale",
+                    interactive=True,
+                    info="When guidance_scale_lyric > 1 and guidance_scale_text > 1, the guidance scale will not be applied.",
+                )
+                guidance_scale_text = gr.Slider(
+                    minimum=0.0,
+                    maximum=10.0,
+                    step=0.1,
+                    value=0.0,
+                    label="Guidance Scale Text",
+                    interactive=True,
+                    info="Guidance scale for text condition. It can only apply to cfg. set guidance_scale_text=5.0, guidance_scale_lyric=1.5 for start",
+                )
+                guidance_scale_lyric = gr.Slider(
+                    minimum=0.0,
+                    maximum=10.0,
+                    step=0.1,
+                    value=0.0,
+                    label="Guidance Scale Lyric",
+                    interactive=True,
+                )
+                manual_seeds = gr.Textbox(
+                    label="manual seeds (default None)",
+                    placeholder="1,2,3,4",
+                    value=None,
+                    info="Seed for the generation",
+                )
             with gr.Accordion("Advanced Settings", open=False):
+                scheduler_type = gr.Radio(
+                    ["euler", "heun"],
+                    value="euler",
+                    label="Scheduler Type",
+                    elem_id="scheduler_type",
+                    info="Scheduler type for the generation. euler is recommended. heun will take more time.",
+                )
+                cfg_type = gr.Radio(
+                    ["cfg", "apg", "cfg_star"],
+                    value="apg",
+                    label="CFG Type",
+                    elem_id="cfg_type",
+                    info="CFG type for the generation. apg is recommended. cfg and cfg_star are almost the same.",
+                )
+                use_erg_tag = gr.Checkbox(
+                    label="use ERG for tag",
+                    value=True,
+                    info="Use Entropy Rectifying Guidance for tag. It will multiple a temperature to the attention to make a weaker tag condition and make better diversity.",
+                )
+                use_erg_lyric = gr.Checkbox(
+                    label="use ERG for lyric",
+                    value=False,
+                    info="The same but apply to lyric encoder's attention.",
+                )
+                use_erg_diffusion = gr.Checkbox(
+                    label="use ERG for diffusion",
+                    value=True,
+                    info="The same but apply to diffusion model's attention.",
+                )
+                omega_scale = gr.Slider(
+                    minimum=-100.0,
+                    maximum=100.0,
+                    step=0.1,
+                    value=10.0,
+                    label="Granularity Scale",
+                    interactive=True,
+                    info="Granularity scale for the generation. Higher values can reduce artifacts",
+                )
+                guidance_interval = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    step=0.01,
+                    value=0.5,
+                    label="Guidance Interval",
+                    interactive=True,
+                    info="Guidance interval for the generation. 0.5 means only apply guidance in the middle steps (0.25 * infer_steps to 0.75 * infer_steps)",
+                )
+                guidance_interval_decay = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    step=0.01,
+                    value=0.0,
+                    label="Guidance Interval Decay",
+                    interactive=True,
+                    info="Guidance interval decay for the generation. Guidance scale will decay from guidance_scale to min_guidance_scale in the interval. 0.0 means no decay.",
+                )
+                min_guidance_scale = gr.Slider(
+                    minimum=0.0,
+                    maximum=200.0,
+                    step=0.1,
+                    value=3.0,
+                    label="Min Guidance Scale",
+                    interactive=True,
+                    info="Min guidance scale for guidance interval decay's end scale",
+                )
+                oss_steps = gr.Textbox(
+                    label="OSS Steps",
+                    placeholder="16, 29, 52, 96, 129, 158, 172, 183, 189, 200",
+                    value=None,
+                    info="Optimal Steps for the generation. But not test well",
+                )
             text2music_bnt = gr.Button("Generate", variant="primary")
         with gr.Column():
             outputs, input_params_json = create_output_ui()
             with gr.Tab("retake"):
+                retake_variance = gr.Slider(
+                    minimum=0.0, maximum=1.0, step=0.01, value=0.2, label="variance"
+                )
+                retake_seeds = gr.Textbox(
+                    label="retake seeds (default None)", placeholder="", value=None
+                )
                 retake_bnt = gr.Button("Retake", variant="primary")
                 retake_outputs, retake_input_params_json = create_output_ui("Retake")
                         json_data["use_erg_lyric"],
                         json_data["use_erg_diffusion"],
                         ", ".join(map(str, json_data["oss_steps"])),
+                        (
+                            json_data["guidance_scale_text"]
+                            if "guidance_scale_text" in json_data
+                            else 0.0
+                        ),
+                        (
+                            json_data["guidance_scale_lyric"]
+                            if "guidance_scale_lyric" in json_data
+                            else 0.0
+                        ),
                         retake_seeds=retake_seeds,
                         retake_variance=retake_variance,
                         task="retake",
+                        lora_name_or_path="none" if "lora_name_or_path" not in json_data else json_data["lora_name_or_path"]
                     )
                 retake_bnt.click(
                     fn=retake_process_func,
                     inputs=[
                     outputs=retake_outputs + [retake_input_params_json],
                 )
             with gr.Tab("repainting"):
+                retake_variance = gr.Slider(
+                    minimum=0.0, maximum=1.0, step=0.01, value=0.2, label="variance"
+                )
+                retake_seeds = gr.Textbox(
+                    label="repaint seeds (default None)", placeholder="", value=None
+                )
+                repaint_start = gr.Slider(
+                    minimum=0.0,
+                    maximum=240.0,
+                    step=0.01,
+                    value=0.0,
+                    label="Repaint Start Time",
+                    interactive=True,
+                )
+                repaint_end = gr.Slider(
+                    minimum=0.0,
+                    maximum=240.0,
+                    step=0.01,
+                    value=30.0,
+                    label="Repaint End Time",
+                    interactive=True,
+                )
+                repaint_source = gr.Radio(
+                    ["text2music", "last_repaint", "upload"],
+                    value="text2music",
+                    label="Repaint Source",
+                    elem_id="repaint_source",
+                )
+                repaint_source_audio_upload = gr.Audio(
+                    label="Upload Audio",
+                    type="filepath",
+                    visible=False,
+                    elem_id="repaint_source_audio_upload",
+                    show_download_button=True,
+                )
                 repaint_source.change(
+                    fn=lambda x: gr.update(
+                        visible=x == "upload", elem_id="repaint_source_audio_upload"
+                    ),
                     inputs=[repaint_source],
                     outputs=[repaint_source_audio_upload],
                 )
                     if repaint_source == "upload":
                         src_audio_path = repaint_source_audio_upload
                         audio_duration = librosa.get_duration(filename=src_audio_path)
+                        json_data = {"audio_duration": audio_duration}
                     elif repaint_source == "text2music":
                         json_data = text2music_json_data
                         src_audio_path = json_data["audio_path"]
                         repaint_start=repaint_start,
                         repaint_end=repaint_end,
                         src_audio_path=src_audio_path,
+                        lora_name_or_path="none" if "lora_name_or_path" not in json_data else json_data["lora_name_or_path"]
                     )
                 repaint_bnt.click(
             with gr.Tab("edit"):
                 edit_prompt = gr.Textbox(lines=2, label="Edit Tags", max_lines=4)
                 edit_lyrics = gr.Textbox(lines=9, label="Edit Lyrics", max_lines=13)
+                retake_seeds = gr.Textbox(
+                    label="edit seeds (default None)", placeholder="", value=None
+                )
+                edit_type = gr.Radio(
+                    ["only_lyrics", "remix"],
+                    value="only_lyrics",
+                    label="Edit Type",
+                    elem_id="edit_type",
+                    info="`only_lyrics` will keep the whole song the same except lyrics difference. Make your diffrence smaller, e.g. one lyrc line change.\nremix can change the song melody and genre",
+                )
+                edit_n_min = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    step=0.01,
+                    value=0.6,
+                    label="edit_n_min",
+                    interactive=True,
+                )
+                edit_n_max = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    step=0.01,
+                    value=1.0,
+                    label="edit_n_max",
+                    interactive=True,
+                )
                 def edit_type_change_func(edit_type):
                     if edit_type == "only_lyrics":
                 edit_type.change(
                     edit_type_change_func,
                     inputs=[edit_type],
+                    outputs=[edit_n_min, edit_n_max],
                 )
+                edit_source = gr.Radio(  # selects which audio the edit starts from
+                    ["text2music", "last_edit", "upload"],
+                    value="text2music",
+                    label="Edit Source",
+                    elem_id="edit_source",
+                )
+                edit_source_audio_upload = gr.Audio(  # file-path audio input used by the "upload" source
+                    label="Upload Audio",
+                    type="filepath",
+                    visible=False,  # hidden until edit_source is switched to "upload"
+                    elem_id="edit_source_audio_upload",
+                    show_download_button=True,
+                )
                 edit_source.change(  # re-evaluate the upload widget's visibility on every source change
+                    fn=lambda x: gr.update(
+                        visible=x == "upload", elem_id="edit_source_audio_upload"
+                    ),
                     inputs=[edit_source],
                     outputs=[edit_source_audio_upload],
                 )
                     if edit_source == "upload":
                         src_audio_path = edit_source_audio_upload
                         audio_duration = librosa.get_duration(filename=src_audio_path)
+                        json_data = {"audio_duration": audio_duration}
                     elif edit_source == "text2music":
                         json_data = text2music_json_data
                         src_audio_path = json_data["audio_path"]
                         edit_n_min=edit_n_min,
                         edit_n_max=edit_n_max,
                         retake_seeds=retake_seeds,
+                        lora_name_or_path="none" if "lora_name_or_path" not in json_data else json_data["lora_name_or_path"]
                     )
                 edit_bnt.click(
                     outputs=edit_outputs + [edit_input_params_json],
                 )
             with gr.Tab("extend"):
+                extend_seeds = gr.Textbox(  # optional seed input for the extend pass; empty by default
+                    label="extend seeds (default None)", placeholder="", value=None
+                )
+                left_extend_length = gr.Slider(  # amount to extend on the left; presumably seconds — TODO confirm
+                    minimum=0.0,
+                    maximum=240.0,
+                    step=0.01,
+                    value=0.0,
+                    label="Left Extend Length",
+                    interactive=True,
+                )
+                right_extend_length = gr.Slider(  # amount to extend on the right; presumably seconds — TODO confirm
+                    minimum=0.0,
+                    maximum=240.0,
+                    step=0.01,
+                    value=30.0,
+                    label="Right Extend Length",
+                    interactive=True,
+                )
+                extend_source = gr.Radio(  # selects which audio the extension starts from
+                    ["text2music", "last_extend", "upload"],
+                    value="text2music",
+                    label="Extend Source",
+                    elem_id="extend_source",
+                )
+                extend_source_audio_upload = gr.Audio(  # file-path audio input used by the "upload" source
+                    label="Upload Audio",
+                    type="filepath",
+                    visible=False,  # hidden until extend_source is switched to "upload"
+                    elem_id="extend_source_audio_upload",
+                    show_download_button=True,
+                )
                 extend_source.change(  # re-evaluate the upload widget's visibility on every source change
+                    fn=lambda x: gr.update(
+                        visible=x == "upload", elem_id="extend_source_audio_upload"
+                    ),
                     inputs=[extend_source],
                     outputs=[extend_source_audio_upload],
                 )
                         src_audio_path = extend_source_audio_upload
                         # get audio duration
                         audio_duration = librosa.get_duration(filename=src_audio_path)
+                        json_data = {"audio_duration": audio_duration}
                     elif extend_source == "text2music":
                         json_data = text2music_json_data
                         src_audio_path = json_data["audio_path"]
                         repaint_start=repaint_start,
                         repaint_end=repaint_end,
                         src_audio_path=src_audio_path,
+                        lora_name_or_path="none" if "lora_name_or_path" not in json_data else json_data["lora_name_or_path"]
                     )
                 extend_bnt.click(
                     outputs=extend_outputs + [extend_input_params_json],
                 )
+        def json2output(json_data):  # flatten a params mapping into a tuple of values for the UI outputs
             return (
                 json_data["audio_duration"],  # plain lookup: raises KeyError if the key is absent
                 json_data["prompt"],  # NOTE(review): this diff view appears to elide tuple entries after this line — confirm against the full file
                 json_data["use_erg_lyric"],
                 json_data["use_erg_diffusion"],
                 ", ".join(map(str, json_data["oss_steps"])),  # sequence of steps -> "a, b, c" text
+                (
+                    json_data["guidance_scale_text"]
+                    if "guidance_scale_text" in json_data
+                    else 0.0  # fallback when the key is missing
+                ),
+                (
+                    json_data["guidance_scale_lyric"]
+                    if "guidance_scale_lyric" in json_data
+                    else 0.0  # fallback when the key is missing
+                ),
+                (
+                    json_data["audio2audio_enable"]
+                    if "audio2audio_enable" in json_data
+                    else False  # fallback when the key is missing
+                ),
+                (
+                    json_data["ref_audio_strength"]
+                    if "ref_audio_strength" in json_data
+                    else 0.5  # fallback when the key is missing
+                ),
+                (
+                    json_data["ref_audio_input"]
+                    if "ref_audio_input" in json_data
+                    else None  # fallback when the key is missing
+                ),
             )
+        def sample_data(lora_name_or_path_):
+            json_data = sample_data_func(lora_name_or_path_)
+            return json2output(json_data)
         sample_bnt.click(  # run sample_data when the sample button is clicked
             sample_data,
+            inputs=[lora_name_or_path],  # supplies sample_data's single argument
+            outputs=[  # 21 components; NOTE(review): order presumably must match the tuple returned by json2output — confirm
+                audio_duration,
+                prompt,
+                lyrics,
+                infer_step,
+                guidance_scale,
+                scheduler_type,
+                cfg_type,
+                omega_scale,
+                manual_seeds,
+                guidance_interval,
+                guidance_interval_decay,
+                min_guidance_scale,
+                use_erg_tag,
+                use_erg_lyric,
+                use_erg_diffusion,
+                oss_steps,
+                guidance_scale_text,
+                guidance_scale_lyric,
+                audio2audio_enable,
+                ref_audio_strength,
+                ref_audio_input,
+            ],
+        )
+        def load_data(json_file):
+            json_file = os.path.join(output_file_dir, json_file)
+            json_data = load_data_func(json_file)
+            return json2output(json_data)
+        load_bnt.click(  # run load_data when the load button is clicked
+            fn=load_data,
+            inputs=[output_files],  # supplies load_data's json_file argument
             outputs=[  # NOTE(review): this diff view appears to elide entries between prompt and oss_steps — confirm against the full file
                 audio_duration,
                 prompt,
                 oss_steps,
                 guidance_scale_text,
                 guidance_scale_lyric,
+                audio2audio_enable,
+                ref_audio_strength,
+                ref_audio_input,
             ],
         )
             oss_steps,
             guidance_scale_text,
             guidance_scale_lyric,
+            audio2audio_enable,
+            ref_audio_strength,
+            ref_audio_input,
+            lora_name_or_path,
+        ],
+        outputs=outputs + [input_params_json],
     )
 def create_main_demo_ui(  # build the top-level Gradio Blocks UI and return it
     text2music_process_func=dump_func,  # forwarded to create_text2music_ui
     sample_data_func=dump_func,  # forwarded to create_text2music_ui
+    load_data_func=dump_func,  # forwarded to create_text2music_ui
 ):
     with gr.Blocks(  # NOTE(review): the rest of this header (presumably binding `demo`) is elided in this diff view
         title="ACE-Step Model 1.0 DEMO",
         gr.Markdown(
             """
             <h1 style="text-align: center;">ACE-Step: A Step Towards Music Generation Foundation Model</h1>
+        """
+        )
         with gr.Tab("text2music"):  # the tab's widgets are built by create_text2music_ui
             create_text2music_ui(
                 gr=gr,  # passes the gradio module through explicitly
                 text2music_process_func=text2music_process_func,
                 sample_data_func=sample_data_func,
+                load_data_func=load_data_func,
             )
     return demo