|
--- |
|
license: other |
|
license_name: stability-ai |
|
license_link: https://stability.ai/license |
|
--- |
|
|
|
Attempting to run stable-audio-open-small with ONNX Runtime in Swift on iOS. |
|
|
|
This is a mess. These models run successfully in Python when validating them, but I haven't gotten the iPhone to stop crashing yet. |
|
|
|
when using the fp16_tools version, the diffusion component crashes the iphone on step 0. |
|
|
|
When using the initial version, the decoder (`autoencoder_arm.onnx`) crashes the iPhone. |
|
|
|
nothing to see here, yet... just wanted a place to store these. |
|
|
|
like everything else i do...pure vibes zero real knowledge. |
|
|
|
Here's a Python script I used to validate outputs against the original PyTorch model. |
|
|
|
There's another one that uses classifier-free guidance (CFG) and gets essentially the same outputs. |
|
|
|
``` |
|
|
|
#!/usr/bin/env python |
|
import numpy as np, soundfile as sf, onnxruntime as ort |
|
from transformers import AutoTokenizer |
|
|
|
# Config
PROMPT = "lo-fi hip-hop beat with pianos 90bpm"
STEPS = 10
SEED = 12345
SECONDS_TOTAL = 10.0
SAMPLE_RATE = 44100
OUTPUT_PATH = "onnx_lofi_linear.wav"


def linear_timestep(i, steps):
    """Return the diffusion time for step *i* on a linear 1.0 -> 0.0 schedule.

    Args:
        i: Zero-based step index.
        steps: Total number of sampling steps (must be >= 1).

    Returns:
        ``1.0 - i / (steps - 1)``; with a single step there is no interval to
        divide, so the schedule stays at its starting value of 1.0.

    Raises:
        ValueError: If *steps* is smaller than 1.
    """
    if steps < 1:
        raise ValueError(f"steps must be >= 1, got {steps}")
    if steps == 1:
        return 1.0
    return 1.0 - i / (steps - 1)


def to_frames_first(audio):
    """Arrange decoder output as (frames, channels) for writing to disk.

    A leading axis of size 1 on a 3-D array is dropped, and an array whose
    first axis has length 2 is transposed so the two channels end up last.
    Any other layout is returned unchanged.
    """
    audio = np.asarray(audio)
    # NOTE(review): the decoder's output shape is not visible here; if it
    # carries a batch axis of size 1, drop it so the channel check below and
    # the WAV writer both see a 2-D array. Confirm against the ONNX export.
    if audio.ndim == 3 and audio.shape[0] == 1:
        audio = audio[0]
    if audio.shape[0] == 2:
        audio = audio.T
    return audio


def peak_normalize(audio):
    """Scale *audio* so its largest absolute sample is 1.0.

    Empty or all-zero (silent) input is returned unchanged instead of being
    divided by zero, which would fill the output with NaNs.
    """
    if audio.size == 0:
        return audio
    peak = np.abs(audio).max()
    if peak > 0:
        return audio / peak
    return audio


def main():
    """Sample with the ONNX models and write the result to OUTPUT_PATH."""
    if STEPS < 1:
        raise ValueError(f"STEPS must be >= 1, got {STEPS}")

    # Load ONNX models
    dit = ort.InferenceSession("diffusion_dit_arm.onnx")
    cond = ort.InferenceSession("conditioners.onnx")
    dec = ort.InferenceSession("autoencoder_arm.onnx")

    # Seeded starting noise so runs are reproducible.
    rng = np.random.RandomState(SEED)
    x = rng.randn(1, 64, 256).astype(np.float32)

    # Conditioning
    tok = AutoTokenizer.from_pretrained("t5-base")
    tokens = tok(
        PROMPT,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="np",
    )
    # The conditioner returns three outputs; the first and third are fed to
    # the DiT as "cross_attn_cond" and "global_cond", the second is unused.
    cross, _, glob = cond.run(None, {
        "input_ids": tokens["input_ids"].astype(np.int64),
        "attention_mask": tokens["attention_mask"].astype(np.int64),
        "seconds_total": np.array([SECONDS_TOTAL], dtype=np.float32),
    })

    # Run STEPS steps with linear t, no CFG.
    # Fixed Euler step size, derived from the step count (0.1 for 10 steps).
    dt = 1.0 / STEPS
    for i in range(STEPS):
        t = np.array([linear_timestep(i, STEPS)], dtype=np.float32)
        v = dit.run(None, {
            "x": x,
            "t": t,
            "cross_attn_cond": cross,
            "global_cond": glob,
        })[0]
        x -= dt * v

    # Decode
    audio = dec.run(None, {"sampled": x})[0]
    audio = peak_normalize(to_frames_first(audio))
    sf.write(OUTPUT_PATH, audio, SAMPLE_RATE)
    print(f"✅ {OUTPUT_PATH} written!")


if __name__ == "__main__":
    main()
|
|
|
``` |