# CantusSVS-hf / app.py
# liampond's picture
# Update app.py
# 1c1f57b verified
import streamlit as st
st.set_page_config(page_title="CantusSVS", layout="wide")
import os
import yaml
import shutil
import traceback
import json
import requests
import zipfile
import streamlit.components.v1 as components
from pathlib import Path
from webapp.services.defaults.default_splitter import split_syllable
def patch_config_yaml_files(root="/tmp/cantussvs_v1"):
    """Rewrite relative asset paths in every ``config.yaml`` under the checkpoints tree.

    Walks ``<root>/checkpoints`` and, for each file named ``config.yaml``,
    replaces top-level string values that start with ``checkpoints/`` or
    ``data/`` with paths under ``<root>/checkpoints`` and ``<root>/data``
    respectively. Only top-level keys are inspected; nested mappings and
    lists are left untouched. A file is rewritten only when at least one
    value changed.

    With an absolute ``root`` (such as the default) the patched values no
    longer start with either prefix, so calling this again leaves the files
    alone.

    Args:
        root: Directory that contains the ``checkpoints`` and ``data``
            folders. Defaults to the location the app extracts its assets to.

    Returns:
        None. A failure on an individual file is printed and does not stop
        the walk.
    """
    checkpoints_root = os.path.join(root, "checkpoints")
    data_root = os.path.join(root, "data")
    # Relative prefix -> directory that replaces it.
    prefix_targets = (
        ("checkpoints/", checkpoints_root),
        ("data/", data_root),
    )

    for dirpath, _, filenames in os.walk(checkpoints_root):
        if "config.yaml" not in filenames:
            continue
        full_path = os.path.join(dirpath, "config.yaml")
        try:
            with open(full_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f)
            if not isinstance(config, dict):
                # Empty file or a non-mapping document: nothing to patch.
                continue

            modified = False
            # Values are replaced in place; keys are never added or removed,
            # so mutating while iterating over items() is safe.
            for key, value in config.items():
                if not isinstance(value, str):
                    continue
                for prefix, target_root in prefix_targets:
                    if value.startswith(prefix):
                        config[key] = os.path.join(target_root, value[len(prefix):])
                        modified = True
                        break

            if modified:
                with open(full_path, "w", encoding="utf-8") as f:
                    yaml.dump(config, f)
                print(f"✅ Patched paths in {full_path}")
        except Exception as e:
            # Best effort: report and carry on with the remaining files.
            print(f"❌ Failed to patch {full_path}: {e}")
# Disable Streamlit file watcher
# NOTE(review): this is set after `import streamlit` and st.set_page_config()
# have already run near the top of the file — confirm Streamlit still reads
# the variable at this point.
os.environ['STREAMLIT_SERVER_FILE_WATCHER_TYPE'] = 'none'
# Ensure project root is on the import path
# PROJECT_ROOT is the directory containing this file; the working folders
# defined further down are built relative to it.
PROJECT_ROOT = Path(__file__).resolve().parent
import sys
# Inserted at position 0 so the project's own packages are searched first by
# the imports below.
# NOTE(review): the `webapp.services.defaults.default_splitter` import in the
# top import block runs before this insertion.
sys.path.insert(0, str(PROJECT_ROOT))
from webapp.services.parsing.mei_parser import parse_mei_for_editor
from webapp.services.parsing.ds_builder import build_ds_from_notes
from webapp.services.parsing.ds_validator import validate_ds
from webapp.services.phonemes.phoneme_dict import PHONEMES as permitted_phonemes
from inference.pipeline import run_inference
def safe_symlink(src, dst):
    """Create a symlink at ``dst`` pointing to ``src`` without clobbering anything.

    Outcomes, each reported on stdout:
      * ``dst`` is already a symlink to ``src``: nothing to do.
      * ``dst`` is a symlink to something else: left untouched.
      * ``dst`` exists and is not a symlink: left untouched.
      * otherwise the symlink is created.

    Args:
        src: Target the link should point to (compared verbatim against
            ``os.readlink(dst)``).
        dst: Path of the link to create.

    Returns:
        None. Any exception is caught and printed rather than raised.
    """
    try:
        # islink() is checked before exists() so that a dangling symlink is
        # still recognised as a symlink.
        if os.path.islink(dst):
            if os.readlink(dst) == src:
                print(f"✅ Symlink already correct: {dst} -> {src}")
            else:
                print(f"⚠️ Symlink exists but points elsewhere. Skipping: {dst}")
            return
        if os.path.exists(dst):
            print(f"❗ Cannot create symlink, path exists and is not a symlink: {dst}")
            return
        os.symlink(src, dst)
        print(f"✅ Created symlink: {dst} -> {src}")
    except Exception as e:
        print(f"❗ Failed to create symlink {dst} -> {src}: {e}")
# Directories
# Locations inside the extracted Hugging Face bundle
HF_CHECKPOINTS_DIR = "/tmp/cantussvs_v1/checkpoints"
HF_DATA_DIR = "/tmp/cantussvs_v1/data"
# Working folders that live next to this file, under webapp/
DEMO_FILES = PROJECT_ROOT / "webapp" / "demo_files"
UPLOAD_MEI_DIR = PROJECT_ROOT / "webapp" / "uploaded_mei"
UPLOAD_DS_DIR = PROJECT_ROOT / "webapp" / "uploaded_ds"
TMP_DS_DIR = PROJECT_ROOT / "webapp" / "tmp_ds"
OUTPUT_DIR = PROJECT_ROOT / "webapp" / "output"
# Create every working folder (and any missing parents) up front
for d in (
    DEMO_FILES,
    UPLOAD_MEI_DIR,
    UPLOAD_DS_DIR,
    TMP_DS_DIR,
    OUTPUT_DIR,
):
    d.mkdir(parents=True, exist_ok=True)
@st.cache_resource
def download_and_extract_from_hf():
    """Fetch the CantusSVS asset bundle and make it reachable from the app.

    When ``/tmp/cantussvs_v1`` does not exist yet, the zip is downloaded from
    the Hugging Face dataset repository, extracted, and the extracted
    ``config.yaml`` files are patched via ``patch_config_yaml_files()``.
    ``checkpoints`` and ``data`` symlinks (relative to the current working
    directory) are then pointed at the extracted folders.

    Returns:
        str: The extraction directory, ``/tmp/cantussvs_v1``.

    Raises:
        requests.HTTPError: If the download responds with an error status.
        zipfile.BadZipFile: If the downloaded archive cannot be opened.
    """
    url = "https://huggingface.co/datasets/liampond/CantusSVS/resolve/main/cantussvs_v1.zip"
    zip_path = "/tmp/cantussvs_v1.zip"
    extract_dir = "/tmp/cantussvs_v1"

    if not os.path.exists(extract_dir):
        st.write("📦 Downloading data + model from Hugging Face...")
        # Check the status before writing anything so an HTTP error page is
        # never saved as the archive; the context manager releases the
        # streamed connection, and the timeout keeps a stalled server from
        # hanging the app indefinitely.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(zip_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        st.write("📂 Extracting contents...")
        # Extract into a staging folder and rename only on success. The
        # existence of extract_dir is what later runs use to decide the
        # assets are ready, so it must never be left half-populated.
        staging_dir = extract_dir + ".partial"
        shutil.rmtree(staging_dir, ignore_errors=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(staging_dir)
        os.rename(staging_dir, extract_dir)

        # ✅ Only do this once, right after unzip
        patch_config_yaml_files()

    # safe_symlink() is a no-op when the link is already correct.
    safe_symlink(os.path.join(extract_dir, "checkpoints"), "checkpoints")
    safe_symlink(os.path.join(extract_dir, "data"), "data")
    return extract_dir
# Call it once and use it globally
# download_and_extract_from_hf is decorated with st.cache_resource and returns
# the extraction directory.
base_path = download_and_extract_from_hf()
# NOTE(review): this call runs on every execution of the script, in addition
# to the call made inside download_and_extract_from_hf() right after a fresh
# extraction. It also covers the case where the extraction directory already
# existed — confirm whether the repeated walk is intended.
patch_config_yaml_files()
st.write("✅ Loaded assets to:", base_path)
# CSS styling
# st.markdown("""
# <style>
# html, body, [class*="css"] { font-size: 18px !important; }
# div[data-testid="stSelectbox"] label,
# div[data-testid="stNumberInput"] label,
# div[data-testid="stTextInput"] label { font-size: 13px; padding-bottom: 0px; }
# div[data-testid="stSlider"] label { font-size: 0px; }
# div.stButton > button:first-child {
# background-color: black; color: white;
# font-size: 14px; padding: 4px 10px;
# border-radius: 8px;
# }
# section[data-testid="stFileUploaderDropzone"] { padding: 2rem; }
# .tooltip {
# position: relative;
# display: inline-block;
# border-bottom: 1px dotted white;
# cursor: help;
# }
# .tooltip .tooltiptext {
# visibility: hidden;
# width: 250px;
# background-color: black;
# color: #fff;
# text-align: center;
# border-radius: 6px;
# padding: 6px;
# position: absolute;
# z-index: 1;
# bottom: 125%; /* Position above */
# left: 50%;
# margin-left: -125px;
# opacity: 0;
# transition: opacity 0.3s;
# }
# .tooltip:hover .tooltiptext {
# visibility: visible;
# opacity: 1;
# }
# </style>
# """, unsafe_allow_html=True)
# Phoneme mappings: internal symbol -> label shown in the editor
phoneme_display_map = {
    "ap": "Pause",
    "br": "Breath",
}
# Reverse lookup: editor label -> internal symbol
display_to_phoneme = {label: symbol for symbol, label in phoneme_display_map.items()}
# Every permitted phoneme, with the friendly label substituted where one exists
full_phoneme_list_display = [
    phoneme_display_map[symbol] if symbol in phoneme_display_map else symbol
    for symbol in permitted_phonemes
]
# Pitch list D4-D5
allowed_pitches = [
    "D4", "D#4", "E4", "F4", "F#4", "G4", "G#4",
    "A4", "A#4", "B4", "C5", "C#5", "D5",
]
# Title
st.title("CantusSVS: Latin Singing Voice Synthesis")
# Static "About" and "How to Use" copy. unsafe_allow_html=True is needed
# because the text embeds raw <p> and <span> elements.
# NOTE(review): the pitch-range bullet uses the `tooltip` / `tooltiptext`
# classes, whose only styling in this file is the CSS block that is currently
# commented out above — check how the tooltip text renders without it.
st.markdown("""
# About CantusSVS
<p>CantusSVS is a web-based Singing Voice Synthesis (SVS) system designed for composers and musicians to synthesize Latin chant audio from a custom musical score.
Built on top of the DiffSinger AI model, CantusSVS enables detailed, precise control over melody, rhythm, phonemes, and timing without any programming knowledge required.</p>
<p>Designed by Liam Pond as the final project for MUS6329X: Projet en informatique musicale (Prof. Dominic Thibault) at the Université de Montréal. For more information, you can view the README.md under the 'Files' tab of this Space.</p>
You can find DiffSinger in the following paper:
**DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism**
Liu, Jinglin, Chengxi Li, Yi Ren, Feiyang Chen, and Zhou Zhao. 2022. "Diffsinger: Singing Voice Synthesis via Shallow Diffusion Mechanism." In *Proceedings of the AAAI Conference on Artificial Intelligence* 36 10: 11020–11028. [https://arxiv.org/abs/2105.02446](http://dx.doi.org/10.1609/aaai.v36i10.21350).
Model training was done using Cedar, a cluster provided by the Digital Research Alliance of Canada. To train your own model locally, follow [this tutorial](https://youtu.be/Sxt11TAflV0?feature=shared) by [tigermeat](https://www.youtube.com/@spicytigermeat).
For general help training and creating a dataset, [this tutorial](https://docs.google.com/document/d/1uMsepxbdUW65PfIWL1pt2OM6ZKa5ybTTJOpZ733Ht6s/view) by [PixPrucer](https://bsky.app/profile/pixprucer.bsky.social) is an excellent guide. For help, join the [DiffSinger Discord server](https://discord.gg/DZ6fhEUfnb).
The dataset used for this project was built using [*Adventus: Dominica prima adventus Domini*](https://youtu.be/ThnPySybDJs?feature=shared), the first track from [Psallentes](https://psallentes.com/)' album *Salzinnes Saints*. Psallentes is a Belgian women's chorus that specializes in Late Medieval and Renaissance music. *Salzinnes Saints* is an album of music from the [Salzinnes Antiphonal](https://www.smu.ca/academics/archives/the-salzinnes-antiphonal.html), a mid-sixteenth century choirbook with the music and text for the Liturgy of the Hours.
---
# How to Use CantusSVS
## 1. Compose Your Music
Compose the chant you want to synthesize using the notation software of your choice. [MuseScore 4](https://musescore.org/en/download) is recommended.
The chant must adhere to the following conditions:
- Monophonic only (one note at a time, no harmonies or chords)
- Pitch range of <span class="tooltip">**D4 to D5**<span class="tooltiptext">Because training data was limited outside this range, synthesis outside these pitches is very poor.</span></span>
- Lyrics (Latin) under each note, separated by syllable
## 2. Export Your Score to MEI
When your score is complete, export it to MEI.
In MuseScore:
- Go to **File → Export**
- Choose the `.mei` file format
- Save it to your computer
## 3. Upload Your Score to CantusSVS
In the CantusSVS web app:
- Select **MEI** mode
- Adjust the **tempo** if necessary using the provided slider
- Upload your `.mei` file
- Your score will be displayed using Verovio
- You may use the demo `.mei` file if you wish
## 4. Edit Phonemes, Durations, and Pitches
CantusSVS automatically suggests phoneme splits for each syllable.
However, you will have the opportunity to review phonemes, durations, and pitches.
## 5. Synthesize the Audio
When you're done:
- Click **Confirm**
- CantusSVS will create a `.ds` file which are processed through pretrained DiffSinger models
- The synthesized chant will be generated
This can take a few minutes depending on input length
## 6. Listen and Download
After synthesis you can either listen to your chant directly in the app or download a `.wav` file to your computer.
---
""", unsafe_allow_html=True)
# Tooltip hover handlers, injected as raw HTML.
# NOTE(review): the selector looks for a span whose inline `style` contains
# "border-bottom: 1px dotted black", but the tooltip span in the About text
# carries class="tooltip" and no inline style, so `tooltipSpan` looks like it
# will be null and the handlers never attached. Also verify that <script>
# tags passed through st.markdown are executed at all.
st.markdown("""
<script>
const tooltipSpan = window.parent.document.querySelector('span[style*="border-bottom: 1px dotted black"]');
if (tooltipSpan) {
tooltipSpan.addEventListener('mouseover', () => {
tooltipSpan.children[0].style.visibility = 'visible';
tooltipSpan.children[0].style.opacity = 1;
});
tooltipSpan.addEventListener('mouseout', () => {
tooltipSpan.children[0].style.visibility = 'hidden';
tooltipSpan.children[0].style.opacity = 0;
});
}
</script>
""", unsafe_allow_html=True)
# Input mode: "MEI" opens the score editor flow, "DS" synthesizes an uploaded
# .ds file directly.
filetype = st.selectbox("Select file type:", ["MEI", "DS"])
def handle_exception(context_message):
    """Report the exception being handled and halt the current script run.

    Shows a short error in the UI, prints the traceback to the console
    between two separator rules, then calls ``st.stop()``. Intended to be
    called from inside an ``except`` block, since ``traceback.print_exc()``
    reports the exception currently being handled.

    Args:
        context_message: Short description of the step that failed; used in
            both the UI message and the console banner.
    """
    rule = "=" * 30
    st.error(f"{context_message}. See console.")
    print(f"\n{rule}")
    print(f"Exception during {context_message}")
    traceback.print_exc()
    print(f"{rule}\n")
    st.stop()
if filetype == "MEI":
    # MEI flow: pick a score, preview it, edit the per-syllable phonemes,
    # then build a .ds file and run inference.

    # Newer Streamlit exposes st.rerun; older releases only have
    # st.experimental_rerun. Use whichever this installation provides.
    _rerun = getattr(st, "rerun", None) or st.experimental_rerun

    st.header("1. Select MEI Source")
    use_demo = st.checkbox("Use demo MEI file", value=False)
    tempo = st.slider("Tempo (BPM)", 1, 300, 60)

    if use_demo:
        mei_path = DEMO_FILES / "Demo1.mei"
        if not mei_path.exists():
            st.error("Demo MEI file missing.")
            st.stop()
        with open(mei_path, "rb") as f:
            mei_file_bytes = f.read()
    else:
        mei_file = st.file_uploader("Upload your MEI file", type="mei")
        if not mei_file:
            st.stop()
        # The file name comes from the client; keep only its final component
        # so the upload cannot be written outside UPLOAD_MEI_DIR.
        mei_path = UPLOAD_MEI_DIR / Path(mei_file.name).name
        mei_file_bytes = mei_file.getvalue()
        with open(mei_path, "wb") as f:
            f.write(mei_file_bytes)

    try:
        mei_text = mei_file_bytes.decode("utf-8")
    except UnicodeDecodeError:
        handle_exception("MEI decoding")

    try:
        raw_notes = parse_mei_for_editor(mei_path, tempo)
    except Exception:
        handle_exception("MEI parsing")

    # Always update session state
    st.session_state.original_raw_notes = raw_notes

    # Default phoneme split for every note at the current tempo.
    syllable_groups = []
    for note in st.session_state.original_raw_notes:
        syllable_text = note["lyric"]
        pitch = note["pitch"]
        syllable = split_syllable(
            syllable=syllable_text,
            note_duration=note["duration"],
            tempo=tempo,
            pitch=pitch
        )
        syllable_groups.append({
            "syllable": syllable_text,
            "phonemes": syllable
        })

    # Start from the default split on first load AND whenever a different
    # score is selected; otherwise the editor would keep showing (and later
    # synthesize) the syllables of the previously loaded file.
    # hash() of bytes is only stable within one process, which is enough for
    # values kept in session state.
    source_id = (mei_path.name, hash(mei_file_bytes))
    if (
        "edited_syllables" not in st.session_state
        or st.session_state.get("mei_source_id") != source_id
    ):
        # Drop editor widget state keyed by the same (syllable, phoneme)
        # indices so the fresh defaults are not overridden by leftovers.
        for state_key in list(st.session_state.keys()):
            if isinstance(state_key, str) and state_key.startswith(
                ("phoneme_", "duration_num_", "pitch_")
            ):
                del st.session_state[state_key]
        st.session_state.edited_syllables = syllable_groups
        # syllable_groups was computed at the current tempo already.
        st.session_state.previous_tempo = tempo
    st.session_state.mei_source_id = source_id

    st.subheader("Score Preview")
    # Pass the score to the page as a JSON string literal instead of pasting
    # it into a JS template literal: backticks, "${", backslashes or a
    # "</script>" inside the uploaded file would otherwise break out of the
    # string. "<" is escaped so the HTML parser never sees a tag in the data.
    mei_js_literal = json.dumps(mei_text).replace("<", "\\u003c")
    components.html(
        f"""<div id="app" style="border: 1px solid lightgray; min-height: 400px;"></div><script type="module">import 'https://editor.verovio.org/javascript/app/verovio-app.js';const app=new Verovio.App(document.getElementById("app"),{{defaultView:'document',documentZoom:4}});app.loadData({mei_js_literal});</script>""",
        height=500,
    )

    st.header("2. Edit Phonemes, Durations, and Pitches")
    updated_syllables = []

    if "previous_tempo" not in st.session_state:
        st.session_state.previous_tempo = tempo

    if tempo != st.session_state.previous_tempo:
        # Tempo changed: recompute the split (and so the durations) for every
        # note, carrying over the phoneme and pitch choices made so far.
        for i, note in enumerate(st.session_state.original_raw_notes):
            updated = split_syllable(
                syllable=note["lyric"],
                note_duration=note["duration"],
                tempo=tempo,
                pitch=note["pitch"]
            )
            # preserve existing phoneme values (if possible)
            for j, ph in enumerate(updated):
                try:
                    existing = st.session_state.edited_syllables[i]["phonemes"][j]
                    ph["phoneme"] = existing["phoneme"]
                    ph["pitch"] = existing["pitch"]
                except IndexError:
                    pass  # new phoneme or longer split
            st.session_state.edited_syllables[i]["phonemes"] = updated
        st.session_state.previous_tempo = tempo

    for idx, group in enumerate(st.session_state.edited_syllables):
        st.markdown(f"#### {group['syllable'].capitalize()}")
        new_phonemes = []
        for j, ph in enumerate(group["phonemes"]):
            # Three editor columns plus a narrow one for the delete button.
            col1, col2, col3, col4 = st.columns([3, 3, 3, 1])
            with col1:
                phoneme_display = st.selectbox(
                    "Phoneme",
                    full_phoneme_list_display,
                    index=full_phoneme_list_display.index(phoneme_display_map.get(ph["phoneme"], ph["phoneme"])),
                    key=f"phoneme_{idx}_{j}"
                )
                phoneme_internal = display_to_phoneme.get(phoneme_display, phoneme_display)
            with col2:
                current_duration = float(ph["duration"])
                duration = st.number_input(
                    "Duration (seconds)",
                    # Widen the upper bound when the current value already
                    # exceeds 5 s (presumably possible at slow tempi), since
                    # number_input rejects a value above max_value.
                    min_value=0.0, max_value=max(5.0, current_duration),
                    value=current_duration,
                    step=0.01, format="%.2f",
                    key=f"duration_num_{idx}_{j}"
                )
            with col3:
                pitch = st.selectbox(
                    "Pitch",
                    allowed_pitches,
                    index=allowed_pitches.index(ph["pitch"]) if ph["pitch"] in allowed_pitches else 0,
                    key=f"pitch_{idx}_{j}"
                )
            with col4:
                if st.button("❌", key=f"remove_{idx}_{j}"):
                    # Mutates the list held in session state, then restarts
                    # the script so iteration never continues over it.
                    group["phonemes"].pop(j)
                    _rerun()
            new_phonemes.append({"phoneme": phoneme_internal, "duration": duration, "pitch": pitch})
        if st.button("➕ Add Phoneme", key=f"add_phoneme_{idx}"):
            group["phonemes"].append({"phoneme": "a", "duration": 0.2, "pitch": "D4"})
            _rerun()
        updated_syllables.append({"syllable": group["syllable"], "phonemes": new_phonemes})
        st.divider()

    # Persist what the widgets currently show for the next run.
    st.session_state.edited_syllables = updated_syllables

    st.header("3. Synthesize")
    confirm_clicked = st.button("✅ Synthesize", key="confirm_button_mei")
    if confirm_clicked:
        ds_path = TMP_DS_DIR / f"{mei_path.stem}.ds"
        try:
            # Flatten the per-syllable groups into one phoneme sequence.
            all_phonemes = [ph for syllable in st.session_state.edited_syllables for ph in syllable["phonemes"]]
            build_ds_from_notes(all_phonemes, ds_path)
            with open(ds_path, "r", encoding="utf-8") as f:
                ds_data = json.load(f)
            validate_ds(ds_data)
            st.success(f"DS file created: {ds_path.name}")
        except Exception:
            handle_exception("DS generation or validation")

        with st.spinner("Running DiffSinger inference…"):
            try:
                wav_path = run_inference(ds_path, OUTPUT_DIR, mei_path.stem)
            except Exception:
                handle_exception("inference")

        st.success("Synthesis complete!")
        st.audio(str(wav_path))
        # Read the bytes and close the file rather than handing Streamlit an
        # open handle that is never closed.
        with open(wav_path, "rb") as wav_file:
            wav_bytes = wav_file.read()
        st.download_button("Download WAV", data=wav_bytes, file_name=wav_path.name)
elif filetype == "DS":
st.header("1. Upload DS File")
ds_file = st.file_uploader("Upload your .ds file", type=["ds", "json"])
st.header("2. Synthesize")
synth_clicked = st.button("✅ Synthesize", key="synthesize_button_ds")
if synth_clicked:
if not ds_file:
st.error("Please upload a .ds file.")
st.stop()
ds_path = UPLOAD_DS_DIR / ds_file.name
with open(ds_path, "wb") as f:
f.write(ds_file.getbuffer())
with open(ds_path, "r", encoding="utf-8") as f:
ds_data = json.load(f)
try:
validate_ds(ds_data)
except Exception as e:
st.error(f"Invalid DS file: {e}")
st.stop()
with st.spinner("Running DiffSinger inference…"):
try:
wav_path = run_inference(ds_path, OUTPUT_DIR, ds_path.stem)
except Exception:
handle_exception("inference")
st.success("Synthesis complete!")
st.audio(str(wav_path))
st.download_button("Download WAV", data=open(wav_path, "rb"), file_name=wav_path.name)