Spaces:

liampond
/

CantusSVS-hf

Sleeping

File size: 17,934 Bytes

import streamlit as st
st.set_page_config(page_title="CantusSVS", layout="wide")

import os
import yaml
import shutil
import traceback
import json
import requests
import zipfile
import streamlit.components.v1 as components
from pathlib import Path
from webapp.services.defaults.default_splitter import split_syllable


def _absolutize_config_paths(config, checkpoints_root, data_root):
    """Rewrite relative ``checkpoints/`` and ``data/`` values of *config* in place.

    Only top-level string values are inspected. Returns True when at least
    one value was rewritten.
    """
    prefix_roots = (("checkpoints/", checkpoints_root), ("data/", data_root))
    modified = False
    for key, value in config.items():
        if not isinstance(value, str):
            continue
        for prefix, target_root in prefix_roots:
            if value.startswith(prefix):
                # Re-assigning an existing key does not resize the dict, so it
                # is safe while iterating over items().
                config[key] = os.path.join(target_root, value[len(prefix):])
                modified = True
                break
    return modified


def patch_config_yaml_files(root="/tmp/cantussvs_v1"):
    """Make relative asset paths in every ``config.yaml`` under *root* absolute.

    Walks ``<root>/checkpoints`` and, for each file named ``config.yaml``
    whose top level is a mapping, replaces string values that start with
    ``checkpoints/`` or ``data/`` by the corresponding path below
    ``<root>/checkpoints`` or ``<root>/data``. A file is only written back
    when something changed, so running this repeatedly is harmless: patched
    values no longer start with the relative prefixes.

    Args:
        root: Directory containing the ``checkpoints`` and ``data`` folders.
            Defaults to the location the assets are extracted to.

    Failures on an individual file are reported on stdout and do not stop
    the walk. Note that the file is re-serialised with ``yaml.dump``, so
    comments in the original file are not preserved.
    """
    checkpoints_root = os.path.join(root, "checkpoints")
    data_root = os.path.join(root, "data")

    for dirpath, _, filenames in os.walk(checkpoints_root):
        if "config.yaml" not in filenames:
            continue
        full_path = os.path.join(dirpath, "config.yaml")
        try:
            # Explicit encoding so the result does not depend on the locale.
            with open(full_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f)

            if not isinstance(config, dict):
                continue

            if _absolutize_config_paths(config, checkpoints_root, data_root):
                with open(full_path, "w", encoding="utf-8") as f:
                    yaml.dump(config, f)
                print(f"✅ Patched paths in {full_path}")
        except Exception as e:
            # Best effort: one unreadable config must not block the others.
            print(f"❌ Failed to patch {full_path}: {e}")

# Disable Streamlit file watcher
# NOTE(review): this is set after `streamlit` was imported and
# `st.set_page_config` was called — confirm the setting is still picked up.
os.environ['STREAMLIT_SERVER_FILE_WATCHER_TYPE'] = 'none'

# Ensure project root is on the import path
# PROJECT_ROOT is the directory containing this script; it is also the base
# for the webapp/ working directories defined further down.
PROJECT_ROOT = Path(__file__).resolve().parent
import sys
# Inserted at position 0 so the project's own packages take precedence.
sys.path.insert(0, str(PROJECT_ROOT))

# Project-local imports: these must stay below the sys.path change above.
from webapp.services.parsing.mei_parser import parse_mei_for_editor
from webapp.services.parsing.ds_builder import build_ds_from_notes
from webapp.services.parsing.ds_validator import validate_ds
from webapp.services.phonemes.phoneme_dict import PHONEMES as permitted_phonemes
from inference.pipeline import run_inference

def safe_symlink(src, dst):
    """Create the symlink ``dst`` -> ``src`` unless something already occupies ``dst``.

    Nothing is ever overwritten: an existing link (whether or not it points
    at ``src``) and any existing non-link path are both left alone. Every
    outcome, including an unexpected error, is reported on stdout; the
    function never raises and always returns None.
    """
    try:
        link_present = os.path.islink(dst)

        if link_present and os.readlink(dst) == src:
            print(f"✅ Symlink already correct: {dst} → {src}")
            return

        if link_present:
            print(f"⚠️ Symlink exists but points elsewhere. Skipping: {dst}")
            return

        if os.path.exists(dst):
            print(f"❗ Cannot create symlink, path exists and is not a symlink: {dst}")
            return

        os.symlink(src, dst)
        print(f"✅ Created symlink: {dst} → {src}")
    except Exception as err:
        print(f"❗ Failed to create symlink {dst} -> {src}: {err}")

# Directories
# Absolute locations of the extracted model checkpoints and dataset.
# NOTE(review): neither constant is referenced elsewhere in this file as
# shown; the same paths are spelled out again where the assets are extracted.
HF_CHECKPOINTS_DIR = "/tmp/cantussvs_v1/checkpoints"
HF_DATA_DIR = "/tmp/cantussvs_v1/data"
# Bundled demo score(s); the MEI flow reads "Demo1.mei" from here.
DEMO_FILES = PROJECT_ROOT / "webapp/demo_files"
# Uploaded .mei files are written here before parsing.
UPLOAD_MEI_DIR = PROJECT_ROOT / "webapp/uploaded_mei"
# Uploaded .ds files are written here before validation.
UPLOAD_DS_DIR = PROJECT_ROOT / "webapp/uploaded_ds"
# .ds files generated from the phoneme editor.
TMP_DS_DIR = PROJECT_ROOT / "webapp/tmp_ds"
# Output directory handed to run_inference().
OUTPUT_DIR = PROJECT_ROOT / "webapp/output"
# Create every working directory up front so later writes cannot fail on a
# missing parent.
for d in [DEMO_FILES, UPLOAD_MEI_DIR, UPLOAD_DS_DIR, TMP_DS_DIR, OUTPUT_DIR]:
    d.mkdir(parents=True, exist_ok=True)

@st.cache_resource
def download_and_extract_from_hf():
    """Fetch the model/data archive once and expose it to the working directory.

    When ``/tmp/cantussvs_v1`` does not exist yet, the archive is downloaded
    from Hugging Face, extracted there, and its ``config.yaml`` files are
    patched to absolute paths. In every case ``checkpoints`` and ``data``
    symlinks pointing into the extracted tree are then ensured in the
    current working directory.

    Returns:
        The extraction directory as a string.

    Raises:
        requests.RequestException: the download failed or returned an HTTP
            error status.
        zipfile.BadZipFile: the downloaded file is not a valid archive.

    On any failure the partially extracted directory is removed, so the next
    call starts from scratch instead of treating a broken tree as complete.
    """
    url = "https://huggingface.co/datasets/liampond/CantusSVS/resolve/main/cantussvs_v1.zip"
    zip_path = "/tmp/cantussvs_v1.zip"
    extract_dir = "/tmp/cantussvs_v1"

    if not os.path.exists(extract_dir):
        st.write("📦 Downloading data + model from Hugging Face...")
        try:
            # (connect, read) timeouts: without them a stalled connection
            # would hang the app forever.
            with requests.get(url, stream=True, timeout=(10, 120)) as r:
                # Fail here rather than saving an HTML error page as the zip.
                r.raise_for_status()
                with open(zip_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

            st.write("📂 Extracting contents...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
        except Exception:
            # The existence of extract_dir is the "already done" marker, so a
            # half-extracted tree must not survive.
            shutil.rmtree(extract_dir, ignore_errors=True)
            raise
        finally:
            # The archive is not needed once extracted (or once it failed).
            if os.path.exists(zip_path):
                os.remove(zip_path)

        # ✅ Only do this once, right after unzip
        patch_config_yaml_files()

    safe_symlink(os.path.join(extract_dir, "checkpoints"), "checkpoints")
    safe_symlink(os.path.join(extract_dir, "data"), "data")

    return extract_dir

# Call it once and use it globally
# (the function is wrapped in st.cache_resource, so the download/extract body
# is not re-executed on every script run).
base_path = download_and_extract_from_hf()
# This call is not cached and therefore repeats on every script run, in
# addition to the call made right after extraction. It only rewrites values
# that still start with "checkpoints/" or "data/", so repeating it does not
# change already-patched files.
patch_config_yaml_files()
st.write("✅ Loaded assets to:", base_path)

# CSS styling
# st.markdown("""
# <style>
# html, body, [class*="css"] { font-size: 18px !important; }
# div[data-testid="stSelectbox"] label,
# div[data-testid="stNumberInput"] label,
# div[data-testid="stTextInput"] label { font-size: 13px; padding-bottom: 0px; }
# div[data-testid="stSlider"] label { font-size: 0px; }
# div.stButton > button:first-child {
#     background-color: black; color: white;
#     font-size: 14px; padding: 4px 10px;
#     border-radius: 8px;
# }
# section[data-testid="stFileUploaderDropzone"] { padding: 2rem; }

# .tooltip {
#   position: relative;
#   display: inline-block;
#   border-bottom: 1px dotted white;
#   cursor: help;
# }

# .tooltip .tooltiptext {
#   visibility: hidden;
#   width: 250px;
#   background-color: black;
#   color: #fff;
#   text-align: center;
#   border-radius: 6px;
#   padding: 6px;
#   position: absolute;
#   z-index: 1;
#   bottom: 125%; /* Position above */
#   left: 50%;
#   margin-left: -125px;
#   opacity: 0;
#   transition: opacity 0.3s;
# }

# .tooltip:hover .tooltiptext {
#   visibility: visible;
#   opacity: 1;
# }
# </style>
# """, unsafe_allow_html=True)

# Phoneme mappings
# Internal phoneme symbol -> label shown in the editor. Symbols without an
# entry are displayed unchanged.
phoneme_display_map = { "ap": "Pause", "br": "Breath" }
# Reverse lookup: editor label -> internal phoneme symbol.
display_to_phoneme = {v: k for k, v in phoneme_display_map.items()}
# Options offered by the phoneme select boxes, in the order of
# `permitted_phonemes`.
full_phoneme_list_display = [phoneme_display_map.get(p, p) for p in permitted_phonemes]

# Pitch list D4-D5
# Chromatic steps spelled with sharps only. The editor falls back to the
# first entry when a note's pitch is not in this list.
allowed_pitches = ["D4", "D#4", "E4", "F4", "F#4", "G4", "G#4", "A4", "A#4", "B4", "C5", "C#5", "D5"]

# Title
st.title("CantusSVS: Latin Singing Voice Synthesis")

st.markdown("""
# About CantusSVS

<p>CantusSVS is a web-based Singing Voice Synthesis (SVS) system designed for composers and musicians to synthesize Latin chant audio from a custom musical score.
Built on top of the DiffSinger AI model, CantusSVS enables detailed, precise control over melody, rhythm, phonemes, and timing without any programming knowledge required.</p>
            
<p>Designed by Liam Pond as the final project for MUS6329X: Projet en informatique musicale (Prof. Dominic Thibault) at the Université de Montréal. For more information, you can view the README.md under the 'Files' tab of this Space.</p>

You can find DiffSinger in the following paper:
**DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism**  

Liu, Jinglin, Chengxi Li, Yi Ren, Feiyang Chen, and Zhou Zhao. 2022. "DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism." In *Proceedings of the AAAI Conference on Artificial Intelligence* 36 (10): 11020–11028. [https://doi.org/10.1609/aaai.v36i10.21350](https://doi.org/10.1609/aaai.v36i10.21350). Preprint: [https://arxiv.org/abs/2105.02446](https://arxiv.org/abs/2105.02446).

Model training was done using Cedar, a cluster provided by the Digital Research Alliance of Canada. To train your own model locally, follow [this tutorial](https://youtu.be/Sxt11TAflV0?feature=shared) by [tigermeat](https://www.youtube.com/@spicytigermeat).

For general help training and creating a dataset, [this tutorial](https://docs.google.com/document/d/1uMsepxbdUW65PfIWL1pt2OM6ZKa5ybTTJOpZ733Ht6s/view) by [PixPrucer](https://bsky.app/profile/pixprucer.bsky.social) is an excellent guide. For help, join the [DiffSinger Discord server](https://discord.gg/DZ6fhEUfnb).

The dataset used for this project was built using [*Adventus: Dominica prima adventus Domini*](https://youtu.be/ThnPySybDJs?feature=shared), the first track from [Psallentes](https://psallentes.com/)' album *Salzinnes Saints*. Psallentes is a Belgian women's chorus that specializes in Late Medieval and Renaissance music. *Salzinnes Saints* is an album of music from the [Salzinnes Antiphonal](https://www.smu.ca/academics/archives/the-salzinnes-antiphonal.html), a mid-sixteenth century choirbook with the music and text for the Liturgy of the Hours.

---

# How to Use CantusSVS

## 1. Compose Your Music

Compose the chant you want to synthesize using the notation software of your choice. [MuseScore 4](https://musescore.org/en/download) is recommended.
The chant must adhere to the following conditions:

- Monophonic only (one note at a time, no harmonies or chords)
- Pitch range of <span class="tooltip">**D4 to D5**<span class="tooltiptext">Because training data was limited outside this range, synthesis outside these pitches is very poor.</span></span>
- Lyrics (Latin) under each note, separated by syllable

## 2. Export Your Score to MEI

When your score is complete, export it to MEI.

In MuseScore:            
- Go to **File → Export**
- Choose the `.mei` file format
- Save it to your computer

## 3. Upload Your Score to CantusSVS

In the CantusSVS web app:

- Select **MEI** mode
- Adjust the **tempo** if necessary using the provided slider
- Upload your `.mei` file
- Your score will be displayed using Verovio
- You may use the demo `.mei` file if you wish

## 4. Edit Phonemes, Durations, and Pitches

CantusSVS automatically suggests phoneme splits for each syllable.
However, you will have the opportunity to review phonemes, durations, and pitches.

## 5. Synthesize the Audio

When you're done:

- Click **Confirm**
- CantusSVS will create a `.ds` file, which is processed through pretrained DiffSinger models
- The synthesized chant will be generated

This can take a few minutes, depending on the length of the input.

## 6. Listen and Download

After synthesis you can either listen to your chant directly in the app or download a `.wav` file to your computer.

---
""", unsafe_allow_html=True)

# Attempts to attach hover handlers that show/hide a tooltip's first child.
# The selector looks for a <span> whose inline style contains
# "border-bottom: 1px dotted black"; the tooltip spans in the help text use
# class="tooltip" with no inline style, so the selector does not match them.
# NOTE(review): a <script> injected through st.markdown is presumably not
# executed by the browser at all — confirm whether this block has any effect.
st.markdown("""
<script>
const tooltipSpan = window.parent.document.querySelector('span[style*="border-bottom: 1px dotted black"]');
if (tooltipSpan) {
    tooltipSpan.addEventListener('mouseover', () => {
        tooltipSpan.children[0].style.visibility = 'visible';
        tooltipSpan.children[0].style.opacity = 1;
    });
    tooltipSpan.addEventListener('mouseout', () => {
        tooltipSpan.children[0].style.visibility = 'hidden';
        tooltipSpan.children[0].style.opacity = 0;
    });
}
</script>
""", unsafe_allow_html=True)

# Input mode: "MEI" runs the score -> phoneme editor -> synthesis flow,
# "DS" synthesizes directly from an uploaded .ds file.
filetype = st.selectbox("Select file type:", ["MEI", "DS"])

def handle_exception(context_message):
    """Report the exception currently being handled and halt the script run.

    Shows a short error in the UI, prints the context and the active
    traceback between two separator lines on the console, then calls
    ``st.stop()``. Intended to be called from inside an ``except`` block.

    Args:
        context_message: Short description of the step that failed.
    """
    separator = "=" * 30

    st.error(f"{context_message}. See console.")

    print("\n" + separator)
    print(f"Exception during {context_message}")
    traceback.print_exc()
    print(separator + "\n")

    st.stop()

if filetype == "MEI":
    # Step 1: pick the score (bundled demo or upload) and the tempo.
    # Produces `mei_path`, `mei_file_bytes` and `mei_text` for the steps below.
    st.header("1. Select MEI Source")
    use_demo = st.checkbox("Use demo MEI file", value=False)
    tempo = st.slider("Tempo (BPM)", 1, 300, 60)

    if use_demo:
        mei_path = DEMO_FILES / "Demo1.mei"
        if not mei_path.exists():
            st.error("Demo MEI file missing.")
            st.stop()
        mei_file_bytes = mei_path.read_bytes()
    else:
        mei_file = st.file_uploader("Upload your MEI file", type="mei")
        if not mei_file:
            # Nothing uploaded yet: end this run and wait for the user.
            st.stop()
        mei_file_bytes = mei_file.getvalue()
        # The name comes from the client. Keep only its final component so
        # the upload can never be written outside UPLOAD_MEI_DIR.
        mei_path = UPLOAD_MEI_DIR / Path(mei_file.name).name
        mei_path.write_bytes(mei_file_bytes)

    try:
        mei_text = mei_file_bytes.decode("utf-8")
    except UnicodeDecodeError:
        # Show a readable message instead of a raw traceback.
        st.error("The MEI file could not be read as UTF-8 text.")
        st.stop()

    # Parse the score into notes; each note is read below through its
    # "lyric", "duration" and "pitch" entries.
    try:
        raw_notes = parse_mei_for_editor(mei_path, tempo)
    except Exception:
        handle_exception("MEI parsing")

    # Always update session state
    st.session_state.original_raw_notes = raw_notes

    # The editor state must be rebuilt whenever a different score is loaded.
    # Otherwise the syllables of the previous file stay on screen and the
    # tempo logic further down indexes past the end of the stale list.
    source_changed = st.session_state.get("mei_source_bytes") != mei_file_bytes

    if source_changed or "edited_syllables" not in st.session_state:
        syllable_groups = [
            {
                "syllable": note["lyric"],
                "phonemes": split_syllable(
                    syllable=note["lyric"],
                    note_duration=note["duration"],
                    tempo=tempo,
                    pitch=note["pitch"],
                ),
            }
            for note in raw_notes
        ]

        # Editor widgets are keyed by position only, so values entered for
        # the previous score would otherwise be reused for the new one.
        editor_key_prefixes = ("phoneme_", "duration_num_", "pitch_")
        for widget_key in list(st.session_state.keys()):
            if isinstance(widget_key, str) and widget_key.startswith(editor_key_prefixes):
                del st.session_state[widget_key]

        st.session_state.edited_syllables = syllable_groups
        # The groups above were split at the current tempo, so no re-split
        # is pending.
        st.session_state.previous_tempo = tempo
        st.session_state.mei_source_bytes = mei_file_bytes

    st.subheader("Score Preview")
    # The MEI text is user-supplied. Embedding it raw inside a JS template
    # literal breaks (or injects code) as soon as it contains a backtick,
    # "${" or a backslash. A JSON string is a valid JS string literal;
    # escaping "<" additionally keeps sequences such as "</script>" or
    # "<!--" from being interpreted by the HTML parser.
    mei_js_literal = json.dumps(mei_text).replace("<", "\\u003c")
    components.html(
        f"""
        <div id="app" style="border: 1px solid lightgray; min-height: 400px;"></div>
        <script type="module">
          import 'https://editor.verovio.org/javascript/app/verovio-app.js';
          const app = new Verovio.App(
            document.getElementById("app"),
            {{defaultView: 'document', documentZoom: 4}}
          );
          app.loadData({mei_js_literal});
        </script>
        """,
        height=500,
    )

    st.header("2. Edit Phonemes, Durations, and Pitches")
    # Filled by the editor loop below and stored back into session state.
    updated_syllables = []

    # First run of the session: remember the tempo the syllables were split at.
    if "previous_tempo" not in st.session_state:
        st.session_state.previous_tempo = tempo

    if tempo != st.session_state.previous_tempo:
        # Tempo moved: split every note again at the new tempo, carrying the
        # user's phoneme and pitch choices over position by position.
        edited_groups = st.session_state.edited_syllables
        for note_idx, raw_note in enumerate(st.session_state.original_raw_notes):
            resplit = split_syllable(
                syllable=raw_note["lyric"],
                note_duration=raw_note["duration"],
                tempo=tempo,
                pitch=raw_note["pitch"],
            )
            for ph_idx, fresh in enumerate(resplit):
                try:
                    previous = edited_groups[note_idx]["phonemes"][ph_idx]
                    fresh["phoneme"] = previous["phoneme"]
                    fresh["pitch"] = previous["pitch"]
                except IndexError:
                    # No earlier entry at this position: keep the fresh split.
                    continue
            edited_groups[note_idx]["phonemes"] = resplit
        st.session_state.previous_tempo = tempo

    # `st.experimental_rerun` is deprecated in favour of `st.rerun`; use
    # whichever one the installed Streamlit provides.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun

    # One section per syllable, one row of widgets per phoneme. Widget values
    # are collected into `updated_syllables`, which replaces the stored state
    # once all rows are rendered.
    for idx, group in enumerate(st.session_state.edited_syllables):
        st.markdown(f"#### {group['syllable'].capitalize()}")
        new_phonemes = []
        for j, ph in enumerate(group["phonemes"]):
            col1, col2, col3, col4 = st.columns([3, 3, 3, 1])  # last column holds the delete button
            with col1:
                current_display = phoneme_display_map.get(ph["phoneme"], ph["phoneme"])
                phoneme_display = st.selectbox(
                    "Phoneme",
                    full_phoneme_list_display,
                    # Same fallback as the pitch box: an unknown phoneme
                    # selects the first option instead of raising ValueError.
                    index=(
                        full_phoneme_list_display.index(current_display)
                        if current_display in full_phoneme_list_display
                        else 0
                    ),
                    key=f"phoneme_{idx}_{j}"
                )
                phoneme_internal = display_to_phoneme.get(phoneme_display, phoneme_display)
            with col2:
                current_duration = float(ph["duration"])
                duration = st.number_input(
                    "Duration (seconds)",
                    min_value=0.0,
                    # A default above max_value is rejected by the widget, so
                    # widen the range for long notes (e.g. at slow tempi).
                    max_value=max(5.0, current_duration),
                    value=current_duration,
                    step=0.01, format="%.2f",
                    key=f"duration_num_{idx}_{j}"
                )
            with col3:
                pitch = st.selectbox(
                    "Pitch",
                    allowed_pitches,
                    index=allowed_pitches.index(ph["pitch"]) if ph["pitch"] in allowed_pitches else 0,
                    key=f"pitch_{idx}_{j}"
                )
            with col4:
                if st.button("❌", key=f"remove_{idx}_{j}"):
                    # Mutates the list held in session state; the rerun
                    # abandons this pass so iteration never continues over
                    # the modified list.
                    group["phonemes"].pop(j)
                    rerun()

            new_phonemes.append({"phoneme": phoneme_internal, "duration": duration, "pitch": pitch})

        if st.button("➕ Add Phoneme", key=f"add_phoneme_{idx}"):
            group["phonemes"].append({"phoneme": "a", "duration": 0.2, "pitch": "D4"})
            rerun()

        updated_syllables.append({"syllable": group["syllable"], "phonemes": new_phonemes})
        st.divider()

    st.session_state.edited_syllables = updated_syllables

    st.header("3. Synthesize")
    confirm_clicked = st.button("✅ Synthesize", key="confirm_button_mei")

    if confirm_clicked:
        # Flatten the edited syllables into one phoneme sequence, write it as
        # a .ds file, and validate that file before running the model.
        ds_path = TMP_DS_DIR / f"{mei_path.stem}.ds"
        try:
            all_phonemes = [ph for syllable in st.session_state.edited_syllables for ph in syllable["phonemes"]]
            build_ds_from_notes(all_phonemes, ds_path)
            with open(ds_path, "r", encoding="utf-8") as f:
                ds_data = json.load(f)
            validate_ds(ds_data)
            st.success(f"DS file created: {ds_path.name}")
        except Exception:
            handle_exception("DS generation or validation")

        with st.spinner("Running DiffSinger inference…"):
            try:
                wav_path = run_inference(ds_path, OUTPUT_DIR, mei_path.stem)
            except Exception:
                handle_exception("inference")

        st.success("Synthesis complete!")
        st.audio(str(wav_path))
        # Read the audio into memory so the file handle is closed right away
        # instead of being left open for the rest of the process.
        with open(wav_path, "rb") as wav_file:
            wav_bytes = wav_file.read()
        st.download_button("Download WAV", data=wav_bytes, file_name=wav_path.name, mime="audio/wav")

elif filetype == "DS":
    st.header("1. Upload DS File")
    ds_file = st.file_uploader("Upload your .ds file", type=["ds", "json"])

    st.header("2. Synthesize")
    synth_clicked = st.button("✅ Synthesize", key="synthesize_button_ds")

    if synth_clicked:
        if not ds_file:
            st.error("Please upload a .ds file.")
            st.stop()
        ds_path = UPLOAD_DS_DIR / ds_file.name
        with open(ds_path, "wb") as f:
            f.write(ds_file.getbuffer())
        with open(ds_path, "r", encoding="utf-8") as f:
            ds_data = json.load(f)

        try:
            validate_ds(ds_data)
        except Exception as e:
            st.error(f"Invalid DS file: {e}")
            st.stop()

        with st.spinner("Running DiffSinger inference…"):
            try:
                wav_path = run_inference(ds_path, OUTPUT_DIR, ds_path.stem)
            except Exception:
                handle_exception("inference")

        st.success("Synthesis complete!")
        st.audio(str(wav_path))
        st.download_button("Download WAV", data=open(wav_path, "rb"), file_name=wav_path.name)