Spaces:
Sleeping
Sleeping
Liam Pond
committed on
Commit
·
9062c1f
1
Parent(s):
c79c7f8
smarter phonemizer
Browse files- app.py +69 -66
- webapp/services/defaults/default_splitter.py +65 -0
app.py
CHANGED
@@ -10,6 +10,8 @@ import requests
|
|
10 |
import zipfile
|
11 |
import streamlit.components.v1 as components
|
12 |
from pathlib import Path
|
|
|
|
|
13 |
|
14 |
def patch_config_yaml_files():
|
15 |
root = "/tmp/cantussvs_v1"
|
@@ -174,87 +176,87 @@ full_phoneme_list_display = [phoneme_display_map.get(p, p) for p in permitted_ph
|
|
174 |
# Pitch list D4-D5
|
175 |
allowed_pitches = ["D4", "D#4", "E4", "F4", "F#4", "G4", "G#4", "A4", "A#4", "B4", "C5", "C#5", "D5"]
|
176 |
|
177 |
-
#
|
178 |
-
|
179 |
|
180 |
-
|
181 |
-
#
|
182 |
|
183 |
-
|
184 |
-
|
185 |
|
186 |
-
|
187 |
-
|
188 |
|
189 |
-
|
190 |
|
191 |
-
#
|
192 |
|
193 |
-
|
194 |
|
195 |
-
|
196 |
-
|
197 |
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
|
202 |
-
|
203 |
|
204 |
-
|
205 |
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
|
211 |
-
|
212 |
|
213 |
-
|
214 |
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
|
221 |
-
|
222 |
|
223 |
-
|
224 |
-
|
225 |
|
226 |
-
|
227 |
|
228 |
-
|
229 |
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
|
234 |
-
|
235 |
|
236 |
-
|
237 |
|
238 |
-
|
239 |
|
240 |
-
|
241 |
-
|
242 |
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
|
259 |
filetype = st.selectbox("Select file type:", ["MEI", "DS"])
|
260 |
|
@@ -301,15 +303,16 @@ if filetype == "MEI":
|
|
301 |
for note in st.session_state.original_raw_notes:
|
302 |
syllable_text = note["lyric"]
|
303 |
pitch = note["pitch"]
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
|
|
313 |
|
314 |
if "edited_syllables" not in st.session_state:
|
315 |
st.session_state.edited_syllables = syllable_groups
|
|
|
10 |
import zipfile
|
11 |
import streamlit.components.v1 as components
|
12 |
from pathlib import Path
|
13 |
+
from webapp.services.defaults.default_splitter import split_syllable
|
14 |
+
|
15 |
|
16 |
def patch_config_yaml_files():
|
17 |
root = "/tmp/cantussvs_v1"
|
|
|
176 |
# Pitch list D4-D5
|
177 |
allowed_pitches = ["D4", "D#4", "E4", "F4", "F#4", "G4", "G#4", "A4", "A#4", "B4", "C5", "C#5", "D5"]
|
178 |
|
179 |
+
# Title
|
180 |
+
st.title("CantusSVS: Latin Singing Voice Synthesis")
|
181 |
|
182 |
+
st.markdown("""
|
183 |
+
# About CantusSVS
|
184 |
|
185 |
+
<p>CantusSVS is a web-based Singing Voice Synthesis (SVS) system designed for composers and musicians to synthesize Latin chant audio from a custom musical score.
|
186 |
+
Built on top of the DiffSinger AI model, CantusSVS enables detailed, precise control over melody, rhythm, phonemes, and timing without any programming knowledge required.</p>
|
187 |
|
188 |
+
<p>Designed by Liam Pond as the final project for MUS6329X: Projet en informatique musicale (Prof. Dominic Thibault) at the Université de Montréal.
|
189 |
+
You can view this project's GitHub repository [here](https://github.com/liampond/CantusSVS).</p>
|
190 |
|
191 |
+
---
|
192 |
|
193 |
+
# How to Use CantusSVS
|
194 |
|
195 |
+
## 1. Compose Your Music
|
196 |
|
197 |
+
Compose the chant you want to synthesize using the notation software of your choice. [MuseScore 4](https://musescore.org/en/download) is recommended.
|
198 |
+
The chant must adhere to the following conditions:
|
199 |
|
200 |
+
- Monophonic only (one note at a time, no harmonies or chords)
|
201 |
+
- Pitch range of <span class="tooltip">**D4 to D5**<span class="tooltiptext">Because training data was limited outside this range, synthesis outside these pitches is very poor.</span></span>
|
202 |
+
- Lyrics (Latin) under each note, separated by syllable
|
203 |
|
204 |
+
## 2. Export Your Score to MEI
|
205 |
|
206 |
+
When your score is complete, export it to MEI.
|
207 |
|
208 |
+
In MuseScore:
|
209 |
+
- Go to **File → Export**
|
210 |
+
- Choose the `.mei` file format
|
211 |
+
- Save it to your computer
|
212 |
|
213 |
+
## 3. Upload Your Score to CantusSVS
|
214 |
|
215 |
+
In the CantusSVS web app:
|
216 |
|
217 |
+
- Select **MEI** mode
|
218 |
+
- Adjust the **tempo** if necessary using the provided slider
|
219 |
+
- Upload your `.mei` file
|
220 |
+
- Your score will be displayed using Verovio
|
221 |
+
- You may use the demo `.mei` file if you wish
|
222 |
|
223 |
+
## 4. Edit Phonemes, Durations, and Pitches
|
224 |
|
225 |
+
CantusSVS automatically suggests phoneme splits for each syllable.
|
226 |
+
However, you will have the opportunity to review phonemes, durations, and pitches.
|
227 |
|
228 |
+
## 5. Synthesize the Audio
|
229 |
|
230 |
+
When you're done:
|
231 |
|
232 |
+
- Click **Confirm**
|
233 |
+
- CantusSVS will create a `.ds` file which are processed through pretrained DiffSinger models
|
234 |
+
- The synthesized chant will be generated
|
235 |
|
236 |
+
This can take a few minutes depending on input length
|
237 |
|
238 |
+
## 6. Listen and Download
|
239 |
|
240 |
+
After synthesis you can either listen to your chant directly in the app or download a `.wav` file to your computer.
|
241 |
|
242 |
+
---
|
243 |
+
""", unsafe_allow_html=True)
|
244 |
|
245 |
+
st.markdown("""
|
246 |
+
<script>
|
247 |
+
const tooltipSpan = window.parent.document.querySelector('span[style*="border-bottom: 1px dotted black"]');
|
248 |
+
if (tooltipSpan) {
|
249 |
+
tooltipSpan.addEventListener('mouseover', () => {
|
250 |
+
tooltipSpan.children[0].style.visibility = 'visible';
|
251 |
+
tooltipSpan.children[0].style.opacity = 1;
|
252 |
+
});
|
253 |
+
tooltipSpan.addEventListener('mouseout', () => {
|
254 |
+
tooltipSpan.children[0].style.visibility = 'hidden';
|
255 |
+
tooltipSpan.children[0].style.opacity = 0;
|
256 |
+
});
|
257 |
+
}
|
258 |
+
</script>
|
259 |
+
""", unsafe_allow_html=True)
|
260 |
|
261 |
filetype = st.selectbox("Select file type:", ["MEI", "DS"])
|
262 |
|
|
|
303 |
for note in st.session_state.original_raw_notes:
|
304 |
syllable_text = note["lyric"]
|
305 |
pitch = note["pitch"]
|
306 |
+
syllable = split_syllable(
|
307 |
+
syllable=syllable_text,
|
308 |
+
note_duration=note["duration"],
|
309 |
+
tempo=tempo,
|
310 |
+
pitch=pitch
|
311 |
+
)
|
312 |
+
syllable_groups.append({
|
313 |
+
"syllable": syllable_text,
|
314 |
+
"phonemes": syllable
|
315 |
+
})
|
316 |
|
317 |
if "edited_syllables" not in st.session_state:
|
318 |
st.session_state.edited_syllables = syllable_groups
|
webapp/services/defaults/default_splitter.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# webapp/services/defaults/default_splitter.py
|
2 |
+
|
3 |
+
from webapp.services.phonemes.phoneme_dict import PHONEMES
|
4 |
+
|
5 |
+
|
6 |
+
from webapp.services.phonemes.phoneme_dict import PHONEMES
|
7 |
+
|
8 |
+
# Treat the list as a set for fast lookup
|
9 |
+
PHONEME_SET = set(PHONEMES)
|
10 |
+
|
11 |
+
from webapp.services.phonemes.phoneme_dict import PHONEMES
|
12 |
+
|
13 |
+
PHONEME_SET = set(PHONEMES)
|
14 |
+
|
15 |
+
def _apply_brightness_overrides(syllable: str, phonemes: list[str]) -> list[str]:
|
16 |
+
# Rule: if the syllable is 'ecce', override the final vowel to 'ay'
|
17 |
+
if syllable == "ecce" and phonemes and phonemes[-1] in {"e", "eh", "ae"}:
|
18 |
+
phonemes[-1] = "ay"
|
19 |
+
return phonemes
|
20 |
+
|
21 |
+
def latin_phoneme_split(syllable: str) -> list[str]:
    """Break a syllable into phoneme symbols drawn from ``PHONEME_SET``.

    The syllable is lower-cased, then split using two strategies in order:

    1. If every character is itself a member of ``PHONEME_SET``, the
       syllable is split into its individual characters.
    2. Otherwise the text is scanned left to right, taking a two-character
       phoneme where one matches, else a one-character phoneme, else
       substituting ``"a"`` for the unmatched character.

    Whichever strategy is used, the result is passed through
    ``_apply_brightness_overrides`` (with the lower-cased syllable) before
    being returned.
    """
    text = syllable.lower()

    # Strategy 1: the whole syllable consists of single-character phonemes.
    if all(char in PHONEME_SET for char in text):
        return _apply_brightness_overrides(text, list(text))

    # Strategy 2: greedy scan, two characters first, then one, then fallback.
    pieces = []
    pos = 0
    end = len(text)
    while pos < end:
        pair = text[pos:pos + 2]
        if len(pair) == 2 and pair in PHONEME_SET:
            pieces.append(pair)
            pos += 2
            continue

        single = text[pos]
        # Characters that match no phoneme are replaced by the fallback "a".
        pieces.append(single if single in PHONEME_SET else "a")
        pos += 1

    return _apply_brightness_overrides(text, pieces)
|
53 |
+
|
54 |
+
|
55 |
+
def split_syllable(syllable: str, note_duration: float, tempo: float, pitch: str) -> list[dict]:
    """Split a syllable into per-phoneme entries sharing one note's duration.

    The syllable is converted to phonemes with ``latin_phoneme_split``. Each
    phoneme receives the same duration, computed as
    ``(note_duration / phoneme_count) * (60 / tempo)`` and floored at 0.05.

    Args:
        syllable: Lyric text for a single note.
        note_duration: Length of the note; presumably in beats, so that the
            result is in seconds -- TODO confirm against the caller.
        tempo: Tempo used to scale the duration; presumably beats per
            minute -- TODO confirm. Must be non-zero.
        pitch: Pitch label copied unchanged into every returned entry.

    Returns:
        One dict per phoneme with the keys ``"phoneme"``, ``"duration"`` and
        ``"pitch"``. An empty list is returned when the syllable yields no
        phonemes (for example an empty lyric).
    """
    phonemes = latin_phoneme_split(syllable)
    if not phonemes:
        # An empty lyric produces no phonemes; without this guard the
        # division by len(phonemes) below raises ZeroDivisionError.
        return []

    duration_per = max(0.05, (note_duration / len(phonemes)) * (60 / tempo))
    return [
        {
            # Any symbol outside PHONEME_SET is replaced by the fallback "a".
            "phoneme": ph if ph in PHONEME_SET else "a",
            "duration": duration_per,
            "pitch": pitch,
        }
        for ph in phonemes
    ]
|