mrfakename committed
Commit 7b01ab0 · verified · 1 Parent(s): abc80dc

Sync from GitHub repo

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo.

app.py CHANGED
@@ -3,6 +3,7 @@
 
 import gc
 import json
+import os
 import re
 import tempfile
 from collections import OrderedDict
@@ -41,6 +42,7 @@ from f5_tts.infer.utils_infer import (
     preprocess_ref_audio_text,
     remove_silence_for_generated_wav,
     save_spectrogram,
+    tempfile_kwargs,
 )
 from f5_tts.model import DiT, UNetT
 
@@ -189,16 +191,20 @@ def infer(
 
     # Remove silence
     if remove_silence:
-        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
-            sf.write(f.name, final_wave, final_sample_rate)
+        with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
+            temp_path = f.name
+        try:
+            sf.write(temp_path, final_wave, final_sample_rate)
             remove_silence_for_generated_wav(f.name)
             final_wave, _ = torchaudio.load(f.name)
+        finally:
+            os.unlink(temp_path)
         final_wave = final_wave.squeeze().cpu().numpy()
 
     # Save the spectrogram
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
+    with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
         spectrogram_path = tmp_spectrogram.name
-        save_spectrogram(combined_spectrogram, spectrogram_path)
+    save_spectrogram(combined_spectrogram, spectrogram_path)
 
     return (final_sample_rate, final_wave), spectrogram_path, ref_text, used_seed
 
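The change above works around a Windows limitation: while a NamedTemporaryFile created with the default delete=True is still open, other writers and readers (sf.write, torchaudio.load) cannot reopen it. Python 3.12's delete_on_close=False keeps automatic cleanup at context-manager exit while allowing the handle to be closed early; older interpreters fall back to delete=False plus a manual os.unlink. A minimal standalone sketch of the pattern — the payload and file use are illustrative, not F5-TTS code:

```python
import os
import sys
import tempfile

# The gate the diff adds to utils_infer.py: Python >= 3.12 can close the
# handle early and still have the placeholder removed at context exit
# (delete_on_close=False); older versions must use delete=False and
# clean up manually.
tempfile_kwargs = {"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}

# The with-block only reserves a unique path; the handle is closed at block
# exit, so on Windows other code may reopen the path afterwards.
with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
    temp_path = f.name

try:
    # Stand-in for sf.write(...) / remove_silence_for_generated_wav(...) in
    # the diff above: the path is (re)created by whoever writes to it next.
    with open(temp_path, "wb") as out:
        out.write(b"\x00" * 44)  # illustrative payload only
finally:
    os.unlink(temp_path)  # explicit cleanup covers both branches
```

In both branches the with-block serves only to reserve a unique path; the real file contents are written after the handle is closed.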
 
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "1.1.4"
+version = "1.1.5"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
src/f5_tts/infer/SHARED.md CHANGED
@@ -33,6 +33,8 @@
   - [F5-TTS Base @ ru @ HotDro4illa](#f5-tts-base--ru--hotdro4illa)
 - [Spanish](#spanish)
   - [F5-TTS Base @ es @ jpgallegoar](#f5-tts-base--es--jpgallegoar)
+- [German](#german)
+  - [F5-TTS Base @ de @ hvoss-techfak](#f5-tts-base--de--hvoss-techfak)
 
 
 ## Multilingual
@@ -173,3 +175,12 @@ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "
 |F5-TTS Base|[ckpt & vocab](https://huggingface.co/jpgallegoar/F5-Spanish)|[Voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) & Crowdsourced & TEDx, 218 hours|cc0-1.0|
 
 - @jpgallegoar [GitHub repo](https://github.com/jpgallegoar/Spanish-F5), Jupyter Notebook and Gradio usage for Spanish model.
+
+## German
+
+#### F5-TTS Base @ de @ hvoss-techfak
+|Model|🤗Hugging Face|Data (Hours)|Model License|
+|:---:|:------------:|:-----------:|:-------------:|
+|F5-TTS Base|[ckpt & vocab](https://huggingface.co/hvoss-techfak/F5-TTS-German)|[Mozilla Common Voice 19.0](https://commonvoice.mozilla.org/en/datasets) & 800 hours Crowdsourced|cc-by-nc-4.0|
+
+- Finetuned by [@hvoss-techfak](https://github.com/hvoss-techfak)
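To try the new German entry, the checkpoint and vocab can be pulled with huggingface_hub and handed to the usual F5-TTS loading utilities. A hedged sketch: the filenames below are assumptions about the repo layout — only the repo id hvoss-techfak/F5-TTS-German is confirmed by this diff.

```python
from huggingface_hub import hf_hub_download

# Repo id comes from the SHARED.md entry above; the filenames are assumed
# and should be checked against the repo's file listing.
ckpt_path = hf_hub_download(repo_id="hvoss-techfak/F5-TTS-German", filename="model.ckpt")
vocab_path = hf_hub_download(repo_id="hvoss-techfak/F5-TTS-German", filename="vocab.txt")

# ckpt_path / vocab_path then feed into the standard F5-TTS inference stack
# (e.g. the DiT config and load utilities imported in app.py above).
print(ckpt_path, vocab_path)
```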
src/f5_tts/infer/utils_infer.py CHANGED
@@ -45,6 +45,8 @@ device = (
     else "cpu"
 )
 
+tempfile_kwargs = {"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}
+
 # -----------------------------------------
 
 target_sample_rate = 24000
@@ -306,42 +308,44 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
         ref_audio = _ref_audio_cache[audio_hash]
 
     else:  # first pass, do preprocess
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-            aseg = AudioSegment.from_file(ref_audio_orig)
-
-            # 1. try to find long silence for clipping
-            non_silent_segs = silence.split_on_silence(
-                aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
-            )
-            non_silent_wave = AudioSegment.silent(duration=0)
-            for non_silent_seg in non_silent_segs:
-                if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                    show_info("Audio is over 12s, clipping short. (1)")
-                    break
-                non_silent_wave += non_silent_seg
-
-            # 2. try to find short silence for clipping if 1. failed
-            if len(non_silent_wave) > 12000:
-                non_silent_segs = silence.split_on_silence(
-                    aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
-                )
-                non_silent_wave = AudioSegment.silent(duration=0)
-                for non_silent_seg in non_silent_segs:
-                    if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
-                        show_info("Audio is over 12s, clipping short. (2)")
-                        break
-                    non_silent_wave += non_silent_seg
-
-            aseg = non_silent_wave
-
-            # 3. if no proper silence found for clipping
-            if len(aseg) > 12000:
-                aseg = aseg[:12000]
-                show_info("Audio is over 12s, clipping short. (3)")
-
-            aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
-            aseg.export(f.name, format="wav")
-            ref_audio = f.name
+        with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
+            temp_path = f.name
+
+        aseg = AudioSegment.from_file(ref_audio_orig)
+
+        # 1. try to find long silence for clipping
+        non_silent_segs = silence.split_on_silence(
+            aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
+        )
+        non_silent_wave = AudioSegment.silent(duration=0)
+        for non_silent_seg in non_silent_segs:
+            if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
+                show_info("Audio is over 12s, clipping short. (1)")
+                break
+            non_silent_wave += non_silent_seg
+
+        # 2. try to find short silence for clipping if 1. failed
+        if len(non_silent_wave) > 12000:
+            non_silent_segs = silence.split_on_silence(
+                aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
+            )
+            non_silent_wave = AudioSegment.silent(duration=0)
+            for non_silent_seg in non_silent_segs:
+                if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
+                    show_info("Audio is over 12s, clipping short. (2)")
+                    break
+                non_silent_wave += non_silent_seg
+
+        aseg = non_silent_wave
+
+        # 3. if no proper silence found for clipping
+        if len(aseg) > 12000:
+            aseg = aseg[:12000]
+            show_info("Audio is over 12s, clipping short. (3)")
+
+        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
+        aseg.export(temp_path, format="wav")
+        ref_audio = temp_path
 
         # Cache the processed reference audio
         _ref_audio_cache[audio_hash] = ref_audio
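Functionally, the refactored block is unchanged: it is a three-stage pydub clipping strategy that keeps the reference audio under 12 s. A condensed, self-contained sketch of the same logic — the clip_to_12s/rebuild helpers are a restatement for clarity, not the repo's code:

```python
from pydub import AudioSegment, silence

def clip_to_12s(path: str) -> AudioSegment:
    """Two-pass silence-aware clipping, mirroring the refactored block above."""
    aseg = AudioSegment.from_file(path)

    def rebuild(segs):
        # Concatenate segments, stopping once at least 6 s is accumulated
        # and adding the next segment would push past 12 s.
        wave = AudioSegment.silent(duration=0)
        for seg in segs:
            if len(wave) > 6000 and len(wave + seg) > 12000:
                break
            wave += seg
        return wave

    # Pass 1: split on long (>= 1 s) silences at -50 dBFS.
    wave = rebuild(
        silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10)
    )
    # Pass 2: if still over 12 s, retry with short (>= 100 ms) silences at -40 dBFS.
    if len(wave) > 12000:
        wave = rebuild(
            silence.split_on_silence(aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10)
        )
    # Pass 3: hard cut when no usable silence was found.
    return wave[:12000] if len(wave) > 12000 else wave
```

The thresholds mirror the two passes in the diff (1 s silences at -50 dBFS, then 100 ms silences at -40 dBFS), with the plain slice as the fallback when neither pass finds usable silence.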