# -*- coding: utf-8 -*-
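'''Flask TTS server: synthesises speech with StyleTTS2 (English) or MMS TTS
(other languages), optionally mixes an AudioGen soundscape underneath, and
either returns plain audio or muxes the speech onto an uploaded image or
video (including .srt-driven video dubbing).'''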
import numpy as np
import soundfile
from Utils.text_utils import split_into_sentences
import msinference
import re
import srt
import time
import subprocess
import cv2
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.VideoClip import ImageClip
from audiocraft.builders import AudioGen
CACHE_DIR = 'flask_cache/'
sound_generator = AudioGen().to('cuda:0').eval() # duration chosen in generate()
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
def resize_with_white_padding(image):
"""
Resizes an image to 1920x1080 while preserving aspect ratio
by adding white padding.
Args:
image (np.ndarray): The input image as a NumPy array.
Returns:
np.ndarray: The resized image with white padding.
"""
h, w = image.shape[:2]
target_h, target_w = 1080, 1920
aspect_ratio = w / h
target_aspect_ratio = target_w / target_h
if aspect_ratio > target_aspect_ratio:
# Image is wider than the target, pad top and bottom
new_w = target_w
new_h = int(new_w / aspect_ratio)
resized_image = cv2.resize(
image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
padding_h = target_h - new_h
top_padding = padding_h // 2
bottom_padding = padding_h - top_padding
padding = [(top_padding, bottom_padding), (0, 0)]
if len(image.shape) == 3:
            padding.append((0, 0))  # zero padding along the colour-channel axis
padded_image = np.pad(resized_image, padding,
mode='constant', constant_values=255)
elif aspect_ratio < target_aspect_ratio:
# Image is taller than the target, pad left and right
new_h = target_h
new_w = int(new_h * aspect_ratio)
resized_image = cv2.resize(
image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
padding_w = target_w - new_w
left_padding = padding_w // 2
right_padding = padding_w - left_padding
padding = [(0, 0), (left_padding, right_padding)]
if len(image.shape) == 3:
            padding.append((0, 0))  # zero padding along the colour-channel axis
padded_image = np.pad(resized_image, padding,
mode='constant', constant_values=255)
else:
# Aspect ratio matches the target, just resize
padded_image = cv2.resize(
image, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)
    return padded_image
def _shorten(filename):
return filename.replace("/", "")[-6:]
def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
'''https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
# initialize the dimensions of the image to be resized and
# grab the image size
dim = None
(h, w) = image.shape[:2]
# if both the width and height are None, then return the
# original image
if width is None and height is None:
return image
# check to see if the width is None
if width is None:
# calculate the ratio of the height and construct the
# dimensions
r = height / float(h)
dim = (int(w * r), height)
# otherwise, the height is None
else:
# calculate the ratio of the width and construct the
# dimensions
r = width / float(w)
dim = (width, int(h * r))
# resize the image
resized = cv2.resize(image, dim, interpolation=inter)
# return the resized image
return resized
def overlay(x, soundscape=None):
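    '''Mix an AudioGen soundscape under the speech.

    x          : np.array of speech samples
    soundscape : text prompt for AudioGen, e.g. 'A castle in far away lands';
                 if None, x is returned unchanged.  The output is
                 .6 * speech + .4 * background, with the background trimmed to len(x).
    '''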
if soundscape is not None:
background = sound_generator.generate(soundscape,
duration=len(x)/16000 + .74, # duration seconds
).detach().cpu().numpy()
x = .6 * x + .4 * background[:len(x)]
return x
def tts_multi_sentence(precomputed_style_vector=None,
text=None,
voice=None,
soundscape=None,
speed=None):
    '''Create a 24 kHz np.array with TTS.

    precomputed_style_vector : required if en_US or en_UK is in voice,
                               to perform affective (StyleTTS2) TTS.
    text       : str or list of str
    voice      : str or None (falls back to StyleTTS2)
    soundscape : e.g. 'A castle in far away lands'; if passed, a background
                 soundscape is generated and mixed under the speech.
    speed      : passed to MMS TTS (non-English) only.
    '''
# StyleTTS2 - English
if precomputed_style_vector is not None:
x = []
if not isinstance(text, list):
text = split_into_sentences(text) # Avoid OOM in StyleTTS2
for _sentence in text:
            # StyleTTS2 - pronunciation Fx
# .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
_sentence = _sentence.lower()
if 'vctk_low#p326' in voice:
# fix sounding of sleepy AAABS TRAACT
_sentence = _sentence.replace(
'abstract', 'ahbstract') # 'ahstract'
x.append(msinference.inference(_sentence,
precomputed_style_vector)
)
x = np.concatenate(x)
# Fallback - MMS TTS - Non-English
else:
        # don't split foreign sentences: avoids speaker-change issues
x = msinference.foreign(text=text,
lang=voice, # voice = 'romanian', 'serbian' 'hungarian'
speed=speed) # normalisation externally
# volume
    x /= 1.02 * np.abs(x).max() + 1e-7  # normalise speech close to full [-1, 1]; the soundscape added in overlay() is not normalised
return overlay(x, soundscape=soundscape)
# voices = {}
# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
app = Flask(__name__)
@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
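    '''Single endpoint: synthesise speech for an uploaded text / .srt file.

    Expects multipart form data with a 'text' field (.txt or .srt filename) plus the
    matching uploaded file, optional 'video' / 'image' / 'native' files, and
    'voice' / 'affective' / 'soundscape' fields.  Uploads are cached under CACHE_DIR;
    the response body is the synthesised tmp.wav, or tmp.mp4 when a video or image
    was given, with the output name echoed in the 'suffix-file-type' header.
    '''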
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-object-into-a-representation-suitable-for-mongodb
r = request.form.to_dict(flat=False)
# Physically Save Client Files
for filename, obj in request.files.items():
obj.save(f'{CACHE_DIR}{_shorten(filename)}')
print('Saved all files on Server Side\n\n')
args = SimpleNamespace(
# crop last letters from original filename & use as tmp
text=None if r.get('text') is None else CACHE_DIR +
_shorten(r.get('text')[0]),
video=None if r.get('video') is None else CACHE_DIR +
_shorten(r.get('video')[0]),
image=None if r.get('image') is None else CACHE_DIR +
_shorten(r.get('image')[0]),
native=None if r.get('native') is None else CACHE_DIR +
_shorten(r.get('native')[0]),
affective=r.get('affective')[0],
voice=r.get('voice')[0],
speed=None, # obsolete due to oscillating MMS TTS VITS duration per language
soundscape=r.get('soundscape')[0] if r.get(
'soundscape') is not None else None,
)
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
print(args, 'ENTER Script')
    do_video_dub = args.text.endswith('.srt')
SILENT_VIDEO = '_silent_video.mp4'
AUDIO_TRACK = '_audio_track.wav'
if do_video_dub:
        print(
            f'==\nFound .srt : {args.text}, thus a video must be provided as well\n\n')
with open(args.text, "r") as f:
s = f.read()
text = [[j.content, j.start.total_seconds(), j.end.total_seconds()]
for j in srt.parse(s)]
assert args.video is not None
native_audio_file = '_tmp.wav'
subprocess.run(
["ffmpeg",
"-y", # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
"-i",
args.video,
"-f",
"mp3",
"-ar",
"16000", # "22050 for mimic3",
"-vn",
native_audio_file])
x_native, _ = soundfile.read(native_audio_file) # reads mp3
# stereo in video
if x_native.ndim > 1:
x_native = x_native[:, 0] # stereo
# ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wa
else:
with open(args.text, 'r') as f:
text = ''.join(f)
# delete spaces / split in list in tts_multi_sentence()
text = re.sub(' +', ' ', text)
# == STYLE VECTOR ==
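    # Priority: clone the uploaded --native audio; if that fails and this is a video
    # dub, clone from the video's own audio track; otherwise load a precomputed
    # style vector for en_US / en_UK / foreign-accent voices, or leave it None so
    # tts_multi_sentence() falls back to MMS TTS.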
precomputed_style_vector = None
if args.native: # Voice Cloning
try:
precomputed_style_vector = msinference.compute_style(args.native)
except soundfile.LibsndfileError: # Fallback - internal voice
print('\n Could not voice clone audio:', args.native,
'fallback to video or Internal TTS voice.\n')
if do_video_dub: # Clone voice via Video
native_audio_file = args.video.replace('.', '').replace('/', '')
native_audio_file += '__native_audio_track.wav'
soundfile.write('tgt_spk.wav',
np.concatenate([
x_native[:int(4 * 16000)]], 0).astype(np.float32), 16000) # 27400?
precomputed_style_vector = msinference.compute_style('tgt_spk.wav')
# NOTE: style vector is normally None here - except if --native arg was passed
# Native English Accent TTS
if precomputed_style_vector is None:
if 'en_US' in args.voice or 'en_UK' in args.voice:
_dir = '/' if args.affective else '_v2/'
precomputed_style_vector = msinference.compute_style(
'assets/wavs/style_vector' + _dir + args.voice.replace(
'/', '_').replace(
'#', '_').replace(
'cmu-arctic', 'cmu_arctic').replace(
'_low', '') + '.wav')
# Non-Native English Accent TTS
elif '_' in args.voice:
precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
'/', '_').replace('#', '_').replace(
'cmu-arctic', 'cmu_arctic').replace(
'_low', '') + '.wav')
# Foreign Lang
else:
print(f'\n\n\n\n\n FallBack to MMS TTS due to: {args.voice=}')
# NOTE : precomputed_style_vector is still None if MMS TTS
# == SILENT VIDEO ==
if args.video is not None:
        # banner - precomputed at 1920 px width
frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
font = cv2.FONT_HERSHEY_SIMPLEX
bottomLeftCornerOfText = (240, 74) # w,h
fontScale = 2
fontColor = (255, 255, 255)
thickness = 4
lineType = 2
cv2.putText(frame_tts, 'TTS',
bottomLeftCornerOfText,
font,
fontScale,
fontColor,
thickness,
lineType)
# cv2.imshow('i', frame_tts); cv2.waitKey(); cv2.destroyAllWindows()
# ====================================== NATIVE VOICE
frame_orig = np.zeros((104, 1920, 3), dtype=np.uint8)
font = cv2.FONT_HERSHEY_SIMPLEX
bottomLeftCornerOfText = (101, 74) # w,h
fontScale = 2
fontColor = (255, 255, 255)
thickness = 4
        lineType = 2
cv2.putText(frame_orig, 'ORIGINAL VOICE',
bottomLeftCornerOfText,
font,
fontScale,
fontColor,
thickness,
lineType)
print(f'\n______________________________\n'
f'Gen Banners for TTS/Native Title {frame_tts.shape=} {frame_orig.shape=}'
f'\n______________________________\n')
# ====SILENT VIDEO EXTRACT====
        # DOWNLOAD SRT from youtube
#
# yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
#
#
# .mkv ->.mp4 moviepy loads only .mp4
#
# ffmpeg -y -i Distaff\ \[qVonBgRXcWU\].mkv -c copy -c:a aac Distaff_qVonBgRXcWU.mp4
# video_file, srt_file = ['assets/Head_of_fortuna.mp4',
# 'assets/head_of_fortuna_en.srt']
#
video_file = args.video
vf = VideoFileClip(video_file)
# GET 1st FRAME to OBTAIN frame RESOLUTION
h, w, _ = vf.get_frame(0).shape
frame_tts = _resize(frame_tts, width=w)
frame_orig = _resize(frame_orig, width=w)
h, w, _ = frame_orig.shape
try:
# inpaint banner to say if native voice
num = x_native.shape[0]
# fade heaviside
is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4))
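            # is_tts is a smooth 0..1 mask over the native track's samples; inpaint_banner()
            # reads it at index t * 16000 to pick the 'TTS' or 'ORIGINAL VOICE' banner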
def inpaint_banner(get_frame, t):
                '''blend banner - (now playing) tts or native voice
                '''
im = np.copy(get_frame(t)) # pic
ix = int(t * 16000) # ix may overflow the is_tts.shape
if ix < num:
if is_tts[ix] > .5: # mask == 1 => tts / mask == 0 -> native
                        # assign to a new local name (frame); mutating frame_tts here
                        # would make Python treat it as a local of this closure and
                        # raise "referenced before assignment"
                        frame = frame_tts
else:
frame = frame_orig
                # for ix past the end of the native track assume the tts banner
else:
frame = frame_tts
# im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)
offset_h = 24
print(
f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')
im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
+ .6 * frame).astype(np.uint8)
# im2 = np.concatenate([im, frame_tts], 0)
# cv2.imshow('t', im2); cv2.waitKey(); cv2.destroyAllWindows()
                return im
        except UnboundLocalError:  # x_native never set (no .srt video dub)
            def inpaint_banner(get_frame, t):
                '''blend the 'TTS' banner into every frame'''
                im = np.copy(get_frame(t))
                local_frame = frame_tts  # banner
                if local_frame.shape[1] != im.shape[1]:  # resize banner to fit video width
                    local_frame = _resize(frame_tts, width=im.shape[1])
                h, w, _ = local_frame.shape
                offset_h = 24
                im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                                    + .6 * local_frame).astype(np.uint8)
                return im
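        # apply the banner blend to every frame (moviepy per-frame filter) and
        # write the intermediate video; audio is muxed in later by ffmpeg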
vf = vf.fl(inpaint_banner)
vf.write_videofile(SILENT_VIDEO)
# ==== TTS .srt ====
if do_video_dub:
OUT_FILE = 'tmp.mp4' # args.out_file + '_video_dub.mp4'
subtitles = text
MAX_LEN = int(subtitles[-1][2] + 17) * 16000
# 17 extra seconds fail-safe for long-last-segment
print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')
pieces = []
for k, (_text_, orig_start, orig_end) in enumerate(subtitles):
pieces.append(tts_multi_sentence(text=_text_,
precomputed_style_vector=precomputed_style_vector,
voice=args.voice,
soundscape=args.soundscape,
speed=args.speed)
)
total = np.concatenate(pieces, 0)
# x = audresample.resample(x.astype(np.float32), 24000, 22050) # reshapes (64,) -> (1,64)
# PAD SHORTEST of TTS / NATIVE
if len(x_native) > len(total):
total = np.pad(
total, (0, max(0, x_native.shape[0] - total.shape[0])))
else: # pad native to len of is_tts & total
x_native = np.pad(
x_native, (0, max(0, total.shape[0] - x_native.shape[0])))
# print(total.shape, x_native.shape, 'PADDED TRACKS')
soundfile.write(AUDIO_TRACK,
# (is_tts * total + (1-is_tts) * x_native)[:, None],
(.64 * total + .27 * x_native)[:, None],
16000)
else: # Video from plain (.txt)
OUT_FILE = 'tmp.mp4'
x = tts_multi_sentence(text=text,
precomputed_style_vector=precomputed_style_vector,
voice=args.voice,
soundscape=args.soundscape,
speed=args.speed)
soundfile.write(AUDIO_TRACK, x, 16000)
# IMAGE 2 SPEECH
if args.image is not None:
        # Resize input image to 1920x1080 - the .mp4 may not display for other aspect ratios
STATIC_FRAME = args.image + '.jpg' # 'assets/image_from_T31.jpg'
cv2.imwrite(
STATIC_FRAME,
resize_with_white_padding(cv2.imread(args.image)
))
OUT_FILE = 'tmp.mp4' # args.out_file + '_image_to_speech.mp4'
# SILENT CLIP
clip_silent = ImageClip(img=STATIC_FRAME,
duration=5) # ffmpeg continues this silent video for duration of TTS
clip_silent.write_videofile(SILENT_VIDEO, fps=24)
x = tts_multi_sentence(text=text,
precomputed_style_vector=precomputed_style_vector,
voice=args.voice,
soundscape=args.soundscape,
speed=args.speed
)
soundfile.write(AUDIO_TRACK, x, 16000)
if args.video or args.image:
# write final output video
        subprocess.run(
            ["ffmpeg",
             "-y",
             "-i",
             SILENT_VIDEO,
             "-i",
             AUDIO_TRACK,
             "-c:v",
             "copy",
             "-map",
             "0:v:0",
             "-map",
             "1:a:0",
             CACHE_DIR + OUT_FILE])
        print(f'\noutput video is saved as {CACHE_DIR + OUT_FILE}')
else:
# Fallback: No image nor video provided - do only tts
x = tts_multi_sentence(text=text,
precomputed_style_vector=precomputed_style_vector,
voice=args.voice,
soundscape=args.soundscape,
speed=args.speed)
OUT_FILE = 'tmp.wav'
soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)
# audios = [msinference.inference(text,
# msinference.compute_style(f'voices/{voice}.wav'))]
# # for t in [text]:
# output_buffer = io.BytesIO()
# write(output_buffer, 24000, np.concatenate(audios))
# response = Response(output_buffer.getvalue())
# response.headers["Content-Type"] = "audio/wav"
# https://stackoverflow.com/questions/67591467/
# flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
# time.sleep(4)
# send server's output as default file -> srv_result.xx
print(f'\n=SERVER saved as {OUT_FILE=}\n')
response = send_from_directory(CACHE_DIR, path=OUT_FILE)
response.headers['suffix-file-type'] = OUT_FILE
print('________________\n ? \n_______________')
return response
if __name__ == "__main__":
app.run(host="0.0.0.0")
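# A minimal client sketch (an illustration, not part of the server): assumes the
# server runs locally on Flask's default port 5000, that the `requests` package is
# installed, that 'sample.txt' exists, and that the voice name shown is available.
#
# import requests
# resp = requests.post('http://127.0.0.1:5000/',
#                      data={'text': 'sample.txt',           # form field: name of the uploaded text file
#                            'voice': 'en_US/vctk_low#p326',
#                            'affective': 'True',
#                            'soundscape': 'wind and rain'},  # optional
#                      files={'sample.txt': open('sample.txt', 'rb')})
# suffix = resp.headers['suffix-file-type']                   # 'tmp.wav' or 'tmp.mp4'
# with open('result.' + suffix.split('.')[-1], 'wb') as f:
#     f.write(resp.content)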
# Concat. .mp4
# _list.txt
#
# file out/som_utasitvany_en_txt.mp4
# file out/som_utasitvany_hu_txt.mp4
#
#
# subprocess.run(
# [
# "ffmpeg",
# "-f",
# "concat",
# '-safe',
# '0',
# '-i',
# '_list.txt',
# '-c',
# 'copy',
# f'fusion.mp4', # save to correct location is handled in client
# ])
#
# ffmpeg -f concat -i mylist.txt -c copy output.mp4