File size: 4,873 Bytes
26f400e f57c107 26f400e a686be1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import gradio as gr
import json
import os
import torchaudio
from infer import (
WatermarkSolver,
hamming_distance
)
# Predefined watermark payloads, defined inline rather than loaded from a
# JSON file. Keys are human-readable labels; each value is a 16-character
# string of '0'/'1' bits.
watermarks = {
    "VoiceMark": "1000010101010011",
    "Voice Cloning": "1111111001000010",
    "Speech Security": "1011101100001110",
    "Audio Watermarking": "0110110011100010",
    "Deep Learning": "0000100111111000",
    "Artificial Intelligence": "0010000100011111",
    "Hello World": "0001111101110001",
    "Happy New Year": "1101011011011101",
    "World Peace": "0011110010011110",
    "Good Morning": "0000001011000010",
}
# Create the shared WatermarkSolver instance at import time and load the
# "voicemark.pth" checkpoint from the current directory.
solver = WatermarkSolver()
solver.load_model(
    checkpoint_dir="./",
    checkpoint_name="voicemark.pth",
    strict=True,
)
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        "## VoiceMark: Zero-Shot Voice Cloning-Resistant Watermarking Approach Leveraging Speaker-Specific Latents"
    )

    # Overview figure followed by its centered caption.
    with gr.Column():
        gr.Image(
            value="voicemark_overview.png",
            width=925,
            height=487,
            elem_id="overview_image",
            label="overview",
        )
        gr.HTML("<h3 style='text-align: center;'>The overall architecture of our proposed VoiceMark</h3>")

    # Step 1: Upload audio and select watermark
    gr.Markdown(
        "\n"
        "**Step 1**: Upload an audio file or select one from the provided samples, choose a watermark, and generate the watermarked audio.\n"
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Audio", type="filepath")
            gr.Examples(
                examples=[
                    ["audios/1.wav"],
                    ["audios/2.wav"],
                    ["audios/3.wav"],
                    ["audios/4.wav"],
                    ["audios/5.wav"],
                ],
                inputs=audio_input,
                label="Sample Audios (Click to Use)",
            )
        with gr.Column():
            audio_output = gr.Audio(label="Watermarked Audio", type="filepath")
            watermark_list = gr.Dropdown(
                label="Select Watermark",
                choices=list(watermarks.keys()),
                interactive=True,
            )
            add_watermark_button = gr.Button("Add Watermark to Audio")

    # Step 2: TTS tools demo links
    gr.Markdown(
        "\n"
        "**Step 2**: Download the generated watermarked audio, then use Zero-Shot Voice Cloning tools to generate the cloned audio. Some available tools are:\n"
        "- [CosyVoice2: Scalable Streaming Speech Synthesis with Large Language Models](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B)\n"
        "- [F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)\n"
        "- [MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer](https://huggingface.co/spaces/amphion/maskgct)\n"
    )

    # Step 3: Upload cloned audio to decode watermark
    gr.Markdown(
        "\n"
        "**Step 3**: Upload the cloned audio and decode your watermark.\n"
    )
    with gr.Row():
        decode_audio_input = gr.Audio(label="Upload Cloned Audio", type="filepath")
        with gr.Column():
            decoded_watermark_output = gr.Textbox(label="Decoded Watermark")
            decode_button = gr.Button("Decode Watermark")

    def process_audio(audio_path, watermark_text):
        """Embed the selected watermark into the given audio file.

        Args:
            audio_path: Value of the "Upload Audio" component (configured
                with ``type="filepath"``); falsy when nothing was provided.
            watermark_text: Key into ``watermarks`` chosen in the dropdown.

        Returns:
            The result of ``solver.infer_for_ui``, which is routed to the
            "Watermarked Audio" component (presumably a file path, given
            that component's ``type="filepath"`` -- confirm in ``infer``).

        Raises:
            gr.Error: If no audio was provided, if ``watermark_text`` is not
                a known watermark, or if the solver raises ``ValueError``.
        """
        # The output of this handler is an Audio component, so a returned
        # message string would be treated as audio data instead of being
        # shown to the user. Problems are reported through gr.Error instead.
        if not audio_path:
            raise gr.Error("No audio selected. Please upload or select a sample.")
        watermark_bits = watermarks.get(watermark_text)
        if watermark_bits is None:
            raise gr.Error("No watermark selected. Please choose a watermark.")
        try:
            return solver.infer_for_ui(audio_path, watermark_bits)
        except ValueError as e:
            raise gr.Error(str(e)) from e

    add_watermark_button.click(
        process_audio,
        inputs=[audio_input, watermark_list],
        outputs=audio_output,
    )

    def decode_watermark(audio_path):
        """Decode a watermark from audio and map it to a known label.

        Args:
            audio_path: Value of the "Upload Cloned Audio" component
                (``type="filepath"``); falsy when nothing was provided.

        Returns:
            The label from ``watermarks`` whose bit string is closest to the
            decoded ID when that distance is below 10; otherwise an
            explanatory message. A ``ValueError`` raised while decoding is
            returned as its message text, since the output is a Textbox.
        """
        if not audio_path:
            return "No audio selected. Please upload the cloned audio."
        try:
            detect_prob, decoded_id = solver.decode_for_ui(audio_path)
            # A detection probability this low is treated as "no watermark".
            if detect_prob < 1e-2:
                return "No matching watermark found"
            # Nearest known watermark by Hamming distance; min() keeps the
            # first candidate on ties, matching dict insertion order.
            closest_match, min_distance = min(
                (
                    (text, hamming_distance(decoded_id, id_bin, base=16))
                    for text, id_bin in watermarks.items()
                ),
                key=lambda candidate: candidate[1],
                default=(None, float("inf")),
            )
            if min_distance < 10:
                return closest_match
            return "No matching watermark found"
        except ValueError as e:
            return str(e)

    decode_button.click(
        decode_watermark,
        inputs=decode_audio_input,
        outputs=decoded_watermark_output,
    )
# Launch the Gradio app: start serving the `demo` Blocks interface built above.
demo.launch()
|