File size: 4,873 Bytes
26f400e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f57c107
26f400e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a686be1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import gradio as gr
import json
import os
import torchaudio
from infer import (
    WatermarkSolver,
    hamming_distance
)

# Built-in label -> code table (kept inline instead of being read from JSON).
# Each code is a 16-character string of "0"/"1"; the pair order below is the
# insertion order of the resulting dict.
_WATERMARK_PAIRS = (
    ("VoiceMark", "1000010101010011"),
    ("Voice Cloning", "1111111001000010"),
    ("Speech Security", "1011101100001110"),
    ("Audio Watermarking", "0110110011100010"),
    ("Deep Learning", "0000100111111000"),
    ("Artificial Intelligence", "0010000100011111"),
    ("Hello World", "0001111101110001"),
    ("Happy New Year", "1101011011011101"),
    ("World Peace", "0011110010011110"),
    ("Good Morning", "0000001011000010"),
)
watermarks = dict(_WATERMARK_PAIRS)

# Create the single WatermarkSolver used by every UI callback and load the
# checkpoint named "voicemark.pth" from the current working directory.
# NOTE(review): strict=True presumably demands an exact match between the
# checkpoint and the model parameters — confirm in infer.WatermarkSolver.
solver = WatermarkSolver()
solver.load_model(
    checkpoint_dir="./",
    checkpoint_name="voicemark.pth",
    strict=True,
)

# Gradio interface: overview header followed by the three-step workflow.
# Components are created in display order; callbacks are attached further down.
with gr.Blocks() as demo:
    gr.Markdown("## VoiceMark: Zero-Shot Voice Cloning-Resistant Watermarking Approach Leveraging Speaker-Specific Latents")

    # Architecture figure with its centered caption underneath.
    with gr.Column():
        gr.Image(
            label="overview",
            elem_id="overview_image",
            value="voicemark_overview.png",
            height=487,
            width=925,
        )
        gr.HTML("<h3 style='text-align: center;'>The overall architecture of our proposed VoiceMark</h3>")

    # Step 1: choose an input clip and a watermark, then embed it.
    gr.Markdown(
        """
        **Step 1**: Upload an audio file or select one from the provided samples, choose a watermark, and generate the watermarked audio.
        """
    )

    with gr.Row():
        # Left column: the input clip plus the bundled, clickable samples.
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            gr.Examples(
                label="Sample Audios (Click to Use)",
                inputs=audio_input,
                examples=[[f"audios/{sample_no}.wav"] for sample_no in range(1, 6)],
            )

        # Right column: result player, watermark picker and trigger button.
        with gr.Column():
            audio_output = gr.Audio(type="filepath", label="Watermarked Audio")
            watermark_list = gr.Dropdown(
                choices=list(watermarks),
                label="Select Watermark",
                interactive=True,
            )
            add_watermark_button = gr.Button("Add Watermark to Audio")

    # Step 2: links to external zero-shot voice-cloning demos.
    gr.Markdown(
        """
        **Step 2**: Download the generated watermarked audio, then use Zero-Shot Voice Cloning tools to generate the cloned audio. Some available tools are:
        - [CosyVoice2: Scalable Streaming Speech Synthesis with Large Language Models](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B)
        - [F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
        - [MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer](https://huggingface.co/spaces/amphion/maskgct)
        """
    )

    # Step 3: upload the cloned clip and read the watermark back.
    gr.Markdown(
        """
        **Step 3**: Upload the cloned audio and decode your watermark.
        """
    )

    with gr.Row():
        decode_audio_input = gr.Audio(type="filepath", label="Upload Cloned Audio")
        with gr.Column():
            decoded_watermark_output = gr.Textbox(label="Decoded Watermark")
            decode_button = gr.Button("Decode Watermark")

    def process_audio(audio_path, watermark_text):
        """Embed the selected watermark into the selected audio clip.

        Args:
            audio_path: Filepath of the uploaded or sample audio; falsy when
                nothing has been selected.
            watermark_text: Key of ``watermarks`` chosen in the dropdown;
                ``None`` when nothing has been selected.

        Returns:
            The value produced by ``solver.infer_for_ui``, handed unchanged to
            the ``audio_output`` component.

        Raises:
            gr.Error: If no audio is selected, if the watermark is missing or
                unknown, or if the solver raises ``ValueError``.
        """
        # The only output of this callback is an Audio component, so a message
        # string must not be returned as its value; gr.Error reports the
        # problem to the user instead.
        if not audio_path:
            raise gr.Error("No audio selected. Please upload or select a sample.")
        # An untouched dropdown would otherwise fail with an uncaught KeyError.
        if watermark_text not in watermarks:
            raise gr.Error("No watermark selected. Please choose a watermark.")
        try:
            return solver.infer_for_ui(audio_path, watermarks[watermark_text])
        except ValueError as e:
            raise gr.Error(str(e)) from e

    # "Add Watermark to Audio": feed the chosen clip and watermark label to
    # process_audio and show its result in the watermarked-audio player.
    add_watermark_button.click(
        fn=process_audio,
        inputs=[audio_input, watermark_list],
        outputs=audio_output,
    )

    def decode_watermark(audio_path):
        """Decode the watermark in an audio file and name the closest preset.

        Args:
            audio_path: Filepath of the audio to analyse; falsy when nothing
                has been uploaded.

        Returns:
            The key of the ``watermarks`` entry with the smallest Hamming
            distance to the decoded ID, when that distance is below 10.
            Otherwise a message string: no audio selected, no matching
            watermark, or the text of a ``ValueError`` raised while decoding.
        """
        # Guard the "nothing uploaded" case so a falsy path never reaches the
        # solver, where it could fail with something other than ValueError.
        if not audio_path:
            return "No audio selected. Please upload an audio file to decode."
        try:
            detect_prob, decoded_id = solver.decode_for_ui(audio_path)
            # A detection probability under 1% is reported as "no watermark".
            if detect_prob < 1e-2:
                return "No matching watermark found"
            # min() returns the first entry among ties, matching a strict "<"
            # scan; the default covers an empty table.
            closest_match, min_distance = min(
                (
                    (text, hamming_distance(decoded_id, id_bin, base=16))
                    for text, id_bin in watermarks.items()
                ),
                key=lambda pair: pair[1],
                default=(None, float("inf")),
            )
            if min_distance < 10:
                return closest_match
            return "No matching watermark found"
        except ValueError as e:
            return str(e)

    # "Decode Watermark": run decode_watermark on the uploaded cloned clip and
    # put the returned text into the result textbox.
    decode_button.click(
        fn=decode_watermark,
        inputs=decode_audio_input,
        outputs=decoded_watermark_output,
    )

# Launch the Gradio app with default settings. This runs unconditionally when
# the module is executed or imported (there is no __main__ guard).
demo.launch()