iyosha committed
Commit 73c9c96 · verified · 1 Parent(s): 6c0f20f

Upload 12 files

app.py ADDED
@@ -0,0 +1,78 @@
+ import torch
+ import gradio as gr
+ from pathlib import Path
+ from whistress import WhiStressInferenceClient
+
+ CURRENT_DIR = Path(__file__).parent
+ # Load the model
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = WhiStressInferenceClient(device=device)
+
+
+ def get_whistress_predictions(audio):
+     """
+     Get the transcription and emphasis scores for the given audio input.
+     Args:
+         audio (Tuple[int, numpy.ndarray]): The audio input as a (sampling_rate, waveform) tuple, as provided by Gradio.
+     Returns:
+         List[Tuple[str, int]]: A list of tuples containing words and their emphasis scores.
+     """
+     audio = {
+         "sampling_rate": audio[0],
+         "array": audio[1],
+     }
+     return model.predict(audio=audio, transcription=None, return_pairs=True)
+
+
+ # App UI
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column(scale=2):
+             gr.Markdown(
+                 """
+                 # WhiStress: Enriching Transcriptions with Sentence Stress Detection
+                 WhiStress detects which words are emphasized (stressed) in your speech.
+
+                 Check out our paper: 📚 [WhiStress](https://arxiv.org/)
+
+                 ## Architecture
+                 The model is built on [Whisper](https://arxiv.org/abs/2212.04356),
+                 using [`whisper-small.en`](https://huggingface.co/openai/whisper-small.en)
+                 as the backbone.
+                 WhiStress adds a decoder-based classifier that predicts the stress label of each transcription token.
+
+                 ## Training Data
+                 The model was trained on [TinyStress-15K](https://huggingface.co/datasets/loud-whisper-project/tinyStories-audio-emphasized),
+                 which is derived from the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset.
+
+                 ## Inference Demo
+                 Upload an audio file or record your own voice to transcribe the speech and highlight the emphasized words.
+
+                 For best performance, please speak clearly.
+                 """
+             )
+         with gr.Column(scale=1):
+             # Display the model architecture diagram
+             gr.Image(
+                 f"{CURRENT_DIR}/assets/whistress_model.svg",
+                 label="Architecture",
+             )
+
+     gr.Interface(
+         get_whistress_predictions,
+         gr.Audio(
+             sources=["microphone", "upload"],
+             label="Upload speech or record your own",
+             type="numpy",
+         ),
+         gr.HighlightedText(),
+         allow_flagging="never",
+     )
+
+
+ def launch():
+     demo.launch()
+
+
+ if __name__ == "__main__":
+     launch()
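For reference, a minimal sketch of exercising `get_whistress_predictions` outside the Gradio UI, assuming the definitions in app.py above are in scope. The synthetic sine-wave input and the printed output are illustrative placeholders, not part of the repository:

import numpy as np

# gr.Audio(type="numpy") hands the callback a (sampling_rate, waveform) tuple;
# here we fabricate one second of a 220 Hz tone as placeholder audio.
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
waveform = 0.1 * np.sin(2 * np.pi * 220 * t)

pairs = get_whistress_predictions((sr, waveform))
print(pairs)  # e.g. [("I", 0), ("didn't", 1), ("say", 0), ...], i.e. word/stress pairs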
assets/whistress_model.svg ADDED
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ torch==2.1.0
+ torchaudio==2.1.0
+ torchlibrosa==0.1.0
+ librosa==0.10.2.post1
+ transformers==4.44.0
+ numpy==1.26.4
+ gradio==5.31.0
whistress/__init__.py ADDED
@@ -0,0 +1 @@
+ from .inference_client import WhiStressInferenceClient
whistress/inference_client/__init__.py ADDED
@@ -0,0 +1 @@
+ from .whistress_client import WhiStressInferenceClient
whistress/inference_client/utils.py ADDED
@@ -0,0 +1,163 @@
+ import torch
+ from transformers import WhisperConfig
+ import librosa
+ import numpy as np
+ import pathlib
+ from torch.nn import functional as F
+ from ..model import WhiStress
+
+
+ PATH_TO_WEIGHTS = pathlib.Path(__file__).parent.parent / "weights"
+
+
+ def get_loaded_model(device="cuda"):
+     whisper_model_name = "openai/whisper-small.en"
+     whisper_config = WhisperConfig()
+     whistress_model = WhiStress(
+         whisper_config, layer_for_head=9, whisper_backbone_name=whisper_model_name
+     ).to(device)
+     whistress_model.processor.tokenizer.model_input_names = [
+         "input_ids",
+         "attention_mask",
+         "labels_head",
+     ]
+     whistress_model.load_model(PATH_TO_WEIGHTS)
+     whistress_model.to(device)
+     whistress_model.eval()
+     return whistress_model
+
+
+ def get_word_emphasis_pairs(
+     transcription_preds, emphasis_preds, processor, filter_special_tokens=True
+ ):
+     emphasis_preds_list = emphasis_preds.tolist()
+     transcription_preds_words = [
+         processor.tokenizer.decode([i], skip_special_tokens=False)
+         for i in transcription_preds
+     ]
+     if filter_special_tokens:
+         special_tokens_indices = [
+             i
+             for i, x in enumerate(transcription_preds)
+             if x in processor.tokenizer.all_special_ids
+         ]
+         emphasis_preds_list = [
+             x
+             for i, x in enumerate(emphasis_preds_list)
+             if i not in special_tokens_indices
+         ]
+         transcription_preds_words = [
+             x
+             for i, x in enumerate(transcription_preds_words)
+             if i not in special_tokens_indices
+         ]
+     return list(zip(transcription_preds_words, emphasis_preds_list))
+
+
+ def inference_from_audio(audio: np.ndarray, model: WhiStress, device: str):
+     input_features = model.processor.feature_extractor(
+         audio, sampling_rate=16000, return_tensors="pt"
+     )["input_features"]
+     out_model = model.generate_dual(input_features=input_features.to(device))
+     emphasis_probs = F.softmax(out_model.logits, dim=-1)
+     emphasis_preds = torch.argmax(emphasis_probs, dim=-1)
+     # shift the emphasis predictions one position to the right to align them with the decoder input tokens
+     emphasis_preds_right_shifted = torch.cat(
+         (emphasis_preds[:, -1:], emphasis_preds[:, :-1]), dim=1
+     )
+     word_emphasis_pairs = get_word_emphasis_pairs(
+         out_model.preds[0],
+         emphasis_preds_right_shifted[0],
+         model.processor,
+         filter_special_tokens=True,
+     )
+     return word_emphasis_pairs
+
+
+ def prepare_audio(audio, target_sr=16000):
+     # resample to 16kHz
+     sr = audio["sampling_rate"]
+     y = audio["array"]
+     y = np.array(y, dtype=float)
+     y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
+     # Normalize the audio (scale to [-1, 1])
+     y_resampled /= max(abs(y_resampled))
+     return y_resampled
+
+
+ def merge_stressed_tokens(tokens_with_stress):
+     """
+     tokens_with_stress is a list of tuples: (token_string, stress_value),
+     e.g.:
+     [(" I", 0), (" didn", 1), ("'t", 0), (" say", 0), (" he", 0), (" stole", 0),
+      (" the", 0), (" money", 0), (".", 0)]
+     Returns a list of merged tuples, combining subwords into full words.
+     """
+     merged = []
+
+     current_word = ""
+     current_stress = 0  # 0 means not stressed, 1 means stressed
+
+     for token, stress in tokens_with_stress:
+         # A token that starts with a space begins a new word
+         # (as does the very first token, when current_word is still empty).
+         if token.startswith(" ") or current_word == "":
+             # If we already have something in current_word, push it into merged
+             # before starting a new one
+             if current_word:
+                 merged.append((current_word, current_stress))
+
+             # Start a new word
+             current_word = token
+             current_stress = stress
+         else:
+             # Otherwise, it's a subword that should be appended to the previous word
+             current_word += token
+             # If any sub-token is stressed, the whole merged word is stressed
+             current_stress = max(current_stress, stress)
+
+     # Don't forget to append the final word
+     if current_word:
+         merged.append((current_word, current_stress))
+
+     return merged
+
+
+ def inference_from_audio_and_transcription(
+     audio: np.ndarray, transcription, model: WhiStress, device: str
+ ):
+     input_features = model.processor.feature_extractor(
+         audio, sampling_rate=16000, return_tensors="pt"
+     )["input_features"]
+     # convert the transcription to input_ids
+     input_ids = model.processor.tokenizer(
+         transcription,
+         return_tensors="pt",
+         padding="max_length",
+         truncation=True,
+         max_length=30,
+     )["input_ids"]
+     out_model = model(
+         input_features=input_features.to(device),
+         decoder_input_ids=input_ids.to(device),
+     )
+     emphasis_probs = F.softmax(out_model.logits, dim=-1)
+     emphasis_preds = torch.argmax(emphasis_probs, dim=-1)
+     emphasis_preds_right_shifted = torch.cat(
+         (emphasis_preds[:, -1:], emphasis_preds[:, :-1]), dim=1
+     )
+     word_emphasis_pairs = get_word_emphasis_pairs(
+         input_ids[0],
+         emphasis_preds_right_shifted[0],
+         model.processor,
+         filter_special_tokens=True,
+     )
+     return word_emphasis_pairs
+
+
+ def scored_transcription(audio, model, strip_words=True, transcription: str = None, device="cuda"):
+     audio_arr = prepare_audio(audio)
+     token_stress_pairs = None
+     if transcription:  # use the ground-truth transcription if one is provided
+         token_stress_pairs = inference_from_audio_and_transcription(audio_arr, transcription, model, device)
+     else:
+         token_stress_pairs = inference_from_audio(audio_arr, model, device)
+         # token_stress_pairs = inference_from_audio(audio_arr, model)
+     word_level_stress = merge_stressed_tokens(token_stress_pairs)
+     if strip_words:
+         word_level_stress = [(word.strip(), stress) for word, stress in word_level_stress]
+     return word_level_stress
@@ -0,0 +1,26 @@
 
+ import numpy as np
+ from .utils import get_loaded_model, scored_transcription
+ from typing import Union, Dict
+
+
+ class WhiStressInferenceClient:
+     def __init__(self, device="cuda"):
+         self.device = device
+         self.whistress = get_loaded_model(self.device)
+
+     def predict(
+         self, audio: Dict[str, Union[np.ndarray, int]], transcription=None, return_pairs=True
+     ):
+         word_emphasis_pairs = scored_transcription(
+             audio=audio,
+             model=self.whistress,
+             device=self.device,
+             strip_words=True,
+             transcription=transcription,
+         )
+         if return_pairs:
+             return word_emphasis_pairs
+         # returns the transcription string and the list of emphasized words
+         return " ".join([x[0] for x in word_emphasis_pairs]), [
+             x[0] for x in word_emphasis_pairs if x[1] == 1
+         ]
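A minimal usage sketch of the client defined above. The audio file path and the librosa-based loading are illustrative assumptions; any 1-D float waveform plus its sampling rate works:

import librosa
from whistress import WhiStressInferenceClient

client = WhiStressInferenceClient(device="cpu")  # or "cuda"
waveform, sr = librosa.load("example.wav", sr=None)  # placeholder path

# Word/stress pairs, e.g. [("I", 0), ("didn't", 1), ("say", 0), ...]
pairs = client.predict(audio={"sampling_rate": sr, "array": waveform})

# Alternatively, get the plain transcription plus the list of emphasized words
text, emphasized = client.predict(
    audio={"sampling_rate": sr, "array": waveform}, return_pairs=False
)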
whistress/model/__init__.py ADDED
@@ -0,0 +1 @@
+ from .model import WhiStress
whistress/model/model.py ADDED
@@ -0,0 +1,318 @@
+ from transformers import (
+     WhisperForConditionalGeneration,
+     WhisperProcessor,
+     PreTrainedModel,
+     WhisperConfig,
+ )
+ from transformers.models.whisper.modeling_whisper import WhisperDecoderLayer
+ from transformers.modeling_outputs import BaseModelOutput
+ import torch.nn.functional as F
+ import torch.nn as nn
+ import torch
+ import os
+ from dataclasses import dataclass
+ from typing import Optional
+ import json
+
+
+ @dataclass
+ class CustomModelOutput(BaseModelOutput):
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     head_preds: torch.FloatTensor = None
+     labels_head: Optional[torch.FloatTensor] = None
+     whisper_logits: torch.FloatTensor = None
+     preds: Optional[torch.Tensor] = None
+
+
+ # Define a new head (e.g., a classification layer)
+ class LinearHead(nn.Module):
+     def __init__(self, input_dim, output_dim):
+         super(LinearHead, self).__init__()
+         self.linear = nn.Linear(input_dim, output_dim)
+
+     def forward(self, x):
+         return self.linear(x)
+
+
+ class FCNN(nn.Module):
+     def __init__(self, input_dim, output_dim):
+         super(FCNN, self).__init__()
+         hidden_dim = 2 * input_dim
+         self.fc1 = nn.Linear(input_dim, hidden_dim)
+         self.fc2 = nn.Linear(hidden_dim, output_dim)
+
+     def forward(self, x):
+         x = F.relu(self.fc1(x))
+         x = self.fc2(x)
+         return x
+
+
+ class WhiStress(PreTrainedModel):
+
+     config_class = WhisperConfig
+     model_input_names = ["input_features", "labels_head", "whisper_labels"]
+
+     def __init__(
+         self,
+         config: WhisperConfig,
+         layer_for_head: Optional[int] = None,
+         whisper_backbone_name="openai/whisper-small.en",
+     ):
+         super().__init__(config)
+         self.whisper_backbone_name = whisper_backbone_name
+         self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
+             self.whisper_backbone_name,
+         ).eval()
+         self.processor = WhisperProcessor.from_pretrained(self.whisper_backbone_name)
+
+         input_dim = self.whisper_model.config.d_model  # model's hidden size
+         output_dim = 2  # number of classes for the new head
+
+         config = self.whisper_model.config
+         # add an additional decoder block using the existing Whisper config
+         self.additional_decoder_block = WhisperDecoderLayer(config)
+         self.classifier = FCNN(input_dim, output_dim)
+         # add class weights for the CE loss
+         neg_weight = 1.0
+         pos_weight = 0.7 / 0.3
+         class_weights = torch.tensor([neg_weight, pos_weight])
+         self.loss_fct = nn.CrossEntropyLoss(ignore_index=-100, weight=class_weights)
+         self.layer_for_head = -1 if layer_for_head is None else layer_for_head
+
+     def to(self, device: str = ("cuda" if torch.cuda.is_available() else "cpu")):
+         self.whisper_model.to(device)
+         self.additional_decoder_block.to(device)
+         self.classifier.to(device)
+         super().to(device)
+         return self
+
+     def load_model(self, save_dir=None):
+         # load only the classifier and the extra decoder layer (saved locally)
+         if save_dir is not None:
+             print("loading model from:", save_dir)
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             self.classifier.load_state_dict(
+                 torch.load(
+                     os.path.join(save_dir, "classifier.pt"),
+                     weights_only=False,
+                     map_location=torch.device(device),
+                 )
+             )
+             self.additional_decoder_block.load_state_dict(
+                 torch.load(
+                     os.path.join(save_dir, "additional_decoder_block.pt"),
+                     weights_only=False,
+                     map_location=torch.device(device),
+                 )
+             )
+             # read metadata.json; its format is {"layer_for_head": 9}
+             with open(os.path.join(save_dir, "metadata.json"), "r") as f:
+                 metadata = json.load(f)
+             self.layer_for_head = metadata["layer_for_head"]
+         return
+
+     def train(self, mode: Optional[bool] = True):
+         # freeze Whisper and train only the additional decoder block and the classifier
+         self.whisper_model.eval()
+         for param in self.whisper_model.parameters():
+             param.requires_grad = False
+         for param in self.additional_decoder_block.parameters():
+             param.requires_grad = True
+         for param in self.classifier.parameters():
+             param.requires_grad = True
+         self.additional_decoder_block.train()
+         self.classifier.train()
+
+     def eval(self):
+         self.whisper_model.eval()
+         self.additional_decoder_block.eval()
+         self.classifier.eval()
+
+     def forward(
+         self,
+         input_features,
+         attention_mask=None,
+         decoder_input_ids=None,
+         labels_head=None,
+         whisper_labels=None,
+     ):
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.whisper_model.eval()
+
+         # pass the inputs through the backbone
+         backbone_outputs = self.whisper_model(
+             input_features=input_features,
+             attention_mask=attention_mask,
+             decoder_input_ids=decoder_input_ids,
+             output_hidden_states=True,
+             labels=whisper_labels,
+         )
+
+         # Extract the decoder hidden states of the layer used for the head
+         decoder_last_layer_hidden_states = backbone_outputs.decoder_hidden_states[
+             self.layer_for_head
+         ].to(device)
+
+         # Extract the hidden states of the encoder layer that best captures the prosodic features
+         layer_for_head_hidden_states = backbone_outputs.encoder_hidden_states[
+             self.layer_for_head
+         ].to(device)
+
+         # Pass the decoder hidden states through the new head (decoder block + classifier)
+         additional_decoder_block_outputs = self.additional_decoder_block(
+             hidden_states=decoder_last_layer_hidden_states,
+             encoder_hidden_states=layer_for_head_hidden_states,
+         )
+         head_logits = self.classifier(additional_decoder_block_outputs[0].to(device))
+
+         # calculate softmax
+         head_probs = F.softmax(head_logits, dim=-1)
+         preds = head_probs.argmax(dim=-1).to(device)
+         if labels_head is not None:
+             preds = torch.where(
+                 torch.isin(
+                     labels_head, torch.tensor([-100]).to(device)  # 50257, 50362,
+                 ),
+                 torch.tensor(-100),
+                 preds,
+             )
+         # Calculate the custom loss if labels are provided
+         loss = None
+         if labels_head is not None:
+             # CrossEntropyLoss for the custom head
+             loss = self.loss_fct(
+                 head_logits.reshape(-1, head_logits.size(-1)), labels_head.reshape(-1)
+             )
+         return CustomModelOutput(
+             logits=head_logits,
+             labels_head=labels_head,
+             whisper_logits=backbone_outputs.logits,
+             loss=loss,
+             preds=preds,
+         )
+
+     def generate(
+         self,
+         input_features,
+         max_length=128,
+         labels_head=None,
+         whisper_labels=None,
+         **generate_kwargs,
+     ):
+         """
+         Generate both the Whisper output and custom head output sequences in alignment.
+         """
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         # Generate the Whisper output sequence
+         whisper_outputs = self.whisper_model.generate(
+             input_features=input_features,
+             max_length=max_length,
+             labels=whisper_labels,
+             do_sample=False,
+             **generate_kwargs,
+         )
+
+         # pass the inputs through the backbone again to obtain the hidden states
+         backbone_outputs = self.whisper_model(
+             input_features=input_features,
+             decoder_input_ids=whisper_outputs,
+             output_hidden_states=True,
+         )
+
+         # Extract the decoder hidden states of the layer used for the head
+         decoder_last_layer_hidden_states = backbone_outputs.decoder_hidden_states[
+             self.layer_for_head
+         ].to(device)
+
+         # Extract the encoder hidden states of the layer used for the head
+         layer_for_head_hidden_states = backbone_outputs.encoder_hidden_states[
+             self.layer_for_head
+         ].to(device)
+
+         # Pass the decoder hidden states through the new head (decoder block + classifier)
+         additional_decoder_block_outputs = self.additional_decoder_block(
+             hidden_states=decoder_last_layer_hidden_states,
+             encoder_hidden_states=layer_for_head_hidden_states,
+         )
+         head_logits = self.classifier(additional_decoder_block_outputs[0].to(device))
+         # calculate softmax
+         head_probs = F.softmax(head_logits, dim=-1)
+         preds = head_probs.argmax(dim=-1).to(device)
+         preds = torch.where(
+             torch.isin(
+                 whisper_outputs, torch.tensor([50256]).to(device)  # 50257, 50362,
+             ),
+             torch.tensor(-100),
+             preds,
+         )
+         # preds_shifted = torch.cat((preds[:, 1:], preds[:, :1]), dim=1)
+         return preds
+
+     def generate_dual(
+         self,
+         input_features,
+         attention_mask=None,
+         max_length=200,
+         labels_head=None,
+         whisper_labels=None,
+         **generate_kwargs,
+     ):
+         """
+         Generate both the Whisper output and custom head output sequences in alignment.
+         """
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         # Generate the Whisper output sequence
+         whisper_outputs = self.whisper_model.generate(
+             input_features=input_features,
+             attention_mask=attention_mask,
+             max_length=max_length,
+             labels=whisper_labels,
+             return_dict_in_generate=True,
+             **generate_kwargs,
+         )
+
+         # pass the inputs through the backbone again to obtain the hidden states
+         backbone_outputs = self.whisper_model(
+             input_features=input_features,
+             attention_mask=attention_mask,
+             decoder_input_ids=whisper_outputs.sequences,
+             output_hidden_states=True,
+         )
+
+         # Extract the decoder hidden states of the layer used for the head
+         decoder_last_layer_hidden_states = backbone_outputs.decoder_hidden_states[
+             self.layer_for_head
+         ].to(device)
+
+         # Extract the encoder hidden states of the layer used for the head
+         layer_for_head_hidden_states = backbone_outputs.encoder_hidden_states[
+             self.layer_for_head
+         ].to(device)
+
+         # Pass the decoder hidden states through the new head (decoder block + classifier)
+         additional_decoder_block_outputs = self.additional_decoder_block(
+             hidden_states=decoder_last_layer_hidden_states,
+             encoder_hidden_states=layer_for_head_hidden_states,
+         )
+         head_logits = self.classifier(additional_decoder_block_outputs[0].to(device))
+         head_probs = F.softmax(head_logits, dim=-1)
+         preds = head_probs.argmax(dim=-1).to(device)
+         preds = torch.where(
+             torch.isin(
+                 whisper_outputs.sequences, torch.tensor([50256]).to(device)  # 50257, 50362,
+             ),
+             torch.tensor(-100),
+             preds,
+         )
+         return CustomModelOutput(
+             logits=head_logits,
+             head_preds=preds,
+             whisper_logits=whisper_outputs.logits,
+             preds=whisper_outputs.sequences,
+         )
+
+     def __str__(self):
+         return "WhiStress"
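For orientation, a sketch of how this class is wired up elsewhere in the repository (mirroring `get_loaded_model` in `whistress/inference_client/utils.py`); the relative weights path assumes the repository root as the working directory:

from transformers import WhisperConfig
from whistress.model import WhiStress

model = WhiStress(
    WhisperConfig(), layer_for_head=9, whisper_backbone_name="openai/whisper-small.en"
).to("cpu")
# Loads classifier.pt, additional_decoder_block.pt and metadata.json from the weights directory
model.load_model("whistress/weights")
model.eval()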
whistress/weights/additional_decoder_block.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7d440821c831364c5046e859843926120550a38143f89e1bace82a2ed03cc77
+ size 37809834
whistress/weights/classifier.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:599257b647cbca9fc21aac4ede87651cd43d03c3338e705bd59d919ee19ebc6f
+ size 4739176
whistress/weights/metadata.json ADDED
@@ -0,0 +1,3 @@
+ {
+     "layer_for_head": 9
+ }