In [9]:
# !pip install soundfile
# !pip install librosa

In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

In [2]:
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None

# load dummy dataset and read audio files
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]

In [3]:
sample

{'path': '/home/ravi.naik/.cache/huggingface/datasets/downloads/extracted/431c2c946d216530b2666a0e7ffa5ac3f5b3da89dd28858a9de6c78fae7caa4a/dev_clean/1272/128104/1272-128104-0000.flac',
 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 'sampling_rate': 16000}

In [4]:
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features 

In [5]:
# generate token ids
predicted_ids = model.generate(input_features)
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)

transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

In [6]:
transcription

[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']

In [3]:
import torch
from torch import nn

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

In [5]:
model_name = "microsoft/phi-2"
phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
phi2_tokenizer.pad_token = phi2_tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
tokens = phi2_tokenizer(*transcription, return_tensors="pt", return_attention_mask=False)

NameError: name 'transcription' is not defined

In [22]:
tokens

{'input_ids': tensor([[ 1770,    13,  2264,   346,   353,   318,   262, 46329,   286,   262,
          3504,  6097,   290,   356,   389,  9675,   284,  7062,   465, 21443,
            13]])}

In [12]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="cuda:0"
)
model.config.use_cache = False

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
phi2_tokenizer.batch_decode(model.generate(**tokens))

[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.\n']

In [7]:
class AudioLanguageConnector:
    def __init__(self):
        model_name = "microsoft/phi-2"
        self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token

    def __call__(self, text):
        text = f"<audio_start> {text} <audio_end>"
        tokens = self.phi2_tokenizer(text, return_tensors="pt", return_attention_mask=False)
        return tokens
        

class WhisperWithProjection:
    def __init__(self):
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
        self.model.config.forced_decoder_ids = None
        self.audio_language_connector = AudioLanguageConnector()
        
    def forward(self, audio):
        input_features = self.processor(audio["array"],
                                   sampling_rate=audio["sampling_rate"],
                                   return_tensors="pt").input_features
        # generate token ids
        predicted_ids = self.model.generate(input_features)
        # decode token ids to text        
        transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)

        audio_embeddings = self.audio_language_connector(transcription)
        return audio_embeddings

In [8]:
class TextModality:
    def __init__(self):
        model_name = "microsoft/phi-2"
        self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token

    def __call__(self, text):
        tokens = self.phi2_tokenizer(text, return_tensors="pt", return_attention_mask=False)
        return tokens

In [15]:
class MultiModalPhi2:
    def __init__(self):
        self.text_modality = TextModality()
        self.whisper_w_proj = WhisperWithProjection()
        self.llm = self.load_llm()

    def load_llm(self):
        bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16)
    
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            trust_remote_code=True,
            device_map="cuda:0"
        )
        model.config.use_cache = False
        return model

    def generate(self, audio, text):
        text_embeddings = self.text_modality(text)
        audio_embeddings = self.whisper_w_proj.forward(audio)
        inputs = torch.concat([text_embeddings["input_ids"], audio_embeddings["input_ids"]], dim=1)
        
        # outputs = self.llm.generate(inputs, max_length=200)
        outputs = self.llm(inputs)
        return outputs
        
        # text = self.text_modality.phi2_tokenizer.batch_decode(outputs)[0]
        # print(text)

In [16]:
multi_modal_phi = MultiModalPhi2()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
audio = sample
text = "explain about the audio"
multi_modal_phi.generate(audio, text)

CausalLMOutputWithPast(loss={'logits': tensor([[[ 6.9531,  9.9375,  7.0234,  ...,  2.0020,  2.0020,  2.0000],
         [ 8.9062, 12.1172,  7.5977,  ..., -1.2012, -1.2012, -1.2012],
         [ 7.0273,  5.3477,  3.6328,  ..., -4.2070, -4.2070, -4.2070],
         ...,
         [ 7.0234,  7.4414,  9.1016,  ...,  1.0117,  1.0127,  1.0117],
         [ 9.4531, 10.0391,  9.7578,  ...,  0.0776,  0.0775,  0.0764],
         [ 8.0703,  6.6445,  5.5156,  ..., -1.9268, -1.9268, -1.9277]]],
       grad_fn=<ToCopyBackward0>)}, logits=tensor([[[ 6.9531,  9.9375,  7.0234,  ...,  2.0020,  2.0020,  2.0000],
         [ 8.9062, 12.1172,  7.5977,  ..., -1.2012, -1.2012, -1.2012],
         [ 7.0273,  5.3477,  3.6328,  ..., -4.2070, -4.2070, -4.2070],
         ...,
         [ 7.0234,  7.4414,  9.1016,  ...,  1.0117,  1.0127,  1.0117],
         [ 9.4531, 10.0391,  9.7578,  ...,  0.0776,  0.0775,  0.0764],
         [ 8.0703,  6.6445,  5.5156,  ..., -1.9268, -1.9268, -1.9277]]],
       grad_fn=<ToCopyBackward0>),