Malaysian Whisper
Collection
Finetuning OpenAI Whisper on Malaysian context audio.
•
8 items
•
Updated
•
7
Finetune Whisper Large V3 Turbo on Malaysian context.
<|transcribeprecise|>
token — a new task! We trained in 2 phases.
For example, we use:
from transformers import (
WhisperFeatureExtractor,
WhisperForConditionalGeneration,
WhisperProcessor,
WhisperTokenizerFast
)
from datasets import Audio
import torch
def get_audio(f):
    """Decode audio file *f* and return its waveform as a numpy array.

    Round-trips through the module-level ``datasets.Audio`` codec so any
    supported container format is decoded and resampled to the configured
    sampling rate.
    """
    decoded = audio.decode_example(audio.encode_example(f))
    return decoded['array']
# Checkpoint used for every component below.
checkpoint = 'openai/whisper-large-v3'

# Whisper operates on 16 kHz audio.
sr = 16000
audio = Audio(sampling_rate=sr)

feature_extractor = WhisperFeatureExtractor.from_pretrained(checkpoint)
processor = WhisperProcessor.from_pretrained(checkpoint)
tokenizer = WhisperTokenizerFast.from_pretrained(checkpoint)

# Load weights in bfloat16 and move the model onto the GPU.
model = WhisperForConditionalGeneration.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
).cuda()

# Decode both sample clips, keeping only the first 30 seconds of each
# (Whisper's maximum input window).
assembly = get_audio('assembly.mp3')
assembly = assembly[: sr * 30]
toodia = get_audio('toodia.mp3')
toodia = toodia[: sr * 30]
with torch.no_grad():
    # Extract log-mel features and cast them to the model's dtype.
    batch = processor([assembly], return_tensors='pt')
    features = batch['input_features'].to(torch.bfloat16)
    result = model.generate(
        features.cuda(),
        output_scores=True,
        return_dict_in_generate=True,
        return_timestamps=True,
        task='transcribe',
    )
# Detokenize the generated ids (including timestamp tokens) into a string.
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(result['sequences'][0]))
Output,
<|startoftranscript|><|ms|><|transcribe|>
<|0.00|> Assembly on Aging di Vienna, Australia yang telah diadakan pada tahun 1982<|6.42|>
<|6.42|> dan berasaskan unjuran tersebut maka Jabatan Perangkaan Malaysia menganggarkan<|11.58|>
<|11.58|> menjelang tahun 2035 sejumlah 15% penduduk kita adalah daripada kalangan warga emas.<|18.70|>
<|18.70|> Untuk makluman Tuan Yang Putua dan juga Aliam Bohmat, pembangunan sistem pendaftaran warga emas<|24.02|>
<|24.02|> ataupun kita sebutkan IWEN adalah usaha kerajaan ke arah merealisasikan objektif<|28.70|>
<|29.40|><|endoftext|>
with torch.no_grad():
    # Extract log-mel features and cast them to the model's dtype.
    batch = processor([toodia], return_tensors='pt')
    features = batch['input_features'].to(torch.bfloat16)
    result = model.generate(
        features.cuda(),
        output_scores=True,
        return_dict_in_generate=True,
        return_timestamps=True,
        task='transcribe',
    )
# Detokenize the generated ids (including timestamp tokens) into a string.
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(result['sequences'][0]))
Output,
<|startoftranscript|><|ms|><|transcribe|>
<|0.00|> Kerana dia tak ada keseimbangan dalam hidup.<|4.00|>
<|4.00|> Jika anda mencari keseimbangan dalam hidup,<|6.00|>
<|6.00|> anda akan menemukan dupe.<|7.00|>
<|7.00|> Tak ada?<|8.00|>
<|8.00|> Tak ada.<|9.00|>
<|9.00|> Tapi jika anda menikmati, anda akan baik-baik saja.<|13.00|>
<|15.00|> Seperti semua rahsia yang saya buat,<|17.00|>
<|17.00|> pada masa yang sama, saya menambahkan.<|19.00|>
<|19.00|> Pada masa yang sama, saya menambahkan.<|20.00|>
<|20.00|> Oh, jadi seperti kelabu-kelabu.<|22.00|>
<|22.00|> Lepas itu, saya menambahkan rahsia di Langkawi.<|25.00|>
<|25.00|> Selepas itu, kita mengambil pesawat.<|26.00|>
<|26.00|> Selepas itu, kita pergi dan mengambil pesawat ke KL.<|28.00|>
<|28.00|><|endoftext|>
from transformers import (
WhisperFeatureExtractor,
WhisperForConditionalGeneration,
WhisperProcessor,
WhisperTokenizerFast
)
from datasets import Audio
import torch
def get_audio(f):
    """Decode audio file *f* and return its waveform as a numpy array.

    Round-trips through the module-level ``datasets.Audio`` codec so any
    supported container format is decoded and resampled to the configured
    sampling rate.
    """
    decoded = audio.decode_example(audio.encode_example(f))
    return decoded['array']
# Checkpoint used for every component below.
checkpoint = 'openai/whisper-large-v3-turbo'

# Whisper operates on 16 kHz audio.
sr = 16000
audio = Audio(sampling_rate=sr)

feature_extractor = WhisperFeatureExtractor.from_pretrained(checkpoint)
processor = WhisperProcessor.from_pretrained(checkpoint)
tokenizer = WhisperTokenizerFast.from_pretrained(checkpoint)

# Load weights in bfloat16 and move the model onto the GPU.
model = WhisperForConditionalGeneration.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
).cuda()

# Decode both sample clips, keeping only the first 30 seconds of each
# (Whisper's maximum input window).
assembly = get_audio('assembly.mp3')
assembly = assembly[: sr * 30]
toodia = get_audio('toodia.mp3')
toodia = toodia[: sr * 30]
with torch.no_grad():
    # Extract log-mel features and cast them to the model's dtype.
    batch = processor([assembly], return_tensors='pt')
    features = batch['input_features'].to(torch.bfloat16)
    result = model.generate(
        features.cuda(),
        output_scores=True,
        return_dict_in_generate=True,
        return_timestamps=True,
        task='transcribe',
    )
# Detokenize the generated ids (including timestamp tokens) into a string.
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(result['sequences'][0]))
Output,
<|startoftranscript|><|ms|><|transcribe|>
<|0.00|> Assembly on Aging di Vienna, Australia<|3.72|>
<|3.72|> yang telah diadakan pada tahun 1982<|6.36|>
<|6.36|> dan berasaskan unjuran tersebut<|8.80|>
<|8.80|> maka Jabatan Perangkaan Malaysia<|10.38|>
<|10.38|> menganggarkan menjelang tahun 2035<|13.68|>
<|13.68|> sejumlah 15% penduduk kita adalah<|17.30|>
<|17.30|> daripada kalangan warga emas.<|18.70|>
<|19.26|> Untuk makluman Tuan Niri Putua dan juga Aliam Buhumat<|22.08|>
<|22.08|> pembangunan sistem pendaftaran warga emas<|24.02|>
<|24.02|> ataupun kita sebutkan EWEN<|25.36|>
<|25.36|> adalah usaha kerajaan kearah merealisasikan<|28.38|>
<|28.38|><|endoftext|>
with torch.no_grad():
    # Extract log-mel features and cast them to the model's dtype.
    batch = processor([toodia], return_tensors='pt')
    features = batch['input_features'].to(torch.bfloat16)
    result = model.generate(
        features.cuda(),
        output_scores=True,
        return_dict_in_generate=True,
        return_timestamps=True,
        task='transcribe',
    )
# Detokenize the generated ids (including timestamp tokens) into a string.
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(result['sequences'][0]))
Output,
<|startoftranscript|><|ms|><|transcribe|>
<|0.00|> My line of work, dia takde work life balance.<|3.74|>
<|4.36|> If you cari work life balance, dupe camp.<|7.16|>
<|7.36|> Takde.<|7.70|>
<|8.26|> Memang takde.<|8.94|>
<|9.86|> But, you know, if you enjoy it, then you're okay with it.<|12.70|>
<|15.40|> Macam lahsia semua aku buat pun, at the same time aku shoot.<|18.90|>
<|19.16|> At the same time aku ada pun motion.<|20.16|>
<|20.88|> Jadi macam kelang kabut.<|22.12|>
<|22.24|> Lepas tu lah lahsia pula aku shoot dekat Langkawi.<|24.50|>
<|25.16|> So, ni kita take flight.<|25.92|>
<|26.12|> Lepas tu pergi, take flight, balik ke L, lepas tu balik ke L.<|28.00|>
<|28.00|> Jadi macam tu pun nak.<|29.12|>
<|29.28|> Kelang kabut.<|29.76|>
<|endoftext|>
import torch
from transformers.models.whisper import tokenization_whisper
# Monkey-patch the whitelist of Whisper tasks BEFORE any tokenizer/processor
# is instantiated, so that generate(task='transcribeprecise') — the custom
# word-level-timestamp task added by this finetune — passes validation.
tokenization_whisper.TASK_IDS = ["translate", "transcribe", 'transcribeprecise']
from transformers import (
WhisperFeatureExtractor,
WhisperForConditionalGeneration,
WhisperProcessor,
WhisperTokenizerFast
)
from datasets import Audio
def get_audio(f):
    """Decode audio file *f* and return its waveform as a numpy array.

    Round-trips through the module-level ``datasets.Audio`` codec so any
    supported container format is decoded and resampled to the configured
    sampling rate.
    """
    decoded = audio.decode_example(audio.encode_example(f))
    return decoded['array']
# Finetuned checkpoint used for every component below.
checkpoint = 'mesolitica/Malaysian-whisper-large-v3-turbo-v3'

# Whisper operates on 16 kHz audio.
sr = 16000
audio = Audio(sampling_rate=sr)

feature_extractor = WhisperFeatureExtractor.from_pretrained(checkpoint)
processor = WhisperProcessor.from_pretrained(checkpoint)
tokenizer = WhisperTokenizerFast.from_pretrained(checkpoint)

# Load weights in bfloat16 and move the model onto the GPU.
model = WhisperForConditionalGeneration.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
).cuda()

# Decode both sample clips, keeping only the first 30 seconds of each
# (Whisper's maximum input window).
assembly = get_audio('assembly.mp3')
assembly = assembly[: sr * 30]
toodia = get_audio('toodia.mp3')
toodia = toodia[: sr * 30]
with torch.no_grad():
    # Extract log-mel features and cast them to the model's dtype.
    batch = processor([assembly], return_tensors='pt')
    features = batch['input_features'].to(torch.bfloat16)
    # 'transcribeprecise' requires the TASK_IDS patch applied at import time.
    result = model.generate(
        features.cuda(),
        output_scores=True,
        return_dict_in_generate=True,
        return_timestamps=True,
        task='transcribeprecise',
    )
# Detokenize the generated ids (including word-level timestamp tokens).
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(result['sequences'][0]))
Output,
<|startoftranscript|><|ms|><|transcribeprecise|>
<|0.00|> Example<|1.20|>
<|1.56|> on<|1.62|>
<|1.74|> Aging<|2.04|>
<|2.14|> di<|2.20|>
<|2.26|> Vienna,<|2.50|>
<|2.70|> Austria<|3.78|>
<|4.28|> yang<|4.38|>
<|4.42|> telah<|4.58|>
<|4.62|> diadakan<|5.08|>
<|5.16|> pada<|5.30|>
<|5.36|> tahun<|5.60|>
<|5.62|> 1982<|7.02|>
<|7.12|> dan<|7.24|>
<|7.32|> berasaskan<|7.86|>
<|7.96|> unjuran<|8.36|>
<|8.42|> tersebut,<|8.80|>
<|8.88|> maka<|9.04|>
<|9.12|> Jabatan<|9.48|>
<|9.54|> Perangkaan<|9.98|>
<|10.04|> Malaysia<|10.36|>
<|10.84|> menganggarkan<|11.56|>
<|11.96|> menjelang<|12.34|>
<|12.40|> tahun<|12.64|>
<|12.66|> 2035,<|14.02|>
<|14.50|> sejumlah<|14.96|>
<|14.98|> 15%<|15.92|>
<|16.26|> penduduk<|16.60|>
<|16.68|> kita<|16.88|>
<|17.00|> adalah<|17.30|>
<|17.40|> daripada<|17.80|>
<|17.86|> kalangan<|18.16|>
<|18.20|> warga<|18.38|>
<|18.44|> emas.<|18.66|>
<|19.24|> Untuk<|19.40|>
<|19.46|> makluman<|19.86|>
<|20.64|> Tuan<|20.76|>
<|20.80|> dan<|20.88|>
<|20.94|> Pertua<|21.14|>
<|21.20|> dan<|21.28|>
<|21.34|> juga<|21.50|>
<|21.58|> Aliam<|21.74|>
<|21.80|> Bohmat,<|22.08|>
<|22.22|> pembangunan<|22.66|>
<|22.72|> sistem<|23.00|>
<|23.06|> pendaftaran<|23.48|>
<|23.54|> warga<|23.72|>
<|23.78|> emas<|23.98|>
<|24.06|> ataupun<|24.36|>
<|24.42|> kita<|24.56|>
<|24.64|> sebutkan<|24.96|>
<|25.08|> EWEN<|25.34|>
<|25.86|> adalah<|26.08|>
<|26.16|> usaha<|26.46|>
<|26.60|> kerajaan<|27.04|>
<|27.16|> ke<|27.20|>
<|27.28|> arah<|27.44|>
<|27.50|> merealisasikan<|28.36|>
<|28.88|> objektif<|29.36|>
<|29.42|> yang<|29.50|>
<|29.56|> telah<|29.70|>
<|29.76|> digariskan<|29.98|>
<|endoftext|>
with torch.no_grad():
    # Extract log-mel features and cast them to the model's dtype.
    batch = processor([toodia], return_tensors='pt')
    features = batch['input_features'].to(torch.bfloat16)
    # 'transcribeprecise' requires the TASK_IDS patch applied at import time.
    result = model.generate(
        features.cuda(),
        output_scores=True,
        return_dict_in_generate=True,
        return_timestamps=True,
        task='transcribeprecise',
    )
# Detokenize the generated ids (including word-level timestamp tokens).
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(result['sequences'][0]))
Output,
<|startoftranscript|><|ms|><|transcribeprecise|>
<|0.00|> Pada<|0.70|>
<|0.70|> barisan<|1.00|>
<|1.04|> kerja,<|1.24|>
<|1.96|> dia<|2.08|>
<|2.20|> takde<|2.48|>
<|2.76|> keseimbangan<|2.90|>
<|2.94|> kerja-hidup<|3.20|>
<|3.28|> keseimbangan.<|3.72|>
<|4.68|> Jika<|4.78|>
<|4.84|> anda<|4.92|>
<|4.96|> cari<|5.20|>
<|5.30|> keseimbangan<|5.40|>
<|5.44|> kerja-hidup<|5.66|>
<|5.72|> keseimbangan,<|6.20|>
<|6.52|> dupe<|6.72|>
<|6.80|> kem.<|7.08|>
<|7.36|> Takde.<|7.64|>
<|7.92|> Takde.<|8.10|>
<|8.52|> Memang<|8.74|>
<|8.78|> takde.<|8.88|>
<|9.80|> Takde.<|10.02|>
<|10.16|> Tetapi,<|10.34|>
<|10.68|> anda<|10.74|>
<|10.78|> tahu,<|10.88|>
<|10.98|> jika<|11.04|>
<|11.08|> anda<|11.12|>
<|11.18|> menikmatinya,<|11.52|>
<|11.64|> ya,<|11.76|>
<|11.94|> maka<|12.06|>
<|12.24|> anda<|12.30|>
<|12.36|> okay<|12.48|>
<|12.52|> dengan<|12.60|>
<|12.64|> itu.<|12.68|>
<|15.64|> Macam<|15.84|>
<|15.88|> lahsia<|16.16|>
<|16.24|> semua<|16.36|>
<|16.40|> aku<|16.54|>
<|16.80|> buat<|16.96|>
<|17.00|> pun,<|17.12|>
<|18.00|> pada<|18.12|>
<|18.16|> masa<|18.28|>
<|18.32|> sama<|18.44|>
<|18.48|> aku<|18.56|>
<|18.64|> shoot.<|18.88|>
<|19.16|> Pada<|19.24|>
<|19.28|> sama<|19.42|>
<|19.46|> masa<|19.56|>
<|19.60|> aku<|19.68|>
<|19.72|> ada<|19.76|>
<|19.80|> pusing<|19.92|>
<|19.96|> motion.<|20.24|>
<|20.36|> Oh.<|20.48|>
<|20.56|> Oh.<|20.62|>
<|20.68|> Jadi<|21.40|>
<|21.50|> macam<|21.68|>
<|21.72|> kelangkabut.<|22.12|>
<|22.18|> Lepas<|22.38|>
<|22.42|> lahsia<|22.90|>
<|22.98|> pula<|23.08|>
<|23.16|> aku<|23.22|>
<|23.30|> shoot<|23.44|>
<|23.48|> dekat<|23.72|>
<|24.10|> Langkawi.<|24.44|>
<|25.14|> Jadi<|25.24|>
<|25.28|> kita<|25.44|>
<|25.48|> ambil<|25.60|>
<|25.64|> penerbangan.<|25.92|>
<|26.08|> Lepas<|26.20|>
<|26.24|> tu<|26.30|>
<|26.34|> pergi<|26.52|>
<|26.64|> ambil<|26.72|>
<|26.80|> penerbangan,<|26.96|>
<|27.00|> balik<|27.32|>
<|27.36|> KL,<|27.58|>
<|27.62|> lepas<|27.72|>
<|27.76|> tu<|27.78|>
<|27.78|> pulak<|27.88|>
<|27.92|> KL.<|27.98|>
<|28.04|> Kita<|28.12|>
<|28.16|> jadi<|28.56|>
<|28.60|> macam<|28.72|>
<|28.76|> lah.<|28.80|>
<|28.84|> Itu<|28.92|>
<|28.96|> penat.<|29.32|>
<|29.36|> Kelangkabut<|29.72|>
<|endoftext|>
Base model
openai/whisper-large-v3