marcosremar2 committed
Commit c555b3d · 0 parent(s)

Initial commit: Wav2Vec2 XLS-R 1B Portuguese ASR Gradio app

Files changed (3)
  1. README.md +148 -0
  2. app.py +109 -0
  3. requirements.txt +6 -0
README.md ADDED
@@ -0,0 +1,148 @@
+ ---
+ language:
+ - pt
+ license: apache-2.0
+ tags:
+ - automatic-speech-recognition
+ - hf-asr-leaderboard
+ - mozilla-foundation/common_voice_8_0
+ - pt
+ - robust-speech-event
+ datasets:
+ - mozilla-foundation/common_voice_8_0
+ model-index:
+ - name: XLS-R Wav2Vec2 Portuguese by Jonatas Grosman
+   results:
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Common Voice 8
+       type: mozilla-foundation/common_voice_8_0
+       args: pt
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 8.7
+     - name: Test CER
+       type: cer
+       value: 2.55
+     - name: Test WER (+LM)
+       type: wer
+       value: 6.04
+     - name: Test CER (+LM)
+       type: cer
+       value: 1.98
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Robust Speech Event - Dev Data
+       type: speech-recognition-community-v2/dev_data
+       args: pt
+     metrics:
+     - name: Dev WER
+       type: wer
+       value: 24.23
+     - name: Dev CER
+       type: cer
+       value: 11.3
+     - name: Dev WER (+LM)
+       type: wer
+       value: 19.41
+     - name: Dev CER (+LM)
+       type: cer
+       value: 10.19
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Robust Speech Event - Test Data
+       type: speech-recognition-community-v2/eval_data
+       args: pt
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 18.8
+ ---
+
+ # Fine-tuned XLS-R 1B model for speech recognition in Portuguese
+
+ Fine-tuned [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) on Portuguese using the train and validation splits of [Common Voice 8.0](https://huggingface.co/datasets/mozilla-foundation/common_voice_8_0), [CORAA](https://github.com/nilc-nlp/CORAA), [Multilingual TEDx](http://www.openslr.org/100), and [Multilingual LibriSpeech](https://www.openslr.org/94/).
+ When using this model, make sure that your speech input is sampled at 16 kHz.
+
+ This model was fine-tuned with the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) tool, thanks to GPU credits generously provided by [OVHcloud](https://www.ovhcloud.com/en/public-cloud/ai-training/) :)
+
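+ For example, `librosa` resamples on load (an illustrative snippet; any resampler works):
+
+ ```python
+ import librosa
+
+ # Load and resample any audio file to the 16 kHz mono input the model expects
+ speech, sr = librosa.load("audio.mp3", sr=16_000, mono=True)
+ ```
+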
+ ## Usage
+
+ Using the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) library:
+
+ ```python
+ from huggingsound import SpeechRecognitionModel
+
+ model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-xls-r-1b-portuguese")
+ audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]
+
+ transcriptions = model.transcribe(audio_paths)
+ ```
+
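+ `transcribe` returns one result per input file; in current HuggingSound versions each result is a dict whose text lives under the `"transcription"` key (a small illustrative addition):
+
+ ```python
+ for result in transcriptions:
+     print(result["transcription"])
+ ```
+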
+ Writing your own inference script:
+
+ ```python
+ import torch
+ import librosa
+ from datasets import load_dataset
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+
+ LANG_ID = "pt"
+ MODEL_ID = "jonatasgrosman/wav2vec2-xls-r-1b-portuguese"
+ SAMPLES = 10
+
+ test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")
+
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+
+ # Preprocessing the datasets.
+ # We need to read the audio files as arrays
+ def speech_file_to_array_fn(batch):
+     speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
+     batch["speech"] = speech_array
+     batch["sentence"] = batch["sentence"].upper()
+     return batch
+
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
+ inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+ with torch.no_grad():
+     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ predicted_sentences = processor.batch_decode(predicted_ids)
+ ```
+
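+ To inspect the output, you can pair each greedy prediction with its reference transcript (a small illustrative addition, not part of the original script):
+
+ ```python
+ for reference, prediction in zip(test_dataset["sentence"], predicted_sentences):
+     print(f"Reference:  {reference}")
+     print(f"Prediction: {prediction}")
+ ```
+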
+ ## Evaluation Commands
+
+ 1. To evaluate on `mozilla-foundation/common_voice_8_0` with split `test`:
+
+ ```bash
+ python eval.py --model_id jonatasgrosman/wav2vec2-xls-r-1b-portuguese --dataset mozilla-foundation/common_voice_8_0 --config pt --split test
+ ```
+
+ 2. To evaluate on `speech-recognition-community-v2/dev_data`:
+
+ ```bash
+ python eval.py --model_id jonatasgrosman/wav2vec2-xls-r-1b-portuguese --dataset speech-recognition-community-v2/dev_data --config pt --split validation --chunk_length_s 5.0 --stride_length_s 1.0
+ ```
+
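+ `eval.py` is not included in this commit; at its core it computes the WER/CER metrics reported above. A minimal sketch of that computation using the `evaluate` library (an assumption about the script's internals, not its actual code):
+
+ ```python
+ import evaluate
+
+ # Hypothetical lists of model outputs and reference transcripts
+ predictions = ["olá mundo"]
+ references = ["olá mundo"]
+
+ wer = evaluate.load("wer").compute(predictions=predictions, references=references)
+ cer = evaluate.load("cer").compute(predictions=predictions, references=references)
+ print(f"WER: {wer:.4f}, CER: {cer:.4f}")
+ ```
+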
+ ## Citation
+
+ If you want to cite this model, you can use:
+
+ ```bibtex
+ @misc{grosman2021xlsr-1b-portuguese,
+   title={Fine-tuned {XLS-R} 1{B} model for speech recognition in {P}ortuguese},
+   author={Grosman, Jonatas},
+   howpublished={\url{https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-portuguese}},
+   year={2022}
+ }
+ ```
app.py ADDED
@@ -0,0 +1,109 @@
+ """
+ Wav2Vec2 XLS-R 1B Portuguese - Hugging Face Space
+ """
+
+ import gradio as gr
+ import torch
+ import librosa
+ import numpy as np
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+ # Initialize model and processor
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model_name = "jonatasgrosman/wav2vec2-xls-r-1b-portuguese"
+
+ print(f"Loading model {model_name}...")
+ processor = Wav2Vec2Processor.from_pretrained(model_name)
+ model = Wav2Vec2ForCTC.from_pretrained(model_name)
+ model.to(device)
+ model.eval()
+ print(f"Model loaded on device: {device}")
+
+ def transcribe_audio(audio_path):
+     """Transcribe audio using Wav2Vec2"""
+     try:
+         # Load audio, resampled to the 16 kHz mono input the model expects
+         speech_array, sampling_rate = librosa.load(audio_path, sr=16000, mono=True)
+
+         # Process with model
+         inputs = processor(
+             speech_array,
+             sampling_rate=16000,
+             return_tensors="pt",
+             padding=True
+         )
+
+         inputs = {k: v.to(device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             logits = model(**inputs).logits
+
+         # Decode the greedy (argmax) CTC path
+         predicted_ids = torch.argmax(logits, dim=-1)
+         transcription = processor.decode(predicted_ids[0])
+
+         # Confidence: the single highest softmax probability over all frames --
+         # a rough proxy, not a calibrated per-utterance score
+         probs = torch.softmax(logits, dim=-1)
+         confidence = torch.max(probs).item()
+
+         return transcription, confidence
+
+     except Exception as e:
+         return f"Error: {str(e)}", 0.0
+
+ def process_audio(audio):
+     """Process audio input from Gradio"""
+     if audio is None:
+         return "Please provide an audio file.", ""
+
+     transcription, confidence = transcribe_audio(audio)
+
+     # Format output
+     output = f"**Transcription:** {transcription}\n\n"
+     output += f"**Confidence:** {confidence:.2%}"
+
+     return output, transcription
+
+ # Create Gradio interface
+ with gr.Blocks(title="Wav2Vec2 XLS-R 1B Portuguese") as demo:
+     gr.Markdown("# 🎙️ Wav2Vec2 XLS-R 1B - Portuguese ASR")
+     gr.Markdown("Speech recognition for Portuguese using jonatasgrosman/wav2vec2-xls-r-1b-portuguese")
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(
+                 sources=["upload", "microphone"],
+                 type="filepath",
+                 label="Audio Input"
+             )
+
+             submit_btn = gr.Button("Transcribe", variant="primary")
+
+         with gr.Column():
+             output_text = gr.Markdown(label="Results")
+             transcription_output = gr.Textbox(
+                 label="Transcription Text",
+                 lines=3,
+                 interactive=False
+             )
+
+     submit_btn.click(
+         fn=process_audio,
+         inputs=[audio_input],
+         outputs=[output_text, transcription_output]
+     )
+
+     gr.Examples(
+         examples=[
+             ["example_audio.wav"],
+         ],
+         inputs=[audio_input],
+         cache_examples=False
+     )
+
+ # Launch the app - let Hugging Face Spaces handle the configuration
+ if __name__ == "__main__":
+     demo.launch()  # server_name/server_port omitted for HF Spaces compatibility
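
Once the Space is running, it can also be queried programmatically. A minimal sketch using `gradio_client`, with a placeholder Space id and the endpoint name Gradio is assumed to derive from `process_audio`:

```python
from gradio_client import Client, handle_file

# "user/space-name" is a placeholder -- this commit does not state the Space id
client = Client("user/space-name")

# The Transcribe button is wired to process_audio, so the endpoint name is
# assumed to be "/process_audio"; handle_file uploads the local audio file
output_markdown, transcription = client.predict(
    handle_file("sample.wav"),
    api_name="/process_audio",
)
print(transcription)
```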
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio
+ transformers
+ torch
+ torchaudio
+ librosa
+ numpy