Spaces commit: fix tok
app.py (CHANGED)
```diff
@@ -1,43 +1,19 @@
 import spaces
 import jiwer
 import numpy as np
+import re
 import gradio as gr
-import nltk
-from nltk.tokenize import sent_tokenize
-import os
-import requests
 
-
-
-
-
-
-
-
-
-
-
-url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip"
-response = requests.get(url)
-
-# Create the NLTK data directory if it doesn't exist
-nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
-if not os.path.exists(nltk_data_dir):
-    os.makedirs(nltk_data_dir)
-
-# Save the tokenizer to the NLTK data directory
-tokenizer_path = os.path.join(nltk_data_dir, "tokenizers", "punkt")
-if not os.path.exists(tokenizer_path):
-    os.makedirs(tokenizer_path)
-
-# Extract and save the punkt tokenizer
-with open(os.path.join(tokenizer_path, "punkt.zip"), "wb") as f:
-    f.write(response.content)
-
-print("NLTK punkt tokenizer downloaded successfully")
-
-# Call the function to setup the tokenizer
-setup_nltk_tokenizer()
+def split_into_sentences(text):
+    """
+    Simple sentence tokenizer using regular expressions.
+    Splits text into sentences based on punctuation.
+    """
+    # Split text into sentences using regex
+    sentences = re.split(r'(?<=[.!?])\s*', text)
+    # Clean up empty strings and whitespace
+    sentences = [s.strip() for s in sentences if s.strip()]
+    return sentences
 
 @spaces.GPU()
 def calculate_wer(reference, hypothesis):
```
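The core of the fix: the punkt download block is gone and sentence splitting is now a pure-regex helper, so the Space no longer needs network access or an NLTK data directory at startup. A minimal standalone check of the new splitter (the sample inputs are mine, not from the Space):

```python
import re

def split_into_sentences(text):
    """Regex sentence splitter, as introduced in this commit."""
    sentences = re.split(r'(?<=[.!?])\s*', text)
    return [s.strip() for s in sentences if s.strip()]

print(split_into_sentences("Hello world. How are you? Fine!"))
# -> ['Hello world.', 'How are you?', 'Fine!']

# Unlike NLTK punkt, the lookbehind splits after every '.', so
# abbreviations become sentence boundaries:
print(split_into_sentences("Dr. Smith arrived."))
# -> ['Dr.', 'Smith arrived.']
```

For WER-style inputs that trade-off is usually acceptable, and it removes the runtime download the old code performed on every cold start.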
```diff
@@ -61,15 +37,11 @@ def calculate_sentence_wer(reference, hypothesis):
     Calculate WER for each sentence and overall statistics.
     """
     try:
-        reference_sentences = sent_tokenize(reference)
-        hypothesis_sentences = sent_tokenize(hypothesis)
+        reference_sentences = split_into_sentences(reference)
+        hypothesis_sentences = split_into_sentences(hypothesis)
 
-
-        min_sentences = min(len(reference_sentences), len(hypothesis_sentences))
-
-        # Trim to the same number of sentences
-        reference_sentences = reference_sentences[:min_sentences]
-        hypothesis_sentences = hypothesis_sentences[:min_sentences]
+        if len(reference_sentences) != len(hypothesis_sentences):
+            raise ValueError("Reference and hypothesis must contain the same number of sentences")
 
         sentence_wers = []
         for ref, hyp in zip(reference_sentences, hypothesis_sentences):
```
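The old trim-to-the-shorter-text behavior is replaced by a hard length check, so mismatched sentence counts now raise instead of being silently truncated. The loop that follows presumably scores each aligned pair; a small sketch of that per-sentence scoring, assuming the loop body calls jiwer.wer:

```python
import jiwer

ref_sents = ["the cat sat on the mat.", "it was warm."]
hyp_sents = ["the cat sat on a mat.", "it was cold."]

# One WER per aligned sentence pair, as calculate_sentence_wer collects them
sentence_wers = [jiwer.wer(r, h) for r, h in zip(ref_sents, hyp_sents)]
print([round(w, 3) for w in sentence_wers])  # [0.167, 0.333]
```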
```diff
@@ -80,34 +52,19 @@ def calculate_sentence_wer(reference, hypothesis):
             return {
                 "sentence_wers": [],
                 "average_wer": 0.0,
-                "std_dev": 0.0,
-                "warning": "No sentences to compare"
+                "std_dev": 0.0
             }
 
         average_wer = np.mean(sentence_wers)
         std_dev = np.std(sentence_wers)
 
-        # Check if there were extra sentences
-        if len(reference_sentences) != len(hypothesis_sentences):
-            warning = f"Reference has {len(reference_sentences)} sentences, " \
-                      f"hypothesis has {len(hypothesis_sentences)} sentences. " \
-                      f"Only compared the first {min_sentences} sentences."
-        else:
-            warning = None
-
         return {
             "sentence_wers": sentence_wers,
             "average_wer": average_wer,
-            "std_dev": std_dev,
-            "warning": warning
+            "std_dev": std_dev
         }
     except Exception as e:
-        return {
-            "sentence_wers": [],
-            "average_wer": 0.0,
-            "std_dev": 0.0,
-            "error": str(e)
-        }
+        raise e
 
 @spaces.GPU()
 def process_files(reference_file, hypothesis_file):
```
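Error handling inside calculate_sentence_wer is reduced to a re-raise, so failures now propagate to process_files instead of being folded into the result dict (a bare `raise` would preserve the traceback slightly more idiomatically than `raise e`). The kept summary statistics are plain NumPy aggregates; worth noting when reading the numbers that np.std defaults to the population standard deviation:

```python
import numpy as np

sentence_wers = [0.0, 0.25, 0.5]
print(np.mean(sentence_wers))         # 0.25
print(np.std(sentence_wers))          # ~0.204 (population std, ddof=0)
print(np.std(sentence_wers, ddof=1))  # 0.25 (sample estimate, for contrast)
```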
```diff
@@ -118,11 +75,6 @@ def process_files(reference_file, hypothesis_file):
         with open(hypothesis_file.name, 'r') as f:
             hypothesis_text = f.read()
 
-        if not reference_text or not hypothesis_text:
-            return {
-                "error": "Both reference and hypothesis files must contain text"
-            }
-
         wer_value = calculate_wer(reference_text, hypothesis_text)
         cer_value = calculate_cer(reference_text, hypothesis_text)
         sentence_wer_stats = calculate_sentence_wer(reference_text, hypothesis_text)
```
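With the explicit empty-text guard removed, empty uploads are left to the library to reject: if I read jiwer correctly, an empty reference makes jiwer.wer raise ValueError, which the except branch in the next hunk turns into an error result. A quick probe of that assumption:

```python
import jiwer

try:
    jiwer.wer("", "some hypothesis")  # empty reference, as from an empty file
except ValueError as e:
    print("error:", e)  # becomes {"error": str(e)} in process_files below
```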
```diff
@@ -132,39 +84,21 @@ def process_files(reference_file, hypothesis_file):
             "CER": cer_value,
             "Sentence WERs": sentence_wer_stats["sentence_wers"],
             "Average WER": sentence_wer_stats["average_wer"],
-            "Standard Deviation": sentence_wer_stats["std_dev"],
-            "Warning": sentence_wer_stats.get("warning"),
-            "Error": sentence_wer_stats.get("error")
+            "Standard Deviation": sentence_wer_stats["std_dev"]
         }
     except Exception as e:
-        return {
-            "WER": 0.0,
-            "CER": 0.0,
-            "Sentence WERs": [],
-            "Average WER": 0.0,
-            "Standard Deviation": 0.0,
-            "Error": str(e)
-        }
-
-def format_sentence_wer_stats(sentence_wers, average_wer, std_dev, warning, error):
-    md = ""
-
-    if error:
-        md += f"### Error\n{error}\n\n"
-    elif warning:
-        md += f"### Warning\n{warning}\n\n"
+        return {"error": str(e)}
 
+def format_sentence_wer_stats(sentence_wers, average_wer, std_dev):
     if not sentence_wers:
-
-        return md
+        return "All sentences match perfectly!"
 
-    md += "### Sentence-level WER Analysis\n\n"
+    md = "### Sentence-level WER Analysis\n\n"
     md += f"* Average WER: {average_wer:.2f}\n"
     md += f"* Standard Deviation: {std_dev:.2f}\n\n"
    md += "### WER for Each Sentence\n\n"
     for i, wer in enumerate(sentence_wers):
         md += f"* Sentence {i+1}: {wer:.2f}\n"
-
     return md
 
 def main():
```
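The formatter loses its warning/error parameters and builds the markdown in one pass. For reference, what the new version renders for a couple of scores (the values are illustrative):

```python
def format_sentence_wer_stats(sentence_wers, average_wer, std_dev):
    # Same body as the new version in this commit
    if not sentence_wers:
        return "All sentences match perfectly!"
    md = "### Sentence-level WER Analysis\n\n"
    md += f"* Average WER: {average_wer:.2f}\n"
    md += f"* Standard Deviation: {std_dev:.2f}\n\n"
    md += "### WER for Each Sentence\n\n"
    for i, wer in enumerate(sentence_wers):
        md += f"* Sentence {i+1}: {wer:.2f}\n"
    return md

print(format_sentence_wer_stats([0.17, 0.33], 0.25, 0.08))
```

One caveat a reviewer might raise: the empty-list branch fires whenever there are no sentence scores at all, which means "nothing to compare", not necessarily "perfect match"; the old code reported that case as a warning.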
```diff
@@ -211,24 +145,18 @@ def main():
 
     def process_and_display(ref_file, hyp_file):
         result = process_files(ref_file, hyp_file)
+        if "error" in result:
+            return {}, {}, "Error: " + result["error"]
 
         metrics = {
             "WER": result["WER"],
             "CER": result["CER"]
         }
 
-        error = result.get("Error")
-        warning = result.get("Warning")
-        sentence_wers = result.get("Sentence WERs", [])
-        average_wer = result.get("Average WER", 0.0)
-        std_dev = result.get("Standard Deviation", 0.0)
-
         wer_stats_md = format_sentence_wer_stats(
-            sentence_wers,
-            average_wer,
-            std_dev,
-            warning,
-            error
+            result["Sentence WERs"],
+            result["Average WER"],
+            result["Standard Deviation"]
         )
 
         return metrics, wer_stats_md
```
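One inconsistency worth flagging in the new wiring: the error branch returns three values while the success branch returns two, so the Gradio outputs this callback feeds can only match one of the two shapes. A hedged sketch of a consistent version, reusing the app's own functions (my suggestion, not part of the commit):

```python
def process_and_display(ref_file, hyp_file):
    result = process_files(ref_file, hyp_file)
    if "error" in result:
        # Mirror the success branch's shape: (metrics dict, markdown string)
        return {}, "Error: " + result["error"]

    metrics = {"WER": result["WER"], "CER": result["CER"]}
    wer_stats_md = format_sentence_wer_stats(
        result["Sentence WERs"],
        result["Average WER"],
        result["Standard Deviation"],
    )
    return metrics, wer_stats_md
```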