Spaces:

Slamlab
/

asr_metrics

Sleeping

App Files Files Community

akki2825 committed on Jul 4

Commit

8c82d13

verified ·

1 Parent(s): 047167d

update splitting fn

Browse files

Files changed (1) hide show

app.py +4 -7

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ def split_into_sentences(text):
     Simple sentence tokenizer using regular expressions.
     Splits text into sentences based on punctuation.
     """
-    sentences = re.split(r'(?<=[.!?])\s*', text)
     sentences = [s.strip() for s in sentences if s.strip()]
     return sentences
@@ -78,7 +78,6 @@ def calculate_sentence_metrics(reference, hypothesis):
     except Exception as e:
         raise e
 def identify_misaligned_sentences(reference_text, hypothesis_text):
     """
     Identify sentences that don't match between reference and hypothesis.
@@ -88,7 +87,6 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
     hypothesis_sentences = split_into_sentences(hypothesis_text)
     misaligned = []
     for i, (ref, hyp) in enumerate(zip(reference_sentences, hypothesis_sentences)):
         if ref != hyp:
             # Split sentences into words
@@ -102,9 +100,6 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
                 if ref_words[j] != hyp_words[j]:
                     misalignment_start = j
                     break
-            # Check if one sentence is longer than the other
-            if len(ref_words) != len(hyp_words):
-                misalignment_start = min_length
             # Prepare the context for display
             context_ref = ' '.join(ref_words[:misalignment_start] + ['**' + ref_words[misalignment_start] + '**'])
@@ -118,7 +113,8 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
                 "context_ref": context_ref,
                 "context_hyp": context_hyp
             })
     if len(reference_sentences) > len(hypothesis_sentences):
         for i in range(len(hypothesis_sentences), len(reference_sentences)):
             misaligned.append({
@@ -142,6 +138,7 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
     return misaligned
 def format_sentence_metrics(sentence_wers, sentence_cers, average_wer, average_cer, std_dev_wer, std_dev_cer, misaligned_sentences):
     md = "### Sentence-level Metrics\n\n"
     md += "#### Word Error Rate (WER)\n"

     Simple sentence tokenizer using regular expressions.
     Splits text into sentences based on punctuation.
     """
+    sentences = text.split()
     sentences = [s.strip() for s in sentences if s.strip()]
     return sentences
     except Exception as e:
         raise e
 def identify_misaligned_sentences(reference_text, hypothesis_text):
     """
     Identify sentences that don't match between reference and hypothesis.
     hypothesis_sentences = split_into_sentences(hypothesis_text)
     misaligned = []
     for i, (ref, hyp) in enumerate(zip(reference_sentences, hypothesis_sentences)):
         if ref != hyp:
             # Split sentences into words
                 if ref_words[j] != hyp_words[j]:
                     misalignment_start = j
                     break
             # Prepare the context for display
             context_ref = ' '.join(ref_words[:misalignment_start] + ['**' + ref_words[misalignment_start] + '**'])
                 "context_ref": context_ref,
                 "context_hyp": context_hyp
             })
+    # Handle cases where the number of sentences differs
     if len(reference_sentences) > len(hypothesis_sentences):
         for i in range(len(hypothesis_sentences), len(reference_sentences)):
             misaligned.append({
     return misaligned
 def format_sentence_metrics(sentence_wers, sentence_cers, average_wer, average_cer, std_dev_wer, std_dev_cer, misaligned_sentences):
     md = "### Sentence-level Metrics\n\n"
     md += "#### Word Error Rate (WER)\n"