Spaces:
Sleeping
Sleeping
Update sentence-splitting function
Browse files
app.py
CHANGED
@@ -9,7 +9,7 @@ def split_into_sentences(text):
|
|
9 |
Simple sentence tokenizer using regular expressions.
|
10 |
Splits text into sentences based on punctuation.
|
11 |
"""
|
12 |
-
sentences =
|
13 |
sentences = [s.strip() for s in sentences if s.strip()]
|
14 |
return sentences
|
15 |
|
@@ -78,7 +78,6 @@ def calculate_sentence_metrics(reference, hypothesis):
|
|
78 |
except Exception as e:
|
79 |
raise e
|
80 |
|
81 |
-
|
82 |
def identify_misaligned_sentences(reference_text, hypothesis_text):
|
83 |
"""
|
84 |
Identify sentences that don't match between reference and hypothesis.
|
@@ -88,7 +87,6 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
|
|
88 |
hypothesis_sentences = split_into_sentences(hypothesis_text)
|
89 |
|
90 |
misaligned = []
|
91 |
-
|
92 |
for i, (ref, hyp) in enumerate(zip(reference_sentences, hypothesis_sentences)):
|
93 |
if ref != hyp:
|
94 |
# Split sentences into words
|
@@ -102,9 +100,6 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
|
|
102 |
if ref_words[j] != hyp_words[j]:
|
103 |
misalignment_start = j
|
104 |
break
|
105 |
-
# Check if one sentence is longer than the other
|
106 |
-
if len(ref_words) != len(hyp_words):
|
107 |
-
misalignment_start = min_length
|
108 |
|
109 |
# Prepare the context for display
|
110 |
context_ref = ' '.join(ref_words[:misalignment_start] + ['**' + ref_words[misalignment_start] + '**'])
|
@@ -118,7 +113,8 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
|
|
118 |
"context_ref": context_ref,
|
119 |
"context_hyp": context_hyp
|
120 |
})
|
121 |
-
|
|
|
122 |
if len(reference_sentences) > len(hypothesis_sentences):
|
123 |
for i in range(len(hypothesis_sentences), len(reference_sentences)):
|
124 |
misaligned.append({
|
@@ -142,6 +138,7 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
|
|
142 |
|
143 |
return misaligned
|
144 |
|
|
|
145 |
def format_sentence_metrics(sentence_wers, sentence_cers, average_wer, average_cer, std_dev_wer, std_dev_cer, misaligned_sentences):
|
146 |
md = "### Sentence-level Metrics\n\n"
|
147 |
md += "#### Word Error Rate (WER)\n"
|
|
|
9 |
Simple sentence tokenizer using regular expressions.
|
10 |
Splits text into sentences based on punctuation.
|
11 |
"""
|
12 |
+
sentences = text.split()
|
13 |
sentences = [s.strip() for s in sentences if s.strip()]
|
14 |
return sentences
|
15 |
|
|
|
78 |
except Exception as e:
|
79 |
raise e
|
80 |
|
|
|
81 |
def identify_misaligned_sentences(reference_text, hypothesis_text):
|
82 |
"""
|
83 |
Identify sentences that don't match between reference and hypothesis.
|
|
|
87 |
hypothesis_sentences = split_into_sentences(hypothesis_text)
|
88 |
|
89 |
misaligned = []
|
|
|
90 |
for i, (ref, hyp) in enumerate(zip(reference_sentences, hypothesis_sentences)):
|
91 |
if ref != hyp:
|
92 |
# Split sentences into words
|
|
|
100 |
if ref_words[j] != hyp_words[j]:
|
101 |
misalignment_start = j
|
102 |
break
|
|
|
|
|
|
|
103 |
|
104 |
# Prepare the context for display
|
105 |
context_ref = ' '.join(ref_words[:misalignment_start] + ['**' + ref_words[misalignment_start] + '**'])
|
|
|
113 |
"context_ref": context_ref,
|
114 |
"context_hyp": context_hyp
|
115 |
})
|
116 |
+
|
117 |
+
# Handle cases where the number of sentences differs
|
118 |
if len(reference_sentences) > len(hypothesis_sentences):
|
119 |
for i in range(len(hypothesis_sentences), len(reference_sentences)):
|
120 |
misaligned.append({
|
|
|
138 |
|
139 |
return misaligned
|
140 |
|
141 |
+
|
142 |
def format_sentence_metrics(sentence_wers, sentence_cers, average_wer, average_cer, std_dev_wer, std_dev_cer, misaligned_sentences):
|
143 |
md = "### Sentence-level Metrics\n\n"
|
144 |
md += "#### Word Error Rate (WER)\n"
|