Spaces:
Sleeping
Sleeping
misalignment at the word level
Browse files
app.py
CHANGED
@@ -82,31 +82,29 @@ def calculate_sentence_metrics(reference, hypothesis):
|
|
82 |
def identify_misaligned_sentences(reference_text, hypothesis_text):
|
83 |
"""
|
84 |
Identify sentences that don't match between reference and hypothesis.
|
85 |
-
Handles cases where the number of sentences differ.
|
86 |
Returns a dictionary with misaligned sentence pairs, their indices, and misalignment details.
|
87 |
"""
|
88 |
reference_sentences = split_into_sentences(reference_text)
|
89 |
hypothesis_sentences = split_into_sentences(hypothesis_text)
|
90 |
|
91 |
misaligned = []
|
92 |
-
|
93 |
-
|
94 |
-
# Compare sentences up to the minimum length
|
95 |
-
for i in range(min_length):
|
96 |
-
ref = reference_sentences[i]
|
97 |
-
hyp = hypothesis_sentences[i]
|
98 |
-
|
99 |
if ref != hyp:
|
100 |
-
#
|
101 |
-
|
|
|
|
|
|
|
|
|
102 |
misalignment_start = 0
|
103 |
-
for j in range(
|
104 |
-
if
|
105 |
misalignment_start = j
|
106 |
break
|
|
|
107 |
# Prepare the context for display
|
108 |
-
context_ref =
|
109 |
-
context_hyp =
|
110 |
|
111 |
misaligned.append({
|
112 |
"index": i+1,
|
@@ -117,9 +115,9 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
|
|
117 |
"context_hyp": context_hyp
|
118 |
})
|
119 |
|
120 |
-
#
|
121 |
if len(reference_sentences) > len(hypothesis_sentences):
|
122 |
-
for i in range(
|
123 |
misaligned.append({
|
124 |
"index": i+1,
|
125 |
"reference": reference_sentences[i],
|
@@ -129,7 +127,7 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
|
|
129 |
"context_hyp": "No corresponding sentence"
|
130 |
})
|
131 |
elif len(hypothesis_sentences) > len(reference_sentences):
|
132 |
-
for i in range(
|
133 |
misaligned.append({
|
134 |
"index": i+1,
|
135 |
"reference": "No corresponding sentence",
|
@@ -162,9 +160,9 @@ def format_sentence_metrics(sentence_wers, sentence_cers, average_wer, average_c
|
|
162 |
md += "\n### Misaligned Sentences\n\n"
|
163 |
for misaligned in misaligned_sentences:
|
164 |
md += f"#### Sentence {misaligned['index']}\n"
|
165 |
-
md += f"* Reference: {misaligned['
|
166 |
-
md += f"* Hypothesis: {misaligned['
|
167 |
-
md += f"* Misalignment starts at
|
168 |
else:
|
169 |
md += "\n### Misaligned Sentences\n\n"
|
170 |
md += "* No misaligned sentences found."
|
|
|
82 |
def identify_misaligned_sentences(reference_text, hypothesis_text):
|
83 |
"""
|
84 |
Identify sentences that don't match between reference and hypothesis.
|
|
|
85 |
Returns a dictionary with misaligned sentence pairs, their indices, and misalignment details.
|
86 |
"""
|
87 |
reference_sentences = split_into_sentences(reference_text)
|
88 |
hypothesis_sentences = split_into_sentences(hypothesis_text)
|
89 |
|
90 |
misaligned = []
|
91 |
+
for i, (ref, hyp) in enumerate(zip(reference_sentences, hypothesis_sentences)):
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
if ref != hyp:
|
93 |
+
# Split sentences into words
|
94 |
+
ref_words = ref.split()
|
95 |
+
hyp_words = hyp.split()
|
96 |
+
|
97 |
+
# Find the first position where the words diverge
|
98 |
+
min_length = min(len(ref_words), len(hyp_words))
|
99 |
misalignment_start = 0
|
100 |
+
for j in range(min_length):
|
101 |
+
if ref_words[j] != hyp_words[j]:
|
102 |
misalignment_start = j
|
103 |
break
|
104 |
+
|
105 |
# Prepare the context for display
|
106 |
+
context_ref = ' '.join(ref_words[:misalignment_start] + ['**' + ref_words[misalignment_start] + '**'])
|
107 |
+
context_hyp = ' '.join(hyp_words[:misalignment_start] + ['**' + hyp_words[misalignment_start] + '**'])
|
108 |
|
109 |
misaligned.append({
|
110 |
"index": i+1,
|
|
|
115 |
"context_hyp": context_hyp
|
116 |
})
|
117 |
|
118 |
+
# Handle cases where the number of sentences differs
|
119 |
if len(reference_sentences) > len(hypothesis_sentences):
|
120 |
+
for i in range(len(hypothesis_sentences), len(reference_sentences)):
|
121 |
misaligned.append({
|
122 |
"index": i+1,
|
123 |
"reference": reference_sentences[i],
|
|
|
127 |
"context_hyp": "No corresponding sentence"
|
128 |
})
|
129 |
elif len(hypothesis_sentences) > len(reference_sentences):
|
130 |
+
for i in range(len(reference_sentences), len(hypothesis_sentences)):
|
131 |
misaligned.append({
|
132 |
"index": i+1,
|
133 |
"reference": "No corresponding sentence",
|
|
|
160 |
md += "\n### Misaligned Sentences\n\n"
|
161 |
for misaligned in misaligned_sentences:
|
162 |
md += f"#### Sentence {misaligned['index']}\n"
|
163 |
+
md += f"* Reference: {misaligned['context_ref']}\n"
|
164 |
+
md += f"* Hypothesis: {misaligned['context_hyp']}\n"
|
165 |
+
md += f"* Misalignment starts at word: {misaligned['misalignment_start'] + 1}\n\n"
|
166 |
else:
|
167 |
md += "\n### Misaligned Sentences\n\n"
|
168 |
md += "* No misaligned sentences found."
|