akki2825 committed on
Commit
dd10290
·
verified ·
1 Parent(s): ea825ae

misalignment at the word level

Browse files
Files changed (1) hide show
  1. app.py +18 -20
app.py CHANGED
@@ -82,31 +82,29 @@ def calculate_sentence_metrics(reference, hypothesis):
82
  def identify_misaligned_sentences(reference_text, hypothesis_text):
83
  """
84
  Identify sentences that don't match between reference and hypothesis.
85
- Handles cases where the number of sentences differ.
86
  Returns a dictionary with misaligned sentence pairs, their indices, and misalignment details.
87
  """
88
  reference_sentences = split_into_sentences(reference_text)
89
  hypothesis_sentences = split_into_sentences(hypothesis_text)
90
 
91
  misaligned = []
92
- min_length = min(len(reference_sentences), len(hypothesis_sentences))
93
-
94
- # Compare sentences up to the minimum length
95
- for i in range(min_length):
96
- ref = reference_sentences[i]
97
- hyp = hypothesis_sentences[i]
98
-
99
  if ref != hyp:
100
- # Find the first position where the sentences diverge
101
- min_len = min(len(ref), len(hyp))
 
 
 
 
102
  misalignment_start = 0
103
- for j in range(min_len):
104
- if ref[j] != hyp[j]:
105
  misalignment_start = j
106
  break
 
107
  # Prepare the context for display
108
- context_ref = ref[:misalignment_start] + f"**{ref[misalignment_start:]}**"
109
- context_hyp = hyp[:misalignment_start] + f"**{hyp[misalignment_start:]}**"
110
 
111
  misaligned.append({
112
  "index": i+1,
@@ -117,9 +115,9 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
117
  "context_hyp": context_hyp
118
  })
119
 
120
- # Note any extra sentences as misaligned
121
  if len(reference_sentences) > len(hypothesis_sentences):
122
- for i in range(min_length, len(reference_sentences)):
123
  misaligned.append({
124
  "index": i+1,
125
  "reference": reference_sentences[i],
@@ -129,7 +127,7 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
129
  "context_hyp": "No corresponding sentence"
130
  })
131
  elif len(hypothesis_sentences) > len(reference_sentences):
132
- for i in range(min_length, len(hypothesis_sentences)):
133
  misaligned.append({
134
  "index": i+1,
135
  "reference": "No corresponding sentence",
@@ -162,9 +160,9 @@ def format_sentence_metrics(sentence_wers, sentence_cers, average_wer, average_c
162
  md += "\n### Misaligned Sentences\n\n"
163
  for misaligned in misaligned_sentences:
164
  md += f"#### Sentence {misaligned['index']}\n"
165
- md += f"* Reference: {misaligned['reference']}\n"
166
- md += f"* Hypothesis: {misaligned['hypothesis']}\n"
167
- md += f"* Misalignment starts at position: {misaligned['misalignment_start']}\n\n"
168
  else:
169
  md += "\n### Misaligned Sentences\n\n"
170
  md += "* No misaligned sentences found."
 
82
  def identify_misaligned_sentences(reference_text, hypothesis_text):
83
  """
84
  Identify sentences that don't match between reference and hypothesis.
 
85
  Returns a dictionary with misaligned sentence pairs, their indices, and misalignment details.
86
  """
87
  reference_sentences = split_into_sentences(reference_text)
88
  hypothesis_sentences = split_into_sentences(hypothesis_text)
89
 
90
  misaligned = []
91
+ for i, (ref, hyp) in enumerate(zip(reference_sentences, hypothesis_sentences)):
 
 
 
 
 
 
92
  if ref != hyp:
93
+ # Split sentences into words
94
+ ref_words = ref.split()
95
+ hyp_words = hyp.split()
96
+
97
+ # Find the first position where the words diverge
98
+ min_length = min(len(ref_words), len(hyp_words))
99
  misalignment_start = 0
100
+ for j in range(min_length):
101
+ if ref_words[j] != hyp_words[j]:
102
  misalignment_start = j
103
  break
104
+
105
  # Prepare the context for display
106
+ context_ref = ' '.join(ref_words[:misalignment_start] + ['**' + ref_words[misalignment_start] + '**'])
107
+ context_hyp = ' '.join(hyp_words[:misalignment_start] + ['**' + hyp_words[misalignment_start] + '**'])
108
 
109
  misaligned.append({
110
  "index": i+1,
 
115
  "context_hyp": context_hyp
116
  })
117
 
118
+ # Handle cases where the number of sentences differs
119
  if len(reference_sentences) > len(hypothesis_sentences):
120
+ for i in range(len(hypothesis_sentences), len(reference_sentences)):
121
  misaligned.append({
122
  "index": i+1,
123
  "reference": reference_sentences[i],
 
127
  "context_hyp": "No corresponding sentence"
128
  })
129
  elif len(hypothesis_sentences) > len(reference_sentences):
130
+ for i in range(len(reference_sentences), len(hypothesis_sentences)):
131
  misaligned.append({
132
  "index": i+1,
133
  "reference": "No corresponding sentence",
 
160
  md += "\n### Misaligned Sentences\n\n"
161
  for misaligned in misaligned_sentences:
162
  md += f"#### Sentence {misaligned['index']}\n"
163
+ md += f"* Reference: {misaligned['context_ref']}\n"
164
+ md += f"* Hypothesis: {misaligned['context_hyp']}\n"
165
+ md += f"* Misalignment starts at word: {misaligned['misalignment_start'] + 1}\n\n"
166
  else:
167
  md += "\n### Misaligned Sentences\n\n"
168
  md += "* No misaligned sentences found."