akki2825 committed on
Commit
8c82d13
·
verified ·
1 Parent(s): 047167d

update splitting fn

Browse files
Files changed (1) hide show
  1. app.py +4 -7
app.py CHANGED
@@ -9,7 +9,7 @@ def split_into_sentences(text):
9
  Simple sentence tokenizer using regular expressions.
10
  Splits text into sentences based on punctuation.
11
  """
12
- sentences = re.split(r'(?<=[.!?])\s*', text)
13
  sentences = [s.strip() for s in sentences if s.strip()]
14
  return sentences
15
 
@@ -78,7 +78,6 @@ def calculate_sentence_metrics(reference, hypothesis):
78
  except Exception as e:
79
  raise e
80
 
81
-
82
  def identify_misaligned_sentences(reference_text, hypothesis_text):
83
  """
84
  Identify sentences that don't match between reference and hypothesis.
@@ -88,7 +87,6 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
88
  hypothesis_sentences = split_into_sentences(hypothesis_text)
89
 
90
  misaligned = []
91
-
92
  for i, (ref, hyp) in enumerate(zip(reference_sentences, hypothesis_sentences)):
93
  if ref != hyp:
94
  # Split sentences into words
@@ -102,9 +100,6 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
102
  if ref_words[j] != hyp_words[j]:
103
  misalignment_start = j
104
  break
105
- # Check if one sentence is longer than the other
106
- if len(ref_words) != len(hyp_words):
107
- misalignment_start = min_length
108
 
109
  # Prepare the context for display
110
  context_ref = ' '.join(ref_words[:misalignment_start] + ['**' + ref_words[misalignment_start] + '**'])
@@ -118,7 +113,8 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
118
  "context_ref": context_ref,
119
  "context_hyp": context_hyp
120
  })
121
-
 
122
  if len(reference_sentences) > len(hypothesis_sentences):
123
  for i in range(len(hypothesis_sentences), len(reference_sentences)):
124
  misaligned.append({
@@ -142,6 +138,7 @@ def identify_misaligned_sentences(reference_text, hypothesis_text):
142
 
143
  return misaligned
144
 
 
145
  def format_sentence_metrics(sentence_wers, sentence_cers, average_wer, average_cer, std_dev_wer, std_dev_cer, misaligned_sentences):
146
  md = "### Sentence-level Metrics\n\n"
147
  md += "#### Word Error Rate (WER)\n"
 
9
  Simple sentence tokenizer using regular expressions.
10
  Splits text into sentences based on punctuation.
11
  """
12
+ sentences = text.split()
13
  sentences = [s.strip() for s in sentences if s.strip()]
14
  return sentences
15
 
 
78
  except Exception as e:
79
  raise e
80
 
 
81
  def identify_misaligned_sentences(reference_text, hypothesis_text):
82
  """
83
  Identify sentences that don't match between reference and hypothesis.
 
87
  hypothesis_sentences = split_into_sentences(hypothesis_text)
88
 
89
  misaligned = []
 
90
  for i, (ref, hyp) in enumerate(zip(reference_sentences, hypothesis_sentences)):
91
  if ref != hyp:
92
  # Split sentences into words
 
100
  if ref_words[j] != hyp_words[j]:
101
  misalignment_start = j
102
  break
 
 
 
103
 
104
  # Prepare the context for display
105
  context_ref = ' '.join(ref_words[:misalignment_start] + ['**' + ref_words[misalignment_start] + '**'])
 
113
  "context_ref": context_ref,
114
  "context_hyp": context_hyp
115
  })
116
+
117
+ # Handle cases where the number of sentences differs
118
  if len(reference_sentences) > len(hypothesis_sentences):
119
  for i in range(len(hypothesis_sentences), len(reference_sentences)):
120
  misaligned.append({
 
138
 
139
  return misaligned
140
 
141
+
142
  def format_sentence_metrics(sentence_wers, sentence_cers, average_wer, average_cer, std_dev_wer, std_dev_cer, misaligned_sentences):
143
  md = "### Sentence-level Metrics\n\n"
144
  md += "#### Word Error Rate (WER)\n"