akki2825 committed
Commit 4e09a41 · verified · 1 parent: 61ecb99
Files changed (1):
  1. app.py +28 -100

app.py CHANGED
@@ -1,43 +1,19 @@
 import spaces
 import jiwer
 import numpy as np
+import re
 import gradio as gr
-import nltk
-from nltk.tokenize import sent_tokenize
-import os
-import requests
 
-# Function to download and setup NLTK punkt tokenizer
-def setup_nltk_tokenizer():
-    try:
-        # Check if punkt is already downloaded
-        nltk.data.find('tokenizers/punkt')
-        print("NLTK punkt tokenizer already downloaded")
-        return
-    except LookupError:
-        print("Downloading NLTK punkt tokenizer...")
-        # Manually download the punkt tokenizer
-        url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip"
-        response = requests.get(url)
-
-        # Create the NLTK data directory if it doesn't exist
-        nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
-        if not os.path.exists(nltk_data_dir):
-            os.makedirs(nltk_data_dir)
-
-        # Save the tokenizer to the NLTK data directory
-        tokenizer_path = os.path.join(nltk_data_dir, "tokenizers", "punkt")
-        if not os.path.exists(tokenizer_path):
-            os.makedirs(tokenizer_path)
-
-        # Extract and save the punkt tokenizer
-        with open(os.path.join(tokenizer_path, "punkt.zip"), "wb") as f:
-            f.write(response.content)
-
-        print("NLTK punkt tokenizer downloaded successfully")
-
-# Call the function to setup the tokenizer
-setup_nltk_tokenizer()
+def split_into_sentences(text):
+    """
+    Simple sentence tokenizer using regular expressions.
+    Splits text into sentences based on punctuation.
+    """
+    # Split text into sentences using regex
+    sentences = re.split(r'(?<=[.!?])\s*', text)
+    # Clean up empty strings and whitespace
+    sentences = [s.strip() for s in sentences if s.strip()]
+    return sentences
 
 @spaces.GPU()
 def calculate_wer(reference, hypothesis):
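
The punkt download shim above is replaced wholesale by a lookbehind split. A quick standalone check of the new tokenizer's behavior (the sample strings are made up); unlike punkt, it has no abbreviation model:

import re

def split_into_sentences(text):
    sentences = re.split(r'(?<=[.!?])\s*', text)
    return [s.strip() for s in sentences if s.strip()]

print(split_into_sentences("Hello there. How are you? Great!"))
# -> ['Hello there.', 'How are you?', 'Great!']

# No abbreviation handling: the period in "Dr." also ends a "sentence".
print(split_into_sentences("Dr. Smith spoke. Everyone listened."))
# -> ['Dr.', 'Smith spoke.', 'Everyone listened.']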
@@ -61,15 +37,11 @@ def calculate_sentence_wer(reference, hypothesis):
     Calculate WER for each sentence and overall statistics.
     """
     try:
-        reference_sentences = sent_tokenize(reference)
-        hypothesis_sentences = sent_tokenize(hypothesis)
+        reference_sentences = split_into_sentences(reference)
+        hypothesis_sentences = split_into_sentences(hypothesis)
 
-        # Get minimum number of sentences
-        min_sentences = min(len(reference_sentences), len(hypothesis_sentences))
-
-        # Trim to the same number of sentences
-        reference_sentences = reference_sentences[:min_sentences]
-        hypothesis_sentences = hypothesis_sentences[:min_sentences]
+        if len(reference_sentences) != len(hypothesis_sentences):
+            raise ValueError("Reference and hypothesis must contain the same number of sentences")
 
         sentence_wers = []
         for ref, hyp in zip(reference_sentences, hypothesis_sentences):
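
The loop above scores sentence pairs positionally, which is why the strict length check matters: a missing sentence would shift every later pair. A minimal sketch of what each iteration computes, calling jiwer directly (the sample transcripts are made up):

import jiwer
import numpy as np

refs = ["the cat sat on the mat.", "the dog ran away."]
hyps = ["the cat sat on a mat.", "the dog ran away."]

# One WER per aligned sentence pair, as in the loop above.
sentence_wers = [jiwer.wer(r, h) for r, h in zip(refs, hyps)]
print(sentence_wers)           # [0.1666..., 0.0]: one substitution out of six words
print(np.mean(sentence_wers))  # average across sentences
print(np.std(sentence_wers))   # spread across sentences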
@@ -80,34 +52,19 @@ def calculate_sentence_wer(reference, hypothesis):
             return {
                 "sentence_wers": [],
                 "average_wer": 0.0,
-                "std_dev": 0.0,
-                "warning": "No sentences to compare"
+                "std_dev": 0.0
             }
 
         average_wer = np.mean(sentence_wers)
         std_dev = np.std(sentence_wers)
 
-        # Check if there were extra sentences
-        if len(reference_sentences) != len(hypothesis_sentences):
-            warning = f"Reference has {len(reference_sentences)} sentences, " \
-                      f"hypothesis has {len(hypothesis_sentences)} sentences. " \
-                      f"Only compared the first {min_sentences} sentences."
-        else:
-            warning = None
-
         return {
             "sentence_wers": sentence_wers,
             "average_wer": average_wer,
-            "std_dev": std_dev,
-            "warning": warning
+            "std_dev": std_dev
         }
     except Exception as e:
-        return {
-            "sentence_wers": [],
-            "average_wer": 0.0,
-            "std_dev": 0.0,
-            "error": str(e)
-        }
+        raise e
 
 @spaces.GPU()
 def process_files(reference_file, hypothesis_file):
@@ -118,11 +75,6 @@ def process_files(reference_file, hypothesis_file):
         with open(hypothesis_file.name, 'r') as f:
             hypothesis_text = f.read()
 
-        if not reference_text or not hypothesis_text:
-            return {
-                "error": "Both reference and hypothesis files must contain text"
-            }
-
         wer_value = calculate_wer(reference_text, hypothesis_text)
         cer_value = calculate_cer(reference_text, hypothesis_text)
         sentence_wer_stats = calculate_sentence_wer(reference_text, hypothesis_text)
@@ -132,39 +84,21 @@ def process_files(reference_file, hypothesis_file):
             "CER": cer_value,
             "Sentence WERs": sentence_wer_stats["sentence_wers"],
             "Average WER": sentence_wer_stats["average_wer"],
-            "Standard Deviation": sentence_wer_stats["std_dev"],
-            "Warning": sentence_wer_stats.get("warning"),
-            "Error": sentence_wer_stats.get("error")
+            "Standard Deviation": sentence_wer_stats["std_dev"]
         }
     except Exception as e:
-        return {
-            "WER": 0.0,
-            "CER": 0.0,
-            "Sentence WERs": [],
-            "Average WER": 0.0,
-            "Standard Deviation": 0.0,
-            "Error": str(e)
-        }
-
-def format_sentence_wer_stats(sentence_wers, average_wer, std_dev, warning, error):
-    md = ""
-
-    if error:
-        md += f"### Error\n{error}\n\n"
-    elif warning:
-        md += f"### Warning\n{warning}\n\n"
+        return {"error": str(e)}
 
+def format_sentence_wer_stats(sentence_wers, average_wer, std_dev):
     if not sentence_wers:
-        md += "No sentences to compare"
-        return md
+        return "All sentences match perfectly!"
 
-    md += "### Sentence-level WER Analysis\n\n"
+    md = "### Sentence-level WER Analysis\n\n"
     md += f"* Average WER: {average_wer:.2f}\n"
    md += f"* Standard Deviation: {std_dev:.2f}\n\n"
     md += "### WER for Each Sentence\n\n"
     for i, wer in enumerate(sentence_wers):
         md += f"* Sentence {i+1}: {wer:.2f}\n"
-
     return md
 
 def main():
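
For reference, the simplified formatter renders Markdown like the following (the WER values here are made up):

print(format_sentence_wer_stats([0.0, 0.33], 0.17, 0.17))
# ### Sentence-level WER Analysis
#
# * Average WER: 0.17
# * Standard Deviation: 0.17
#
# ### WER for Each Sentence
#
# * Sentence 1: 0.00
# * Sentence 2: 0.33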
@@ -211,24 +145,18 @@ def main():
 
     def process_and_display(ref_file, hyp_file):
         result = process_files(ref_file, hyp_file)
+        if "error" in result:
+            return {}, "Error: " + result["error"]
 
         metrics = {
             "WER": result["WER"],
             "CER": result["CER"]
         }
 
-        error = result.get("Error")
-        warning = result.get("Warning")
-        sentence_wers = result.get("Sentence WERs", [])
-        average_wer = result.get("Average WER", 0.0)
-        std_dev = result.get("Standard Deviation", 0.0)
-
         wer_stats_md = format_sentence_wer_stats(
-            sentence_wers,
-            average_wer,
-            std_dev,
-            warning,
-            error
+            result["Sentence WERs"],
+            result["Average WER"],
+            result["Standard Deviation"]
         )
 
         return metrics, wer_stats_md
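
End to end, a sentence-count mismatch now surfaces as a visible error instead of a silent trim: calculate_sentence_wer raises ValueError, process_files catches it and returns {"error": ...}, and process_and_display prefixes it with "Error: " (the error branch is kept at two return values to match the success path's two outputs). A hypothetical check outside Spaces, where tempfile stands in for Gradio's uploaded-file objects, which expose .name:

import tempfile

def write_tmp(text):
    f = tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False)
    f.write(text)
    f.flush()
    return f  # exposes the .name attribute process_files reads from

ref = write_tmp("The cat sat. The dog ran.")   # two sentences
hyp = write_tmp("The cat sat.")                # one sentence
print(process_files(ref, hyp))
# -> {'error': 'Reference and hypothesis must contain the same number of sentences'}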
 