Spaces commit: fix tok
app.py (CHANGED)
```diff
@@ -1,43 +1,19 @@
 import spaces
 import jiwer
 import numpy as np
+import re
 import gradio as gr
-import nltk
-from nltk.tokenize import sent_tokenize
-import os
-import requests
 
-
-
-
-
-
-
-
-
-
-
-url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip"
-response = requests.get(url)
-
-# Create the NLTK data directory if it doesn't exist
-nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
-if not os.path.exists(nltk_data_dir):
-    os.makedirs(nltk_data_dir)
-
-# Save the tokenizer to the NLTK data directory
-tokenizer_path = os.path.join(nltk_data_dir, "tokenizers", "punkt")
-if not os.path.exists(tokenizer_path):
-    os.makedirs(tokenizer_path)
-
-# Extract and save the punkt tokenizer
-with open(os.path.join(tokenizer_path, "punkt.zip"), "wb") as f:
-    f.write(response.content)
-
-print("NLTK punkt tokenizer downloaded successfully")
-
-# Call the function to setup the tokenizer
-setup_nltk_tokenizer()
+def split_into_sentences(text):
+    """
+    Simple sentence tokenizer using regular expressions.
+    Splits text into sentences based on punctuation.
+    """
+    # Split text into sentences using regex
+    sentences = re.split(r'(?<=[.!?])\s*', text)
+    # Clean up empty strings and whitespace
+    sentences = [s.strip() for s in sentences if s.strip()]
+    return sentences
 
 @spaces.GPU()
 def calculate_wer(reference, hypothesis):
```
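The core of the fix: the punkt download block is gone and sentence splitting is now a pure-regex helper, so the Space no longer needs network access or an NLTK data directory at startup. A minimal standalone check of the new splitter (the sample inputs are mine, not from the Space):

```python
import re

def split_into_sentences(text):
    """Regex sentence splitter, as introduced in this commit."""
    sentences = re.split(r'(?<=[.!?])\s*', text)
    return [s.strip() for s in sentences if s.strip()]

print(split_into_sentences("Hello world. How are you? Fine!"))
# -> ['Hello world.', 'How are you?', 'Fine!']

# Unlike NLTK punkt, the lookbehind splits after every '.', so
# abbreviations become sentence boundaries:
print(split_into_sentences("Dr. Smith arrived."))
# -> ['Dr.', 'Smith arrived.']
```

For WER-style inputs that trade-off is usually acceptable, and it removes the runtime download the old code performed on every cold start.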
```diff
@@ -61,15 +37,11 @@ def calculate_sentence_wer(reference, hypothesis):
     Calculate WER for each sentence and overall statistics.
     """
     try:
-        reference_sentences = sent_tokenize(reference)
-        hypothesis_sentences = sent_tokenize(hypothesis)
+        reference_sentences = split_into_sentences(reference)
+        hypothesis_sentences = split_into_sentences(hypothesis)
 
-
-        min_sentences = min(len(reference_sentences), len(hypothesis_sentences))
-
-        # Trim to the same number of sentences
-        reference_sentences = reference_sentences[:min_sentences]
-        hypothesis_sentences = hypothesis_sentences[:min_sentences]
+        if len(reference_sentences) != len(hypothesis_sentences):
+            raise ValueError("Reference and hypothesis must contain the same number of sentences")
 
         sentence_wers = []
         for ref, hyp in zip(reference_sentences, hypothesis_sentences):
```
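The old trim-to-the-shorter-text behavior is replaced by a hard length check, so mismatched sentence counts now raise instead of being silently truncated. The loop that follows presumably scores each aligned pair; a small sketch of that per-sentence scoring, assuming the loop body calls jiwer.wer:

```python
import jiwer

ref_sents = ["the cat sat on the mat.", "it was warm."]
hyp_sents = ["the cat sat on a mat.", "it was cold."]

# One WER per aligned sentence pair, as calculate_sentence_wer collects them
sentence_wers = [jiwer.wer(r, h) for r, h in zip(ref_sents, hyp_sents)]
print([round(w, 3) for w in sentence_wers])  # [0.167, 0.333]
```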
```diff
@@ -80,34 +52,19 @@ def calculate_sentence_wer(reference, hypothesis):
             return {
                 "sentence_wers": [],
                 "average_wer": 0.0,
-                "std_dev": 0.0,
-                "warning": "No sentences to compare"
+                "std_dev": 0.0
             }
 
         average_wer = np.mean(sentence_wers)
         std_dev = np.std(sentence_wers)
 
-        # Check if there were extra sentences
-        if len(reference_sentences) != len(hypothesis_sentences):
-            warning = f"Reference has {len(reference_sentences)} sentences, " \
-                      f"hypothesis has {len(hypothesis_sentences)} sentences. " \
-                      f"Only compared the first {min_sentences} sentences."
-        else:
-            warning = None
-
         return {
             "sentence_wers": sentence_wers,
             "average_wer": average_wer,
-            "std_dev": std_dev,
-            "warning": warning
+            "std_dev": std_dev
         }
     except Exception as e:
-        return {
-            "sentence_wers": [],
-            "average_wer": 0.0,
-            "std_dev": 0.0,
-            "error": str(e)
-        }
+        raise e
 
 @spaces.GPU()
 def process_files(reference_file, hypothesis_file):
```
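Error handling inside calculate_sentence_wer is reduced to a re-raise, so failures now propagate to process_files instead of being folded into the result dict (a bare `raise` would preserve the traceback slightly more idiomatically than `raise e`). The kept summary statistics are plain NumPy aggregates; worth noting when reading the numbers that np.std defaults to the population standard deviation:

```python
import numpy as np

sentence_wers = [0.0, 0.25, 0.5]
print(np.mean(sentence_wers))         # 0.25
print(np.std(sentence_wers))          # ~0.204 (population std, ddof=0)
print(np.std(sentence_wers, ddof=1))  # 0.25 (sample estimate, for contrast)
```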
```diff
@@ -118,11 +75,6 @@ def process_files(reference_file, hypothesis_file):
         with open(hypothesis_file.name, 'r') as f:
             hypothesis_text = f.read()
 
-        if not reference_text or not hypothesis_text:
-            return {
-                "error": "Both reference and hypothesis files must contain text"
-            }
-
         wer_value = calculate_wer(reference_text, hypothesis_text)
         cer_value = calculate_cer(reference_text, hypothesis_text)
         sentence_wer_stats = calculate_sentence_wer(reference_text, hypothesis_text)
```
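With the explicit empty-text guard removed, empty uploads are left to the library to reject: if I read jiwer correctly, an empty reference makes jiwer.wer raise ValueError, which the except branch in the next hunk turns into an error result. A quick probe of that assumption:

```python
import jiwer

try:
    jiwer.wer("", "some hypothesis")  # empty reference, as from an empty file
except ValueError as e:
    print("error:", e)  # becomes {"error": str(e)} in process_files below
```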
```diff
@@ -132,39 +84,21 @@ def process_files(reference_file, hypothesis_file):
             "CER": cer_value,
             "Sentence WERs": sentence_wer_stats["sentence_wers"],
             "Average WER": sentence_wer_stats["average_wer"],
-            "Standard Deviation": sentence_wer_stats["std_dev"],
-            "Warning": sentence_wer_stats.get("warning"),
-            "Error": sentence_wer_stats.get("error")
+            "Standard Deviation": sentence_wer_stats["std_dev"]
         }
     except Exception as e:
-        return {
-            "WER": 0.0,
-            "CER": 0.0,
-            "Sentence WERs": [],
-            "Average WER": 0.0,
-            "Standard Deviation": 0.0,
-            "Error": str(e)
-        }
-
-def format_sentence_wer_stats(sentence_wers, average_wer, std_dev, warning, error):
-    md = ""
-
-    if error:
-        md += f"### Error\n{error}\n\n"
-    elif warning:
-        md += f"### Warning\n{warning}\n\n"
+        return {"error": str(e)}
 
+def format_sentence_wer_stats(sentence_wers, average_wer, std_dev):
     if not sentence_wers:
-
-        return md
+        return "All sentences match perfectly!"
 
-    md += "### Sentence-level WER Analysis\n\n"
+    md = "### Sentence-level WER Analysis\n\n"
     md += f"* Average WER: {average_wer:.2f}\n"
     md += f"* Standard Deviation: {std_dev:.2f}\n\n"
    md += "### WER for Each Sentence\n\n"
     for i, wer in enumerate(sentence_wers):
         md += f"* Sentence {i+1}: {wer:.2f}\n"
-
     return md
 
 def main():
```
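The formatter loses its warning/error parameters and builds the markdown in one pass. For reference, what the new version renders for a couple of scores (the values are illustrative):

```python
def format_sentence_wer_stats(sentence_wers, average_wer, std_dev):
    # Same body as the new version in this commit
    if not sentence_wers:
        return "All sentences match perfectly!"
    md = "### Sentence-level WER Analysis\n\n"
    md += f"* Average WER: {average_wer:.2f}\n"
    md += f"* Standard Deviation: {std_dev:.2f}\n\n"
    md += "### WER for Each Sentence\n\n"
    for i, wer in enumerate(sentence_wers):
        md += f"* Sentence {i+1}: {wer:.2f}\n"
    return md

print(format_sentence_wer_stats([0.17, 0.33], 0.25, 0.08))
```

One caveat a reviewer might raise: the empty-list branch fires whenever there are no sentence scores at all, which means "nothing to compare", not necessarily "perfect match"; the old code reported that case as a warning.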
```diff
@@ -211,24 +145,18 @@ def main():
 
     def process_and_display(ref_file, hyp_file):
         result = process_files(ref_file, hyp_file)
+        if "error" in result:
+            return {}, {}, "Error: " + result["error"]
 
         metrics = {
             "WER": result["WER"],
             "CER": result["CER"]
         }
 
-        error = result.get("Error")
-        warning = result.get("Warning")
-        sentence_wers = result.get("Sentence WERs", [])
-        average_wer = result.get("Average WER", 0.0)
-        std_dev = result.get("Standard Deviation", 0.0)
-
         wer_stats_md = format_sentence_wer_stats(
-            sentence_wers,
-            average_wer,
-            std_dev,
-            warning,
-            error
+            result["Sentence WERs"],
+            result["Average WER"],
+            result["Standard Deviation"]
         )
 
         return metrics, wer_stats_md
```
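One inconsistency worth flagging in the new wiring: the error branch returns three values while the success branch returns two, so the Gradio outputs this callback feeds can only match one of the two shapes. A hedged sketch of a consistent version, reusing the app's own functions (my suggestion, not part of the commit):

```python
def process_and_display(ref_file, hyp_file):
    result = process_files(ref_file, hyp_file)
    if "error" in result:
        # Mirror the success branch's shape: (metrics dict, markdown string)
        return {}, "Error: " + result["error"]

    metrics = {"WER": result["WER"], "CER": result["CER"]}
    wer_stats_md = format_sentence_wer_stats(
        result["Sentence WERs"],
        result["Average WER"],
        result["Standard Deviation"],
    )
    return metrics, wer_stats_md
```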