xujinheng666 committed on
Commit
c228696
·
verified ·
1 Parent(s): 3d5104e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -30
app.py CHANGED
@@ -1,9 +1,8 @@
1
- import os
2
  import torch
3
  import torchaudio
4
  import numpy as np
5
  import re
6
- import streamlit as st
7
  from difflib import SequenceMatcher
8
  from transformers import pipeline
9
 
@@ -19,17 +18,18 @@ pipe = pipeline(
19
  chunk_length_s=30,
20
  device=device,
21
  generate_kwargs={
22
- "no_repeat_ngram_size": 3,
23
- "repetition_penalty": 1.3,
24
- "temperature": 0.7,
25
- "top_p": 0.9,
26
- "top_k": 50
 
 
27
  }
28
  )
29
  pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=language, task="transcribe")
30
 
31
- # Load quality rating model
32
- rating_pipe = pipeline("text-classification", model="Leo0129/CustomModel-multilingual-sentiment-analysis", device=device)
33
 
34
  def is_similar(a, b, threshold=0.8):
35
  return SequenceMatcher(None, a, b).ratio() > threshold
@@ -47,57 +47,41 @@ def remove_punctuation(text):
47
 
48
  def transcribe_audio(audio_path):
49
  waveform, sample_rate = torchaudio.load(audio_path)
50
-
51
  if waveform.shape[0] > 1:
52
  waveform = torch.mean(waveform, dim=0, keepdim=True)
53
-
54
  waveform = waveform.squeeze(0).numpy()
55
  duration = waveform.shape[0] / sample_rate
56
-
57
  if duration > 60:
58
  chunk_size = sample_rate * 55
59
  step_size = sample_rate * 50
60
  results = []
61
-
62
  for start in range(0, waveform.shape[0], step_size):
63
  chunk = waveform[start:start + chunk_size]
64
  if chunk.shape[0] == 0:
65
  break
66
  transcript = pipe({"sampling_rate": sample_rate, "raw": chunk})["text"]
67
  results.append(remove_punctuation(transcript))
68
-
69
  return remove_punctuation(remove_repeated_phrases(" ".join(results)))
70
-
71
  return remove_punctuation(remove_repeated_phrases(pipe({"sampling_rate": sample_rate, "raw": waveform})["text"]))
72
 
73
  def rate_quality(text):
74
  chunks = [text[i:i+512] for i in range(0, len(text), 512)]
75
  results = rating_pipe(chunks, batch_size=4)
76
-
77
  label_map = {"Very Negative": "Very Poor", "Negative": "Poor", "Neutral": "Neutral", "Positive": "Good", "Very Positive": "Very Good"}
78
  processed_results = [label_map.get(res["label"], "Unknown") for res in results]
79
-
80
  return max(set(processed_results), key=processed_results.count)
81
 
82
  # Streamlit UI
83
- st.title("Audio Transcription and Quality Rating")
84
-
85
  uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
86
-
87
- if uploaded_file is not None:
88
- st.audio(uploaded_file, format="audio/wav")
89
-
90
- temp_audio_path = "temp_audio.wav"
91
- with open(temp_audio_path, "wb") as f:
92
  f.write(uploaded_file.read())
93
-
94
  st.write("Processing audio...")
95
- transcript = transcribe_audio(temp_audio_path)
96
  st.subheader("Transcript")
97
  st.write(transcript)
98
-
99
  quality_rating = rate_quality(transcript)
100
  st.subheader("Quality Rating")
101
  st.write(quality_rating)
102
-
103
- os.remove(temp_audio_path)
 
1
+ import streamlit as st
2
  import torch
3
  import torchaudio
4
  import numpy as np
5
  import re
 
6
  from difflib import SequenceMatcher
7
  from transformers import pipeline
8
 
 
18
  chunk_length_s=30,
19
  device=device,
20
  generate_kwargs={
21
+ "no_repeat_ngram_size": 4,
22
+ "repetition_penalty": 1.15,
23
+ "temperature": 0.5,
24
+ "top_p": 0.97,
25
+ "top_k": 40,
26
+ "max_new_tokens": 300,
27
+ "do_sample": True
28
  }
29
  )
30
  pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=language, task="transcribe")
31
 
32
+ rating_pipe = pipeline("text-classification", model="MonkeyDLLLLLLuffy/CustomModel-multilingual-sentiment-analysis", device=device)
 
33
 
34
  def is_similar(a, b, threshold=0.8):
35
  return SequenceMatcher(None, a, b).ratio() > threshold
 
47
 
48
  def transcribe_audio(audio_path):
49
  waveform, sample_rate = torchaudio.load(audio_path)
 
50
  if waveform.shape[0] > 1:
51
  waveform = torch.mean(waveform, dim=0, keepdim=True)
 
52
  waveform = waveform.squeeze(0).numpy()
53
  duration = waveform.shape[0] / sample_rate
 
54
  if duration > 60:
55
  chunk_size = sample_rate * 55
56
  step_size = sample_rate * 50
57
  results = []
 
58
  for start in range(0, waveform.shape[0], step_size):
59
  chunk = waveform[start:start + chunk_size]
60
  if chunk.shape[0] == 0:
61
  break
62
  transcript = pipe({"sampling_rate": sample_rate, "raw": chunk})["text"]
63
  results.append(remove_punctuation(transcript))
 
64
  return remove_punctuation(remove_repeated_phrases(" ".join(results)))
 
65
  return remove_punctuation(remove_repeated_phrases(pipe({"sampling_rate": sample_rate, "raw": waveform})["text"]))
66
 
67
  def rate_quality(text):
68
  chunks = [text[i:i+512] for i in range(0, len(text), 512)]
69
  results = rating_pipe(chunks, batch_size=4)
 
70
  label_map = {"Very Negative": "Very Poor", "Negative": "Poor", "Neutral": "Neutral", "Positive": "Good", "Very Positive": "Very Good"}
71
  processed_results = [label_map.get(res["label"], "Unknown") for res in results]
 
72
  return max(set(processed_results), key=processed_results.count)
73
 
74
  # Streamlit UI
75
+ st.title("Audio Transcription & Quality Rating")
 
76
  uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
77
+ if uploaded_file:
78
+ st.audio(uploaded_file, format='audio/wav')
79
+ with open("temp_audio.wav", "wb") as f:
 
 
 
80
  f.write(uploaded_file.read())
 
81
  st.write("Processing audio...")
82
+ transcript = transcribe_audio("temp_audio.wav")
83
  st.subheader("Transcript")
84
  st.write(transcript)
 
85
  quality_rating = rate_quality(transcript)
86
  st.subheader("Quality Rating")
87
  st.write(quality_rating)