AnalysisWithMSR committed on
Commit
1d133a1
·
verified ·
1 Parent(s): 783f341

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -105
app.py CHANGED
@@ -1,20 +1,33 @@
1
- import gradio as gr
2
- from transformers import pipeline
 
3
  import whisper
4
  from pydub import AudioSegment
5
  import tempfile
6
- import os
7
- import googleapiclient.discovery
8
  from youtube_transcript_api import YouTubeTranscriptApi
 
9
  import openai
 
 
 
 
 
 
 
 
 
10
 
11
- # Load API keys from environment variables (recommended)
12
- YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY")
13
- OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
14
 
 
 
15
 
 
16
  def extract_video_id(url):
17
- """Extracts the video ID from a YouTube URL."""
18
  try:
19
  parsed_url = urlparse(url)
20
  if "youtube.com" in parsed_url.netloc:
@@ -22,22 +35,15 @@ def extract_video_id(url):
22
  return query_params.get('v', [None])[0]
23
  elif "youtu.be" in parsed_url.netloc:
24
  return parsed_url.path.strip("/")
25
- else:
26
- print("Invalid YouTube URL.")
27
- return None
28
  except Exception as e:
29
  print(f"Error parsing URL: {e}")
30
  return None
31
 
32
-
33
  def get_video_duration(video_id):
34
- """Fetches the video duration in minutes (if API key provided)."""
35
- if not YOUTUBE_API_KEY:
36
- print("Missing YouTube API key. Skipping video duration.")
37
- return None
38
-
39
  try:
40
- youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
41
  request = youtube.videos().list(part="contentDetails", id=video_id)
42
  response = request.execute()
43
  if response["items"]:
@@ -47,105 +53,86 @@ def get_video_duration(video_id):
47
  minutes = int(match.group(2)) if match.group(2) else 0
48
  seconds = int(match.group(3)) if match.group(3) else 0
49
  return hours * 60 + minutes + seconds / 60
50
- else:
51
- print("No video details found.")
52
- return None
53
  except Exception as e:
54
- print(f"Error fetching video duration: {e}")
55
  return None
56
 
57
-
58
  def download_and_transcribe_with_whisper(youtube_url):
59
- """Downloads and transcribes audio using Whisper."""
60
  try:
61
  with tempfile.TemporaryDirectory() as temp_dir:
62
  temp_audio_file = os.path.join(temp_dir, "audio.mp3")
63
  ydl_opts = {
64
  'format': 'bestaudio/best',
65
  'outtmpl': temp_audio_file,
66
- 'extractaudio': True,
67
- 'audioquality': 1,
 
 
 
68
  }
69
-
70
- # Download audio using yt-dlp
71
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
72
  ydl.download([youtube_url])
73
 
74
- # Convert to wav for Whisper
75
  audio = AudioSegment.from_file(temp_audio_file)
76
  wav_file = os.path.join(temp_dir, "audio.wav")
77
  audio.export(wav_file, format="wav")
78
 
79
- # Run Whisper transcription
80
  model = whisper.load_model("large")
81
  result = model.transcribe(wav_file)
82
- transcript = result['text']
83
- return transcript
84
-
85
  except Exception as e:
86
- print(f"Error during transcription: {e}")
87
- return None
88
-
89
-
90
- def get_transcript_from_youtube_api(video_id):
91
- """Fetches transcript using YouTube API (if available)."""
92
- if not YOUTUBE_API_KEY:
93
- print("Missing YouTube API key. Skipping YouTube transcript.")
94
  return None
95
 
 
 
96
  try:
97
  transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
98
  for transcript in transcript_list:
99
  if not transcript.is_generated:
100
- segments = transcript.fetch()
101
- return " ".join(segment['text'] for segment in segments)
102
- print("Manual transcript not found.")
 
103
  return None
104
-
105
  except Exception as e:
106
  print(f"Error fetching transcript: {e}")
107
  return None
108
 
109
-
110
  def get_transcript(youtube_url):
111
- """Gets transcript from YouTube API or Whisper if unavailable."""
112
  video_id = extract_video_id(youtube_url)
113
  if not video_id:
114
- print("Invalid or unsupported YouTube URL.")
115
- return None
116
-
117
  video_length = get_video_duration(video_id)
118
  if video_length:
119
- transcript = get_transcript_from_youtube_api(video_id)
120
- if transcript:
121
- return transcript
122
-
123
- print("Using Whisper for transcription.")
124
- return download_and_transcribe_with_whisper(youtube_url)
125
-
126
-
127
- def summarize_text_huggingface(text):
128
- """Summarizes text using a Hugging Face summarization model."""
129
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
130
- max_input_length = 1024
131
- chunk_overlap = 100
132
- text_chunks = [
133
- text[i:i + max_input_length]
134
- for i in range(0, len(text), max_input_length - chunk_overlap)
135
- ]
136
- summaries = [
137
- summarizer(chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
138
- for chunk in text_chunks
139
- ]
140
- return " ".join(summaries)
141
-
142
-
143
- def generate_optimized_content(summarized_transcript):
144
- """Generates optimized content using OpenAI (if API key provided)."""
145
- if not OPENAI_API_KEY:
146
- print("Missing OpenAI API key. Skipping optimized content generation.")
147
  return None
148
 
 
 
149
  prompt = f"""
150
  Analyze the following summarized YouTube video transcript and:
151
  1. Extract the top 10 keywords.
@@ -154,7 +141,7 @@ def generate_optimized_content(summarized_transcript):
154
  4. Generate related tags for the video.
155
 
156
  Summarized Transcript:
157
- {summarized_transcript}
158
 
159
  Provide the results in the following JSON format:
160
  {{
@@ -164,9 +151,7 @@ def generate_optimized_content(summarized_transcript):
164
  "tags": ["tag1", "tag2", ..., "tag10"]
165
  }}
166
  """
167
-
168
  try:
169
- # Use the updated OpenAI API format for chat completions
170
  response = openai.ChatCompletion.create(
171
  model="gpt-3.5-turbo",
172
  messages=[
@@ -174,35 +159,28 @@ def generate_optimized_content(summarized_transcript):
174
  {"role": "user", "content": prompt}
175
  ]
176
  )
177
- # Extract and parse the response
178
- response_content = response['choices'][0]['message']['content']
179
- content = json.loads(response_content)
180
- return content
181
-
182
  except Exception as e:
183
- print(f"Error generating content: {e}")
184
- return None
185
-
186
 
187
- def seo_tool(youtube_url):
188
- """This function takes a YouTube URL as input and performs SEO optimization tasks."""
 
189
  transcript = get_transcript(youtube_url)
190
  if not transcript:
191
- return "Could not fetch the transcript. Please try another video."
192
-
193
- summary = summarize_text_huggingface(transcript)
194
  optimized_content = generate_optimized_content(summary)
195
-
196
- return summary, optimized_content
197
-
198
-
199
- interface = gr.Interface(
200
- fn=seo_tool,
201
- inputs="text",
202
- outputs=["text", "json"],
203
- title="SEO Tool for YouTube Videos",
204
- description="Enter a YouTube URL to get a summary and optimized content suggestions."
205
  )
206
 
207
  if __name__ == "__main__":
208
- interface.launch()
 
1
+ import googleapiclient.discovery
2
+ import re
3
+ import yt_dlp
4
  import whisper
5
  from pydub import AudioSegment
6
  import tempfile
7
+ from transformers import pipeline
 
8
  from youtube_transcript_api import YouTubeTranscriptApi
9
+ import torch
10
  import openai
11
+ import json
12
+ from urllib.parse import urlparse, parse_qs
13
+ import os
14
+ import gradio as gr
15
+
16
+ # API Keys setup
17
+ youtube_api_key = os.getenv("YOUTUBE_API_KEY") # Set these as environment variables
18
+ openai_api_key = os.getenv("OPENAI_API_KEY")
19
+ openai.api_key = openai_api_key
20
 
21
+ # Validation for missing API keys
22
+ if not youtube_api_key:
23
+ raise ValueError("YOUTUBE_API_KEY is not set. Please set it as an environment variable.")
24
 
25
+ if not openai_api_key:
26
+ raise ValueError("OPENAI_API_KEY is not set. Please set it as an environment variable.")
27
 
28
+ # Utility Functions
29
  def extract_video_id(url):
30
+ """Extract the video ID from a YouTube URL."""
31
  try:
32
  parsed_url = urlparse(url)
33
  if "youtube.com" in parsed_url.netloc:
 
35
  return query_params.get('v', [None])[0]
36
  elif "youtu.be" in parsed_url.netloc:
37
  return parsed_url.path.strip("/")
38
+ return None
 
 
39
  except Exception as e:
40
  print(f"Error parsing URL: {e}")
41
  return None
42
 
 
43
  def get_video_duration(video_id):
44
+ """Fetch the video duration."""
 
 
 
 
45
  try:
46
+ youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=youtube_api_key)
47
  request = youtube.videos().list(part="contentDetails", id=video_id)
48
  response = request.execute()
49
  if response["items"]:
 
53
  minutes = int(match.group(2)) if match.group(2) else 0
54
  seconds = int(match.group(3)) if match.group(3) else 0
55
  return hours * 60 + minutes + seconds / 60
56
+ return None
 
 
57
  except Exception as e:
58
+ print(f"Error fetching duration: {e}")
59
  return None
60
 
 
61
  def download_and_transcribe_with_whisper(youtube_url):
62
+ """Download audio and transcribe using Whisper."""
63
  try:
64
  with tempfile.TemporaryDirectory() as temp_dir:
65
  temp_audio_file = os.path.join(temp_dir, "audio.mp3")
66
  ydl_opts = {
67
  'format': 'bestaudio/best',
68
  'outtmpl': temp_audio_file,
69
+ 'postprocessors': [{
70
+ 'key': 'FFmpegExtractAudio',
71
+ 'preferredcodec': 'mp3',
72
+ 'preferredquality': '192',
73
+ }],
74
  }
 
 
75
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
76
  ydl.download([youtube_url])
77
 
 
78
  audio = AudioSegment.from_file(temp_audio_file)
79
  wav_file = os.path.join(temp_dir, "audio.wav")
80
  audio.export(wav_file, format="wav")
81
 
 
82
  model = whisper.load_model("large")
83
  result = model.transcribe(wav_file)
84
+ return result['text']
 
 
85
  except Exception as e:
86
+ print(f"Error during Whisper transcription: {e}")
 
 
 
 
 
 
 
87
  return None
88
 
89
+ def get_transcript_from_youtube_api(video_id, video_length):
90
+ """Fetch transcript using YouTubeTranscriptApi."""
91
  try:
92
  transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
93
  for transcript in transcript_list:
94
  if not transcript.is_generated:
95
+ return " ".join(segment['text'] for segment in transcript.fetch())
96
+ if video_length > 15:
97
+ auto_transcript = transcript_list.find_generated_transcript(['en'])
98
+ return " ".join(segment['text'] for segment in auto_transcript.fetch())
99
  return None
 
100
  except Exception as e:
101
  print(f"Error fetching transcript: {e}")
102
  return None
103
 
 
104
  def get_transcript(youtube_url):
105
+ """Fetch transcript or use Whisper fallback."""
106
  video_id = extract_video_id(youtube_url)
107
  if not video_id:
108
+ return "Invalid or unsupported YouTube URL."
 
 
109
  video_length = get_video_duration(video_id)
110
  if video_length:
111
+ transcript = get_transcript_from_youtube_api(video_id, video_length)
112
+ return transcript if transcript else download_and_transcribe_with_whisper(youtube_url)
113
+ return "Error fetching video details."
114
+
115
+ def summarize_text(text):
116
+ """Summarize text using Hugging Face's BART model."""
117
+ try:
118
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
119
+ max_input_length = 1024
120
+ chunk_overlap = 100
121
+ text_chunks = [
122
+ text[i:i + max_input_length]
123
+ for i in range(0, len(text), max_input_length - chunk_overlap)
124
+ ]
125
+ summaries = [
126
+ summarizer(chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
127
+ for chunk in text_chunks
128
+ ]
129
+ return " ".join(summaries)
130
+ except Exception as e:
131
+ print(f"Error during summarization: {e}")
 
 
 
 
 
 
 
132
  return None
133
 
134
+ def generate_optimized_content(summarized_text):
135
+ """Generate optimized video metadata using GPT."""
136
  prompt = f"""
137
  Analyze the following summarized YouTube video transcript and:
138
  1. Extract the top 10 keywords.
 
141
  4. Generate related tags for the video.
142
 
143
  Summarized Transcript:
144
+ {summarized_text}
145
 
146
  Provide the results in the following JSON format:
147
  {{
 
151
  "tags": ["tag1", "tag2", ..., "tag10"]
152
  }}
153
  """
 
154
  try:
 
155
  response = openai.ChatCompletion.create(
156
  model="gpt-3.5-turbo",
157
  messages=[
 
159
  {"role": "user", "content": prompt}
160
  ]
161
  )
162
+ return json.loads(response['choices'][0]['message']['content'])
 
 
 
 
163
  except Exception as e:
164
+ print(f"Error generating metadata: {e}")
165
+ return {"error": "Unable to generate metadata."}
 
166
 
167
+ # Main Gradio Interface
168
+ def process_video(youtube_url):
169
+ """Complete video processing workflow."""
170
  transcript = get_transcript(youtube_url)
171
  if not transcript:
172
+ return {"error": "Could not fetch the transcript. Please try another video."}
173
+ summary = summarize_text(transcript)
 
174
  optimized_content = generate_optimized_content(summary)
175
+ return optimized_content
176
+
177
+ iface = gr.Interface(
178
+ fn=process_video,
179
+ inputs=gr.Textbox(label="Enter YouTube URL"),
180
+ outputs=gr.JSON(label="Optimized Metadata"),
181
+ title="YouTube Video SEO Optimizer",
182
+ description="Paste a YouTube URL to generate an SEO-friendly title, description, tags, and keywords."
 
 
183
  )
184
 
185
  if __name__ == "__main__":
186
+ iface.launch()