AnalysisWithMSR committed on
Commit
21fd183
·
verified ·
1 Parent(s): 655b975

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -58
app.py CHANGED
@@ -5,26 +5,13 @@ import whisper
5
  from pydub import AudioSegment
6
  import tempfile
7
  from transformers import pipeline
 
8
  from youtube_transcript_api import YouTubeTranscriptApi
9
  import torch
10
  import openai
11
  import json
12
  from urllib.parse import urlparse, parse_qs
13
  import os
14
- import gradio as gr
15
-
16
- # Set up API keys (ensure these are provided as environment variables)
17
- youtube_api_key = os.getenv("YOUTUBE_API_KEY")
18
- openai_api_key = os.getenv("OPENAI_API_KEY")
19
- openai.api_key = openai_api_key
20
-
21
- # Validate API keys
22
- if not youtube_api_key:
23
- raise ValueError("YOUTUBE_API_KEY is not set. Please set it as an environment variable.")
24
-
25
- if not openai_api_key:
26
- raise ValueError("OPENAI_API_KEY is not set. Please set it as an environment variable.")
27
-
28
 
29
  def extract_video_id(url):
30
  """Extracts the video ID from a YouTube URL."""
@@ -36,12 +23,12 @@ def extract_video_id(url):
36
  elif "youtu.be" in parsed_url.netloc:
37
  return parsed_url.path.strip("/")
38
  else:
 
39
  return None
40
  except Exception as e:
41
  print(f"Error parsing URL: {e}")
42
  return None
43
 
44
-
45
  def get_video_duration(video_id, api_key):
46
  """Fetches the video duration in minutes."""
47
  try:
@@ -56,14 +43,13 @@ def get_video_duration(video_id, api_key):
56
  seconds = int(match.group(3)) if match.group(3) else 0
57
  return hours * 60 + minutes + seconds / 60
58
  else:
 
59
  return None
60
  except Exception as e:
61
  print(f"Error fetching video duration: {e}")
62
  return None
63
 
64
-
65
  def download_and_transcribe_with_whisper(youtube_url):
66
- """Downloads audio from YouTube and transcribes it using Whisper."""
67
  try:
68
  with tempfile.TemporaryDirectory() as temp_dir:
69
  temp_audio_file = os.path.join(temp_dir, "audio.mp3")
@@ -71,63 +57,73 @@ def download_and_transcribe_with_whisper(youtube_url):
71
  ydl_opts = {
72
  'format': 'bestaudio/best',
73
  'outtmpl': temp_audio_file,
74
- 'postprocessors': [{
75
- 'key': 'FFmpegExtractAudio',
76
- 'preferredcodec': 'mp3',
77
- 'preferredquality': '192',
78
- }],
79
  }
80
- # Download audio
 
81
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
82
  ydl.download([youtube_url])
83
- # Convert to WAV
 
84
  audio = AudioSegment.from_file(temp_audio_file)
85
  wav_file = os.path.join(temp_dir, "audio.wav")
86
  audio.export(wav_file, format="wav")
87
- # Transcribe using Whisper
 
88
  model = whisper.load_model("large")
89
  result = model.transcribe(wav_file)
90
- return result['text']
 
 
91
  except Exception as e:
92
  print(f"Error during transcription: {e}")
93
  return None
94
 
95
-
96
  def get_transcript_from_youtube_api(video_id, video_length):
97
  """Fetches transcript using YouTube API if available."""
98
  try:
99
  transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
 
100
  for transcript in transcript_list:
101
  if not transcript.is_generated:
102
  segments = transcript.fetch()
103
  return " ".join(segment['text'] for segment in segments)
104
- if video_length > 15: # Use generated transcript for longer videos
 
105
  auto_transcript = transcript_list.find_generated_transcript(['en'])
106
  if auto_transcript:
107
  segments = auto_transcript.fetch()
108
  return " ".join(segment['text'] for segment in segments)
 
 
109
  return None
 
110
  except Exception as e:
111
  print(f"Error fetching transcript: {e}")
112
  return None
113
 
114
-
115
- def get_transcript(youtube_url):
116
- """Gets transcript using YouTube API or Whisper."""
117
  video_id = extract_video_id(youtube_url)
118
  if not video_id:
119
- return "Invalid YouTube URL."
120
- video_length = get_video_duration(video_id, youtube_api_key)
 
 
121
  if video_length is not None:
 
122
  transcript = get_transcript_from_youtube_api(video_id, video_length)
123
  if transcript:
124
  return transcript
 
125
  return download_and_transcribe_with_whisper(youtube_url)
126
- return "Error fetching video duration."
127
-
 
128
 
129
- def summarize_text(text):
130
- """Summarizes text using Hugging Face pipeline."""
131
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
132
  max_input_length = 1024
133
  chunk_overlap = 100
@@ -141,9 +137,9 @@ def summarize_text(text):
141
  ]
142
  return " ".join(summaries)
143
 
 
 
144
 
145
- def generate_optimized_content(summary):
146
- """Generates optimized content using OpenAI GPT."""
147
  prompt = f"""
148
  Analyze the following summarized YouTube video transcript and:
149
  1. Extract the top 10 keywords.
@@ -152,9 +148,9 @@ def generate_optimized_content(summary):
152
  4. Generate related tags for the video.
153
 
154
  Summarized Transcript:
155
- {summary}
156
 
157
- Provide the results in JSON format:
158
  {{
159
  "keywords": ["keyword1", "keyword2", ..., "keyword10"],
160
  "title": "Generated Title",
@@ -162,7 +158,9 @@ def generate_optimized_content(summary):
162
  "tags": ["tag1", "tag2", ..., "tag10"]
163
  }}
164
  """
 
165
  try:
 
166
  response = openai.ChatCompletion.create(
167
  model="gpt-3.5-turbo",
168
  messages=[
@@ -170,28 +168,39 @@ def generate_optimized_content(summary):
170
  {"role": "user", "content": prompt}
171
  ]
172
  )
173
- return json.loads(response['choices'][0]['message']['content'])
 
 
 
 
174
  except Exception as e:
175
- return {"error": str(e)}
 
 
 
 
 
 
 
176
 
 
 
 
177
 
178
- def process_video(youtube_url):
179
- """Processes video and returns optimized metadata."""
180
- transcript = get_transcript(youtube_url)
181
  if not transcript:
182
- return {"error": "Could not fetch the transcript."}
183
- summary = summarize_text(transcript)
184
- return generate_optimized_content(summary)
185
 
 
 
186
 
187
- # Gradio Interface
188
- iface = gr.Interface(
189
- fn=process_video,
190
- inputs=gr.Textbox(label="Enter a YouTube video URL"),
191
- outputs=gr.JSON(label="Optimized Content"),
192
- title="YouTube Video Optimization Tool",
193
- description="Enter a YouTube URL to generate SEO-optimized titles, descriptions, and tags."
194
- )
195
 
196
  if __name__ == "__main__":
197
- iface.launch()
 
5
  from pydub import AudioSegment
6
  import tempfile
7
  from transformers import pipeline
8
+ from pytrends.request import TrendReq
9
  from youtube_transcript_api import YouTubeTranscriptApi
10
  import torch
11
  import openai
12
  import json
13
  from urllib.parse import urlparse, parse_qs
14
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def extract_video_id(url):
17
  """Extracts the video ID from a YouTube URL."""
 
23
  elif "youtu.be" in parsed_url.netloc:
24
  return parsed_url.path.strip("/")
25
  else:
26
+ print("Invalid YouTube URL.")
27
  return None
28
  except Exception as e:
29
  print(f"Error parsing URL: {e}")
30
  return None
31
 
 
32
  def get_video_duration(video_id, api_key):
33
  """Fetches the video duration in minutes."""
34
  try:
 
43
  seconds = int(match.group(3)) if match.group(3) else 0
44
  return hours * 60 + minutes + seconds / 60
45
  else:
46
+ print("No video details found.")
47
  return None
48
  except Exception as e:
49
  print(f"Error fetching video duration: {e}")
50
  return None
51
 
 
52
  def download_and_transcribe_with_whisper(youtube_url):
 
53
  try:
54
  with tempfile.TemporaryDirectory() as temp_dir:
55
  temp_audio_file = os.path.join(temp_dir, "audio.mp3")
 
57
  ydl_opts = {
58
  'format': 'bestaudio/best',
59
  'outtmpl': temp_audio_file,
60
+ 'extractaudio': True,
61
+ 'audioquality': 1,
 
 
 
62
  }
63
+
64
+ # Download audio using yt-dlp
65
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
66
  ydl.download([youtube_url])
67
+
68
+ # Convert to wav for Whisper
69
  audio = AudioSegment.from_file(temp_audio_file)
70
  wav_file = os.path.join(temp_dir, "audio.wav")
71
  audio.export(wav_file, format="wav")
72
+
73
+ # Run Whisper transcription
74
  model = whisper.load_model("large")
75
  result = model.transcribe(wav_file)
76
+ transcript = result['text']
77
+ return transcript
78
+
79
  except Exception as e:
80
  print(f"Error during transcription: {e}")
81
  return None
82
 
 
83
def get_transcript_from_youtube_api(video_id, video_length):
    """Return caption text obtained through YouTubeTranscriptApi, or None.

    The first transcript that is not auto-generated is used when one exists.
    Otherwise, only when ``video_length`` exceeds 15, the generated English
    transcript is tried. Any exception raised along the way is printed and
    turned into a ``None`` result.
    """

    def _as_text(track):
        # Join the 'text' field of every fetched segment with single spaces.
        return " ".join(piece['text'] for piece in track.fetch())

    try:
        listing = YouTubeTranscriptApi.list_transcripts(video_id)

        # Prefer the first manually created transcript in listing order.
        manual = next((t for t in listing if not t.is_generated), None)
        if manual is not None:
            return _as_text(manual)

        if video_length > 15:
            generated = listing.find_generated_transcript(['en'])
            if generated:
                return _as_text(generated)

        print("Manual transcript not available, and video is too short for auto-transcript.")
        return None

    except Exception as e:
        print(f"Error fetching transcript: {e}")
        return None
105
 
106
def get_transcript(youtube_url, api_key):
    """Return the transcript text for a YouTube URL, or None on failure.

    The video ID is extracted from the URL and the video duration is looked
    up with ``api_key``. A transcript from ``get_transcript_from_youtube_api``
    is preferred; when that yields nothing, the audio is downloaded and
    transcribed with Whisper instead.
    """
    vid = extract_video_id(youtube_url)
    if not vid:
        print("Invalid or unsupported YouTube URL.")
        return None

    minutes = get_video_duration(vid, api_key)
    if minutes is None:
        # Without a duration the function stops here; Whisper is not tried.
        print("Error fetching video duration.")
        return None

    print(f"Video length: {minutes:.2f} minutes.")
    text = get_transcript_from_youtube_api(vid, minutes)
    if not text:
        print("Using Whisper for transcription.")
        text = download_and_transcribe_with_whisper(youtube_url)
    return text
124
 
125
+ def summarize_text_huggingface(text):
126
+ """Summarizes text using a Hugging Face summarization model."""
127
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
128
  max_input_length = 1024
129
  chunk_overlap = 100
 
137
  ]
138
  return " ".join(summaries)
139
 
140
+ def generate_optimized_content(api_key, summarized_transcript):
141
+ openai.api_key = api_key
142
 
 
 
143
  prompt = f"""
144
  Analyze the following summarized YouTube video transcript and:
145
  1. Extract the top 10 keywords.
 
148
  4. Generate related tags for the video.
149
 
150
  Summarized Transcript:
151
+ {summarized_transcript}
152
 
153
+ Provide the results in the following JSON format:
154
  {{
155
  "keywords": ["keyword1", "keyword2", ..., "keyword10"],
156
  "title": "Generated Title",
 
158
  "tags": ["tag1", "tag2", ..., "tag10"]
159
  }}
160
  """
161
+
162
  try:
163
+ # Use the updated OpenAI API format for chat completions
164
  response = openai.ChatCompletion.create(
165
  model="gpt-3.5-turbo",
166
  messages=[
 
168
  {"role": "user", "content": prompt}
169
  ]
170
  )
171
+ # Extract and parse the response
172
+ response_content = response['choices'][0]['message']['content']
173
+ content = json.loads(response_content)
174
+ return content
175
+
176
  except Exception as e:
177
+ print(f"Error generating content: {e}")
178
+ return None
179
+
180
+
181
def main():
    """Run the interactive pipeline: transcript, then summary, then metadata.

    Prompts for a YouTube URL on stdin, reads the two API keys from the
    ``YOUTUBE_API_KEY`` and ``OPENAI_API_KEY`` environment variables, and
    prints the summarized transcript followed by the generated content as
    indented JSON. Every failure is reported with ``print``; the function
    always returns ``None``.
    """
    youtube_url = input("Enter a YouTube video URL: ").strip()

    # Credentials are taken from the environment, never from literals in the
    # source: a key committed to a repository is readable by anyone with
    # access to it and must be treated as compromised and revoked.
    youtube_api_key = os.getenv("YOUTUBE_API_KEY")
    openai_api_key = os.getenv("OPENAI_API_KEY")

    if not youtube_api_key or not openai_api_key:
        print("Missing API keys. Please set your YOUTUBE_API_KEY and OPENAI_API_KEY environment variables.")
        return

    transcript = get_transcript(youtube_url, youtube_api_key)
    if not transcript:
        print("Could not fetch the transcript. Please try another video.")
        return

    summary = summarize_text_huggingface(transcript)
    print("\nSummarized Transcript:\n", summary)

    optimized_content = generate_optimized_content(openai_api_key, summary)
    if optimized_content:
        print("\nOptimized Content:")
        print(json.dumps(optimized_content, indent=4))
    else:
        print("Error generating optimized content.")
 
 
204
 
205
  if __name__ == "__main__":
206
+ main()