Spaces:

AnalysisWithMSR
/

SEO

Sleeping

App Files Community

AnalysisWithMSR committed on Dec 14, 2024

Commit

b248ec3

verified ·

1 Parent(s): 5552067

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -79

app.py CHANGED Viewed

@@ -5,29 +5,16 @@ import whisper
 from pydub import AudioSegment
 import tempfile
 from transformers import pipeline
 from youtube_transcript_api import YouTubeTranscriptApi
 import torch
 import openai
 import json
 from urllib.parse import urlparse, parse_qs
 import os
-import gradio as gr
-# API Keys setup
-youtube_api_key = os.getenv("YOUTUBE_API_KEY")  # Set these as environment variables
-openai_api_key = os.getenv("OPENAI_API_KEY")
-openai.api_key = openai_api_key
-# Validation for missing API keys
-if not youtube_api_key:
-    raise ValueError("YOUTUBE_API_KEY is not set. Please set it as an environment variable.")
-if not openai_api_key:
-    raise ValueError("OPENAI_API_KEY is not set. Please set it as an environment variable.")
-# Utility Functions
 def extract_video_id(url):
-    """Extract the video ID from a YouTube URL."""
     try:
         parsed_url = urlparse(url)
         if "youtube.com" in parsed_url.netloc:
@@ -35,15 +22,17 @@ def extract_video_id(url):
             return query_params.get('v', [None])[0]
         elif "youtu.be" in parsed_url.netloc:
             return parsed_url.path.strip("/")
-        return None
     except Exception as e:
         print(f"Error parsing URL: {e}")
         return None
-def get_video_duration(video_id):
-    """Fetch the video duration."""
     try:
-        youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=youtube_api_key)
         request = youtube.videos().list(part="contentDetails", id=video_id)
         response = request.execute()
         if response["items"]:
@@ -53,86 +42,104 @@ def get_video_duration(video_id):
             minutes = int(match.group(2)) if match.group(2) else 0
             seconds = int(match.group(3)) if match.group(3) else 0
             return hours * 60 + minutes + seconds / 60
-        return None
     except Exception as e:
-        print(f"Error fetching duration: {e}")
         return None
 def download_and_transcribe_with_whisper(youtube_url):
-    """Download audio and transcribe using Whisper."""
     try:
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_audio_file = os.path.join(temp_dir, "audio.mp3")
             ydl_opts = {
                 'format': 'bestaudio/best',
                 'outtmpl': temp_audio_file,
-                'postprocessors': [{
-                    'key': 'FFmpegExtractAudio',
-                    'preferredcodec': 'mp3',
-                    'preferredquality': '192',
-                }],
             }
             with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                 ydl.download([youtube_url])
             audio = AudioSegment.from_file(temp_audio_file)
             wav_file = os.path.join(temp_dir, "audio.wav")
             audio.export(wav_file, format="wav")
             model = whisper.load_model("large")
             result = model.transcribe(wav_file)
-            return result['text']
     except Exception as e:
-        print(f"Error during Whisper transcription: {e}")
         return None
 def get_transcript_from_youtube_api(video_id, video_length):
-    """Fetch transcript using YouTubeTranscriptApi."""
     try:
         transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
         for transcript in transcript_list:
             if not transcript.is_generated:
-                return " ".join(segment['text'] for segment in transcript.fetch())
         if video_length > 15:
             auto_transcript = transcript_list.find_generated_transcript(['en'])
-            return " ".join(segment['text'] for segment in auto_transcript.fetch())
         return None
     except Exception as e:
         print(f"Error fetching transcript: {e}")
         return None
-def get_transcript(youtube_url):
-    """Fetch transcript or use Whisper fallback."""
     video_id = extract_video_id(youtube_url)
     if not video_id:
-        return "Invalid or unsupported YouTube URL."
-    video_length = get_video_duration(video_id)
-    if video_length:
-        transcript = get_transcript_from_youtube_api(video_id, video_length)
-        return transcript if transcript else download_and_transcribe_with_whisper(youtube_url)
-    return "Error fetching video details."
-def summarize_text(text):
-    """Summarize text using Hugging Face's BART model."""
-    try:
-        summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
-        max_input_length = 1024
-        chunk_overlap = 100
-        text_chunks = [
-            text[i:i + max_input_length]
-            for i in range(0, len(text), max_input_length - chunk_overlap)
-        ]
-        summaries = [
-            summarizer(chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
-            for chunk in text_chunks
-        ]
-        return " ".join(summaries)
-    except Exception as e:
-        print(f"Error during summarization: {e}")
         return None
-def generate_optimized_content(summarized_text):
-    """Generate optimized video metadata using GPT."""
     prompt = f"""
     Analyze the following summarized YouTube video transcript and:
     1. Extract the top 10 keywords.
@@ -141,7 +148,7 @@ def generate_optimized_content(summarized_text):
     4. Generate related tags for the video.
     Summarized Transcript:
-    {summarized_text}
     Provide the results in the following JSON format:
     {{
@@ -151,7 +158,9 @@ def generate_optimized_content(summarized_text):
         "tags": ["tag1", "tag2", ..., "tag10"]
     }}
     """
     try:
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
             messages=[
@@ -159,28 +168,43 @@ def generate_optimized_content(summarized_text):
                 {"role": "user", "content": prompt}
             ]
         )
-        return json.loads(response['choices'][0]['message']['content'])
-    except Exception as e:
-        print(f"Error generating metadata: {e}")
-        return {"error": "Unable to generate metadata."}
-# Main Gradio Interface
-def process_video(youtube_url):
-    """Complete video processing workflow."""
-    transcript = get_transcript(youtube_url)
     if not transcript:
-        return {"error": "Could not fetch the transcript. Please try another video."}
-    summary = summarize_text(transcript)
-    optimized_content = generate_optimized_content(summary)
-    return optimized_content
 iface = gr.Interface(
-    fn=process_video,
-    inputs=gr.Textbox(label="Enter YouTube URL"),
-    outputs=gr.JSON(label="Optimized Metadata"),
-    title="YouTube Video SEO Optimizer",
-    description="Paste a YouTube URL to generate an SEO-friendly title, description, tags, and keywords."
 )
 if __name__ == "__main__":
-    iface.launch()

 from pydub import AudioSegment
 import tempfile
 from transformers import pipeline
+from pytrends.request import TrendReq
 from youtube_transcript_api import YouTubeTranscriptApi
 import torch
 import openai
 import json
 from urllib.parse import urlparse, parse_qs
 import os
 def extract_video_id(url):
+    """Extracts the video ID from a YouTube URL."""
     try:
         parsed_url = urlparse(url)
         if "youtube.com" in parsed_url.netloc:
             return query_params.get('v', [None])[0]
         elif "youtu.be" in parsed_url.netloc:
             return parsed_url.path.strip("/")
+        else:
+            print("Invalid YouTube URL.")
+            return None
     except Exception as e:
         print(f"Error parsing URL: {e}")
         return None
+def get_video_duration(video_id, api_key):
+    """Fetches the video duration in minutes."""
     try:
+        youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)
         request = youtube.videos().list(part="contentDetails", id=video_id)
         response = request.execute()
         if response["items"]:
             minutes = int(match.group(2)) if match.group(2) else 0
             seconds = int(match.group(3)) if match.group(3) else 0
             return hours * 60 + minutes + seconds / 60
+        else:
+            print("No video details found.")
+            return None
     except Exception as e:
+        print(f"Error fetching video duration: {e}")
         return None
 def download_and_transcribe_with_whisper(youtube_url):
     try:
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_audio_file = os.path.join(temp_dir, "audio.mp3")
             ydl_opts = {
                 'format': 'bestaudio/best',
                 'outtmpl': temp_audio_file,
+                'extractaudio': True,
+                'audioquality': 1,
             }
+            # Download audio using yt-dlp
             with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                 ydl.download([youtube_url])
+            # Convert to wav for Whisper
             audio = AudioSegment.from_file(temp_audio_file)
             wav_file = os.path.join(temp_dir, "audio.wav")
             audio.export(wav_file, format="wav")
+            # Run Whisper transcription
             model = whisper.load_model("large")
             result = model.transcribe(wav_file)
+            transcript = result['text']
+            return transcript
     except Exception as e:
+        print(f"Error during transcription: {e}")
         return None
 def get_transcript_from_youtube_api(video_id, video_length):
+    """Fetches transcript using YouTube API if available."""
     try:
         transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
         for transcript in transcript_list:
             if not transcript.is_generated:
+                segments = transcript.fetch()
+                return " ".join(segment['text'] for segment in segments)
         if video_length > 15:
             auto_transcript = transcript_list.find_generated_transcript(['en'])
+            if auto_transcript:
+                segments = auto_transcript.fetch()
+                return " ".join(segment['text'] for segment in segments)
+        print("Manual transcript not available, and video is too short for auto-transcript.")
         return None
     except Exception as e:
         print(f"Error fetching transcript: {e}")
         return None
+def get_transcript(youtube_url, api_key):
+    """Gets transcript from YouTube API or Whisper if unavailable."""
     video_id = extract_video_id(youtube_url)
     if not video_id:
+        print("Invalid or unsupported YouTube URL.")
+        return None
+    video_length = get_video_duration(video_id, api_key)
+    if video_length is not None:
+        print(f"Video length: {video_length:.2f} minutes.")
+        transcript = get_transcript_from_youtube_api(video_id, video_length)
+        if transcript:
+            return transcript
+        print("Using Whisper for transcription.")
+        return download_and_transcribe_with_whisper(youtube_url)
+    else:
+        print("Error fetching video duration.")
         return None
+def summarize_text_huggingface(text):
+    """Summarizes text using a Hugging Face summarization model."""
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
+    max_input_length = 1024
+    chunk_overlap = 100
+    text_chunks = [
+        text[i:i + max_input_length]
+        for i in range(0, len(text), max_input_length - chunk_overlap)
+    ]
+    summaries = [
+        summarizer(chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
+        for chunk in text_chunks
+    ]
+    return " ".join(summaries)
+def generate_optimized_content(api_key, summarized_transcript):
+    openai.api_key = api_key
     prompt = f"""
     Analyze the following summarized YouTube video transcript and:
     1. Extract the top 10 keywords.
     4. Generate related tags for the video.
     Summarized Transcript:
+    {summarized_transcript}
     Provide the results in the following JSON format:
     {{
         "tags": ["tag1", "tag2", ..., "tag10"]
     }}
     """
     try:
+        # Use the updated OpenAI API format for chat completions
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
             messages=[
                 {"role": "user", "content": prompt}
             ]
         )
+        # Extract and parse the response
+        response_content = response['choices'][0]['message']['content']
+        content = json.loads(response_content)
+        return content
+    except Exception as e:
+        print(f"Error generating content: {e}")
+        return None
+def youtube_seo_pipeline(youtube_url):
+    openai.api_key = OPENAI_API_KEY
+    if not YOUTUBE_API_KEY or not OPENAI_API_KEY:
+        return "API keys missing! Please check environment variables."
+    video_id = extract_video_id(youtube_url)
+    if not video_id:
+        return "Invalid YouTube URL."
+    transcript = get_transcript(youtube_url, YOUTUBE_API_KEY)
     if not transcript:
+        return "Failed to fetch transcript. Try another video."
+    summarized_text = summarize_text_huggingface(transcript)
+    optimized_content = generate_optimized_content(OPENAI_API_KEY, summarized_text)
+    if optimized_content:
+        return json.dumps(optimized_content, indent=4)
+    else:
+        return "Failed to generate SEO content."
+# Define the Gradio Interface
 iface = gr.Interface(
+    fn=youtube_seo_pipeline,
+    inputs="text",
+    outputs="text",
+    title="YouTube SEO Optimizer",
+    description="Enter a YouTube video URL to fetch and optimize SEO content (title, description, tags, and keywords)."
 )
+# Run the Gradio app
 if __name__ == "__main__":
+    iface.launch()