AnalysisWithMSR committed on
Commit
1d133a1
·
verified ·
1 Parent(s): 783f341

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -105
app.py CHANGED
@@ -1,20 +1,33 @@
1
- import gradio as gr
2
- from transformers import pipeline
 
3
  import whisper
4
  from pydub import AudioSegment
5
  import tempfile
6
- import os
7
- import googleapiclient.discovery
8
  from youtube_transcript_api import YouTubeTranscriptApi
 
9
  import openai
 
 
 
 
 
 
 
 
 
10
 
11
- # Load API keys from environment variables (recommended)
12
- YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY")
13
- OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
14
 
 
 
15
 
 
16
  def extract_video_id(url):
17
- """Extracts the video ID from a YouTube URL."""
18
  try:
19
  parsed_url = urlparse(url)
20
  if "youtube.com" in parsed_url.netloc:
@@ -22,22 +35,15 @@ def extract_video_id(url):
22
  return query_params.get('v', [None])[0]
23
  elif "youtu.be" in parsed_url.netloc:
24
  return parsed_url.path.strip("/")
25
- else:
26
- print("Invalid YouTube URL.")
27
- return None
28
  except Exception as e:
29
  print(f"Error parsing URL: {e}")
30
  return None
31
 
32
-
33
  def get_video_duration(video_id):
34
- """Fetches the video duration in minutes (if API key provided)."""
35
- if not YOUTUBE_API_KEY:
36
- print("Missing YouTube API key. Skipping video duration.")
37
- return None
38
-
39
  try:
40
- youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
41
  request = youtube.videos().list(part="contentDetails", id=video_id)
42
  response = request.execute()
43
  if response["items"]:
@@ -47,105 +53,86 @@ def get_video_duration(video_id):
47
  minutes = int(match.group(2)) if match.group(2) else 0
48
  seconds = int(match.group(3)) if match.group(3) else 0
49
  return hours * 60 + minutes + seconds / 60
50
- else:
51
- print("No video details found.")
52
- return None
53
  except Exception as e:
54
- print(f"Error fetching video duration: {e}")
55
  return None
56
 
57
-
58
  def download_and_transcribe_with_whisper(youtube_url):
59
- """Downloads and transcribes audio using Whisper."""
60
  try:
61
  with tempfile.TemporaryDirectory() as temp_dir:
62
  temp_audio_file = os.path.join(temp_dir, "audio.mp3")
63
  ydl_opts = {
64
  'format': 'bestaudio/best',
65
  'outtmpl': temp_audio_file,
66
- 'extractaudio': True,
67
- 'audioquality': 1,
 
 
 
68
  }
69
-
70
- # Download audio using yt-dlp
71
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
72
  ydl.download([youtube_url])
73
 
74
- # Convert to wav for Whisper
75
  audio = AudioSegment.from_file(temp_audio_file)
76
  wav_file = os.path.join(temp_dir, "audio.wav")
77
  audio.export(wav_file, format="wav")
78
 
79
- # Run Whisper transcription
80
  model = whisper.load_model("large")
81
  result = model.transcribe(wav_file)
82
- transcript = result['text']
83
- return transcript
84
-
85
  except Exception as e:
86
- print(f"Error during transcription: {e}")
87
- return None
88
-
89
-
90
- def get_transcript_from_youtube_api(video_id):
91
- """Fetches transcript using YouTube API (if available)."""
92
- if not YOUTUBE_API_KEY:
93
- print("Missing YouTube API key. Skipping YouTube transcript.")
94
  return None
95
 
 
 
96
  try:
97
  transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
98
  for transcript in transcript_list:
99
  if not transcript.is_generated:
100
- segments = transcript.fetch()
101
- return " ".join(segment['text'] for segment in segments)
102
- print("Manual transcript not found.")
 
103
  return None
104
-
105
  except Exception as e:
106
  print(f"Error fetching transcript: {e}")
107
  return None
108
 
109
-
110
  def get_transcript(youtube_url):
111
- """Gets transcript from YouTube API or Whisper if unavailable."""
112
  video_id = extract_video_id(youtube_url)
113
  if not video_id:
114
- print("Invalid or unsupported YouTube URL.")
115
- return None
116
-
117
  video_length = get_video_duration(video_id)
118
  if video_length:
119
- transcript = get_transcript_from_youtube_api(video_id)
120
- if transcript:
121
- return transcript
122
-
123
- print("Using Whisper for transcription.")
124
- return download_and_transcribe_with_whisper(youtube_url)
125
-
126
-
127
- def summarize_text_huggingface(text):
128
- """Summarizes text using a Hugging Face summarization model."""
129
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
130
- max_input_length = 1024
131
- chunk_overlap = 100
132
- text_chunks = [
133
- text[i:i + max_input_length]
134
- for i in range(0, len(text), max_input_length - chunk_overlap)
135
- ]
136
- summaries = [
137
- summarizer(chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
138
- for chunk in text_chunks
139
- ]
140
- return " ".join(summaries)
141
-
142
-
143
- def generate_optimized_content(summarized_transcript):
144
- """Generates optimized content using OpenAI (if API key provided)."""
145
- if not OPENAI_API_KEY:
146
- print("Missing OpenAI API key. Skipping optimized content generation.")
147
  return None
148
 
 
 
149
  prompt = f"""
150
  Analyze the following summarized YouTube video transcript and:
151
  1. Extract the top 10 keywords.
@@ -154,7 +141,7 @@ def generate_optimized_content(summarized_transcript):
154
  4. Generate related tags for the video.
155
 
156
  Summarized Transcript:
157
- {summarized_transcript}
158
 
159
  Provide the results in the following JSON format:
160
  {{
@@ -164,9 +151,7 @@ def generate_optimized_content(summarized_transcript):
164
  "tags": ["tag1", "tag2", ..., "tag10"]
165
  }}
166
  """
167
-
168
  try:
169
- # Use the updated OpenAI API format for chat completions
170
  response = openai.ChatCompletion.create(
171
  model="gpt-3.5-turbo",
172
  messages=[
@@ -174,35 +159,28 @@ def generate_optimized_content(summarized_transcript):
174
  {"role": "user", "content": prompt}
175
  ]
176
  )
177
- # Extract and parse the response
178
- response_content = response['choices'][0]['message']['content']
179
- content = json.loads(response_content)
180
- return content
181
-
182
  except Exception as e:
183
- print(f"Error generating content: {e}")
184
- return None
185
-
186
 
187
- def seo_tool(youtube_url):
188
- """This function takes a YouTube URL as input and performs SEO optimization tasks."""
 
189
  transcript = get_transcript(youtube_url)
190
  if not transcript:
191
- return "Could not fetch the transcript. Please try another video."
192
-
193
- summary = summarize_text_huggingface(transcript)
194
  optimized_content = generate_optimized_content(summary)
195
-
196
- return summary, optimized_content
197
-
198
-
199
- interface = gr.Interface(
200
- fn=seo_tool,
201
- inputs="text",
202
- outputs=["text", "json"],
203
- title="SEO Tool for YouTube Videos",
204
- description="Enter a YouTube URL to get a summary and optimized content suggestions."
205
  )
206
 
207
  if __name__ == "__main__":
208
- interface.launch()
 
1
+ import googleapiclient.discovery
2
+ import re
3
+ import yt_dlp
4
  import whisper
5
  from pydub import AudioSegment
6
  import tempfile
7
+ from transformers import pipeline
 
8
  from youtube_transcript_api import YouTubeTranscriptApi
9
+ import torch
10
  import openai
11
+ import json
12
+ from urllib.parse import urlparse, parse_qs
13
+ import os
14
+ import gradio as gr
15
+
16
+ # API Keys setup
17
+ youtube_api_key = os.getenv("YOUTUBE_API_KEY") # Set these as environment variables
18
+ openai_api_key = os.getenv("OPENAI_API_KEY")
19
+ openai.api_key = openai_api_key
20
 
21
+ # Validation for missing API keys
22
+ if not youtube_api_key:
23
+ raise ValueError("YOUTUBE_API_KEY is not set. Please set it as an environment variable.")
24
 
25
+ if not openai_api_key:
26
+ raise ValueError("OPENAI_API_KEY is not set. Please set it as an environment variable.")
27
 
28
+ # Utility Functions
29
  def extract_video_id(url):
30
+ """Extract the video ID from a YouTube URL."""
31
  try:
32
  parsed_url = urlparse(url)
33
  if "youtube.com" in parsed_url.netloc:
 
35
  return query_params.get('v', [None])[0]
36
  elif "youtu.be" in parsed_url.netloc:
37
  return parsed_url.path.strip("/")
38
+ return None
 
 
39
  except Exception as e:
40
  print(f"Error parsing URL: {e}")
41
  return None
42
 
 
43
  def get_video_duration(video_id):
44
+ """Fetch the video duration."""
 
 
 
 
45
  try:
46
+ youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=youtube_api_key)
47
  request = youtube.videos().list(part="contentDetails", id=video_id)
48
  response = request.execute()
49
  if response["items"]:
 
53
  minutes = int(match.group(2)) if match.group(2) else 0
54
  seconds = int(match.group(3)) if match.group(3) else 0
55
  return hours * 60 + minutes + seconds / 60
56
+ return None
 
 
57
  except Exception as e:
58
+ print(f"Error fetching duration: {e}")
59
  return None
60
 
 
61
  def download_and_transcribe_with_whisper(youtube_url):
62
+ """Download audio and transcribe using Whisper."""
63
  try:
64
  with tempfile.TemporaryDirectory() as temp_dir:
65
  temp_audio_file = os.path.join(temp_dir, "audio.mp3")
66
  ydl_opts = {
67
  'format': 'bestaudio/best',
68
  'outtmpl': temp_audio_file,
69
+ 'postprocessors': [{
70
+ 'key': 'FFmpegExtractAudio',
71
+ 'preferredcodec': 'mp3',
72
+ 'preferredquality': '192',
73
+ }],
74
  }
 
 
75
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
76
  ydl.download([youtube_url])
77
 
 
78
  audio = AudioSegment.from_file(temp_audio_file)
79
  wav_file = os.path.join(temp_dir, "audio.wav")
80
  audio.export(wav_file, format="wav")
81
 
 
82
  model = whisper.load_model("large")
83
  result = model.transcribe(wav_file)
84
+ return result['text']
 
 
85
  except Exception as e:
86
+ print(f"Error during Whisper transcription: {e}")
 
 
 
 
 
 
 
87
  return None
88
 
89
+ def get_transcript_from_youtube_api(video_id, video_length):
90
+ """Fetch transcript using YouTubeTranscriptApi."""
91
  try:
92
  transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
93
  for transcript in transcript_list:
94
  if not transcript.is_generated:
95
+ return " ".join(segment['text'] for segment in transcript.fetch())
96
+ if video_length > 15:
97
+ auto_transcript = transcript_list.find_generated_transcript(['en'])
98
+ return " ".join(segment['text'] for segment in auto_transcript.fetch())
99
  return None
 
100
  except Exception as e:
101
  print(f"Error fetching transcript: {e}")
102
  return None
103
 
 
104
  def get_transcript(youtube_url):
105
+ """Fetch transcript or use Whisper fallback."""
106
  video_id = extract_video_id(youtube_url)
107
  if not video_id:
108
+ return "Invalid or unsupported YouTube URL."
 
 
109
  video_length = get_video_duration(video_id)
110
  if video_length:
111
+ transcript = get_transcript_from_youtube_api(video_id, video_length)
112
+ return transcript if transcript else download_and_transcribe_with_whisper(youtube_url)
113
+ return "Error fetching video details."
114
+
115
+ def summarize_text(text):
116
+ """Summarize text using Hugging Face's BART model."""
117
+ try:
118
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
119
+ max_input_length = 1024
120
+ chunk_overlap = 100
121
+ text_chunks = [
122
+ text[i:i + max_input_length]
123
+ for i in range(0, len(text), max_input_length - chunk_overlap)
124
+ ]
125
+ summaries = [
126
+ summarizer(chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
127
+ for chunk in text_chunks
128
+ ]
129
+ return " ".join(summaries)
130
+ except Exception as e:
131
+ print(f"Error during summarization: {e}")
 
 
 
 
 
 
 
132
  return None
133
 
134
+ def generate_optimized_content(summarized_text):
135
+ """Generate optimized video metadata using GPT."""
136
  prompt = f"""
137
  Analyze the following summarized YouTube video transcript and:
138
  1. Extract the top 10 keywords.
 
141
  4. Generate related tags for the video.
142
 
143
  Summarized Transcript:
144
+ {summarized_text}
145
 
146
  Provide the results in the following JSON format:
147
  {{
 
151
  "tags": ["tag1", "tag2", ..., "tag10"]
152
  }}
153
  """
 
154
  try:
 
155
  response = openai.ChatCompletion.create(
156
  model="gpt-3.5-turbo",
157
  messages=[
 
159
  {"role": "user", "content": prompt}
160
  ]
161
  )
162
+ return json.loads(response['choices'][0]['message']['content'])
 
 
 
 
163
  except Exception as e:
164
+ print(f"Error generating metadata: {e}")
165
+ return {"error": "Unable to generate metadata."}
 
166
 
167
+ # Main Gradio Interface
168
+ def process_video(youtube_url):
169
+ """Complete video processing workflow."""
170
  transcript = get_transcript(youtube_url)
171
  if not transcript:
172
+ return {"error": "Could not fetch the transcript. Please try another video."}
173
+ summary = summarize_text(transcript)
 
174
  optimized_content = generate_optimized_content(summary)
175
+ return optimized_content
176
+
177
+ iface = gr.Interface(
178
+ fn=process_video,
179
+ inputs=gr.Textbox(label="Enter YouTube URL"),
180
+ outputs=gr.JSON(label="Optimized Metadata"),
181
+ title="YouTube Video SEO Optimizer",
182
+ description="Paste a YouTube URL to generate an SEO-friendly title, description, tags, and keywords."
 
 
183
  )
184
 
185
  if __name__ == "__main__":
186
+ iface.launch()