Spaces:

kirbah
/

mcp-youtube-transcript

Running

App Files Files Community

kirbah committed 8 days ago

Commit

666750d

1 Parent(s): c5d96b1

Add initial implementation for YouTube transcript retrieval and interface setup

Browse files

Files changed (3) hide show

.gitignore +1 -0
app.py +133 -0
requirements.txt +2 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .gradio/**

app.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import re
+import gradio as gr
+from youtube_transcript_api._api import YouTubeTranscriptApi
+from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
+def _extract_video_id(youtube_url: str) -> str | None:
+    """
+    Extracts the YouTube video ID from a URL.
+    Handles standard, shortened, and embed URLs.
+    """
+    # Standard URL: https://www.youtube.com/watch?v=VIDEO_ID
+    match = re.search(r"watch\?v=([^&]+)", youtube_url)
+    if match:
+        return match.group(1)
+    # Shortened URL: https://youtu.be/VIDEO_ID
+    match = re.search(r"youtu\.be/([^?&]+)", youtube_url)
+    if match:
+        return match.group(1)
+    # Embed URL: https://www.youtube.com/embed/VIDEO_ID
+    match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url)
+    if match:
+        return match.group(1)
+    # Video ID directly passed
+    # Basic check for a valid video ID format
+    if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
+        return youtube_url
+    return None
+def get_youtube_video_transcript(video_url_or_id: str, lang_preference: list[str] = ['en', 'en-US', 'en-GB']) -> str:
+    """
+    Retrieves the transcript for a given YouTube video URL or video ID.
+    It tries to fetch the transcript in the preferred languages first (defaulting to English).
+    Args:
+        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
+                               or just the 11-character video ID.
+        lang_preference (list[str]): A list of language codes to try for the transcript, in order of preference.
+                                     Defaults to ['en', 'en-US', 'en-GB'].
+    Returns:
+        str: The concatenated transcript text if successful.
+             An error message string if the transcript cannot be fetched (e.g., disabled, not found, invalid ID).
+    """
+    video_id = _extract_video_id(video_url_or_id)
+    if not video_id:
+        return f"Error: Invalid YouTube video URL or ID provided: '{video_url_or_id}'. Could not extract a valid video ID."
+    try:
+        # Fetch available transcripts
+        transcript_list = YouTubeTranscriptApi().list(video_id)
+        # Try to find transcript in preferred languages
+        transcript = None
+        for lang_code in lang_preference:
+            try:
+                transcript = transcript_list.find_transcript([lang_code])
+                break
+            except NoTranscriptFound:
+                continue
+        # If not found in preferred, try generated transcript in preferred languages
+        if not transcript:
+            for lang_code in lang_preference:
+                try:
+                    transcript = transcript_list.find_generated_transcript([
+                                                                           lang_code])
+                    break
+                except NoTranscriptFound:
+                    continue
+        # If still not found, try any available English transcript
+        if not transcript:
+            try:
+                transcript = transcript_list.find_transcript(
+                    ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN'])
+            except NoTranscriptFound:
+                pass  # Continue to try any generated English transcript
+        if not transcript:
+            try:
+                transcript = transcript_list.find_generated_transcript(
+                    ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN'])
+            except NoTranscriptFound:
+                # If no English transcript, grab the first available original language transcript
+                try:
+                    print(
+                        f"YouTubeTool: No English transcript found for {video_id}. Trying first available original language.")
+                    original_lang_transcript = next(
+                        iter(transcript_list))  # get the first one
+                    transcript = original_lang_transcript
+                except StopIteration:  # No transcripts at all
+                    return f"Error: No transcripts at all seem to be available for video ID '{video_id}'."
+                except NoTranscriptFound:  # Should be caught by StopIteration if list is empty
+                    return f"Error: No transcripts found for video ID '{video_id}' after trying preferred and English languages."
+        if transcript:
+            full_transcript_data = transcript.fetch()
+            # Concatenate all text segments into a single string
+            transcript_text = " ".join([segment.text
+                                       for segment in full_transcript_data])
+            return transcript_text
+        else:
+            # This case should ideally be covered by the fallbacks above
+            return f"Error: Could not find a suitable transcript for video ID '{video_id}' in languages: {lang_preference} or English."
+    except TranscriptsDisabled:
+        return f"Error: Transcripts are disabled for video ID '{video_id}'."
+    # This might catch cases where video ID is valid but has zero transcripts at all.
+    except NoTranscriptFound:
+        return f"Error: No transcripts whatsoever could be found for video ID '{video_id}'. The video might not have any captions or transcripts."
+    except Exception as e:
+        # Catch any other unexpected errors from the API or video ID issues not caught by regex
+        error_type = type(e).__name__
+        # Check for common youtube_transcript_api specific errors not explicitly caught if any
+        # Heuristic for bad ID
+        if "video ID" in str(e).lower() or "parameter" in str(e).lower():
+            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or the video is private/deleted. (API Error: {error_type})"
+        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"
def greet(name: str) -> str:
    """
    Retrieves the transcript of a YouTube video as plain text.

    Args:
        name (str): The full YouTube video URL or the 11-character video ID.

    Returns:
        str: The transcript text, or a message starting with "Error:" if the
             transcript could not be retrieved.
    """
    return get_youtube_video_transcript(name)


# Single text-in / text-out interface around greet. mcp_server=True asks
# Gradio to expose the function as an MCP tool in addition to the web UI.
demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch(mcp_server=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gradio[mcp]
2	+ youtube-transcript-api