# Hugging Face Space: kirbah/mcp-youtube-transcript (status: Running)
# File size: 10,752 bytes

import re
import gradio as gr
from youtube_transcript_api._api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound


def _extract_video_id(youtube_url: str) -> str | None:
    """
    Extracts the YouTube video ID from a URL.
    Handles standard, shortened, and embed URLs.
    """
    # Standard URL: https://www.youtube.com/watch?v=VIDEO_ID
    match = re.search(r"watch\?v=([^&]+)", youtube_url)
    if match:
        return match.group(1)

    # Shortened URL: https://youtu.be/VIDEO_ID
    match = re.search(r"youtu\.be/([^?&]+)", youtube_url)
    if match:
        return match.group(1)

    # Embed URL: https://www.youtube.com/embed/VIDEO_ID
    match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url)
    if match:
        return match.group(1)

    # Video ID directly passed
    if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
        return youtube_url
    return None


# English variants tried as a fallback when none of the caller's preferred
# languages is available.
_ENGLISH_FALLBACK_CODES = ('en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN')


def _select_transcript(transcript_list_obj, lang_preference_list: list[str], video_id: str):
    """
    Picks the best available transcript from a TranscriptList.

    Selection order:
      1. The caller's preferred languages, in order. `find_transcript` walks
         the codes in priority order and, for each code, returns a manually
         created transcript before a generated one, so no separate
         generated-only pass is needed.
      2. English variants from `_ENGLISH_FALLBACK_CODES` that were not already
         covered by step 1.
      3. The first transcript the list yields, whatever its language.

    Args:
        transcript_list_obj: The object returned by `YouTubeTranscriptApi().list(video_id)`.
        lang_preference_list (list[str]): Preferred language codes, most preferred first.
        video_id (str): Only used in the console notice.

    Returns:
        The selected transcript object, or None if the list yields no transcripts.
    """
    try:
        return transcript_list_obj.find_transcript(lang_preference_list)
    except NoTranscriptFound:
        pass

    # Only retry the English codes that step 1 has not already ruled out.
    # Requesting e.g. 'en-GB' must not block a fallback to plain 'en'.
    untried_english = [code for code in _ENGLISH_FALLBACK_CODES
                       if code not in lang_preference_list]
    if untried_english:
        try:
            return transcript_list_obj.find_transcript(untried_english)
        except NoTranscriptFound:
            pass

    print(f"Notice: No transcript found in preferred languages or English for video ID '{video_id}'. "
          "Attempting to fetch the first available original language transcript.")
    return next(iter(transcript_list_obj), None)


def get_youtube_video_transcript_scraper(video_url_or_id: str, lang_preference_list: list[str]) -> str:
    """
    Retrieves the transcript for a given YouTube video URL or video ID using youtube_transcript_api.
    It tries the preferred languages first, then English variants, then any available transcript.

    Args:
        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
                               or just the 11-character video ID.
        lang_preference_list (list[str]): A list of language codes to try for the transcript, in order of preference.
                                     Example: ['en', 'en-US', 'es'].

    Returns:
        str: The transcript segments joined with single spaces if successful.
             An error message string (prefixed with "Error:") if the transcript cannot be fetched;
             this function reports failures via its return value instead of raising.
    """
    video_id = _extract_video_id(video_url_or_id)

    if not video_id:
        return f"Error: Invalid YouTube video URL or ID: '{video_url_or_id}'. Could not extract a valid video ID."

    if not lang_preference_list:
        return "Error: Language preference list ('lang_preference_list') cannot be empty."

    try:
        transcript_list_obj = YouTubeTranscriptApi().list(video_id)
        transcript_found = _select_transcript(
            transcript_list_obj, lang_preference_list, video_id)

        if transcript_found is None:
            return (f"Error: No transcripts at all seem to be available for video ID '{video_id}'. "
                    f"Checked preferred: {lang_preference_list}, English fallbacks, and any original language.")

        full_transcript_data = transcript_found.fetch()
        return " ".join(segment.text for segment in full_transcript_data)

    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    except NoTranscriptFound:
        return f"Error: No transcripts whatsoever found for video ID '{video_id}'. The video might not have any captions initially."
    except Exception as e:
        # Deliberately broad: this is the boundary to the UI/MCP tool, which
        # expects an error string rather than an exception.
        error_type = type(e).__name__
        error_text = str(e).lower()
        # Matched by name because only two library error classes are imported here.
        if "VideoUnavailable" in error_type:
            return f"Error: Video '{video_id}' is unavailable. It might be private, deleted, or geo-restricted."
        # Compare against lower-case needles: error_text is lower-cased, so a
        # mixed-case needle such as "video ID" could never match.
        if "video id" in error_text or "parameter" in error_text:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or other parameter issue. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"


def gradio_mcp_handler(video_url_or_id: str, lang_preference_str: str):
    """
    MCP tool handler to retrieve YouTube video transcript using youtube_transcript_api.

    Args:
        video_url_or_id (str): The YouTube video URL or its 11-character ID.
        lang_preference_str (str): A comma-separated string of preferred language codes for the transcript
                                   (e.g., "en,en-US,es"). Defaults to "en" if empty or invalid.

    Returns:
        str: The fetched transcript or an error message.
    """
    # Treat a missing (None) argument like an empty one instead of crashing on
    # .strip(), and forward the trimmed value so padded input still resolves.
    video_arg = (video_url_or_id or "").strip()
    if not video_arg:
        return "Error: 'video_url_or_id' argument cannot be empty."

    # Split on commas and drop blank entries; inputs such as None, "", "  "
    # or "," all leave the list empty and fall back to English.
    lang_list = [code.strip()
                 for code in (lang_preference_str or "").split(',') if code.strip()]
    if not lang_list:
        lang_list = ['en']

    return get_youtube_video_transcript_scraper(video_arg, lang_list)


# Gradio input components; their order matches the positional parameters of
# gradio_mcp_handler (video URL/ID first, language codes second).
inputs = [
    gr.Textbox(
        label="YouTube Video URL or ID",
        placeholder="e.g., https://www.youtube.com/watch?v=VIDEO_ID or VIDEO_ID"
    ),
    gr.Textbox(
        label="Preferred Language Codes (comma-separated)",
        value="en,en-US",
        placeholder="e.g., en,es,fr (default: en)"
    )
]

# Single text area that shows either the transcript or the "Error: ..." message.
outputs = gr.Textbox(
    label="Transcript Output",
    lines=15,
    show_copy_button=True
)

# The Interface doubles as the MCP tool definition when launched with
# mcp_server=True.
demo = gr.Interface(
    fn=gradio_mcp_handler,
    inputs=inputs,
    outputs=outputs,
    title="YouTube Transcript Retriever (youtube-transcript-api)",
    description=(
        "Enter YouTube video URL/ID and comma-separated language codes to fetch transcript using 'youtube-transcript-api'. "
        "MCP argument descriptions from handler's docstring."
    ),
    # `flagging_mode` is the current name of this option; `allow_flagging`
    # is the deprecated spelling in the Gradio releases that support MCP.
    flagging_mode='never',
    examples=[
        ["https://www.youtube.com/watch?v=Sd6F2pfKJmk", "en"],
        ["Sd6F2pfKJmk", "en,ja"],
        ["https://www.youtube.com/watch?v=rokGy0huYEA", "ja,en"]
    ],
    article=(
        "**How to Use:**\n"
        "1. Paste YouTube video URL or 11-character video ID.\n"
        "2. Enter comma-separated language codes (e.g., `en-GB,en,es`). Defaults to `en` if empty.\n"
        "3. Click 'Submit'.\n\n"
        "**MCP Server Information:**\n"
        "Launched with `mcp_server=True`, exposes an MCP tool.\n"
        "- Tool arguments `video_url_or_id` (str) and `lang_preference_str` (str) are defined in handler docstring.\n"
        "- Schema: `/gradio_api/mcp/schema`. Endpoint: `/gradio_api/mcp/sse`."
    )
)

if __name__ == '__main__':
    # Startup banner, printed line by line before the server takes over.
    startup_notes = (
        "Gradio app starting with your specified youtube-transcript-api methods...",
        "MCP Server integration enabled (mcp_server=True).",
        "Ensure 'gradio[mcp]' and 'youtube-transcript-api' are installed.",
    )
    for note in startup_notes:
        print(note)

    # Launch the web UI together with the MCP server.
    demo.launch(mcp_server=True)