import re

import gradio as gr
from youtube_transcript_api._api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

# A bare YouTube video ID: exactly 11 URL-safe characters.
_BARE_VIDEO_ID_RE = re.compile(r"[a-zA-Z0-9_-]{11}")

# URL shapes that carry a video ID, tried in order. Capture groups stop at
# '?', '&', '#' and '/' so query strings, fragments and trailing slashes never
# leak into the returned ID.
_URL_VIDEO_ID_PATTERNS = (
    # Watch URL: https://www.youtube.com/watch?v=VIDEO_ID
    # ('v' may follow other query parameters, e.g. watch?feature=share&v=ID).
    re.compile(r"watch\?(?:[^#]*?&)?v=([^&#/]+)"),
    # Shortened URL: https://youtu.be/VIDEO_ID
    re.compile(r"youtu\.be/([^?&#/]+)"),
    # Path-style URLs: /embed/VIDEO_ID, /shorts/VIDEO_ID, /live/VIDEO_ID
    # (also on the youtube-nocookie.com embed domain).
    re.compile(r"youtube(?:-nocookie)?\.com/(?:embed|shorts|live)/([^?&#/]+)"),
)

# Language codes tried when none of the caller's preferences is English.
_ENGLISH_FALLBACKS = ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN']


def _extract_video_id(youtube_url: str) -> str | None:
    """
    Extracts the YouTube video ID from a URL or a bare ID.

    Handles watch, shortened (youtu.be), embed, shorts and live URLs, as well
    as an 11-character video ID passed directly. Surrounding whitespace is
    ignored.

    Args:
        youtube_url (str): A YouTube URL or an 11-character video ID.

    Returns:
        str | None: The extracted video ID, or None if nothing recognisable
        was found.
    """
    candidate = youtube_url.strip()

    for pattern in _URL_VIDEO_ID_PATTERNS:
        match = pattern.search(candidate)
        if match:
            return match.group(1)

    # Video ID directly passed
    if _BARE_VIDEO_ID_RE.fullmatch(candidate):
        return candidate

    return None


def _find_first(finder, language_groups):
    """
    Returns the first transcript `finder` produces for any language group.

    `finder` is called with each list of language codes in turn; a
    NoTranscriptFound error just moves on to the next group. Returns None
    when every group fails.
    """
    for codes in language_groups:
        try:
            return finder(codes)
        except NoTranscriptFound:
            continue
    return None


def _select_transcript(transcript_list, lang_preference_list: list[str], video_id: str):
    """
    Picks the best transcript from `transcript_list`, or None if it is empty.

    Order of attempts:
      1. `find_transcript` for each preferred language, in order.
      2. `find_generated_transcript` for each preferred language, in order.
      3. The English fallbacks (only when no preferred code starts with "en").
      4. The first transcript the list yields when iterated.
    """
    per_language = [[code] for code in lang_preference_list]

    # NOTE(review): the upstream docs describe find_transcript as already
    # preferring manually created transcripts and falling back to generated
    # ones, so the second pass may be redundant - confirm before removing.
    transcript = _find_first(transcript_list.find_transcript, per_language)
    if transcript is None:
        transcript = _find_first(
            transcript_list.find_generated_transcript, per_language)

    already_tried_english = any(
        code.lower().startswith('en') for code in lang_preference_list)
    if transcript is None and not already_tried_english:
        # Try any available English transcript, then generated English.
        transcript = _find_first(
            transcript_list.find_transcript, [_ENGLISH_FALLBACKS])
        if transcript is None:
            transcript = _find_first(
                transcript_list.find_generated_transcript, [_ENGLISH_FALLBACKS])

    if transcript is None:
        print(f"Notice: No transcript found in preferred languages or English for video ID '{video_id}'. "
              "Attempting to fetch the first available original language transcript.")
        # Last resort: whatever transcript the list yields first, if any.
        transcript = next(iter(transcript_list), None)

    return transcript


def get_youtube_video_transcript_scraper(video_url_or_id: str, lang_preference_list: list[str]) -> str:
    """
    Retrieves the transcript for a given YouTube video URL or video ID using
    youtube_transcript_api. It tries to fetch the transcript in the preferred
    languages first, then English, then the first transcript available.

    Errors are never raised to the caller; they are reported as a string
    starting with "Error:".

    Args:
        video_url_or_id (str): The full YouTube video URL
                               (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
                               or just the 11-character video ID.
        lang_preference_list (list[str]): A list of language codes to try for the transcript,
                                          in order of preference. Example: ['en', 'en-US', 'es'].

    Returns:
        str: The concatenated transcript text if successful.
             An error message string if the transcript cannot be fetched.
    """
    video_id = _extract_video_id(video_url_or_id)
    if not video_id:
        return f"Error: Invalid YouTube video URL or ID: '{video_url_or_id}'. Could not extract a valid video ID."

    if not lang_preference_list:
        return "Error: Language preference list ('lang_preference_list') cannot be empty."

    try:
        api = YouTubeTranscriptApi()
        transcript_list_obj = api.list(video_id)

        transcript_found = _select_transcript(
            transcript_list_obj, lang_preference_list, video_id)
        if transcript_found is None:
            # The list yielded nothing at all.
            return (f"Error: No transcripts at all seem to be available for video ID '{video_id}'. "
                    f"Checked preferred: {lang_preference_list}, English fallbacks, and any original language.")

        full_transcript_data = transcript_found.fetch()
        return " ".join(segment.text for segment in full_transcript_data)

    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    except NoTranscriptFound:
        return f"Error: No transcripts whatsoever found for video ID '{video_id}'. The video might not have any captions initially."
    except Exception as e:
        # Boundary handler: this function reports failures as strings, so any
        # remaining error is converted rather than propagated.
        error_type = type(e).__name__
        # Matched by class name because VideoUnavailable is not imported here.
        if "VideoUnavailable" in error_type:
            return f"Error: Video '{video_id}' is unavailable. It might be private, deleted, or geo-restricted."
        # Heuristic on the message text. Both needles must be lower-case
        # because the message is lower-cased before the comparison.
        message = str(e).lower()
        if "video id" in message or "parameter" in message:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or other parameter issue. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"


# NOTE: per the interface description below, the MCP argument descriptions are
# taken from this docstring, so each argument is kept on a single line.
def gradio_mcp_handler(video_url_or_id: str, lang_preference_str: str):
    """
    MCP tool handler to retrieve YouTube video transcript using youtube_transcript_api.

    Args:
        video_url_or_id (str): The YouTube video URL or its 11-character ID.
        lang_preference_str (str): A comma-separated string of preferred language codes for the transcript (e.g., "en,en-US,es"). Defaults to "en" if empty or invalid.

    Returns:
        str: The fetched transcript or an error message.
    """
    # Normalise once and pass the stripped value on, so stray whitespace or a
    # missing (None) value cannot break ID extraction further down.
    video_url_or_id = (video_url_or_id or "").strip()
    if not video_url_or_id:
        return "Error: 'video_url_or_id' argument cannot be empty."

    lang_list = [
        code.strip()
        for code in (lang_preference_str or "").split(',')
        if code.strip()
    ]
    # Covers empty input as well as strings such as "," that contain no codes.
    if not lang_list:
        lang_list = ['en']

    return get_youtube_video_transcript_scraper(video_url_or_id, lang_list)


# Define Gradio input components for MCP
inputs = [
    gr.Textbox(
        label="YouTube Video URL or ID",
        placeholder="e.g., https://www.youtube.com/watch?v=VIDEO_ID or VIDEO_ID"
    ),
    gr.Textbox(
        label="Preferred Language Codes (comma-separated)",
        value="en,en-US",
        placeholder="e.g., en,es,fr (default: en)"
    )
]

outputs = gr.Textbox(
    label="Transcript Output",
    lines=15,
    show_copy_button=True
)

demo = gr.Interface(
    fn=gradio_mcp_handler,
    inputs=inputs,
    outputs=outputs,
    title="YouTube Transcript Retriever (youtube-transcript-api)",
    description=(
        "Enter YouTube video URL/ID and comma-separated language codes to fetch transcript using 'youtube-transcript-api'. "
        "MCP argument descriptions from handler's docstring."
    ),
    allow_flagging='never',
    examples=[
        ["https://www.youtube.com/watch?v=Sd6F2pfKJmk", "en"],
        ["Sd6F2pfKJmk", "en,ja"],
        ["https://www.youtube.com/watch?v=rokGy0huYEA", "ja,en"]
    ],
    article=(
        "**How to Use:**\n"
        "1. Paste YouTube video URL or 11-character video ID.\n"
        "2. Enter comma-separated language codes (e.g., `en-GB,en,es`). Defaults to `en` if empty.\n"
        "3. Click 'Submit'.\n\n"
        "**MCP Server Information:**\n"
        "Launched with `mcp_server=True`, exposes an MCP tool.\n"
        "- Tool arguments `video_url_or_id` (str) and `lang_preference_str` (str) are defined in handler docstring.\n"
        "- Schema: `/gradio_api/mcp/schema`. Endpoint: `/gradio_api/mcp/sse`."
    )
)

if __name__ == '__main__':
    print("Gradio app starting with your specified youtube-transcript-api methods...")
    print("MCP Server integration enabled (mcp_server=True).")
    print("Ensure 'gradio[mcp]' and 'youtube-transcript-api' are installed.")
    demo.launch(mcp_server=True)