Spaces:
Running
Running
import re | |
import gradio as gr | |
from youtube_transcript_api._api import YouTubeTranscriptApi | |
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound | |
def _extract_video_id(youtube_url: str) -> str | None: | |
""" | |
Extracts the YouTube video ID from a URL. | |
Handles standard, shortened, and embed URLs. | |
""" | |
# Standard URL: https://www.youtube.com/watch?v=VIDEO_ID | |
match = re.search(r"watch\?v=([^&]+)", youtube_url) | |
if match: | |
return match.group(1) | |
# Shortened URL: https://youtu.be/VIDEO_ID | |
match = re.search(r"youtu\.be/([^?&]+)", youtube_url) | |
if match: | |
return match.group(1) | |
# Embed URL: https://www.youtube.com/embed/VIDEO_ID | |
match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url) | |
if match: | |
return match.group(1) | |
# Video ID directly passed | |
if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url): | |
return youtube_url | |
return None | |
def _first_transcript_match(finder, lang_codes):
    """Return the first transcript `finder` yields for a single code in `lang_codes`, or None."""
    for lang_code in lang_codes:
        try:
            return finder([lang_code])
        except NoTranscriptFound:
            continue
    return None


def get_youtube_video_transcript_scraper(video_url_or_id: str, lang_preference_list: list[str]) -> str:
    """
    Retrieve the transcript text for a YouTube video via youtube_transcript_api.

    Lookup order:
      1. ``find_transcript`` for each preferred language, in order.
      2. ``find_generated_transcript`` for each preferred language, in order.
      3. If no preferred code starts with "en": a fixed list of English codes,
         first via ``find_transcript`` and then via ``find_generated_transcript``.
      4. The first transcript yielded by iterating the transcript list.

    All failures are reported as a returned "Error: ..." string; nothing is
    raised to the caller.

    Args:
        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
            or just the 11-character video ID.
        lang_preference_list (list[str]): Language codes to try for the transcript, in order of preference.
            Example: ['en', 'en-US', 'es'].

    Returns:
        str: The transcript segments joined by single spaces if successful,
            otherwise an error message string.
    """
    video_id = _extract_video_id(video_url_or_id)
    if not video_id:
        return f"Error: Invalid YouTube video URL or ID: '{video_url_or_id}'. Could not extract a valid video ID."
    if not lang_preference_list:
        return "Error: Language preference list ('lang_preference_list') cannot be empty."

    try:
        transcript_list_obj = YouTubeTranscriptApi().list(video_id)

        # Preferred languages: one code at a time so that list order decides.
        transcript_found = _first_transcript_match(
            transcript_list_obj.find_transcript, lang_preference_list)
        if transcript_found is None:
            transcript_found = _first_transcript_match(
                transcript_list_obj.find_generated_transcript, lang_preference_list)

        # English fallback, skipped when the caller already asked for an
        # English variant.
        english_fallbacks = ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN']
        already_tried_english = any(
            lang.lower().startswith('en') for lang in lang_preference_list)
        if transcript_found is None and not already_tried_english:
            try:
                transcript_found = transcript_list_obj.find_transcript(english_fallbacks)
            except NoTranscriptFound:
                try:
                    transcript_found = transcript_list_obj.find_generated_transcript(
                        english_fallbacks)
                except NoTranscriptFound:
                    pass  # No English transcript either; fall through to "any language".

        # Last resort: whatever transcript the list yields first.
        if transcript_found is None:
            print(f"Notice: No transcript found in preferred languages or English for video ID '{video_id}'. "
                  "Attempting to fetch the first available original language transcript.")
            transcript_found = next(iter(transcript_list_obj), None)
            if transcript_found is None:
                return (f"Error: No transcripts at all seem to be available for video ID '{video_id}'. "
                        f"Checked preferred: {lang_preference_list}, English fallbacks, and any original language.")

        full_transcript_data = transcript_found.fetch()
        return " ".join(segment.text for segment in full_transcript_data)
    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    except NoTranscriptFound:
        return f"Error: No transcripts whatsoever found for video ID '{video_id}'. The video might not have any captions initially."
    except Exception as e:
        # Boundary handler: this function's contract is to return an error
        # string to the UI/MCP client rather than propagate exceptions.
        error_type = type(e).__name__
        if "VideoUnavailable" in error_type:
            return f"Error: Video '{video_id}' is unavailable. It might be private, deleted, or geo-restricted."
        # Compare against lower-case needles: the message is lower-cased first,
        # so a mixed-case needle such as "video ID" could never match.
        message_lower = str(e).lower()
        if "video id" in message_lower or "parameter" in message_lower:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or other parameter issue. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"
def gradio_mcp_handler(video_url_or_id: str, lang_preference_str: str):
    """
    MCP tool handler to retrieve YouTube video transcript using youtube_transcript_api.

    Args:
        video_url_or_id (str): The YouTube video URL or its 11-character ID.
        lang_preference_str (str): A comma-separated string of preferred language codes for the transcript
            (e.g., "en,en-US,es"). Defaults to "en" if empty or invalid.

    Returns:
        str: The fetched transcript or an error message.
    """
    # Treat a missing (None) argument like an empty one instead of crashing on
    # .strip(), and forward the stripped value so pasted input with stray
    # whitespace is not rejected downstream.
    cleaned_input = (video_url_or_id or "").strip()
    if not cleaned_input:
        return "Error: 'video_url_or_id' argument cannot be empty."

    # Empty, None, or separator-only input (e.g. ",") all fall back to English.
    lang_list = [code.strip()
                 for code in (lang_preference_str or "").split(',') if code.strip()]
    if not lang_list:
        lang_list = ['en']

    return get_youtube_video_transcript_scraper(cleaned_input, lang_list)
# Gradio components for the web UI; the same handler is exposed as an MCP tool
# when the app is launched with mcp_server=True.
inputs = [
    gr.Textbox(
        label="YouTube Video URL or ID",
        placeholder="e.g., https://www.youtube.com/watch?v=VIDEO_ID or VIDEO_ID",
    ),
    gr.Textbox(
        label="Preferred Language Codes (comma-separated)",
        value="en,en-US",
        placeholder="e.g., en,es,fr (default: en)",
    ),
]

outputs = gr.Textbox(
    label="Transcript Output",
    lines=15,
    show_copy_button=True,
)

demo = gr.Interface(
    fn=gradio_mcp_handler,
    inputs=inputs,
    outputs=outputs,
    title="YouTube Transcript Retriever (youtube-transcript-api)",
    description=(
        "Enter YouTube video URL/ID and comma-separated language codes to fetch transcript using 'youtube-transcript-api'. "
        "MCP argument descriptions from handler's docstring."
    ),
    # `allow_flagging` is deprecated in Gradio 5 (the major version that
    # provides mcp_server=True); `flagging_mode` is its replacement.
    flagging_mode='never',
    examples=[
        ["https://www.youtube.com/watch?v=Sd6F2pfKJmk", "en"],
        ["Sd6F2pfKJmk", "en,ja"],
        ["https://www.youtube.com/watch?v=rokGy0huYEA", "ja,en"],
    ],
    article=(
        "**How to Use:**\n"
        "1. Paste YouTube video URL or 11-character video ID.\n"
        "2. Enter comma-separated language codes (e.g., `en-GB,en,es`). Defaults to `en` if empty.\n"
        "3. Click 'Submit'.\n\n"
        "**MCP Server Information:**\n"
        "Launched with `mcp_server=True`, exposes an MCP tool.\n"
        "- Tool arguments `video_url_or_id` (str) and `lang_preference_str` (str) are defined in handler docstring.\n"
        "- Schema: `/gradio_api/mcp/schema`. Endpoint: `/gradio_api/mcp/sse`."
    ),
)
if __name__ == '__main__':
    # Print the startup banner, then serve the UI with the MCP endpoint enabled.
    startup_banner = (
        "Gradio app starting with your specified youtube-transcript-api methods...",
        "MCP Server integration enabled (mcp_server=True).",
        "Ensure 'gradio[mcp]' and 'youtube-transcript-api' are installed.",
    )
    for banner_line in startup_banner:
        print(banner_line)
    demo.launch(mcp_server=True)