kirbah's picture
Add initial implementation for YouTube transcript retrieval and interface setup
666750d
raw
history blame
6.08 kB
import re
import gradio as gr
from youtube_transcript_api._api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
def _extract_video_id(youtube_url: str) -> str | None:
"""
Extracts the YouTube video ID from a URL.
Handles standard, shortened, and embed URLs.
"""
# Standard URL: https://www.youtube.com/watch?v=VIDEO_ID
match = re.search(r"watch\?v=([^&]+)", youtube_url)
if match:
return match.group(1)
# Shortened URL: https://youtu.be/VIDEO_ID
match = re.search(r"youtu\.be/([^?&]+)", youtube_url)
if match:
return match.group(1)
# Embed URL: https://www.youtube.com/embed/VIDEO_ID
match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url)
if match:
return match.group(1)
# Video ID directly passed
# Basic check for a valid video ID format
if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
return youtube_url
return None
def _select_transcript(transcript_list, lang_preference: list[str], video_id: str):
    """
    Picks one transcript from ``transcript_list``, or returns None if it is empty.

    Lookup order:
      1. ``find_transcript`` for each preferred language, in order.
      2. ``find_generated_transcript`` for each preferred language, in order.
      3. ``find_transcript`` and then ``find_generated_transcript`` for a
         fixed set of English variants.
      4. The first transcript yielded by iterating ``transcript_list``.

    ``video_id`` is only used for the diagnostic message printed before step 4.
    """
    english_codes = ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN']
    finders = (
        transcript_list.find_transcript,
        transcript_list.find_generated_transcript,
    )

    # Steps 1-2: languages are tried one at a time so that the caller's
    # ordering decides which language wins.
    for finder in finders:
        for lang_code in lang_preference:
            try:
                return finder([lang_code])
            except NoTranscriptFound:
                continue

    # Step 3: any English variant.
    for finder in finders:
        try:
            return finder(english_codes)
        except NoTranscriptFound:
            continue

    # Step 4: no English transcript, so take whatever comes first (if anything).
    print(
        f"YouTubeTool: No English transcript found for {video_id}. Trying first available original language.")
    return next(iter(transcript_list), None)


def get_youtube_video_transcript(video_url_or_id: str, lang_preference: list[str] = ['en', 'en-US', 'en-GB']) -> str:
    """
    Retrieves the transcript for a given YouTube video URL or video ID.
    It tries to fetch the transcript in the preferred languages first (defaulting to English),
    then falls back to any English variant, then to the first transcript available.

    Args:
        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
            or just the 11-character video ID.
        lang_preference (list[str]): A list of language codes to try for the transcript, in order of preference.
            Defaults to ['en', 'en-US', 'en-GB']. The list is only read, never modified.

    Returns:
        str: The concatenated transcript text if successful.
            An error message string (starting with "Error:") if the transcript cannot be
            fetched (e.g., disabled, not found, invalid ID). This function does not raise
            for API failures; they are reported through the returned string.
    """
    video_id = _extract_video_id(video_url_or_id)
    if not video_id:
        return f"Error: Invalid YouTube video URL or ID provided: '{video_url_or_id}'. Could not extract a valid video ID."

    try:
        # Fetch available transcripts, then choose one via the fallback cascade.
        transcript_list = YouTubeTranscriptApi().list(video_id)
        transcript = _select_transcript(transcript_list, lang_preference, video_id)
        if transcript is None:
            return f"Error: No transcripts at all seem to be available for video ID '{video_id}'."

        # Concatenate all text segments into a single string.
        fetched_segments = transcript.fetch()
        return " ".join(segment.text for segment in fetched_segments)
    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    except NoTranscriptFound:
        # Raised outside the selection helper, e.g. while listing or fetching.
        return f"Error: No transcripts whatsoever could be found for video ID '{video_id}'. The video might not have any captions or transcripts."
    except Exception as e:
        # Boundary handler: this function reports every failure as a string.
        error_type = type(e).__name__
        message = str(e)
        lowered = message.lower()
        # Heuristic for a bad ID. The needle must be lowercase because it is
        # compared against the lowercased message.
        if "video id" in lowered or "parameter" in lowered:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or the video is private/deleted. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {message}"
def greet(name):
    """
    Returns the transcript of a YouTube video.

    Args:
        name: A YouTube video URL or video ID; passed unchanged to
            get_youtube_video_transcript.

    Returns:
        Whatever get_youtube_video_transcript returns for that input
        (transcript text or an error message string).
    """
    transcript_or_error = get_youtube_video_transcript(name)
    return transcript_or_error
# Single text-in / text-out interface around greet().
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)

# Start the app; mcp_server=True is forwarded to launch() as given.
demo.launch(mcp_server=True)