Spaces:
Running
Running
File size: 6,075 Bytes
666750d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import re
import gradio as gr
from youtube_transcript_api._api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
def _extract_video_id(youtube_url: str) -> str | None:
"""
Extracts the YouTube video ID from a URL.
Handles standard, shortened, and embed URLs.
"""
# Standard URL: https://www.youtube.com/watch?v=VIDEO_ID
match = re.search(r"watch\?v=([^&]+)", youtube_url)
if match:
return match.group(1)
# Shortened URL: https://youtu.be/VIDEO_ID
match = re.search(r"youtu\.be/([^?&]+)", youtube_url)
if match:
return match.group(1)
# Embed URL: https://www.youtube.com/embed/VIDEO_ID
match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url)
if match:
return match.group(1)
# Video ID directly passed
# Basic check for a valid video ID format
if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
return youtube_url
return None
def _select_transcript(transcript_list, lang_preference, video_id):
    """
    Pick one transcript from ``transcript_list``.

    Search order:
      1. ``find_transcript`` for each preferred language, in order
      2. ``find_generated_transcript`` for each preferred language, in order
      3. ``find_transcript`` then ``find_generated_transcript`` for a fixed
         set of English variants
      4. the first transcript the list yields when iterated

    Returns:
        The chosen transcript object, or ``None`` if iterating
        ``transcript_list`` yields nothing.
    """
    finders = (
        transcript_list.find_transcript,
        transcript_list.find_generated_transcript,
    )

    for finder in finders:
        for lang_code in lang_preference:
            try:
                return finder([lang_code])
            except NoTranscriptFound:
                continue

    english_variants = ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN']
    for finder in finders:
        try:
            return finder(english_variants)
        except NoTranscriptFound:
            continue

    print(
        f"YouTubeTool: No English transcript found for {video_id}. Trying first available original language.")
    return next(iter(transcript_list), None)


def get_youtube_video_transcript(video_url_or_id: str, lang_preference: list[str] | None = None) -> str:
    """
    Retrieves the transcript for a given YouTube video URL or video ID.
    It tries the preferred languages first (defaulting to English), then any
    English variant, then the first transcript available for the video.

    Args:
        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
                               or just the 11-character video ID.
        lang_preference (list[str] | None): Language codes to try, in order of preference.
                               ``None`` (the default) means ['en', 'en-US', 'en-GB'].
                               A single language code given as a plain string is also accepted.

    Returns:
        str: The transcript segments joined with single spaces if successful.
             Otherwise a message starting with "Error:" (invalid ID, transcripts
             disabled, none found, or an unexpected API failure). This function
             does not raise for those cases.
    """
    # Build the default per call rather than sharing one mutable list object
    # across every invocation.
    if lang_preference is None:
        lang_preference = ['en', 'en-US', 'en-GB']
    elif isinstance(lang_preference, str):
        # Iterating a bare string would try each character as a language code.
        lang_preference = [lang_preference]

    video_id = _extract_video_id(video_url_or_id)
    if not video_id:
        return f"Error: Invalid YouTube video URL or ID provided: '{video_url_or_id}'. Could not extract a valid video ID."

    try:
        transcript_list = YouTubeTranscriptApi().list(video_id)
        transcript = _select_transcript(transcript_list, lang_preference, video_id)
        if transcript is None:
            return f"Error: No transcripts at all seem to be available for video ID '{video_id}'."

        segments = transcript.fetch()
        return " ".join(segment.text for segment in segments)
    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    except NoTranscriptFound:
        return f"Error: No transcripts whatsoever could be found for video ID '{video_id}'. The video might not have any captions or transcripts."
    except Exception as e:
        # Top-level boundary: this function reports failures as strings, so
        # any other API/network error is converted to a message here.
        error_type = type(e).__name__
        message = str(e)
        lowered = message.lower()
        # Heuristic for a bad ID. The needles must be lowercase because they
        # are compared against the lowercased message.
        if "video id" in lowered or "parameter" in lowered:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or the video is private/deleted. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {message}"
def greet(name: str) -> str:
    """
    Fetch the transcript text of a YouTube video.

    This is the function wired into the Gradio interface, so its docstring
    and type hints are what describe the tool to clients.

    Args:
        name: A YouTube video URL or a bare 11-character video ID.

    Returns:
        The video transcript as a single string, or a message starting with
        "Error:" if the transcript could not be retrieved.
    """
    return get_youtube_video_transcript(name)
# Kept at module level so the interface object can be imported by name.
demo = gr.Interface(fn=greet, inputs="text", outputs="text")

# Only start the server when this file is run as a script; importing the
# module (e.g. from tests or another app) must not launch it as a side effect.
if __name__ == "__main__":
    demo.launch(mcp_server=True)
|