# File size: 6,075 Bytes
# 666750d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import re
import gradio as gr
from youtube_transcript_api._api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound


def _extract_video_id(youtube_url: str) -> str | None:
    """
    Extracts the YouTube video ID from a URL.
    Handles standard, shortened, and embed URLs.
    """
    # Standard URL: https://www.youtube.com/watch?v=VIDEO_ID
    match = re.search(r"watch\?v=([^&]+)", youtube_url)
    if match:
        return match.group(1)

    # Shortened URL: https://youtu.be/VIDEO_ID
    match = re.search(r"youtu\.be/([^?&]+)", youtube_url)
    if match:
        return match.group(1)

    # Embed URL: https://www.youtube.com/embed/VIDEO_ID
    match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url)
    if match:
        return match.group(1)

    # Video ID directly passed
    # Basic check for a valid video ID format
    if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
        return youtube_url

    return None


def get_youtube_video_transcript(video_url_or_id: str, lang_preference: list[str] | None = None) -> str:
    """
    Retrieves the transcript for a given YouTube video URL or video ID.

    Transcript selection falls back through these stages until one succeeds:
      1. ``find_transcript`` for each preferred language, in order.
      2. ``find_generated_transcript`` for each preferred language, in order.
      3. ``find_transcript`` for a fixed list of English variants.
      4. ``find_generated_transcript`` for the same English variants.
      5. The first transcript yielded by iterating the transcript list.

    Args:
        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
                               or just the 11-character video ID.
        lang_preference (list[str] | None): A list of language codes to try for the transcript, in order
                                            of preference. Defaults to ['en', 'en-US', 'en-GB'] when None.

    Returns:
        str: The concatenated transcript text if successful.
             An error message string (starting with "Error:") if the transcript cannot be fetched
             (e.g., disabled, not found, invalid ID). This function does not raise for API failures;
             they are reported through the returned string.
    """
    # Resolve the default here rather than in the signature so that no mutable
    # list object is shared between calls.
    if lang_preference is None:
        lang_preference = ['en', 'en-US', 'en-GB']

    english_codes = ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN']

    video_id = _extract_video_id(video_url_or_id)

    if not video_id:
        return f"Error: Invalid YouTube video URL or ID provided: '{video_url_or_id}'. Could not extract a valid video ID."

    try:
        # Fetch available transcripts
        transcript_list = YouTubeTranscriptApi().list(video_id)

        # Stage 1: try to find a transcript in the preferred languages
        transcript = None
        for lang_code in lang_preference:
            try:
                transcript = transcript_list.find_transcript([lang_code])
                break
            except NoTranscriptFound:
                continue

        # Stage 2: try a generated transcript in the preferred languages
        if transcript is None:
            for lang_code in lang_preference:
                try:
                    transcript = transcript_list.find_generated_transcript([lang_code])
                    break
                except NoTranscriptFound:
                    continue

        # Stage 3: try any available English transcript
        if transcript is None:
            try:
                transcript = transcript_list.find_transcript(english_codes)
            except NoTranscriptFound:
                pass  # Continue to try any generated English transcript

        # Stage 4 / 5: generated English transcript, then whatever comes first
        if transcript is None:
            try:
                transcript = transcript_list.find_generated_transcript(english_codes)
            except NoTranscriptFound:
                # If no English transcript, grab the first available original language transcript
                try:
                    print(
                        f"YouTubeTool: No English transcript found for {video_id}. Trying first available original language.")
                    transcript = next(iter(transcript_list))  # get the first one
                except StopIteration:  # No transcripts at all
                    return f"Error: No transcripts at all seem to be available for video ID '{video_id}'."
                except NoTranscriptFound:  # Defensive; an empty list is expected to surface as StopIteration
                    return f"Error: No transcripts found for video ID '{video_id}' after trying preferred and English languages."

        if transcript is None:
            # This case should ideally be covered by the fallbacks above
            return f"Error: Could not find a suitable transcript for video ID '{video_id}' in languages: {lang_preference} or English."

        full_transcript_data = transcript.fetch()
        # Concatenate all text segments into a single string
        return " ".join(segment.text for segment in full_transcript_data)

    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    # This might catch cases where video ID is valid but has zero transcripts at all.
    except NoTranscriptFound:
        return f"Error: No transcripts whatsoever could be found for video ID '{video_id}'. The video might not have any captions or transcripts."
    except Exception as e:
        # Top-level boundary: report any other failure as an error string
        # instead of letting it propagate to the caller.
        error_type = type(e).__name__
        # Heuristic for a bad ID. The message is lowercased, so the needles
        # must be lowercase too ("video ID" could never match).
        message = str(e).lower()
        if "video id" in message or "parameter" in message:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or the video is private/deleted. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"


def greet(name: str) -> str:
    """
    Fetch the transcript of a YouTube video.

    This is the function exposed through the Gradio interface; it forwards
    its input unchanged to ``get_youtube_video_transcript`` using that
    function's default language preference.

    Args:
        name (str): A YouTube video URL or a bare 11-character video ID.

    Returns:
        str: The transcript text, or an error message string if the
             transcript could not be retrieved.
    """
    return get_youtube_video_transcript(name)


# Single text-in / text-out interface around `greet`.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)

# Start the app; `mcp_server=True` is passed through to Gradio's launcher.
demo.launch(mcp_server=True)