kirbah committed on
Commit
666750d
·
1 Parent(s): c5d96b1

Add initial implementation for YouTube transcript retrieval and interface setup

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +133 -0
  3. requirements.txt +2 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .gradio/**
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import gradio as gr
3
+ from youtube_transcript_api._api import YouTubeTranscriptApi
4
+ from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
5
+
6
+
7
+ def _extract_video_id(youtube_url: str) -> str | None:
8
+ """
9
+ Extracts the YouTube video ID from a URL.
10
+ Handles standard, shortened, and embed URLs.
11
+ """
12
+ # Standard URL: https://www.youtube.com/watch?v=VIDEO_ID
13
+ match = re.search(r"watch\?v=([^&]+)", youtube_url)
14
+ if match:
15
+ return match.group(1)
16
+
17
+ # Shortened URL: https://youtu.be/VIDEO_ID
18
+ match = re.search(r"youtu\.be/([^?&]+)", youtube_url)
19
+ if match:
20
+ return match.group(1)
21
+
22
+ # Embed URL: https://www.youtube.com/embed/VIDEO_ID
23
+ match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url)
24
+ if match:
25
+ return match.group(1)
26
+
27
+ # Video ID directly passed
28
+ # Basic check for a valid video ID format
29
+ if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
30
+ return youtube_url
31
+
32
+ return None
33
+
34
+
35
def _find_preferred_transcript(transcript_list, lang_preference):
    """
    Returns the first transcript matched by the language fallback chain, or None.

    Lookup order:
      1. ``find_transcript`` for each preferred language, one at a time.
      2. ``find_generated_transcript`` for each preferred language.
      3. ``find_transcript`` for a broad list of English variants.
      4. ``find_generated_transcript`` for the same English variants.
    """
    english_codes = ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN']

    attempts = [(transcript_list.find_transcript, [code])
                for code in lang_preference]
    attempts += [(transcript_list.find_generated_transcript, [code])
                 for code in lang_preference]
    attempts.append((transcript_list.find_transcript, english_codes))
    attempts.append((transcript_list.find_generated_transcript, english_codes))

    for finder, codes in attempts:
        try:
            return finder(codes)
        except NoTranscriptFound:
            continue
    return None


def get_youtube_video_transcript(video_url_or_id: str, lang_preference: list[str] | None = None) -> str:
    """
    Retrieves the transcript for a given YouTube video URL or video ID.
    It tries to fetch the transcript in the preferred languages first (defaulting to English),
    then any English variant, and finally the first transcript the video offers.

    Args:
        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
                               or just the 11-character video ID.
        lang_preference (list[str] | None): A list of language codes to try for the transcript, in order of
                                            preference. Defaults to ['en', 'en-US', 'en-GB'] when omitted or None.

    Returns:
        str: The concatenated transcript text if successful.
             An error message string (starting with "Error:") if the transcript cannot be fetched
             (e.g., disabled, not found, invalid ID). This function reports failures through its
             return value rather than by raising.
    """
    # Resolve the default here instead of using a mutable list as the default value.
    if lang_preference is None:
        lang_preference = ['en', 'en-US', 'en-GB']

    video_id = _extract_video_id(video_url_or_id)

    if not video_id:
        return f"Error: Invalid YouTube video URL or ID provided: '{video_url_or_id}'. Could not extract a valid video ID."

    try:
        # Fetch available transcripts
        transcript_list = YouTubeTranscriptApi().list(video_id)

        transcript = _find_preferred_transcript(transcript_list, lang_preference)

        if transcript is None:
            # No preferred/English transcript: fall back to whatever the video lists first.
            print(
                f"YouTubeTool: No English transcript found for {video_id}. Trying first available original language.")
            transcript = next(iter(transcript_list), None)
            if transcript is None:  # No transcripts at all
                return f"Error: No transcripts at all seem to be available for video ID '{video_id}'."

        full_transcript_data = transcript.fetch()
        # Concatenate all text segments into a single string
        return " ".join(segment.text for segment in full_transcript_data)

    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    # This might catch cases where video ID is valid but has zero transcripts at all.
    except NoTranscriptFound:
        return f"Error: No transcripts whatsoever could be found for video ID '{video_id}'. The video might not have any captions or transcripts."
    except Exception as e:
        # Top-level boundary: callers expect an error string, never an exception.
        error_type = type(e).__name__
        # Heuristic for bad ID. The message is lower-cased, so the needles must be
        # lower-case too (a mixed-case needle such as "video ID" can never match).
        message = str(e).lower()
        if "video id" in message or "parameter" in message:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or the video is private/deleted. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"
126
+
127
+
128
def greet(name):
    """
    Interface callback: forwards the submitted text to
    get_youtube_video_transcript and returns its result unchanged.

    Despite the parameter name, `name` is passed on as the video URL or
    video ID argument; the returned string is whatever that function
    produces (transcript text or an error message).
    """
    transcript_or_error = get_youtube_video_transcript(name)
    return transcript_or_error
130
+
131
+
132
# Single text input -> single text output UI wired to the `greet` callback.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)

# Start the app at import/run time; `mcp_server=True` is handed to Gradio's
# launch() (presumably to expose the callback over MCP as well, given the
# `gradio[mcp]` requirement; confirm against the Gradio docs).
demo.launch(mcp_server=True)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio[mcp]
2
+ youtube-transcript-api