Spaces:
Running
Running
Better descriptions
Browse files- app via API.py +239 -0
- app.py +154 -57
- requirements.txt +2 -1
app via API.py
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import gradio as gr
|
3 |
+
from googleapiclient.discovery import build
|
4 |
+
from googleapiclient.errors import HttpError
|
5 |
+
|
6 |
+
# Re-used function to extract video ID
|
7 |
+
|
8 |
+
|
9 |
+
def _extract_video_id(youtube_url: str) -> str | None:
|
10 |
+
"""
|
11 |
+
Extracts the YouTube video ID from a URL.
|
12 |
+
Handles standard, shortened, embed URLs, and direct ID.
|
13 |
+
"""
|
14 |
+
# Standard URL: https://www.youtube.com/watch?v=VIDEO_ID
|
15 |
+
match = re.search(r"watch\?v=([^&]+)", youtube_url)
|
16 |
+
if match:
|
17 |
+
return match.group(1)
|
18 |
+
|
19 |
+
# Shortened URL: https://youtu.be/VIDEO_ID
|
20 |
+
match = re.search(r"youtu\.be/([^?&]+)", youtube_url)
|
21 |
+
if match:
|
22 |
+
return match.group(1)
|
23 |
+
|
24 |
+
# Embed URL: https://www.youtube.com/embed/VIDEO_ID
|
25 |
+
match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url)
|
26 |
+
if match:
|
27 |
+
return match.group(1)
|
28 |
+
|
29 |
+
# Video ID directly passed
|
30 |
+
if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
|
31 |
+
return youtube_url
|
32 |
+
return None
|
33 |
+
|
34 |
+
|
35 |
+
def _parse_srt_to_text(srt_content: str) -> str:
|
36 |
+
"""
|
37 |
+
Parses SRT formatted string to extract plain text.
|
38 |
+
Removes timestamps, sequence numbers, and basic HTML formatting.
|
39 |
+
"""
|
40 |
+
text_lines = []
|
41 |
+
lines = srt_content.splitlines()
|
42 |
+
for line in lines:
|
43 |
+
if not line.strip() or line.strip().isdigit() or '-->' in line:
|
44 |
+
continue
|
45 |
+
line_text = re.sub(r'<[^>]+>', '', line)
|
46 |
+
text_lines.append(line_text.strip())
|
47 |
+
return " ".join(text_lines)
|
48 |
+
|
49 |
+
|
50 |
+
def get_youtube_transcript_official_api(video_url_or_id: str, api_key: str, target_language: str = 'en') -> str:
    """
    Retrieves the transcript for a YouTube video using the official YouTube Data API v3.
    This function is intended to be exposed as an MCP tool.

    The caption tracks of the video are listed, the track matching
    ``target_language`` is selected (falling back to 'en'), downloaded in
    SRT format and reduced to plain text. Failures are reported as strings,
    never raised.

    NOTE(review): the YouTube Data API reference lists OAuth 2.0
    authorization as a requirement for ``captions.download``; a plain API
    key is expected to end in the 401/403 branch below. Confirm against the
    current API documentation.

    Args:
        video_url_or_id (str): YouTube video URL or 11-character video ID.
        api_key (str): Your YouTube Data API v3 key.
        target_language (str): Preferred language code for the transcript (e.g., 'en', 'es'). Defaults to 'en'.

    Returns:
        str: The concatenated transcript text or an error message.
    """
    video_id = _extract_video_id(video_url_or_id)
    if not video_id:
        return f"Error: Invalid YouTube video URL or ID: '{video_url_or_id}'. Could not extract a valid video ID."

    if not api_key or not api_key.strip():
        return "Error: YouTube Data API Key is missing. Please provide a valid API key for the 'api_key' argument."

    # None or a blank code would crash on .lower() below; treat it as the default.
    target_language = (target_language or '').strip() or 'en'

    try:
        youtube = build('youtube', 'v3', developerKey=api_key)
    except Exception as e:
        return f"Error: Could not build YouTube API client. Check library installation. Details: {str(e)}"

    try:
        caption_request = youtube.captions().list(
            part="snippet",
            videoId=video_id
        )
        caption_response = caption_request.execute()
        caption_items = caption_response.get('items', [])

        caption_id_to_download = None
        found_lang_for_download = None
        available_langs_details = []

        # First pass: exact (case-insensitive) match on the requested language.
        for item in caption_items:
            lang_code = item['snippet']['language']
            lang_name = item['snippet'].get('name', 'N/A')
            track_kind = item['snippet'].get('trackKind', 'N/A')
            available_langs_details.append(
                f"{lang_code} (Name: '{lang_name}', Type: {track_kind})")

            if lang_code.lower() == target_language.lower():
                caption_id_to_download = item['id']
                found_lang_for_download = lang_code
                break

        # Second pass: English fallback when another language was requested.
        if not caption_id_to_download and target_language.lower() != 'en':
            for item in caption_items:
                lang_code = item['snippet']['language']
                if lang_code.lower() == 'en':
                    caption_id_to_download = item['id']
                    found_lang_for_download = lang_code
                    break

        if not caption_id_to_download:
            available_langs_str = "\n - ".join(
                available_langs_details) if available_langs_details else "None listed (captions might be disabled, non-existent, or API access restricted)"
            return (f"Error: No suitable caption track found for language '{target_language}' "
                    f"(or 'en' fallback) for video ID '{video_id}'.\n"
                    f"Available caption tracks:\n - {available_langs_str}")

        download_request = youtube.captions().download(
            id=caption_id_to_download,
            tfmt='srt'
        )
        srt_transcript = download_request.execute()
        # The download body may arrive as raw bytes; the SRT parser works on
        # text, so decode before parsing.
        if isinstance(srt_transcript, bytes):
            srt_transcript = srt_transcript.decode('utf-8', errors='replace')

        plain_text_transcript = _parse_srt_to_text(srt_transcript)

        if not plain_text_transcript.strip():
            return (f"Notice: Transcript for video ID '{video_id}' (Language: {found_lang_for_download}) "
                    "was downloaded but appears empty after parsing. The SRT file might be malformed or contain no text.")

        return plain_text_transcript

    except HttpError as e:
        error_content_bytes = e.content
        error_details = "No additional details in error content."
        if error_content_bytes:
            try:
                error_details = error_content_bytes.decode('utf-8')
            except UnicodeDecodeError:
                error_details = "Error content could not be decoded (non-UTF-8)."

        status_code = e.resp.status
        # Compare lowercase needles against lowercased details; a mixed-case
        # needle such as "quotaExceeded" could never match.
        error_details_lower = error_details.lower()

        if status_code == 403:
            if "quotaexceeded" in error_details_lower or "daily limit exceeded" in error_details_lower:
                return f"API Error (403): YouTube API quota exceeded. Details: {error_details}"
            return (f"API Error (403): Forbidden. Check API Key ('api_key'), YouTube Data API v3 enablement, or video owner restrictions for video_id='{video_id}'. Details: {error_details}")
        elif status_code == 404:
            return (f"API Error (404): Not Found. Video ID '{video_id}' ('video_url_or_id') might be incorrect, private/deleted, or caption track missing. Details: {error_details}")
        else:
            return f"API Error ({status_code}): An API error occurred while processing video_id='{video_id}'. Details: {error_details}"

    except Exception as e:
        # Tool boundary: report anything unexpected as text instead of raising.
        return f"Unexpected Error processing video_id='{video_id}': {type(e).__name__} - {str(e)}"
|
149 |
+
|
150 |
+
|
151 |
+
def gradio_interface_handler(video_url_or_id: str, api_key: str, language: str) -> str:
    """
    Fetch a YouTube video transcript through the official YouTube Data API v3.

    Validates the raw UI/MCP inputs, normalises the language code, and
    delegates to get_youtube_transcript_official_api.

    Args:
        video_url_or_id (str): The YouTube video URL or its 11-character ID.
        api_key (str): The YouTube Data API v3 key.
        language (str): The preferred ISO 639-1 language code for the transcript (e.g., 'en', 'es'). Defaults to 'en' when empty.

    Returns:
        str: The fetched transcript or an error message.
    """
    # Inputs may arrive as None (e.g. an omitted argument), so test for
    # falsiness before calling .strip().
    if not video_url_or_id or not video_url_or_id.strip():
        return "Error: YouTube Video URL or ID ('video_url_or_id') input is empty. Please provide a valid URL or ID."
    if not api_key or not api_key.strip():
        return "Error: YouTube API Key ('api_key') input is empty. Please provide your API key."

    language_to_use = (language or '').strip().lower() or 'en'

    # Pass trimmed values so pasted whitespace cannot break ID detection or
    # authentication.
    return get_youtube_transcript_official_api(video_url_or_id.strip(), api_key.strip(), language_to_use)
|
173 |
+
|
174 |
+
|
175 |
+
# Gradio input components.
# 'label' is shown in the UI and 'placeholder' provides a hint. The MCP tool
# argument descriptions come from the docstring of 'gradio_interface_handler'.
inputs = [
    gr.Textbox(
        label="YouTube Video URL or ID",
        placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ or dQw4w9WgXcQ"
    ),
    gr.Textbox(
        label="YouTube Data API Key",
        type="password",
        placeholder="Enter your API key (e.g., AIzaSy...)"
    ),
    gr.Textbox(
        label="Preferred Language Code",
        value="en",  # Default language
        placeholder="e.g., en, es, fr, de"
    )
]

# Gradio output component; 'label' is for the UI only.
outputs = gr.Textbox(
    label="Transcript Output",
    lines=15,
    show_copy_button=True
)

# Build the interface. 'demo' stays a module-level name so hosting
# environments that import this module can find the app.
demo = gr.Interface(
    fn=gradio_interface_handler,
    inputs=inputs,
    outputs=outputs,
    title="YouTube Video Transcript Retriever (MCP Enabled)",
    description=(
        "Enter a YouTube video URL/ID, your YouTube Data API Key, and a preferred language code "
        "to fetch the video transcript. This interface also exposes an MCP tool for programmatic access. "
        "The MCP tool's argument descriptions are generated from the function's docstring."
    ),
    # 'flagging_mode' replaces the deprecated 'allow_flagging' parameter.
    flagging_mode='never',
    examples=[
        ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", "YOUR_API_KEY_HERE", "en"],
        ["Mdcw3_s2T_s", "YOUR_API_KEY_HERE", "en"],
        ["https://www.youtube.com/watch?v=rokGy0huYEA", "YOUR_API_KEY_HERE", "ja"]
    ],
    article=(
        "**Using the Web Interface:**\n"
        "1. Obtain a [YouTube Data API v3 key](https://developers.google.com/youtube/v3/getting-started).\n"
        "2. Ensure the YouTube Data API v3 is enabled for your project in Google Cloud Console.\n"
        "3. Paste the video URL/ID, your API key, and desired language code into the respective fields.\n"
        "4. Click 'Submit' to retrieve the transcript.\n\n"
        "**MCP Server Information:**\n"
        "When launched with `mcp_server=True`, Gradio also starts an MCP server.\n"
        "- The tool schema (including argument descriptions from the function's docstring) can typically be found at `/gradio_api/mcp/schema`.\n"
        "- The MCP server endpoint is usually at `/gradio_api/mcp/sse`.\n"
        "This allows AI models and other MCP clients to use the transcript retrieval functionality programmatically."
    )
)

if __name__ == '__main__':
    print("Gradio app starting...")
    print("MCP Server integration is enabled via mcp_server=True.")
    print(
        "Ensure 'gradio[mcp]' is installed if you encounter issues related to MCP.")
    demo.launch(mcp_server=True)
|
app.py
CHANGED
@@ -25,109 +25,206 @@ def _extract_video_id(youtube_url: str) -> str | None:
|
|
25 |
return match.group(1)
|
26 |
|
27 |
# Video ID directly passed
|
28 |
-
# Basic check for a valid video ID format
|
29 |
if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
|
30 |
return youtube_url
|
31 |
-
|
32 |
return None
|
33 |
|
34 |
|
35 |
-
def
|
36 |
"""
|
37 |
-
Retrieves the transcript for a given YouTube video URL or video ID.
|
38 |
-
It tries to fetch the transcript in the preferred languages first
|
39 |
|
40 |
Args:
|
41 |
video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
|
42 |
or just the 11-character video ID.
|
43 |
-
|
44 |
-
|
45 |
|
46 |
Returns:
|
47 |
str: The concatenated transcript text if successful.
|
48 |
-
An error message string if the transcript cannot be fetched
|
49 |
"""
|
50 |
video_id = _extract_video_id(video_url_or_id)
|
51 |
|
52 |
if not video_id:
|
53 |
-
return f"Error: Invalid YouTube video URL or ID
|
|
|
|
|
|
|
54 |
|
55 |
try:
|
56 |
-
#
|
57 |
-
|
|
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
62 |
try:
|
63 |
-
|
|
|
|
|
64 |
break
|
65 |
except NoTranscriptFound:
|
66 |
continue
|
67 |
|
68 |
-
# If not found
|
69 |
-
if not
|
70 |
-
for lang_code in
|
71 |
try:
|
72 |
-
|
73 |
-
|
|
|
74 |
break
|
75 |
except NoTranscriptFound:
|
76 |
continue
|
77 |
|
78 |
-
#
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN'])
|
83 |
-
except NoTranscriptFound:
|
84 |
-
pass # Continue to try any generated English transcript
|
85 |
|
86 |
-
if not
|
|
|
87 |
try:
|
88 |
-
|
89 |
-
|
90 |
except NoTranscriptFound:
|
91 |
-
#
|
92 |
try:
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
transcript_text = " ".join([segment.text
|
107 |
for segment in full_transcript_data])
|
108 |
return transcript_text
|
109 |
else:
|
110 |
-
|
111 |
-
|
112 |
|
113 |
except TranscriptsDisabled:
|
114 |
return f"Error: Transcripts are disabled for video ID '{video_id}'."
|
115 |
-
# This
|
116 |
-
|
117 |
-
return f"Error: No transcripts whatsoever could be found for video ID '{video_id}'. The video might not have any captions or transcripts."
|
118 |
except Exception as e:
|
119 |
-
# Catch any other unexpected errors from the API or video ID issues not caught by regex
|
120 |
error_type = type(e).__name__
|
121 |
-
# Check for common youtube_transcript_api specific errors
|
122 |
-
#
|
|
|
|
|
123 |
if "video ID" in str(e).lower() or "parameter" in str(e).lower():
|
124 |
-
return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or
|
125 |
return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"
|
126 |
|
127 |
|
128 |
-
def
|
129 |
-
|
|
|
130 |
|
|
|
|
|
|
|
|
|
131 |
|
132 |
-
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
return match.group(1)
|
26 |
|
27 |
# Video ID directly passed
|
|
|
28 |
if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
|
29 |
return youtube_url
|
|
|
30 |
return None
|
31 |
|
32 |
|
33 |
+
def get_youtube_video_transcript_scraper(video_url_or_id: str, lang_preference_list: list[str]) -> str:
    """
    Retrieves the transcript for a given YouTube video URL or video ID using youtube_transcript_api.

    Selection order:
      1. The preferred languages, in the order given.
      2. English variants that were not already requested.
      3. The first transcript the video offers, whatever its language.

    Args:
        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
                               or just the 11-character video ID.
        lang_preference_list (list[str]): A list of language codes to try for the transcript, in order of preference.
                                          Example: ['en', 'en-US', 'es'].

    Returns:
        str: The concatenated transcript text if successful.
             An error message string if the transcript cannot be fetched.
    """
    video_id = _extract_video_id(video_url_or_id)

    if not video_id:
        return f"Error: Invalid YouTube video URL or ID: '{video_url_or_id}'. Could not extract a valid video ID."

    if not lang_preference_list:
        return "Error: Language preference list ('lang_preference_list') cannot be empty."

    try:
        api = YouTubeTranscriptApi()
        transcript_list_obj = api.list(video_id)  # TranscriptList object

        transcript_found = None

        # One call covers the whole preference list. Per the library docs,
        # find_transcript walks the codes in order and, for each code,
        # prefers a manually created transcript over a generated one, so a
        # separate find_generated_transcript pass would find nothing new.
        try:
            transcript_found = transcript_list_obj.find_transcript(
                list(lang_preference_list))
        except NoTranscriptFound:
            pass

        # English fallback: only retry the codes that were not requested,
        # so asking for 'en-GB' alone still falls back to plain 'en'.
        if transcript_found is None:
            already_requested = set(lang_preference_list)
            english_fallbacks = [
                code
                for code in ('en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN')
                if code not in already_requested
            ]
            if english_fallbacks:
                try:
                    transcript_found = transcript_list_obj.find_transcript(
                        english_fallbacks)
                except NoTranscriptFound:
                    pass  # No English transcript either

        # Last resort: the first transcript the video offers.
        if transcript_found is None:
            print(f"Notice: No transcript found in preferred languages or English for video ID '{video_id}'. "
                  "Attempting to fetch the first available original language transcript.")
            transcript_found = next(iter(transcript_list_obj), None)
            if transcript_found is None:
                return (f"Error: No transcripts at all seem to be available for video ID '{video_id}'. "
                        f"Checked preferred: {lang_preference_list}, English fallbacks, and any original language.")

        full_transcript_data = transcript_found.fetch()
        return " ".join(segment.text for segment in full_transcript_data)

    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    except NoTranscriptFound:
        return f"Error: No transcripts whatsoever found for video ID '{video_id}'. The video might not have any captions initially."
    except Exception as e:
        # Tool boundary: report anything unexpected as text instead of raising.
        error_type = type(e).__name__
        if "VideoUnavailable" in error_type:
            return f"Error: Video '{video_id}' is unavailable. It might be private, deleted, or geo-restricted."
        # Compare lowercase needles against the lowercased message; the
        # mixed-case needle "video ID" could never match.
        message_lower = str(e).lower()
        if "video id" in message_lower or "parameter" in message_lower:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or other parameter issue. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"
|
151 |
|
152 |
|
153 |
+
def gradio_mcp_handler(video_url_or_id: str, lang_preference_str: str) -> str:
    """
    MCP tool handler to retrieve YouTube video transcript using youtube_transcript_api.

    Args:
        video_url_or_id (str): The YouTube video URL or its 11-character ID.
        lang_preference_str (str): A comma-separated string of preferred language codes for the transcript
                                   (e.g., "en,en-US,es"). Defaults to "en" if empty or invalid.

    Returns:
        str: The fetched transcript or an error message.
    """
    # The value may arrive as None (e.g. an omitted argument), so test for
    # falsiness before calling .strip().
    if not video_url_or_id or not video_url_or_id.strip():
        return "Error: 'video_url_or_id' argument cannot be empty."

    # Split on commas and drop blank entries; inputs such as None, "" or ","
    # all collapse to the default ['en'].
    lang_list = [
        code.strip()
        for code in (lang_preference_str or "").split(',')
        if code.strip()
    ] or ['en']

    return get_youtube_video_transcript_scraper(video_url_or_id, lang_list)
|
178 |
+
|
179 |
+
|
180 |
+
# Gradio input components for the UI and the MCP tool.
inputs = [
    gr.Textbox(
        label="YouTube Video URL or ID",
        placeholder="e.g., https://www.youtube.com/watch?v=VIDEO_ID or VIDEO_ID"
    ),
    gr.Textbox(
        label="Preferred Language Codes (comma-separated)",
        value="en,en-US",
        placeholder="e.g., en,es,fr (default: en)"
    )
]

outputs = gr.Textbox(
    label="Transcript Output",
    lines=15,
    show_copy_button=True
)

# 'demo' stays a module-level name so hosting environments that import this
# module can find the app.
demo = gr.Interface(
    fn=gradio_mcp_handler,
    inputs=inputs,
    outputs=outputs,
    title="YouTube Transcript Retriever (youtube-transcript-api)",
    description=(
        "Enter YouTube video URL/ID and comma-separated language codes to fetch transcript using 'youtube-transcript-api'. "
        "MCP argument descriptions from handler's docstring."
    ),
    # 'flagging_mode' replaces the deprecated 'allow_flagging' parameter.
    flagging_mode='never',
    examples=[
        ["https://www.youtube.com/watch?v=Sd6F2pfKJmk", "en"],
        ["Sd6F2pfKJmk", "en,ja"],
        ["https://www.youtube.com/watch?v=rokGy0huYEA", "ja,en"]
    ],
    article=(
        "**How to Use:**\n"
        "1. Paste YouTube video URL or 11-character video ID.\n"
        "2. Enter comma-separated language codes (e.g., `en-GB,en,es`). Defaults to `en` if empty.\n"
        "3. Click 'Submit'.\n\n"
        "**MCP Server Information:**\n"
        "Launched with `mcp_server=True`, exposes an MCP tool.\n"
        "- Tool arguments `video_url_or_id` (str) and `lang_preference_str` (str) are defined in handler docstring.\n"
        "- Schema: `/gradio_api/mcp/schema`. Endpoint: `/gradio_api/mcp/sse`."
    )
)

if __name__ == '__main__':
    print("Gradio app starting with your specified youtube-transcript-api methods...")
    print("MCP Server integration enabled (mcp_server=True).")
    print("Ensure 'gradio[mcp]' and 'youtube-transcript-api' are installed.")
    demo.launch(mcp_server=True)
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
gradio[mcp]
|
2 |
-
youtube-transcript-api
|
|
|
|
1 |
gradio[mcp]
|
2 |
+
youtube-transcript-api
|
3 |
+
google-api-python-client
|