Spaces:

AnalysisWithMSR
/

SEO

Sleeping

App Files Files Community

SEO / app.py

AnalysisWithMSR

Update app.py

1d133a1 verified 8 months ago

raw

history blame

6.96 kB

	import googleapiclient.discovery
	import re
	import yt_dlp
	import whisper
	from pydub import AudioSegment
	import tempfile
	from transformers import pipeline
	from youtube_transcript_api import YouTubeTranscriptApi
	import torch
	import openai
	import json
	from urllib.parse import urlparse, parse_qs
	import os
	import gradio as gr

	# Credentials come from the environment so they never live in the source tree.
	youtube_api_key = os.environ.get("YOUTUBE_API_KEY")
	openai_api_key = os.environ.get("OPENAI_API_KEY")
	openai.api_key = openai_api_key

	# Fail fast at import time when a credential is missing or empty, rather than
	# on the first request that needs it.
	if not youtube_api_key:
	    raise ValueError("YOUTUBE_API_KEY is not set. Please set it as an environment variable.")
	if not openai_api_key:
	    raise ValueError("OPENAI_API_KEY is not set. Please set it as an environment variable.")

	# Utility Functions
	def extract_video_id(url):
	    """Extract the video ID from a YouTube URL.

	    Recognised shapes:
	      * ``watch?v=<id>`` on ``youtube.com`` or any of its subdomains,
	      * ``youtu.be/<id>`` short links (only the first path segment is used),
	      * path-based ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and
	        ``/v/<id>`` links (also on ``youtube-nocookie.com``),
	      * any of the above pasted without a scheme.

	    Args:
	        url: URL text as entered by the user.

	    Returns:
	        The video ID string, or ``None`` when ``url`` is not a string, is not
	        a recognised YouTube URL, or carries no ID.
	    """
	    if not isinstance(url, str):
	        return None
	    url = url.strip()
	    if not url:
	        return None

	    try:
	        parsed_url = urlparse(url)
	        if not parsed_url.netloc:
	            # Without a scheme urlparse files the host under ``path``; a
	            # leading "//" makes it parse as the network location instead.
	            parsed_url = urlparse("//" + url)
	        host = (parsed_url.hostname or "").lower()
	        query_params = parse_qs(parsed_url.query)
	    except ValueError as e:
	        # urlparse rejects malformed authorities (e.g. unbalanced "[").
	        print(f"Error parsing URL: {e}")
	        return None

	    segments = [part for part in parsed_url.path.split("/") if part]

	    # Compare whole host names (or dot-separated suffixes) so that look-alike
	    # domains such as "notyoutube.com" are not mistaken for YouTube.
	    if host == "youtu.be" or host.endswith(".youtu.be"):
	        return segments[0] if segments else None

	    youtube_hosts = ("youtube.com", "youtube-nocookie.com")
	    youtube_suffixes = (".youtube.com", ".youtube-nocookie.com")
	    if host in youtube_hosts or host.endswith(youtube_suffixes):
	        video_id = query_params.get('v', [None])[0]
	        if video_id:
	            return video_id
	        if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
	            return segments[1]
	    return None

	def get_video_duration(video_id):
	    """Fetch the video duration, in minutes, from the YouTube Data API.

	    The API reports ``contentDetails.duration`` as an ISO 8601 duration such
	    as ``PT4M13S`` or, for day-long videos, ``P1DT2H3M4S``. Week, day, hour,
	    minute and second components are all folded into a single minute count.

	    Args:
	        video_id: ID of the video to look up.

	    Returns:
	        The duration in minutes as a float, or ``None`` when the video is not
	        found, the duration string is not recognised, or the request fails.
	    """
	    try:
	        youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=youtube_api_key)
	        response = youtube.videos().list(part="contentDetails", id=video_id).execute()
	        items = response.get("items")
	        if not items:
	            return None

	        duration = items[0]["contentDetails"]["duration"]
	        # fullmatch so that a partially understood string is rejected instead
	        # of silently yielding a wrong (shorter) duration.
	        match = re.fullmatch(
	            r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
	            duration,
	        )
	        if match is None:
	            print(f"Error fetching duration: unrecognized duration format {duration!r}")
	            return None

	        weeks, days, hours, minutes, seconds = (
	            int(group) if group else 0 for group in match.groups()
	        )
	        total_hours = (weeks * 7 + days) * 24 + hours
	        return total_hours * 60 + minutes + seconds / 60
	    except Exception as e:
	        # Boundary with a remote service: any client/network failure is
	        # reported and turned into "duration unknown".
	        print(f"Error fetching duration: {e}")
	        return None

	def download_and_transcribe_with_whisper(youtube_url):
	    """Download a video's audio track and transcribe it with Whisper.

	    The audio is fetched with yt-dlp as MP3 into a temporary directory,
	    re-exported as WAV through pydub, and passed to the "large" Whisper model.
	    The temporary directory is removed when the function returns.

	    Args:
	        youtube_url: URL handed to yt-dlp for download.

	    Returns:
	        The transcribed text, or ``None`` if any step raises.
	    """
	    try:
	        with tempfile.TemporaryDirectory() as workdir:
	            mp3_path = os.path.join(workdir, "audio.mp3")
	            wav_path = os.path.join(workdir, "audio.wav")

	            extract_audio = {
	                'key': 'FFmpegExtractAudio',
	                'preferredcodec': 'mp3',
	                'preferredquality': '192',
	            }
	            downloader_options = {
	                'format': 'bestaudio/best',
	                'outtmpl': mp3_path,
	                'postprocessors': [extract_audio],
	            }
	            with yt_dlp.YoutubeDL(downloader_options) as downloader:
	                downloader.download([youtube_url])

	            # Convert the downloaded audio to WAV before transcription.
	            AudioSegment.from_file(mp3_path).export(wav_path, format="wav")

	            transcription = whisper.load_model("large").transcribe(wav_path)
	            return transcription['text']
	    except Exception as e:
	        print(f"Error during Whisper transcription: {e}")
	        return None

	def get_transcript_from_youtube_api(video_id, video_length):
	    """Fetch a transcript through YouTubeTranscriptApi.

	    The first transcript that is not auto-generated wins. If there is none,
	    the auto-generated English transcript is used, but only when
	    ``video_length`` is greater than 15.

	    Args:
	        video_id: ID of the video whose transcripts are listed.
	        video_length: Video length compared against the threshold of 15
	            (presumably minutes, matching get_video_duration; confirm).

	    Returns:
	        The transcript segments joined with single spaces, or ``None`` when no
	        transcript qualifies or the lookup raises.
	    """
	    def joined_text(entry):
	        # Flatten the fetched segments into one space-separated string.
	        return " ".join(segment['text'] for segment in entry.fetch())

	    try:
	        available = YouTubeTranscriptApi.list_transcripts(video_id)

	        manual = next((entry for entry in available if not entry.is_generated), None)
	        if manual is not None:
	            return joined_text(manual)

	        if video_length > 15:
	            return joined_text(available.find_generated_transcript(['en']))
	        return None
	    except Exception as e:
	        print(f"Error fetching transcript: {e}")
	        return None

	def get_transcript(youtube_url):
	    """Fetch a transcript for the video, falling back to Whisper.

	    The YouTube transcript API is tried first; when it yields nothing the
	    audio is downloaded and transcribed locally.

	    Failures are reported by printing the reason and returning ``None``, the
	    same convention the other helpers in this file use. Returning the error
	    message itself would make callers treat it as transcript text.

	    Args:
	        youtube_url: URL of the video to transcribe.

	    Returns:
	        The transcript text, or ``None`` when the URL is not recognised, the
	        video details cannot be fetched, or no transcript could be produced.
	    """
	    video_id = extract_video_id(youtube_url)
	    if not video_id:
	        print("Invalid or unsupported YouTube URL.")
	        return None

	    video_length = get_video_duration(video_id)
	    if not video_length:
	        print("Error fetching video details.")
	        return None

	    transcript = get_transcript_from_youtube_api(video_id, video_length)
	    return transcript or download_and_transcribe_with_whisper(youtube_url)

	def summarize_text(text):
	    """Summarize text using Hugging Face's BART model.

	    The text is cut into windows of 1024 characters that overlap by 100
	    characters; each window is summarized separately and the partial
	    summaries are joined with spaces.

	    Args:
	        text: The text to summarize.

	    Returns:
	        The combined summary, ``""`` when ``text`` is empty or whitespace
	        only, or ``None`` when summarization fails (including ``text`` not
	        being a string).
	    """
	    try:
	        if not text.strip():
	            # Nothing to summarize: skip loading the model altogether.
	            return ""

	        summarizer = pipeline(
	            "summarization",
	            model="facebook/bart-large-cnn",
	            device=0 if torch.cuda.is_available() else -1,
	        )
	        max_input_length = 1024
	        chunk_overlap = 100
	        stride = max_input_length - chunk_overlap

	        text_chunks = []
	        for start in range(0, len(text), stride):
	            text_chunks.append(text[start:start + max_input_length])
	            # Once a window reaches the end of the text, a further window
	            # would lie entirely inside the overlap and only duplicate it.
	            if start + max_input_length >= len(text):
	                break

	        summaries = [
	            # truncation=True guards against windows that tokenize to more
	            # tokens than the model accepts (the window size is in characters).
	            summarizer(
	                chunk,
	                max_length=100,
	                min_length=50,
	                do_sample=False,
	                truncation=True,
	            )[0]['summary_text']
	            for chunk in text_chunks
	        ]
	        return " ".join(summaries)
	    except Exception as e:
	        print(f"Error during summarization: {e}")
	        return None

	def generate_optimized_content(summarized_text):
	    """Generate optimized video metadata using GPT.

	    Sends the summary to the ``gpt-3.5-turbo`` chat model and asks for
	    keywords, a title, a description and tags as a JSON object.

	    Args:
	        summarized_text: Summary of the video transcript.

	    Returns:
	        The parsed JSON reply, or
	        ``{"error": "Unable to generate metadata."}`` when the summary is
	        empty, the request fails, or the reply holds no parseable JSON.
	    """
	    if not summarized_text or not str(summarized_text).strip():
	        # Without a summary there is nothing to analyse; skip the API call.
	        print("Error generating metadata: empty summary")
	        return {"error": "Unable to generate metadata."}

	    prompt = f"""
	Analyze the following summarized YouTube video transcript and:
	1. Extract the top 10 keywords.
	2. Generate an optimized title (less than 65 characters).
	3. Create an engaging description.
	4. Generate related tags for the video.

	Summarized Transcript:
	{summarized_text}

	Provide the results in the following JSON format:
	{{
	"keywords": ["keyword1", "keyword2", ..., "keyword10"],
	"title": "Generated Title",
	"description": "Generated Description",
	"tags": ["tag1", "tag2", ..., "tag10"]
	}}
	"""
	    try:
	        response = openai.ChatCompletion.create(
	            model="gpt-3.5-turbo",
	            messages=[
	                {"role": "system", "content": "You are an SEO expert."},
	                {"role": "user", "content": prompt}
	            ]
	        )
	        content = response['choices'][0]['message']['content']
	        try:
	            return json.loads(content)
	        except json.JSONDecodeError:
	            # The model may wrap the object in prose or code fences; retry
	            # on the outermost {...} span before giving up.
	            start = content.find("{")
	            end = content.rfind("}")
	            if start == -1 or end <= start:
	                raise
	            return json.loads(content[start:end + 1])
	    except Exception as e:
	        print(f"Error generating metadata: {e}")
	        return {"error": "Unable to generate metadata."}

	# Main Gradio Interface
	def process_video(youtube_url):
	    """Run the complete workflow: transcript, summary, then metadata.

	    Args:
	        youtube_url: URL entered in the Gradio text box.

	    Returns:
	        The dict produced by generate_optimized_content, or an
	        ``{"error": ...}`` dict when the transcript or the summary could not
	        be obtained.
	    """
	    transcript = get_transcript(youtube_url)
	    if not transcript:
	        return {"error": "Could not fetch the transcript. Please try another video."}

	    summary = summarize_text(transcript)
	    if not summary:
	        # summarize_text returns None on failure; do not forward that to the
	        # metadata prompt.
	        return {"error": "Could not summarize the transcript. Please try another video."}

	    return generate_optimized_content(summary)

	# Gradio front end: a single URL text box in, the result of process_video
	# rendered as JSON out.
	iface = gr.Interface(
	    title="YouTube Video SEO Optimizer",
	    description="Paste a YouTube URL to generate an SEO-friendly title, description, tags, and keywords.",
	    fn=process_video,
	    inputs=gr.Textbox(label="Enter YouTube URL"),
	    outputs=gr.JSON(label="Optimized Metadata"),
	)

	# Launch the interface only when this file is run directly, not on import.
	if __name__ == "__main__":
	    iface.launch()