# SitegeistAI — modal_app.py
# Author: Alejandro Ardila
# First attempt (commit d0c87f7)
# Standard library
import json  # For handling JSON data
import os  # For environment variables
import re  # For extracting JSON objects out of free-form LLM output
from urllib.parse import urlparse  # For domain-based internal/external link classification

# Third-party
import modal
import openai  # For OpenAI API calls
import requests
import textstat  # For readability scores
from bs4 import BeautifulSoup  # For HTML parsing
# --- Configuration ---
# Define a Modal image with necessary Python packages.
# These are the remote-container dependencies; they mirror the imports above
# so the functions below can run inside Modal's sandbox.
modal_image = modal.Image.debian_slim().pip_install(
    "requests",
    "beautifulsoup4",
    "textstat",
    "lxml",  # A robust parser for BeautifulSoup
    "openai"  # OpenAI API library
)
# Define a Modal App. The name is important for lookup from Gradio
# (a frontend elsewhere resolves this app by name).
app = modal.App(name="sitegeist-ai-app")
# --- OpenAI LLM Function ---
def query_llm(prompt_text: str, expected_json_structure: dict) -> dict:
    """
    Call the OpenAI API and return a structured JSON response.

    Args:
        prompt_text (str): The prompt to send to OpenAI.
        expected_json_structure (dict): Dictionary whose keys guide the
            output format requested from the model.

    Returns:
        dict: Parsed JSON response from OpenAI, or an error dict with
        "error" and "status" keys when the call or parsing fails.
    """
    # Build the system prompt BEFORE the try block: the APIError fallback
    # below reuses it, and defining it here guarantees it is bound even if
    # the first API call fails early (the original code could NameError).
    expected_keys = list(expected_json_structure.keys())
    system_prompt = (
        "You are a helpful assistant designed to output JSON. "
        "Based on the user's content, provide a JSON object with the following keys: "
        f"{', '.join(expected_keys)}. "
        "Ensure your response is valid JSON format only."
    )
    result_json_str = None  # Kept for the JSONDecodeError handler's raw_response field.
    try:
        # API key is injected via the Modal secret attached to the caller.
        client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        print(f"--- OpenAI API CALLED ---\nPrompt: {prompt_text[:200]}...\nExpected JSON keys: {expected_keys}")
        # Make the API call with JSON mode enabled.
        response = client.chat.completions.create(
            model="gpt-4-turbo-preview",
            response_format={"type": "json_object"},  # Enable JSON mode
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt_text}
            ],
            temperature=0.3  # Lower temperature for more consistent structured output
        )
        result_json_str = response.choices[0].message.content
        print(f"OpenAI Response: {result_json_str[:200]}...")
        # Parse the JSON string into a Python dictionary.
        return json.loads(result_json_str)
    except openai.APIError as e:
        error_msg = f"OpenAI API error: {str(e)}"
        print(f"ERROR: {error_msg}")
        # Fallback: retry without JSON mode if this model doesn't support it.
        if "response_format" in str(e) and "not supported" in str(e):
            print("Falling back to non-JSON mode...")
            try:
                fallback_response = client.chat.completions.create(
                    model="gpt-4-turbo-preview",
                    messages=[
                        {"role": "system", "content": system_prompt + " Make sure to respond with valid JSON only."},
                        {"role": "user", "content": prompt_text}
                    ],
                    temperature=0.3
                )
                fallback_result_str = fallback_response.choices[0].message.content
                print(f"Fallback Response: {fallback_result_str[:200]}...")
                # Try to extract the JSON object in case the model wrapped
                # it in extra prose.
                json_match = re.search(r'\{.*\}', fallback_result_str, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group())
                return json.loads(fallback_result_str)
            except Exception as fallback_error:
                return {"error": f"Fallback also failed: {str(fallback_error)}", "status": "fallback_failed"}
        return {"error": error_msg, "status": "api_error"}
    except json.JSONDecodeError as e:
        error_msg = f"Failed to parse JSON response: {str(e)}"
        print(f"ERROR: {error_msg}")
        return {"error": error_msg, "status": "json_parse_error", "raw_response": result_json_str}
    except Exception as e:
        error_msg = f"Unexpected error in OpenAI call: {str(e)}"
        print(f"ERROR: {error_msg}")
        return {"error": error_msg, "status": "unexpected_error"}
# --- Deep Analysis Function (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def deep_analyze_url(url: str):
    """
    Perform a deep marketing and content analysis on a single URL.

    Combines scraping (requests + BeautifulSoup), statistical readability
    metrics (textstat), and an LLM-driven qualitative analysis (OpenAI).

    Returns:
        dict: {"url", "status", "analysis"} on (partial) success, or
        {"url", "status": "failed", "error"} when scraping fails.
    """
    print(f"Deep analyzing URL: {url}")
    scraped_data = {}
    text_content = ""
    # 1. Scraping
    try:
        response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, 'lxml')
        # Try to find main content, fall back to body, then the whole document.
        main_content_area = soup.find('article') or soup.find('main') or soup.body
        if main_content_area:
            text_content = main_content_area.get_text(separator=' ', strip=True)
        else:
            text_content = soup.get_text(separator=' ', strip=True)
        title_tag = soup.find('title')  # looked up once, not twice
        scraped_data["meta_title"] = title_tag.get_text(strip=True) if title_tag else "Not found"
        meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
        scraped_data["meta_description"] = meta_desc_tag['content'] if meta_desc_tag and 'content' in meta_desc_tag.attrs else "Not found"
        # Link counts: classify by domain rather than by substring match, so
        # same-domain absolute links (which don't literally contain the page
        # URL) are correctly counted as internal.
        page_domain = urlparse(url).netloc
        internal_links = 0
        external_links = 0
        for link in (a['href'] for a in soup.find_all('a', href=True)):
            if link.startswith('/') or urlparse(link).netloc == page_domain:
                internal_links += 1
            elif link.startswith('http'):
                external_links += 1
        scraped_data["internal_links"] = internal_links
        scraped_data["external_links"] = external_links
    except requests.exceptions.RequestException as e:
        return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
    except Exception as e:
        return {"url": url, "status": "failed", "error": f"Error during scraping/parsing: {str(e)}"}
    if not text_content:
        return {"url": url, "status": "failed", "error": "Could not extract text content."}
    # 2. Statistical & SEO Analysis (using textstat)
    try:
        word_count = textstat.lexicon_count(text_content)
        sentence_count = textstat.sentence_count(text_content)
        readability_metrics = {
            "flesch_reading_ease": textstat.flesch_reading_ease(text_content),
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text_content),
            # 200 wpm is a common average adult reading speed.
            "estimated_reading_time_minutes": round(word_count / 200, 2) if word_count > 0 else 0,
            "word_count": word_count,
            "sentence_count": sentence_count,
        }
    except Exception as e:
        readability_metrics = {"error": f"Readability analysis failed: {str(e)}"}
    # 3. LLM-Powered Qualitative Analysis (OpenAI)
    llm_prompt_for_deep_analysis = f"""
    Analyze the following web content from {url}. Extract the requested information and provide it in a JSON format.
    Content: "{text_content}"
    Please analyze this content and provide:
    - primary_keywords: List of 3-5 main keywords/topics
    - lsi_keywords: List of related semantic keywords (5-8 keywords)
    - sentiment: Object with "score" (Positive/Negative/Neutral) and "confidence" (0-1)
    - emotional_tone: List of emotional descriptors (2-4 items)
    - cta_analysis: Object with "has_cta" (boolean) and "cta_text" (string or null)
    - brand_mentions: List of brand names mentioned in the content
    """
    # This defines the structure we expect from the LLM for deep analysis.
    expected_llm_structure_deep = {
        "primary_keywords": [], "lsi_keywords": [], "sentiment": {},
        "emotional_tone": [], "cta_analysis": {}, "brand_mentions": []
    }
    llm_driven_analysis_result = query_llm(llm_prompt_for_deep_analysis, expected_llm_structure_deep)
    # 4. Combine and Return (built once; previously this dict was duplicated
    # across the success and partial-success branches).
    seo_metrics = {
        "meta_title": scraped_data.get("meta_title"),
        "meta_description": scraped_data.get("meta_description"),
        "internal_links": scraped_data.get("internal_links"),
        "external_links": scraped_data.get("external_links"),
    }
    # An "error" key in the LLM result marks a degraded (partial) analysis;
    # the error details are passed through under llm_driven_analysis.
    status = "partial_success" if "error" in llm_driven_analysis_result else "success"
    return {
        "url": url,
        "status": status,
        "analysis": {
            "readability_metrics": readability_metrics,
            "seo_metrics": seo_metrics,
            "llm_driven_analysis": llm_driven_analysis_result
        }
    }
# --- Swarm Analysis Functions (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def scrape_and_analyze(url: str, analysis_prompt_for_swarm: str):
    """
    Swarm helper: scrape one URL and run a *simple* LLM analysis on it.

    Returns a dict with "url", "status", and either "analysis" (on success)
    or "error" (on failure).
    """
    print(f"Swarm - analyzing single URL: {url}")
    try:
        page = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        page.raise_for_status()
        parsed = BeautifulSoup(page.content, 'lxml')
        # Prefer a semantic content container; fall back to the whole document.
        content_root = parsed.find('article') or parsed.find('main') or parsed.body
        if content_root is not None:
            body_text = content_root.get_text(separator=' ', strip=True)
        else:
            body_text = parsed.get_text(separator=' ', strip=True)
        if not body_text:
            return {"url": url, "status": "failed", "error": "No text content found"}
        # Keep the prompt short: only the first 1000 characters are sent.
        llm_prompt = f"""
    Content from {url}: {body_text[:1000]}
    {analysis_prompt_for_swarm}
    Please provide a concise summary of the main topic and key points from this content.
    """
        summary_result = query_llm(llm_prompt, {"summary": ""})  # Expecting a simple summary
        return {"url": url, "status": "success", "analysis": summary_result}
    except requests.exceptions.RequestException as e:
        return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
    except Exception as e:
        return {"url": url, "status": "failed", "error": f"Processing error: {str(e)}"}
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")], timeout=60000)  # Longer timeout for potentially many URLs
def swarm_analyze_urls(urls: list[str], analysis_prompt: str):
    """
    Scrape and analyze a list of URLs in parallel for swarm mode.

    Args:
        urls (list[str]): URLs to fan out over via Modal's .map().
        analysis_prompt (str): Per-URL instruction forwarded to each worker.

    Returns:
        dict: {"overall_summary", "top_themes", "individual_results"}.
    """
    print(f"Swarm analyzing {len(urls)} URLs. Prompt: {analysis_prompt}")
    # Use .map to run scrape_and_analyze in parallel for each URL; 'kwargs'
    # passes the static analysis_prompt to every mapped call. list() replaces
    # the original manual append loop.
    individual_results = list(
        scrape_and_analyze.map(urls, kwargs={"analysis_prompt_for_swarm": analysis_prompt})
    )
    # Collect only summaries from successful per-URL analyses.
    successful_summaries = [
        res["analysis"]["summary"]
        for res in individual_results
        if res["status"] == "success" and "analysis" in res and "summary" in res["analysis"]
    ]
    if not successful_summaries:
        return {
            "overall_summary": "No successful analyses to aggregate.",
            "top_themes": [],
            "individual_results": individual_results
        }
    # Aggregate results with a second LLM call. ("a comprehensive" fixes the
    # original prompt's grammar.)
    aggregation_prompt = f"""
    Synthesize these summaries into a comprehensive overview and identify the top themes:
    Summaries: {'. '.join(successful_summaries)}
    Please provide:
    - aggregated_summary: A comprehensive overview synthesizing all summaries
    - top_themes: List of 3-5 main themes that emerge across all content
    """
    aggregated_llm_result = query_llm(aggregation_prompt, {"aggregated_summary": "", "top_themes": []})
    return {
        "overall_summary": aggregated_llm_result.get("aggregated_summary"),
        "top_themes": aggregated_llm_result.get("top_themes"),
        "individual_results": individual_results
    }
# --- Local Stub for Testing (Optional) ---
# This allows you to test your Modal functions locally without deploying.
# To run: modal run modal_app.py
@app.local_entrypoint()
def main():
    """Smoke-test both Modal functions locally (run with: modal run modal_app.py)."""
    print("--- Testing deep_analyze_url ---")
    # A page that is expected to scrape cleanly.
    deep_target = "https://modal.com/docs/guide"  # Example URL
    print(json.dumps(deep_analyze_url.remote(deep_target), indent=2))

    print("\n--- Testing swarm_analyze_urls ---")
    # Mix of valid pages plus one broken URL to exercise failure handling.
    swarm_targets = [
        "https://modal.com/blog",
        "https://gantry.io/blog",
        "http://example.com/nonexistentpage"
    ]
    swarm_report = swarm_analyze_urls.remote(swarm_targets, "Provide a brief summary of the main topic.")
    print(json.dumps(swarm_report, indent=2))