import modal
import requests
import textstat # For readability scores
from bs4 import BeautifulSoup # For HTML parsing
import json # For handling JSON data
import os # For environment variables
import openai # For OpenAI API calls

# --- Configuration ---
# Define a Modal image with necessary Python packages
modal_image = modal.Image.debian_slim().pip_install(
"requests",
"beautifulsoup4",
"textstat",
"lxml", # A robust parser for BeautifulSoup
"openai" # OpenAI API library
)

# Define a Modal App. The name is important for lookup from Gradio.
app = modal.App(name="sitegeist-ai-app")
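# The functions below read OPENAI_API_KEY from the environment; on Modal this is
# supplied by a secret named "openai-secret" (see the @app.function decorators).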

# --- OpenAI LLM Function ---
def query_llm(prompt_text: str, expected_json_structure: dict):
"""
Calls the OpenAI API to get structured JSON responses.
Args:
prompt_text (str): The prompt to send to OpenAI
expected_json_structure (dict): Dictionary structure to guide the output format
Returns:
dict: Parsed JSON response from OpenAI
Raises:
Exception: If API call fails or response is not valid JSON
"""
try:
# Initialize OpenAI client with API key from environment variable
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Create system prompt that instructs the model to return JSON with expected structure
expected_keys = list(expected_json_structure.keys())
system_prompt = (
"You are a helpful assistant designed to output JSON. "
"Based on the user's content, provide a JSON object with the following keys: "
f"{', '.join(expected_keys)}. "
"Ensure your response is valid JSON format only."
)
print(f"--- OpenAI API CALLED ---\nPrompt: {prompt_text[:200]}...\nExpected JSON keys: {expected_keys}")
# Make the API call with JSON mode enabled
response = client.chat.completions.create(
model="gpt-4-turbo-preview",
response_format={"type": "json_object"}, # Enable JSON mode
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt_text}
],
temperature=0.3 # Lower temperature for more consistent structured output
)
# Extract and parse the JSON response
result_json_str = response.choices[0].message.content
print(f"OpenAI Response: {result_json_str[:200]}...")
# Parse the JSON string into a Python dictionary
result_dict = json.loads(result_json_str)
return result_dict
except openai.APIError as e:
error_msg = f"OpenAI API error: {str(e)}"
print(f"ERROR: {error_msg}")
# Fallback: Try without JSON mode if the model doesn't support it
if "response_format" in str(e) and "not supported" in str(e):
print("Falling back to non-JSON mode...")
try:
fallback_response = client.chat.completions.create(
model="gpt-4-turbo-preview",
messages=[
{"role": "system", "content": system_prompt + " Make sure to respond with valid JSON only."},
{"role": "user", "content": prompt_text}
],
temperature=0.3
)
fallback_result_str = fallback_response.choices[0].message.content
print(f"Fallback Response: {fallback_result_str[:200]}...")
# Try to extract JSON from the response (in case there's extra text)
import re
json_match = re.search(r'\{.*\}', fallback_result_str, re.DOTALL)
if json_match:
json_str = json_match.group()
return json.loads(json_str)
else:
return json.loads(fallback_result_str)
except Exception as fallback_error:
return {"error": f"Fallback also failed: {str(fallback_error)}", "status": "fallback_failed"}
else:
return {"error": error_msg, "status": "api_error"}
except json.JSONDecodeError as e:
error_msg = f"Failed to parse JSON response: {str(e)}"
print(f"ERROR: {error_msg}")
return {"error": error_msg, "status": "json_parse_error", "raw_response": result_json_str}
except Exception as e:
error_msg = f"Unexpected error in OpenAI call: {str(e)}"
print(f"ERROR: {error_msg}")
return {"error": error_msg, "status": "unexpected_error"}

# --- Deep Analysis Function (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def deep_analyze_url(url: str):
"""
Performs a deep marketing and content analysis on a single URL.
"""
print(f"Deep analyzing URL: {url}")
scraped_data = {}
text_content = ""
# 1. Scraping
try:
response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
response.raise_for_status() # Raise an exception for HTTP errors
soup = BeautifulSoup(response.content, 'lxml')
# Try to find main content, fall back to body
main_content_area = soup.find('article') or soup.find('main') or soup.body
if main_content_area:
text_content = main_content_area.get_text(separator=' ', strip=True)
else:
text_content = soup.get_text(separator=' ', strip=True) # Fallback if no specific main area
scraped_data["meta_title"] = soup.find('title').get_text(strip=True) if soup.find('title') else "Not found"
meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
scraped_data["meta_description"] = meta_desc_tag['content'] if meta_desc_tag and 'content' in meta_desc_tag.attrs else "Not found"
# Link counts
all_links = [a['href'] for a in soup.find_all('a', href=True)]
scraped_data["internal_links"] = len([link for link in all_links if url in link or link.startswith('/')])
scraped_data["external_links"] = len([link for link in all_links if url not in link and link.startswith('http')])
except requests.exceptions.RequestException as e:
return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
except Exception as e:
return {"url": url, "status": "failed", "error": f"Error during scraping/parsing: {str(e)}"}
if not text_content:
return {"url": url, "status": "failed", "error": "Could not extract text content."}
# 2. Statistical & SEO Analysis (using textstat)
try:
word_count = textstat.lexicon_count(text_content)
sentence_count = textstat.sentence_count(text_content)
readability_metrics = {
"flesch_reading_ease": textstat.flesch_reading_ease(text_content),
"flesch_kincaid_grade": textstat.flesch_kincaid_grade(text_content),
"estimated_reading_time_minutes": round(word_count / 200, 2) if word_count > 0 else 0,
"word_count": word_count,
"sentence_count": sentence_count,
}
except Exception as e:
readability_metrics = {"error": f"Readability analysis failed: {str(e)}"}
# 3. LLM-Powered Qualitative Analysis (OpenAI)
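    # Note: this prompt embeds the full page text; very long pages may exceed the
    # model's context window, so truncating text_content (as the swarm helper below
    # does with text_content[:1000]) can be a sensible safeguard.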
llm_prompt_for_deep_analysis = f"""
Analyze the following web content from {url}. Extract the requested information and provide it in a JSON format.
Content: "{text_content}"
Please analyze this content and provide:
- primary_keywords: List of 3-5 main keywords/topics
- lsi_keywords: List of related semantic keywords (5-8 keywords)
- sentiment: Object with "score" (Positive/Negative/Neutral) and "confidence" (0-1)
- emotional_tone: List of emotional descriptors (2-4 items)
- cta_analysis: Object with "has_cta" (boolean) and "cta_text" (string or null)
- brand_mentions: List of brand names mentioned in the content
"""
# This defines the structure we expect from the LLM for deep analysis
expected_llm_structure_deep = {
"primary_keywords": [], "lsi_keywords": [], "sentiment": {},
"emotional_tone": [], "cta_analysis": {}, "brand_mentions": []
}
llm_driven_analysis_result = query_llm(llm_prompt_for_deep_analysis, expected_llm_structure_deep)
# Check if there was an error in the LLM call
if "error" in llm_driven_analysis_result:
return {
"url": url,
"status": "partial_success",
"analysis": {
"readability_metrics": readability_metrics,
"seo_metrics": {
"meta_title": scraped_data.get("meta_title"),
"meta_description": scraped_data.get("meta_description"),
"internal_links": scraped_data.get("internal_links"),
"external_links": scraped_data.get("external_links"),
},
"llm_driven_analysis": llm_driven_analysis_result # Will contain error info
}
}
# 4. Combine and Return
return {
"url": url,
"status": "success",
"analysis": {
"readability_metrics": readability_metrics,
"seo_metrics": { # Merging scraped SEO data here
"meta_title": scraped_data.get("meta_title"),
"meta_description": scraped_data.get("meta_description"),
"internal_links": scraped_data.get("internal_links"),
"external_links": scraped_data.get("external_links"),
},
"llm_driven_analysis": llm_driven_analysis_result
}
}

# --- Swarm Analysis Functions (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def scrape_and_analyze(url: str, analysis_prompt_for_swarm: str):
"""
Helper function for swarm analysis: scrapes and performs a *simple* analysis on one URL.
"""
print(f"Swarm - analyzing single URL: {url}")
try:
response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')
main_content_area = soup.find('article') or soup.find('main') or soup.body
text_content = main_content_area.get_text(separator=' ', strip=True) if main_content_area else soup.get_text(separator=' ', strip=True)
if not text_content:
return {"url": url, "status": "failed", "error": "No text content found"}
# OpenAI call for a simple summary
llm_prompt = f"""
Content from {url}: {text_content[:1000]}
{analysis_prompt_for_swarm}
Please provide a concise summary of the main topic and key points from this content.
"""
summary_result = query_llm(llm_prompt, {"summary": ""}) # Expecting a simple summary
return {"url": url, "status": "success", "analysis": summary_result}
except requests.exceptions.RequestException as e:
return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
except Exception as e:
return {"url": url, "status": "failed", "error": f"Processing error: {str(e)}"}

@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")], timeout=60000) # Longer timeout (in seconds) since many URLs may be processed
def swarm_analyze_urls(urls: list[str], analysis_prompt: str):
"""
Scrapes and analyzes a list of URLs in parallel for swarm mode.
"""
print(f"Swarm analyzing {len(urls)} URLs. Prompt: {analysis_prompt}")
individual_results = []
# Use .map to run scrape_and_analyze in parallel for each URL
# The 'kwargs' argument passes the analysis_prompt to each mapped function call
for result in scrape_and_analyze.map(urls, kwargs={"analysis_prompt_for_swarm": analysis_prompt}):
individual_results.append(result)
# Aggregate results (OpenAI call)
successful_summaries = [
res["analysis"]["summary"]
for res in individual_results
if res["status"] == "success" and "analysis" in res and "summary" in res["analysis"]
]
if not successful_summaries:
return {
"overall_summary": "No successful analyses to aggregate.",
"top_themes": [],
"individual_results": individual_results
}
aggregation_prompt = f"""
Synthesize these summaries into a comprehensive overview and identify the top themes:
Summaries: {'. '.join(successful_summaries)}
Please provide:
- aggregated_summary: A comprehensive overview synthesizing all summaries
- top_themes: List of 3-5 main themes that emerge across all content
"""
aggregated_llm_result = query_llm(aggregation_prompt, {"aggregated_summary": "", "top_themes": []})
return {
"overall_summary": aggregated_llm_result.get("aggregated_summary"),
"top_themes": aggregated_llm_result.get("top_themes"),
"individual_results": individual_results
}

# --- Local Entrypoint for Testing (Optional) ---
# This lets you test the Modal functions without deploying them first.
# To run: modal run modal_app.py
@app.local_entrypoint()
def main():
print("--- Testing deep_analyze_url ---")
# Test with a known working URL for scraping
test_url_deep = "https://modal.com/docs/guide" # Example URL
deep_result = deep_analyze_url.remote(test_url_deep)
print(json.dumps(deep_result, indent=2))
print("\n--- Testing swarm_analyze_urls ---")
test_urls_swarm = [
"https://modal.com/blog",
"https://gantry.io/blog",
"http://example.com/nonexistentpage"
]
swarm_result = swarm_analyze_urls.remote(test_urls_swarm, "Provide a brief summary of the main topic.")
    print(json.dumps(swarm_result, indent=2))
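
# To make these functions callable from the Gradio frontend, deploy the app:
#   modal deploy modal_app.py
# Illustrative sketch of looking up a deployed function by app name from the
# frontend (exact lookup API may vary by Modal SDK version):
#   deep_fn = modal.Function.from_name("sitegeist-ai-app", "deep_analyze_url")
#   result = deep_fn.remote("https://example.com")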