"""Sitegeist AI: Modal app for web-content marketing analysis.

Exposes two remote functions:

* ``deep_analyze_url``   -- full readability / SEO / LLM analysis of one page.
* ``swarm_analyze_urls`` -- parallel lightweight summaries of many pages,
  aggregated into an overall report.

Run a local smoke test with ``modal run modal_app.py``.
"""

import json  # For handling JSON data
import os    # For environment variables
import re    # For extracting JSON objects from free-form LLM output

import modal
import openai                   # For OpenAI API calls
import requests
import textstat                 # For readability scores
from bs4 import BeautifulSoup   # For HTML parsing

# --- Configuration ---

# Define a Modal image with necessary Python packages.
modal_image = modal.Image.debian_slim().pip_install(
    "requests",
    "beautifulsoup4",
    "textstat",
    "lxml",    # A robust parser for BeautifulSoup
    "openai",  # OpenAI API library
)

# Define a Modal App. The name is important for lookup from Gradio.
app = modal.App(name="sitegeist-ai-app")

# Single place to change the model used by every completion call.
OPENAI_MODEL = "gpt-4-turbo-preview"

# Cap on how much page text gets embedded in a deep-analysis prompt, so a
# very long page cannot overflow the model's context window.
MAX_PROMPT_CHARS = 12000


# --- OpenAI LLM Function ---
def query_llm(prompt_text: str, expected_json_structure: dict) -> dict:
    """Call the OpenAI API and return a structured JSON response.

    Args:
        prompt_text: The prompt to send to OpenAI.
        expected_json_structure: Dict whose keys guide the output format.

    Returns:
        The parsed JSON response as a dict.  On any failure an
        ``{"error": ..., "status": ...}`` dict is returned instead; this
        function never raises to its caller.
    """
    result_json_str = ""  # pre-bound so the JSON-decode handler can report it
    try:
        # API key is injected by the "openai-secret" Modal secret (env var).
        client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

        # System prompt steers the model toward the expected JSON shape.
        expected_keys = list(expected_json_structure.keys())
        system_prompt = (
            "You are a helpful assistant designed to output JSON. "
            "Based on the user's content, provide a JSON object with the following keys: "
            f"{', '.join(expected_keys)}. "
            "Ensure your response is valid JSON format only."
        )
        print(
            f"--- OpenAI API CALLED ---\nPrompt: {prompt_text[:200]}..."
            f"\nExpected JSON keys: {expected_keys}"
        )

        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            response_format={"type": "json_object"},  # Enable JSON mode
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt_text},
            ],
            temperature=0.3,  # Lower temperature for more consistent structured output
        )

        # Extract and parse the JSON response.
        result_json_str = response.choices[0].message.content
        print(f"OpenAI Response: {result_json_str[:200]}...")
        return json.loads(result_json_str)

    except openai.APIError as e:
        error_msg = f"OpenAI API error: {str(e)}"
        print(f"ERROR: {error_msg}")
        # Fallback: retry without JSON mode if this model doesn't support it.
        if "response_format" in str(e) and "not supported" in str(e):
            print("Falling back to non-JSON mode...")
            try:
                fallback_response = client.chat.completions.create(
                    model=OPENAI_MODEL,
                    messages=[
                        {
                            "role": "system",
                            "content": system_prompt + " Make sure to respond with valid JSON only.",
                        },
                        {"role": "user", "content": prompt_text},
                    ],
                    temperature=0.3,
                )
                fallback_result_str = fallback_response.choices[0].message.content
                print(f"Fallback Response: {fallback_result_str[:200]}...")
                # Extract the JSON object in case the model added extra text.
                json_match = re.search(r'\{.*\}', fallback_result_str, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group())
                return json.loads(fallback_result_str)
            except Exception as fallback_error:
                return {
                    "error": f"Fallback also failed: {str(fallback_error)}",
                    "status": "fallback_failed",
                }
        return {"error": error_msg, "status": "api_error"}

    except json.JSONDecodeError as e:
        error_msg = f"Failed to parse JSON response: {str(e)}"
        print(f"ERROR: {error_msg}")
        return {
            "error": error_msg,
            "status": "json_parse_error",
            "raw_response": result_json_str,
        }

    except Exception as e:
        error_msg = f"Unexpected error in OpenAI call: {str(e)}"
        print(f"ERROR: {error_msg}")
        return {"error": error_msg, "status": "unexpected_error"}


# --- Deep Analysis Function (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def deep_analyze_url(url: str) -> dict:
    """Perform a deep marketing and content analysis on a single URL.

    Returns a dict with ``status`` of ``"success"``, ``"partial_success"``
    (scraping worked but the LLM call failed) or ``"failed"``.
    """
    print(f"Deep analyzing URL: {url}")
    scraped_data = {}
    text_content = ""

    # 1. Scraping
    try:
        response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, 'lxml')

        # Prefer semantic containers for body text; fall back to <body>,
        # then to the whole document.
        main_content_area = soup.find('article') or soup.find('main') or soup.body
        if main_content_area:
            text_content = main_content_area.get_text(separator=' ', strip=True)
        else:
            text_content = soup.get_text(separator=' ', strip=True)

        title_tag = soup.find('title')
        scraped_data["meta_title"] = title_tag.get_text(strip=True) if title_tag else "Not found"
        meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
        scraped_data["meta_description"] = (
            meta_desc_tag['content']
            if meta_desc_tag and 'content' in meta_desc_tag.attrs
            else "Not found"
        )

        # Link counts: relative links (or links containing the page URL)
        # count as internal; other absolute http links count as external.
        all_links = [a['href'] for a in soup.find_all('a', href=True)]
        scraped_data["internal_links"] = len(
            [link for link in all_links if url in link or link.startswith('/')]
        )
        scraped_data["external_links"] = len(
            [link for link in all_links if url not in link and link.startswith('http')]
        )
    except requests.exceptions.RequestException as e:
        return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
    except Exception as e:
        return {"url": url, "status": "failed", "error": f"Error during scraping/parsing: {str(e)}"}

    if not text_content:
        return {"url": url, "status": "failed", "error": "Could not extract text content."}

    # 2. Statistical & SEO Analysis (using textstat)
    try:
        word_count = textstat.lexicon_count(text_content)
        readability_metrics = {
            "flesch_reading_ease": textstat.flesch_reading_ease(text_content),
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text_content),
            # ~200 words/minute is a common average adult reading speed.
            "estimated_reading_time_minutes": round(word_count / 200, 2) if word_count > 0 else 0,
            "word_count": word_count,
            "sentence_count": textstat.sentence_count(text_content),
        }
    except Exception as e:
        readability_metrics = {"error": f"Readability analysis failed: {str(e)}"}

    # 3. LLM-Powered Qualitative Analysis (OpenAI).  Page text is truncated
    # so an unusually long page cannot exceed the model's context window.
    llm_prompt_for_deep_analysis = f"""
    Analyze the following web content from {url}. Extract the requested information and provide it in a JSON format.

    Content: "{text_content[:MAX_PROMPT_CHARS]}"

    Please analyze this content and provide:
    - primary_keywords: List of 3-5 main keywords/topics
    - lsi_keywords: List of related semantic keywords (5-8 keywords)
    - sentiment: Object with "score" (Positive/Negative/Neutral) and "confidence" (0-1)
    - emotional_tone: List of emotional descriptors (2-4 items)
    - cta_analysis: Object with "has_cta" (boolean) and "cta_text" (string or null)
    - brand_mentions: List of brand names mentioned in the content
    """

    # This defines the structure we expect from the LLM for deep analysis.
    expected_llm_structure_deep = {
        "primary_keywords": [],
        "lsi_keywords": [],
        "sentiment": {},
        "emotional_tone": [],
        "cta_analysis": {},
        "brand_mentions": [],
    }
    llm_driven_analysis_result = query_llm(llm_prompt_for_deep_analysis, expected_llm_structure_deep)

    # 4. Combine and return.  An LLM failure downgrades the status to
    # "partial_success" but still ships the scraped/statistical metrics
    # (llm_driven_analysis then contains the error info).
    status = "partial_success" if "error" in llm_driven_analysis_result else "success"
    return {
        "url": url,
        "status": status,
        "analysis": {
            "readability_metrics": readability_metrics,
            "seo_metrics": {
                # Merging scraped SEO data here
                "meta_title": scraped_data.get("meta_title"),
                "meta_description": scraped_data.get("meta_description"),
                "internal_links": scraped_data.get("internal_links"),
                "external_links": scraped_data.get("external_links"),
            },
            "llm_driven_analysis": llm_driven_analysis_result,
        },
    }


# --- Swarm Analysis Functions (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def scrape_and_analyze(url: str, analysis_prompt_for_swarm: str) -> dict:
    """Swarm helper: scrape one URL and produce a *simple* LLM summary."""
    print(f"Swarm - analyzing single URL: {url}")
    try:
        response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        main_content_area = soup.find('article') or soup.find('main') or soup.body
        text_content = (
            main_content_area.get_text(separator=' ', strip=True)
            if main_content_area
            else soup.get_text(separator=' ', strip=True)
        )
        if not text_content:
            return {"url": url, "status": "failed", "error": "No text content found"}

        # Only the first 1000 characters are summarised to keep swarm calls cheap.
        llm_prompt = f"""
        Content from {url}: {text_content[:1000]}

        {analysis_prompt_for_swarm}

        Please provide a concise summary of the main topic and key points from this content.
        """
        summary_result = query_llm(llm_prompt, {"summary": ""})  # Expecting a simple summary
        return {"url": url, "status": "success", "analysis": summary_result}
    except requests.exceptions.RequestException as e:
        return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
    except Exception as e:
        return {"url": url, "status": "failed", "error": f"Processing error: {str(e)}"}


@app.function(
    image=modal_image,
    secrets=[modal.Secret.from_name("openai-secret")],
    timeout=60000,  # seconds -- generous, since many URLs may be processed
)
def swarm_analyze_urls(urls: list[str], analysis_prompt: str) -> dict:
    """Scrape and analyze a list of URLs in parallel for swarm mode."""
    print(f"Swarm analyzing {len(urls)} URLs. Prompt: {analysis_prompt}")

    # .map fans scrape_and_analyze out in parallel, one call per URL;
    # 'kwargs' passes the same analysis prompt to every mapped call.
    individual_results = list(
        scrape_and_analyze.map(urls, kwargs={"analysis_prompt_for_swarm": analysis_prompt})
    )

    # Collect the per-URL summaries that succeeded, for aggregation.
    successful_summaries = [
        res["analysis"]["summary"]
        for res in individual_results
        if res["status"] == "success" and "analysis" in res and "summary" in res["analysis"]
    ]
    if not successful_summaries:
        return {
            "overall_summary": "No successful analyses to aggregate.",
            "top_themes": [],
            "individual_results": individual_results,
        }

    # Aggregate results with one more OpenAI call.
    aggregation_prompt = f"""
    Synthesize these summaries into a comprehensive overview and identify the top themes:

    Summaries: {'. '.join(successful_summaries)}

    Please provide:
    - aggregated_summary: A comprehensive overview synthesizing all summaries
    - top_themes: List of 3-5 main themes that emerge across all content
    """
    aggregated_llm_result = query_llm(
        aggregation_prompt, {"aggregated_summary": "", "top_themes": []}
    )
    return {
        "overall_summary": aggregated_llm_result.get("aggregated_summary"),
        "top_themes": aggregated_llm_result.get("top_themes"),
        "individual_results": individual_results,
    }


# --- Local Stub for Testing (Optional) ---
# Lets you exercise the Modal functions without deploying.
# To run: modal run modal_app.py
@app.local_entrypoint()
def main():
    print("--- Testing deep_analyze_url ---")
    # Test with a known working URL for scraping.
    test_url_deep = "https://modal.com/docs/guide"  # Example URL
    deep_result = deep_analyze_url.remote(test_url_deep)
    print(json.dumps(deep_result, indent=2))

    print("\n--- Testing swarm_analyze_urls ---")
    test_urls_swarm = [
        "https://modal.com/blog",
        "https://gantry.io/blog",
        "http://example.com/nonexistentpage",  # deliberately failing URL
    ]
    swarm_result = swarm_analyze_urls.remote(
        test_urls_swarm, "Provide a brief summary of the main topic."
    )
    print(json.dumps(swarm_result, indent=2))