import modal
import requests
import textstat # For readability scores
from bs4 import BeautifulSoup # For HTML parsing
import json # For handling JSON data
import os # For environment variables
import openai # For OpenAI API calls

# --- Configuration ---
# Define a Modal image with necessary Python packages
modal_image = modal.Image.debian_slim().pip_install(
"requests",
"beautifulsoup4",
"textstat",
"lxml", # A robust parser for BeautifulSoup
"openai" # OpenAI API library
)

# Define a Modal App. The name is important for lookup from Gradio.
app = modal.App(name="sitegeist-ai-app")
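# The functions below read OPENAI_API_KEY from the environment; on Modal this is
# supplied by a secret named "openai-secret" (see the @app.function decorators).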

# --- OpenAI LLM Function ---
def query_llm(prompt_text: str, expected_json_structure: dict):
"""
Calls the OpenAI API to get structured JSON responses.
Args:
prompt_text (str): The prompt to send to OpenAI
expected_json_structure (dict): Dictionary structure to guide the output format
Returns:
dict: Parsed JSON response from OpenAI
Raises:
Exception: If API call fails or response is not valid JSON
"""
try:
# Initialize OpenAI client with API key from environment variable
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Create system prompt that instructs the model to return JSON with expected structure
expected_keys = list(expected_json_structure.keys())
system_prompt = (
"You are a helpful assistant designed to output JSON. "
"Based on the user's content, provide a JSON object with the following keys: "
f"{', '.join(expected_keys)}. "
"Ensure your response is valid JSON format only."
)
print(f"--- OpenAI API CALLED ---\nPrompt: {prompt_text[:200]}...\nExpected JSON keys: {expected_keys}")
# Make the API call with JSON mode enabled
response = client.chat.completions.create(
model="gpt-4-turbo-preview",
response_format={"type": "json_object"}, # Enable JSON mode
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt_text}
],
temperature=0.3 # Lower temperature for more consistent structured output
)
# Extract and parse the JSON response
result_json_str = response.choices[0].message.content
print(f"OpenAI Response: {result_json_str[:200]}...")
# Parse the JSON string into a Python dictionary
result_dict = json.loads(result_json_str)
return result_dict
except openai.APIError as e:
error_msg = f"OpenAI API error: {str(e)}"
print(f"ERROR: {error_msg}")
# Fallback: Try without JSON mode if the model doesn't support it
if "response_format" in str(e) and "not supported" in str(e):
print("Falling back to non-JSON mode...")
try:
fallback_response = client.chat.completions.create(
model="gpt-4-turbo-preview",
messages=[
{"role": "system", "content": system_prompt + " Make sure to respond with valid JSON only."},
{"role": "user", "content": prompt_text}
],
temperature=0.3
)
fallback_result_str = fallback_response.choices[0].message.content
print(f"Fallback Response: {fallback_result_str[:200]}...")
# Try to extract JSON from the response (in case there's extra text)
import re
json_match = re.search(r'\{.*\}', fallback_result_str, re.DOTALL)
if json_match:
json_str = json_match.group()
return json.loads(json_str)
else:
return json.loads(fallback_result_str)
except Exception as fallback_error:
return {"error": f"Fallback also failed: {str(fallback_error)}", "status": "fallback_failed"}
else:
return {"error": error_msg, "status": "api_error"}
except json.JSONDecodeError as e:
error_msg = f"Failed to parse JSON response: {str(e)}"
print(f"ERROR: {error_msg}")
return {"error": error_msg, "status": "json_parse_error", "raw_response": result_json_str}
except Exception as e:
error_msg = f"Unexpected error in OpenAI call: {str(e)}"
print(f"ERROR: {error_msg}")
return {"error": error_msg, "status": "unexpected_error"}

# --- Deep Analysis Function (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def deep_analyze_url(url: str):
"""
Performs a deep marketing and content analysis on a single URL.
"""
print(f"Deep analyzing URL: {url}")
scraped_data = {}
text_content = ""
# 1. Scraping
try:
response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
response.raise_for_status() # Raise an exception for HTTP errors
soup = BeautifulSoup(response.content, 'lxml')
# Try to find main content, fall back to body
main_content_area = soup.find('article') or soup.find('main') or soup.body
if main_content_area:
text_content = main_content_area.get_text(separator=' ', strip=True)
else:
text_content = soup.get_text(separator=' ', strip=True) # Fallback if no specific main area
scraped_data["meta_title"] = soup.find('title').get_text(strip=True) if soup.find('title') else "Not found"
meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
scraped_data["meta_description"] = meta_desc_tag['content'] if meta_desc_tag and 'content' in meta_desc_tag.attrs else "Not found"
# Link counts
all_links = [a['href'] for a in soup.find_all('a', href=True)]
scraped_data["internal_links"] = len([link for link in all_links if url in link or link.startswith('/')])
scraped_data["external_links"] = len([link for link in all_links if url not in link and link.startswith('http')])
except requests.exceptions.RequestException as e:
return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
except Exception as e:
return {"url": url, "status": "failed", "error": f"Error during scraping/parsing: {str(e)}"}
if not text_content:
return {"url": url, "status": "failed", "error": "Could not extract text content."}
# 2. Statistical & SEO Analysis (using textstat)
try:
word_count = textstat.lexicon_count(text_content)
sentence_count = textstat.sentence_count(text_content)
readability_metrics = {
"flesch_reading_ease": textstat.flesch_reading_ease(text_content),
"flesch_kincaid_grade": textstat.flesch_kincaid_grade(text_content),
"estimated_reading_time_minutes": round(word_count / 200, 2) if word_count > 0 else 0,
"word_count": word_count,
"sentence_count": sentence_count,
}
except Exception as e:
readability_metrics = {"error": f"Readability analysis failed: {str(e)}"}
# 3. LLM-Powered Qualitative Analysis (OpenAI)
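    # Note: this prompt embeds the full page text; very long pages may exceed the
    # model's context window, so truncating text_content (as the swarm helper below
    # does with text_content[:1000]) can be a sensible safeguard.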
llm_prompt_for_deep_analysis = f"""
Analyze the following web content from {url}. Extract the requested information and provide it in a JSON format.
Content: "{text_content}"
Please analyze this content and provide:
- primary_keywords: List of 3-5 main keywords/topics
- lsi_keywords: List of related semantic keywords (5-8 keywords)
- sentiment: Object with "score" (Positive/Negative/Neutral) and "confidence" (0-1)
- emotional_tone: List of emotional descriptors (2-4 items)
- cta_analysis: Object with "has_cta" (boolean) and "cta_text" (string or null)
- brand_mentions: List of brand names mentioned in the content
"""
# This defines the structure we expect from the LLM for deep analysis
expected_llm_structure_deep = {
"primary_keywords": [], "lsi_keywords": [], "sentiment": {},
"emotional_tone": [], "cta_analysis": {}, "brand_mentions": []
}
llm_driven_analysis_result = query_llm(llm_prompt_for_deep_analysis, expected_llm_structure_deep)
# Check if there was an error in the LLM call
if "error" in llm_driven_analysis_result:
return {
"url": url,
"status": "partial_success",
"analysis": {
"readability_metrics": readability_metrics,
"seo_metrics": {
"meta_title": scraped_data.get("meta_title"),
"meta_description": scraped_data.get("meta_description"),
"internal_links": scraped_data.get("internal_links"),
"external_links": scraped_data.get("external_links"),
},
"llm_driven_analysis": llm_driven_analysis_result # Will contain error info
}
}
# 4. Combine and Return
return {
"url": url,
"status": "success",
"analysis": {
"readability_metrics": readability_metrics,
"seo_metrics": { # Merging scraped SEO data here
"meta_title": scraped_data.get("meta_title"),
"meta_description": scraped_data.get("meta_description"),
"internal_links": scraped_data.get("internal_links"),
"external_links": scraped_data.get("external_links"),
},
"llm_driven_analysis": llm_driven_analysis_result
}
}

# --- Swarm Analysis Functions (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def scrape_and_analyze(url: str, analysis_prompt_for_swarm: str):
"""
Helper function for swarm analysis: scrapes and performs a *simple* analysis on one URL.
"""
print(f"Swarm - analyzing single URL: {url}")
try:
response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')
main_content_area = soup.find('article') or soup.find('main') or soup.body
text_content = main_content_area.get_text(separator=' ', strip=True) if main_content_area else soup.get_text(separator=' ', strip=True)
if not text_content:
return {"url": url, "status": "failed", "error": "No text content found"}
# OpenAI call for a simple summary
llm_prompt = f"""
Content from {url}: {text_content[:1000]}
{analysis_prompt_for_swarm}
Please provide a concise summary of the main topic and key points from this content.
"""
summary_result = query_llm(llm_prompt, {"summary": ""}) # Expecting a simple summary
return {"url": url, "status": "success", "analysis": summary_result}
except requests.exceptions.RequestException as e:
return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
except Exception as e:
return {"url": url, "status": "failed", "error": f"Processing error: {str(e)}"}

@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")], timeout=60000) # Longer timeout (in seconds) since many URLs may be processed
def swarm_analyze_urls(urls: list[str], analysis_prompt: str):
"""
Scrapes and analyzes a list of URLs in parallel for swarm mode.
"""
print(f"Swarm analyzing {len(urls)} URLs. Prompt: {analysis_prompt}")
individual_results = []
# Use .map to run scrape_and_analyze in parallel for each URL
# The 'kwargs' argument passes the analysis_prompt to each mapped function call
for result in scrape_and_analyze.map(urls, kwargs={"analysis_prompt_for_swarm": analysis_prompt}):
individual_results.append(result)
# Aggregate results (OpenAI call)
successful_summaries = [
res["analysis"]["summary"]
for res in individual_results
if res["status"] == "success" and "analysis" in res and "summary" in res["analysis"]
]
if not successful_summaries:
return {
"overall_summary": "No successful analyses to aggregate.",
"top_themes": [],
"individual_results": individual_results
}
aggregation_prompt = f"""
Synthesize these summaries into a comprehensive overview and identify the top themes:
Summaries: {'. '.join(successful_summaries)}
Please provide:
- aggregated_summary: A comprehensive overview synthesizing all summaries
- top_themes: List of 3-5 main themes that emerge across all content
"""
aggregated_llm_result = query_llm(aggregation_prompt, {"aggregated_summary": "", "top_themes": []})
return {
"overall_summary": aggregated_llm_result.get("aggregated_summary"),
"top_themes": aggregated_llm_result.get("top_themes"),
"individual_results": individual_results
}

# --- Local Entrypoint for Testing (Optional) ---
# This lets you test the Modal functions without deploying them first.
# To run: modal run modal_app.py
@app.local_entrypoint()
def main():
print("--- Testing deep_analyze_url ---")
# Test with a known working URL for scraping
test_url_deep = "https://modal.com/docs/guide" # Example URL
deep_result = deep_analyze_url.remote(test_url_deep)
print(json.dumps(deep_result, indent=2))
print("\n--- Testing swarm_analyze_urls ---")
test_urls_swarm = [
"https://modal.com/blog",
"https://gantry.io/blog",
"http://example.com/nonexistentpage"
]
swarm_result = swarm_analyze_urls.remote(test_urls_swarm, "Provide a brief summary of the main topic.")
    print(json.dumps(swarm_result, indent=2))
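
# To make these functions callable from the Gradio frontend, deploy the app:
#   modal deploy modal_app.py
# Illustrative sketch of looking up a deployed function by app name from the
# frontend (exact lookup API may vary by Modal SDK version):
#   deep_fn = modal.Function.from_name("sitegeist-ai-app", "deep_analyze_url")
#   result = deep_fn.remote("https://example.com")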