Spaces:

Agents-MCP-Hackathon
/

SitegeistAI

Running

App Files Files Community

Alejandro Ardila committed 1 day ago

Commit

d0c87f7

1 Parent(s): b1fe90c

First attempt

Browse files

Files changed (4) hide show

.gitignore +179 -0
app.py +98 -0
modal_app.py +319 -0
requirements.txt +7 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,179 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+# Temporary files
+*.tmp
+*.temp
+# API keys and secrets
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+secrets.json
+config.json
+*.key
+*.pem
+# Logs
+logs/
+*.log
+# Database
+*.db
+*.sqlite
+*.sqlite3
+# Mac specific
+.AppleDouble
+.LSOverride
+Icon

app.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import gradio as gr
+import modal
+import json
+# --- Configuration ---
+# Name of the deployed Modal app whose functions are looked up at request time.
+# It must match the name passed to modal.App(...) in modal_app.py.
+MODAL_APP_NAME = "sitegeist-ai-app"
+def analyze_web_content(urls_json: str, deep_analysis: bool = False, analysis_prompt: str = "Summarize the content and identify key themes."):
+    """
+    MCP Tool: Analyzes web content from one or more URLs.
+    Performs deep analysis with marketing metrics if a single URL is provided and deep_analysis is True.
+    Otherwise, performs a swarm analysis for multiple URLs or a single URL without deep_analysis.
+    Args:
+        urls_json (str): A JSON string representing a list of URLs. e.g., '["http://example.com", "http://another.com"]'
+        deep_analysis (bool): If True and only one URL is provided, performs an in-depth analysis.
+        analysis_prompt (str): The specific analysis to perform on the content.
+    """
+    print(f"Received request: deep_analysis={deep_analysis}, prompt='{analysis_prompt}', urls_json='{urls_json}'")
+    try:
+        urls = json.loads(urls_json)
+        if not isinstance(urls, list) or not all(isinstance(url, str) for url in urls):
+            raise ValueError("Input must be a JSON string of a list of URLs.")
+        if not urls:
+            return json.dumps({"status": "error", "message": "URL list cannot be empty."})
+    except json.JSONDecodeError:
+        return json.dumps({"status": "error", "message": "Invalid JSON format for URLs."})
+    except ValueError as ve:
+        return json.dumps({"status": "error", "message": str(ve)})
+    result = None
+    try:
+        if len(urls) == 1 and deep_analysis:
+            print(f"Calling Modal: deep_analyze_url for {urls[0]}")
+            # Lookup the Modal function
+            modal_deep_analyze = modal.Function.lookup(MODAL_APP_NAME, "deep_analyze_url")
+            if modal_deep_analyze is None:
+                 return json.dumps({"status": "error", "message": f"Could not find Modal function 'deep_analyze_url' in app '{MODAL_APP_NAME}'."})
+            # Call the Modal function remotely
+            result = modal_deep_analyze.remote(url=urls[0])
+        else:
+            print(f"Calling Modal: swarm_analyze_urls for {len(urls)} URLs")
+            # Lookup the Modal function
+            modal_swarm_analyze = modal.Function.lookup(MODAL_APP_NAME, "swarm_analyze_urls")
+            if modal_swarm_analyze is None:
+                return json.dumps({"status": "error", "message": f"Could not find Modal function 'swarm_analyze_urls' in app '{MODAL_APP_NAME}'."})
+            # Call the Modal function remotely
+            result = modal_swarm_analyze.remote(urls=urls, analysis_prompt=analysis_prompt)
+        return json.dumps(result, indent=2) # Return the result from Modal as a JSON string
+    except modal.exception.NotFoundError as e:
+         print(f"Modal function not found: {e}")
+         return json.dumps({"status": "error", "message": f"Modal function lookup failed. Ensure '{MODAL_APP_NAME}' is deployed and functions are correctly named. Details: {e}"})
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return json.dumps({"status": "error", "message": f"An unexpected error occurred: {str(e)}"})
+# --- Gradio Interface for Testing (if not using MCP directly) ---
+# This allows you to test the analyze_web_content function via a web UI.
+# You would typically expose `analyze_web_content` directly as an MCP tool.
+with gr.Blocks() as demo:
+    gr.Markdown("# Sitegeist AI: Marketing & Content Intelligence Engine")
+    gr.Markdown(
+        "Enter URLs as a JSON list (e.g., `[\"http://url1.com\", \"http://url2.com\"]`). "
+        "The Modal backend calls are mocked and will return predefined data."
+    )
+    with gr.Row():
+        urls_input = gr.Textbox(label="URLs (JSON list)", placeholder='["https://example.com"]')
+        deep_analysis_checkbox = gr.Checkbox(label="Perform Deep Analysis (for single URL)", value=False)
+    analysis_prompt_input = gr.Textbox(label="Analysis Prompt", value="Summarize the content.")
+    submit_button = gr.Button("Analyze Content")
+    output_json = gr.JSON(label="Analysis Result")
+    submit_button.click(
+        analyze_web_content,
+        inputs=[urls_input, deep_analysis_checkbox, analysis_prompt_input],
+        outputs=output_json
+    )
+if __name__ == "__main__":
+    # To run this Gradio app: python app.py
+    # Ensure your Modal token is configured (`modal token set`)
+    # And the Modal app (`modal_app.py`) is deployed (`modal deploy modal_app.py`)
+    # or runnable locally if you are testing `modal run modal_app.py` in another terminal.
+    # For the Gradio app to successfully call `modal.Function.lookup()`,
+    # it generally expects the Modal app to be deployed, or you need to be
+    # running within a `modal.stub.run()` context if calling local stubs
+    # from another Modal process, which is more advanced.
+    # The simplest way for this sketch is to deploy `modal_app.py` first.
+    print("Attempting to launch Gradio demo...")
+    print("REMINDER: For Gradio to connect to Modal functions,")
+    print(f"1. Deploy 'modal_app.py' using 'modal deploy modal_app.py'.")
+    print(f"2. Ensure your Modal token is set up.")
+    print(f"3. The MODAL_APP_NAME ('{MODAL_APP_NAME}') in app.py must match the app name in modal_app.py.")
+    demo.launch(mcp_server=True)

modal_app.py ADDED Viewed

	@@ -0,0 +1,319 @@

+import modal
+import requests
+import textstat # For readability scores
+from bs4 import BeautifulSoup # For HTML parsing
+import json # For handling JSON data
+import os # For environment variables
+import openai # For OpenAI API calls
+# --- Configuration ---
+# Define a Modal image with necessary Python packages
+modal_image = modal.Image.debian_slim().pip_install(
+    "requests",
+    "beautifulsoup4",
+    "textstat",
+    "lxml", # A robust parser for BeautifulSoup
+    "openai" # OpenAI API library
+)
+# Define a Modal App. The name is important for lookup from Gradio.
+app = modal.App(name="sitegeist-ai-app")
+# --- OpenAI LLM Function ---
+def query_llm(prompt_text: str, expected_json_structure: dict):
+    """
+    Calls the OpenAI API to get structured JSON responses.
+    Args:
+        prompt_text (str): The prompt to send to OpenAI
+        expected_json_structure (dict): Dictionary structure to guide the output format
+    Returns:
+        dict: Parsed JSON response from OpenAI
+    Raises:
+        Exception: If API call fails or response is not valid JSON
+    """
+    try:
+        # Initialize OpenAI client with API key from environment variable
+        client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+        # Create system prompt that instructs the model to return JSON with expected structure
+        expected_keys = list(expected_json_structure.keys())
+        system_prompt = (
+            "You are a helpful assistant designed to output JSON. "
+            "Based on the user's content, provide a JSON object with the following keys: "
+            f"{', '.join(expected_keys)}. "
+            "Ensure your response is valid JSON format only."
+        )
+        print(f"--- OpenAI API CALLED ---\nPrompt: {prompt_text[:200]}...\nExpected JSON keys: {expected_keys}")
+        # Make the API call with JSON mode enabled
+        response = client.chat.completions.create(
+            model="gpt-4-turbo-preview",
+            response_format={"type": "json_object"},  # Enable JSON mode
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": prompt_text}
+            ],
+            temperature=0.3  # Lower temperature for more consistent structured output
+        )
+        # Extract and parse the JSON response
+        result_json_str = response.choices[0].message.content
+        print(f"OpenAI Response: {result_json_str[:200]}...")
+        # Parse the JSON string into a Python dictionary
+        result_dict = json.loads(result_json_str)
+        return result_dict
+    except openai.APIError as e:
+        error_msg = f"OpenAI API error: {str(e)}"
+        print(f"ERROR: {error_msg}")
+        # Fallback: Try without JSON mode if the model doesn't support it
+        if "response_format" in str(e) and "not supported" in str(e):
+            print("Falling back to non-JSON mode...")
+            try:
+                fallback_response = client.chat.completions.create(
+                    model="gpt-4-turbo-preview",
+                    messages=[
+                        {"role": "system", "content": system_prompt + " Make sure to respond with valid JSON only."},
+                        {"role": "user", "content": prompt_text}
+                    ],
+                    temperature=0.3
+                )
+                fallback_result_str = fallback_response.choices[0].message.content
+                print(f"Fallback Response: {fallback_result_str[:200]}...")
+                # Try to extract JSON from the response (in case there's extra text)
+                import re
+                json_match = re.search(r'\{.*\}', fallback_result_str, re.DOTALL)
+                if json_match:
+                    json_str = json_match.group()
+                    return json.loads(json_str)
+                else:
+                    return json.loads(fallback_result_str)
+            except Exception as fallback_error:
+                return {"error": f"Fallback also failed: {str(fallback_error)}", "status": "fallback_failed"}
+        else:
+            return {"error": error_msg, "status": "api_error"}
+    except json.JSONDecodeError as e:
+        error_msg = f"Failed to parse JSON response: {str(e)}"
+        print(f"ERROR: {error_msg}")
+        return {"error": error_msg, "status": "json_parse_error", "raw_response": result_json_str}
+    except Exception as e:
+        error_msg = f"Unexpected error in OpenAI call: {str(e)}"
+        print(f"ERROR: {error_msg}")
+        return {"error": error_msg, "status": "unexpected_error"}
+# --- Deep Analysis Function (Runs on Modal) ---
+@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
+def deep_analyze_url(url: str):
+    """
+    Performs a deep marketing and content analysis on a single URL.
+    """
+    print(f"Deep analyzing URL: {url}")
+    scraped_data = {}
+    text_content = ""
+    # 1. Scraping
+    try:
+        response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
+        response.raise_for_status() # Raise an exception for HTTP errors
+        soup = BeautifulSoup(response.content, 'lxml')
+        # Try to find main content, fall back to body
+        main_content_area = soup.find('article') or soup.find('main') or soup.body
+        if main_content_area:
+            text_content = main_content_area.get_text(separator=' ', strip=True)
+        else:
+            text_content = soup.get_text(separator=' ', strip=True) # Fallback if no specific main area
+        scraped_data["meta_title"] = soup.find('title').get_text(strip=True) if soup.find('title') else "Not found"
+        meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
+        scraped_data["meta_description"] = meta_desc_tag['content'] if meta_desc_tag and 'content' in meta_desc_tag.attrs else "Not found"
+        # Link counts
+        all_links = [a['href'] for a in soup.find_all('a', href=True)]
+        scraped_data["internal_links"] = len([link for link in all_links if url in link or link.startswith('/')])
+        scraped_data["external_links"] = len([link for link in all_links if url not in link and link.startswith('http')])
+    except requests.exceptions.RequestException as e:
+        return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
+    except Exception as e:
+        return {"url": url, "status": "failed", "error": f"Error during scraping/parsing: {str(e)}"}
+    if not text_content:
+        return {"url": url, "status": "failed", "error": "Could not extract text content."}
+    # 2. Statistical & SEO Analysis (using textstat)
+    try:
+        word_count = textstat.lexicon_count(text_content)
+        sentence_count = textstat.sentence_count(text_content)
+        readability_metrics = {
+            "flesch_reading_ease": textstat.flesch_reading_ease(text_content),
+            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text_content),
+            "estimated_reading_time_minutes": round(word_count / 200, 2) if word_count > 0 else 0,
+            "word_count": word_count,
+            "sentence_count": sentence_count,
+        }
+    except Exception as e:
+        readability_metrics = {"error": f"Readability analysis failed: {str(e)}"}
+    # 3. LLM-Powered Qualitative Analysis (OpenAI)
+    llm_prompt_for_deep_analysis = f"""
+    Analyze the following web content from {url}. Extract the requested information and provide it in a JSON format.
+    Content: "{text_content}"
+    Please analyze this content and provide:
+    - primary_keywords: List of 3-5 main keywords/topics
+    - lsi_keywords: List of related semantic keywords (5-8 keywords)
+    - sentiment: Object with "score" (Positive/Negative/Neutral) and "confidence" (0-1)
+    - emotional_tone: List of emotional descriptors (2-4 items)
+    - cta_analysis: Object with "has_cta" (boolean) and "cta_text" (string or null)
+    - brand_mentions: List of brand names mentioned in the content
+    """
+    # This defines the structure we expect from the LLM for deep analysis
+    expected_llm_structure_deep = {
+        "primary_keywords": [], "lsi_keywords": [], "sentiment": {},
+        "emotional_tone": [], "cta_analysis": {}, "brand_mentions": []
+    }
+    llm_driven_analysis_result = query_llm(llm_prompt_for_deep_analysis, expected_llm_structure_deep)
+    # Check if there was an error in the LLM call
+    if "error" in llm_driven_analysis_result:
+        return {
+            "url": url,
+            "status": "partial_success",
+            "analysis": {
+                "readability_metrics": readability_metrics,
+                "seo_metrics": {
+                    "meta_title": scraped_data.get("meta_title"),
+                    "meta_description": scraped_data.get("meta_description"),
+                    "internal_links": scraped_data.get("internal_links"),
+                    "external_links": scraped_data.get("external_links"),
+                },
+                "llm_driven_analysis": llm_driven_analysis_result  # Will contain error info
+            }
+        }
+    # 4. Combine and Return
+    return {
+        "url": url,
+        "status": "success",
+        "analysis": {
+            "readability_metrics": readability_metrics,
+            "seo_metrics": { # Merging scraped SEO data here
+                "meta_title": scraped_data.get("meta_title"),
+                "meta_description": scraped_data.get("meta_description"),
+                "internal_links": scraped_data.get("internal_links"),
+                "external_links": scraped_data.get("external_links"),
+            },
+            "llm_driven_analysis": llm_driven_analysis_result
+        }
+    }
+# --- Swarm Analysis Functions (Runs on Modal) ---
+@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
+def scrape_and_analyze(url: str, analysis_prompt_for_swarm: str):
+    """
+    Helper function for swarm analysis: scrapes and performs a *simple* analysis on one URL.
+    """
+    print(f"Swarm - analyzing single URL: {url}")
+    try:
+        response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'lxml')
+        main_content_area = soup.find('article') or soup.find('main') or soup.body
+        text_content = main_content_area.get_text(separator=' ', strip=True) if main_content_area else soup.get_text(separator=' ', strip=True)
+        if not text_content:
+            return {"url": url, "status": "failed", "error": "No text content found"}
+        # OpenAI call for a simple summary
+        llm_prompt = f"""
+        Content from {url}: {text_content[:1000]}
+        {analysis_prompt_for_swarm}
+        Please provide a concise summary of the main topic and key points from this content.
+        """
+        summary_result = query_llm(llm_prompt, {"summary": ""}) # Expecting a simple summary
+        return {"url": url, "status": "success", "analysis": summary_result}
+    except requests.exceptions.RequestException as e:
+        return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
+    except Exception as e:
+        return {"url": url, "status": "failed", "error": f"Processing error: {str(e)}"}
+@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")], timeout=60000) # Longer timeout for potentially many URLs
+def swarm_analyze_urls(urls: list[str], analysis_prompt: str):
+    """
+    Scrapes and analyzes a list of URLs in parallel for swarm mode.
+    """
+    print(f"Swarm analyzing {len(urls)} URLs. Prompt: {analysis_prompt}")
+    individual_results = []
+    # Use .map to run scrape_and_analyze in parallel for each URL
+    # The 'kwargs' argument passes the analysis_prompt to each mapped function call
+    for result in scrape_and_analyze.map(urls, kwargs={"analysis_prompt_for_swarm": analysis_prompt}):
+        individual_results.append(result)
+    # Aggregate results (OpenAI call)
+    successful_summaries = [
+        res["analysis"]["summary"]
+        for res in individual_results
+        if res["status"] == "success" and "analysis" in res and "summary" in res["analysis"]
+    ]
+    if not successful_summaries:
+        return {
+            "overall_summary": "No successful analyses to aggregate.",
+            "top_themes": [],
+            "individual_results": individual_results
+        }
+    aggregation_prompt = f"""
+    Synthesize these summaries into an comprehensive overview and identify the top themes:
+    Summaries: {'. '.join(successful_summaries)}
+    Please provide:
+    - aggregated_summary: A comprehensive overview synthesizing all summaries
+    - top_themes: List of 3-5 main themes that emerge across all content
+    """
+    aggregated_llm_result = query_llm(aggregation_prompt, {"aggregated_summary": "", "top_themes": []})
+    return {
+        "overall_summary": aggregated_llm_result.get("aggregated_summary"),
+        "top_themes": aggregated_llm_result.get("top_themes"),
+        "individual_results": individual_results
+    }
+# --- Local Stub for Testing (Optional) ---
+# This allows you to test your Modal functions locally without deploying.
+# To run: modal run modal_app.py
+@app.local_entrypoint()
+def main():
+    print("--- Testing deep_analyze_url ---")
+    # Test with a known working URL for scraping
+    test_url_deep = "https://modal.com/docs/guide" # Example URL
+    deep_result = deep_analyze_url.remote(test_url_deep)
+    print(json.dumps(deep_result, indent=2))
+    print("\n--- Testing swarm_analyze_urls ---")
+    test_urls_swarm = [
+        "https://modal.com/blog",
+        "https://gantry.io/blog",
+        "http://example.com/nonexistentpage"
+    ]
+    swarm_result = swarm_analyze_urls.remote(test_urls_swarm, "Provide a brief summary of the main topic.")
+    print(json.dumps(swarm_result, indent=2))

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+modal
+requests
+gradio[mcp]
+textstat
+beautifulsoup4
+lxml
+openai