import modal
import requests
import textstat  # For readability scores
from bs4 import BeautifulSoup  # For HTML parsing
import json  # For handling JSON data
import os  # For environment variables
import re  # For extracting JSON from free-form LLM responses (used in the fallback path)
import openai  # For OpenAI API calls

# --- Configuration ---
# Define a Modal image with necessary Python packages
modal_image = modal.Image.debian_slim().pip_install(
    "requests",
    "beautifulsoup4",
    "textstat",
    "lxml", # A robust parser for BeautifulSoup
    "openai" # OpenAI API library
)

# Define a Modal App. The name is important for lookup from Gradio.
app = modal.App(name="sitegeist-ai-app")
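
# Once deployed, the Gradio frontend can look these functions up by app and
# function name. A minimal sketch (the exact lookup call depends on your Modal
# client version; "deep_analyze_url" is defined below):
#   deep_fn = modal.Function.from_name("sitegeist-ai-app", "deep_analyze_url")
#   result = deep_fn.remote("https://example.com")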

# --- OpenAI LLM Function ---
def query_llm(prompt_text: str, expected_json_structure: dict):
    """
    Calls the OpenAI API to get structured JSON responses.
    
    Args:
        prompt_text (str): The prompt to send to OpenAI
        expected_json_structure (dict): Dictionary structure to guide the output format
        
    Returns:
        dict: Parsed JSON response from OpenAI
        
    Raises:
        Exception: If API call fails or response is not valid JSON
    """
    try:
        # Initialize OpenAI client with API key from environment variable
        client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        
        # Create system prompt that instructs the model to return JSON with expected structure
        expected_keys = list(expected_json_structure.keys())
        system_prompt = (
            "You are a helpful assistant designed to output JSON. "
            "Based on the user's content, provide a JSON object with the following keys: "
            f"{', '.join(expected_keys)}. "
            "Ensure your response is valid JSON format only."
        )
        
        print(f"--- OpenAI API CALLED ---\nPrompt: {prompt_text[:200]}...\nExpected JSON keys: {expected_keys}")
        
        # Make the API call with JSON mode enabled
        response = client.chat.completions.create(
            model="gpt-4-turbo-preview",
            response_format={"type": "json_object"},  # Enable JSON mode
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt_text}
            ],
            temperature=0.3  # Lower temperature for more consistent structured output
        )
        
        # Extract and parse the JSON response
        result_json_str = response.choices[0].message.content
        print(f"OpenAI Response: {result_json_str[:200]}...")
        
        # Parse the JSON string into a Python dictionary
        result_dict = json.loads(result_json_str)
        return result_dict
        
    except openai.APIError as e:
        error_msg = f"OpenAI API error: {str(e)}"
        print(f"ERROR: {error_msg}")
        
        # Fallback: Try without JSON mode if the model doesn't support it
        if "response_format" in str(e) and "not supported" in str(e):
            print("Falling back to non-JSON mode...")
            try:
                fallback_response = client.chat.completions.create(
                    model="gpt-4-turbo-preview",
                    messages=[
                        {"role": "system", "content": system_prompt + " Make sure to respond with valid JSON only."},
                        {"role": "user", "content": prompt_text}
                    ],
                    temperature=0.3
                )
                
                fallback_result_str = fallback_response.choices[0].message.content
                print(f"Fallback Response: {fallback_result_str[:200]}...")
                
                # Try to extract JSON from the response (in case there's extra text)
                json_match = re.search(r'\{.*\}', fallback_result_str, re.DOTALL)
                if json_match:
                    json_str = json_match.group()
                    return json.loads(json_str)
                else:
                    return json.loads(fallback_result_str)
                    
            except Exception as fallback_error:
                return {"error": f"Fallback also failed: {str(fallback_error)}", "status": "fallback_failed"}
        else:
            return {"error": error_msg, "status": "api_error"}
        
    except json.JSONDecodeError as e:
        error_msg = f"Failed to parse JSON response: {str(e)}"
        print(f"ERROR: {error_msg}")
        return {"error": error_msg, "status": "json_parse_error", "raw_response": result_json_str}
        
    except Exception as e:
        error_msg = f"Unexpected error in OpenAI call: {str(e)}"
        print(f"ERROR: {error_msg}")
        return {"error": error_msg, "status": "unexpected_error"}

# --- Deep Analysis Function (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def deep_analyze_url(url: str):
    """
    Performs a deep marketing and content analysis on a single URL.
    """
    print(f"Deep analyzing URL: {url}")
    scraped_data = {}
    text_content = ""

    # 1. Scraping
    try:
        response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status() # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, 'lxml')
        
        # Try to find main content, fall back to body
        main_content_area = soup.find('article') or soup.find('main') or soup.body
        if main_content_area:
            text_content = main_content_area.get_text(separator=' ', strip=True)
        else:
            text_content = soup.get_text(separator=' ', strip=True) # Fallback if no specific main area

        scraped_data["meta_title"] = soup.find('title').get_text(strip=True) if soup.find('title') else "Not found"
        meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
        scraped_data["meta_description"] = meta_desc_tag['content'] if meta_desc_tag and 'content' in meta_desc_tag.attrs else "Not found"
        
        # Link counts (heuristic: relative paths or links containing the page URL count as internal)
        all_links = [a['href'] for a in soup.find_all('a', href=True)]
        scraped_data["internal_links"] = len([link for link in all_links if url in link or link.startswith('/')])
        scraped_data["external_links"] = len([link for link in all_links if url not in link and link.startswith('http')])

    except requests.exceptions.RequestException as e:
        return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
    except Exception as e:
        return {"url": url, "status": "failed", "error": f"Error during scraping/parsing: {str(e)}"}

    if not text_content:
        return {"url": url, "status": "failed", "error": "Could not extract text content."}

    # 2. Statistical & SEO Analysis (using textstat)
    try:
        word_count = textstat.lexicon_count(text_content)
        sentence_count = textstat.sentence_count(text_content)
        readability_metrics = {
            "flesch_reading_ease": textstat.flesch_reading_ease(text_content),
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text_content),
            "estimated_reading_time_minutes": round(word_count / 200, 2) if word_count > 0 else 0,
            "word_count": word_count,
            "sentence_count": sentence_count,
        }
    except Exception as e:
        readability_metrics = {"error": f"Readability analysis failed: {str(e)}"}


    # 3. LLM-Powered Qualitative Analysis (OpenAI)
    llm_prompt_for_deep_analysis = f"""
    Analyze the following web content from {url}. Extract the requested information and provide it in a JSON format.
    
    Content: "{text_content}"
    
    Please analyze this content and provide:
    - primary_keywords: List of 3-5 main keywords/topics
    - lsi_keywords: List of related semantic keywords (5-8 keywords)
    - sentiment: Object with "score" (Positive/Negative/Neutral) and "confidence" (0-1)
    - emotional_tone: List of emotional descriptors (2-4 items)
    - cta_analysis: Object with "has_cta" (boolean) and "cta_text" (string or null)
    - brand_mentions: List of brand names mentioned in the content
    """
    
    # This defines the structure we expect from the LLM for deep analysis
    expected_llm_structure_deep = {
        "primary_keywords": [], "lsi_keywords": [], "sentiment": {},
        "emotional_tone": [], "cta_analysis": {}, "brand_mentions": []
    }
    llm_driven_analysis_result = query_llm(llm_prompt_for_deep_analysis, expected_llm_structure_deep)

    # Check if there was an error in the LLM call
    if "error" in llm_driven_analysis_result:
        return {
            "url": url,
            "status": "partial_success",
            "analysis": {
                "readability_metrics": readability_metrics,
                "seo_metrics": {
                    "meta_title": scraped_data.get("meta_title"),
                    "meta_description": scraped_data.get("meta_description"),
                    "internal_links": scraped_data.get("internal_links"),
                    "external_links": scraped_data.get("external_links"),
                },
                "llm_driven_analysis": llm_driven_analysis_result  # Will contain error info
            }
        }

    # 4. Combine and Return
    return {
        "url": url,
        "status": "success",
        "analysis": {
            "readability_metrics": readability_metrics,
            "seo_metrics": { # Merging scraped SEO data here
                "meta_title": scraped_data.get("meta_title"),
                "meta_description": scraped_data.get("meta_description"),
                "internal_links": scraped_data.get("internal_links"),
                "external_links": scraped_data.get("external_links"),
            },
            "llm_driven_analysis": llm_driven_analysis_result
        }
    }

# --- Swarm Analysis Functions (Runs on Modal) ---
@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")])
def scrape_and_analyze(url: str, analysis_prompt_for_swarm: str):
    """
    Helper function for swarm analysis: scrapes and performs a *simple* analysis on one URL.
    """
    print(f"Swarm - analyzing single URL: {url}")
    try:
        response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        main_content_area = soup.find('article') or soup.find('main') or soup.body
        text_content = main_content_area.get_text(separator=' ', strip=True) if main_content_area else soup.get_text(separator=' ', strip=True)

        if not text_content:
            return {"url": url, "status": "failed", "error": "No text content found"}

        # OpenAI call for a simple summary
        llm_prompt = f"""
        Content from {url}: {text_content[:1000]}
        
        {analysis_prompt_for_swarm}
        
        Please provide a concise summary of the main topic and key points from this content.
        """
        summary_result = query_llm(llm_prompt, {"summary": ""}) # Expecting a simple summary
        return {"url": url, "status": "success", "analysis": summary_result}

    except requests.exceptions.RequestException as e:
        return {"url": url, "status": "failed", "error": f"Scraping failed: {str(e)}"}
    except Exception as e:
        return {"url": url, "status": "failed", "error": f"Processing error: {str(e)}"}

@app.function(image=modal_image, secrets=[modal.Secret.from_name("openai-secret")], timeout=600)  # Timeout is in seconds; 10 minutes leaves headroom for batches of URLs
def swarm_analyze_urls(urls: list[str], analysis_prompt: str):
    """
    Scrapes and analyzes a list of URLs in parallel for swarm mode.
    """
    print(f"Swarm analyzing {len(urls)} URLs. Prompt: {analysis_prompt}")
    individual_results = []
    # Use .map to run scrape_and_analyze in parallel for each URL
    # The 'kwargs' argument passes the analysis_prompt to each mapped function call
    for result in scrape_and_analyze.map(urls, kwargs={"analysis_prompt_for_swarm": analysis_prompt}):
        individual_results.append(result)

    # Aggregate results (OpenAI call)
    successful_summaries = [
        res["analysis"]["summary"]
        for res in individual_results
        if res["status"] == "success" and "analysis" in res and "summary" in res["analysis"]
    ]

    if not successful_summaries:
        return {
            "overall_summary": "No successful analyses to aggregate.",
            "top_themes": [],
            "individual_results": individual_results
        }

    aggregation_prompt = f"""
    Synthesize these summaries into a comprehensive overview and identify the top themes:
    
    Summaries: {'. '.join(successful_summaries)}
    
    Please provide:
    - aggregated_summary: A comprehensive overview synthesizing all summaries
    - top_themes: List of 3-5 main themes that emerge across all content
    """
    aggregated_llm_result = query_llm(aggregation_prompt, {"aggregated_summary": "", "top_themes": []})

    return {
        "overall_summary": aggregated_llm_result.get("aggregated_summary"),
        "top_themes": aggregated_llm_result.get("top_themes"),
        "individual_results": individual_results
    }

# --- Local Entrypoint for Testing (Optional) ---
# This lets you exercise the Modal functions in an ephemeral app without a persistent deployment.
# To run: modal run modal_app.py
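# To deploy so the Gradio frontend can look it up: modal deploy modal_app.py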
@app.local_entrypoint()
def main():
    print("--- Testing deep_analyze_url ---")
    # Test with a known working URL for scraping
    test_url_deep = "https://modal.com/docs/guide" # Example URL
    deep_result = deep_analyze_url.remote(test_url_deep)
    print(json.dumps(deep_result, indent=2))

    print("\n--- Testing swarm_analyze_urls ---")
    test_urls_swarm = [
        "https://modal.com/blog",
        "https://gantry.io/blog",
        "http://example.com/nonexistentpage"
    ]
    swarm_result = swarm_analyze_urls.remote(test_urls_swarm, "Provide a brief summary of the main topic.")
    print(json.dumps(swarm_result, indent=2))