Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

App Files Files Community

GuglielmoTor committed 7 days ago

Commit

afab0ca

verified ·

1 Parent(s): e4cf8e8

Update features/insight_and_tasks/orchestrators/linkedin_analytics_orchestrator.py

Browse files

Files changed (1) hide show

features/insight_and_tasks/orchestrators/linkedin_analytics_orchestrator.py +227 -56

features/insight_and_tasks/orchestrators/linkedin_analytics_orchestrator.py CHANGED Viewed

@@ -1,9 +1,9 @@
 # orchestrators/linkedin_analytics_orchestrator.py
 import pandas as pd
 import logging
-from typing import Dict, Any, Optional, AsyncGenerator
-from datetime import date, datetime
-from dataclasses import asdict
 import os
 os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
@@ -11,115 +11,286 @@ GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")
 os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
 # Project-specific imports
-from features.insight_and_tasks.utils.pandasai_setup import configure_pandasai
 from features.insight_and_tasks.coordinators.employer_branding_coordinator import EnhancedEmployerBrandingCoordinator
 from features.insight_and_tasks.agents.task_extraction_agent import TaskExtractionAgent
-from features.insight_and_tasks.data_models.metrics import AgentMetrics
-from features.insight_and_tasks.data_models.tasks import TaskExtractionOutput
 from features.insight_and_tasks.agents.task_extraction_model import extract_tasks_from_text
 logger = logging.getLogger(__name__)
 class EnhancedLinkedInAnalyticsOrchestrator:
     """
-    Orchestrates the end-to-end LinkedIn analytics process with streaming results.
-
-    The pipeline runs the coordinator's follower, post and mentions agents,
-    asks the coordinator to synthesize one report from their metrics, and then
-    extracts OKRs/tasks from that report text.
     """
     def __init__(self, api_key: str, llm_model_name: Optional[str] = None, current_date_for_tasks: Optional[date] = None):
-        """
-        Stores the API key and model hint, configures PandasAI, and builds the
-        coordinator and the task extraction agent.
-
-        Args:
-            api_key: Key passed to PandasAI, the coordinator and the task extractor.
-            llm_model_name: Optional model name forwarded to each of those components.
-            current_date_for_tasks: Optional date forwarded to TaskExtractionAgent as `current_date`.
-        """
         self.api_key = api_key
-        self.llm_model_name = llm_model_name
         try:
             configure_pandasai(api_key=self.api_key, model_name=self.llm_model_name)
             logger.info(f"PandasAI configured by orchestrator with model hint: {self.llm_model_name or 'default'}.")
         except Exception as e:
-            # Best effort: a PandasAI configuration failure is logged and construction continues.
             logger.error(f"Failed to configure PandasAI in orchestrator: {e}", exc_info=True)
         self.coordinator = EnhancedEmployerBrandingCoordinator(api_key=self.api_key, model_name=self.llm_model_name)
-        # NOTE(review): the methods below call extract_tasks_from_text() directly and never
-        # use self.task_extractor -- confirm whether this agent is still needed.
         self.task_extractor = TaskExtractionAgent(
             api_key=self.api_key,
-            model_name=self.llm_model_name,
-            current_date=current_date_for_tasks
         )
         logger.info("EnhancedLinkedInAnalyticsOrchestrator initialized.")
-    async def generate_full_analysis_and_tasks_streaming(
         self,
         follower_stats_df: pd.DataFrame,
         post_df: pd.DataFrame,
         mentions_df: pd.DataFrame
-    ) -> AsyncGenerator[Dict[str, Any], None]:
         """
-        Executes the full pipeline with streaming results.
-        Yields intermediate results as they become available.
-
-        Two dictionaries with the same keys are yielded, in this order:
-          1. status "report_ready": the report text and per-agent metrics, with
-             "actionable_okrs_and_tasks" set to None.
-          2. status "complete": the same content plus the extracted OKRs/tasks.
-
-        Args:
-            follower_stats_df: Input passed to the follower agent.
-            post_df: Input passed to the post agent.
-            mentions_df: Input passed to the mentions agent.
         """
-        logger.info("Starting streaming analysis and task generation pipeline...")
-        # Step 1: Get analyses and metrics from specialized agents
-        # (the three agent calls are plain synchronous calls, not awaited)
         logger.info("Running follower analysis...")
         follower_agent_metrics: AgentMetrics = self.coordinator.follower_agent.analyze_follower_data(follower_stats_df)
-        logger.info(f"Follower analysis complete.")
         logger.info("Running post performance analysis...")
         post_agent_metrics: AgentMetrics = self.coordinator.post_agent.analyze_post_data(post_df)
-        logger.info(f"Post analysis complete.")
         logger.info("Running mentions analysis...")
         mentions_agent_metrics: AgentMetrics = self.coordinator.mentions_agent.analyze_mentions_data(mentions_df)
-        logger.info(f"Mentions analysis complete.")
-        # Step 2: Coordinator synthesizes these metrics into a comprehensive analysis text
         logger.info("Running coordinator for synthesis...")
         comprehensive_analysis_text: str = await self.coordinator.generate_comprehensive_analysis(
             follower_agent_metrics, post_agent_metrics, mentions_agent_metrics
         )
         logger.info(f"Coordinator synthesis complete. Report length: {len(comprehensive_analysis_text)} chars.")
-        # Yield the report as soon as it's ready
-        partial_results = {
-            "comprehensive_analysis_report": comprehensive_analysis_text,
-            "actionable_okrs_and_tasks": None,  # Not ready yet
-            "detailed_metrics": {
-                "follower_agent": asdict(follower_agent_metrics) if follower_agent_metrics else None,
-                "post_agent": asdict(post_agent_metrics) if post_agent_metrics else None,
-                "mentions_agent": asdict(mentions_agent_metrics) if mentions_agent_metrics else None,
-            },
-            "status": "report_ready"  # Indicate what's available
-        }
-        logger.info("Yielding report results...")
-        yield partial_results
-        # Step 3: TaskExtractionAgent extracts actionable tasks (OKRs) from the comprehensive text
-        # NOTE(review): the module-level GOOGLE_API_KEY is passed here, not self.api_key.
         logger.info("Running task extraction...")
         actionable_tasks_okrs: TaskExtractionOutput = extract_tasks_from_text(comprehensive_analysis_text, GOOGLE_API_KEY)
         logger.info(f"Task extraction complete. Number of OKRs: {len(actionable_tasks_okrs.okrs) if actionable_tasks_okrs else 'Error'}.")
-        # Yield the final complete results
         final_results = {
             "comprehensive_analysis_report": comprehensive_analysis_text,
-            "actionable_okrs_and_tasks": actionable_tasks_okrs.model_dump() if actionable_tasks_okrs else None,
             "detailed_metrics": {
                 "follower_agent": asdict(follower_agent_metrics) if follower_agent_metrics else None,
                 "post_agent": asdict(post_agent_metrics) if post_agent_metrics else None,
                 "mentions_agent": asdict(mentions_agent_metrics) if mentions_agent_metrics else None,
-            },
-            "status": "complete"  # Indicate everything is ready
         }
-        logger.info("Yielding final complete results...")
-        yield final_results
-    # Keep the original method for backward compatibility
-    async def generate_full_analysis_and_tasks(
-        self,
-        follower_stats_df: pd.DataFrame,
-        post_df: pd.DataFrame,
-        mentions_df: pd.DataFrame
-    ) -> Dict[str, Any]:
-        """
-        Original method - returns complete results only when everything is done.
-
-        Consumes the streaming generator and returns the first result whose
-        status is "complete"; intermediate "report_ready" results are discarded.
-
-        Returns:
-            The final results dictionary, or {"error": ...} if the generator
-            finished without yielding a "complete" result.
-        """
-        async for result in self.generate_full_analysis_and_tasks_streaming(follower_stats_df, post_df, mentions_df):
-            if result.get("status") == "complete":
-                return result
-        # Fallback if no complete result
-        return {"error": "Pipeline did not complete successfully"}

 # orchestrators/linkedin_analytics_orchestrator.py
 import pandas as pd
 import logging
+from typing import Dict, Any, Optional
+from datetime import date, datetime # For TaskExtractionAgent date
+from dataclasses import asdict # For converting AgentMetrics to dict if needed for final output
 import os
 os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
 os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
 # Project-specific imports
+from features.insight_and_tasks.utils.pandasai_setup import configure_pandasai # Centralized PandasAI config
 from features.insight_and_tasks.coordinators.employer_branding_coordinator import EnhancedEmployerBrandingCoordinator
 from features.insight_and_tasks.agents.task_extraction_agent import TaskExtractionAgent
+from features.insight_and_tasks.data_models.metrics import AgentMetrics # For type hinting
+from features.insight_and_tasks.data_models.tasks import TaskExtractionOutput # For type hinting
 from features.insight_and_tasks.agents.task_extraction_model import extract_tasks_from_text
+# Configure logger for this module
 logger = logging.getLogger(__name__)
 class EnhancedLinkedInAnalyticsOrchestrator:
     """
+    Orchestrates the end-to-end LinkedIn analytics process, from data input through
+    specialized agent analysis, coordinator synthesis, and actionable task extraction.
     """
     def __init__(self, api_key: str, llm_model_name: Optional[str] = None, current_date_for_tasks: Optional[date] = None):
+        """
+        Initializes the orchestrator.
+        Args:
+            api_key: The API key for Google services (used by PandasAI and LlmAgents).
+            llm_model_name: Optional. The primary LLM model name to be used by agents.
+                            Specific agents/coordinator might override with their defaults if not set.
+            current_date_for_tasks: Optional. The date to be used by TaskExtractionAgent for quarter calculations. Defaults to today.
+        """
         self.api_key = api_key
+        self.llm_model_name = llm_model_name # Can be passed down or agents use their defaults
+        # Configure PandasAI globally at the start of orchestration.
+        # Pass the model_name if specified, otherwise pandasai_setup might use its own default.
         try:
             configure_pandasai(api_key=self.api_key, model_name=self.llm_model_name)
             logger.info(f"PandasAI configured by orchestrator with model hint: {self.llm_model_name or 'default'}.")
         except Exception as e:
+            # Deliberately best-effort: the failure is logged and construction continues.
             logger.error(f"Failed to configure PandasAI in orchestrator: {e}", exc_info=True)
+        # Initialize the coordinator, which in turn initializes its specialized agents.
+        # Pass the model_name hint to the coordinator.
         self.coordinator = EnhancedEmployerBrandingCoordinator(api_key=self.api_key, model_name=self.llm_model_name)
+        # Initialize the TaskExtractionAgent.
+        # It uses its own default model unless overridden here.
         self.task_extractor = TaskExtractionAgent(
             api_key=self.api_key,
+            model_name=self.llm_model_name, # Pass model hint
+            current_date=current_date_for_tasks # Defaults to today if None
         )
         logger.info("EnhancedLinkedInAnalyticsOrchestrator initialized.")
+    @staticmethod
+    def _summary_preview(metrics: Optional[AgentMetrics], limit: int = 100) -> str:
+        """
+        Returns at most `limit` characters of an agent's analysis summary for logging.
+        Args:
+            metrics: The metrics object returned by an agent; may be None.
+            limit: Maximum number of characters to keep.
+        Returns:
+            The truncated summary, or a placeholder when the metrics object or its
+            summary is missing, so that a log statement can never abort the pipeline.
+        """
+        summary = getattr(metrics, "analysis_summary", None)
+        if not summary:
+            return "<no summary available>"
+        return str(summary)[:limit]
+    async def generate_full_analysis_and_tasks(
         self,
         follower_stats_df: pd.DataFrame,
         post_df: pd.DataFrame,
         mentions_df: pd.DataFrame
+    ) -> Dict[str, Any]:
         """
+        Executes the full pipeline: agent analyses, coordinator synthesis, and task extraction.
+        Args:
+            follower_stats_df: DataFrame containing follower statistics.
+            post_df: DataFrame containing post performance data.
+            mentions_df: DataFrame containing brand mentions data.
+        Returns:
+            A dictionary containing the comprehensive analysis text, actionable tasks (OKRs),
+            and the detailed metrics from each specialized agent.
         """
+        logger.info("Starting full analysis and task generation pipeline...")
+        # Step 1: Get analyses and metrics from specialized agents.
+        # The coordinator's internal agents are used here.
+        # The summary previews go through _summary_preview because the result dict below
+        # already treats each metrics object as possibly None.
         logger.info("Running follower analysis...")
         follower_agent_metrics: AgentMetrics = self.coordinator.follower_agent.analyze_follower_data(follower_stats_df)
+        logger.info(f"Follower analysis complete. Summary: {self._summary_preview(follower_agent_metrics)}...")
         logger.info("Running post performance analysis...")
         post_agent_metrics: AgentMetrics = self.coordinator.post_agent.analyze_post_data(post_df)
+        logger.info(f"Post analysis complete. Summary: {self._summary_preview(post_agent_metrics)}...")
         logger.info("Running mentions analysis...")
         mentions_agent_metrics: AgentMetrics = self.coordinator.mentions_agent.analyze_mentions_data(mentions_df)
+        logger.info(f"Mentions analysis complete. Summary: {self._summary_preview(mentions_agent_metrics)}...")
+        # Step 2: Coordinator synthesizes these metrics into a comprehensive analysis text.
         logger.info("Running coordinator for synthesis...")
         comprehensive_analysis_text: str = await self.coordinator.generate_comprehensive_analysis(
             follower_agent_metrics, post_agent_metrics, mentions_agent_metrics
         )
+        # Guard the length so a missing report reaches the error branch below instead of
+        # raising TypeError inside this log call.
+        logger.info(f"Coordinator synthesis complete. Report length: {len(comprehensive_analysis_text) if comprehensive_analysis_text else 0} chars.")
+        if not comprehensive_analysis_text or comprehensive_analysis_text.startswith("Error"):
+            logger.error(f"Coordinator synthesis failed or produced an error message: {comprehensive_analysis_text}")
+            # Potentially stop here or proceed with task extraction on whatever text was generated.
+        # Step 3: TaskExtractionAgent extracts actionable tasks (OKRs) from the comprehensive text.
+        # NOTE(review): the module-level GOOGLE_API_KEY is passed here rather than self.api_key,
+        # and self.task_extractor is not used -- confirm this is intended.
         logger.info("Running task extraction...")
+        #actionable_tasks_okrs: TaskExtractionOutput = await self.task_extractor.extract_tasks(comprehensive_analysis_text)
         actionable_tasks_okrs: TaskExtractionOutput = extract_tasks_from_text(comprehensive_analysis_text, GOOGLE_API_KEY)
         logger.info(f"Task extraction complete. Number of OKRs: {len(actionable_tasks_okrs.okrs) if actionable_tasks_okrs else 'Error'}.")
+        # Step 4: Compile and return all results.
+        # Convert Pydantic/dataclass objects to dicts for easier JSON serialization if the final output needs it.
+        # The `actionable_tasks_okrs` is already a Pydantic model, which can be serialized with .model_dump() / .json().
+        # `AgentMetrics` are dataclasses, use `asdict`.
         final_results = {
             "comprehensive_analysis_report": comprehensive_analysis_text,
+            "actionable_okrs_and_tasks": actionable_tasks_okrs.model_dump() if actionable_tasks_okrs else None, # Pydantic v2
+            # "actionable_okrs_and_tasks": actionable_tasks_okrs.dict() if actionable_tasks_okrs else None, # Pydantic v1
             "detailed_metrics": {
                 "follower_agent": asdict(follower_agent_metrics) if follower_agent_metrics else None,
                 "post_agent": asdict(post_agent_metrics) if post_agent_metrics else None,
                 "mentions_agent": asdict(mentions_agent_metrics) if mentions_agent_metrics else None,
+            }
         }
+        logger.info("Full analysis and task generation pipeline finished successfully.")
+        return final_results
+# Example usage (similar to the original script's main execution block).
+# Fetches LinkedIn data from Bubble, shapes it into the three input DataFrames,
+# runs the orchestrator once, and prints the report, OKRs and per-agent metrics.
+if __name__ == '__main__':
+    import asyncio
+    import json  # Needed by the json.dumps calls in the output section below
+    import os
+    from datetime import timezone
+    from utils.logging_config import setup_logging
+    from utils.data_fetching import fetch_linkedin_data_from_bubble, VALID_DATA_TYPES
+    setup_logging() # Configure logging for the application
+    # --- Configuration ---
+    # Attempt to get API key from environment variable
+    # IMPORTANT: Set GOOGLE_API_KEY and BUBBLE_API_KEY in your environment for this to run.
+    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
+    BUBBLE_API_KEY_ENV = os.environ.get("BUBBLE_API_KEY") # Used by data_fetching
+    if not GOOGLE_API_KEY:
+        logger.critical("GOOGLE_API_KEY environment variable not set. Orchestrator cannot initialize LLM agents.")
+        # SystemExit works even when the interactive `exit` helper is not installed by `site`.
+        raise SystemExit(1)
+    if not BUBBLE_API_KEY_ENV: # data_fetching will also check, but good to note here
+        logger.warning("BUBBLE_API_KEY environment variable not set. Data fetching from Bubble will fail.")
+        # You might want to exit or use mock data if Bubble is essential.
+    # Set the Google Vertex AI environment variable if not using Vertex AI (as in original)
+    os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
+    # Orchestrator settings
+    ORG_URN_EXAMPLE = "urn:li:organization:19010008" # Example, replace with actual
+    # Specify a model or let orchestrator/agents use their defaults
+    # LLM_MODEL_FOR_ORCHESTRATION = "gemini-2.5-flash-preview-05-20" # Example: use a powerful model
+    LLM_MODEL_FOR_ORCHESTRATION = None # Let agents use their defaults or pass a specific one
+    # --- Initialize Orchestrator ---
+    orchestrator = EnhancedLinkedInAnalyticsOrchestrator(
+        api_key=GOOGLE_API_KEY,
+        llm_model_name=LLM_MODEL_FOR_ORCHESTRATION,
+        # Timezone-aware replacement for the deprecated datetime.utcnow(); same UTC calendar date.
+        current_date_for_tasks=datetime.now(timezone.utc).date() # Use today for task planning
+    )
+    # --- Data Fetching ---
+    logger.info(f"Fetching data for organization URN: {ORG_URN_EXAMPLE}")
+    # Helper to fetch and log
+    def get_data(data_type: VALID_DATA_TYPES, org_urn: str) -> pd.DataFrame:
+        """Fetch one data type from Bubble; return an empty DataFrame on error or None."""
+        df, error = fetch_linkedin_data_from_bubble(org_urn=org_urn, data_type=data_type)
+        if error:
+            logger.error(f"Error fetching {data_type}: {error}. Using empty DataFrame.")
+            return pd.DataFrame()
+        if df is None: # Should not happen if error is None, but as a safeguard
+            logger.warning(f"Fetched {data_type} is None but no error reported. Using empty DataFrame.")
+            return pd.DataFrame()
+        logger.info(f"Successfully fetched {data_type} with {len(df)} rows.")
+        return df
+    follower_stats_df_raw = get_data("li_follower_stats", ORG_URN_EXAMPLE)
+    posts_df_raw = get_data("LI_posts", ORG_URN_EXAMPLE) # Contains post content, media_type, etc.
+    mentions_df_raw = get_data("Li_mentions", ORG_URN_EXAMPLE)
+    post_stats_df_raw = get_data("LI_post_stats", ORG_URN_EXAMPLE) # Contains engagement numbers for posts
+    # --- Data Preprocessing/Merging (as in original example) ---
+    # Select relevant columns for follower_stats_df
+    if not follower_stats_df_raw.empty:
+        follower_stats_df = follower_stats_df_raw[[
+            'category_name', "follower_count_organic", "follower_count_paid", "follower_count_type"
+        ]].copy()
+    else:
+        follower_stats_df = pd.DataFrame() # Ensure it's an empty DF if raw is empty
+    # Merge posts_df and post_stats_df
+    # This logic assumes 'id' in posts_df_raw and 'post_id' in post_stats_df_raw
+    merged_posts_df = pd.DataFrame()
+    if not posts_df_raw.empty and not post_stats_df_raw.empty:
+        if 'id' in posts_df_raw.columns and 'post_id' in post_stats_df_raw.columns:
+            # Ensure 'id' in posts_df_raw is unique before merge if it's a left table key
+            # posts_df_raw.drop_duplicates(subset=['id'], keep='first', inplace=True)
+            merged_posts_df = pd.merge(posts_df_raw, post_stats_df_raw, left_on='id', right_on='post_id', how='left', suffixes=('', '_stats'))
+            logger.info(f"Merged posts_df ({len(posts_df_raw)}) and post_stats_df ({len(post_stats_df_raw)}) into merged_posts_df ({len(merged_posts_df)}).")
+        else:
+            logger.warning("Cannot merge posts_df and post_stats_df due to missing 'id' or 'post_id'. Using posts_df_raw.")
+            merged_posts_df = posts_df_raw.copy() # Fallback to posts_df_raw
+    elif not posts_df_raw.empty:
+        logger.info("post_stats_df is empty. Using posts_df_raw for post analysis.")
+        merged_posts_df = posts_df_raw.copy()
+    else:
+        logger.warning("Both posts_df_raw and post_stats_df_raw are empty.")
+        merged_posts_df = pd.DataFrame() # Empty DF
+    # Select and ensure essential columns for merged_posts_df
+    # These are columns expected by EnhancedPostPerformanceAgent
+    expected_post_cols = [
+        'li_eb_label', 'media_type', 'is_ad', 'id', 'published_at', 'sentiment',
+        'engagement', 'impressionCount', 'clickCount', 'likeCount', 'commentCount', 'shareCount'
+    ]
+    if not merged_posts_df.empty:
+        final_post_df_cols = {}
+        for col in expected_post_cols:
+            if col in merged_posts_df.columns:
+                final_post_df_cols[col] = merged_posts_df[col]
+            elif f"{col}_stats" in merged_posts_df.columns: # Check for suffixed columns from merge
+                final_post_df_cols[col] = merged_posts_df[f"{col}_stats"]
+            else:
+                logger.debug(f"Expected column '{col}' not found in merged_posts_df. Will be created as empty/default by agent if needed.")
+                # Agent preprocessing should handle missing columns by creating them with defaults (0 or 'Unknown')
+        # Create the final DataFrame with only the selected/available columns
+        # This ensures that if a column is missing, it doesn't cause an error here,
+        # but the agent's preprocessing will handle it.
+        # However, it's better to ensure they exist with NAs if the agent expects them.
+        temp_post_df = pd.DataFrame(final_post_df_cols)
+        # Ensure all expected columns are present, filling with NA if missing from selection
+        for col in expected_post_cols:
+            if col not in temp_post_df.columns:
+                temp_post_df[col] = pd.NA # Or appropriate default like 0 for numeric, 'Unknown' for categorical
+        merged_posts_df = temp_post_df[expected_post_cols].copy() # Ensure correct order and all columns
+    else: # If merged_posts_df started empty and stayed empty
+        merged_posts_df = pd.DataFrame(columns=expected_post_cols)
+    # Mentions DataFrame - select relevant columns if necessary, or pass as is
+    # Assuming mentions_df_raw is already in the correct shape or agent handles it.
+    # For example, if it needs specific columns:
+    # mentions_df = mentions_df_raw[['date', 'sentiment_label', 'mention_content']].copy() if not mentions_df_raw.empty else pd.DataFrame()
+    mentions_df = mentions_df_raw.copy() # Pass as is, agent will preprocess
+    # --- Run Orchestration ---
+    async def main_orchestration():
+        """Run the orchestrator once; return None when every input DataFrame is empty."""
+        if follower_stats_df.empty and merged_posts_df.empty and mentions_df.empty:
+            logger.error("All input DataFrames are empty. Aborting orchestration.")
+            return None
+        logger.info("Orchestrator starting generate_full_analysis_and_tasks...")
+        results = await orchestrator.generate_full_analysis_and_tasks(
+            follower_stats_df=follower_stats_df,
+            post_df=merged_posts_df,
+            mentions_df=mentions_df
+        )
+        return results
+    orchestration_results = asyncio.run(main_orchestration())
+    # --- Output Results ---
+    if orchestration_results:
+        print("\n\n" + "="*30 + " COMPREHENSIVE ANALYSIS REPORT " + "="*30)
+        print(orchestration_results.get("comprehensive_analysis_report", "Report not generated."))
+        print("\n\n" + "="*30 + " ACTIONABLE TASKS (OKRs) " + "="*30)
+        okrs_data = orchestration_results.get("actionable_okrs_and_tasks")
+        if okrs_data:
+            # okrs_data is already a dict from .model_dump()
+            print(json.dumps(okrs_data, indent=2))
+        else:
+            print("No actionable tasks (OKRs) generated or an error occurred.")
+        print("\n\n" + "="*30 + " DETAILED AGENT METRICS " + "="*30)
+        detailed_metrics = orchestration_results.get("detailed_metrics", {})
+        for agent_name, metrics_dict in detailed_metrics.items():
+            print(f"\n--- {agent_name.replace('_', ' ').title()} Metrics ---")
+            if metrics_dict:
+                print(json.dumps(metrics_dict, indent=2, default=str)) # default=str for any non-serializable types
+            else:
+                print("Metrics not available for this agent.")
+    else:
+        logger.info("Orchestration did not produce results (likely due to empty input data).")
+    logger.info("Orchestration example finished.")