GuglielmoTor commited on
Commit
762599c
·
verified ·
1 Parent(s): c473dc0

Delete features/insight_and_tasks

Browse files
features/insight_and_tasks/__init__.py DELETED
File without changes
features/insight_and_tasks/agents/follower_agent.py DELETED
@@ -1,509 +0,0 @@
1
- # agents/follower_agent.py
2
- import pandas as pd
3
- from typing import Dict, List, Any, Optional
4
- import logging
5
- import pandasai as pai # Assuming pandasai is imported as pai globally or configured
6
-
7
- from google.adk.agents import LlmAgent # Assuming this is the correct import path
8
-
9
- # Project-specific imports
10
- from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism
11
- from features.insight_and_tasks.data_models.metrics import AgentMetrics, TimeSeriesMetric
12
-
13
- # Configure logger for this module
14
- logger = logging.getLogger(__name__)
15
-
16
- # Define the model globally or pass it as a parameter. For now, using a constant.
17
- # Consider moving this to a shared config or environment variable.
18
- DEFAULT_AGENT_MODEL = "gemini-2.5-flash-preview-05-20" # Or your specific model like "gemini-1.5-flash-preview-05-20"
19
-
20
-
21
- class EnhancedFollowerAnalysisAgent:
22
- """
23
- Enhanced follower analysis agent with proper handling of different follower count types
24
- and structured metric extraction.
25
- """
26
-
27
- AGENT_NAME = "follower_analyst"
28
- AGENT_DESCRIPTION = "Expert analyst specializing in follower growth patterns and demographic analysis."
29
- AGENT_INSTRUCTION = """
30
- You are a specialized LinkedIn follower analytics expert focused on temporal patterns and demographic trends.
31
-
32
- Your role includes:
33
-
34
- 1. FOLLOWER TREND ANALYSIS:
35
- - Analyze follower growth trends over time (monthly data from 'follower_gains_monthly' type).
36
- - Identify growth acceleration/deceleration periods.
37
- - Calculate growth rates and velocity changes.
38
- - Detect seasonal patterns and anomalies.
39
- - Analyze organic vs paid follower counts over time.
40
-
41
- 2. DEMOGRAPHIC ANALYSIS (based on 'follower_industry', 'follower_seniority', etc.):
42
- - Analyze follower distribution by industry, seniority, function, and geography.
43
- - Compare organic vs paid followers across these demographic segments.
44
- - Identify high-value audience segments based on counts and potential engagement.
45
-
46
- 3. TIME-BASED INSIGHTS:
47
- - Provide month-over-month comparisons for growth data.
48
- - Identify critical inflection points in follower growth.
49
- - Calculate trend momentum and acceleration.
50
-
51
- 4. METRIC EXTRACTION (for the AgentMetrics structure):
52
- - Extract time-series data for total, organic, and paid follower counts, and growth rates.
53
- - Provide aggregate metrics like average monthly growth, total organic/paid followers.
54
- - Provide demographic breakdowns as categorical metrics (e.g., top N industries by follower count).
55
-
56
- Focus on separating temporal analysis (monthly) from demographic analysis.
57
- When analyzing demographics, consider the top N segments (e.g., top 10 industries) for conciseness.
58
- Ensure your analysis summary is comprehensive and insightful.
59
- """
60
-
61
- def __init__(self, api_key: str, model_name: Optional[str] = None):
62
- """
63
- Initializes the Follower Analysis Agent.
64
-
65
- Args:
66
- api_key: API key for LLM and potentially PandasAI.
67
- model_name: Name of the language model to use. Defaults to DEFAULT_AGENT_MODEL.
68
- """
69
- self.api_key = api_key # May be used if PandasAI is configured per agent or for other API calls
70
- self.model_name = model_name or DEFAULT_AGENT_MODEL
71
-
72
- self.agent = LlmAgent(
73
- name=self.AGENT_NAME,
74
- model=self.model_name,
75
- description=self.AGENT_DESCRIPTION,
76
- instruction=self.AGENT_INSTRUCTION
77
- )
78
- self.retry_mechanism = RetryMechanism()
79
- logger.info(f"{self.AGENT_NAME} initialized with model {self.model_name}.")
80
-
81
- def _separate_follower_data_by_type(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
82
- """Separate follower data by follower_count_type and process appropriately."""
83
- separated_data = {}
84
-
85
- if df is None or df.empty or 'follower_count_type' not in df.columns:
86
- logger.warning("Input DataFrame is empty or 'follower_count_type' column is missing.")
87
- return separated_data
88
-
89
- # Define the expected follower count types
90
- # These should match the 'follower_count_type' values in your Bubble data
91
- follower_types = [
92
- 'follower_gains_monthly', # For time-series analysis
93
- 'follower_industry', # For demographic analysis
94
- 'follower_seniority',
95
- 'follower_function',
96
- 'follower_geo'
97
- ]
98
-
99
- for ftype in follower_types:
100
- type_data = df[df['follower_count_type'] == ftype].copy()
101
- if not type_data.empty:
102
- if ftype == 'follower_gains_monthly':
103
- type_data = self._process_monthly_data(type_data)
104
- else: # Demographic data
105
- type_data = self._get_top_demographic_segments(type_data, top_n=10)
106
- separated_data[ftype] = type_data
107
- else:
108
- logger.info(f"No data found for follower_count_type: {ftype}")
109
-
110
- return separated_data
111
-
112
- def _get_top_demographic_segments(self, demo_df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
113
- """Get top N demographic segments by total follower count (organic + paid)."""
114
- if demo_df.empty:
115
- return demo_df
116
-
117
- # Ensure required columns exist and are numeric, fill NaNs with 0 for sum
118
- demo_df = demo_df.copy() # Work on a copy
119
- demo_df['follower_count_organic'] = pd.to_numeric(demo_df.get('follower_count_organic'), errors='coerce').fillna(0)
120
- demo_df['follower_count_paid'] = pd.to_numeric(demo_df.get('follower_count_paid'), errors='coerce').fillna(0)
121
-
122
- demo_df['total_followers'] = demo_df['follower_count_organic'] + demo_df['follower_count_paid']
123
-
124
- # Sort by total followers and take top N
125
- # 'category_name' usually holds the demographic label (e.g., industry name)
126
- if 'category_name' not in demo_df.columns:
127
- logger.warning("'_get_top_demographic_segments' expects 'category_name' column for grouping.")
128
- return demo_df.drop(columns=['total_followers'], errors='ignore')
129
-
130
- # Group by category_name if there are multiple entries for the same category, sum followers
131
- # This step might be redundant if data is already aggregated per category_name
132
- # demo_df_grouped = demo_df.groupby('category_name').agg(
133
- # follower_count_organic=('follower_count_organic', 'sum'),
134
- # follower_count_paid=('follower_count_paid', 'sum'),
135
- # total_followers=('total_followers', 'sum')
136
- # ).reset_index()
137
-
138
- top_segments = demo_df.nlargest(top_n, 'total_followers')
139
-
140
- return top_segments.drop(columns=['total_followers'], errors='ignore')
141
-
142
-
143
- def _process_monthly_data(self, monthly_df: pd.DataFrame) -> pd.DataFrame:
144
- """Process monthly follower data: parse dates, sort."""
145
- if monthly_df.empty or 'category_name' not in monthly_df.columns:
146
- logger.warning("Monthly data DataFrame is empty or 'category_name' column is missing.")
147
- return monthly_df
148
-
149
- df_processed = monthly_df.copy()
150
-
151
- # 'category_name' for monthly data is expected to be a date string like 'YYYY-MM-DD'
152
- # Attempt to convert 'category_name' to datetime
153
- df_processed['date_for_analysis'] = pd.to_datetime(df_processed['category_name'], errors='coerce')
154
-
155
- # Drop rows where date conversion failed
156
- df_processed.dropna(subset=['date_for_analysis'], inplace=True)
157
-
158
- if df_processed.empty:
159
- logger.warning("No valid dates found in 'category_name' for monthly data after processing.")
160
- return df_processed
161
-
162
- df_processed['year_month'] = df_processed['date_for_analysis'].dt.strftime('%Y-%m')
163
- df_processed['month_name'] = df_processed['date_for_analysis'].dt.strftime('%B %Y')
164
-
165
- # Ensure numeric types for follower counts
166
- for col in ['follower_count_organic', 'follower_count_paid']:
167
- if col in df_processed.columns:
168
- df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce').fillna(0)
169
- else: # Add column with zeros if missing, to prevent errors in later calculations
170
- df_processed[col] = 0
171
-
172
-
173
- return df_processed.sort_values('date_for_analysis')
174
-
175
- def _extract_time_series_metrics(self, monthly_df: pd.DataFrame) -> List[TimeSeriesMetric]:
176
- """Extract time-series metrics from processed monthly follower data."""
177
- ts_metrics = []
178
- if monthly_df.empty or 'date_for_analysis' not in monthly_df.columns:
179
- logger.info("Cannot extract time-series metrics: monthly DataFrame is empty or lacks 'date_for_analysis'.")
180
- return ts_metrics
181
-
182
- # Ensure data is sorted by date for correct growth rate calculation
183
- monthly_df_sorted = monthly_df.sort_values('date_for_analysis').copy()
184
-
185
- timestamps = monthly_df_sorted['year_month'].tolist()
186
-
187
- # Calculate total followers
188
- monthly_df_sorted['total_followers'] = monthly_df_sorted.get('follower_count_organic', 0) + \
189
- monthly_df_sorted.get('follower_count_paid', 0)
190
-
191
- metric_definitions = {
192
- "total_follower_count": monthly_df_sorted['total_followers'],
193
- "organic_follower_count": monthly_df_sorted.get('follower_count_organic', pd.Series(0, index=monthly_df_sorted.index)),
194
- "paid_follower_count": monthly_df_sorted.get('follower_count_paid', pd.Series(0, index=monthly_df_sorted.index))
195
- }
196
-
197
- for name, values_series in metric_definitions.items():
198
- ts_metrics.append(TimeSeriesMetric(
199
- metric_name=name,
200
- values=values_series.tolist(),
201
- timestamps=timestamps,
202
- metric_type="time_series",
203
- time_granularity="monthly"
204
- ))
205
-
206
- # Calculate growth rate for total followers
207
- if len(monthly_df_sorted) > 1:
208
- # pct_change gives NaN for the first element, fill with 0
209
- growth_rates = monthly_df_sorted['total_followers'].pct_change().fillna(0).tolist()
210
- ts_metrics.append(TimeSeriesMetric(
211
- metric_name="total_follower_growth_rate",
212
- values=growth_rates,
213
- timestamps=timestamps, # Timestamps align, first growth rate is vs non-existent previous point (so 0)
214
- metric_type="time_series",
215
- time_granularity="monthly",
216
- unit="%"
217
- ))
218
- else:
219
- logger.info("Not enough data points (<=1) to calculate growth rate.")
220
-
221
- return ts_metrics
222
-
223
- def _calculate_aggregate_metrics(self, separated_data: Dict[str, pd.DataFrame]) -> Dict[str, float]:
224
- """Calculate aggregate metrics from all follower data."""
225
- agg_metrics = {}
226
-
227
- monthly_df = separated_data.get('follower_gains_monthly')
228
- if monthly_df is not None and not monthly_df.empty:
229
- total_organic = monthly_df['follower_count_organic'].sum()
230
- total_paid = monthly_df['follower_count_paid'].sum()
231
- total_all_followers = total_organic + total_paid
232
-
233
- agg_metrics['total_organic_followers_gained_period'] = float(total_organic)
234
- agg_metrics['total_paid_followers_gained_period'] = float(total_paid)
235
- agg_metrics['overall_total_followers_gained_period'] = float(total_all_followers)
236
-
237
- if total_all_followers > 0:
238
- agg_metrics['overall_organic_follower_ratio_gained'] = float(total_organic / total_all_followers)
239
- agg_metrics['overall_paid_follower_ratio_gained'] = float(total_paid / total_all_followers)
240
-
241
- # Average monthly gain (if 'total_followers' represents gain, not cumulative)
242
- # Assuming 'follower_count_organic/paid' in 'follower_gains_monthly' are indeed GAINS for that month
243
- monthly_df['monthly_total_gain'] = monthly_df['follower_count_organic'] + monthly_df['follower_count_paid']
244
- if not monthly_df['monthly_total_gain'].empty:
245
- agg_metrics['avg_monthly_follower_gain'] = float(monthly_df['monthly_total_gain'].mean())
246
- agg_metrics['max_monthly_follower_gain'] = float(monthly_df['monthly_total_gain'].max())
247
- agg_metrics['min_monthly_follower_gain'] = float(monthly_df['monthly_total_gain'].min())
248
-
249
-
250
- # Count of distinct demographic segments identified (top N for each)
251
- for demo_type in ['follower_industry', 'follower_seniority', 'follower_function', 'follower_geo']:
252
- if demo_type in separated_data and not separated_data[demo_type].empty:
253
- agg_metrics[f'distinct_{demo_type}_segments_analyzed'] = float(len(separated_data[demo_type]))
254
-
255
- return agg_metrics
256
-
257
- def _extract_demographic_metrics(self, separated_data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
258
- """Extract demographic distributions (categorical metrics)."""
259
- cat_metrics = {}
260
- demographic_types_map = {
261
- 'follower_industry': 'industry_distribution',
262
- 'follower_seniority': 'seniority_distribution',
263
- 'follower_function': 'function_distribution',
264
- 'follower_geo': 'geographic_distribution'
265
- }
266
-
267
- for demo_type_key, metric_name_prefix in demographic_types_map.items():
268
- demo_df = separated_data.get(demo_type_key)
269
- if demo_df is not None and not demo_df.empty and 'category_name' in demo_df.columns:
270
- distribution = {}
271
- for _, row in demo_df.iterrows():
272
- category = row['category_name']
273
- organic = float(row.get('follower_count_organic', 0))
274
- paid = float(row.get('follower_count_paid', 0))
275
- total = organic + paid
276
- distribution[category] = {
277
- 'total_followers': total,
278
- 'organic_followers': organic,
279
- 'paid_followers': paid,
280
- 'organic_ratio': organic / total if total > 0 else 0.0
281
- }
282
-
283
- # Sort by total followers descending for the distribution
284
- sorted_distribution = dict(sorted(distribution.items(), key=lambda item: item[1]['total_followers'], reverse=True))
285
- cat_metrics[metric_name_prefix] = sorted_distribution
286
-
287
- # Summary for this demographic type
288
- total_followers_in_type = sum(item['total_followers'] for item in distribution.values())
289
- cat_metrics[f'{metric_name_prefix}_summary'] = {
290
- 'total_followers_in_top_segments': total_followers_in_type,
291
- 'number_of_segments_reported': len(distribution),
292
- 'top_segment': list(sorted_distribution.keys())[0] if sorted_distribution else "N/A"
293
- }
294
- return cat_metrics
295
-
296
- def _extract_time_periods(self, monthly_df: Optional[pd.DataFrame]) -> List[str]:
297
- """Extract unique year-month time periods covered by the monthly data."""
298
- if monthly_df is None or monthly_df.empty or 'year_month' not in monthly_df.columns:
299
- return ["Data period not available or N/A"]
300
-
301
- periods = sorted(monthly_df['year_month'].dropna().unique().tolist(), reverse=True)
302
- return periods[:12] # Return up to the last 12 months if available
303
-
304
-
305
- def analyze_follower_data(self, follower_stats_df: pd.DataFrame) -> AgentMetrics:
306
- """
307
- Generate comprehensive follower analysis using PandasAI and structured metric extraction.
308
- """
309
- if follower_stats_df is None or follower_stats_df.empty:
310
- logger.warning("Follower statistics DataFrame is empty. Returning empty metrics.")
311
- return AgentMetrics(
312
- agent_name=self.AGENT_NAME,
313
- analysis_summary="No follower data provided for analysis.",
314
- time_periods_covered=["N/A"]
315
- )
316
-
317
- # 1. Pre-process and separate data
318
- separated_data = self._separate_follower_data_by_type(follower_stats_df)
319
-
320
- # Prepare a combined DataFrame for PandasAI if needed, or use the original one.
321
- # For PandasAI, it's often better to provide a clean, understandable DataFrame.
322
- # Let's use the original df for the textual analysis by PandasAI,
323
- # as it contains all types and the LLM can be instructed to differentiate.
324
-
325
- # Ensure PandasAI is configured (this should ideally be done once at orchestrator level)
326
- # from utils.pandasai_setup import configure_pandasai
327
- # configure_pandasai(self.api_key, self.model_name) # Or pass LLM object if configured outside
328
-
329
- df_description = "LinkedIn follower statistics. Contains 'follower_count_type' indicating data category (e.g., 'follower_gains_monthly', 'follower_industry'), 'category_name' (e.g., date for monthly, industry name for industry type), 'follower_count_organic', 'follower_count_paid'."
330
-
331
- # Create PandasAI DataFrame
332
- # Check if pai.DataFrame is the correct way to initialize based on your pandasai version
333
- try:
334
- pandas_ai_df = pai.DataFrame(follower_stats_df, description=df_description)
335
- except Exception as e:
336
- logger.error(f"Failed to create PandasAI DataFrame: {e}", exc_info=True)
337
- return AgentMetrics(
338
- agent_name=self.AGENT_NAME,
339
- analysis_summary=f"Error initializing PandasAI: {e}",
340
- time_periods_covered=self._extract_time_periods(separated_data.get('follower_gains_monthly'))
341
- )
342
-
343
- # 2. Generate textual analysis using PandasAI via LlmAgent
344
- # The LlmAgent itself doesn't directly use PandasAI's .chat() method.
345
- # The instruction for LlmAgent should guide it to perform analysis.
346
- # If direct PandasAI chat is needed, it's a separate call.
347
- # The original code uses pandas_df.chat(analysis_query). This implies PandasAI is used directly.
348
- # Let's stick to the direct PandasAI chat call as in the original structure.
349
-
350
- analysis_query = f"""
351
- Analyze the provided LinkedIn follower statistics. The DataFrame contains various 'follower_count_type' values.
352
- Focus on:
353
- 1. For 'follower_gains_monthly': Analyze monthly follower growth trends (total, organic, paid). Identify key periods of growth or decline.
354
- 2. For demographic types (industry, seniority, function, geo): Describe the distribution of followers. Which are the top segments? How do organic vs paid compare?
355
- 3. Synthesize these findings into an overall summary of follower dynamics.
356
-
357
- Consider the data structure: 'category_name' holds the date for monthly data or the demographic label.
358
- 'follower_count_organic' and 'follower_count_paid' are the key metrics.
359
- """
360
-
361
- analysis_result_text = "PandasAI analysis could not be performed." # Default
362
- try:
363
- def chat_operation():
364
- # Ensure the LLM for PandasAI is correctly configured before this call
365
- # This might involve re-calling configure_pandasai if it's not persistent
366
- # or if the LLM object needs to be explicitly passed to PandasAI DataFrame.
367
- # Check if LLM is configured using the proper config.get() method
368
- config = pai.config.get()
369
- logger.info(f"pai_config: {config}, Type of config: {type(config)}")
370
- if not config.llm:
371
- logger.warning("PandasAI LLM not configured. Attempting to configure now.")
372
- # This assumes configure_pandasai is available and sets the LLM config
373
- from insight_and_tasks.utils.pandasai_setup import configure_pandasai
374
- configure_pandasai(self.api_key, self.model_name)
375
-
376
- # Re-check configuration after setup attempt
377
- config = pai.config.get()
378
- if not config.llm:
379
- raise RuntimeError("PandasAI LLM could not be configured for chat operation.")
380
-
381
- logger.info(f"Executing PandasAI chat for follower analysis with LLM: {config.llm}")
382
- return pandas_ai_df.chat(analysis_query)
383
-
384
- analysis_result_raw = self.retry_mechanism.retry_with_backoff(
385
- func=chat_operation,
386
- max_retries=2, # Adjusted retries
387
- base_delay=2.0,
388
- exceptions=(Exception,) # Catch broader exceptions for PandasAI calls
389
- )
390
- analysis_result_text = str(analysis_result_raw) if analysis_result_raw else "No textual analysis generated by PandasAI."
391
- logger.info("Follower analysis via PandasAI completed.")
392
-
393
- except Exception as e:
394
- logger.error(f"Follower analysis with PandasAI failed after retries: {e}", exc_info=True)
395
- analysis_result_text = f"Follower analysis using PandasAI failed. Error: {str(e)[:200]}"
396
-
397
- # 3. Extract structured metrics using the separated and processed data
398
- monthly_data_for_metrics = separated_data.get('follower_gains_monthly', pd.DataFrame())
399
-
400
- time_series_metrics = self._extract_time_series_metrics(monthly_data_for_metrics)
401
- aggregate_metrics = self._calculate_aggregate_metrics(separated_data) # Uses all separated types
402
- categorical_metrics = self._extract_demographic_metrics(separated_data) # Uses demographic types
403
- time_periods = self._extract_time_periods(monthly_data_for_metrics)
404
-
405
- return AgentMetrics(
406
- agent_name=self.AGENT_NAME,
407
- analysis_summary=analysis_result_text[:2000], # Truncate if too long
408
- time_series_metrics=time_series_metrics,
409
- aggregate_metrics=aggregate_metrics,
410
- categorical_metrics=categorical_metrics,
411
- time_periods_covered=time_periods,
412
- data_sources_used=[f"follower_stats_df (shape: {follower_stats_df.shape})"]
413
- )
414
-
415
- if __name__ == '__main__':
416
- # This is for example and testing purposes.
417
- # Ensure logging and other necessary setups are done.
418
- try:
419
- from utils.logging_config import setup_logging
420
- setup_logging()
421
- logger.info("Logging setup for EnhancedFollowerAnalysisAgent test.")
422
- except ImportError:
423
- logging.basicConfig(level=logging.INFO)
424
- logger.warning("Could not import setup_logging. Using basicConfig.")
425
-
426
- # Mock API Key and Model for testing
427
- # IMPORTANT: For PandasAI to run, a valid API key and model setup are needed.
428
- # This example might not fully execute PandasAI chat without proper environment setup.
429
- MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_followers")
430
- MODEL_NAME = DEFAULT_AGENT_MODEL
431
-
432
- # Configure PandasAI (essential for the .chat() part)
433
- try:
434
- from utils.pandasai_setup import configure_pandasai
435
- if MOCK_API_KEY != "test_api_key_followers": # Only configure if a real key might be present
436
- configure_pandasai(MOCK_API_KEY, MODEL_NAME)
437
- logger.info("PandasAI configured for testing EnhancedFollowerAnalysisAgent.")
438
- else:
439
- logger.warning("Using mock API key. PandasAI chat will likely fail or use a default/mock LLM if available.")
440
- # Mock pai.DataFrame if pandasai is not fully set up to avoid errors
441
- class MockPandasAIDataFrame:
442
- def __init__(self, df, description): self.df = df; self.description = description
443
- def chat(self, query): return f"Mock PandasAI response to: {query}"
444
- pai.DataFrame = MockPandasAIDataFrame
445
-
446
- except ImportError:
447
- logger.error("utils.pandasai_setup not found. PandasAI will not be configured.")
448
- class MockPandasAIDataFrame:
449
- def __init__(self, df, description): self.df = df; self.description = description
450
- def chat(self, query): return f"Mock PandasAI response to: {query}"
451
- pai.DataFrame = MockPandasAIDataFrame
452
-
453
- # Sample Data
454
- sample_follower_data = {
455
- 'follower_count_type': [
456
- 'follower_gains_monthly', 'follower_gains_monthly', 'follower_gains_monthly',
457
- 'follower_industry', 'follower_industry', 'follower_industry', 'follower_industry',
458
- 'follower_seniority', 'follower_seniority'
459
- ],
460
- 'category_name': [ # Dates for monthly, names for demographics
461
- '2023-01-01', '2023-02-01', '2023-03-01',
462
- 'Technology', 'Finance', 'Healthcare', 'Retail',
463
- 'Senior', 'Entry-Level'
464
- ],
465
- 'follower_count_organic': [
466
- 100, 120, 110, # Monthly gains
467
- 500, 300, 200, 150, # Industry organic
468
- 600, 400 # Seniority organic
469
- ],
470
- 'follower_count_paid': [
471
- 10, 15, 12, # Monthly gains
472
- 50, 30, 20, 10, # Industry paid
473
- 60, 40 # Seniority paid
474
- ]
475
- }
476
- sample_df = pd.DataFrame(sample_follower_data)
477
-
478
- # Initialize agent
479
- follower_agent = EnhancedFollowerAnalysisAgent(api_key=MOCK_API_KEY, model_name=MODEL_NAME)
480
-
481
- logger.info("Analyzing sample follower data...")
482
- metrics_result = follower_agent.analyze_follower_data(sample_df)
483
-
484
- print("\n--- EnhancedFollowerAnalysisAgent Results ---")
485
- print(f"Agent Name: {metrics_result.agent_name}")
486
- print(f"Analysis Summary: {metrics_result.analysis_summary}")
487
- print("\nTime Series Metrics:")
488
- for ts_metric in metrics_result.time_series_metrics:
489
- print(f" - {ts_metric.metric_name}: {len(ts_metric.values)} data points, e.g., {ts_metric.values[:3]} for ts {ts_metric.timestamps[:3]}")
490
- print("\nAggregate Metrics:")
491
- for key, value in metrics_result.aggregate_metrics.items():
492
- print(f" - {key}: {value}")
493
- print("\nCategorical Metrics:")
494
- for key, value in metrics_result.categorical_metrics.items():
495
- print(f" - {key}: (details below)")
496
- if isinstance(value, dict):
497
- for sub_key, sub_value in list(value.items())[:2]: # Print first 2 items for brevity
498
- print(f" - {sub_key}: {sub_value}")
499
- else:
500
- print(f" {value}")
501
-
502
- print(f"\nTime Periods Covered: {metrics_result.time_periods_covered}")
503
- print(f"Data Sources Used: {metrics_result.data_sources_used}")
504
- print(f"Generated Timestamp: {metrics_result.generation_timestamp}")
505
-
506
- # Test with empty DataFrame
507
- logger.info("\n--- Testing with empty DataFrame ---")
508
- empty_metrics_result = follower_agent.analyze_follower_data(pd.DataFrame())
509
- print(f"Empty DF Analysis Summary: {empty_metrics_result.analysis_summary}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/agents/mentions_agent.py DELETED
@@ -1,397 +0,0 @@
1
- # agents/mentions_agent.py
2
- import pandas as pd
3
- from typing import Dict, List, Any, Optional, Mapping
4
- import logging
5
- import pandasai as pai # Assuming pandasai is imported as pai globally or configured
6
-
7
- from google.adk.agents import LlmAgent # Assuming this is the correct import path
8
-
9
- # Project-specific imports
10
- from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism
11
- from features.insight_and_tasks.data_models.metrics import AgentMetrics, TimeSeriesMetric
12
-
13
- # Configure logger for this module
14
- logger = logging.getLogger(__name__)
15
-
16
- DEFAULT_AGENT_MODEL = "gemini-2.5-flash-preview-05-20"
17
-
18
- class EnhancedMentionsAnalysisAgent:
19
- """
20
- Enhanced mentions analysis agent with time-series metric extraction and sentiment processing.
21
- """
22
- AGENT_NAME = "mentions_analyst"
23
- AGENT_DESCRIPTION = "Expert analyst specializing in brand mention trends and sentiment patterns."
24
- AGENT_INSTRUCTION = """
25
- You are a specialized LinkedIn brand mentions expert focused on sentiment trends and mention patterns over time.
26
-
27
- Your role includes:
28
-
29
- 1. MENTION TREND ANALYSIS (monthly, using 'date' column):
30
- - Analyze mention volume trends over time.
31
- - Identify periods with significant spikes or dips in mention activity.
32
-
33
- 2. SENTIMENT PATTERN ANALYSIS (monthly, using 'date' and 'sentiment_label'):
34
- - Track the evolution of sentiment (e.g., positive, negative, neutral) associated with mentions.
35
- - Calculate and analyze the average sentiment score over time (if sentiment can be quantified).
36
- - Identify shifts in overall sentiment and potential drivers for these changes.
37
-
38
- 3. CORRELATION (Conceptual):
39
- - Consider if mention spikes/dips or sentiment shifts correlate with any known company activities, campaigns, or external events (though this data might not be in the input DataFrame, mention the need to investigate).
40
-
41
- 4. METRIC EXTRACTION (for AgentMetrics):
42
- - Extract time-series data for monthly mention volume.
43
- - Extract time-series data for monthly sentiment distribution (e.g., count of positive/negative/neutral mentions) and average sentiment score.
44
- - Provide aggregate metrics like total mentions, overall sentiment distribution, and average sentiment score for the period.
45
- - Include categorical metrics like the distribution of sentiment labels.
46
-
47
- Focus on identifying actionable insights from mention data. How is the brand being perceived? Are there emerging reputational risks or opportunities?
48
- Use the provided DataFrame columns: 'date' (for mentions), 'sentiment_label' (e.g., 'Positive 👍', 'Negative 👎', 'Neutral 😐'), and potentially 'mention_source' or 'mention_content' if available and relevant for deeper analysis (though focus on 'date' and 'sentiment_label' for core metrics).
49
- """
50
-
51
- # Standardized sentiment mapping (can be expanded)
52
- # This mapping is crucial for converting labels to scores.
53
- SENTIMENT_MAPPING = {
54
- 'Positive 👍': 1,
55
- 'Positive': 1, # Adding common variations
56
- 'Very Positive': 1.5, # Example for more granular sentiment
57
- 'Negative 👎': -1,
58
- 'Negative': -1,
59
- 'Very Negative': -1.5,
60
- 'Neutral 😐': 0,
61
- 'Neutral': 0,
62
- 'Mixed': 0, # Or handle mixed sentiment differently
63
- 'Unknown': 0 # Default score for unmapped or unknown sentiments
64
- }
65
-
66
-
67
- def __init__(self, api_key: str, model_name: Optional[str] = None):
68
- self.api_key = api_key
69
- self.model_name = model_name or DEFAULT_AGENT_MODEL
70
- self.agent = LlmAgent(
71
- name=self.AGENT_NAME,
72
- model=self.model_name,
73
- description=self.AGENT_DESCRIPTION,
74
- instruction=self.AGENT_INSTRUCTION
75
- )
76
- self.retry_mechanism = RetryMechanism()
77
- logger.info(f"{self.AGENT_NAME} initialized with model {self.model_name}.")
78
-
79
- def _get_sentiment_score(self, sentiment_label: Optional[str]) -> float:
80
- """Maps a sentiment label to a numerical score using SENTIMENT_MAPPING."""
81
- if sentiment_label is None:
82
- return self.SENTIMENT_MAPPING.get('Unknown', 0)
83
- # Attempt to match known labels, case-insensitively for robustness if needed,
84
- # but exact match is safer with the current emoji-inclusive keys.
85
- return float(self.SENTIMENT_MAPPING.get(str(sentiment_label).strip(), self.SENTIMENT_MAPPING.get('Unknown',0)))
86
-
87
-
88
- def _preprocess_mentions_data(self, df: pd.DataFrame) -> pd.DataFrame:
89
- """Cleans and prepares mentions data for analysis."""
90
- if df is None or df.empty:
91
- return pd.DataFrame()
92
-
93
- df_processed = df.copy()
94
-
95
- # Convert 'date' to datetime
96
- if 'date' in df_processed.columns:
97
- df_processed['date'] = pd.to_datetime(df_processed['date'], errors='coerce')
98
- # df_processed.dropna(subset=['date'], inplace=True) # Keep for other metrics even if date is NaT
99
- else:
100
- logger.warning("'date' column not found in mentions data. Time-series analysis will be limited.")
101
- # df_processed['date'] = pd.NaT # Add placeholder if critical
102
-
103
- # Process 'sentiment_label' and create 'sentiment_score'
104
- if 'sentiment_label' in df_processed.columns:
105
- df_processed['sentiment_label'] = df_processed['sentiment_label'].astype(str).fillna('Unknown')
106
- df_processed['sentiment_score'] = df_processed['sentiment_label'].apply(self._get_sentiment_score)
107
- else:
108
- logger.info("'sentiment_label' column not found. Sentiment analysis will be limited.")
109
- df_processed['sentiment_label'] = 'Unknown'
110
- df_processed['sentiment_score'] = self._get_sentiment_score('Unknown')
111
-
112
- return df_processed
113
-
114
- def _extract_time_series_metrics(self, df_processed: pd.DataFrame) -> List[TimeSeriesMetric]:
115
- """Extracts monthly time-series metrics from processed mentions data."""
116
- ts_metrics = []
117
- if df_processed.empty or 'date' not in df_processed.columns or df_processed['date'].isnull().all():
118
- logger.info("Cannot extract time-series metrics for mentions: 'date' is missing or all null.")
119
- return ts_metrics
120
-
121
- df_ts = df_processed.dropna(subset=['date']).copy()
122
- if df_ts.empty:
123
- logger.info("No valid 'date' values for mentions time-series metrics after filtering NaT.")
124
- return ts_metrics
125
-
126
- df_ts['year_month'] = df_ts['date'].dt.strftime('%Y-%m')
127
-
128
- # Monthly mention volume
129
- monthly_volume = df_ts.groupby('year_month').size().reset_index(name='mention_count')
130
- if not monthly_volume.empty:
131
- ts_metrics.append(TimeSeriesMetric(
132
- metric_name="monthly_mention_volume",
133
- values=monthly_volume['mention_count'].tolist(),
134
- timestamps=monthly_volume['year_month'].tolist(),
135
- metric_type="time_series",
136
- time_granularity="monthly",
137
- unit="count"
138
- ))
139
-
140
- # Monthly average sentiment score
141
- if 'sentiment_score' in df_ts.columns:
142
- monthly_avg_sentiment = df_ts.groupby('year_month')['sentiment_score'].mean().reset_index()
143
- if not monthly_avg_sentiment.empty:
144
- ts_metrics.append(TimeSeriesMetric(
145
- metric_name="avg_monthly_sentiment_score",
146
- values=monthly_avg_sentiment['sentiment_score'].tolist(),
147
- timestamps=monthly_avg_sentiment['year_month'].tolist(),
148
- metric_type="time_series",
149
- time_granularity="monthly",
150
- unit="score" # Score range depends on SENTIMENT_MAPPING
151
- ))
152
-
153
- # Monthly distribution of sentiment labels
154
- if 'sentiment_label' in df_ts.columns and df_ts['sentiment_label'].nunique() > 1:
155
- # Ensure 'sentiment_label' is not all 'Unknown'
156
- if not (df_ts['sentiment_label'] == 'Unknown').all():
157
- sentiment_counts_by_month = df_ts.groupby(['year_month', 'sentiment_label']).size().unstack(fill_value=0)
158
- for sentiment_val in sentiment_counts_by_month.columns:
159
- if sentiment_val == 'Unknown' and (sentiment_counts_by_month[sentiment_val] == 0).all():
160
- continue
161
- ts_metrics.append(TimeSeriesMetric(
162
- metric_name=f"monthly_mention_count_sentiment_{str(sentiment_val).lower().replace(' ', '_').replace('👍','positive').replace('👎','negative').replace('😐','neutral')}",
163
- values=sentiment_counts_by_month[sentiment_val].tolist(),
164
- timestamps=sentiment_counts_by_month.index.tolist(), # year_month is index
165
- metric_type="time_series",
166
- time_granularity="monthly",
167
- unit="count"
168
- ))
169
- else:
170
- logger.info("Sentiment label data is all 'Unknown', skipping sentiment distribution time series.")
171
-
172
- return ts_metrics
173
-
174
- def _calculate_aggregate_metrics(self, df_processed: pd.DataFrame) -> Dict[str, float]:
175
- """Calculates aggregate metrics for mentions."""
176
- agg_metrics = {}
177
- if df_processed.empty:
178
- return agg_metrics
179
-
180
- agg_metrics['total_mentions_analyzed'] = float(len(df_processed))
181
-
182
- if 'sentiment_score' in df_processed.columns and not df_processed['sentiment_score'].empty:
183
- agg_metrics['overall_avg_sentiment_score'] = float(df_processed['sentiment_score'].mean())
184
-
185
- if 'sentiment_label' in df_processed.columns:
186
- total_valid_sentiments = len(df_processed.dropna(subset=['sentiment_label'])) # Count non-NaN labels
187
- if total_valid_sentiments > 0:
188
- # Iterate through our defined sentiment mapping to count occurrences
189
- sentiment_counts = df_processed['sentiment_label'].value_counts()
190
- for label, score_val in self.SENTIMENT_MAPPING.items():
191
- # Use a clean key for the metric name
192
- clean_label_key = str(label).lower().replace(' ', '_').replace('👍','positive').replace('👎','negative').replace('😐','neutral')
193
- if clean_label_key == "unknown" and score_val == 0: # Skip generic unknown if it's just a fallback
194
- if sentiment_counts.get(label, 0) == 0 and 'Unknown' not in label : continue
195
-
196
-
197
- count = sentiment_counts.get(label, 0)
198
- if count > 0 or label == 'Unknown': # Report if count > 0 or if it's the 'Unknown' category itself
199
- agg_metrics[f'{clean_label_key}_mention_ratio'] = float(count / total_valid_sentiments)
200
- agg_metrics[f'{clean_label_key}_mention_count'] = float(count)
201
-
202
-
203
- # Mentions per day/week (if 'date' column is valid)
204
- if 'date' in df_processed.columns and not df_processed['date'].isnull().all():
205
- df_dated = df_processed.dropna(subset=['date']).sort_values('date')
206
- if len(df_dated) > 1:
207
- duration_days = (df_dated['date'].max() - df_dated['date'].min()).days
208
- if duration_days > 0:
209
- agg_metrics['avg_mentions_per_day'] = float(len(df_dated) / duration_days)
210
- agg_metrics['avg_mentions_per_week'] = float(len(df_dated) / (duration_days / 7.0))
211
- elif len(df_dated) == 1: # Single day with mentions
212
- agg_metrics['avg_mentions_per_day'] = float(len(df_dated))
213
- agg_metrics['avg_mentions_per_week'] = float(len(df_dated) * 7) # Extrapolate
214
-
215
- return agg_metrics
216
-
217
- def _extract_categorical_metrics(self, df_processed: pd.DataFrame) -> Dict[str, Any]:
218
- """Extracts categorical distributions for mentions."""
219
- cat_metrics = {}
220
- if df_processed.empty:
221
- return cat_metrics
222
-
223
- # Sentiment label distribution (counts and percentages)
224
- if 'sentiment_label' in df_processed.columns and df_processed['sentiment_label'].nunique() > 0:
225
- cat_metrics['sentiment_label_distribution_percentage'] = df_processed['sentiment_label'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}").to_dict()
226
- cat_metrics['sentiment_label_counts'] = df_processed['sentiment_label'].value_counts().to_dict()
227
-
228
- # Example: If 'mention_source' column existed:
229
- # if 'mention_source' in df_processed.columns:
230
- # cat_metrics['mention_source_distribution'] = df_processed['mention_source'].value_counts(normalize=True).to_dict()
231
- # cat_metrics['mention_source_counts'] = df_processed['mention_source'].value_counts().to_dict()
232
-
233
- return cat_metrics
234
-
235
- def _extract_time_periods(self, df_processed: pd.DataFrame) -> List[str]:
236
- """Extracts unique year-month time periods covered by the mentions data."""
237
- if df_processed.empty or 'date' not in df_processed.columns or df_processed['date'].isnull().all():
238
- return ["Data period not available or N/A"]
239
-
240
- if 'year_month' in df_processed.columns: # If already created during TS extraction
241
- periods = sorted(df_processed['year_month'].dropna().unique().tolist(), reverse=True)
242
- elif 'date' in df_processed.columns: # Derive if not present
243
- dates = df_processed['date'].dropna()
244
- if not dates.empty:
245
- periods = sorted(dates.dt.strftime('%Y-%m').unique().tolist(), reverse=True)
246
- else: return ["N/A"]
247
- else: return ["N/A"]
248
-
249
- return periods[:12] # Return up to the last 12 months
250
-
251
- def analyze_mentions_data(self, mentions_df: pd.DataFrame) -> AgentMetrics:
252
- """
253
- Generates comprehensive mentions analysis.
254
- """
255
- if mentions_df is None or mentions_df.empty:
256
- logger.warning("Mentions DataFrame is empty. Returning empty metrics.")
257
- return AgentMetrics(
258
- agent_name=self.AGENT_NAME,
259
- analysis_summary="No mentions data provided for analysis.",
260
- time_periods_covered=["N/A"]
261
- )
262
-
263
- # 1. Preprocess data
264
- df_processed = self._preprocess_mentions_data(mentions_df)
265
- if df_processed.empty and not mentions_df.empty:
266
- logger.warning("Mentions DataFrame became empty after preprocessing.")
267
- return AgentMetrics(
268
- agent_name=self.AGENT_NAME,
269
- analysis_summary="Mentions data could not be processed.",
270
- time_periods_covered=["N/A"]
271
- )
272
- elif df_processed.empty and mentions_df.empty:
273
- return AgentMetrics(agent_name=self.AGENT_NAME, analysis_summary="No mentions data provided.")
274
-
275
-
276
- # 2. Generate textual analysis using PandasAI
277
- df_description_for_pandasai = "LinkedIn brand mentions data. Key columns: 'date' (date of mention), 'sentiment_label' (e.g., 'Positive 👍', 'Negative 👎', 'Neutral 😐'), 'sentiment_score' (numeric score from -1.5 to 1.5)."
278
-
279
- analysis_result_text = "PandasAI analysis for mentions could not be performed."
280
- try:
281
- pandas_ai_df = pai.DataFrame(df_processed, description=df_description_for_pandasai)
282
- analysis_query = f"""
283
- Analyze the provided LinkedIn brand mentions data. Focus on:
284
- 1. Monthly trends in mention volume.
285
- 2. Monthly trends in sentiment (average 'sentiment_score' and distribution of 'sentiment_label').
286
- 3. Identify any significant spikes/dips in mentions or shifts in sentiment.
287
- Provide a concise summary of brand perception based on this data.
288
- """
289
- def chat_operation():
290
- config = pai.config.get()
291
- logger.info(f"pai_config: {config}, Type of config: {type(config)}")
292
- if not config.llm:
293
- logger.warning("PandasAI LLM not configured. Attempting to configure now.")
294
- # This assumes configure_pandasai is available and sets the LLM config
295
- from insight_and_tasks.utils.pandasai_setup import configure_pandasai
296
- configure_pandasai(self.api_key, self.model_name)
297
-
298
- # Re-check configuration after setup attempt
299
- config = pai.config.get()
300
- if not config.llm:
301
- raise RuntimeError("PandasAI LLM could not be configured for chat operation.")
302
-
303
- logger.info(f"Executing PandasAI chat for follower analysis with LLM: {config.llm}")
304
- return pandas_ai_df.chat(analysis_query)
305
-
306
- analysis_result_raw = self.retry_mechanism.retry_with_backoff(
307
- func=chat_operation, max_retries=2, base_delay=2.0, exceptions=(Exception,)
308
- )
309
- analysis_result_text = str(analysis_result_raw) if analysis_result_raw else "No textual analysis for mentions generated by PandasAI."
310
- logger.info("Mentions analysis via PandasAI completed.")
311
-
312
- except Exception as e:
313
- logger.error(f"Mentions analysis with PandasAI failed: {e}", exc_info=True)
314
- analysis_result_text = f"Mentions analysis using PandasAI failed. Error: {str(e)[:200]}"
315
-
316
- # 3. Extract structured metrics
317
- time_series_metrics = self._extract_time_series_metrics(df_processed)
318
- aggregate_metrics = self._calculate_aggregate_metrics(df_processed)
319
- categorical_metrics = self._extract_categorical_metrics(df_processed)
320
- time_periods = self._extract_time_periods(df_processed)
321
-
322
- return AgentMetrics(
323
- agent_name=self.AGENT_NAME,
324
- analysis_summary=analysis_result_text[:2000],
325
- time_series_metrics=time_series_metrics,
326
- aggregate_metrics=aggregate_metrics,
327
- categorical_metrics=categorical_metrics,
328
- time_periods_covered=time_periods,
329
- data_sources_used=[f"mentions_df (shape: {mentions_df.shape}) -> df_processed (shape: {df_processed.shape})"]
330
- )
331
-
332
- if __name__ == '__main__':
333
- try:
334
- from utils.logging_config import setup_logging
335
- setup_logging()
336
- logger.info("Logging setup for EnhancedMentionsAnalysisAgent test.")
337
- except ImportError:
338
- logging.basicConfig(level=logging.INFO)
339
- logger.warning("Could not import setup_logging. Using basicConfig.")
340
-
341
- MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_mentions")
342
- MODEL_NAME = DEFAULT_AGENT_MODEL
343
-
344
- try:
345
- from utils.pandasai_setup import configure_pandasai
346
- if MOCK_API_KEY != "test_api_key_mentions":
347
- configure_pandasai(MOCK_API_KEY, MODEL_NAME)
348
- logger.info("PandasAI configured for testing EnhancedMentionsAnalysisAgent.")
349
- else:
350
- logger.warning("Using mock API key for mentions. PandasAI chat will likely fail or use a mock.")
351
- class MockPandasAIDataFrame:
352
- def __init__(self, df, description): self.df = df; self.description = description
353
- def chat(self, query): return f"Mock PandasAI mentions response to: {query}"
354
- pai.DataFrame = MockPandasAIDataFrame
355
- except ImportError:
356
- logger.error("utils.pandasai_setup not found. PandasAI will not be configured for mentions.")
357
- class MockPandasAIDataFrame:
358
- def __init__(self, df, description): self.df = df; self.description = description
359
- def chat(self, query): return f"Mock PandasAI mentions response to: {query}"
360
- pai.DataFrame = MockPandasAIDataFrame
361
-
362
-
363
- sample_mentions_data = {
364
- 'date': pd.to_datetime(['2023-01-05', '2023-01-15', '2023-02-02', '2023-02-20', '2023-03-10', '2023-03-12']),
365
- 'sentiment_label': ['Positive 👍', 'Negative 👎', 'Neutral 😐', 'Positive 👍', 'Positive 👍', 'Unknown'],
366
- # 'mention_content': ['Great product!', 'Service was slow.', 'Just a mention.', 'Love the new feature!', 'Highly recommend.', 'Seen this around.']
367
- }
368
- sample_df_mentions = pd.DataFrame(sample_mentions_data)
369
-
370
- mentions_agent = EnhancedMentionsAnalysisAgent(api_key=MOCK_API_KEY, model_name=MODEL_NAME)
371
-
372
- logger.info("Analyzing sample mentions data...")
373
- mentions_metrics_result = mentions_agent.analyze_mentions_data(sample_df_mentions)
374
-
375
- print("\n--- EnhancedMentionsAnalysisAgent Results ---")
376
- print(f"Agent Name: {mentions_metrics_result.agent_name}")
377
- print(f"Analysis Summary: {mentions_metrics_result.analysis_summary}")
378
- print("\nTime Series Metrics (Mentions):")
379
- for ts_metric in mentions_metrics_result.time_series_metrics:
380
- print(f" - {ts_metric.metric_name}: {len(ts_metric.values)} data points, e.g., {ts_metric.values[:3]} for ts {ts_metric.timestamps[:3]} (Unit: {ts_metric.unit})")
381
- print("\nAggregate Metrics (Mentions):")
382
- for key, value in mentions_metrics_result.aggregate_metrics.items():
383
- print(f" - {key}: {value}")
384
- print("\nCategorical Metrics (Mentions):")
385
- for key, value in mentions_metrics_result.categorical_metrics.items():
386
- print(f" - {key}:")
387
- if isinstance(value, dict):
388
- for sub_key, sub_value in list(value.items())[:2]: # Print first 2 for brevity
389
- print(f" - {sub_key}: {sub_value}")
390
- else:
391
- print(f" {value}")
392
- print(f"\nTime Periods Covered (Mentions): {mentions_metrics_result.time_periods_covered}")
393
-
394
- # Test with empty DataFrame
395
- logger.info("\n--- Testing Mentions Agent with empty DataFrame ---")
396
- empty_mentions_metrics = mentions_agent.analyze_mentions_data(pd.DataFrame())
397
- print(f"Empty Mentions DF Analysis Summary: {empty_mentions_metrics.analysis_summary}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/agents/post_agent.py DELETED
@@ -1,538 +0,0 @@
1
- # agents/post_agent.py
2
- import pandas as pd
3
- from typing import Dict, List, Any, Optional
4
- import logging
5
- import pandasai as pai # Assuming pandasai is imported as pai globally or configured
6
-
7
- from google.adk.agents import LlmAgent # Assuming this is the correct import path
8
-
9
- # Project-specific imports
10
- from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism
11
- from features.insight_and_tasks.data_models.metrics import AgentMetrics, TimeSeriesMetric
12
-
13
- # Configure logger for this module
14
- logger = logging.getLogger(__name__)
15
-
16
- DEFAULT_AGENT_MODEL = "gemini-2.5-flash-preview-05-20"
17
-
18
- class EnhancedPostPerformanceAgent:
19
- """
20
- Enhanced post performance agent with time-series metric extraction and detailed analysis.
21
- """
22
- AGENT_NAME = "post_analyst"
23
- AGENT_DESCRIPTION = "Expert analyst specializing in content performance trends and engagement patterns."
24
- AGENT_INSTRUCTION = """
25
- You are a specialized LinkedIn content performance expert focused on temporal engagement patterns,
26
- content type effectiveness, and audience interaction.
27
-
28
- Your role includes:
29
-
30
- 1. ENGAGEMENT TREND ANALYSIS (monthly, using 'published_at'):
31
- - Analyze trends for key engagement metrics: likes, comments, shares, overall engagement ('engagement' column), impressions, clicks.
32
- - Calculate and analyze engagement rate (engagement / impressionCount) over time.
33
- - Calculate and analyze click-through rate (CTR: clickCount / impressionCount) over time.
34
- - Identify periods of high/low engagement and potential drivers.
35
-
36
- 2. CONTENT TYPE & TOPIC PERFORMANCE:
37
- - Compare performance across different media types (using 'media_type' column).
38
- - Analyze performance by content topic/pillar (using 'li_eb_label' column).
39
- - Identify which content types/topics drive the most engagement, impressions, or clicks.
40
- - Analyze if the effectiveness of certain media types or topics changes over time.
41
-
42
- 3. POSTING BEHAVIOR ANALYSIS:
43
- - Analyze posting frequency (e.g., posts per week/month) and its potential impact on overall engagement or reach.
44
- - Identify if there are optimal posting times or days based on engagement patterns (if 'published_at' provides time detail).
45
-
46
- 4. SENTIMENT ANALYSIS (if 'sentiment' column is available):
47
- - Analyze the distribution of sentiment (e.g., positive, negative, neutral) associated with posts.
48
- - Track how average sentiment of posts evolves over time.
49
-
50
- 5. AD PERFORMANCE (if 'is_ad' column is available):
51
- - Compare performance (engagement, reach, CTR) of ad posts vs. organic posts.
52
-
53
- 6. METRIC EXTRACTION (for AgentMetrics):
54
- - Extract time-series data for average monthly engagement metrics (likes, comments, engagement rate, CTR, etc.).
55
- - Provide aggregate performance metrics (e.g., overall average engagement rate, total impressions, top performing media type).
56
- - Include distributions for content types, topics, and sentiment as categorical metrics.
57
-
58
- Focus on actionable insights. What content resonates most? When is the audience most active? How can strategy be improved?
59
- Structure your analysis clearly. Use the provided DataFrame columns ('published_at', 'media_type', 'li_eb_label',
60
- 'likeCount', 'commentCount', 'shareCount', 'engagement', 'impressionCount', 'clickCount', 'sentiment', 'is_ad').
61
- """
62
-
63
- def __init__(self, api_key: str, model_name: Optional[str] = None):
64
- self.api_key = api_key
65
- self.model_name = model_name or DEFAULT_AGENT_MODEL
66
- self.agent = LlmAgent(
67
- name=self.AGENT_NAME,
68
- model=self.model_name,
69
- description=self.AGENT_DESCRIPTION,
70
- instruction=self.AGENT_INSTRUCTION
71
- )
72
- self.retry_mechanism = RetryMechanism()
73
- logger.info(f"{self.AGENT_NAME} initialized with model {self.model_name}.")
74
-
75
- def _preprocess_post_data(self, df: pd.DataFrame) -> pd.DataFrame:
76
- """Cleans and prepares post data for analysis."""
77
- if df is None or df.empty:
78
- return pd.DataFrame()
79
-
80
- df_processed = df.copy()
81
-
82
- # Convert 'published_at' to datetime
83
- if 'published_at' in df_processed.columns:
84
- df_processed['published_at'] = pd.to_datetime(df_processed['published_at'], errors='coerce')
85
- # df_processed.dropna(subset=['published_at'], inplace=True) # Keep rows even if date is NaT for other metrics
86
- else:
87
- logger.warning("'published_at' column not found. Time-series analysis will be limited.")
88
- # Add a placeholder if critical for downstream, or handle absence gracefully
89
- # df_processed['published_at'] = pd.NaT
90
-
91
- # Ensure numeric types for engagement metrics, coercing errors and filling NaNs
92
- metric_cols = ['likeCount', 'commentCount', 'shareCount', 'engagement', 'impressionCount', 'clickCount']
93
- for col in metric_cols:
94
- if col in df_processed.columns:
95
- df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce').fillna(0)
96
- else:
97
- logger.info(f"Metric column '{col}' not found in post data. Will be treated as 0.")
98
- df_processed[col] = 0 # Add column with zeros if missing
99
-
100
- # Calculate Engagement Rate and CTR where possible
101
- if 'impressionCount' in df_processed.columns and 'engagement' in df_processed.columns:
102
- df_processed['engagement_rate'] = df_processed.apply(
103
- lambda row: (row['engagement'] / row['impressionCount']) if row['impressionCount'] > 0 else 0.0, axis=1
104
- )
105
- else:
106
- df_processed['engagement_rate'] = 0.0
107
-
108
- if 'impressionCount' in df_processed.columns and 'clickCount' in df_processed.columns:
109
- df_processed['ctr'] = df_processed.apply(
110
- lambda row: (row['clickCount'] / row['impressionCount']) if row['impressionCount'] > 0 else 0.0, axis=1
111
- )
112
- else:
113
- df_processed['ctr'] = 0.0
114
-
115
- # Handle 'is_ad' boolean conversion if it exists
116
- if 'is_ad' in df_processed.columns:
117
- df_processed['is_ad'] = df_processed['is_ad'].astype(bool)
118
- else:
119
- df_processed['is_ad'] = False # Assume organic if not specified
120
-
121
- # Handle 'sentiment' - ensure it's string, fill NaNs
122
- if 'sentiment' in df_processed.columns:
123
- df_processed['sentiment'] = df_processed['sentiment'].astype(str).fillna('Unknown')
124
- else:
125
- df_processed['sentiment'] = 'Unknown'
126
-
127
- # Handle 'media_type' and 'li_eb_label' - ensure string, fill NaNs
128
- for col in ['media_type', 'li_eb_label']:
129
- if col in df_processed.columns:
130
- df_processed[col] = df_processed[col].astype(str).fillna('Unknown')
131
- else:
132
- df_processed[col] = 'Unknown'
133
-
134
- return df_processed
135
-
136
- def _extract_time_series_metrics(self, df_processed: pd.DataFrame) -> List[TimeSeriesMetric]:
137
- """Extracts monthly time-series metrics from processed post data."""
138
- ts_metrics = []
139
- if df_processed.empty or 'published_at' not in df_processed.columns or df_processed['published_at'].isnull().all():
140
- logger.info("Cannot extract time-series metrics for posts: 'published_at' is missing or all null.")
141
- return ts_metrics
142
-
143
- # Filter out rows where 'published_at' is NaT for time-series aggregation
144
- df_ts = df_processed.dropna(subset=['published_at']).copy()
145
- if df_ts.empty:
146
- logger.info("No valid 'published_at' dates for post time-series metrics after filtering NaT.")
147
- return ts_metrics
148
-
149
- df_ts['year_month'] = df_ts['published_at'].dt.strftime('%Y-%m')
150
-
151
- # Metrics to average monthly
152
- metrics_to_agg = {
153
- 'likeCount': 'mean', 'commentCount': 'mean', 'shareCount': 'mean',
154
- 'engagement': 'mean', 'impressionCount': 'mean', 'clickCount': 'mean',
155
- 'engagement_rate': 'mean', 'ctr': 'mean'
156
- }
157
- # Filter out metrics not present in the DataFrame
158
- available_metrics_to_agg = {k: v for k, v in metrics_to_agg.items() if k in df_ts.columns}
159
-
160
- if not available_metrics_to_agg:
161
- logger.info("No standard engagement metric columns found for time-series aggregation.")
162
- else:
163
- monthly_stats = df_ts.groupby('year_month').agg(available_metrics_to_agg).reset_index()
164
- timestamps = monthly_stats['year_month'].tolist()
165
-
166
- for metric_col, agg_type in available_metrics_to_agg.items():
167
- # Use original column name, or a more descriptive one like "avg_monthly_likes"
168
- ts_metrics.append(TimeSeriesMetric(
169
- metric_name=f"avg_monthly_{metric_col.lower()}",
170
- values=monthly_stats[metric_col].fillna(0).tolist(),
171
- timestamps=timestamps,
172
- metric_type="time_series",
173
- time_granularity="monthly",
174
- unit="%" if "_rate" in metric_col or "ctr" in metric_col else "count"
175
- ))
176
-
177
- # Time series for sentiment distribution (count of posts by sentiment per month)
178
- if 'sentiment' in df_ts.columns and df_ts['sentiment'].nunique() > 1: # if varied sentiment data exists
179
- # Ensure 'sentiment' is not all 'Unknown'
180
- if not (df_ts['sentiment'] == 'Unknown').all():
181
- sentiment_by_month = df_ts.groupby(['year_month', 'sentiment']).size().unstack(fill_value=0)
182
- for sentiment_value in sentiment_by_month.columns:
183
- if sentiment_value == 'Unknown' and (sentiment_by_month[sentiment_value] == 0).all():
184
- continue # Skip if 'Unknown' sentiment has no posts
185
- ts_metrics.append(TimeSeriesMetric(
186
- metric_name=f"monthly_post_count_sentiment_{str(sentiment_value).lower().replace(' ', '_')}",
187
- values=sentiment_by_month[sentiment_value].tolist(),
188
- timestamps=sentiment_by_month.index.tolist(), # year_month is the index
189
- metric_type="time_series",
190
- time_granularity="monthly",
191
- unit="count"
192
- ))
193
- else:
194
- logger.info("Sentiment data is all 'Unknown', skipping sentiment time series.")
195
-
196
- # Time series for post count
197
- monthly_post_counts = df_ts.groupby('year_month').size().reset_index(name='post_count')
198
- if not monthly_post_counts.empty:
199
- ts_metrics.append(TimeSeriesMetric(
200
- metric_name="monthly_post_count",
201
- values=monthly_post_counts['post_count'].tolist(),
202
- timestamps=monthly_post_counts['year_month'].tolist(),
203
- metric_type="time_series",
204
- time_granularity="monthly",
205
- unit="count"
206
- ))
207
-
208
- return ts_metrics
209
-
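- # Example output (hypothetical data): two months of posts would yield entries such as
- #   TimeSeriesMetric(metric_name="monthly_post_count", values=[2, 3],
- #                    timestamps=["2023-01", "2023-02"], time_granularity="monthly", unit="count")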
210
- def _calculate_aggregate_metrics(self, df_processed: pd.DataFrame) -> Dict[str, Any]:
211
- """Calculates aggregate performance metrics for posts."""
212
- agg_metrics = {}
213
- if df_processed.empty:
214
- return agg_metrics
215
-
216
- # Overall averages and totals
217
- metric_cols_for_agg = ['likeCount', 'commentCount', 'shareCount', 'engagement',
218
- 'impressionCount', 'clickCount', 'engagement_rate', 'ctr']
219
- for col in metric_cols_for_agg:
220
- if col in df_processed.columns and pd.api.types.is_numeric_dtype(df_processed[col]):
221
- agg_metrics[f'overall_avg_{col.lower()}'] = float(df_processed[col].mean())
222
- if col not in ['engagement_rate', 'ctr']: # Totals make sense for counts
223
- agg_metrics[f'overall_total_{col.lower()}'] = float(df_processed[col].sum())
224
-
225
- agg_metrics['total_posts_analyzed'] = float(len(df_processed))
226
-
227
- # Posting frequency (posts per week)
228
- if 'published_at' in df_processed.columns and not df_processed['published_at'].isnull().all():
229
- df_dated = df_processed.dropna(subset=['published_at']).sort_values('published_at')
230
- if len(df_dated) > 1:
231
- # Calculate total duration in days
232
- duration_days = (df_dated['published_at'].max() - df_dated['published_at'].min()).days
233
- if duration_days > 0:
234
- agg_metrics['avg_posts_per_week'] = float(len(df_dated) / (duration_days / 7.0))
235
- else: # duration_days == 0: all posts fell within a single day
236
- agg_metrics['avg_posts_per_week'] = float(len(df_dated) * 7) # Extrapolate
237
- elif len(df_dated) == 1:
238
- agg_metrics['avg_posts_per_week'] = 7.0 # One post, extrapolate to 7 per week
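- # Worked example (hypothetical): 10 posts spanning 35 days -> 10 / (35 / 7.0) = 2.0 posts/week.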
239
-
240
- # Performance by media type and topic (as tables/structured dicts)
241
- agg_metrics['performance_by_media_type'] = self._create_performance_table(df_processed, 'media_type')
242
- agg_metrics['performance_by_topic'] = self._create_performance_table(df_processed, 'li_eb_label')
243
-
244
- return agg_metrics
245
-
246
- def _create_performance_table(self, df: pd.DataFrame, group_column: str) -> Dict[str, Any]:
247
- """Helper to create a structured performance table for categorical analysis."""
248
- if group_column not in df.columns or df[group_column].isnull().all() or (df[group_column] == 'Unknown').all():
249
- return {"message": f"No data or only 'Unknown' values for grouping by {group_column}."}
250
-
251
- # Filter out 'Unknown' category if it's the only one or for cleaner tables
252
- df_filtered = df[df[group_column] != 'Unknown']
253
- if df_filtered.empty: # If filtering 'Unknown' leaves no data, use original df but acknowledge
254
- df_filtered = df
255
- logger.info(f"Performance table for {group_column} includes 'Unknown' as it's the only/main category.")
256
-
257
- # Define metrics to aggregate
258
- agg_config = {
259
- 'engagement': 'mean',
260
- 'impressionCount': 'mean',
261
- 'clickCount': 'mean',
262
- 'likeCount': 'mean',
263
- 'commentCount': 'mean',
264
- 'shareCount': 'mean',
265
- 'engagement_rate': 'mean',
266
- 'ctr': 'mean',
267
- 'published_at': 'count' # To get number of posts per category
268
- }
269
- # Filter config for columns that actually exist in df_filtered
270
- valid_agg_config = {k: v for k, v in agg_config.items() if k in df_filtered.columns or k == 'published_at'} # 'published_at' for count
271
-
272
- if not valid_agg_config or 'published_at' not in valid_agg_config: # Need at least one metric or the post count
273
- return {"message": f"Not enough relevant metric columns to create performance table for {group_column}."}
274
-
275
-
276
- try:
277
- # Group by the specified column and aggregate
278
- # Rename 'published_at' count to 'num_posts' for clarity
279
- grouped = df_filtered.groupby(group_column).agg(valid_agg_config).rename(
280
- columns={'published_at': 'num_posts'}
281
- ).reset_index()
282
-
283
- # Sort by a primary engagement metric, e.g., average engagement rate or num_posts
284
- sort_key = 'num_posts'
285
- if 'engagement_rate' in grouped.columns:
286
- sort_key = 'engagement_rate'
287
- elif 'engagement' in grouped.columns:
288
- sort_key = 'engagement'
289
-
290
- grouped = grouped.sort_values(by=sort_key, ascending=False)
291
-
292
- # Prepare for JSON serializable output
293
- table_data = []
294
- for _, row in grouped.iterrows():
295
- row_dict = {'category': row[group_column]}
296
- for col in grouped.columns:
297
- if col == group_column: continue
298
- value = row[col]
299
- if isinstance(value, (int, float)):
300
- if "_rate" in col or "ctr" in col:
301
- row_dict[col] = f"{value:.2%}" # Percentage
302
- else:
303
- row_dict[col] = round(value, 2) if isinstance(value, float) else value
304
- else:
305
- row_dict[col] = str(value)
306
- table_data.append(row_dict)
307
-
308
- return {
309
- "grouping_column": group_column,
310
- "columns_reported": [col for col in grouped.columns.tolist() if col != group_column],
311
- "data": table_data,
312
- "note": f"Top categories by {sort_key}."
313
- }
314
-
315
- except Exception as e:
316
- logger.error(f"Error creating performance table for {group_column}: {e}", exc_info=True)
317
- return {"error": f"Could not generate table for {group_column}: {e}"}
318
-
319
-
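- # Illustrative return shape for _create_performance_table (numbers are hypothetical):
- #   {"grouping_column": "media_type",
- #    "columns_reported": ["engagement", "engagement_rate", "num_posts"],
- #    "data": [{"category": "VIDEO", "engagement_rate": "8.50%", "num_posts": 12}],
- #    "note": "Top categories by engagement_rate."}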
320
- def _extract_categorical_metrics(self, df_processed: pd.DataFrame) -> Dict[str, Any]:
321
- """Extracts distributions and other categorical insights for posts."""
322
- cat_metrics = {}
323
- if df_processed.empty:
324
- return cat_metrics
325
-
326
- # Media type distribution
327
- if 'media_type' in df_processed.columns and df_processed['media_type'].nunique() > 0:
328
- cat_metrics['media_type_distribution'] = df_processed['media_type'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}").to_dict()
329
- cat_metrics['media_type_counts'] = df_processed['media_type'].value_counts().to_dict()
330
-
331
-
332
- # Topic distribution (li_eb_label)
333
- if 'li_eb_label' in df_processed.columns and df_processed['li_eb_label'].nunique() > 0:
334
- cat_metrics['topic_distribution'] = df_processed['li_eb_label'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}").to_dict()
335
- cat_metrics['topic_counts'] = df_processed['li_eb_label'].value_counts().to_dict()
336
-
337
- # Sentiment distribution
338
- if 'sentiment' in df_processed.columns and df_processed['sentiment'].nunique() > 0:
339
- cat_metrics['sentiment_distribution'] = df_processed['sentiment'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}").to_dict()
340
- cat_metrics['sentiment_counts'] = df_processed['sentiment'].value_counts().to_dict()
341
-
342
- # Ad vs. Organic performance summary
343
- if 'is_ad' in df_processed.columns:
344
- ad_summary = {}
345
- for ad_status in [True, False]:
346
- subset = df_processed[df_processed['is_ad'] == ad_status]
347
- if not subset.empty:
348
- label = "ad" if ad_status else "organic"
349
- ad_summary[f'{label}_post_count'] = int(len(subset))
350
- ad_summary[f'{label}_avg_engagement_rate'] = float(subset['engagement_rate'].mean())
351
- ad_summary[f'{label}_avg_impressions'] = float(subset['impressionCount'].mean())
352
- ad_summary[f'{label}_avg_ctr'] = float(subset['ctr'].mean())
353
- if ad_summary:
354
- cat_metrics['ad_vs_organic_summary'] = ad_summary
355
-
356
- return cat_metrics
357
-
358
- def _extract_time_periods(self, df_processed: pd.DataFrame) -> List[str]:
359
- """Extracts unique year-month time periods covered by the post data."""
360
- if df_processed.empty or 'published_at' not in df_processed.columns or df_processed['published_at'].isnull().all():
361
- return ["Data period not available or N/A"]
362
-
363
- # Use already created 'year_month' if available from preprocessing, or derive it
364
- if 'year_month' in df_processed.columns:
365
- periods = sorted(df_processed['year_month'].dropna().unique().tolist(), reverse=True)
366
- elif 'published_at' in df_processed.columns: # Derive if not present
367
- dates = df_processed['published_at'].dropna()
368
- if not dates.empty:
369
- periods = sorted(dates.dt.strftime('%Y-%m').unique().tolist(), reverse=True)
370
- else: return ["N/A"]
371
- else: return ["N/A"]
372
-
373
- return periods[:12] # Return up to the last 12 months
374
-
375
- def analyze_post_data(self, post_df: pd.DataFrame) -> AgentMetrics:
376
- """
377
- Generates comprehensive post performance analysis.
378
- """
379
- if post_df is None or post_df.empty:
380
- logger.warning("Post DataFrame is empty. Returning empty metrics.")
381
- return AgentMetrics(
382
- agent_name=self.AGENT_NAME,
383
- analysis_summary="No post data provided for analysis.",
384
- time_periods_covered=["N/A"]
385
- )
386
-
387
- # 1. Preprocess data
388
- df_processed = self._preprocess_post_data(post_df)
389
- if df_processed.empty and not post_df.empty: # Preprocessing resulted in an empty DataFrame
390
- logger.warning("Post DataFrame became empty after preprocessing. Original data might have issues.")
391
- return AgentMetrics(
392
- agent_name=self.AGENT_NAME,
393
- analysis_summary="Post data could not be processed (e.g., all dates invalid).",
394
- time_periods_covered=["N/A"]
395
- )
396
- elif df_processed.empty and post_df.empty: # Was already empty
397
- # This case is handled by the initial check, but as a safeguard:
398
- return AgentMetrics(agent_name=self.AGENT_NAME, analysis_summary="No post data provided.")
399
-
400
-
401
- # 2. Generate textual analysis using PandasAI (similar to follower agent)
402
- df_description_for_pandasai = "LinkedIn post performance data. Key columns: 'published_at' (date of post), 'media_type' (e.g., IMAGE, VIDEO, ARTICLE), 'li_eb_label' (content topic/pillar), 'likeCount', 'commentCount', 'shareCount', 'engagement' (sum of reactions, comments, shares), 'impressionCount', 'clickCount', 'sentiment' (post sentiment), 'is_ad' (boolean), 'engagement_rate', 'ctr'."
403
-
404
- analysis_result_text = "PandasAI analysis for posts could not be performed."
405
- try:
406
- # Ensure PandasAI is configured
407
- pandas_ai_df = pai.DataFrame(df_processed, description=df_description_for_pandasai)
408
-
409
- analysis_query = f"""
410
- Analyze the provided LinkedIn post performance data. Focus on:
411
- 1. Monthly trends for key metrics (engagement, impressions, engagement rate, CTR).
412
- 2. Performance comparison by 'media_type' and 'li_eb_label'. Which ones are most effective?
413
- 3. Impact of posting frequency (if derivable from 'published_at' timestamps).
414
- 4. Sentiment trends and distribution.
415
- 5. Differences in performance between ad posts ('is_ad'=True) and organic posts.
416
- Provide a concise summary of findings and actionable recommendations.
417
- """
418
- def chat_operation():
419
- config = pai.config.get()
420
- logger.info(f"pai_config: {config}, Type of config: {type(config)}")
421
- if not config.llm:
422
- logger.warning("PandasAI LLM not configured. Attempting to configure now.")
423
- # This assumes configure_pandasai is available and sets the LLM config
424
- from features.insight_and_tasks.utils.pandasai_setup import configure_pandasai
425
- configure_pandasai(self.api_key, self.model_name)
426
-
427
- # Re-check configuration after setup attempt
428
- config = pai.config.get()
429
- if not config.llm:
430
- raise RuntimeError("PandasAI LLM could not be configured for chat operation.")
431
-
432
- logger.info(f"Executing PandasAI chat for follower analysis with LLM: {config.llm}")
433
- return pandas_ai_df.chat(analysis_query)
434
-
435
- analysis_result_raw = self.retry_mechanism.retry_with_backoff(
436
- func=chat_operation, max_retries=2, base_delay=2.0, exceptions=(Exception,)
437
- )
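- # Sketch of the retry contract assumed here (the real logic lives in
- # utils/retry_mechanism.py and may differ, e.g., adding jitter or logging):
- #   for attempt in range(max_retries + 1):
- #       try: return func()
- #       except exceptions: time.sleep(base_delay * (2 ** attempt))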
438
- analysis_result_text = str(analysis_result_raw) if analysis_result_raw else "No textual analysis for posts generated by PandasAI."
439
- logger.info("Post performance analysis via PandasAI completed.")
440
-
441
- except Exception as e:
442
- logger.error(f"Post analysis with PandasAI failed: {e}", exc_info=True)
443
- analysis_result_text = f"Post analysis using PandasAI failed. Error: {str(e)[:200]}"
444
-
445
- # 3. Extract structured metrics
446
- time_series_metrics = self._extract_time_series_metrics(df_processed)
447
- aggregate_metrics = self._calculate_aggregate_metrics(df_processed)
448
- categorical_metrics = self._extract_categorical_metrics(df_processed)
449
- time_periods = self._extract_time_periods(df_processed)
450
-
451
- return AgentMetrics(
452
- agent_name=self.AGENT_NAME,
453
- analysis_summary=analysis_result_text[:2000],
454
- time_series_metrics=time_series_metrics,
455
- aggregate_metrics=aggregate_metrics,
456
- categorical_metrics=categorical_metrics,
457
- time_periods_covered=time_periods,
458
- data_sources_used=[f"post_df (shape: {post_df.shape}) -> df_processed (shape: {df_processed.shape})"]
459
- )
460
-
461
- if __name__ == '__main__':
- import os  # used by os.environ.get below
462
- try:
463
- from features.insight_and_tasks.utils.logging_config import setup_logging
464
- setup_logging()
465
- logger.info("Logging setup for EnhancedPostPerformanceAgent test.")
466
- except ImportError:
467
- logging.basicConfig(level=logging.INFO)
468
- logger.warning("Could not import setup_logging. Using basicConfig.")
469
-
470
- MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_posts")
471
- MODEL_NAME = DEFAULT_AGENT_MODEL
472
-
473
- try:
474
- from features.insight_and_tasks.utils.pandasai_setup import configure_pandasai
475
- if MOCK_API_KEY != "test_api_key_posts":
476
- configure_pandasai(MOCK_API_KEY, MODEL_NAME)
477
- logger.info("PandasAI configured for testing EnhancedPostPerformanceAgent.")
478
- else:
479
- logger.warning("Using mock API key for posts. PandasAI chat will likely fail or use a mock.")
480
- class MockPandasAIDataFrame:
481
- def __init__(self, df, description): self.df = df; self.description = description
482
- def chat(self, query): return f"Mock PandasAI post response to: {query}"
483
- pai.DataFrame = MockPandasAIDataFrame
484
- except ImportError:
485
- logger.error("utils.pandasai_setup not found. PandasAI will not be configured for posts.")
486
- class MockPandasAIDataFrame:
487
- def __init__(self, df, description): self.df = df; self.description = description
488
- def chat(self, query): return f"Mock PandasAI post response to: {query}"
489
- pai.DataFrame = MockPandasAIDataFrame
490
-
491
- sample_post_data = {
492
- 'published_at': pd.to_datetime(['2023-01-15', '2023-01-20', '2023-02-10', '2023-02-25', '2023-03-05', None]),
493
- 'media_type': ['IMAGE', 'VIDEO', 'IMAGE', 'ARTICLE', 'IMAGE', 'IMAGE'],
494
- 'li_eb_label': ['Product Update', 'Company Culture', 'Product Update', 'Industry Insights', 'Company Culture', 'Product Update'],
495
- 'likeCount': [100, 150, 120, 80, 200, 50],
496
- 'commentCount': [10, 20, 15, 5, 25, 3],
497
- 'shareCount': [5, 10, 8, 2, 12, 1],
498
- 'engagement': [115, 180, 143, 87, 237, 54], # Sum of likes, comments, shares
499
- 'impressionCount': [1000, 1500, 1200, 900, 2000, 600],
500
- 'clickCount': [50, 70, 60, 30, 90, 20],
501
- 'sentiment': ['Positive 👍', 'Positive 👍', 'Neutral 😐', 'Positive 👍', 'Negative 👎', 'Positive 👍'],
502
- 'is_ad': [False, False, True, False, False, True]
503
- }
504
- sample_df_posts = pd.DataFrame(sample_post_data)
505
-
506
- post_agent = EnhancedPostPerformanceAgent(api_key=MOCK_API_KEY, model_name=MODEL_NAME)
507
-
508
- logger.info("Analyzing sample post data...")
509
- post_metrics_result = post_agent.analyze_post_data(sample_df_posts)
510
-
511
- print("\n--- EnhancedPostPerformanceAgent Results ---")
512
- print(f"Agent Name: {post_metrics_result.agent_name}")
513
- print(f"Analysis Summary: {post_metrics_result.analysis_summary}")
514
- print("\nTime Series Metrics (Post):")
515
- for ts_metric in post_metrics_result.time_series_metrics:
516
- print(f" - {ts_metric.metric_name}: {len(ts_metric.values)} data points, e.g., {ts_metric.values[:3]} for ts {ts_metric.timestamps[:3]} (Unit: {ts_metric.unit})")
517
- print("\nAggregate Metrics (Post):")
518
- for key, value in post_metrics_result.aggregate_metrics.items():
519
- if isinstance(value, dict) and 'data' in value: # Performance table
520
- print(f" - {key}: (Table - {value.get('grouping_column', '')}) - {len(value['data'])} categories")
521
- for item in value['data'][:1]: # Print first item for brevity
522
- print(f" Example Category '{item.get('category')}': { {k:v for k,v in item.items() if k!='category'} }")
523
- else:
524
- print(f" - {key}: {value}")
525
- print("\nCategorical Metrics (Post):")
526
- for key, value in post_metrics_result.categorical_metrics.items():
527
- print(f" - {key}:")
528
- if isinstance(value, dict):
529
- for sub_key, sub_value in list(value.items())[:2]:
530
- print(f" - {sub_key}: {sub_value}")
531
- else:
532
- print(f" {value}")
533
- print(f"\nTime Periods Covered (Post): {post_metrics_result.time_periods_covered}")
534
-
535
- # Test with empty DataFrame
536
- logger.info("\n--- Testing Post Agent with empty DataFrame ---")
537
- empty_post_metrics = post_agent.analyze_post_data(pd.DataFrame())
538
- print(f"Empty Post DF Analysis Summary: {empty_post_metrics.analysis_summary}")
features/insight_and_tasks/agents/task_extraction_agent.py DELETED
@@ -1,400 +0,0 @@
1
- # agents/task_extraction_agent.py
2
- import logging
3
- from typing import Optional
4
- from datetime import datetime, date # Ensure date is imported if used for type hints
5
-
6
- from google.adk.agents import LlmAgent
7
- from google.adk.runners import InMemoryRunner # Assuming this is used for direct agent running
8
- from google.genai import types as genai_types # For constructing ADK agent inputs
9
-
10
- # Project-specific imports
11
- from features.insight_and_tasks.data_models.tasks import (
12
- TaskExtractionOutput,
13
- OKR,
14
- KeyResult,
15
- Task,
16
- EffortLevel,
17
- TimelineCategory,
18
- PriorityLevel,
19
- TaskType,
20
- DataSubject # Ensure all are imported
21
- )
22
- from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism # If retries are needed for ADK calls
23
-
24
- # Configure logger for this module
25
- logger = logging.getLogger(__name__)
26
-
27
- DEFAULT_AGENT_MODEL = "gemini-2.5-flash-preview-05-20" # Or your specific model
28
-
29
- class TaskExtractionAgent:
30
- """
31
- Agent specialized in extracting actionable tasks and OKRs from analysis insights,
32
- with awareness of the current date and quarter.
33
- """
34
- AGENT_NAME = "task_extractor"
35
- AGENT_DESCRIPTION = "Specialist in converting strategic insights into specific, time-aware actionable tasks and OKRs."
36
-
37
- def __init__(self, api_key: str, model_name: Optional[str] = None, current_date: Optional[date] = None):
38
- """
39
- Initializes the TaskExtractionAgent.
40
- Args:
41
- api_key: API key (may be used by LlmAgent configuration or future needs).
42
- model_name: Name of the language model to use.
43
- current_date: The current date to use for quarter calculations. Defaults to today.
44
- """
45
- self.api_key = api_key # Store if needed by LlmAgent or other components
46
- self.model_name = model_name or DEFAULT_AGENT_MODEL
47
- self.current_date = current_date or datetime.utcnow().date() # Use date object for consistency
48
-
49
- # LlmAgent is initialized with dynamic instruction and output schema
50
- self.agent = LlmAgent(
51
- name=self.AGENT_NAME,
52
- model=self.model_name,
53
- description=self.AGENT_DESCRIPTION,
54
- instruction=self._get_instruction_prompt(), # Instruction generated dynamically
55
- output_schema=TaskExtractionOutput, # Pydantic model for structured output
56
- output_key="extracted_tasks_okrs" # Key where LlmAgent stores structured output in state
57
- )
58
- self.retry_mechanism = RetryMechanism() # For retrying ADK runner if needed
59
- logger.info(f"{self.AGENT_NAME} initialized for Q{self._get_quarter(self.current_date)}, "
60
- f"{self._days_until_quarter_end(self.current_date)} days remaining in quarter. Model: {self.model_name}")
61
-
62
- def _get_quarter(self, d: date) -> int:
63
- """Calculates the quarter for a given date."""
64
- return (d.month - 1) // 3 + 1
65
-
66
- def _days_until_quarter_end(self, d: date) -> int:
67
- """Calculates the number of days remaining in the current quarter from date d."""
68
- current_q = self._get_quarter(d)
69
- year = d.year
70
- if current_q == 1:
71
- quarter_end_date = date(year, 3, 31)
72
- elif current_q == 2:
73
- quarter_end_date = date(year, 6, 30)
74
- elif current_q == 3:
75
- quarter_end_date = date(year, 9, 30)
76
- else: # Quarter 4
77
- quarter_end_date = date(year, 12, 31)
78
-
79
- days_remaining = (quarter_end_date - d).days
80
- return max(0, days_remaining) # Ensure non-negative
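- # Worked example: d = date(2025, 4, 15) is in Q2; the quarter ends 2025-06-30, so 76 days remain.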
81
-
82
- def _get_instruction_prompt(self) -> str:
83
- """Generates the dynamic instruction string for the LLM agent."""
84
- quarter = self._get_quarter(self.current_date)
85
- days_remaining = self._days_until_quarter_end(self.current_date)
86
-
87
-
88
- return f"""
89
- You are a Time-Aware Task Extraction Specialist, an AI expert in meticulously analyzing strategic insights (e.g., from LinkedIn analytics) and transforming them into a structured set of actionable tasks, organized within an Objectives and Key Results (OKRs) framework.
90
-
91
- CURRENT CONTEXTUAL INFORMATION (CRITICAL - Use these exact values in your output where specified):
92
- - Current Quarter: Q{quarter}
93
- - Days remaining in current quarter: {days_remaining}
94
- - Today's Date (for your context only, not for direct output unless specified by a schema field): {self.current_date.isoformat()}
95
-
96
- For EACH 'OKR' object, you MUST generate a 'key_results' array containing 1 to 3 'KeyResult' objects.
97
- For EACH 'KeyResult' object, you MUST generate a 'tasks' array containing 1 to 3 'Task' objects.
98
- It is CRITICAL that you populate the 'key_results' list for every OKR, and the 'tasks' list for every KeyResult.
99
-
100
- KEY GUIDELINES FOR QUALITY AND ACCURACY:
101
- - Actionability: All descriptions (Objective, Key Result, Task) must be clear, concise, and define concrete actions or measurable outcomes.
102
- - Measurability: Key Results and Task 'success_criteria_metrics' must be specific and quantifiable.
103
- - Completeness: Ensure all REQUIRED fields in every Pydantic model are present in your JSON output. Optional fields can be omitted or set to null if not applicable.
104
-
105
- INPUT:
106
- You will receive a 'comprehensive_analysis' text.
107
-
108
- OUTPUT FORMAT:
109
- # Example of the overall JSON structure (content is illustrative; refer to schemas for full details):
110
- {{
111
- "current_quarter_info": "Q{quarter}, {days_remaining} days remaining",
112
- "okrs": [
113
- {{
114
- "objective_description": "Example: Elevate brand visibility and engagement across key digital channels.",
115
- "objective_timeline": "{TimelineCategory.SHORT_TERM.value}",
116
- "objective_owner": "Marketing Department",
117
- "key_results": [
118
- {{
119
- "key_result_description": "Example: Increase organic reach on LinkedIn by 15%.",
120
- "target_metric": "LinkedIn Organic Reach Percentage Increase",
121
- "target_value": "15%",
122
- "tasks": [
123
- {{
124
- "task_category": "Content Strategy",
125
- "task_description": "Develop and schedule a 4-week content calendar for LinkedIn focusing on industry insights.",
126
- "objective_deliverable": "Deliverable: A finalized 4-week content calendar with 3 posts per week, approved and scheduled.",
127
- "effort": "{EffortLevel.MEDIUM.value}",
128
- "timeline": "{TimelineCategory.IMMEDIATE.value}",
129
- "responsible_party": "Content Marketing Manager",
130
- "success_criteria_metrics": "Content calendar completed and approved by [Date]. All posts scheduled by [Date].",
131
- "dependencies_prerequisites": "Completion of Q{quarter} keyword research and audience persona refinement.",
132
- "priority": "{PriorityLevel.HIGH.value}",
133
- "priority_justification": "Critical for maintaining consistent brand voice and achieving engagement targets for the quarter.",
134
- "why_proposed": "Analysis of LinkedIn insights report (Page 3) showed a 20% drop in engagement last month, attributed to inconsistent posting schedule and lack of targeted content themes.",
135
- "task_type": "{TaskType.INITIATIVE.value}",
136
- "data_subject": "{DataSubject.POSTS.value}"
137
- }}
138
- ]
139
- }}
140
- ]
141
- }}
142
- ],
143
- "overall_strategic_focus": "Example: Focus on data-driven content strategy and proactive community engagement to boost Q{quarter} performance.",
144
- "generation_timestamp": "{datetime.utcnow().isoformat()}Z"
145
- }}
146
-
147
- Focus on precision, quality, actionability, and strict adherence to the specified JSON output schema and all constraints.
148
- Ensure all string values in the JSON are properly escaped if they contain special characters (e.g., newlines, quotes).
149
- """
150
-
151
- async def extract_tasks(self, comprehensive_analysis: str) -> TaskExtractionOutput:
152
- """
153
- Extracts time-aware actionable tasks from the comprehensive analysis text.
154
- Args:
155
- comprehensive_analysis: The text analysis from which to extract tasks.
156
- Returns:
157
- A TaskExtractionOutput Pydantic model instance.
158
- """
159
- if not comprehensive_analysis or not comprehensive_analysis.strip():
160
- logger.warning("Comprehensive analysis text is empty. Cannot extract tasks.")
161
- return TaskExtractionOutput(
162
- current_quarter_info=f"Q{self._get_quarter(self.current_date)}, {self._days_until_quarter_end(self.current_date)} days remaining",
163
- okrs=[],
164
- overall_strategic_focus="No analysis provided to extract tasks."
165
- )
166
-
167
- # The LlmAgent's instruction already contains the dynamic date info and output format.
168
- # The input to the agent's run method will be the comprehensive_analysis.
169
- prompt_for_adk_agent = f"""
170
- Comprehensive Analysis for Task Extraction:
171
- ---
172
- {comprehensive_analysis}
173
- ---
174
- Based on the analysis above, and adhering strictly to your primary instructions (especially regarding current quarter context, task field requirements, and JSON output schema 'TaskExtractionOutput'), generate the OKRs and tasks.
175
- Ensure the 'current_quarter_info' field in your JSON output is exactly: "Q{self._get_quarter(self.current_date)}, {self._days_until_quarter_end(self.current_date)} days remaining".
176
- """
177
-
178
- user_input_content = genai_types.Content(
179
- role="user",
180
- parts=[genai_types.Part(text=prompt_for_adk_agent)]
181
- )
182
-
183
- # Using InMemoryRunner as per original structure for LlmAgent with output_schema
184
- runner = InMemoryRunner(agent=self.agent, app_name=f"{self.AGENT_NAME}Runner")
185
- # Generate a unique user_id for each run to ensure fresh session state if needed.
186
- user_id = f"system_user_task_extractor_{int(datetime.utcnow().timestamp())}"
187
-
188
- session = await runner.session_service.create_session(
189
- app_name=f"{self.AGENT_NAME}Runner",
190
- user_id=user_id
191
- )
192
-
193
- extracted_data_dict = None
194
- full_response_text_for_debug = "" # To capture raw text if parsing fails
195
-
196
- try:
197
- logger.info(f"Running TaskExtractionAgent for user_id: {user_id}, session_id: {session.id}")
198
-
199
- # Fix: Use regular for loop instead of async for, since runner.run() returns a generator
200
- run_result = runner.run(
201
- user_id=user_id,
202
- session_id=session.id,
203
- new_message=user_input_content
204
- )
205
-
206
- # Check if it's an async iterator or regular generator
207
- if hasattr(run_result, '__aiter__'):
208
- # It's an async iterator, use async for
209
- async for event in run_result:
210
- if (hasattr(event, 'actions') and event.actions and
211
- hasattr(event.actions, 'state_delta') and
212
- isinstance(event.actions.state_delta, dict) and
213
- self.agent.output_key in event.actions.state_delta):
214
-
215
- extracted_data_dict = event.actions.state_delta[self.agent.output_key]
216
- logger.info(f"Successfully extracted structured data via LlmAgent state_delta.")
217
- break
218
-
219
- # Capture text parts for debugging if direct structured output isn't found first
220
- if hasattr(event, 'content') and event.content and event.content.parts:
221
- for part in event.content.parts:
222
- if hasattr(part, 'text'):
223
- full_response_text_for_debug += part.text
224
- else:
225
- # It's a regular generator, use regular for loop
226
- for event in run_result:
227
- if (hasattr(event, 'actions') and event.actions and
228
- hasattr(event.actions, 'state_delta') and
229
- isinstance(event.actions.state_delta, dict) and
230
- self.agent.output_key in event.actions.state_delta):
231
-
232
- extracted_data_dict = event.actions.state_delta[self.agent.output_key]
233
- logger.info(f"Successfully extracted structured data via LlmAgent state_delta.")
234
- break
235
-
236
- # Capture text parts for debugging if direct structured output isn't found first
237
- if hasattr(event, 'content') and event.content and event.content.parts:
238
- for part in event.content.parts:
239
- if hasattr(part, 'text'):
240
- full_response_text_for_debug += part.text
241
-
242
- if not extracted_data_dict and full_response_text_for_debug:
243
- logger.warning("LlmAgent did not produce structured output in state_delta. Raw text response was: %s",
244
- full_response_text_for_debug[:500] + "...")
245
-
246
- except Exception as e:
247
- logger.error(f"Error during TaskExtractionAgent execution: {e}", exc_info=True)
248
- finally:
249
- try:
250
- await runner.session_service.delete_session(
251
- app_name=f"{self.AGENT_NAME}Runner", user_id=user_id, session_id=session.id
252
- )
253
- except Exception as session_del_e:
254
- logger.error(f"Error deleting task extractor session: {session_del_e}")
255
-
256
- if extracted_data_dict:
257
- if isinstance(extracted_data_dict, TaskExtractionOutput): # Already a Pydantic model
258
- return extracted_data_dict
259
- elif isinstance(extracted_data_dict, dict): # If it's a dict, parse it
260
- try:
261
- return TaskExtractionOutput(**extracted_data_dict)
262
- except Exception as pydantic_error:
263
- logger.error(f"Error parsing extracted dictionary into TaskExtractionOutput: {pydantic_error}", exc_info=True)
264
- logger.error(f"Problematic dictionary data: {extracted_data_dict}")
265
- else:
266
- logger.error(f"Extracted data is not a dictionary or TaskExtractionOutput model: {type(extracted_data_dict)}")
267
-
268
- # Fallback if no valid data extracted
269
- logger.warning("No valid structured data extracted by TaskExtractionAgent.")
270
- return TaskExtractionOutput(
271
- current_quarter_info=f"Q{self._get_quarter(self.current_date)}, {self._days_until_quarter_end(self.current_date)} days remaining",
272
- okrs=[],
273
- overall_strategic_focus="Failed to extract tasks or no tasks were identified.",
274
- generation_timestamp=datetime.utcnow().isoformat()
275
- )
276
-
277
- def update_current_date(self, new_date: date):
278
- """
279
- Updates the current date for the agent and re-initializes the LlmAgent
280
- to reflect the new date context in its instructions.
281
- """
282
- self.current_date = new_date
283
- # Re-initialize the LlmAgent with the new instruction based on the new date
284
- self.agent = LlmAgent(
285
- name=self.AGENT_NAME,
286
- model=self.model_name,
287
- description=self.AGENT_DESCRIPTION,
288
- instruction=self._get_instruction_prompt(), # Get updated instruction
289
- output_schema=TaskExtractionOutput,
290
- output_key="extracted_tasks_okrs"
291
- )
292
- logger.info(f"{self.AGENT_NAME} date updated. New context: Q{self._get_quarter(self.current_date)}, "
293
- f"{self._days_until_quarter_end(self.current_date)} days remaining.")
294
-
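- # Usage sketch: task_agent.update_current_date(date(2025, 10, 5)) rebuilds the LlmAgent so its
- # instruction reports Q4 with 87 days remaining (Oct 5 -> Dec 31); see the __main__ demo below.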
295
-
296
- if __name__ == '__main__':
297
- import asyncio
- import os  # used by os.environ.get below
298
- # (Ensure logging_config.py is in the same directory or PYTHONPATH is set for this example to run standalone)
299
- try:
300
- from features.insight_and_tasks.utils.logging_config import setup_logging
301
- setup_logging()
302
- logger.info("Logging setup for TaskExtractionAgent test.")
303
- except ImportError:
304
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
305
- logger.warning("logging_config.py not found, using basicConfig for logging.")
306
-
307
- MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_task_extractor") # Use your actual key or env var
308
- MODEL_NAME = DEFAULT_AGENT_MODEL
309
-
310
- # Example comprehensive analysis text (replace with actual analysis output)
311
- sample_analysis_text = """
312
- Overall Summary: Follower growth is steady at 5% MoM. Post engagement is highest for video content
313
- (avg 8% engagement rate) published on weekdays. However, mentions sentiment dipped in the last month
314
- (-0.2 avg score) due to complaints about customer service response times.
315
- Key opportunity: Improve customer service communication and leverage video content more effectively.
316
- Strategic Recommendation: Launch a 'Customer First' initiative and create a video series showcasing customer success stories.
317
- """
318
-
319
- # Test with a specific date
320
- test_date = date(2025, 4, 15) # Example: Mid-Q2 2025
321
- task_agent = TaskExtractionAgent(api_key=MOCK_API_KEY, model_name=MODEL_NAME, current_date=test_date)
322
-
323
- logger.info(f"Task Agent Instruction for test_date {test_date}:\n{task_agent._get_instruction_prompt()[:500]}...")
324
-
325
- async def run_extraction():
326
- logger.info("Extracting tasks from sample analysis...")
327
- # In a real scenario, ensure GOOGLE_API_KEY is set if the LlmAgent makes actual calls.
328
- # For local tests without real API calls, the LlmAgent might behave as a mock or require specific test setup.
329
- if MOCK_API_KEY == "test_api_key_task_extractor":
330
- logger.warning("Using a mock API key. LlmAgent behavior might be limited or mocked for task extraction.")
331
- # Mock the runner if no real API call should be made
332
- class MockADKRunner:
333
- def __init__(self, agent, app_name): self.agent = agent
334
- async def session_service_create_session(self, app_name, user_id):
335
- class MockSession: id = "mock_session_id"
336
- return MockSession()
337
- async def run(self, user_id, session_id, new_message):
338
- # Simulate a response structure
339
- mock_okr = OKR(
340
- objective_description="Improve Customer Satisfaction",
341
- key_results=[KeyResult(
342
- key_result_description="Reduce negative mentions by 10%",
343
- tasks=[Task(
344
- task_category="Customer Service", task_description="Respond to all negative mentions within 2 hours.",
345
- objective_deliverable="Improved response time.", effort=EffortLevel.MEDIUM, timeline=TimelineCategory.IMMEDIATE,
346
- responsible_party="Support Team", success_criteria_metrics="Avg response time < 2hrs.",
347
- priority=PriorityLevel.HIGH, priority_justification="Critical for reputation.",
348
- why_proposed="Analysis showed dip in sentiment due to slow responses.", task_type=TaskType.INITIATIVE,
349
- data_subject=DataSubject.MENTIONS
350
- )]
351
- )],
352
- objective_timeline=TimelineCategory.SHORT_TERM
353
- )
354
- mock_output = TaskExtractionOutput(
355
- current_quarter_info=f"Q{task_agent._get_quarter(task_agent.current_date)}, {task_agent._days_until_quarter_end(task_agent.current_date)} days remaining",
356
- okrs=[mock_okr],
357
- overall_strategic_focus="Focus on customer service improvement."
358
- )
359
- # Simulate the event structure LlmAgent with output_schema would produce
360
- class MockEvent:
361
- def __init__(self):
362
- self.actions = type('Actions', (), {'state_delta': {task_agent.agent.output_key: mock_output.model_dump()}})() # .model_dump() for Pydantic v2
363
- yield MockEvent()
364
-
365
- async def delete_session(self, app_name, user_id, session_id): pass
366
-
367
- # Monkey patch the InMemoryRunner for this test if using mock key
368
- global InMemoryRunner
369
- OriginalInMemoryRunner = InMemoryRunner
370
- InMemoryRunner = MockADKRunner
371
-
372
-
373
- extracted_okrs_output = await task_agent.extract_tasks(sample_analysis_text)
374
-
375
- # Restore InMemoryRunner if it was patched
376
- if MOCK_API_KEY == "test_api_key_task_extractor" and 'OriginalInMemoryRunner' in globals():
377
- InMemoryRunner = OriginalInMemoryRunner
378
-
379
-
380
- print("\n--- TaskExtractionAgent Results ---")
381
- if extracted_okrs_output:
382
- print(f"Current Quarter Info: {extracted_okrs_output.current_quarter_info}")
383
- print(f"Overall Strategic Focus: {extracted_okrs_output.overall_strategic_focus}")
384
- print(f"Generated Timestamp: {extracted_okrs_output.generation_timestamp}")
385
- print("\nOKRs Extracted:")
386
- # Use .model_dump_json() for Pydantic v2 for pretty printing
387
- print(extracted_okrs_output.model_dump_json(indent=2))
388
- else:
389
- print("No OKRs extracted or an error occurred.")
390
-
391
- asyncio.run(run_extraction())
393
-
394
- # Example of updating date
395
- logger.info("\n--- Updating date for Task Agent ---")
396
- new_test_date = date(2025, 10, 5) # Q4
397
- task_agent.update_current_date(new_test_date)
398
- # The instruction within task_agent.agent is now updated.
399
- # logger.info(f"Task Agent NEW Instruction for test_date {new_test_date}:\n{task_agent.agent.instruction[:500]}...")
400
- # A new call to extract_tasks would use this updated context.
features/insight_and_tasks/agents/task_extraction_model.py DELETED
@@ -1,226 +0,0 @@
1
- import enum
2
- import json
3
- from typing import List, Optional, Literal
4
- from pydantic import BaseModel, Field, field_validator, ValidationInfo
5
- from datetime import datetime, date
6
-
7
- try:
8
- from google import genai
9
- except ImportError:
10
- print("Warning: 'google.generai' library not found. Please install it.")
11
- print("If you are using the standard Gemini API, try: pip install google-generativeai")
12
- print("If using Vertex AI, ensure the Google Cloud SDK is configured.")
13
- genai = None # Placeholder to allow script to be parsed
14
-
15
- from features.insight_and_tasks.data_models.tasks import (
16
- TaskExtractionOutput,
17
- OKR,
18
- KeyResult,
19
- Task,
20
- EffortLevel,
21
- TimelineCategory,
22
- PriorityLevel,
23
- TaskType,
24
- DataSubject # Ensure all are imported
25
- )
26
-
27
- def create_example_structure():
28
- """
29
- Creates a valid example structure that conforms to the Pydantic models
30
- to show the AI what the output should look like.
31
- """
32
- return {
33
- "current_quarter_info": "Q2 2025, 24 days remaining",
34
- "okrs": [
35
- {
36
- "objective_description": "Significantly improve our LinkedIn employer branding performance to attract top-tier talent and establish our company as a thought leader in the tech industry.",
37
-
38
- "objective_timeline": "Short-term",
39
- "objective_owner": "Marketing Department",
40
- "key_results": [
41
- {
42
- # CORRECTION: Description expanded to satisfy the 'min_length=100' validation rule.
43
- "key_result_description": "Achieve a sustained 50% increase in the rate of monthly follower growth on our company LinkedIn page, demonstrating enhanced audience engagement and brand reach.",
44
- "target_metric": "Monthly Follower Growth Rate",
45
- "target_value": "50% increase",
46
-
47
- # CORRECTION: Extra 'current_value' field removed as it's not in the Pydantic model.
48
-
49
- # CORRECTION: Value changed from "performance" to "PERFORMANCE" to match 'KeyResultType' enum.
50
- "key_result_type": "PERFORMANCE",
51
-
52
- # CORRECTION: Value changed from "posts" to "FOLLOWER_STATS" to match 'DataSubject' enum and better reflect the key result.
53
- "data_subject": "FOLLOWER_STATS",
54
- "tasks": [
55
- {
56
- "task_description": "Increase posting frequency to a consistent, high-quality schedule.",
57
- "objective_deliverable": "Post a minimum of 3 high-quality, relevant articles or updates per week.",
58
- "task_category": "Content Creation",
59
-
60
- # CORRECTION: Added the missing required field 'task_type' with a valid 'TaskType' enum value.
61
- "task_type": "INITIATIVE",
62
-
63
- # CORRECTION: Value changed from "high" to "High" to match the 'PriorityLevel' enum.
64
- "priority": "High",
65
-
66
- # CORRECTION: Added the missing required field 'priority_justification'.
67
- "priority_justification": "Increasing post frequency is a primary driver for engagement and follower growth, directly impacting the key result.",
68
-
69
- # CORRECTION: Value changed from "medium" to "Medium" to match the 'EffortLevel' enum.
70
- "effort": "Medium",
71
-
72
- # CORRECTION: Value changed from "this_quarter" to "Short-term" to match the 'TimelineCategory' enum.
73
- "timeline": "Short-term",
74
-
75
- # CORRECTION: Value changed from "linkedin_performance" to "POSTS" to match the 'DataSubject' enum.
76
- "data_subject": "POSTS",
77
- "responsible_party": "Social Media Manager",
78
- "success_criteria_metrics": "A weekly average of 3 or more posts is maintained over the quarter.",
79
- "dependencies_prerequisites": "A finalized content calendar for the quarter.",
80
- "why_proposed": "Historical data analysis shows a direct correlation between low posting frequency and stagnant follower gains. This task addresses the root cause."
81
- }
82
- ]
83
- }
84
- ]
85
- }
86
- ],
87
- "overall_strategic_focus": "Accelerate follower growth and enhance brand authority on LinkedIn."
88
- }
89
-
90
-
91
- # --- Helper Function for Date Calculations ---
92
- def get_quarter_info():
93
- """Calculates current quarter, year, and days remaining in the quarter."""
94
- today = date.today()
95
- current_year = today.year
96
- current_quarter = (today.month - 1) // 3 + 1
97
-
98
- # Determine the end date of the current quarter
99
- if current_quarter == 1:
100
- end_of_quarter_date = date(current_year, 3, 31)
101
- elif current_quarter == 2:
102
- end_of_quarter_date = date(current_year, 6, 30)
103
- elif current_quarter == 3:
104
- end_of_quarter_date = date(current_year, 9, 30)
105
- else: # current_quarter == 4
106
- end_of_quarter_date = date(current_year, 12, 31)
107
-
108
- days_remaining = (end_of_quarter_date - today).days
109
- days_remaining = max(0, days_remaining) # Ensure it's not negative
110
-
111
- return current_quarter, current_year, days_remaining, today
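- # Example: running on 2025-06-06 returns (2, 2025, 24, date(2025, 6, 6)), which matches the
- # "Q2 2025, 24 days remaining" string used in create_example_structure above.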
112
-
113
- # --- Main Task Extraction Function ---
114
- def extract_tasks_from_text(user_text_input: str, api_key: str) -> tuple[TaskExtractionOutput, int, int, int]:
115
- """
116
- Extracts tasks from input text using Gemini API and structures them as TaskExtractionOutput.
117
-
118
- Args:
119
- user_text_input: The text to analyze.
120
- api_key: The Gemini API key.
121
-
122
- Returns:
123
- A tuple of (TaskExtractionOutput, current_quarter, current_year, days_remaining).
124
-
125
- Raises:
126
- ValueError: If API call fails or response parsing is unsuccessful.
127
- ImportError: If 'google.genai' is not available.
128
- """
129
- if not genai:
130
- raise ImportError("The 'google.generai' library is not available. Please install and configure it.")
131
-
132
- # Initialize the Gemini client (as per user's example structure)
133
- # This specific client initialization might vary based on the exact 'google.generai' library version/origin.
134
- try:
135
- client = genai.Client(api_key=api_key)
136
- except AttributeError:
137
- # Fallback for standard google-generativeai SDK if genai.Client is not found
138
- try:
139
- genai.configure(api_key=api_key)
140
- # This function will then need to use genai.GenerativeModel('gemini-2.0-flash')
141
- # For simplicity, sticking to user's client.models.generate_content structure.
142
- # This part would need significant rework if genai.Client is not the correct interface.
143
- print("Warning: genai.Client not found. The API call structure might be incorrect for your 'google.generai' version.")
144
- print("Assuming a client object with 'models.generate_content' method is expected.")
145
- # This is a placeholder; actual client setup depends on the specific library.
146
- # If this is google-generativeai, the user should adapt to use genai.GenerativeModel.
147
- raise NotImplementedError("genai.Client not found. Please adapt API call to your SDK version.")
148
-
149
- except Exception as e:
150
- raise ImportError(f"Failed to initialize Gemini client or configure API key: {e}")
151
-
152
-
153
- quarter, year, days_remaining, current_date_obj = get_quarter_info()
154
- current_date_iso = current_date_obj.isoformat()
155
- example_structure = create_example_structure()
156
-
157
- # Construct the detailed prompt for the LLM
158
- prompt = f"""You are a Time-Aware Task Extraction Specialist, an AI expert in meticulously analyzing strategic insights (e.g., from LinkedIn analytics) and transforming them into a structured set of actionable tasks, organized within an Objectives and KeyResults (OKRs) framework.
159
-
160
- Your output MUST be a valid JSON object that strictly conforms to the 'TaskExtractionOutput' schema provided.
- Here is an illustrative example of a conforming output (sample content, not a template to copy verbatim):
- {json.dumps(example_structure, indent=2)}
161
-
162
- CURRENT CONTEXTUAL INFORMATION (CRITICAL - Use these exact values in your output where specified):
163
- - Current Quarter: Q{quarter}
164
- - Current Year: {year}
165
- - Days remaining in current quarter: {days_remaining}
166
- - Today's Date (for your context only, not for direct output unless specified by a schema field): {current_date_iso}
167
-
168
- When populating the 'current_quarter_info' field in the TaskExtractionOutput, use the format: 'Q{quarter} {year}, {days_remaining} days remaining'.
169
-
170
- GENERATION RULES:
171
- 1. Create 1-3 OKR objects based on the input text
172
- 2. For each OKR, create 1-3 KeyResult objects (MANDATORY - cannot be empty)
173
- 3. For each KeyResult, create 1-3 Task objects (MANDATORY - cannot be empty)
174
- 4. Make tasks specific, actionable, and directly related to the insights in the input text
175
- 5. No repetitive text allowed
176
- 6. Complete JSON object with proper closing braces
177
- 7. Maximum response length: 5000 characters
178
-
179
- Now, analyze the following text and generate the structured output:
180
- ---
181
- TEXT TO ANALYZE:
182
- {user_text_input}
183
- ---
184
- """
185
-
186
- try:
187
- response = client.models.generate_content(
188
- model="gemini-2.5-flash-preview-05-20", # As per user's example
189
- contents=prompt,
190
- config={
191
- 'response_mime_type': 'application/json',
192
- 'response_schema': TaskExtractionOutput, # Pass the Pydantic model class
193
- 'temperature': 0.1,
194
- 'top_p': 0.8,
195
- },
196
- )
197
- except Exception as e:
198
- raise ValueError(f"Gemini API call failed: {e}")
199
-
200
- # Process the response
201
- # Based on user's example `print(response.text)`, we assume .text contains the JSON.
202
- # However, standard Gemini API often has it in response.candidates[0].content.parts[0].text.
203
- response_json_text = None
204
- if hasattr(response, 'text') and response.text:
205
- response_json_text = response.text
206
- elif hasattr(response, 'candidates') and response.candidates:
207
- try:
208
- part = response.candidates[0].content.parts[0]
209
- if hasattr(part, 'text') and part.text:
210
- response_json_text = part.text
211
- except (IndexError, AttributeError):
212
- pass # Could not find text in candidates
213
-
214
- if response_json_text:
215
- try:
216
- # Validate and parse the JSON response using the Pydantic model
217
- task_output = TaskExtractionOutput.model_validate_json(response_json_text)
218
- return task_output, quarter, year, days_remaining
219
- except Exception as e: # Catch Pydantic validation errors or JSON parsing errors
220
- raise ValueError(f"Failed to parse or validate API response: {e}\nRaw response text: {response_json_text}")
221
- else:
222
- # Handle cases where the response is empty or indicates an error
223
- feedback_message = ""
224
- if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
225
- feedback_message = f"Prompt feedback: {response.prompt_feedback}. "
226
- raise ValueError(f"Failed to generate content or response text is empty. {feedback_message}Full response: {response}")
features/insight_and_tasks/agents/task_extraction_model_groq.py DELETED
@@ -1,143 +0,0 @@
1
- import enum
2
- import json
3
- import os
4
- from typing import List, Optional, Literal
5
- from pydantic import BaseModel, Field, field_validator, ValidationInfo
6
- from datetime import datetime, date
7
-
8
- # Import Groq and instructor for structured output
9
- try:
10
- from groq import Groq, RateLimitError
11
- import instructor
12
- except ImportError:
13
- print("Warning: 'groq' or 'instructor' library not found. Please install them.")
14
- print("Try: pip install groq instructor")
15
- Groq = None
16
- instructor = None
17
-
18
-
19
- from features.insight_and_tasks.data_models.tasks import (
20
- TaskExtractionOutput,
21
- OKR,
22
- KeyResult,
23
- Task,
24
- EffortLevel,
25
- TimelineCategory,
26
- PriorityLevel,
27
- TaskType,
28
- DataSubject # Ensure all are imported
29
- )
30
-
31
- # --- Groq Client Initialization with Instructor ---
32
- # Ensure GROQ_API_KEY is set in your environment variables before running
33
- if Groq and instructor:
34
- try:
35
- api_key = os.getenv('GROQ_API_KEY')
36
- if not api_key:
37
- raise ValueError("GROQ_API_KEY environment variable not set. Please set it to your Groq API key.")
38
-
39
- # Create a single, patched Groq client for structured output using instructor
40
- # Mode.JSON ensures the output is a valid JSON object based on the Pydantic model
41
- client = instructor.from_groq(Groq(api_key=api_key), mode=instructor.Mode.JSON)
42
- except Exception as e:
43
- print(f"Failed to initialize Groq client: {e}")
44
- client = None
45
- else:
46
- client = None
47
-
48
-
49
-
50
- # --- Helper Function for Date Calculations (Unchanged) ---
51
- def get_quarter_info():
52
- """Calculates current quarter, year, and days remaining in the quarter."""
53
- today = date.today()
54
- current_year = today.year
55
- current_quarter = (today.month - 1) // 3 + 1
56
-
57
- if current_quarter == 1:
58
- end_of_quarter_date = date(current_year, 3, 31)
59
- elif current_quarter == 2:
60
- end_of_quarter_date = date(current_year, 6, 30)
61
- elif current_quarter == 3:
62
- end_of_quarter_date = date(current_year, 9, 30)
63
- else: # current_quarter == 4
64
- end_of_quarter_date = date(current_year, 12, 31)
65
-
66
- days_remaining = (end_of_quarter_date - today).days
67
- days_remaining = max(0, days_remaining)
68
-
69
- return current_quarter, current_year, days_remaining, today
70
-
71
- # --- Main Task Extraction Function (Refactored for Groq) ---
72
- def extract_tasks_from_text_groq(user_text_input: str) -> tuple[Optional[TaskExtractionOutput], int, int, int]:
73
- """
74
- Extracts tasks from input text using the Groq API and structures them
75
- using instructor.
76
-
77
- Args:
78
- user_text_input: The text to analyze.
79
-
80
- Returns:
81
- A tuple containing:
82
- - A TaskExtractionOutput Pydantic model instance, or None on failure.
83
- - The current quarter number.
84
- - The current year.
85
- - The number of days remaining in the quarter.
86
-
87
- Raises:
88
- ValueError: If the Groq client is not initialized or if the API call fails.
89
- RateLimitError: If the Groq API rate limit is exceeded.
90
- """
91
- if not client:
92
- raise ValueError("Groq client is not initialized. Check your API key and library installations.")
93
-
94
- quarter, year, days_remaining, current_date_obj = get_quarter_info()
95
-
96
- # The prompt structure remains the same as it is effective.
97
- # We explicitly tell the model its role and the structure we expect.
98
- prompt = f"""You are a Time-Aware Task Extraction Specialist, an AI expert in meticulously analyzing strategic insights (e.g., from LinkedIn analytics) and transforming them into a structured set of actionable tasks, organized within an Objectives and Key Results (OKRs) framework.
99
-
100
- Your output MUST be a valid JSON object that strictly conforms to the 'TaskExtractionOutput' Pydantic schema.
101
-
102
- CURRENT CONTEXTUAL INFORMATION:
103
- - Use this exact string for the 'current_quarter_info' field: 'Q{quarter} {year}, {days_remaining} days remaining'.
104
-
105
- GENERATION RULES:
106
- 1. Your primary goal is to identify every distinct, high-level strategic objective from the input text. For each and every distinct objective you find, you must create a corresponding OKR object.
107
- 2. For each OKR, extract all relevant Key Results. Key Results must be measurable outcomes.
108
- 3. For each KeyResult, extract all specific and actionable Tasks that are directly derived from the input text.
109
- 4. Considering the days remaining in the quarter, prioritize tasks with the highest immediate impact where possible.
110
- 5. Tasks must be specific, actionable, and directly derived from the input text.
111
- 6. Do not create redundant or repetitive content.
112
- 7. Ensure the final output is a complete JSON object.
113
-
114
- Now, analyze the following text and generate the structured JSON output:
115
- ---
116
- TEXT TO ANALYZE:
117
- {user_text_input}
118
- ---
119
- """
120
-
121
- try:
122
- # Use the instructor-patched client to make the call.
123
- # Pass the Pydantic model to `response_model`.
124
- # Instructor will handle the validation and parsing automatically.
125
- task_output = client.chat.completions.create(
126
- model="llama-3.3-70b-versatile", # A powerful model available on Groq
127
- response_model=TaskExtractionOutput,
128
- messages=[
129
- {"role": "user", "content": prompt},
130
- ],
131
- temperature=0.1,
132
- top_p=0.8,
133
- max_retries=3, # Instructor can automatically retry on validation errors
134
- )
135
- return task_output, quarter, year, days_remaining
136
-
137
- except RateLimitError as e:
138
- print(f"Error: Groq API rate limit exceeded. Please wait and try again. Details: {e}")
139
- raise # Re-raise the specific error
140
- except Exception as e:
141
- # This can catch Pydantic validation errors or other API issues.
142
- print(f"An unexpected error occurred during the Groq API call or data validation: {e}")
143
- raise ValueError(f"Failed to process text with Groq: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/coordinators/employer_branding_coordinator.py DELETED
@@ -1,331 +0,0 @@
1
- # coordinators/employer_branding_coordinator.py
2
- import json
3
- import logging
4
- from typing import Optional, Dict, Any # Added Dict, Any
5
- from dataclasses import asdict # For converting dataclasses (like AgentMetrics) to dict
6
- import os
7
- from datetime import datetime
8
-
9
- from google.adk.agents import LlmAgent
10
- from google.adk.runners import InMemoryRunner
11
- from google.genai import types as genai_types # For ADK agent inputs
12
-
13
- # Project-specific imports
14
- from features.insight_and_tasks.agents.follower_agent import EnhancedFollowerAnalysisAgent
15
- from features.insight_and_tasks.agents.post_agent import EnhancedPostPerformanceAgent
16
- from features.insight_and_tasks.agents.mentions_agent import EnhancedMentionsAnalysisAgent
17
- from features.insight_and_tasks.data_models.metrics import AgentMetrics # To type hint inputs
18
- from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism # If ADK calls need retry
19
-
20
- # Configure logger for this module
21
- logger = logging.getLogger(__name__)
22
-
23
- DEFAULT_COORDINATOR_MODEL = "gemini-2.5-flash-preview-05-20" # Use a more capable model for synthesis
24
- os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
25
- GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")
26
- os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
27
-
28
- class EnhancedEmployerBrandingCoordinator:
29
- """
30
- Enhanced coordinator for synthesizing insights from multiple agent metrics,
31
- identifying correlations, and generating integrated strategic recommendations.
32
- """
33
- COORDINATOR_AGENT_NAME = "employer_branding_coordinator"
34
- COORDINATOR_AGENT_DESCRIPTION = (
35
- "Strategic coordinator that analyzes metrics from Follower, Post Performance, and Mentions agents "
36
- "to find correlations, suggest potential causal links, and generate integrated strategies."
37
- )
38
- COORDINATOR_AGENT_INSTRUCTION = """
39
- You are the Enhanced Employer Branding Coordinator. Your primary mission is to synthesize analyses and
40
- structured metrics (TimeSeries, Aggregate, Categorical) from three specialized agents: Follower Analysis,
41
- Post Performance, and Mentions Analysis. Your goal is to provide a holistic, integrated understanding of
42
- the LinkedIn employer branding performance.
43
-
44
- You MUST focus on:
45
- 1. Cross-Agent Correlations: Analyze how metrics from different agents relate to each other over time.
46
- Pay close attention to the 'time_series_metrics' provided by each agent.
47
- - Identify positive or negative correlations (e.g., "Follower growth rate increased by X% when posts about 'company culture' (Post Agent) were published, coinciding with a Y% rise in positive mentions (Mentions Agent)").
48
- - Note any leading or lagging indicators (e.g., "A spike in negative mentions often preceded a dip in follower growth by approximately 2 weeks.").
49
- - Look for relationships between specific content types/topics (from Post Agent) and follower engagement/growth (Follower Agent) or brand sentiment (Mentions Agent).
50
- 2. Potential Causal Insights & Hypotheses: Based on observed correlations and temporal sequences, suggest plausible causal relationships.
51
- These are hypotheses, not definitive conclusions.
52
- - Example: "The Q2 campaign focusing on 'employee testimonials' (Post Agent data) likely contributed to the observed 15% increase in organic follower acquisition (Follower Agent data) and the shift towards more positive sentiment in mentions (Mentions Agent data) during the same period."
53
- 3. Root Cause Analysis (Conceptual): For significant performance changes (e.g., sudden engagement drops, unexpected follower spikes, sharp sentiment shifts), attempt to identify potential root causes by cross-referencing data and summaries from all three agents.
54
- 4. Predictive Insights (High-Level): Based on established trends and correlations, what are potential future performance trajectories or risks?
55
- - Example: "If the current trend of declining engagement on text-only posts continues, overall reach may decrease by X% next quarter unless content strategy is diversified."
56
- 5. Integrated Strategic Recommendations: Formulate actionable, strategic advice that leverages insights from ALL THREE data sources to optimize overall employer branding.
57
- - Recommendations should be specific (e.g., "Increase frequency of video posts related to 'Team Achievements' as this format shows high engagement and correlates with positive mention spikes.").
58
- - Prioritize recommendations based on their potential impact, supported by the cross-agent analysis.
59
- - Suggest A/B tests or further investigations where appropriate.
60
-
61
- INPUT: You will receive structured 'AgentMetrics' data (JSON format) from each of the three agents. This includes their own analysis summaries, time-series data, aggregate figures, and categorical breakdowns.
62
-
63
- OUTPUT: A comprehensive, well-structured report covering:
64
- I. Overall Executive Summary: A brief (2-3 paragraph) overview of the most critical findings and strategic implications derived from the integrated analysis.
65
- II. Detailed Cross-Agent Correlation Analysis: Elaborate on specific correlations found, with examples.
66
- III.Key Causal Hypotheses: Present the most compelling potential causal links.
67
- IV. Noteworthy Performance Shifts & Potential Root Causes: Discuss any major changes and their likely drivers.
68
- V. Forward-Looking Predictive Insights: Offer high-level predictions.
69
- VI. Actionable Integrated Strategic Recommendations: Provide clear, prioritized recommendations.
70
-
71
- Your analysis must be grounded in the provided data. Refer to specific metrics and agent findings to support your conclusions.
72
- Be insightful and strategic. The goal is to provide a unified view that is more valuable than the sum of the individual agent analyses.
73
- """
74
-
75
- def __init__(self, api_key: str, model_name: Optional[str] = None):
76
- self.api_key = api_key # Stored for LlmAgent or if agents need it passed explicitly
77
- self.model_name = model_name or DEFAULT_COORDINATOR_MODEL
78
-
79
- # Initialize individual agents. The coordinator will use their output.
80
- # These agents are internal to the coordinator's process of getting data to synthesize.
81
- self.follower_agent = EnhancedFollowerAnalysisAgent(api_key=api_key, model_name=model_name) # Pass down model if needed
82
- self.post_agent = EnhancedPostPerformanceAgent(api_key=api_key, model_name=model_name)
83
- self.mentions_agent = EnhancedMentionsAnalysisAgent(api_key=api_key, model_name=model_name)
84
-
85
- # The LLM agent for the coordinator itself, responsible for synthesis
86
- self.coordinator_llm_agent = LlmAgent(
87
- name=self.COORDINATOR_AGENT_NAME,
88
- model=self.model_name, # Use the coordinator's (potentially more powerful) model
89
- description=self.COORDINATOR_AGENT_DESCRIPTION,
90
- instruction=self.COORDINATOR_AGENT_INSTRUCTION
91
- )
92
- self.retry_mechanism = RetryMechanism()
93
- logger.info(f"{self.COORDINATOR_AGENT_NAME} initialized with model {self.model_name}.")
94
- logger.info(f"It internally uses: Follower Agent ({self.follower_agent.model_name}), "
95
- f"Post Agent ({self.post_agent.model_name}), Mentions Agent ({self.mentions_agent.model_name}).")
96
-
97
-
98
- async def generate_comprehensive_analysis(
99
- self,
100
- follower_metrics: AgentMetrics,
101
- post_metrics: AgentMetrics,
102
- mentions_metrics: AgentMetrics
103
- ) -> str:
104
- """
105
- Generates a comprehensive analysis by synthesizing metrics from all specialized agents.
106
-
107
- Args:
108
- follower_metrics: Metrics from the EnhancedFollowerAnalysisAgent.
109
- post_metrics: Metrics from the EnhancedPostPerformanceAgent.
110
- mentions_metrics: Metrics from the EnhancedMentionsAnalysisAgent.
111
-
112
- Returns:
113
- A string containing the comprehensive analysis report.
114
- """
115
-
116
- # Prepare the input prompt for the coordinator's LlmAgent
117
- # Serialize the AgentMetrics objects (which are dataclasses) to dictionaries
118
- # then to JSON strings for clean inclusion in the prompt.
119
- try:
120
- follower_metrics_dict = asdict(follower_metrics)
121
- post_metrics_dict = asdict(post_metrics)
122
- mentions_metrics_dict = asdict(mentions_metrics)
123
- except Exception as e:
124
- logger.error(f"Error converting AgentMetrics to dict: {e}", exc_info=True)
125
- return "Error: Could not process input metrics for coordination."
126
-
127
- # Truncate individual agent summaries if they are too long to avoid overly large prompts
128
- max_summary_len = 500 # Max characters for individual agent summaries in the prompt
129
- follower_metrics_dict['analysis_summary'] = follower_metrics_dict.get('analysis_summary', '')[:max_summary_len]
130
- post_metrics_dict['analysis_summary'] = post_metrics_dict.get('analysis_summary', '')[:max_summary_len]
131
- mentions_metrics_dict['analysis_summary'] = mentions_metrics_dict.get('analysis_summary', '')[:max_summary_len]
132
-
133
-
134
- synthesis_prompt = f"""
135
- Please synthesize the following LinkedIn analytics insights, which are structured as 'AgentMetrics'
136
- from three specialized agents. Your primary task is to identify cross-metric correlations,
137
- deduce potential causal relationships, and provide integrated strategic recommendations based on
138
- your core instructions.
139
-
140
- DATA FROM SPECIALIZED AGENTS:
141
-
142
- 1. Follower Analysis Agent Metrics:
143
- - Agent Name: {follower_metrics_dict.get('agent_name')}
144
- - Agent's Analysis Summary: {follower_metrics_dict.get('analysis_summary')}
145
- - Time Series Metrics: {json.dumps([asdict(m) for m in follower_metrics.time_series_metrics], indent=2, default=str)}
146
- - Aggregate Metrics: {json.dumps(follower_metrics_dict.get('aggregate_metrics'), indent=2, default=str)}
147
- - Categorical Metrics: {json.dumps(follower_metrics_dict.get('categorical_metrics'), indent=2, default=str)}
148
- - Time Periods Covered: {json.dumps(follower_metrics_dict.get('time_periods_covered'), default=str)}
149
- - Key Insights by Agent: {json.dumps(follower_metrics_dict.get('key_insights'), default=str)}
150
-
151
- 2. Post Performance Agent Metrics:
152
- - Agent Name: {post_metrics_dict.get('agent_name')}
153
- - Agent's Analysis Summary: {post_metrics_dict.get('analysis_summary')}
154
- - Time Series Metrics: {json.dumps([asdict(m) for m in post_metrics.time_series_metrics], indent=2, default=str)}
155
- - Aggregate Metrics: {json.dumps(post_metrics_dict.get('aggregate_metrics'), indent=2, default=str)}
156
- - Categorical Metrics: {json.dumps(post_metrics_dict.get('categorical_metrics'), indent=2, default=str)}
157
- - Time Periods Covered: {json.dumps(post_metrics_dict.get('time_periods_covered'), default=str)}
158
- - Key Insights by Agent: {json.dumps(post_metrics_dict.get('key_insights'), default=str)}
159
-
160
- 3. Mentions Analysis Agent Metrics:
161
- - Agent Name: {mentions_metrics_dict.get('agent_name')}
162
- - Agent's Analysis Summary: {mentions_metrics_dict.get('analysis_summary')}
163
- - Time Series Metrics: {json.dumps([asdict(m) for m in mentions_metrics.time_series_metrics], indent=2, default=str)}
164
- - Aggregate Metrics: {json.dumps(mentions_metrics_dict.get('aggregate_metrics'), indent=2, default=str)}
165
- - Categorical Metrics: {json.dumps(mentions_metrics_dict.get('categorical_metrics'), indent=2, default=str)}
166
- - Time Periods Covered: {json.dumps(mentions_metrics_dict.get('time_periods_covered'), default=str)}
167
- - Key Insights by Agent: {json.dumps(mentions_metrics_dict.get('key_insights'), default=str)}
168
-
169
- COORDINATION TASK:
170
- Based on ALL the data presented above from the three agents, generate a comprehensive synthesis report.
171
- Follow your core instructions meticulously, focusing on cross-agent correlations (especially using the
172
- time_series_metrics), causal hypotheses, root cause considerations for major shifts, predictive insights,
173
- and actionable, integrated strategic recommendations.
174
- Structure your output as a detailed report with the specified sections.
175
- """
176
-
177
- user_input_content = genai_types.Content(
178
- role="user",
179
- parts=[genai_types.Part(text=synthesis_prompt)]
180
- )
181
-
182
- runner = InMemoryRunner(agent=self.coordinator_llm_agent, app_name=f"{self.COORDINATOR_AGENT_NAME}Runner")
183
- user_id = f"system_user_coordinator_{int(datetime.utcnow().timestamp())}" # Unique ID for the run
184
-
185
- session = await runner.session_service.create_session(
186
- app_name=f"{self.COORDINATOR_AGENT_NAME}Runner",
187
- user_id=user_id
188
- )
189
-
190
- result_text_parts = []
191
- try:
192
- logger.info(f"Running {self.COORDINATOR_AGENT_NAME} for synthesis. User ID: {user_id}, Session ID: {session.id}")
193
- # Using retry for the ADK runner execution part
194
- async def run_adk_coordinator():
195
- temp_result_parts = []
196
- async for event in runner.run(
197
- user_id=user_id,
198
- session_id=session.id,
199
- new_message=user_input_content
200
- ):
201
- if hasattr(event, 'content') and event.content and event.content.parts:
202
- for part in event.content.parts:
203
- if hasattr(part, 'text'):
204
- temp_result_parts.append(part.text)
205
- if not temp_result_parts:
206
- # This could happen if the LLM returns no content or an error not caught by ADK
207
- logger.warning(f"{self.COORDINATOR_AGENT_NAME} produced no text output from ADK run.")
208
- # Consider raising a specific error or returning a default message
209
- # For now, it will result in an empty string if no parts are collected.
210
- return "".join(temp_result_parts)
211
-
212
- # The retry_with_backoff expects a synchronous function.
213
- # For async, you'd typically handle retries within the async logic or use an async retry library.
214
- # For simplicity here, we'll run it once. If retries are critical for ADK calls,
215
- # the ADK runner itself might have retry mechanisms, or this part needs adjustment.
216
- # The original code didn't show retry for this ADK call, so keeping it direct.
217
-
218
- # Direct call without retry for the async ADK runner:
219
- for event in runner.run(
220
- user_id=user_id,
221
- session_id=session.id,
222
- new_message=user_input_content
223
- ):
224
- if hasattr(event, 'content') and event.content and event.content.parts:
225
- for part in event.content.parts:
226
- if hasattr(part, 'text'):
227
- result_text_parts.append(part.text)
228
-
229
- final_result_text = "".join(result_text_parts)
230
- if not final_result_text.strip():
231
- logger.warning(f"{self.COORDINATOR_AGENT_NAME} synthesis resulted in an empty string.")
232
- final_result_text = "Coordinator analysis did not produce output. Please check logs."
233
-
234
-
235
- except Exception as e:
236
- logger.error(f"Error during {self.COORDINATOR_AGENT_NAME} LLM agent execution: {e}", exc_info=True)
237
- final_result_text = f"Error in coordinator synthesis: {str(e)}"
238
- finally:
239
- try:
240
- await runner.session_service.delete_session(
241
- app_name=f"{self.COORDINATOR_AGENT_NAME}Runner", user_id=user_id, session_id=session.id
242
- )
243
- except Exception as session_del_e:
244
- logger.error(f"Error deleting coordinator session: {session_del_e}")
245
-
246
- return final_result_text
247
-
248
- if __name__ == '__main__':
249
- import asyncio
250
- import pandas as pd # For creating dummy data
251
- from datetime import datetime # For dummy data AgentMetrics
252
-
253
- try:
254
- from utils.logging_config import setup_logging
255
- setup_logging()
256
- logger.info("Logging setup for EnhancedEmployerBrandingCoordinator test.")
257
- except ImportError:
258
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
259
- logger.warning("logging_config.py not found, using basicConfig for logging.")
260
-
261
- MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_coordinator")
262
- MODEL_NAME = DEFAULT_COORDINATOR_MODEL # Or a specific test model
263
-
264
- # Create dummy AgentMetrics data for testing
265
- dummy_ts_metric = TimeSeriesMetric(metric_name="dummy_visits", values=[10.0,20.0], timestamps=["2023-01","2023-02"])
266
-
267
- follower_metrics_data = AgentMetrics(
268
- agent_name="follower_analyst_test",
269
- analysis_summary="Followers grew steadily. Demographic: Young professionals.",
270
- time_series_metrics=[dummy_ts_metric],
271
- aggregate_metrics={"avg_growth_rate": 0.05},
272
- categorical_metrics={"top_industry": "Tech"},
273
- time_periods_covered=["2023-01", "2023-02"],
274
- key_insights=["Organic growth is strong."]
275
- )
276
- post_metrics_data = AgentMetrics(
277
- agent_name="post_analyst_test",
278
- analysis_summary="Video posts performed best. Engagement rate is 3%.",
279
- time_series_metrics=[TimeSeriesMetric(metric_name="dummy_engagement", values=[0.03,0.035], timestamps=["2023-01","2023-02"], unit="%")],
280
- aggregate_metrics={"avg_engagement_rate_overall": 0.032},
281
- categorical_metrics={"top_media_type": "VIDEO"},
282
- time_periods_covered=["2023-01", "2023-02"],
283
- key_insights=["Video content is key for engagement."]
284
- )
285
- mentions_metrics_data = AgentMetrics(
286
- agent_name="mentions_analyst_test",
287
- analysis_summary="Mentions are mostly neutral. Sentiment score avg 0.1.",
288
- time_series_metrics=[TimeSeriesMetric(metric_name="dummy_sentiment_score", values=[0.1,0.12], timestamps=["2023-01","2023-02"])],
289
- aggregate_metrics={"overall_avg_sentiment": 0.11},
290
- categorical_metrics={"dominant_sentiment": "Neutral"},
291
- time_periods_covered=["2023-01", "2023-02"],
292
- key_insights=["Brand perception is stable but not overly positive."]
293
- )
294
-
295
- coordinator = EnhancedEmployerBrandingCoordinator(api_key=MOCK_API_KEY, model_name=MODEL_NAME)
296
-
297
- async def run_coordination():
298
- logger.info("Generating comprehensive analysis from dummy metrics...")
299
- # For local tests without real API calls, the LlmAgent might behave as a mock.
300
- if MOCK_API_KEY == "test_api_key_coordinator":
301
- logger.warning("Using a mock API key. Coordinator LlmAgent behavior might be limited or mocked.")
302
- # Mock the ADK runner for the coordinator's LLM agent if needed
303
- class MockCoordinatorADKRunner:
304
- def __init__(self, agent, app_name): self.agent = agent
305
- async def session_service_create_session(self, app_name, user_id):
306
- class MockSession: id = "mock_coord_session_id"
307
- return MockSession()
308
- async def run(self, user_id, session_id, new_message):
309
- # Simulate a response from the coordinator LLM
310
- yield genai_types.Content(parts=[genai_types.Part(text="Mock Coordinator Synthesis Report: Blah blah correlation. Recommendation: Do X.")])
311
- async def session_service_delete_session(self, app_name, user_id, session_id): pass
312
-
313
- global InMemoryRunner # Make sure we are modifying the correct InMemoryRunner
314
- OriginalInMemoryRunnerCoord = InMemoryRunner
315
- InMemoryRunner = MockCoordinatorADKRunner
316
-
317
-
318
- report = await coordinator.generate_comprehensive_analysis(
319
- follower_metrics_data,
320
- post_metrics_data,
321
- mentions_metrics_data
322
- )
323
-
324
- if MOCK_API_KEY == "test_api_key_coordinator" and 'OriginalInMemoryRunnerCoord' in globals():
325
- InMemoryRunner = OriginalInMemoryRunnerCoord # Restore
326
-
327
- print("\n--- EnhancedEmployerBrandingCoordinator Report ---")
328
- print(report)
329
-
330
- if __name__ == '__main__': # Inner check
331
- asyncio.run(run_coordination())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/data_models/__init__.py DELETED
@@ -1,35 +0,0 @@
1
- # data_models/__init__.py
2
-
3
- # This file makes the 'data_models' directory a Python package.
4
-
5
- # Expose key models at the package level for easier importing.
6
- from .metrics import TimeSeriesMetric, AgentMetrics, MetricType, TimeGranularity
7
- from .tasks import (
8
- EffortLevel,
9
- TaskType,
10
- DataSubject,
11
- TimelineCategory,
12
- PriorityLevel,
13
- Task,
14
- KeyResult,
15
- OKR,
16
- TaskExtractionOutput
17
- )
18
-
19
- __all__ = [
20
- # From metrics.py
21
- "TimeSeriesMetric",
22
- "AgentMetrics",
23
- "MetricType",
24
- "TimeGranularity",
25
- # From tasks.py
26
- "EffortLevel",
27
- "TaskType",
28
- "DataSubject",
29
- "TimelineCategory",
30
- "PriorityLevel",
31
- "Task",
32
- "KeyResult",
33
- "OKR",
34
- "TaskExtractionOutput"
35
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/data_models/metrics.py DELETED
@@ -1,50 +0,0 @@
1
- # data_models/metrics.py
2
- from dataclasses import dataclass, field
3
- from typing import List, Dict, Any, Literal, Optional
4
- from datetime import datetime
5
-
6
-
7
- # Define literal types for more specific type hinting
8
- MetricType = Literal['time_series', 'aggregate', 'categorical']
9
- TimeGranularity = Literal['daily', 'weekly', 'monthly', 'yearly', 'other'] # Added 'yearly' and 'other'
10
-
11
- @dataclass
12
- class TimeSeriesMetric:
13
- """Structure for time-series based metrics"""
14
- metric_name: str
15
- values: List[float] = field(default_factory=list)
16
- timestamps: List[str] = field(default_factory=list) # Consider using datetime objects or ISO format strings
17
- metric_type: MetricType = 'time_series'
18
- time_granularity: TimeGranularity = 'monthly'
19
- unit: Optional[str] = None # e.g., 'count', '%', 'USD'
20
- description: Optional[str] = None # Optional description of the metric
21
-
22
- def __post_init__(self):
23
- if len(self.values) != len(self.timestamps):
24
- # Or log a warning, or handle as appropriate for your application
25
- raise ValueError(f"Length of values ({len(self.values)}) and timestamps ({len(self.timestamps)}) must match for metric '{self.metric_name}'.")
26
-
27
- @dataclass
28
- class AgentMetrics:
29
- """
30
- Enhanced structure for agent metrics with time-awareness and more details.
31
- """
32
- agent_name: str
33
- analysis_summary: str # Summary text from the agent's analysis
34
-
35
- # Specific metric categories
36
- time_series_metrics: List[TimeSeriesMetric] = field(default_factory=list)
37
- aggregate_metrics: Dict[str, float] = field(default_factory=dict) # Key-value pairs for single value metrics
38
- categorical_metrics: Dict[str, Any] = field(default_factory=dict) # For distributions, counts by category, etc.
39
- # Example: {'industry_distribution': {'Tech': 100, 'Finance': 50}}
40
-
41
- # Contextual information
42
- time_periods_covered: List[str] = field(default_factory=list) # e.g., ["2023-01", "2023-02"] or ["Q1 2023", "Q2 2023"]
43
- data_sources_used: List[str] = field(default_factory=list) # Information about the input data
44
- generation_timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat()) # When these metrics were generated
45
-
46
- # Optional fields for richer reporting
47
- key_insights: List[str] = field(default_factory=list) # Bullet points of key findings
48
- potential_errors_or_warnings: List[str] = field(default_factory=list) # Any issues encountered during analysis
49
-
50
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/data_models/tasks.py DELETED
@@ -1,197 +0,0 @@
1
- # data_models/tasks.py
2
- from enum import Enum
3
- from typing import List, Optional, Literal
4
- from pydantic import BaseModel, Field, field_validator
5
- from datetime import datetime
6
-
7
- # Using Literal for more precise string enums if preferred over Enum class for Pydantic
8
- # However, Enum provides better structure and can be used with Field choices.
9
-
10
- class EffortLevel(str, Enum):
11
- """Estimated effort level for a task."""
12
- SMALL = "Small"
13
- MEDIUM = "Medium"
14
- LARGE = "Large"
15
-
16
- class TaskType(str, Enum):
17
- """Type of task, indicating its nature."""
18
- INITIATIVE = "initiative" # Action-oriented, new projects/changes
19
- TRACKING = "tracking" # Measurement-focused, monitoring existing metrics/processes
20
-
21
- class KeyResultType(str, Enum):
22
- PERFORMANCE = "performance"
23
- COMPLETION = "completion"
24
-
25
- class DataSubject(str, Enum):
26
- """Specifies the primary data domain a tracking task relates to."""
27
- FOLLOWER_STATS = "follower_stats"
28
- POSTS = "posts"
29
- MENTIONS = "mentions"
30
- GENERAL = "general" # For initiatives or tasks not tied to a single data type
31
-
32
- class TimelineCategory(str, Enum):
33
- """Categorization of task timelines."""
34
- IMMEDIATE = "Immediate" # (e.g., 1-2 weeks)
35
- SHORT_TERM = "Short-term" # (e.g., rest of current quarter, up to 3 months)
36
- MEDIUM_TERM = "Medium-term" # (e.g., next quarter, 3-6 months)
37
- LONG_TERM = "Long-term" # (e.g., 6+ months)
38
-
39
- class PriorityLevel(str, Enum):
40
- """Priority level for tasks."""
41
- HIGH = "High"
42
- MEDIUM = "Medium"
43
- LOW = "Low"
44
-
45
- class Task(BaseModel):
46
- """
47
- Represents a single actionable task derived from analysis.
48
- """
49
- task_category: str = Field(
50
- description="The broader category or theme of the task (e.g., Content Strategy, Audience Engagement, Reputation Management, Performance Monitoring)."
51
- )
52
- task_description: str = Field( # Renamed from 'task' for clarity
53
- description="A concise yet clear description of the specific action to be taken."
54
- )
55
- objective_deliverable: str = Field(
56
- description="The clear, measurable objective this task aims to achieve and the specific deliverable(s) expected upon completion."
57
- )
58
- effort: EffortLevel = Field(
59
- description="Estimated effort required to complete the task (Small, Medium, Large)."
60
- )
61
- timeline: TimelineCategory = Field(
62
- description="Projected timeline for task completion, considering urgency and dependencies."
63
- )
64
- responsible_party: str = Field(
65
- description="The team, role, or individual suggested to be responsible for executing this task (e.g., Marketing Team, Content Creation Lead, Social Media Manager)."
66
- )
67
- success_criteria_metrics: str = Field(
68
- description="Specific, measurable criteria and metrics that will be used to determine if the task was successfully completed and achieved its objective."
69
- )
70
- dependencies_prerequisites: Optional[str] = Field(
71
- default=None,
72
- description="Any other tasks, resources, or conditions that must be met before this task can begin or be completed."
73
- )
74
- priority: PriorityLevel = Field(
75
- description="The priority level of the task (High, Medium, Low)."
76
- )
77
- priority_justification: str = Field(
78
- description="A brief explanation for the assigned priority level, linking it to impact or urgency."
79
- )
80
- why_proposed: str = Field(
81
- description="The rationale behind proposing this task, clearly linking it back to specific findings or insights from the data analysis."
82
- )
83
- task_type: TaskType = Field(
84
- description="Indicates whether this task is a new 'initiative' or ongoing 'tracking' of performance/metrics."
85
- )
86
- data_subject: Optional[DataSubject] = Field(
87
- default=None,
88
- description="For 'tracking' tasks, specifies the primary data subject (e.g., follower_stats, posts, mentions). Can be 'general' or null for 'initiative' tasks."
89
- )
90
-
91
- @field_validator('data_subject')
92
- @classmethod
93
- def check_data_subject_for_tracking(cls, value: Optional[DataSubject], values) -> Optional[DataSubject]:
94
- # Pydantic v2 uses `values.data` to get other field values if needed before validation
95
- # For Pydantic v1, it would be `values.get('task_type')`
96
- # This example assumes Pydantic v2 structure for `values` if needed, but here we only need `task_type`
97
- # which should already be validated or available.
98
- # For simplicity, accessing it via `values.data.get('task_type')` in Pydantic v2 context.
99
- # If using Pydantic v1, it's `values.get('task_type')`.
100
- # Let's assume `values` is a dict-like object containing other fields.
101
-
102
- # The validator structure depends on Pydantic version.
103
- # For Pydantic v2, it's `info: ValidationInfo` and `info.data.get('task_type')`
104
- # For Pydantic v1, `values` is a dict.
105
- # For this example, let's assume `values` is a dict of the fields.
106
- task_type_value = None
107
- if hasattr(values, 'data'): # Pydantic v2 way
108
- task_type_value = values.data.get('task_type')
109
- elif isinstance(values, dict): # Pydantic v1 way (or if it's passed as a dict)
110
- task_type_value = values.get('task_type')
111
-
112
-
113
- if task_type_value == TaskType.TRACKING and value is None:
114
- raise ValueError("For 'tracking' tasks, 'data_subject' must be specified.")
115
- if task_type_value == TaskType.INITIATIVE and value is DataSubject.GENERAL:
116
- # This is acceptable, or you might want to enforce it to be None
117
- pass
118
- return value
119
-
120
- class KeyResult(BaseModel):
121
- """
122
- A measurable outcome that contributes to an Objective.
123
- """
124
- key_result_description: str = Field( # Renamed from 'key_result'
125
- description="A clear, specific, and measurable description of the key result."
126
- )
127
- tasks: List[Task] = Field(
128
- default_factory=list,
129
- description="A list of specific tasks that will be undertaken to achieve this key result."
130
- )
131
- target_metric: Optional[str] = Field(
132
- default=None,
133
- description="The primary metric used to measure the achievement of this key result (e.g., 'Follower Growth Rate', 'Average Engagement Rate')."
134
- )
135
- target_value: Optional[str] = Field( # Can be numeric or descriptive (e.g., "Increase by 10%", "Achieve 5%")
136
- default=None,
137
- description="The specific target value for the metric (e.g., '5%', '1000 new followers')."
138
- )
139
- key_result_type: KeyResultType = Field(
140
- description=(
141
- "Indicates the nature of the Key Result. "
142
- "PERFORMANCE: Focused on achieving a specific, measurable level for a defined metric. "
143
- "Its core metric can typically be extracted and monitored directly via a data source, such as the LinkedIn API "
144
- "(e.g., monthly post count, engagement rate, follower gains, impressions, CTR, mention volume). "
145
- "The goal is to hit or exceed a target for this metric. "
146
- "COMPLETION: Focused on finishing a distinct project, delivering a specific output, or establishing a new process. "
147
- "Progress is primarily tracked by the successful completion of the defined scope of work. "
148
- "Generally, the primary outcome of a COMPLETION Key Result is not a metric continuously tracked via an automated "
149
- "data source like the LinkedIn API, or the 'metric' itself describes the state of completion (e.g., 'report delivered', 'process established')."
150
- )
151
- )
152
- data_subject: Optional[DataSubject] = Field(
153
- default=None,
154
- description="For 'performance' key results, specifies the primary data subject (e.g., follower_stats, posts, mentions). Can be 'general' or null for 'completion' tasks."
155
- )
156
-
157
- class OKR(BaseModel):
158
- """
159
- Defines an Objective and its associated Key Results (OKRs).
160
- """
161
- objective_description: str = Field( # Renamed from 'objective'
162
- description="A high-level, qualitative goal that the team aims to achieve. Should be aspirational and motivating."
163
- )
164
- key_results: List[KeyResult] = Field(
165
- default_factory=list,
166
- description="A list of 2-5 specific, measurable, achievable, relevant, and time-bound (SMART) key results that define success for the objective."
167
- )
168
- objective_timeline: TimelineCategory = Field(
169
- description="The overall timeline category for achieving this objective."
170
- )
171
- objective_owner: Optional[str] = Field(
172
- default=None,
173
- description="The team name",
174
- max_length=50
175
- )
176
-
177
-
178
- class TaskExtractionOutput(BaseModel):
179
- """
180
- Structured output from the TaskExtractionAgent, including context and OKRs.
181
- """
182
- current_quarter_info: str = Field(
183
- description="Information about the current quarter and days remaining (e.g., 'Q2 2025, 45 days remaining')."
184
- )
185
- okrs: List[OKR] = Field(
186
- default_factory=list,
187
- description="A list of Objectives and Key Results (OKRs) derived from the analysis."
188
- )
189
- overall_strategic_focus: Optional[str] = Field(
190
- default=None,
191
- description="A brief summary of the main strategic focus areas identified from the tasks."
192
- )
193
- generation_timestamp: str = Field(
194
- default_factory=lambda: datetime.utcnow().isoformat(),
195
- description="Timestamp of when this task extraction output was generated."
196
- )
197
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/orchestrators/linkedin_analytics_orchestrator.py DELETED
@@ -1,299 +0,0 @@
1
- # orchestrators/linkedin_analytics_orchestrator.py
2
- import pandas as pd
3
- import logging
4
- from typing import Dict, Any, Optional
5
- from datetime import date, datetime # For TaskExtractionAgent date
6
- from dataclasses import asdict # For converting AgentMetrics to dict if needed for final output
7
- import os
8
-
9
- os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
10
- GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")
11
- os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
12
-
13
- # Project-specific imports
14
- from features.insight_and_tasks.utils.pandasai_setup import configure_pandasai # Centralized PandasAI config
15
- from features.insight_and_tasks.coordinators.employer_branding_coordinator import EnhancedEmployerBrandingCoordinator
16
- from features.insight_and_tasks.agents.task_extraction_agent import TaskExtractionAgent
17
- from features.insight_and_tasks.data_models.metrics import AgentMetrics # For type hinting
18
- from features.insight_and_tasks.data_models.tasks import TaskExtractionOutput # For type hinting
19
- from features.insight_and_tasks.agents.task_extraction_model_groq import extract_tasks_from_text_groq
20
-
21
- # Configure logger for this module
22
- logger = logging.getLogger(__name__)
23
-
24
- class EnhancedLinkedInAnalyticsOrchestrator:
25
- """
26
- Orchestrates the end-to-end LinkedIn analytics process, from data input through
27
- specialized agent analysis, coordinator synthesis, and actionable task extraction.
28
- """
29
-
30
- def __init__(self, api_key: str, llm_model_name: Optional[str] = None, current_date_for_tasks: Optional[date] = None):
31
- """
32
- Initializes the orchestrator.
33
- Args:
34
- api_key: The API key for Google services (used by PandasAI and LlmAgents).
35
- llm_model_name: Optional. The primary LLM model name to be used by agents.
36
- Specific agents/coordinator might override with their defaults if not set.
37
- current_date_for_tasks: Optional. The date to be used by TaskExtractionAgent for quarter calculations. Defaults to today.
38
- """
39
- self.api_key = api_key
40
- self.llm_model_name = llm_model_name # Can be passed down or agents use their defaults
41
-
42
- # Configure PandasAI globally at the start of orchestration.
43
- # Pass the model_name if specified, otherwise pandasai_setup might use its own default.
44
- try:
45
- configure_pandasai(api_key=self.api_key, model_name=self.llm_model_name)
46
- logger.info(f"PandasAI configured by orchestrator with model hint: {self.llm_model_name or 'default'}.")
47
- except Exception as e:
48
- logger.error(f"Failed to configure PandasAI in orchestrator: {e}", exc_info=True)
49
- # Decide if this is a critical failure or if agents can proceed (they might try to reconfigure)
50
-
51
- # Initialize the coordinator, which in turn initializes its specialized agents.
52
- # Pass the model_name hint to the coordinator.
53
- self.coordinator = EnhancedEmployerBrandingCoordinator(api_key=self.api_key, model_name=self.llm_model_name)
54
-
55
- # Initialize the TaskExtractionAgent.
56
- # It uses its own default model unless overridden here.
57
- self.task_extractor = TaskExtractionAgent(
58
- api_key=self.api_key,
59
- model_name=self.llm_model_name, # Pass model hint
60
- current_date=current_date_for_tasks # Defaults to today if None
61
- )
62
- logger.info("EnhancedLinkedInAnalyticsOrchestrator initialized.")
63
-
64
- async def generate_full_analysis_and_tasks(
65
- self,
66
- follower_stats_df: pd.DataFrame,
67
- post_df: pd.DataFrame,
68
- mentions_df: pd.DataFrame
69
- ) -> Dict[str, Any]:
70
- """
71
- Executes the full pipeline: agent analyses, coordinator synthesis, and task extraction.
72
- Args:
73
- follower_stats_df: DataFrame containing follower statistics.
74
- post_df: DataFrame containing post performance data.
75
- mentions_df: DataFrame containing brand mentions data.
76
- Returns:
77
- A dictionary containing the comprehensive analysis text, actionable tasks (OKRs),
78
- and the detailed metrics from each specialized agent.
79
- """
80
- logger.info("Starting full analysis and task generation pipeline...")
81
-
82
- # Step 1: Get analyses and metrics from specialized agents.
83
- # The coordinator's internal agents are used here.
84
- logger.info("Running follower analysis...")
85
- follower_agent_metrics: AgentMetrics = self.coordinator.follower_agent.analyze_follower_data(follower_stats_df)
86
- logger.info(f"Follower analysis complete. Summary: {follower_agent_metrics.analysis_summary[:100]}...")
87
-
88
- logger.info("Running post performance analysis...")
89
- post_agent_metrics: AgentMetrics = self.coordinator.post_agent.analyze_post_data(post_df)
90
- logger.info(f"Post analysis complete. Summary: {post_agent_metrics.analysis_summary[:100]}...")
91
-
92
- logger.info("Running mentions analysis...")
93
- mentions_agent_metrics: AgentMetrics = self.coordinator.mentions_agent.analyze_mentions_data(mentions_df)
94
- logger.info(f"Mentions analysis complete. Summary: {mentions_agent_metrics.analysis_summary[:100]}...")
95
-
96
- # Step 2: Coordinator synthesizes these metrics into a comprehensive analysis text.
97
- logger.info("Running coordinator for synthesis...")
98
- comprehensive_analysis_text: str = await self.coordinator.generate_comprehensive_analysis(
99
- follower_agent_metrics, post_agent_metrics, mentions_agent_metrics
100
- )
101
- logger.info(f"Coordinator synthesis complete. Report length: {len(comprehensive_analysis_text)} chars.")
102
- if not comprehensive_analysis_text or comprehensive_analysis_text.startswith("Error"):
103
- logger.error(f"Coordinator synthesis failed or produced an error message: {comprehensive_analysis_text}")
104
- # Potentially stop here or proceed with task extraction on whatever text was generated.
105
-
106
- # Step 3: TaskExtractionAgent extracts actionable tasks (OKRs) from the comprehensive text.
107
- logger.info("Running task extraction...")
108
- #actionable_tasks_okrs, quarter, year, days_remaining = extract_tasks_from_text(comprehensive_analysis_text, GOOGLE_API_KEY)
109
- actionable_tasks_okrs, quarter, year, days_remaining = extract_tasks_from_text_groq(comprehensive_analysis_text)
110
- logger.info(f"Task extraction complete. Number of OKRs: {len(actionable_tasks_okrs.okrs) if actionable_tasks_okrs else 'Error'}.")
111
-
112
- # Step 4: Compile and return all results.
113
- # Convert Pydantic/dataclass objects to dicts for easier JSON serialization if the final output needs it.
114
- # The `actionable_tasks_okrs` is already a Pydantic model, which can be serialized with .model_dump() / .json().
115
- # `AgentMetrics` are dataclasses, use `asdict`.
116
-
117
- final_results = {
118
- "comprehensive_analysis_report": comprehensive_analysis_text,
119
- "actionable_okrs_and_tasks": actionable_tasks_okrs.model_dump() if actionable_tasks_okrs else None, # Pydantic v2
120
- "quarter": quarter,
121
- "year": year,
122
- "days_remaining": days_remaining,
123
- # "actionable_okrs_and_tasks": actionable_tasks_okrs.dict() if actionable_tasks_okrs else None, # Pydantic v1
124
- "detailed_metrics": {
125
- "follower_agent": asdict(follower_agent_metrics) if follower_agent_metrics else None,
126
- "post_agent": asdict(post_agent_metrics) if post_agent_metrics else None,
127
- "mentions_agent": asdict(mentions_agent_metrics) if mentions_agent_metrics else None,
128
- }
129
- }
130
- logger.info("Full analysis and task generation pipeline finished successfully.")
131
- return final_results
132
-
133
- # Example usage (similar to the original script's main execution block)
134
- if __name__ == '__main__':
135
- import asyncio
136
- import os
137
- from utils.logging_config import setup_logging
138
- from utils.data_fetching import fetch_linkedin_data_from_bubble, VALID_DATA_TYPES
139
-
140
- setup_logging() # Configure logging for the application
141
-
142
- # --- Configuration ---
143
- # Attempt to get API key from environment variable
144
- # IMPORTANT: Set GOOGLE_API_KEY and BUBBLE_API_KEY in your environment for this to run.
145
- GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
146
- BUBBLE_API_KEY_ENV = os.environ.get("BUBBLE_API_KEY") # Used by data_fetching
147
-
148
- if not GOOGLE_API_KEY:
149
- logger.critical("GOOGLE_API_KEY environment variable not set. Orchestrator cannot initialize LLM agents.")
150
- exit(1)
151
- if not BUBBLE_API_KEY_ENV: # data_fetching will also check, but good to note here
152
- logger.warning("BUBBLE_API_KEY environment variable not set. Data fetching from Bubble will fail.")
153
- # You might want to exit or use mock data if Bubble is essential.
154
-
155
- # Set the Google Vertex AI environment variable if not using Vertex AI (as in original)
156
- os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
157
-
158
- # Orchestrator settings
159
- ORG_URN_EXAMPLE = "urn:li:organization:19010008" # Example, replace with actual
160
- # Specify a model or let orchestrator/agents use their defaults
161
- # LLM_MODEL_FOR_ORCHESTRATION = "gemini-2.5-flash-preview-05-20" # Example: use a powerful model
162
- LLM_MODEL_FOR_ORCHESTRATION = None # Let agents use their defaults or pass a specific one
163
-
164
- # --- Initialize Orchestrator ---
165
- orchestrator = EnhancedLinkedInAnalyticsOrchestrator(
166
- api_key=GOOGLE_API_KEY,
167
- llm_model_name=LLM_MODEL_FOR_ORCHESTRATION,
168
- current_date_for_tasks=datetime.utcnow().date() # Use today for task planning
169
- )
170
-
171
- # --- Data Fetching ---
172
- logger.info(f"Fetching data for organization URN: {ORG_URN_EXAMPLE}")
173
-
174
- # Helper to fetch and log
175
- def get_data(data_type: VALID_DATA_TYPES, org_urn: str) -> pd.DataFrame:
176
- df, error = fetch_linkedin_data_from_bubble(org_urn=org_urn, data_type=data_type)
177
- if error:
178
- logger.error(f"Error fetching {data_type}: {error}. Using empty DataFrame.")
179
- return pd.DataFrame()
180
- if df is None: # Should not happen if error is None, but as a safeguard
181
- logger.warning(f"Fetched {data_type} is None but no error reported. Using empty DataFrame.")
182
- return pd.DataFrame()
183
- logger.info(f"Successfully fetched {data_type} with {len(df)} rows.")
184
- return df
185
-
186
- follower_stats_df_raw = get_data("li_follower_stats", ORG_URN_EXAMPLE)
187
- posts_df_raw = get_data("LI_posts", ORG_URN_EXAMPLE) # Contains post content, media_type, etc.
188
- mentions_df_raw = get_data("Li_mentions", ORG_URN_EXAMPLE)
189
- post_stats_df_raw = get_data("LI_post_stats", ORG_URN_EXAMPLE) # Contains engagement numbers for posts
190
-
191
- # --- Data Preprocessing/Merging (as in original example) ---
192
-
193
- # Select relevant columns for follower_stats_df
194
- if not follower_stats_df_raw.empty:
195
- follower_stats_df = follower_stats_df_raw[[
196
- 'category_name', "follower_count_organic", "follower_count_paid", "follower_count_type"
197
- ]].copy()
198
- else:
199
- follower_stats_df = pd.DataFrame() # Ensure it's an empty DF if raw is empty
200
-
201
- # Merge posts_df and post_stats_df
202
- # This logic assumes 'id' in posts_df_raw and 'post_id' in post_stats_df_raw
203
- merged_posts_df = pd.DataFrame()
204
- if not posts_df_raw.empty and not post_stats_df_raw.empty:
205
- if 'id' in posts_df_raw.columns and 'post_id' in post_stats_df_raw.columns:
206
- # Ensure 'id' in posts_df_raw is unique before merge if it's a left table key
207
- # posts_df_raw.drop_duplicates(subset=['id'], keep='first', inplace=True)
208
- merged_posts_df = pd.merge(posts_df_raw, post_stats_df_raw, left_on='id', right_on='post_id', how='left', suffixes=('', '_stats'))
209
- logger.info(f"Merged posts_df ({len(posts_df_raw)}) and post_stats_df ({len(post_stats_df_raw)}) into merged_posts_df ({len(merged_posts_df)}).")
210
- else:
211
- logger.warning("Cannot merge posts_df and post_stats_df due to missing 'id' or 'post_id'. Using posts_df_raw.")
212
- merged_posts_df = posts_df_raw.copy() # Fallback to posts_df_raw
213
- elif not posts_df_raw.empty:
214
- logger.info("post_stats_df is empty. Using posts_df_raw for post analysis.")
215
- merged_posts_df = posts_df_raw.copy()
216
- else:
217
- logger.warning("Both posts_df_raw and post_stats_df_raw are empty.")
218
- merged_posts_df = pd.DataFrame() # Empty DF
219
-
220
- # Select and ensure essential columns for merged_posts_df
221
- # These are columns expected by EnhancedPostPerformanceAgent
222
- expected_post_cols = [
223
- 'li_eb_label', 'media_type', 'is_ad', 'id', 'published_at', 'sentiment',
224
- 'engagement', 'impressionCount', 'clickCount', 'likeCount', 'commentCount', 'shareCount'
225
- ]
226
- if not merged_posts_df.empty:
227
- final_post_df_cols = {}
228
- for col in expected_post_cols:
229
- if col in merged_posts_df.columns:
230
- final_post_df_cols[col] = merged_posts_df[col]
231
- elif f"{col}_stats" in merged_posts_df.columns: # Check for suffixed columns from merge
232
- final_post_df_cols[col] = merged_posts_df[f"{col}_stats"]
233
- else:
234
- logger.debug(f"Expected column '{col}' not found in merged_posts_df. Will be created as empty/default by agent if needed.")
235
- # Agent preprocessing should handle missing columns by creating them with defaults (0 or 'Unknown')
236
-
237
- # Create the final DataFrame with only the selected/available columns
238
- # This ensures that if a column is missing, it doesn't cause an error here,
239
- # but the agent's preprocessing will handle it.
240
- # However, it's better to ensure they exist with NAs if the agent expects them.
241
- temp_post_df = pd.DataFrame(final_post_df_cols)
242
- # Ensure all expected columns are present, filling with NA if missing from selection
243
- for col in expected_post_cols:
244
- if col not in temp_post_df.columns:
245
- temp_post_df[col] = pd.NA # Or appropriate default like 0 for numeric, 'Unknown' for categorical
246
- merged_posts_df = temp_post_df[expected_post_cols].copy() # Ensure correct order and all columns
247
-
248
- else: # If merged_posts_df started empty and stayed empty
249
- merged_posts_df = pd.DataFrame(columns=expected_post_cols)
250
-
251
-
252
- # Mentions DataFrame - select relevant columns if necessary, or pass as is
253
- # Assuming mentions_df_raw is already in the correct shape or agent handles it.
254
- # For example, if it needs specific columns:
255
- # mentions_df = mentions_df_raw[['date', 'sentiment_label', 'mention_content']].copy() if not mentions_df_raw.empty else pd.DataFrame()
256
- mentions_df = mentions_df_raw.copy() # Pass as is, agent will preprocess
257
-
258
-
259
- # --- Run Orchestration ---
260
- async def main_orchestration():
261
- if follower_stats_df.empty and merged_posts_df.empty and mentions_df.empty:
262
- logger.error("All input DataFrames are empty. Aborting orchestration.")
263
- return None
264
-
265
- logger.info("Orchestrator starting generate_full_analysis_and_tasks...")
266
- results = await orchestrator.generate_full_analysis_and_tasks(
267
- follower_stats_df=follower_stats_df,
268
- post_df=merged_posts_df,
269
- mentions_df=mentions_df
270
- )
271
- return results
272
-
273
- orchestration_results = asyncio.run(main_orchestration())
274
-
275
- # --- Output Results ---
276
- if orchestration_results:
277
- print("\n\n" + "="*30 + " COMPREHENSIVE ANALYSIS REPORT " + "="*30)
278
- print(orchestration_results.get("comprehensive_analysis_report", "Report not generated."))
279
-
280
- print("\n\n" + "="*30 + " ACTIONABLE TASKS (OKRs) " + "="*30)
281
- okrs_data = orchestration_results.get("actionable_okrs_and_tasks")
282
- if okrs_data:
283
- # okrs_data is already a dict from .model_dump()
284
- print(json.dumps(okrs_data, indent=2))
285
- else:
286
- print("No actionable tasks (OKRs) generated or an error occurred.")
287
-
288
- print("\n\n" + "="*30 + " DETAILED AGENT METRICS " + "="*30)
289
- detailed_metrics = orchestration_results.get("detailed_metrics", {})
290
- for agent_name, metrics_dict in detailed_metrics.items():
291
- print(f"\n--- {agent_name.replace('_', ' ').title()} Metrics ---")
292
- if metrics_dict:
293
- print(json.dumps(metrics_dict, indent=2, default=str)) # default=str for any non-serializable types
294
- else:
295
- print("Metrics not available for this agent.")
296
- else:
297
- logger.info("Orchestration did not produce results (likely due to empty input data).")
298
-
299
- logger.info("Orchestration example finished.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/utils/__init__.py DELETED
@@ -1,31 +0,0 @@
1
- # utils/__init__.py
2
-
3
- # This file makes the 'utils' directory a Python package.
4
- # You can choose to expose certain classes or functions directly at the package level
5
- # for easier importing, if desired.
6
-
7
- # For example:
8
- # from .retry_mechanism import RetryMechanism
9
- # from .pandasai_setup import configure_pandasai
10
- # from .data_fetching import fetch_linkedin_data_from_bubble
11
- # from .logging_config import setup_logging
12
-
13
- # Or, you can let users import them directly from the modules:
14
- # from utils.retry_mechanism import RetryMechanism
15
-
16
- # For now, keeping it simple and allowing module-level imports.
17
- # setup_logging() # Optionally call setup_logging() when the utils package is imported.
18
- # However, it's often better to call this explicitly at the application entry point.
19
-
20
- __all__ = [
21
- "RetryMechanism",
22
- "configure_pandasai",
23
- "fetch_linkedin_data_from_bubble",
24
- "setup_logging"
25
- ]
26
-
27
- # Import them here to make them available when 'from utils import *' is used,
28
- # or for direct access like 'utils.RetryMechanism'.
29
- from .retry_mechanism import RetryMechanism
30
- from .pandasai_setup import configure_pandasai
31
- from .logging_config import setup_logging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/utils/logging_config.py DELETED
@@ -1,28 +0,0 @@
1
- # utils/logging_config.py
2
- import logging
3
- import os
4
-
5
- def setup_logging():
6
- """
7
- Configures basic logging for the application.
8
- Logs to console.
9
- """
10
- log_level_str = os.environ.get("LOG_LEVEL", "INFO").upper()
11
- log_level = getattr(logging, log_level_str, logging.INFO)
12
-
13
- logging.basicConfig(
14
- level=log_level,
15
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
16
- datefmt="%Y-%m-%d %H:%M:%S"
17
- )
18
- # You can also direct logs to a file if needed:
19
- # file_handler = logging.FileHandler("app.log")
20
- # file_handler.setLevel(log_level)
21
- # file_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
22
- # logging.getLogger().addHandler(file_handler)
23
-
24
- # Silence overly verbose libraries if necessary
25
- # logging.getLogger("some_verbose_library").setLevel(logging.WARNING)
26
-
27
- logger = logging.getLogger(__name__)
28
- logger.info(f"Logging configured with level: {log_level_str}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/utils/pandasai_setup.py DELETED
@@ -1,54 +0,0 @@
1
- # utils/pandasai_setup.py
2
- import os
3
- import logging
4
- import pandasai as pai
5
- from pandasai_litellm import LiteLLM # Ensure this import matches your installed library
6
-
7
- # Configure logger for this module
8
- logger = logging.getLogger(__name__)
9
-
10
- # It's good practice to define constants at the top or in a config file
11
- DEFAULT_PANDASAI_MODEL = "gemini/gemini-2.5-flash-preview-05-20" # Using a common default
12
-
13
- def configure_pandasai(api_key: str, model_name: str = None):
14
- """
15
- Configures PandasAI with LiteLLM using the provided API key and model.
16
-
17
- Args:
18
- api_key: The Google API key.
19
- model_name: The specific model to use (e.g., "gemini/gemini-1.5-flash-latest").
20
- If None, uses DEFAULT_PANDASAI_MODEL.
21
- """
22
- if not api_key:
23
- logger.error("PandasAI Configuration Error: API key is missing.")
24
- # Depending on strictness, you might raise an error or just log
25
- # raise ValueError("API key must be provided for PandasAI configuration")
26
- return
27
-
28
- os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
29
- os.environ["GOOGLE_API_KEY"] = api_key
30
-
31
-
32
- selected_model = model_name if model_name else DEFAULT_PANDASAI_MODEL
33
-
34
- try:
35
- llm = LiteLLM(
36
- model=DEFAULT_PANDASAI_MODEL , # Use the selected model
37
- api_key=api_key
38
- )
39
-
40
- # PandasAI configuration
41
- pai.config.set({
42
- "llm": llm,
43
- "temperature": 0.3, # Lower temperature for more consistent results
44
- "max_retries": 3
45
- })
46
- logger.info(f"PandasAI configured successfully with model: {selected_model}")
47
- logger.info(f"PandasAI LLM object: {llm}")
48
-
49
-
50
- except ImportError:
51
- logger.error("PandasAI or pandasai_litellm is not installed. Please install the required packages.")
52
- except Exception as e:
53
- logger.error(f"Error configuring PandasAI: {e}", exc_info=True)
54
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features/insight_and_tasks/utils/retry_mechanism.py DELETED
@@ -1,61 +0,0 @@
1
- # utils/retry_mechanism.py
2
- import time
3
- import logging
4
- from typing import Callable, Any, Tuple
5
-
6
- # Configure logger for this module
7
- logger = logging.getLogger(__name__)
8
-
9
- class RetryMechanism:
10
- """External retry mechanism with exponential backoff"""
11
-
12
- @staticmethod
13
- def retry_with_backoff(
14
- func: Callable,
15
- max_retries: int = 3,
16
- base_delay: float = 1.0,
17
- exceptions: Tuple[type[Exception], ...] = (Exception,) # More specific type hint
18
- ) -> Any:
19
- """
20
- Retries a function call with exponential backoff.
21
-
22
- Args:
23
- func: The function to call.
24
- max_retries: Maximum number of retries.
25
- base_delay: Base delay in seconds for backoff.
26
- exceptions: A tuple of exception types to catch and retry on.
27
-
28
- Returns:
29
- The result of the function call if successful.
30
-
31
- Raises:
32
- The last exception encountered if all retries fail.
33
- """
34
- last_exception = None
35
- current_delay = base_delay
36
-
37
- for attempt in range(max_retries + 1): # +1 for initial attempt
38
- try:
39
- logger.info(f"Attempt {attempt + 1}/{max_retries + 1} for function {func.__name__}")
40
- result = func()
41
- if attempt > 0: # Log if a retry was successful
42
- logger.info(f"Function {func.__name__} succeeded on attempt {attempt + 1}")
43
- return result
44
- except exceptions as e:
45
- last_exception = e
46
- logger.warning(f"Attempt {attempt + 1} for {func.__name__} failed: {str(e)}")
47
-
48
- if attempt < max_retries:
49
- logger.info(f"Waiting {current_delay:.2f} seconds before retrying {func.__name__}...")
50
- time.sleep(current_delay)
51
- current_delay *= 2 # Exponential backoff
52
- else:
53
- logger.error(f"All {max_retries + 1} attempts for {func.__name__} failed.")
54
-
55
- # If loop finishes, all retries failed, raise the last exception
56
- if last_exception is not None:
57
- raise last_exception
58
- else:
59
- # This case should ideally not be reached if func always raises on failure
60
- # or returns successfully. Added for completeness.
61
- raise RuntimeError(f"Function {func.__name__} failed after all retries without a specific exception.")