Spaces:
Running
Running
Delete features/insight_and_tasks
Browse files- features/insight_and_tasks/__init__.py +0 -0
- features/insight_and_tasks/agents/follower_agent.py +0 -509
- features/insight_and_tasks/agents/mentions_agent.py +0 -397
- features/insight_and_tasks/agents/post_agent.py +0 -538
- features/insight_and_tasks/agents/task_extraction_agent.py +0 -400
- features/insight_and_tasks/agents/task_extraction_model.py +0 -226
- features/insight_and_tasks/agents/task_extraction_model_groq.py +0 -143
- features/insight_and_tasks/coordinators/employer_branding_coordinator.py +0 -331
- features/insight_and_tasks/data_models/__init__.py +0 -35
- features/insight_and_tasks/data_models/metrics.py +0 -50
- features/insight_and_tasks/data_models/tasks.py +0 -197
- features/insight_and_tasks/orchestrators/linkedin_analytics_orchestrator.py +0 -299
- features/insight_and_tasks/utils/__init__.py +0 -31
- features/insight_and_tasks/utils/logging_config.py +0 -28
- features/insight_and_tasks/utils/pandasai_setup.py +0 -54
- features/insight_and_tasks/utils/retry_mechanism.py +0 -61
features/insight_and_tasks/__init__.py
DELETED
File without changes
|
features/insight_and_tasks/agents/follower_agent.py
DELETED
@@ -1,509 +0,0 @@
|
|
1 |
-
# agents/follower_agent.py
|
2 |
-
import pandas as pd
|
3 |
-
from typing import Dict, List, Any, Optional
|
4 |
-
import logging
|
5 |
-
import pandasai as pai # Assuming pandasai is imported as pai globally or configured
|
6 |
-
|
7 |
-
from google.adk.agents import LlmAgent # Assuming this is the correct import path
|
8 |
-
|
9 |
-
# Project-specific imports
|
10 |
-
from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism
|
11 |
-
from features.insight_and_tasks.data_models.metrics import AgentMetrics, TimeSeriesMetric
|
12 |
-
|
13 |
-
# Configure logger for this module
|
14 |
-
logger = logging.getLogger(__name__)
|
15 |
-
|
16 |
-
# Define the model globally or pass it as a parameter. For now, using a constant.
|
17 |
-
# Consider moving this to a shared config or environment variable.
|
18 |
-
DEFAULT_AGENT_MODEL = "gemini-2.5-flash-preview-05-20" # Or your specific model like "gemini-1.5-flash-preview-05-20"
|
19 |
-
|
20 |
-
|
21 |
-
class EnhancedFollowerAnalysisAgent:
|
22 |
-
"""
|
23 |
-
Enhanced follower analysis agent with proper handling of different follower count types
|
24 |
-
and structured metric extraction.
|
25 |
-
"""
|
26 |
-
|
27 |
-
AGENT_NAME = "follower_analyst"
|
28 |
-
AGENT_DESCRIPTION = "Expert analyst specializing in follower growth patterns and demographic analysis."
|
29 |
-
AGENT_INSTRUCTION = """
|
30 |
-
You are a specialized LinkedIn follower analytics expert focused on temporal patterns and demographic trends.
|
31 |
-
|
32 |
-
Your role includes:
|
33 |
-
|
34 |
-
1. FOLLOWER TREND ANALYSIS:
|
35 |
-
- Analyze follower growth trends over time (monthly data from 'follower_gains_monthly' type).
|
36 |
-
- Identify growth acceleration/deceleration periods.
|
37 |
-
- Calculate growth rates and velocity changes.
|
38 |
-
- Detect seasonal patterns and anomalies.
|
39 |
-
- Analyze organic vs paid follower counts over time.
|
40 |
-
|
41 |
-
2. DEMOGRAPHIC ANALYSIS (based on 'follower_industry', 'follower_seniority', etc.):
|
42 |
-
- Analyze follower distribution by industry, seniority, function, and geography.
|
43 |
-
- Compare organic vs paid followers across these demographic segments.
|
44 |
-
- Identify high-value audience segments based on counts and potential engagement.
|
45 |
-
|
46 |
-
3. TIME-BASED INSIGHTS:
|
47 |
-
- Provide month-over-month comparisons for growth data.
|
48 |
-
- Identify critical inflection points in follower growth.
|
49 |
-
- Calculate trend momentum and acceleration.
|
50 |
-
|
51 |
-
4. METRIC EXTRACTION (for the AgentMetrics structure):
|
52 |
-
- Extract time-series data for total, organic, and paid follower counts, and growth rates.
|
53 |
-
- Provide aggregate metrics like average monthly growth, total organic/paid followers.
|
54 |
-
- Provide demographic breakdowns as categorical metrics (e.g., top N industries by follower count).
|
55 |
-
|
56 |
-
Focus on separating temporal analysis (monthly) from demographic analysis.
|
57 |
-
When analyzing demographics, consider the top N segments (e.g., top 10 industries) for conciseness.
|
58 |
-
Ensure your analysis summary is comprehensive and insightful.
|
59 |
-
"""
|
60 |
-
|
61 |
-
def __init__(self, api_key: str, model_name: Optional[str] = None):
|
62 |
-
"""
|
63 |
-
Initializes the Follower Analysis Agent.
|
64 |
-
|
65 |
-
Args:
|
66 |
-
api_key: API key for LLM and potentially PandasAI.
|
67 |
-
model_name: Name of the language model to use. Defaults to DEFAULT_AGENT_MODEL.
|
68 |
-
"""
|
69 |
-
self.api_key = api_key # May be used if PandasAI is configured per agent or for other API calls
|
70 |
-
self.model_name = model_name or DEFAULT_AGENT_MODEL
|
71 |
-
|
72 |
-
self.agent = LlmAgent(
|
73 |
-
name=self.AGENT_NAME,
|
74 |
-
model=self.model_name,
|
75 |
-
description=self.AGENT_DESCRIPTION,
|
76 |
-
instruction=self.AGENT_INSTRUCTION
|
77 |
-
)
|
78 |
-
self.retry_mechanism = RetryMechanism()
|
79 |
-
logger.info(f"{self.AGENT_NAME} initialized with model {self.model_name}.")
|
80 |
-
|
81 |
-
def _separate_follower_data_by_type(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
|
82 |
-
"""Separate follower data by follower_count_type and process appropriately."""
|
83 |
-
separated_data = {}
|
84 |
-
|
85 |
-
if df is None or df.empty or 'follower_count_type' not in df.columns:
|
86 |
-
logger.warning("Input DataFrame is empty or 'follower_count_type' column is missing.")
|
87 |
-
return separated_data
|
88 |
-
|
89 |
-
# Define the expected follower count types
|
90 |
-
# These should match the 'follower_count_type' values in your Bubble data
|
91 |
-
follower_types = [
|
92 |
-
'follower_gains_monthly', # For time-series analysis
|
93 |
-
'follower_industry', # For demographic analysis
|
94 |
-
'follower_seniority',
|
95 |
-
'follower_function',
|
96 |
-
'follower_geo'
|
97 |
-
]
|
98 |
-
|
99 |
-
for ftype in follower_types:
|
100 |
-
type_data = df[df['follower_count_type'] == ftype].copy()
|
101 |
-
if not type_data.empty:
|
102 |
-
if ftype == 'follower_gains_monthly':
|
103 |
-
type_data = self._process_monthly_data(type_data)
|
104 |
-
else: # Demographic data
|
105 |
-
type_data = self._get_top_demographic_segments(type_data, top_n=10)
|
106 |
-
separated_data[ftype] = type_data
|
107 |
-
else:
|
108 |
-
logger.info(f"No data found for follower_count_type: {ftype}")
|
109 |
-
|
110 |
-
return separated_data
|
111 |
-
|
112 |
-
def _get_top_demographic_segments(self, demo_df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
|
113 |
-
"""Get top N demographic segments by total follower count (organic + paid)."""
|
114 |
-
if demo_df.empty:
|
115 |
-
return demo_df
|
116 |
-
|
117 |
-
# Ensure required columns exist and are numeric, fill NaNs with 0 for sum
|
118 |
-
demo_df = demo_df.copy() # Work on a copy
|
119 |
-
demo_df['follower_count_organic'] = pd.to_numeric(demo_df.get('follower_count_organic'), errors='coerce').fillna(0)
|
120 |
-
demo_df['follower_count_paid'] = pd.to_numeric(demo_df.get('follower_count_paid'), errors='coerce').fillna(0)
|
121 |
-
|
122 |
-
demo_df['total_followers'] = demo_df['follower_count_organic'] + demo_df['follower_count_paid']
|
123 |
-
|
124 |
-
# Sort by total followers and take top N
|
125 |
-
# 'category_name' usually holds the demographic label (e.g., industry name)
|
126 |
-
if 'category_name' not in demo_df.columns:
|
127 |
-
logger.warning("'_get_top_demographic_segments' expects 'category_name' column for grouping.")
|
128 |
-
return demo_df.drop(columns=['total_followers'], errors='ignore')
|
129 |
-
|
130 |
-
# Group by category_name if there are multiple entries for the same category, sum followers
|
131 |
-
# This step might be redundant if data is already aggregated per category_name
|
132 |
-
# demo_df_grouped = demo_df.groupby('category_name').agg(
|
133 |
-
# follower_count_organic=('follower_count_organic', 'sum'),
|
134 |
-
# follower_count_paid=('follower_count_paid', 'sum'),
|
135 |
-
# total_followers=('total_followers', 'sum')
|
136 |
-
# ).reset_index()
|
137 |
-
|
138 |
-
top_segments = demo_df.nlargest(top_n, 'total_followers')
|
139 |
-
|
140 |
-
return top_segments.drop(columns=['total_followers'], errors='ignore')
|
141 |
-
|
142 |
-
|
143 |
-
def _process_monthly_data(self, monthly_df: pd.DataFrame) -> pd.DataFrame:
|
144 |
-
"""Process monthly follower data: parse dates, sort."""
|
145 |
-
if monthly_df.empty or 'category_name' not in monthly_df.columns:
|
146 |
-
logger.warning("Monthly data DataFrame is empty or 'category_name' column is missing.")
|
147 |
-
return monthly_df
|
148 |
-
|
149 |
-
df_processed = monthly_df.copy()
|
150 |
-
|
151 |
-
# 'category_name' for monthly data is expected to be a date string like 'YYYY-MM-DD'
|
152 |
-
# Attempt to convert 'category_name' to datetime
|
153 |
-
df_processed['date_for_analysis'] = pd.to_datetime(df_processed['category_name'], errors='coerce')
|
154 |
-
|
155 |
-
# Drop rows where date conversion failed
|
156 |
-
df_processed.dropna(subset=['date_for_analysis'], inplace=True)
|
157 |
-
|
158 |
-
if df_processed.empty:
|
159 |
-
logger.warning("No valid dates found in 'category_name' for monthly data after processing.")
|
160 |
-
return df_processed
|
161 |
-
|
162 |
-
df_processed['year_month'] = df_processed['date_for_analysis'].dt.strftime('%Y-%m')
|
163 |
-
df_processed['month_name'] = df_processed['date_for_analysis'].dt.strftime('%B %Y')
|
164 |
-
|
165 |
-
# Ensure numeric types for follower counts
|
166 |
-
for col in ['follower_count_organic', 'follower_count_paid']:
|
167 |
-
if col in df_processed.columns:
|
168 |
-
df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce').fillna(0)
|
169 |
-
else: # Add column with zeros if missing, to prevent errors in later calculations
|
170 |
-
df_processed[col] = 0
|
171 |
-
|
172 |
-
|
173 |
-
return df_processed.sort_values('date_for_analysis')
|
174 |
-
|
175 |
-
def _extract_time_series_metrics(self, monthly_df: pd.DataFrame) -> List[TimeSeriesMetric]:
|
176 |
-
"""Extract time-series metrics from processed monthly follower data."""
|
177 |
-
ts_metrics = []
|
178 |
-
if monthly_df.empty or 'date_for_analysis' not in monthly_df.columns:
|
179 |
-
logger.info("Cannot extract time-series metrics: monthly DataFrame is empty or lacks 'date_for_analysis'.")
|
180 |
-
return ts_metrics
|
181 |
-
|
182 |
-
# Ensure data is sorted by date for correct growth rate calculation
|
183 |
-
monthly_df_sorted = monthly_df.sort_values('date_for_analysis').copy()
|
184 |
-
|
185 |
-
timestamps = monthly_df_sorted['year_month'].tolist()
|
186 |
-
|
187 |
-
# Calculate total followers
|
188 |
-
monthly_df_sorted['total_followers'] = monthly_df_sorted.get('follower_count_organic', 0) + \
|
189 |
-
monthly_df_sorted.get('follower_count_paid', 0)
|
190 |
-
|
191 |
-
metric_definitions = {
|
192 |
-
"total_follower_count": monthly_df_sorted['total_followers'],
|
193 |
-
"organic_follower_count": monthly_df_sorted.get('follower_count_organic', pd.Series(0, index=monthly_df_sorted.index)),
|
194 |
-
"paid_follower_count": monthly_df_sorted.get('follower_count_paid', pd.Series(0, index=monthly_df_sorted.index))
|
195 |
-
}
|
196 |
-
|
197 |
-
for name, values_series in metric_definitions.items():
|
198 |
-
ts_metrics.append(TimeSeriesMetric(
|
199 |
-
metric_name=name,
|
200 |
-
values=values_series.tolist(),
|
201 |
-
timestamps=timestamps,
|
202 |
-
metric_type="time_series",
|
203 |
-
time_granularity="monthly"
|
204 |
-
))
|
205 |
-
|
206 |
-
# Calculate growth rate for total followers
|
207 |
-
if len(monthly_df_sorted) > 1:
|
208 |
-
# pct_change gives NaN for the first element, fill with 0
|
209 |
-
growth_rates = monthly_df_sorted['total_followers'].pct_change().fillna(0).tolist()
|
210 |
-
ts_metrics.append(TimeSeriesMetric(
|
211 |
-
metric_name="total_follower_growth_rate",
|
212 |
-
values=growth_rates,
|
213 |
-
timestamps=timestamps, # Timestamps align, first growth rate is vs non-existent previous point (so 0)
|
214 |
-
metric_type="time_series",
|
215 |
-
time_granularity="monthly",
|
216 |
-
unit="%"
|
217 |
-
))
|
218 |
-
else:
|
219 |
-
logger.info("Not enough data points (<=1) to calculate growth rate.")
|
220 |
-
|
221 |
-
return ts_metrics
|
222 |
-
|
223 |
-
def _calculate_aggregate_metrics(self, separated_data: Dict[str, pd.DataFrame]) -> Dict[str, float]:
|
224 |
-
"""Calculate aggregate metrics from all follower data."""
|
225 |
-
agg_metrics = {}
|
226 |
-
|
227 |
-
monthly_df = separated_data.get('follower_gains_monthly')
|
228 |
-
if monthly_df is not None and not monthly_df.empty:
|
229 |
-
total_organic = monthly_df['follower_count_organic'].sum()
|
230 |
-
total_paid = monthly_df['follower_count_paid'].sum()
|
231 |
-
total_all_followers = total_organic + total_paid
|
232 |
-
|
233 |
-
agg_metrics['total_organic_followers_gained_period'] = float(total_organic)
|
234 |
-
agg_metrics['total_paid_followers_gained_period'] = float(total_paid)
|
235 |
-
agg_metrics['overall_total_followers_gained_period'] = float(total_all_followers)
|
236 |
-
|
237 |
-
if total_all_followers > 0:
|
238 |
-
agg_metrics['overall_organic_follower_ratio_gained'] = float(total_organic / total_all_followers)
|
239 |
-
agg_metrics['overall_paid_follower_ratio_gained'] = float(total_paid / total_all_followers)
|
240 |
-
|
241 |
-
# Average monthly gain (if 'total_followers' represents gain, not cumulative)
|
242 |
-
# Assuming 'follower_count_organic/paid' in 'follower_gains_monthly' are indeed GAINS for that month
|
243 |
-
monthly_df['monthly_total_gain'] = monthly_df['follower_count_organic'] + monthly_df['follower_count_paid']
|
244 |
-
if not monthly_df['monthly_total_gain'].empty:
|
245 |
-
agg_metrics['avg_monthly_follower_gain'] = float(monthly_df['monthly_total_gain'].mean())
|
246 |
-
agg_metrics['max_monthly_follower_gain'] = float(monthly_df['monthly_total_gain'].max())
|
247 |
-
agg_metrics['min_monthly_follower_gain'] = float(monthly_df['monthly_total_gain'].min())
|
248 |
-
|
249 |
-
|
250 |
-
# Count of distinct demographic segments identified (top N for each)
|
251 |
-
for demo_type in ['follower_industry', 'follower_seniority', 'follower_function', 'follower_geo']:
|
252 |
-
if demo_type in separated_data and not separated_data[demo_type].empty:
|
253 |
-
agg_metrics[f'distinct_{demo_type}_segments_analyzed'] = float(len(separated_data[demo_type]))
|
254 |
-
|
255 |
-
return agg_metrics
|
256 |
-
|
257 |
-
def _extract_demographic_metrics(self, separated_data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
|
258 |
-
"""Extract demographic distributions (categorical metrics)."""
|
259 |
-
cat_metrics = {}
|
260 |
-
demographic_types_map = {
|
261 |
-
'follower_industry': 'industry_distribution',
|
262 |
-
'follower_seniority': 'seniority_distribution',
|
263 |
-
'follower_function': 'function_distribution',
|
264 |
-
'follower_geo': 'geographic_distribution'
|
265 |
-
}
|
266 |
-
|
267 |
-
for demo_type_key, metric_name_prefix in demographic_types_map.items():
|
268 |
-
demo_df = separated_data.get(demo_type_key)
|
269 |
-
if demo_df is not None and not demo_df.empty and 'category_name' in demo_df.columns:
|
270 |
-
distribution = {}
|
271 |
-
for _, row in demo_df.iterrows():
|
272 |
-
category = row['category_name']
|
273 |
-
organic = float(row.get('follower_count_organic', 0))
|
274 |
-
paid = float(row.get('follower_count_paid', 0))
|
275 |
-
total = organic + paid
|
276 |
-
distribution[category] = {
|
277 |
-
'total_followers': total,
|
278 |
-
'organic_followers': organic,
|
279 |
-
'paid_followers': paid,
|
280 |
-
'organic_ratio': organic / total if total > 0 else 0.0
|
281 |
-
}
|
282 |
-
|
283 |
-
# Sort by total followers descending for the distribution
|
284 |
-
sorted_distribution = dict(sorted(distribution.items(), key=lambda item: item[1]['total_followers'], reverse=True))
|
285 |
-
cat_metrics[metric_name_prefix] = sorted_distribution
|
286 |
-
|
287 |
-
# Summary for this demographic type
|
288 |
-
total_followers_in_type = sum(item['total_followers'] for item in distribution.values())
|
289 |
-
cat_metrics[f'{metric_name_prefix}_summary'] = {
|
290 |
-
'total_followers_in_top_segments': total_followers_in_type,
|
291 |
-
'number_of_segments_reported': len(distribution),
|
292 |
-
'top_segment': list(sorted_distribution.keys())[0] if sorted_distribution else "N/A"
|
293 |
-
}
|
294 |
-
return cat_metrics
|
295 |
-
|
296 |
-
def _extract_time_periods(self, monthly_df: Optional[pd.DataFrame]) -> List[str]:
|
297 |
-
"""Extract unique year-month time periods covered by the monthly data."""
|
298 |
-
if monthly_df is None or monthly_df.empty or 'year_month' not in monthly_df.columns:
|
299 |
-
return ["Data period not available or N/A"]
|
300 |
-
|
301 |
-
periods = sorted(monthly_df['year_month'].dropna().unique().tolist(), reverse=True)
|
302 |
-
return periods[:12] # Return up to the last 12 months if available
|
303 |
-
|
304 |
-
|
305 |
-
def analyze_follower_data(self, follower_stats_df: pd.DataFrame) -> AgentMetrics:
|
306 |
-
"""
|
307 |
-
Generate comprehensive follower analysis using PandasAI and structured metric extraction.
|
308 |
-
"""
|
309 |
-
if follower_stats_df is None or follower_stats_df.empty:
|
310 |
-
logger.warning("Follower statistics DataFrame is empty. Returning empty metrics.")
|
311 |
-
return AgentMetrics(
|
312 |
-
agent_name=self.AGENT_NAME,
|
313 |
-
analysis_summary="No follower data provided for analysis.",
|
314 |
-
time_periods_covered=["N/A"]
|
315 |
-
)
|
316 |
-
|
317 |
-
# 1. Pre-process and separate data
|
318 |
-
separated_data = self._separate_follower_data_by_type(follower_stats_df)
|
319 |
-
|
320 |
-
# Prepare a combined DataFrame for PandasAI if needed, or use the original one.
|
321 |
-
# For PandasAI, it's often better to provide a clean, understandable DataFrame.
|
322 |
-
# Let's use the original df for the textual analysis by PandasAI,
|
323 |
-
# as it contains all types and the LLM can be instructed to differentiate.
|
324 |
-
|
325 |
-
# Ensure PandasAI is configured (this should ideally be done once at orchestrator level)
|
326 |
-
# from utils.pandasai_setup import configure_pandasai
|
327 |
-
# configure_pandasai(self.api_key, self.model_name) # Or pass LLM object if configured outside
|
328 |
-
|
329 |
-
df_description = "LinkedIn follower statistics. Contains 'follower_count_type' indicating data category (e.g., 'follower_gains_monthly', 'follower_industry'), 'category_name' (e.g., date for monthly, industry name for industry type), 'follower_count_organic', 'follower_count_paid'."
|
330 |
-
|
331 |
-
# Create PandasAI DataFrame
|
332 |
-
# Check if pai.DataFrame is the correct way to initialize based on your pandasai version
|
333 |
-
try:
|
334 |
-
pandas_ai_df = pai.DataFrame(follower_stats_df, description=df_description)
|
335 |
-
except Exception as e:
|
336 |
-
logger.error(f"Failed to create PandasAI DataFrame: {e}", exc_info=True)
|
337 |
-
return AgentMetrics(
|
338 |
-
agent_name=self.AGENT_NAME,
|
339 |
-
analysis_summary=f"Error initializing PandasAI: {e}",
|
340 |
-
time_periods_covered=self._extract_time_periods(separated_data.get('follower_gains_monthly'))
|
341 |
-
)
|
342 |
-
|
343 |
-
# 2. Generate textual analysis using PandasAI via LlmAgent
|
344 |
-
# The LlmAgent itself doesn't directly use PandasAI's .chat() method.
|
345 |
-
# The instruction for LlmAgent should guide it to perform analysis.
|
346 |
-
# If direct PandasAI chat is needed, it's a separate call.
|
347 |
-
# The original code uses pandas_df.chat(analysis_query). This implies PandasAI is used directly.
|
348 |
-
# Let's stick to the direct PandasAI chat call as in the original structure.
|
349 |
-
|
350 |
-
analysis_query = f"""
|
351 |
-
Analyze the provided LinkedIn follower statistics. The DataFrame contains various 'follower_count_type' values.
|
352 |
-
Focus on:
|
353 |
-
1. For 'follower_gains_monthly': Analyze monthly follower growth trends (total, organic, paid). Identify key periods of growth or decline.
|
354 |
-
2. For demographic types (industry, seniority, function, geo): Describe the distribution of followers. Which are the top segments? How do organic vs paid compare?
|
355 |
-
3. Synthesize these findings into an overall summary of follower dynamics.
|
356 |
-
|
357 |
-
Consider the data structure: 'category_name' holds the date for monthly data or the demographic label.
|
358 |
-
'follower_count_organic' and 'follower_count_paid' are the key metrics.
|
359 |
-
"""
|
360 |
-
|
361 |
-
analysis_result_text = "PandasAI analysis could not be performed." # Default
|
362 |
-
try:
|
363 |
-
def chat_operation():
|
364 |
-
# Ensure the LLM for PandasAI is correctly configured before this call
|
365 |
-
# This might involve re-calling configure_pandasai if it's not persistent
|
366 |
-
# or if the LLM object needs to be explicitly passed to PandasAI DataFrame.
|
367 |
-
# Check if LLM is configured using the proper config.get() method
|
368 |
-
config = pai.config.get()
|
369 |
-
logger.info(f"pai_config: {config}, Type of config: {type(config)}")
|
370 |
-
if not config.llm:
|
371 |
-
logger.warning("PandasAI LLM not configured. Attempting to configure now.")
|
372 |
-
# This assumes configure_pandasai is available and sets the LLM config
|
373 |
-
from insight_and_tasks.utils.pandasai_setup import configure_pandasai
|
374 |
-
configure_pandasai(self.api_key, self.model_name)
|
375 |
-
|
376 |
-
# Re-check configuration after setup attempt
|
377 |
-
config = pai.config.get()
|
378 |
-
if not config.llm:
|
379 |
-
raise RuntimeError("PandasAI LLM could not be configured for chat operation.")
|
380 |
-
|
381 |
-
logger.info(f"Executing PandasAI chat for follower analysis with LLM: {config.llm}")
|
382 |
-
return pandas_ai_df.chat(analysis_query)
|
383 |
-
|
384 |
-
analysis_result_raw = self.retry_mechanism.retry_with_backoff(
|
385 |
-
func=chat_operation,
|
386 |
-
max_retries=2, # Adjusted retries
|
387 |
-
base_delay=2.0,
|
388 |
-
exceptions=(Exception,) # Catch broader exceptions for PandasAI calls
|
389 |
-
)
|
390 |
-
analysis_result_text = str(analysis_result_raw) if analysis_result_raw else "No textual analysis generated by PandasAI."
|
391 |
-
logger.info("Follower analysis via PandasAI completed.")
|
392 |
-
|
393 |
-
except Exception as e:
|
394 |
-
logger.error(f"Follower analysis with PandasAI failed after retries: {e}", exc_info=True)
|
395 |
-
analysis_result_text = f"Follower analysis using PandasAI failed. Error: {str(e)[:200]}"
|
396 |
-
|
397 |
-
# 3. Extract structured metrics using the separated and processed data
|
398 |
-
monthly_data_for_metrics = separated_data.get('follower_gains_monthly', pd.DataFrame())
|
399 |
-
|
400 |
-
time_series_metrics = self._extract_time_series_metrics(monthly_data_for_metrics)
|
401 |
-
aggregate_metrics = self._calculate_aggregate_metrics(separated_data) # Uses all separated types
|
402 |
-
categorical_metrics = self._extract_demographic_metrics(separated_data) # Uses demographic types
|
403 |
-
time_periods = self._extract_time_periods(monthly_data_for_metrics)
|
404 |
-
|
405 |
-
return AgentMetrics(
|
406 |
-
agent_name=self.AGENT_NAME,
|
407 |
-
analysis_summary=analysis_result_text[:2000], # Truncate if too long
|
408 |
-
time_series_metrics=time_series_metrics,
|
409 |
-
aggregate_metrics=aggregate_metrics,
|
410 |
-
categorical_metrics=categorical_metrics,
|
411 |
-
time_periods_covered=time_periods,
|
412 |
-
data_sources_used=[f"follower_stats_df (shape: {follower_stats_df.shape})"]
|
413 |
-
)
|
414 |
-
|
415 |
-
if __name__ == '__main__':
|
416 |
-
# This is for example and testing purposes.
|
417 |
-
# Ensure logging and other necessary setups are done.
|
418 |
-
try:
|
419 |
-
from utils.logging_config import setup_logging
|
420 |
-
setup_logging()
|
421 |
-
logger.info("Logging setup for EnhancedFollowerAnalysisAgent test.")
|
422 |
-
except ImportError:
|
423 |
-
logging.basicConfig(level=logging.INFO)
|
424 |
-
logger.warning("Could not import setup_logging. Using basicConfig.")
|
425 |
-
|
426 |
-
# Mock API Key and Model for testing
|
427 |
-
# IMPORTANT: For PandasAI to run, a valid API key and model setup are needed.
|
428 |
-
# This example might not fully execute PandasAI chat without proper environment setup.
|
429 |
-
MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_followers")
|
430 |
-
MODEL_NAME = DEFAULT_AGENT_MODEL
|
431 |
-
|
432 |
-
# Configure PandasAI (essential for the .chat() part)
|
433 |
-
try:
|
434 |
-
from utils.pandasai_setup import configure_pandasai
|
435 |
-
if MOCK_API_KEY != "test_api_key_followers": # Only configure if a real key might be present
|
436 |
-
configure_pandasai(MOCK_API_KEY, MODEL_NAME)
|
437 |
-
logger.info("PandasAI configured for testing EnhancedFollowerAnalysisAgent.")
|
438 |
-
else:
|
439 |
-
logger.warning("Using mock API key. PandasAI chat will likely fail or use a default/mock LLM if available.")
|
440 |
-
# Mock pai.DataFrame if pandasai is not fully set up to avoid errors
|
441 |
-
class MockPandasAIDataFrame:
|
442 |
-
def __init__(self, df, description): self.df = df; self.description = description
|
443 |
-
def chat(self, query): return f"Mock PandasAI response to: {query}"
|
444 |
-
pai.DataFrame = MockPandasAIDataFrame
|
445 |
-
|
446 |
-
except ImportError:
|
447 |
-
logger.error("utils.pandasai_setup not found. PandasAI will not be configured.")
|
448 |
-
class MockPandasAIDataFrame:
|
449 |
-
def __init__(self, df, description): self.df = df; self.description = description
|
450 |
-
def chat(self, query): return f"Mock PandasAI response to: {query}"
|
451 |
-
pai.DataFrame = MockPandasAIDataFrame
|
452 |
-
|
453 |
-
# Sample Data
|
454 |
-
sample_follower_data = {
|
455 |
-
'follower_count_type': [
|
456 |
-
'follower_gains_monthly', 'follower_gains_monthly', 'follower_gains_monthly',
|
457 |
-
'follower_industry', 'follower_industry', 'follower_industry', 'follower_industry',
|
458 |
-
'follower_seniority', 'follower_seniority'
|
459 |
-
],
|
460 |
-
'category_name': [ # Dates for monthly, names for demographics
|
461 |
-
'2023-01-01', '2023-02-01', '2023-03-01',
|
462 |
-
'Technology', 'Finance', 'Healthcare', 'Retail',
|
463 |
-
'Senior', 'Entry-Level'
|
464 |
-
],
|
465 |
-
'follower_count_organic': [
|
466 |
-
100, 120, 110, # Monthly gains
|
467 |
-
500, 300, 200, 150, # Industry organic
|
468 |
-
600, 400 # Seniority organic
|
469 |
-
],
|
470 |
-
'follower_count_paid': [
|
471 |
-
10, 15, 12, # Monthly gains
|
472 |
-
50, 30, 20, 10, # Industry paid
|
473 |
-
60, 40 # Seniority paid
|
474 |
-
]
|
475 |
-
}
|
476 |
-
sample_df = pd.DataFrame(sample_follower_data)
|
477 |
-
|
478 |
-
# Initialize agent
|
479 |
-
follower_agent = EnhancedFollowerAnalysisAgent(api_key=MOCK_API_KEY, model_name=MODEL_NAME)
|
480 |
-
|
481 |
-
logger.info("Analyzing sample follower data...")
|
482 |
-
metrics_result = follower_agent.analyze_follower_data(sample_df)
|
483 |
-
|
484 |
-
print("\n--- EnhancedFollowerAnalysisAgent Results ---")
|
485 |
-
print(f"Agent Name: {metrics_result.agent_name}")
|
486 |
-
print(f"Analysis Summary: {metrics_result.analysis_summary}")
|
487 |
-
print("\nTime Series Metrics:")
|
488 |
-
for ts_metric in metrics_result.time_series_metrics:
|
489 |
-
print(f" - {ts_metric.metric_name}: {len(ts_metric.values)} data points, e.g., {ts_metric.values[:3]} for ts {ts_metric.timestamps[:3]}")
|
490 |
-
print("\nAggregate Metrics:")
|
491 |
-
for key, value in metrics_result.aggregate_metrics.items():
|
492 |
-
print(f" - {key}: {value}")
|
493 |
-
print("\nCategorical Metrics:")
|
494 |
-
for key, value in metrics_result.categorical_metrics.items():
|
495 |
-
print(f" - {key}: (details below)")
|
496 |
-
if isinstance(value, dict):
|
497 |
-
for sub_key, sub_value in list(value.items())[:2]: # Print first 2 items for brevity
|
498 |
-
print(f" - {sub_key}: {sub_value}")
|
499 |
-
else:
|
500 |
-
print(f" {value}")
|
501 |
-
|
502 |
-
print(f"\nTime Periods Covered: {metrics_result.time_periods_covered}")
|
503 |
-
print(f"Data Sources Used: {metrics_result.data_sources_used}")
|
504 |
-
print(f"Generated Timestamp: {metrics_result.generation_timestamp}")
|
505 |
-
|
506 |
-
# Test with empty DataFrame
|
507 |
-
logger.info("\n--- Testing with empty DataFrame ---")
|
508 |
-
empty_metrics_result = follower_agent.analyze_follower_data(pd.DataFrame())
|
509 |
-
print(f"Empty DF Analysis Summary: {empty_metrics_result.analysis_summary}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/agents/mentions_agent.py
DELETED
@@ -1,397 +0,0 @@
|
|
1 |
-
# agents/mentions_agent.py
|
2 |
-
import pandas as pd
|
3 |
-
from typing import Dict, List, Any, Optional, Mapping
|
4 |
-
import logging
|
5 |
-
import pandasai as pai # Assuming pandasai is imported as pai globally or configured
|
6 |
-
|
7 |
-
from google.adk.agents import LlmAgent # Assuming this is the correct import path
|
8 |
-
|
9 |
-
# Project-specific imports
|
10 |
-
from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism
|
11 |
-
from features.insight_and_tasks.data_models.metrics import AgentMetrics, TimeSeriesMetric
|
12 |
-
|
13 |
-
# Configure logger for this module
|
14 |
-
logger = logging.getLogger(__name__)
|
15 |
-
|
16 |
-
DEFAULT_AGENT_MODEL = "gemini-2.5-flash-preview-05-20"
|
17 |
-
|
18 |
-
class EnhancedMentionsAnalysisAgent:
|
19 |
-
"""
|
20 |
-
Enhanced mentions analysis agent with time-series metric extraction and sentiment processing.
|
21 |
-
"""
|
22 |
-
AGENT_NAME = "mentions_analyst"
|
23 |
-
AGENT_DESCRIPTION = "Expert analyst specializing in brand mention trends and sentiment patterns."
|
24 |
-
AGENT_INSTRUCTION = """
|
25 |
-
You are a specialized LinkedIn brand mentions expert focused on sentiment trends and mention patterns over time.
|
26 |
-
|
27 |
-
Your role includes:
|
28 |
-
|
29 |
-
1. MENTION TREND ANALYSIS (monthly, using 'date' column):
|
30 |
-
- Analyze mention volume trends over time.
|
31 |
-
- Identify periods with significant spikes or dips in mention activity.
|
32 |
-
|
33 |
-
2. SENTIMENT PATTERN ANALYSIS (monthly, using 'date' and 'sentiment_label'):
|
34 |
-
- Track the evolution of sentiment (e.g., positive, negative, neutral) associated with mentions.
|
35 |
-
- Calculate and analyze the average sentiment score over time (if sentiment can be quantified).
|
36 |
-
- Identify shifts in overall sentiment and potential drivers for these changes.
|
37 |
-
|
38 |
-
3. CORRELATION (Conceptual):
|
39 |
-
- Consider if mention spikes/dips or sentiment shifts correlate with any known company activities, campaigns, or external events (though this data might not be in the input DataFrame, mention the need to investigate).
|
40 |
-
|
41 |
-
4. METRIC EXTRACTION (for AgentMetrics):
|
42 |
-
- Extract time-series data for monthly mention volume.
|
43 |
-
- Extract time-series data for monthly sentiment distribution (e.g., count of positive/negative/neutral mentions) and average sentiment score.
|
44 |
-
- Provide aggregate metrics like total mentions, overall sentiment distribution, and average sentiment score for the period.
|
45 |
-
- Include categorical metrics like the distribution of sentiment labels.
|
46 |
-
|
47 |
-
Focus on identifying actionable insights from mention data. How is the brand being perceived? Are there emerging reputational risks or opportunities?
|
48 |
-
Use the provided DataFrame columns: 'date' (for mentions), 'sentiment_label' (e.g., 'Positive 👍', 'Negative 👎', 'Neutral 😐'), and potentially 'mention_source' or 'mention_content' if available and relevant for deeper analysis (though focus on 'date' and 'sentiment_label' for core metrics).
|
49 |
-
"""
|
50 |
-
|
51 |
-
# Standardized sentiment mapping (can be expanded)
|
52 |
-
# This mapping is crucial for converting labels to scores.
|
53 |
-
SENTIMENT_MAPPING = {
|
54 |
-
'Positive 👍': 1,
|
55 |
-
'Positive': 1, # Adding common variations
|
56 |
-
'Very Positive': 1.5, # Example for more granular sentiment
|
57 |
-
'Negative 👎': -1,
|
58 |
-
'Negative': -1,
|
59 |
-
'Very Negative': -1.5,
|
60 |
-
'Neutral 😐': 0,
|
61 |
-
'Neutral': 0,
|
62 |
-
'Mixed': 0, # Or handle mixed sentiment differently
|
63 |
-
'Unknown': 0 # Default score for unmapped or unknown sentiments
|
64 |
-
}
|
65 |
-
|
66 |
-
|
67 |
-
def __init__(self, api_key: str, model_name: Optional[str] = None):
|
68 |
-
self.api_key = api_key
|
69 |
-
self.model_name = model_name or DEFAULT_AGENT_MODEL
|
70 |
-
self.agent = LlmAgent(
|
71 |
-
name=self.AGENT_NAME,
|
72 |
-
model=self.model_name,
|
73 |
-
description=self.AGENT_DESCRIPTION,
|
74 |
-
instruction=self.AGENT_INSTRUCTION
|
75 |
-
)
|
76 |
-
self.retry_mechanism = RetryMechanism()
|
77 |
-
logger.info(f"{self.AGENT_NAME} initialized with model {self.model_name}.")
|
78 |
-
|
79 |
-
def _get_sentiment_score(self, sentiment_label: Optional[str]) -> float:
|
80 |
-
"""Maps a sentiment label to a numerical score using SENTIMENT_MAPPING."""
|
81 |
-
if sentiment_label is None:
|
82 |
-
return self.SENTIMENT_MAPPING.get('Unknown', 0)
|
83 |
-
# Attempt to match known labels, case-insensitively for robustness if needed,
|
84 |
-
# but exact match is safer with the current emoji-inclusive keys.
|
85 |
-
return float(self.SENTIMENT_MAPPING.get(str(sentiment_label).strip(), self.SENTIMENT_MAPPING.get('Unknown',0)))
|
86 |
-
|
87 |
-
|
88 |
-
def _preprocess_mentions_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
89 |
-
"""Cleans and prepares mentions data for analysis."""
|
90 |
-
if df is None or df.empty:
|
91 |
-
return pd.DataFrame()
|
92 |
-
|
93 |
-
df_processed = df.copy()
|
94 |
-
|
95 |
-
# Convert 'date' to datetime
|
96 |
-
if 'date' in df_processed.columns:
|
97 |
-
df_processed['date'] = pd.to_datetime(df_processed['date'], errors='coerce')
|
98 |
-
# df_processed.dropna(subset=['date'], inplace=True) # Keep for other metrics even if date is NaT
|
99 |
-
else:
|
100 |
-
logger.warning("'date' column not found in mentions data. Time-series analysis will be limited.")
|
101 |
-
# df_processed['date'] = pd.NaT # Add placeholder if critical
|
102 |
-
|
103 |
-
# Process 'sentiment_label' and create 'sentiment_score'
|
104 |
-
if 'sentiment_label' in df_processed.columns:
|
105 |
-
df_processed['sentiment_label'] = df_processed['sentiment_label'].astype(str).fillna('Unknown')
|
106 |
-
df_processed['sentiment_score'] = df_processed['sentiment_label'].apply(self._get_sentiment_score)
|
107 |
-
else:
|
108 |
-
logger.info("'sentiment_label' column not found. Sentiment analysis will be limited.")
|
109 |
-
df_processed['sentiment_label'] = 'Unknown'
|
110 |
-
df_processed['sentiment_score'] = self._get_sentiment_score('Unknown')
|
111 |
-
|
112 |
-
return df_processed
|
113 |
-
|
114 |
-
def _extract_time_series_metrics(self, df_processed: pd.DataFrame) -> List[TimeSeriesMetric]:
|
115 |
-
"""Extracts monthly time-series metrics from processed mentions data."""
|
116 |
-
ts_metrics = []
|
117 |
-
if df_processed.empty or 'date' not in df_processed.columns or df_processed['date'].isnull().all():
|
118 |
-
logger.info("Cannot extract time-series metrics for mentions: 'date' is missing or all null.")
|
119 |
-
return ts_metrics
|
120 |
-
|
121 |
-
df_ts = df_processed.dropna(subset=['date']).copy()
|
122 |
-
if df_ts.empty:
|
123 |
-
logger.info("No valid 'date' values for mentions time-series metrics after filtering NaT.")
|
124 |
-
return ts_metrics
|
125 |
-
|
126 |
-
df_ts['year_month'] = df_ts['date'].dt.strftime('%Y-%m')
|
127 |
-
|
128 |
-
# Monthly mention volume
|
129 |
-
monthly_volume = df_ts.groupby('year_month').size().reset_index(name='mention_count')
|
130 |
-
if not monthly_volume.empty:
|
131 |
-
ts_metrics.append(TimeSeriesMetric(
|
132 |
-
metric_name="monthly_mention_volume",
|
133 |
-
values=monthly_volume['mention_count'].tolist(),
|
134 |
-
timestamps=monthly_volume['year_month'].tolist(),
|
135 |
-
metric_type="time_series",
|
136 |
-
time_granularity="monthly",
|
137 |
-
unit="count"
|
138 |
-
))
|
139 |
-
|
140 |
-
# Monthly average sentiment score
|
141 |
-
if 'sentiment_score' in df_ts.columns:
|
142 |
-
monthly_avg_sentiment = df_ts.groupby('year_month')['sentiment_score'].mean().reset_index()
|
143 |
-
if not monthly_avg_sentiment.empty:
|
144 |
-
ts_metrics.append(TimeSeriesMetric(
|
145 |
-
metric_name="avg_monthly_sentiment_score",
|
146 |
-
values=monthly_avg_sentiment['sentiment_score'].tolist(),
|
147 |
-
timestamps=monthly_avg_sentiment['year_month'].tolist(),
|
148 |
-
metric_type="time_series",
|
149 |
-
time_granularity="monthly",
|
150 |
-
unit="score" # Score range depends on SENTIMENT_MAPPING
|
151 |
-
))
|
152 |
-
|
153 |
-
# Monthly distribution of sentiment labels
|
154 |
-
if 'sentiment_label' in df_ts.columns and df_ts['sentiment_label'].nunique() > 1:
|
155 |
-
# Ensure 'sentiment_label' is not all 'Unknown'
|
156 |
-
if not (df_ts['sentiment_label'] == 'Unknown').all():
|
157 |
-
sentiment_counts_by_month = df_ts.groupby(['year_month', 'sentiment_label']).size().unstack(fill_value=0)
|
158 |
-
for sentiment_val in sentiment_counts_by_month.columns:
|
159 |
-
if sentiment_val == 'Unknown' and (sentiment_counts_by_month[sentiment_val] == 0).all():
|
160 |
-
continue
|
161 |
-
ts_metrics.append(TimeSeriesMetric(
|
162 |
-
metric_name=f"monthly_mention_count_sentiment_{str(sentiment_val).lower().replace(' ', '_').replace('👍','positive').replace('👎','negative').replace('😐','neutral')}",
|
163 |
-
values=sentiment_counts_by_month[sentiment_val].tolist(),
|
164 |
-
timestamps=sentiment_counts_by_month.index.tolist(), # year_month is index
|
165 |
-
metric_type="time_series",
|
166 |
-
time_granularity="monthly",
|
167 |
-
unit="count"
|
168 |
-
))
|
169 |
-
else:
|
170 |
-
logger.info("Sentiment label data is all 'Unknown', skipping sentiment distribution time series.")
|
171 |
-
|
172 |
-
return ts_metrics
|
173 |
-
|
174 |
-
def _calculate_aggregate_metrics(self, df_processed: pd.DataFrame) -> Dict[str, float]:
|
175 |
-
"""Calculates aggregate metrics for mentions."""
|
176 |
-
agg_metrics = {}
|
177 |
-
if df_processed.empty:
|
178 |
-
return agg_metrics
|
179 |
-
|
180 |
-
agg_metrics['total_mentions_analyzed'] = float(len(df_processed))
|
181 |
-
|
182 |
-
if 'sentiment_score' in df_processed.columns and not df_processed['sentiment_score'].empty:
|
183 |
-
agg_metrics['overall_avg_sentiment_score'] = float(df_processed['sentiment_score'].mean())
|
184 |
-
|
185 |
-
if 'sentiment_label' in df_processed.columns:
|
186 |
-
total_valid_sentiments = len(df_processed.dropna(subset=['sentiment_label'])) # Count non-NaN labels
|
187 |
-
if total_valid_sentiments > 0:
|
188 |
-
# Iterate through our defined sentiment mapping to count occurrences
|
189 |
-
sentiment_counts = df_processed['sentiment_label'].value_counts()
|
190 |
-
for label, score_val in self.SENTIMENT_MAPPING.items():
|
191 |
-
# Use a clean key for the metric name
|
192 |
-
clean_label_key = str(label).lower().replace(' ', '_').replace('👍','positive').replace('👎','negative').replace('😐','neutral')
|
193 |
-
if clean_label_key == "unknown" and score_val == 0: # Skip generic unknown if it's just a fallback
|
194 |
-
if sentiment_counts.get(label, 0) == 0 and 'Unknown' not in label : continue
|
195 |
-
|
196 |
-
|
197 |
-
count = sentiment_counts.get(label, 0)
|
198 |
-
if count > 0 or label == 'Unknown': # Report if count > 0 or if it's the 'Unknown' category itself
|
199 |
-
agg_metrics[f'{clean_label_key}_mention_ratio'] = float(count / total_valid_sentiments)
|
200 |
-
agg_metrics[f'{clean_label_key}_mention_count'] = float(count)
|
201 |
-
|
202 |
-
|
203 |
-
# Mentions per day/week (if 'date' column is valid)
|
204 |
-
if 'date' in df_processed.columns and not df_processed['date'].isnull().all():
|
205 |
-
df_dated = df_processed.dropna(subset=['date']).sort_values('date')
|
206 |
-
if len(df_dated) > 1:
|
207 |
-
duration_days = (df_dated['date'].max() - df_dated['date'].min()).days
|
208 |
-
if duration_days > 0:
|
209 |
-
agg_metrics['avg_mentions_per_day'] = float(len(df_dated) / duration_days)
|
210 |
-
agg_metrics['avg_mentions_per_week'] = float(len(df_dated) / (duration_days / 7.0))
|
211 |
-
elif len(df_dated) == 1: # Single day with mentions
|
212 |
-
agg_metrics['avg_mentions_per_day'] = float(len(df_dated))
|
213 |
-
agg_metrics['avg_mentions_per_week'] = float(len(df_dated) * 7) # Extrapolate
|
214 |
-
|
215 |
-
return agg_metrics
|
216 |
-
|
217 |
-
def _extract_categorical_metrics(self, df_processed: pd.DataFrame) -> Dict[str, Any]:
|
218 |
-
"""Extracts categorical distributions for mentions."""
|
219 |
-
cat_metrics = {}
|
220 |
-
if df_processed.empty:
|
221 |
-
return cat_metrics
|
222 |
-
|
223 |
-
# Sentiment label distribution (counts and percentages)
|
224 |
-
if 'sentiment_label' in df_processed.columns and df_processed['sentiment_label'].nunique() > 0:
|
225 |
-
cat_metrics['sentiment_label_distribution_percentage'] = df_processed['sentiment_label'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}").to_dict()
|
226 |
-
cat_metrics['sentiment_label_counts'] = df_processed['sentiment_label'].value_counts().to_dict()
|
227 |
-
|
228 |
-
# Example: If 'mention_source' column existed:
|
229 |
-
# if 'mention_source' in df_processed.columns:
|
230 |
-
# cat_metrics['mention_source_distribution'] = df_processed['mention_source'].value_counts(normalize=True).to_dict()
|
231 |
-
# cat_metrics['mention_source_counts'] = df_processed['mention_source'].value_counts().to_dict()
|
232 |
-
|
233 |
-
return cat_metrics
|
234 |
-
|
235 |
-
def _extract_time_periods(self, df_processed: pd.DataFrame) -> List[str]:
|
236 |
-
"""Extracts unique year-month time periods covered by the mentions data."""
|
237 |
-
if df_processed.empty or 'date' not in df_processed.columns or df_processed['date'].isnull().all():
|
238 |
-
return ["Data period not available or N/A"]
|
239 |
-
|
240 |
-
if 'year_month' in df_processed.columns: # If already created during TS extraction
|
241 |
-
periods = sorted(df_processed['year_month'].dropna().unique().tolist(), reverse=True)
|
242 |
-
elif 'date' in df_processed.columns: # Derive if not present
|
243 |
-
dates = df_processed['date'].dropna()
|
244 |
-
if not dates.empty:
|
245 |
-
periods = sorted(dates.dt.strftime('%Y-%m').unique().tolist(), reverse=True)
|
246 |
-
else: return ["N/A"]
|
247 |
-
else: return ["N/A"]
|
248 |
-
|
249 |
-
return periods[:12] # Return up to the last 12 months
|
250 |
-
|
251 |
-
def analyze_mentions_data(self, mentions_df: pd.DataFrame) -> AgentMetrics:
|
252 |
-
"""
|
253 |
-
Generates comprehensive mentions analysis.
|
254 |
-
"""
|
255 |
-
if mentions_df is None or mentions_df.empty:
|
256 |
-
logger.warning("Mentions DataFrame is empty. Returning empty metrics.")
|
257 |
-
return AgentMetrics(
|
258 |
-
agent_name=self.AGENT_NAME,
|
259 |
-
analysis_summary="No mentions data provided for analysis.",
|
260 |
-
time_periods_covered=["N/A"]
|
261 |
-
)
|
262 |
-
|
263 |
-
# 1. Preprocess data
|
264 |
-
df_processed = self._preprocess_mentions_data(mentions_df)
|
265 |
-
if df_processed.empty and not mentions_df.empty:
|
266 |
-
logger.warning("Mentions DataFrame became empty after preprocessing.")
|
267 |
-
return AgentMetrics(
|
268 |
-
agent_name=self.AGENT_NAME,
|
269 |
-
analysis_summary="Mentions data could not be processed.",
|
270 |
-
time_periods_covered=["N/A"]
|
271 |
-
)
|
272 |
-
elif df_processed.empty and mentions_df.empty:
|
273 |
-
return AgentMetrics(agent_name=self.AGENT_NAME, analysis_summary="No mentions data provided.")
|
274 |
-
|
275 |
-
|
276 |
-
# 2. Generate textual analysis using PandasAI
|
277 |
-
df_description_for_pandasai = "LinkedIn brand mentions data. Key columns: 'date' (date of mention), 'sentiment_label' (e.g., 'Positive 👍', 'Negative 👎', 'Neutral 😐'), 'sentiment_score' (numeric score from -1.5 to 1.5)."
|
278 |
-
|
279 |
-
analysis_result_text = "PandasAI analysis for mentions could not be performed."
|
280 |
-
try:
|
281 |
-
pandas_ai_df = pai.DataFrame(df_processed, description=df_description_for_pandasai)
|
282 |
-
analysis_query = f"""
|
283 |
-
Analyze the provided LinkedIn brand mentions data. Focus on:
|
284 |
-
1. Monthly trends in mention volume.
|
285 |
-
2. Monthly trends in sentiment (average 'sentiment_score' and distribution of 'sentiment_label').
|
286 |
-
3. Identify any significant spikes/dips in mentions or shifts in sentiment.
|
287 |
-
Provide a concise summary of brand perception based on this data.
|
288 |
-
"""
|
289 |
-
def chat_operation():
|
290 |
-
config = pai.config.get()
|
291 |
-
logger.info(f"pai_config: {config}, Type of config: {type(config)}")
|
292 |
-
if not config.llm:
|
293 |
-
logger.warning("PandasAI LLM not configured. Attempting to configure now.")
|
294 |
-
# This assumes configure_pandasai is available and sets the LLM config
|
295 |
-
from insight_and_tasks.utils.pandasai_setup import configure_pandasai
|
296 |
-
configure_pandasai(self.api_key, self.model_name)
|
297 |
-
|
298 |
-
# Re-check configuration after setup attempt
|
299 |
-
config = pai.config.get()
|
300 |
-
if not config.llm:
|
301 |
-
raise RuntimeError("PandasAI LLM could not be configured for chat operation.")
|
302 |
-
|
303 |
-
logger.info(f"Executing PandasAI chat for follower analysis with LLM: {config.llm}")
|
304 |
-
return pandas_ai_df.chat(analysis_query)
|
305 |
-
|
306 |
-
analysis_result_raw = self.retry_mechanism.retry_with_backoff(
|
307 |
-
func=chat_operation, max_retries=2, base_delay=2.0, exceptions=(Exception,)
|
308 |
-
)
|
309 |
-
analysis_result_text = str(analysis_result_raw) if analysis_result_raw else "No textual analysis for mentions generated by PandasAI."
|
310 |
-
logger.info("Mentions analysis via PandasAI completed.")
|
311 |
-
|
312 |
-
except Exception as e:
|
313 |
-
logger.error(f"Mentions analysis with PandasAI failed: {e}", exc_info=True)
|
314 |
-
analysis_result_text = f"Mentions analysis using PandasAI failed. Error: {str(e)[:200]}"
|
315 |
-
|
316 |
-
# 3. Extract structured metrics
|
317 |
-
time_series_metrics = self._extract_time_series_metrics(df_processed)
|
318 |
-
aggregate_metrics = self._calculate_aggregate_metrics(df_processed)
|
319 |
-
categorical_metrics = self._extract_categorical_metrics(df_processed)
|
320 |
-
time_periods = self._extract_time_periods(df_processed)
|
321 |
-
|
322 |
-
return AgentMetrics(
|
323 |
-
agent_name=self.AGENT_NAME,
|
324 |
-
analysis_summary=analysis_result_text[:2000],
|
325 |
-
time_series_metrics=time_series_metrics,
|
326 |
-
aggregate_metrics=aggregate_metrics,
|
327 |
-
categorical_metrics=categorical_metrics,
|
328 |
-
time_periods_covered=time_periods,
|
329 |
-
data_sources_used=[f"mentions_df (shape: {mentions_df.shape}) -> df_processed (shape: {df_processed.shape})"]
|
330 |
-
)
|
331 |
-
|
332 |
-
if __name__ == '__main__':
|
333 |
-
try:
|
334 |
-
from utils.logging_config import setup_logging
|
335 |
-
setup_logging()
|
336 |
-
logger.info("Logging setup for EnhancedMentionsAnalysisAgent test.")
|
337 |
-
except ImportError:
|
338 |
-
logging.basicConfig(level=logging.INFO)
|
339 |
-
logger.warning("Could not import setup_logging. Using basicConfig.")
|
340 |
-
|
341 |
-
MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_mentions")
|
342 |
-
MODEL_NAME = DEFAULT_AGENT_MODEL
|
343 |
-
|
344 |
-
try:
|
345 |
-
from utils.pandasai_setup import configure_pandasai
|
346 |
-
if MOCK_API_KEY != "test_api_key_mentions":
|
347 |
-
configure_pandasai(MOCK_API_KEY, MODEL_NAME)
|
348 |
-
logger.info("PandasAI configured for testing EnhancedMentionsAnalysisAgent.")
|
349 |
-
else:
|
350 |
-
logger.warning("Using mock API key for mentions. PandasAI chat will likely fail or use a mock.")
|
351 |
-
class MockPandasAIDataFrame:
|
352 |
-
def __init__(self, df, description): self.df = df; self.description = description
|
353 |
-
def chat(self, query): return f"Mock PandasAI mentions response to: {query}"
|
354 |
-
pai.DataFrame = MockPandasAIDataFrame
|
355 |
-
except ImportError:
|
356 |
-
logger.error("utils.pandasai_setup not found. PandasAI will not be configured for mentions.")
|
357 |
-
class MockPandasAIDataFrame:
|
358 |
-
def __init__(self, df, description): self.df = df; self.description = description
|
359 |
-
def chat(self, query): return f"Mock PandasAI mentions response to: {query}"
|
360 |
-
pai.DataFrame = MockPandasAIDataFrame
|
361 |
-
|
362 |
-
|
363 |
-
sample_mentions_data = {
|
364 |
-
'date': pd.to_datetime(['2023-01-05', '2023-01-15', '2023-02-02', '2023-02-20', '2023-03-10', '2023-03-12']),
|
365 |
-
'sentiment_label': ['Positive 👍', 'Negative 👎', 'Neutral 😐', 'Positive 👍', 'Positive 👍', 'Unknown'],
|
366 |
-
# 'mention_content': ['Great product!', 'Service was slow.', 'Just a mention.', 'Love the new feature!', 'Highly recommend.', 'Seen this around.']
|
367 |
-
}
|
368 |
-
sample_df_mentions = pd.DataFrame(sample_mentions_data)
|
369 |
-
|
370 |
-
mentions_agent = EnhancedMentionsAnalysisAgent(api_key=MOCK_API_KEY, model_name=MODEL_NAME)
|
371 |
-
|
372 |
-
logger.info("Analyzing sample mentions data...")
|
373 |
-
mentions_metrics_result = mentions_agent.analyze_mentions_data(sample_df_mentions)
|
374 |
-
|
375 |
-
print("\n--- EnhancedMentionsAnalysisAgent Results ---")
|
376 |
-
print(f"Agent Name: {mentions_metrics_result.agent_name}")
|
377 |
-
print(f"Analysis Summary: {mentions_metrics_result.analysis_summary}")
|
378 |
-
print("\nTime Series Metrics (Mentions):")
|
379 |
-
for ts_metric in mentions_metrics_result.time_series_metrics:
|
380 |
-
print(f" - {ts_metric.metric_name}: {len(ts_metric.values)} data points, e.g., {ts_metric.values[:3]} for ts {ts_metric.timestamps[:3]} (Unit: {ts_metric.unit})")
|
381 |
-
print("\nAggregate Metrics (Mentions):")
|
382 |
-
for key, value in mentions_metrics_result.aggregate_metrics.items():
|
383 |
-
print(f" - {key}: {value}")
|
384 |
-
print("\nCategorical Metrics (Mentions):")
|
385 |
-
for key, value in mentions_metrics_result.categorical_metrics.items():
|
386 |
-
print(f" - {key}:")
|
387 |
-
if isinstance(value, dict):
|
388 |
-
for sub_key, sub_value in list(value.items())[:2]: # Print first 2 for brevity
|
389 |
-
print(f" - {sub_key}: {sub_value}")
|
390 |
-
else:
|
391 |
-
print(f" {value}")
|
392 |
-
print(f"\nTime Periods Covered (Mentions): {mentions_metrics_result.time_periods_covered}")
|
393 |
-
|
394 |
-
# Test with empty DataFrame
|
395 |
-
logger.info("\n--- Testing Mentions Agent with empty DataFrame ---")
|
396 |
-
empty_mentions_metrics = mentions_agent.analyze_mentions_data(pd.DataFrame())
|
397 |
-
print(f"Empty Mentions DF Analysis Summary: {empty_mentions_metrics.analysis_summary}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/agents/post_agent.py
DELETED
@@ -1,538 +0,0 @@
|
|
1 |
-
# agents/post_agent.py
|
2 |
-
import pandas as pd
|
3 |
-
from typing import Dict, List, Any, Optional
|
4 |
-
import logging
|
5 |
-
import pandasai as pai # Assuming pandasai is imported as pai globally or configured
|
6 |
-
|
7 |
-
from google.adk.agents import LlmAgent # Assuming this is the correct import path
|
8 |
-
|
9 |
-
# Project-specific imports
|
10 |
-
from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism
|
11 |
-
from features.insight_and_tasks.data_models.metrics import AgentMetrics, TimeSeriesMetric
|
12 |
-
|
13 |
-
# Configure logger for this module
|
14 |
-
logger = logging.getLogger(__name__)
|
15 |
-
|
16 |
-
DEFAULT_AGENT_MODEL = "gemini-2.5-flash-preview-05-20"
|
17 |
-
|
18 |
-
class EnhancedPostPerformanceAgent:
|
19 |
-
"""
|
20 |
-
Enhanced post performance agent with time-series metric extraction and detailed analysis.
|
21 |
-
"""
|
22 |
-
AGENT_NAME = "post_analyst"
|
23 |
-
AGENT_DESCRIPTION = "Expert analyst specializing in content performance trends and engagement patterns."
|
24 |
-
AGENT_INSTRUCTION = """
|
25 |
-
You are a specialized LinkedIn content performance expert focused on temporal engagement patterns,
|
26 |
-
content type effectiveness, and audience interaction.
|
27 |
-
|
28 |
-
Your role includes:
|
29 |
-
|
30 |
-
1. ENGAGEMENT TREND ANALYSIS (monthly, using 'published_at'):
|
31 |
-
- Analyze trends for key engagement metrics: likes, comments, shares, overall engagement ('engagement' column), impressions, clicks.
|
32 |
-
- Calculate and analyze engagement rate (engagement / impressionCount) over time.
|
33 |
-
- Calculate and analyze click-through rate (CTR: clickCount / impressionCount) over time.
|
34 |
-
- Identify periods of high/low engagement and potential drivers.
|
35 |
-
|
36 |
-
2. CONTENT TYPE & TOPIC PERFORMANCE:
|
37 |
-
- Compare performance across different media types (using 'media_type' column).
|
38 |
-
- Analyze performance by content topic/pillar (using 'li_eb_label' column).
|
39 |
-
- Identify which content types/topics drive the most engagement, impressions, or clicks.
|
40 |
-
- Analyze if the effectiveness of certain media types or topics changes over time.
|
41 |
-
|
42 |
-
3. POSTING BEHAVIOR ANALYSIS:
|
43 |
-
- Analyze posting frequency (e.g., posts per week/month) and its potential impact on overall engagement or reach.
|
44 |
-
- Identify if there are optimal posting times or days based on engagement patterns (if 'published_at' provides time detail).
|
45 |
-
|
46 |
-
4. SENTIMENT ANALYSIS (if 'sentiment' column is available):
|
47 |
-
- Analyze the distribution of sentiment (e.g., positive, negative, neutral) associated with posts.
|
48 |
-
- Track how average sentiment of posts evolves over time.
|
49 |
-
|
50 |
-
5. AD PERFORMANCE (if 'is_ad' column is available):
|
51 |
-
- Compare performance (engagement, reach, CTR) of ad posts vs. organic posts.
|
52 |
-
|
53 |
-
6. METRIC EXTRACTION (for AgentMetrics):
|
54 |
-
- Extract time-series data for average monthly engagement metrics (likes, comments, engagement rate, CTR, etc.).
|
55 |
-
- Provide aggregate performance metrics (e.g., overall average engagement rate, total impressions, top performing media type).
|
56 |
-
- Include distributions for content types, topics, and sentiment as categorical metrics.
|
57 |
-
|
58 |
-
Focus on actionable insights. What content resonates most? When is the audience most active? How can strategy be improved?
|
59 |
-
Structure your analysis clearly. Use the provided DataFrame columns ('published_at', 'media_type', 'li_eb_label',
|
60 |
-
'likeCount', 'commentCount', 'shareCount', 'engagement', 'impressionCount', 'clickCount', 'sentiment', 'is_ad').
|
61 |
-
"""
|
62 |
-
|
63 |
-
def __init__(self, api_key: str, model_name: Optional[str] = None):
|
64 |
-
self.api_key = api_key
|
65 |
-
self.model_name = model_name or DEFAULT_AGENT_MODEL
|
66 |
-
self.agent = LlmAgent(
|
67 |
-
name=self.AGENT_NAME,
|
68 |
-
model=self.model_name,
|
69 |
-
description=self.AGENT_DESCRIPTION,
|
70 |
-
instruction=self.AGENT_INSTRUCTION
|
71 |
-
)
|
72 |
-
self.retry_mechanism = RetryMechanism()
|
73 |
-
logger.info(f"{self.AGENT_NAME} initialized with model {self.model_name}.")
|
74 |
-
|
75 |
-
def _preprocess_post_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
76 |
-
"""Cleans and prepares post data for analysis."""
|
77 |
-
if df is None or df.empty:
|
78 |
-
return pd.DataFrame()
|
79 |
-
|
80 |
-
df_processed = df.copy()
|
81 |
-
|
82 |
-
# Convert 'published_at' to datetime
|
83 |
-
if 'published_at' in df_processed.columns:
|
84 |
-
df_processed['published_at'] = pd.to_datetime(df_processed['published_at'], errors='coerce')
|
85 |
-
# df_processed.dropna(subset=['published_at'], inplace=True) # Keep rows even if date is NaT for other metrics
|
86 |
-
else:
|
87 |
-
logger.warning("'published_at' column not found. Time-series analysis will be limited.")
|
88 |
-
# Add a placeholder if critical for downstream, or handle absence gracefully
|
89 |
-
# df_processed['published_at'] = pd.NaT
|
90 |
-
|
91 |
-
# Ensure numeric types for engagement metrics, coercing errors and filling NaNs
|
92 |
-
metric_cols = ['likeCount', 'commentCount', 'shareCount', 'engagement', 'impressionCount', 'clickCount']
|
93 |
-
for col in metric_cols:
|
94 |
-
if col in df_processed.columns:
|
95 |
-
df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce').fillna(0)
|
96 |
-
else:
|
97 |
-
logger.info(f"Metric column '{col}' not found in post data. Will be treated as 0.")
|
98 |
-
df_processed[col] = 0 # Add column with zeros if missing
|
99 |
-
|
100 |
-
# Calculate Engagement Rate and CTR where possible
|
101 |
-
if 'impressionCount' in df_processed.columns and 'engagement' in df_processed.columns:
|
102 |
-
df_processed['engagement_rate'] = df_processed.apply(
|
103 |
-
lambda row: (row['engagement'] / row['impressionCount']) if row['impressionCount'] > 0 else 0.0, axis=1
|
104 |
-
)
|
105 |
-
else:
|
106 |
-
df_processed['engagement_rate'] = 0.0
|
107 |
-
|
108 |
-
if 'impressionCount' in df_processed.columns and 'clickCount' in df_processed.columns:
|
109 |
-
df_processed['ctr'] = df_processed.apply(
|
110 |
-
lambda row: (row['clickCount'] / row['impressionCount']) if row['impressionCount'] > 0 else 0.0, axis=1
|
111 |
-
)
|
112 |
-
else:
|
113 |
-
df_processed['ctr'] = 0.0
|
114 |
-
|
115 |
-
# Handle 'is_ad' boolean conversion if it exists
|
116 |
-
if 'is_ad' in df_processed.columns:
|
117 |
-
df_processed['is_ad'] = df_processed['is_ad'].astype(bool)
|
118 |
-
else:
|
119 |
-
df_processed['is_ad'] = False # Assume organic if not specified
|
120 |
-
|
121 |
-
# Handle 'sentiment' - ensure it's string, fill NaNs
|
122 |
-
if 'sentiment' in df_processed.columns:
|
123 |
-
df_processed['sentiment'] = df_processed['sentiment'].astype(str).fillna('Unknown')
|
124 |
-
else:
|
125 |
-
df_processed['sentiment'] = 'Unknown'
|
126 |
-
|
127 |
-
# Handle 'media_type' and 'li_eb_label' - ensure string, fill NaNs
|
128 |
-
for col in ['media_type', 'li_eb_label']:
|
129 |
-
if col in df_processed.columns:
|
130 |
-
df_processed[col] = df_processed[col].astype(str).fillna('Unknown')
|
131 |
-
else:
|
132 |
-
df_processed[col] = 'Unknown'
|
133 |
-
|
134 |
-
return df_processed
|
135 |
-
|
136 |
-
def _extract_time_series_metrics(self, df_processed: pd.DataFrame) -> List[TimeSeriesMetric]:
    """Extracts monthly time-series metrics from processed post data.

    Builds, from the preprocessed post DataFrame:
      - monthly means of the standard engagement columns ("avg_monthly_<col>"),
      - monthly post counts per sentiment label (when sentiment data is usable),
      - an overall monthly post count.

    Args:
        df_processed: Output of _preprocess_post_data; must contain a datetime
            'published_at' column for anything to be produced.

    Returns:
        List of TimeSeriesMetric objects (empty when no usable dates exist).
    """
    ts_metrics = []
    # Without valid publish dates there is nothing to bucket by month.
    if df_processed.empty or 'published_at' not in df_processed.columns or df_processed['published_at'].isnull().all():
        logger.info("Cannot extract time-series metrics for posts: 'published_at' is missing or all null.")
        return ts_metrics

    # Filter out rows where 'published_at' is NaT for time-series aggregation
    df_ts = df_processed.dropna(subset=['published_at']).copy()
    if df_ts.empty:
        logger.info("No valid 'published_at' dates for post time-series metrics after filtering NaT.")
        return ts_metrics

    # Monthly bucket key, e.g. "2023-07".
    df_ts['year_month'] = df_ts['published_at'].dt.strftime('%Y-%m')

    # Metrics to average monthly
    metrics_to_agg = {
        'likeCount': 'mean', 'commentCount': 'mean', 'shareCount': 'mean',
        'engagement': 'mean', 'impressionCount': 'mean', 'clickCount': 'mean',
        'engagement_rate': 'mean', 'ctr': 'mean'
    }
    # Filter out metrics not present in the DataFrame
    available_metrics_to_agg = {k: v for k, v in metrics_to_agg.items() if k in df_ts.columns}

    if not available_metrics_to_agg:
        logger.info("No standard engagement metric columns found for time-series aggregation.")
    else:
        monthly_stats = df_ts.groupby('year_month').agg(available_metrics_to_agg).reset_index()
        timestamps = monthly_stats['year_month'].tolist()

        for metric_col, agg_type in available_metrics_to_agg.items():
            # Use original column name, prefixed to make the monthly averaging explicit.
            ts_metrics.append(TimeSeriesMetric(
                metric_name=f"avg_monthly_{metric_col.lower()}",
                values=monthly_stats[metric_col].fillna(0).tolist(),
                timestamps=timestamps,
                metric_type="time_series",
                time_granularity="monthly",
                # Rates/CTR are fractions reported as %, everything else is a raw count.
                unit="%" if "_rate" in metric_col or "ctr" in metric_col else "count"
            ))

    # Time series for sentiment distribution (count of posts by sentiment per month)
    if 'sentiment' in df_ts.columns and df_ts['sentiment'].nunique() > 1 :  # if sentiment data exists
        # Ensure 'sentiment' is not all 'Unknown'
        if not (df_ts['sentiment'] == 'Unknown').all():
            sentiment_by_month = df_ts.groupby(['year_month', 'sentiment']).size().unstack(fill_value=0)
            for sentiment_value in sentiment_by_month.columns:
                # Skip an 'Unknown' series that carries no posts at all.
                if sentiment_value == 'Unknown' and (sentiment_by_month[sentiment_value] == 0).all():
                    continue  # Skip if 'Unknown' sentiment has no posts
                ts_metrics.append(TimeSeriesMetric(
                    metric_name=f"monthly_post_count_sentiment_{str(sentiment_value).lower().replace(' ', '_')}",
                    values=sentiment_by_month[sentiment_value].tolist(),
                    timestamps=sentiment_by_month.index.tolist(),  # year_month is the index
                    metric_type="time_series",
                    time_granularity="monthly",
                    unit="count"
                ))
        else:
            logger.info("Sentiment data is all 'Unknown', skipping sentiment time series.")

    # Time series for post count
    monthly_post_counts = df_ts.groupby('year_month').size().reset_index(name='post_count')
    if not monthly_post_counts.empty:
        ts_metrics.append(TimeSeriesMetric(
            metric_name="monthly_post_count",
            values=monthly_post_counts['post_count'].tolist(),
            timestamps=monthly_post_counts['year_month'].tolist(),
            metric_type="time_series",
            time_granularity="monthly",
            unit="count"
        ))

    return ts_metrics
|
209 |
-
|
210 |
-
def _calculate_aggregate_metrics(self, df_processed: pd.DataFrame) -> Dict[str, Any]:
|
211 |
-
"""Calculates aggregate performance metrics for posts."""
|
212 |
-
agg_metrics = {}
|
213 |
-
if df_processed.empty:
|
214 |
-
return agg_metrics
|
215 |
-
|
216 |
-
# Overall averages and totals
|
217 |
-
metric_cols_for_agg = ['likeCount', 'commentCount', 'shareCount', 'engagement',
|
218 |
-
'impressionCount', 'clickCount', 'engagement_rate', 'ctr']
|
219 |
-
for col in metric_cols_for_agg:
|
220 |
-
if col in df_processed.columns and pd.api.types.is_numeric_dtype(df_processed[col]):
|
221 |
-
agg_metrics[f'overall_avg_{col.lower()}'] = float(df_processed[col].mean())
|
222 |
-
if col not in ['engagement_rate', 'ctr']: # Totals make sense for counts
|
223 |
-
agg_metrics[f'overall_total_{col.lower()}'] = float(df_processed[col].sum())
|
224 |
-
|
225 |
-
agg_metrics['total_posts_analyzed'] = float(len(df_processed))
|
226 |
-
|
227 |
-
# Posting frequency (posts per week)
|
228 |
-
if 'published_at' in df_processed.columns and not df_processed['published_at'].isnull().all():
|
229 |
-
df_dated = df_processed.dropna(subset=['published_at']).sort_values('published_at')
|
230 |
-
if len(df_dated) > 1:
|
231 |
-
# Calculate total duration in days
|
232 |
-
duration_days = (df_dated['published_at'].max() - df_dated['published_at'].min()).days
|
233 |
-
if duration_days > 0:
|
234 |
-
agg_metrics['avg_posts_per_week'] = float(len(df_dated) / (duration_days / 7.0))
|
235 |
-
elif len(df_dated) > 0: # All posts on the same day or within a day
|
236 |
-
agg_metrics['avg_posts_per_week'] = float(len(df_dated) * 7) # Extrapolate
|
237 |
-
elif len(df_dated) == 1:
|
238 |
-
agg_metrics['avg_posts_per_week'] = 7.0 # One post, extrapolate to 7 per week
|
239 |
-
|
240 |
-
# Performance by media type and topic (as tables/structured dicts)
|
241 |
-
agg_metrics['performance_by_media_type'] = self._create_performance_table(df_processed, 'media_type')
|
242 |
-
agg_metrics['performance_by_topic'] = self._create_performance_table(df_processed, 'li_eb_label')
|
243 |
-
|
244 |
-
return agg_metrics
|
245 |
-
|
246 |
-
def _create_performance_table(self, df: pd.DataFrame, group_column: str) -> Dict[str, Any]:
|
247 |
-
"""Helper to create a structured performance table for categorical analysis."""
|
248 |
-
if group_column not in df.columns or df[group_column].isnull().all() or (df[group_column] == 'Unknown').all():
|
249 |
-
return {"message": f"No data or only 'Unknown' values for grouping by {group_column}."}
|
250 |
-
|
251 |
-
# Filter out 'Unknown' category if it's the only one or for cleaner tables
|
252 |
-
df_filtered = df[df[group_column] != 'Unknown']
|
253 |
-
if df_filtered.empty: # If filtering 'Unknown' leaves no data, use original df but acknowledge
|
254 |
-
df_filtered = df
|
255 |
-
logger.info(f"Performance table for {group_column} includes 'Unknown' as it's the only/main category.")
|
256 |
-
|
257 |
-
# Define metrics to aggregate
|
258 |
-
agg_config = {
|
259 |
-
'engagement': 'mean',
|
260 |
-
'impressionCount': 'mean',
|
261 |
-
'clickCount': 'mean',
|
262 |
-
'likeCount': 'mean',
|
263 |
-
'commentCount': 'mean',
|
264 |
-
'shareCount': 'mean',
|
265 |
-
'engagement_rate': 'mean',
|
266 |
-
'ctr': 'mean',
|
267 |
-
'published_at': 'count' # To get number of posts per category
|
268 |
-
}
|
269 |
-
# Filter config for columns that actually exist in df_filtered
|
270 |
-
valid_agg_config = {k: v for k, v in agg_config.items() if k in df_filtered.columns or k == 'published_at'} # 'published_at' for count
|
271 |
-
|
272 |
-
if not valid_agg_config or 'published_at' not in valid_agg_config : # Need at least one metric or count
|
273 |
-
return {"message": f"Not enough relevant metric columns to create performance table for {group_column}."}
|
274 |
-
|
275 |
-
|
276 |
-
try:
|
277 |
-
# Group by the specified column and aggregate
|
278 |
-
# Rename 'published_at' count to 'num_posts' for clarity
|
279 |
-
grouped = df_filtered.groupby(group_column).agg(valid_agg_config).rename(
|
280 |
-
columns={'published_at': 'num_posts'}
|
281 |
-
).reset_index()
|
282 |
-
|
283 |
-
# Sort by a primary engagement metric, e.g., average engagement rate or num_posts
|
284 |
-
sort_key = 'num_posts'
|
285 |
-
if 'engagement_rate' in grouped.columns:
|
286 |
-
sort_key = 'engagement_rate'
|
287 |
-
elif 'engagement' in grouped.columns:
|
288 |
-
sort_key = 'engagement'
|
289 |
-
|
290 |
-
grouped = grouped.sort_values(by=sort_key, ascending=False)
|
291 |
-
|
292 |
-
# Prepare for JSON serializable output
|
293 |
-
table_data = []
|
294 |
-
for _, row in grouped.iterrows():
|
295 |
-
row_dict = {'category': row[group_column]}
|
296 |
-
for col in grouped.columns:
|
297 |
-
if col == group_column: continue
|
298 |
-
value = row[col]
|
299 |
-
if isinstance(value, (int, float)):
|
300 |
-
if "_rate" in col or "ctr" in col:
|
301 |
-
row_dict[col] = f"{value:.2%}" # Percentage
|
302 |
-
else:
|
303 |
-
row_dict[col] = round(value, 2) if isinstance(value, float) else value
|
304 |
-
else:
|
305 |
-
row_dict[col] = str(value)
|
306 |
-
table_data.append(row_dict)
|
307 |
-
|
308 |
-
return {
|
309 |
-
"grouping_column": group_column,
|
310 |
-
"columns_reported": [col for col in grouped.columns.tolist() if col != group_column],
|
311 |
-
"data": table_data,
|
312 |
-
"note": f"Top categories by {sort_key}."
|
313 |
-
}
|
314 |
-
|
315 |
-
except Exception as e:
|
316 |
-
logger.error(f"Error creating performance table for {group_column}: {e}", exc_info=True)
|
317 |
-
return {"error": f"Could not generate table for {group_column}: {e}"}
|
318 |
-
|
319 |
-
|
320 |
-
def _extract_categorical_metrics(self, df_processed: pd.DataFrame) -> Dict[str, Any]:
|
321 |
-
"""Extracts distributions and other categorical insights for posts."""
|
322 |
-
cat_metrics = {}
|
323 |
-
if df_processed.empty:
|
324 |
-
return cat_metrics
|
325 |
-
|
326 |
-
# Media type distribution
|
327 |
-
if 'media_type' in df_processed.columns and df_processed['media_type'].nunique() > 0:
|
328 |
-
cat_metrics['media_type_distribution'] = df_processed['media_type'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}").to_dict()
|
329 |
-
cat_metrics['media_type_counts'] = df_processed['media_type'].value_counts().to_dict()
|
330 |
-
|
331 |
-
|
332 |
-
# Topic distribution (li_eb_label)
|
333 |
-
if 'li_eb_label' in df_processed.columns and df_processed['li_eb_label'].nunique() > 0:
|
334 |
-
cat_metrics['topic_distribution'] = df_processed['li_eb_label'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}").to_dict()
|
335 |
-
cat_metrics['topic_counts'] = df_processed['li_eb_label'].value_counts().to_dict()
|
336 |
-
|
337 |
-
# Sentiment distribution
|
338 |
-
if 'sentiment' in df_processed.columns and df_processed['sentiment'].nunique() > 0:
|
339 |
-
cat_metrics['sentiment_distribution'] = df_processed['sentiment'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}").to_dict()
|
340 |
-
cat_metrics['sentiment_counts'] = df_processed['sentiment'].value_counts().to_dict()
|
341 |
-
|
342 |
-
# Ad vs. Organic performance summary
|
343 |
-
if 'is_ad' in df_processed.columns:
|
344 |
-
ad_summary = {}
|
345 |
-
for ad_status in [True, False]:
|
346 |
-
subset = df_processed[df_processed['is_ad'] == ad_status]
|
347 |
-
if not subset.empty:
|
348 |
-
label = "ad" if ad_status else "organic"
|
349 |
-
ad_summary[f'{label}_post_count'] = int(len(subset))
|
350 |
-
ad_summary[f'{label}_avg_engagement_rate'] = float(subset['engagement_rate'].mean())
|
351 |
-
ad_summary[f'{label}_avg_impressions'] = float(subset['impressionCount'].mean())
|
352 |
-
ad_summary[f'{label}_avg_ctr'] = float(subset['ctr'].mean())
|
353 |
-
if ad_summary:
|
354 |
-
cat_metrics['ad_vs_organic_summary'] = ad_summary
|
355 |
-
|
356 |
-
return cat_metrics
|
357 |
-
|
358 |
-
def _extract_time_periods(self, df_processed: pd.DataFrame) -> List[str]:
|
359 |
-
"""Extracts unique year-month time periods covered by the post data."""
|
360 |
-
if df_processed.empty or 'published_at' not in df_processed.columns or df_processed['published_at'].isnull().all():
|
361 |
-
return ["Data period not available or N/A"]
|
362 |
-
|
363 |
-
# Use already created 'year_month' if available from preprocessing, or derive it
|
364 |
-
if 'year_month' in df_processed.columns:
|
365 |
-
periods = sorted(df_processed['year_month'].dropna().unique().tolist(), reverse=True)
|
366 |
-
elif 'published_at' in df_processed.columns: # Derive if not present
|
367 |
-
dates = df_processed['published_at'].dropna()
|
368 |
-
if not dates.empty:
|
369 |
-
periods = sorted(dates.dt.strftime('%Y-%m').unique().tolist(), reverse=True)
|
370 |
-
else: return ["N/A"]
|
371 |
-
else: return ["N/A"]
|
372 |
-
|
373 |
-
return periods[:12] # Return up to the last 12 months
|
374 |
-
|
375 |
-
def analyze_post_data(self, post_df: pd.DataFrame) -> AgentMetrics:
    """
    Generates comprehensive post performance analysis.

    Pipeline: preprocess the raw post DataFrame, run a PandasAI natural-language
    analysis over it (with retry/backoff), then extract structured time-series,
    aggregate and categorical metrics.

    Args:
        post_df: Raw LinkedIn post data.

    Returns:
        AgentMetrics carrying the textual summary plus all structured metrics;
        for empty/unusable input, an AgentMetrics stub explaining why.
    """
    if post_df is None or post_df.empty:
        logger.warning("Post DataFrame is empty. Returning empty metrics.")
        return AgentMetrics(
            agent_name=self.AGENT_NAME,
            analysis_summary="No post data provided for analysis.",
            time_periods_covered=["N/A"]
        )

    # 1. Preprocess data
    df_processed = self._preprocess_post_data(post_df)
    if df_processed.empty and not post_df.empty:  # Preprocessing resulted in empty df
        logger.warning("Post DataFrame became empty after preprocessing. Original data might have issues.")
        return AgentMetrics(
            agent_name=self.AGENT_NAME,
            analysis_summary="Post data could not be processed (e.g., all dates invalid).",
            time_periods_covered=["N/A"]
        )
    elif df_processed.empty and post_df.empty:  # Was already empty
        # This case is handled by the initial check, but as a safeguard:
        return AgentMetrics(agent_name=self.AGENT_NAME, analysis_summary="No post data provided.")

    # 2. Generate textual analysis using PandasAI
    df_description_for_pandasai = "LinkedIn post performance data. Key columns: 'published_at' (date of post), 'media_type' (e.g., IMAGE, VIDEO, ARTICLE), 'li_eb_label' (content topic/pillar), 'likeCount', 'commentCount', 'shareCount', 'engagement' (sum of reactions, comments, shares), 'impressionCount', 'clickCount', 'sentiment' (post sentiment), 'is_ad' (boolean), 'engagement_rate', 'ctr'."

    analysis_result_text = "PandasAI analysis for posts could not be performed."
    try:
        # Ensure PandasAI is configured
        pandas_ai_df = pai.DataFrame(df_processed, description=df_description_for_pandasai)

        analysis_query = f"""
        Analyze the provided LinkedIn post performance data. Focus on:
        1. Monthly trends for key metrics (engagement, impressions, engagement rate, CTR).
        2. Performance comparison by 'media_type' and 'li_eb_label'. Which ones are most effective?
        3. Impact of posting frequency (if derivable from 'published_at' timestamps).
        4. Sentiment trends and distribution.
        5. Differences in performance between ad posts ('is_ad'=True) and organic posts.
        Provide a concise summary of findings and actionable recommendations.
        """

        def chat_operation():
            # Lazily (re)configure the PandasAI LLM if it is not set yet.
            config = pai.config.get()
            logger.info(f"pai_config: {config}, Type of config: {type(config)}")
            if not config.llm:
                logger.warning("PandasAI LLM not configured. Attempting to configure now.")
                # FIX: import from the canonical package path; the previous
                # 'insight_and_tasks.utils.pandasai_setup' did not match the
                # 'features.insight_and_tasks.*' imports used by this module.
                from features.insight_and_tasks.utils.pandasai_setup import configure_pandasai
                configure_pandasai(self.api_key, self.model_name)

                # Re-check configuration after setup attempt
                config = pai.config.get()
                if not config.llm:
                    raise RuntimeError("PandasAI LLM could not be configured for chat operation.")

            # FIX: log message previously said "follower analysis" — copy-paste
            # from the follower agent; this is the post agent.
            logger.info(f"Executing PandasAI chat for post analysis with LLM: {config.llm}")
            return pandas_ai_df.chat(analysis_query)

        analysis_result_raw = self.retry_mechanism.retry_with_backoff(
            func=chat_operation, max_retries=2, base_delay=2.0, exceptions=(Exception,)
        )
        analysis_result_text = str(analysis_result_raw) if analysis_result_raw else "No textual analysis for posts generated by PandasAI."
        logger.info("Post performance analysis via PandasAI completed.")

    except Exception as e:
        logger.error(f"Post analysis with PandasAI failed: {e}", exc_info=True)
        analysis_result_text = f"Post analysis using PandasAI failed. Error: {str(e)[:200]}"

    # 3. Extract structured metrics
    time_series_metrics = self._extract_time_series_metrics(df_processed)
    aggregate_metrics = self._calculate_aggregate_metrics(df_processed)
    categorical_metrics = self._extract_categorical_metrics(df_processed)
    time_periods = self._extract_time_periods(df_processed)

    return AgentMetrics(
        agent_name=self.AGENT_NAME,
        analysis_summary=analysis_result_text[:2000],  # Cap summary length for downstream consumers
        time_series_metrics=time_series_metrics,
        aggregate_metrics=aggregate_metrics,
        categorical_metrics=categorical_metrics,
        time_periods_covered=time_periods,
        data_sources_used=[f"post_df (shape: {post_df.shape}) -> df_processed (shape: {df_processed.shape})"]
    )
|
460 |
-
|
461 |
-
if __name__ == '__main__':
    # Ad-hoc manual test harness for EnhancedPostPerformanceAgent.
    # NOTE(review): relies on `os`, `DEFAULT_AGENT_MODEL` and
    # `EnhancedPostPerformanceAgent` being defined/imported earlier in this
    # module — confirm `import os` exists at the top of the file.
    try:
        # NOTE(review): path differs from the 'features.insight_and_tasks.utils'
        # package prefix used by the module imports above — verify it resolves.
        from utils.logging_config import setup_logging
        setup_logging()
        logger.info("Logging setup for EnhancedPostPerformanceAgent test.")
    except ImportError:
        logging.basicConfig(level=logging.INFO)
        logger.warning("Could not import setup_logging. Using basicConfig.")

    # Fall back to a placeholder key so the script can run without credentials.
    MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_posts")
    MODEL_NAME = DEFAULT_AGENT_MODEL

    try:
        # NOTE(review): same import-path inconsistency as above.
        from utils.pandasai_setup import configure_pandasai
        if MOCK_API_KEY != "test_api_key_posts":
            configure_pandasai(MOCK_API_KEY, MODEL_NAME)
            logger.info("PandasAI configured for testing EnhancedPostPerformanceAgent.")
        else:
            logger.warning("Using mock API key for posts. PandasAI chat will likely fail or use a mock.")
            # Monkey-patch pai.DataFrame with a stub so .chat() returns canned text.
            class MockPandasAIDataFrame:
                def __init__(self, df, description): self.df = df; self.description = description
                def chat(self, query): return f"Mock PandasAI post response to: {query}"
            pai.DataFrame = MockPandasAIDataFrame
    except ImportError:
        logger.error("utils.pandasai_setup not found. PandasAI will not be configured for posts.")
        # Same stub as above: keeps the harness runnable without PandasAI setup.
        class MockPandasAIDataFrame:
            def __init__(self, df, description): self.df = df; self.description = description
            def chat(self, query): return f"Mock PandasAI post response to: {query}"
        pai.DataFrame = MockPandasAIDataFrame

    # Small fixture covering every column the agent consumes, including one
    # NaT publish date to exercise the dropna paths.
    sample_post_data = {
        'published_at': pd.to_datetime(['2023-01-15', '2023-01-20', '2023-02-10', '2023-02-25', '2023-03-05', None]),
        'media_type': ['IMAGE', 'VIDEO', 'IMAGE', 'ARTICLE', 'IMAGE', 'IMAGE'],
        'li_eb_label': ['Product Update', 'Company Culture', 'Product Update', 'Industry Insights', 'Company Culture', 'Product Update'],
        'likeCount': [100, 150, 120, 80, 200, 50],
        'commentCount': [10, 20, 15, 5, 25, 3],
        'shareCount': [5, 10, 8, 2, 12, 1],
        'engagement': [115, 180, 143, 87, 237, 54],  # Sum of likes, comments, shares
        'impressionCount': [1000, 1500, 1200, 900, 2000, 600],
        'clickCount': [50, 70, 60, 30, 90, 20],
        'sentiment': ['Positive 👍', 'Positive 👍', 'Neutral 😐', 'Positive 👍', 'Negative 👎', 'Positive 👍'],
        'is_ad': [False, False, True, False, False, True]
    }
    sample_df_posts = pd.DataFrame(sample_post_data)

    post_agent = EnhancedPostPerformanceAgent(api_key=MOCK_API_KEY, model_name=MODEL_NAME)

    logger.info("Analyzing sample post data...")
    post_metrics_result = post_agent.analyze_post_data(sample_df_posts)

    # Pretty-print each section of the returned AgentMetrics.
    print("\n--- EnhancedPostPerformanceAgent Results ---")
    print(f"Agent Name: {post_metrics_result.agent_name}")
    print(f"Analysis Summary: {post_metrics_result.analysis_summary}")
    print("\nTime Series Metrics (Post):")
    for ts_metric in post_metrics_result.time_series_metrics:
        print(f"  - {ts_metric.metric_name}: {len(ts_metric.values)} data points, e.g., {ts_metric.values[:3]} for ts {ts_metric.timestamps[:3]} (Unit: {ts_metric.unit})")
    print("\nAggregate Metrics (Post):")
    for key, value in post_metrics_result.aggregate_metrics.items():
        if isinstance(value, dict) and 'data' in value:  # Performance table
            print(f"  - {key}: (Table - {value.get('grouping_column', '')}) - {len(value['data'])} categories")
            for item in value['data'][:1]:  # Print first item for brevity
                print(f"    Example Category '{item.get('category')}': { {k:v for k,v in item.items() if k!='category'} }")
        else:
            print(f"  - {key}: {value}")
    print("\nCategorical Metrics (Post):")
    for key, value in post_metrics_result.categorical_metrics.items():
        print(f"  - {key}:")
        if isinstance(value, dict):
            for sub_key, sub_value in list(value.items())[:2]:
                print(f"    - {sub_key}: {sub_value}")
        else:
            print(f"    {value}")
    print(f"\nTime Periods Covered (Post): {post_metrics_result.time_periods_covered}")

    # Test with empty DataFrame
    logger.info("\n--- Testing Post Agent with empty DataFrame ---")
    empty_post_metrics = post_agent.analyze_post_data(pd.DataFrame())
    print(f"Empty Post DF Analysis Summary: {empty_post_metrics.analysis_summary}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/agents/task_extraction_agent.py
DELETED
@@ -1,400 +0,0 @@
|
|
1 |
-
# agents/task_extraction_agent.py
|
2 |
-
import logging
|
3 |
-
from typing import Optional
|
4 |
-
from datetime import datetime, date # Ensure date is imported if used for type hints
|
5 |
-
|
6 |
-
from google.adk.agents import LlmAgent
|
7 |
-
from google.adk.runners import InMemoryRunner # Assuming this is used for direct agent running
|
8 |
-
from google.genai import types as genai_types # For constructing ADK agent inputs
|
9 |
-
|
10 |
-
# Project-specific imports
|
11 |
-
from features.insight_and_tasks.data_models.tasks import (
|
12 |
-
TaskExtractionOutput,
|
13 |
-
OKR,
|
14 |
-
KeyResult,
|
15 |
-
Task,
|
16 |
-
EffortLevel,
|
17 |
-
TimelineCategory,
|
18 |
-
PriorityLevel,
|
19 |
-
TaskType,
|
20 |
-
DataSubject # Ensure all are imported
|
21 |
-
)
|
22 |
-
from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism # If retries are needed for ADK calls
|
23 |
-
|
24 |
-
# Configure logger for this module
|
25 |
-
logger = logging.getLogger(__name__)
|
26 |
-
|
27 |
-
DEFAULT_AGENT_MODEL = "gemini-2.5-flash-preview-05-20" # Or your specific model
|
28 |
-
|
29 |
-
class TaskExtractionAgent:
|
30 |
-
"""
|
31 |
-
Agent specialized in extracting actionable tasks and OKRs from analysis insights,
|
32 |
-
with awareness of the current date and quarter.
|
33 |
-
"""
|
34 |
-
AGENT_NAME = "task_extractor"
|
35 |
-
AGENT_DESCRIPTION = "Specialist in converting strategic insights into specific, time-aware actionable tasks and OKRs."
|
36 |
-
|
37 |
-
def __init__(self, api_key: str, model_name: Optional[str] = None, current_date: Optional[date] = None):
    """
    Initializes the TaskExtractionAgent.

    Args:
        api_key: API key (may be used by LlmAgent configuration or future needs).
        model_name: Name of the language model to use; falls back to DEFAULT_AGENT_MODEL.
        current_date: The current date to use for quarter calculations. Defaults to today.
    """
    self.api_key = api_key  # Store if needed by LlmAgent or other components
    self.model_name = model_name or DEFAULT_AGENT_MODEL
    # NOTE(review): datetime.utcnow() is deprecated from Python 3.12;
    # datetime.now(timezone.utc).date() is the modern equivalent — confirm target version.
    self.current_date = current_date or datetime.utcnow().date()  # Use date object for consistency

    # LlmAgent is initialized with a dynamic, date-aware instruction and a
    # Pydantic output schema so responses arrive as structured TaskExtractionOutput.
    self.agent = LlmAgent(
        name=self.AGENT_NAME,
        model=self.model_name,
        description=self.AGENT_DESCRIPTION,
        instruction=self._get_instruction_prompt(),  # Instruction generated dynamically
        output_schema=TaskExtractionOutput,  # Pydantic model for structured output
        output_key="extracted_tasks_okrs"  # Key where LlmAgent stores structured output in state
    )
    self.retry_mechanism = RetryMechanism()  # For retrying ADK runner if needed
    logger.info(f"{self.AGENT_NAME} initialized for Q{self._get_quarter(self.current_date)}, "
                f"{self._days_until_quarter_end(self.current_date)} days remaining in quarter. Model: {self.model_name}")
|
61 |
-
|
62 |
-
def _get_quarter(self, d: date) -> int:
|
63 |
-
"""Calculates the quarter for a given date."""
|
64 |
-
return (d.month - 1) // 3 + 1
|
65 |
-
|
66 |
-
def _days_until_quarter_end(self, d: date) -> int:
|
67 |
-
"""Calculates the number of days remaining in the current quarter from date d."""
|
68 |
-
current_q = self._get_quarter(d)
|
69 |
-
year = d.year
|
70 |
-
if current_q == 1:
|
71 |
-
quarter_end_date = date(year, 3, 31)
|
72 |
-
elif current_q == 2:
|
73 |
-
quarter_end_date = date(year, 6, 30)
|
74 |
-
elif current_q == 3:
|
75 |
-
quarter_end_date = date(year, 9, 30)
|
76 |
-
else: # Quarter 4
|
77 |
-
quarter_end_date = date(year, 12, 31)
|
78 |
-
|
79 |
-
days_remaining = (quarter_end_date - d).days
|
80 |
-
return max(0, days_remaining) # Ensure non-negative
|
81 |
-
|
82 |
-
def _get_instruction_prompt(self) -> str:
    """Generates the dynamic instruction string for the LLM agent.

    The prompt embeds the current quarter and days-remaining so the model's
    OKR/task timelines are grounded in real time, and it inlines enum values
    (TimelineCategory, EffortLevel, PriorityLevel, TaskType, DataSubject) so
    the example JSON matches the TaskExtractionOutput schema exactly.
    """
    quarter = self._get_quarter(self.current_date)
    days_remaining = self._days_until_quarter_end(self.current_date)

    # NOTE: doubled braces {{ }} below are literal braces of the example JSON;
    # single-brace expressions are interpolated by the f-string.
    return f"""
    You are a Time-Aware Task Extraction Specialist, an AI expert in meticulously analyzing strategic insights (e.g., from LinkedIn analytics) and transforming them into a structured set of actionable tasks, organized within an Objectives and KeyResults (OKRs) framework.

    CURRENT CONTEXTUAL INFORMATION (CRITICAL - Use these exact values in your output where specified):
    - Current Quarter: Q{quarter}
    - Days remaining in current quarter: {days_remaining}
    - Today's Date (for your context only, not for direct output unless specified by a schema field): {self.current_date.isoformat()}

    For EACH 'OKR' object, you MUST generate a 'key_results' array containing 1 to 3 'KeyResult' objects.
    For EACH 'KeyResult' object, you MUST generate a 'tasks' array containing 1 to 3 'Task' objects.
    It is CRITICAL that you populate the 'key_results' list for every OKR, and the 'tasks' list for every KeyResult.

    KEY GUIDELINES FOR QUALITY AND ACCURACY:
    - Actionability: All descriptions (Objective, Key Result, Task) must be clear, concise, and define concrete actions or measurable outcomes.
    - Measurability: Key Results and Task 'success_criteria_metrics' must be specific and quantifiable.
    - Completeness: Ensure all REQUIRED fields in every Pydantic model are present in your JSON output. Optional fields can be omitted or set to null if not applicable.

    INPUT:
    You will receive a 'comprehensive_analysis' text.

    OUTPUT FORMAT:
    # Example of the overall JSON structure (content is illustrative; refer to schemas for full details):
    {{
        "current_quarter_info": "Q{quarter}, {days_remaining} days remaining",
        "okrs": [
            {{
                "objective_description": "Example: Elevate brand visibility and engagement across key digital channels.",
                "objective_timeline": "{TimelineCategory.SHORT_TERM.value}",
                "objective_owner": "Marketing Department",
                "key_results": [
                    {{
                        "key_result_description": "Example: Increase organic reach on LinkedIn by 15%.",
                        "target_metric": "LinkedIn Organic Reach Percentage Increase",
                        "target_value": "15%",
                        "tasks": [
                            {{
                                "task_category": "Content Strategy",
                                "task_description": "Develop and schedule a 4-week content calendar for LinkedIn focusing on industry insights.",
                                "objective_deliverable": "Deliverable: A finalized 4-week content calendar with 3 posts per week, approved and scheduled.",
                                "effort": "{EffortLevel.MEDIUM.value}",
                                "timeline": "{TimelineCategory.IMMEDIATE.value}",
                                "responsible_party": "Content Marketing Manager",
                                "success_criteria_metrics": "Content calendar completed and approved by [Date]. All posts scheduled by [Date].",
                                "dependencies_prerequisites": "Completion of Q{quarter} keyword research and audience persona refinement.",
                                "priority": "{PriorityLevel.HIGH.value}",
                                "priority_justification": "Critical for maintaining consistent brand voice and achieving engagement targets for the quarter.",
                                "why_proposed": "Analysis of LinkedIn insights report (Page 3) showed a 20% drop in engagement last month, attributed to inconsistent posting schedule and lack of targeted content themes.",
                                "task_type": "{TaskType.INITIATIVE.value}",
                                "data_subject": "{DataSubject.POSTS.value}"
                            }}
                        ]
                    }}
                ]
            }}
        ],
        "overall_strategic_focus": "Example: Focus on data-driven content strategy and proactive community engagement to boost Q{quarter} performance.",
        "generation_timestamp": "{datetime.utcnow().isoformat()}Z"
    }}

    Focus on precision, quality, actionability, and strict adherence to the specified JSON output schema and all constraints.
    Ensure all string values in the JSON are properly escaped if they contain special characters (e.g., newlines, quotes).
    """
|
150 |
-
|
151 |
-
async def extract_tasks(self, comprehensive_analysis: str) -> TaskExtractionOutput:
|
152 |
-
"""
|
153 |
-
Extracts time-aware actionable tasks from the comprehensive analysis text.
|
154 |
-
Args:
|
155 |
-
comprehensive_analysis: The text analysis from which to extract tasks.
|
156 |
-
Returns:
|
157 |
-
A TaskExtractionOutput Pydantic model instance.
|
158 |
-
"""
|
159 |
-
if not comprehensive_analysis or not comprehensive_analysis.strip():
|
160 |
-
logger.warning("Comprehensive analysis text is empty. Cannot extract tasks.")
|
161 |
-
return TaskExtractionOutput(
|
162 |
-
current_quarter_info=f"Q{self._get_quarter(self.current_date)}, {self._days_until_quarter_end(self.current_date)} days remaining",
|
163 |
-
okrs=[],
|
164 |
-
overall_strategic_focus="No analysis provided to extract tasks."
|
165 |
-
)
|
166 |
-
|
167 |
-
# The LlmAgent's instruction already contains the dynamic date info and output format.
|
168 |
-
# The input to the agent's run method will be the comprehensive_analysis.
|
169 |
-
prompt_for_adk_agent = f"""
|
170 |
-
Comprehensive Analysis for Task Extraction:
|
171 |
-
---
|
172 |
-
{comprehensive_analysis}
|
173 |
-
---
|
174 |
-
Based on the analysis above, and adhering strictly to your primary instructions (especially regarding current quarter context, task field requirements, and JSON output schema 'TaskExtractionOutput'), generate the OKRs and tasks.
|
175 |
-
Ensure the 'current_quarter_info' field in your JSON output is exactly: "Q{self._get_quarter(self.current_date)}, {self._days_until_quarter_end(self.current_date)} days remaining".
|
176 |
-
"""
|
177 |
-
|
178 |
-
user_input_content = genai_types.Content(
|
179 |
-
role="user",
|
180 |
-
parts=[genai_types.Part(text=prompt_for_adk_agent)]
|
181 |
-
)
|
182 |
-
|
183 |
-
# Using InMemoryRunner as per original structure for LlmAgent with output_schema
|
184 |
-
runner = InMemoryRunner(agent=self.agent, app_name=f"{self.AGENT_NAME}Runner")
|
185 |
-
# Generate a unique user_id for each run to ensure fresh session state if needed.
|
186 |
-
user_id = f"system_user_task_extractor_{int(datetime.utcnow().timestamp())}"
|
187 |
-
|
188 |
-
session = await runner.session_service.create_session(
|
189 |
-
app_name=f"{self.AGENT_NAME}Runner",
|
190 |
-
user_id=user_id
|
191 |
-
)
|
192 |
-
|
193 |
-
extracted_data_dict = None
|
194 |
-
full_response_text_for_debug = "" # To capture raw text if parsing fails
|
195 |
-
|
196 |
-
try:
|
197 |
-
logger.info(f"Running TaskExtractionAgent for user_id: {user_id}, session_id: {session.id}")
|
198 |
-
|
199 |
-
# Fix: Use regular for loop instead of async for, since runner.run() returns a generator
|
200 |
-
run_result = runner.run(
|
201 |
-
user_id=user_id,
|
202 |
-
session_id=session.id,
|
203 |
-
new_message=user_input_content
|
204 |
-
)
|
205 |
-
|
206 |
-
# Check if it's an async iterator or regular generator
|
207 |
-
if hasattr(run_result, '__aiter__'):
|
208 |
-
# It's an async iterator, use async for
|
209 |
-
async for event in run_result:
|
210 |
-
if (hasattr(event, 'actions') and event.actions and
|
211 |
-
hasattr(event.actions, 'state_delta') and
|
212 |
-
isinstance(event.actions.state_delta, dict) and
|
213 |
-
self.agent.output_key in event.actions.state_delta):
|
214 |
-
|
215 |
-
extracted_data_dict = event.actions.state_delta[self.agent.output_key]
|
216 |
-
logger.info(f"Successfully extracted structured data via LlmAgent state_delta.")
|
217 |
-
break
|
218 |
-
|
219 |
-
# Capture text parts for debugging if direct structured output isn't found first
|
220 |
-
if hasattr(event, 'content') and event.content and event.content.parts:
|
221 |
-
for part in event.content.parts:
|
222 |
-
if hasattr(part, 'text'):
|
223 |
-
full_response_text_for_debug += part.text
|
224 |
-
else:
|
225 |
-
# It's a regular generator, use regular for loop
|
226 |
-
for event in run_result:
|
227 |
-
if (hasattr(event, 'actions') and event.actions and
|
228 |
-
hasattr(event.actions, 'state_delta') and
|
229 |
-
isinstance(event.actions.state_delta, dict) and
|
230 |
-
self.agent.output_key in event.actions.state_delta):
|
231 |
-
|
232 |
-
extracted_data_dict = event.actions.state_delta[self.agent.output_key]
|
233 |
-
logger.info(f"Successfully extracted structured data via LlmAgent state_delta.")
|
234 |
-
break
|
235 |
-
|
236 |
-
# Capture text parts for debugging if direct structured output isn't found first
|
237 |
-
if hasattr(event, 'content') and event.content and event.content.parts:
|
238 |
-
for part in event.content.parts:
|
239 |
-
if hasattr(part, 'text'):
|
240 |
-
full_response_text_for_debug += part.text
|
241 |
-
|
242 |
-
if not extracted_data_dict and full_response_text_for_debug:
|
243 |
-
logger.warning("LlmAgent did not produce structured output in state_delta. Raw text response was: %s",
|
244 |
-
full_response_text_for_debug[:500] + "...")
|
245 |
-
|
246 |
-
except Exception as e:
|
247 |
-
logger.error(f"Error during TaskExtractionAgent execution: {e}", exc_info=True)
|
248 |
-
finally:
|
249 |
-
try:
|
250 |
-
await runner.session_service.delete_session(
|
251 |
-
app_name=f"{self.AGENT_NAME}Runner", user_id=user_id, session_id=session.id
|
252 |
-
)
|
253 |
-
except Exception as session_del_e:
|
254 |
-
logger.error(f"Error deleting task extractor session: {session_del_e}")
|
255 |
-
|
256 |
-
if extracted_data_dict:
|
257 |
-
if isinstance(extracted_data_dict, TaskExtractionOutput): # Already a Pydantic model
|
258 |
-
return extracted_data_dict
|
259 |
-
elif isinstance(extracted_data_dict, dict): # If it's a dict, parse it
|
260 |
-
try:
|
261 |
-
return TaskExtractionOutput(**extracted_data_dict)
|
262 |
-
except Exception as pydantic_error:
|
263 |
-
logger.error(f"Error parsing extracted dictionary into TaskExtractionOutput: {pydantic_error}", exc_info=True)
|
264 |
-
logger.error(f"Problematic dictionary data: {extracted_data_dict}")
|
265 |
-
else:
|
266 |
-
logger.error(f"Extracted data is not a dictionary or TaskExtractionOutput model: {type(extracted_data_dict)}")
|
267 |
-
|
268 |
-
# Fallback if no valid data extracted
|
269 |
-
logger.warning("No valid structured data extracted by TaskExtractionAgent.")
|
270 |
-
return TaskExtractionOutput(
|
271 |
-
current_quarter_info=f"Q{self._get_quarter(self.current_date)}, {self._days_until_quarter_end(self.current_date)} days remaining",
|
272 |
-
okrs=[],
|
273 |
-
overall_strategic_focus="Failed to extract tasks or no tasks were identified.",
|
274 |
-
generation_timestamp=datetime.utcnow().isoformat()
|
275 |
-
)
|
276 |
-
|
277 |
-
def update_current_date(self, new_date: date):
|
278 |
-
"""
|
279 |
-
Updates the current date for the agent and re-initializes the LlmAgent
|
280 |
-
to reflect the new date context in its instructions.
|
281 |
-
"""
|
282 |
-
self.current_date = new_date
|
283 |
-
# Re-initialize the LlmAgent with the new instruction based on the new date
|
284 |
-
self.agent = LlmAgent(
|
285 |
-
name=self.AGENT_NAME,
|
286 |
-
model=self.model_name,
|
287 |
-
description=self.AGENT_DESCRIPTION,
|
288 |
-
instruction=self._get_instruction_prompt(), # Get updated instruction
|
289 |
-
output_schema=TaskExtractionOutput,
|
290 |
-
output_key="extracted_tasks_okrs"
|
291 |
-
)
|
292 |
-
logger.info(f"{self.AGENT_NAME} date updated. New context: Q{self._get_quarter(self.current_date)}, "
|
293 |
-
f"{self._days_until_quarter_end(self.current_date)} days remaining.")
|
294 |
-
|
295 |
-
|
296 |
-
if __name__ == '__main__':
    # Standalone demo: builds a TaskExtractionAgent pinned to a fixed date and
    # runs one extraction, monkey-patching the ADK runner when no real API key
    # is configured.
    import asyncio
    # (Ensure logging_config.py is in the same directory or PYTHONPATH is set for this example to run standalone)
    try:
        from utils.logging_config import setup_logging
        setup_logging()
        logger.info("Logging setup for TaskExtractionAgent test.")
    except ImportError:
        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        logger.warning("logging_config.py not found, using basicConfig for logging.")

    MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_task_extractor")  # Use your actual key or env var
    MODEL_NAME = DEFAULT_AGENT_MODEL

    # Example comprehensive analysis text (replace with actual analysis output)
    sample_analysis_text = """
    Overall Summary: Follower growth is steady at 5% MoM. Post engagement is highest for video content
    (avg 8% engagement rate) published on weekdays. However, mentions sentiment dipped in the last month
    (-0.2 avg score) due to complaints about customer service response times.
    Key opportunity: Improve customer service communication and leverage video content more effectively.
    Strategic Recommendation: Launch a 'Customer First' initiative and create a video series showcasing customer success stories.
    """

    # Test with a specific date
    test_date = date(2025, 4, 15)  # Example: Mid-Q2 2025
    task_agent = TaskExtractionAgent(api_key=MOCK_API_KEY, model_name=MODEL_NAME, current_date=test_date)

    logger.info(f"Task Agent Instruction for test_date {test_date}:\n{task_agent._get_instruction_prompt()[:500]}...")

    async def run_extraction():
        """Run one mocked (or real, if a key is set) extraction and print the result."""
        logger.info("Extracting tasks from sample analysis...")
        # In a real scenario, ensure GOOGLE_API_KEY is set if the LlmAgent makes actual calls.
        # For local tests without real API calls, the LlmAgent might behave as a mock or require specific test setup.
        if MOCK_API_KEY == "test_api_key_task_extractor":
            logger.warning("Using a mock API key. LlmAgent behavior might be limited or mocked for task extraction.")
            # Mock the runner if no real API call should be made
            # NOTE(review): the mock exposes session_service_create_session /
            # session_service_delete_session as flat methods, but extract_tasks
            # calls runner.session_service.create_session(...) — presumably this
            # mock path raises AttributeError if exercised; confirm before relying on it.
            class MockADKRunner:
                def __init__(self, agent, app_name): self.agent = agent
                async def session_service_create_session(self, app_name, user_id):
                    class MockSession: id = "mock_session_id"
                    return MockSession()
                async def run(self, user_id, session_id, new_message):
                    # Simulate a response structure
                    mock_okr = OKR(
                        objective_description="Improve Customer Satisfaction",
                        key_results=[KeyResult(
                            key_result_description="Reduce negative mentions by 10%",
                            tasks=[Task(
                                task_category="Customer Service", task_description="Respond to all negative mentions within 2 hours.",
                                objective_deliverable="Improved response time.", effort=EffortLevel.MEDIUM, timeline=TimelineCategory.IMMEDIATE,
                                responsible_party="Support Team", success_criteria_metrics="Avg response time < 2hrs.",
                                priority=PriorityLevel.HIGH, priority_justification="Critical for reputation.",
                                why_proposed="Analysis showed dip in sentiment due to slow responses.", task_type=TaskType.INITIATIVE,
                                data_subject=DataSubject.MENTIONS
                            )]
                        )],
                        objective_timeline=TimelineCategory.SHORT_TERM
                    )
                    mock_output = TaskExtractionOutput(
                        current_quarter_info=f"Q{task_agent._get_quarter(task_agent.current_date)}, {task_agent._days_until_quarter_end(task_agent.current_date)} days remaining",
                        okrs=[mock_okr],
                        overall_strategic_focus="Focus on customer service improvement."
                    )
                    # Simulate the event structure LlmAgent with output_schema would produce
                    class MockEvent:
                        def __init__(self):
                            self.actions = type('Actions', (), {'state_delta': {task_agent.agent.output_key: mock_output.model_dump()}})()  # .model_dump() for Pydantic v2
                    yield MockEvent()

                async def session_service_delete_session(self, app_name, user_id, session_id): pass

            # Monkey patch the InMemoryRunner for this test if using mock key
            global InMemoryRunner
            OriginalInMemoryRunner = InMemoryRunner
            InMemoryRunner = MockADKRunner

        extracted_okrs_output = await task_agent.extract_tasks(sample_analysis_text)

        # Restore InMemoryRunner if it was patched
        # NOTE(review): OriginalInMemoryRunner is bound in function scope, so the
        # 'in globals()' check looks like it is always False here — verify the
        # restore path actually runs.
        if MOCK_API_KEY == "test_api_key_task_extractor" and 'OriginalInMemoryRunner' in globals():
            InMemoryRunner = OriginalInMemoryRunner

        print("\n--- TaskExtractionAgent Results ---")
        if extracted_okrs_output:
            print(f"Current Quarter Info: {extracted_okrs_output.current_quarter_info}")
            print(f"Overall Strategic Focus: {extracted_okrs_output.overall_strategic_focus}")
            print(f"Generated Timestamp: {extracted_okrs_output.generation_timestamp}")
            print("\nOKRs Extracted:")
            # Use .model_dump_json() for Pydantic v2 for pretty printing
            print(extracted_okrs_output.model_dump_json(indent=2))
        else:
            print("No OKRs extracted or an error occurred.")

    if __name__ == '__main__':  # This check is technically inside another if __name__ == '__main__'
        asyncio.run(run_extraction())

    # Example of updating date
    logger.info("\n--- Updating date for Task Agent ---")
    new_test_date = date(2025, 10, 5)  # Q4
    task_agent.update_current_date(new_test_date)
    # The instruction within task_agent.agent is now updated.
    # logger.info(f"Task Agent NEW Instruction for test_date {new_test_date}:\n{task_agent.agent.instruction[:500]}...")
    # A new call to extract_tasks would use this updated context.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/agents/task_extraction_model.py
DELETED
@@ -1,226 +0,0 @@
|
|
1 |
-
import enum
|
2 |
-
import json
|
3 |
-
from typing import List, Optional, Literal
|
4 |
-
from pydantic import BaseModel, Field, field_validator, ValidationInfo
|
5 |
-
from datetime import datetime, date
|
6 |
-
|
7 |
-
try:
|
8 |
-
from google import genai
|
9 |
-
except ImportError:
|
10 |
-
print("Warning: 'google.generai' library not found. Please install it.")
|
11 |
-
print("If you are using the standard Gemini API, try: pip install google-generativeai")
|
12 |
-
print("If using Vertex AI, ensure the Google Cloud SDK is configured.")
|
13 |
-
genai = None # Placeholder to allow script to be parsed
|
14 |
-
|
15 |
-
from features.insight_and_tasks.data_models.tasks import (
|
16 |
-
TaskExtractionOutput,
|
17 |
-
OKR,
|
18 |
-
KeyResult,
|
19 |
-
Task,
|
20 |
-
EffortLevel,
|
21 |
-
TimelineCategory,
|
22 |
-
PriorityLevel,
|
23 |
-
TaskType,
|
24 |
-
DataSubject # Ensure all are imported
|
25 |
-
)
|
26 |
-
|
27 |
-
def create_example_structure():
    """
    Build a schema-conformant example payload demonstrating to the AI what a
    valid 'TaskExtractionOutput' JSON object looks like.

    Every enum-valued field uses the exact casing the Pydantic models expect
    (e.g. "PERFORMANCE", "FOLLOWER_STATS", "High", "Medium", "Short-term").
    """
    # Assemble bottom-up: task -> key result -> OKR -> top-level envelope.
    example_task = {
        "task_description": "Increase posting frequency to a consistent, high-quality schedule.",
        "objective_deliverable": "Post a minimum of 3 high-quality, relevant articles or updates per week.",
        "task_category": "Content Creation",
        "task_type": "INITIATIVE",
        "priority": "High",
        "priority_justification": "Increasing post frequency is a primary driver for engagement and follower growth, directly impacting the key result.",
        "effort": "Medium",
        "timeline": "Short-term",
        "data_subject": "POSTS",
        "responsible_party": "Social Media Manager",
        "success_criteria_metrics": "A weekly average of 3 or more posts is maintained over the quarter.",
        "dependencies_prerequisites": "A finalized content calendar for the quarter.",
        "why_proposed": "Historical data analysis shows a direct correlation between low posting frequency and stagnant follower gains. This task addresses the root cause.",
    }
    example_key_result = {
        # Long enough to satisfy the model's min-length constraint on descriptions.
        "key_result_description": "Achieve a sustained 50% increase in the rate of monthly follower growth on our company LinkedIn page, demonstrating enhanced audience engagement and brand reach.",
        "target_metric": "Monthly Follower Growth Rate",
        "target_value": "50% increase",
        "key_result_type": "PERFORMANCE",
        "data_subject": "FOLLOWER_STATS",
        "tasks": [example_task],
    }
    example_okr = {
        "objective_description": "Significantly improve our LinkedIn employer branding performance to attract top-tier talent and establish our company as a thought leader in the tech industry.",
        "objective_timeline": "Short-term",
        "objective_owner": "Marketing Department",
        "key_results": [example_key_result],
    }
    return {
        "current_quarter_info": "Q2 2025, 24 days remaining",
        "okrs": [example_okr],
        "overall_strategic_focus": "Accelerate follower growth and enhance brand authority on LinkedIn.",
    }
|
89 |
-
|
90 |
-
|
91 |
-
# --- Helper Function for Date Calculations ---
|
92 |
-
def get_quarter_info(today: Optional[date] = None):
    """Calculate the current quarter, year, and days remaining in that quarter.

    Args:
        today: Reference date; defaults to date.today(). Exposed as a parameter
            (backward-compatible — existing zero-argument callers are unchanged)
            so callers and tests can pin the date deterministically.

    Returns:
        Tuple of (quarter 1-4, year, days_remaining >= 0, reference_date).
    """
    if today is None:
        today = date.today()
    current_year = today.year
    current_quarter = (today.month - 1) // 3 + 1

    # Last calendar day of each quarter for the reference year.
    quarter_end_by_q = {
        1: date(current_year, 3, 31),
        2: date(current_year, 6, 30),
        3: date(current_year, 9, 30),
        4: date(current_year, 12, 31),
    }
    end_of_quarter_date = quarter_end_by_q[current_quarter]

    # Clamp at 0 so the quarter's last day never reports a negative remainder.
    days_remaining = max(0, (end_of_quarter_date - today).days)

    return current_quarter, current_year, days_remaining, today
|
112 |
-
|
113 |
-
# --- Main Task Extraction Function ---
|
114 |
-
def extract_tasks_from_text(user_text_input: str, api_key: str) -> tuple:
    """
    Extracts tasks from input text using the Gemini API and structures them.

    Args:
        user_text_input: The text to analyze.
        api_key: The Gemini API key.

    Returns:
        A 4-tuple (task_output, quarter, year, days_remaining) where task_output
        is a TaskExtractionOutput Pydantic model instance and the other elements
        are the quarter context computed at call time. (Fixed: the docstring and
        annotation previously claimed a bare TaskExtractionOutput, but the
        function has always returned this tuple.)

    Raises:
        ValueError: If the API call fails or response parsing is unsuccessful.
        ImportError: If the Gemini SDK is not available or cannot be initialized.
    """
    if not genai:
        raise ImportError("The 'google.generai' library is not available. Please install and configure it.")

    # Initialize the Gemini client (as per user's example structure).
    # This specific client initialization might vary based on the exact
    # 'google.generai' library version/origin.
    try:
        client = genai.Client(api_key=api_key)
    except AttributeError:
        # Fallback for standard google-generativeai SDK if genai.Client is not found.
        try:
            genai.configure(api_key=api_key)
            # The rest of this function relies on client.models.generate_content;
            # adapting to genai.GenerativeModel would need significant rework, so
            # bail out explicitly rather than proceed with a wrong interface.
            print("Warning: genai.Client not found. The API call structure might be incorrect for your 'google.generai' version.")
            print("Assuming a client object with 'models.generate_content' method is expected.")
            raise NotImplementedError("genai.Client not found. Please adapt API call to your SDK version.")
        except Exception as e:
            raise ImportError(f"Failed to initialize Gemini client or configure API key: {e}")

    quarter, year, days_remaining, current_date_obj = get_quarter_info()
    current_date_iso = current_date_obj.isoformat()
    # (Removed an unused call to create_example_structure(); its result was never
    # referenced in the prompt or anywhere else.)

    # Construct the detailed prompt for the LLM.
    prompt = f"""You are a Time-Aware Task Extraction Specialist, an AI expert in meticulously analyzing strategic insights (e.g., from LinkedIn analytics) and transforming them into a structured set of actionable tasks, organized within an Objectives and KeyResults (OKRs) framework.

Your output MUST be a valid JSON object that strictly conforms to the 'TaskExtractionOutput' schema provided.

CURRENT CONTEXTUAL INFORMATION (CRITICAL - Use these exact values in your output where specified):
- Current Quarter: Q{quarter}
- Current Year: {year}
- Days remaining in current quarter: {days_remaining}
- Today's Date (for your context only, not for direct output unless specified by a schema field): {current_date_iso}

When populating the 'current_quarter_info' field in the TaskExtractionOutput, use the format: 'Q{quarter} {year}, {days_remaining} days remaining'.

GENERATION RULES:
1. Create 1-3 OKR objects based on the input text
2. For each OKR, create 1-3 KeyResult objects (MANDATORY - cannot be empty)
3. For each KeyResult, create 1-3 Task objects (MANDATORY - cannot be empty)
4. Make tasks specific, actionable, and directly related to the insights in the input text
5. No repetitive text allowed
6. Complete JSON object with proper closing braces
7. Maximum response length: 5000 characters

Now, analyze the following text and generate the structured output:
---
TEXT TO ANALYZE:
{user_text_input}
---
"""

    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-05-20",  # As per user's example
            contents=prompt,
            config={
                'response_mime_type': 'application/json',
                'response_schema': TaskExtractionOutput,  # Pass the Pydantic model class
                'temperature': 0.1,
                'top_p': 0.8,
            },
        )
    except Exception as e:
        raise ValueError(f"Gemini API call failed: {e}")

    # Process the response. Based on user's example `print(response.text)` we
    # assume .text contains the JSON; the standard Gemini API often puts it in
    # response.candidates[0].content.parts[0].text instead, so try both.
    response_json_text = None
    if hasattr(response, 'text') and response.text:
        response_json_text = response.text
    elif hasattr(response, 'candidates') and response.candidates:
        try:
            part = response.candidates[0].content.parts[0]
            if hasattr(part, 'text') and part.text:
                response_json_text = part.text
        except (IndexError, AttributeError):
            pass  # Could not find text in candidates

    if response_json_text:
        try:
            # Validate and parse the JSON response using the Pydantic model.
            task_output = TaskExtractionOutput.model_validate_json(response_json_text)
            return task_output, quarter, year, days_remaining
        except Exception as e:  # Catch Pydantic validation errors or JSON parsing errors
            raise ValueError(f"Failed to parse or validate API response: {e}\nRaw response text: {response_json_text}")
    else:
        # Handle cases where the response is empty or indicates an error.
        feedback_message = ""
        if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
            feedback_message = f"Prompt feedback: {response.prompt_feedback}. "
        raise ValueError(f"Failed to generate content or response text is empty. {feedback_message}Full response: {response}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/agents/task_extraction_model_groq.py
DELETED
@@ -1,143 +0,0 @@
|
|
1 |
-
import enum
|
2 |
-
import json
|
3 |
-
import os
|
4 |
-
from typing import List, Optional, Literal
|
5 |
-
from pydantic import BaseModel, Field, field_validator, ValidationInfo
|
6 |
-
from datetime import datetime, date
|
7 |
-
|
8 |
-
# Import Groq and instructor for structured output
|
9 |
-
try:
|
10 |
-
from groq import Groq, RateLimitError
|
11 |
-
import instructor
|
12 |
-
except ImportError:
|
13 |
-
print("Warning: 'groq' or 'instructor' library not found. Please install them.")
|
14 |
-
print("Try: pip install groq instructor")
|
15 |
-
Groq = None
|
16 |
-
instructor = None
|
17 |
-
|
18 |
-
|
19 |
-
from features.insight_and_tasks.data_models.tasks import (
|
20 |
-
TaskExtractionOutput,
|
21 |
-
OKR,
|
22 |
-
KeyResult,
|
23 |
-
Task,
|
24 |
-
EffortLevel,
|
25 |
-
TimelineCategory,
|
26 |
-
PriorityLevel,
|
27 |
-
TaskType,
|
28 |
-
DataSubject # Ensure all are imported
|
29 |
-
)
|
30 |
-
|
31 |
-
# --- Groq Client Initialization with Instructor ---
# Ensure GROQ_API_KEY is set in your environment variables before running.
# This block runs at import time; the module-level `client` is the single shared
# handle used by the extraction functions below, and is left as None whenever
# the SDKs are missing or initialization fails (callers must check for None).
if Groq and instructor:
    try:
        api_key = os.getenv('GROQ_API_KEY')
        if not api_key:
            raise ValueError("GROQ_API_KEY environment variable not set. Please set it to your Groq API key.")

        # Create a single, patched Groq client for structured output using instructor.
        # Mode.JSON ensures the output is a valid JSON object based on the Pydantic model.
        client = instructor.from_groq(Groq(api_key=api_key), mode=instructor.Mode.JSON)
    except Exception as e:
        # Deliberate best-effort: report and fall back to a None client rather
        # than making the module unimportable.
        print(f"Failed to initialize Groq client: {e}")
        client = None
else:
    # groq and/or instructor could not be imported above.
    client = None
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
# --- Helper Function for Date Calculations (Unchanged) ---
|
51 |
-
def get_quarter_info(today: Optional[date] = None):
    """Calculate the current quarter, year, and days remaining in that quarter.

    Args:
        today: Reference date; defaults to date.today(). Exposed as a parameter
            (backward-compatible — existing zero-argument callers are unchanged)
            so callers and tests can pin the date deterministically.

    Returns:
        Tuple of (quarter 1-4, year, days_remaining >= 0, reference_date).
    """
    if today is None:
        today = date.today()
    current_year = today.year
    current_quarter = (today.month - 1) // 3 + 1

    # Last calendar day of each quarter for the reference year.
    quarter_end_by_q = {
        1: date(current_year, 3, 31),
        2: date(current_year, 6, 30),
        3: date(current_year, 9, 30),
        4: date(current_year, 12, 31),
    }
    end_of_quarter_date = quarter_end_by_q[current_quarter]

    # Clamp at 0 so the quarter's last day never reports a negative remainder.
    days_remaining = max(0, (end_of_quarter_date - today).days)

    return current_quarter, current_year, days_remaining, today
|
70 |
-
|
71 |
-
# --- Main Task Extraction Function (Refactored for Groq) ---
|
72 |
-
def extract_tasks_from_text_groq(user_text_input: str) -> (Optional[TaskExtractionOutput], int, int, int):
|
73 |
-
"""
|
74 |
-
Extracts tasks from input text using the Groq API and structures them
|
75 |
-
using instructor.
|
76 |
-
|
77 |
-
Args:
|
78 |
-
user_text_input: The text to analyze.
|
79 |
-
|
80 |
-
Returns:
|
81 |
-
A tuple containing:
|
82 |
-
- A TaskExtractionOutput Pydantic model instance, or None on failure.
|
83 |
-
- The current quarter number.
|
84 |
-
- The current year.
|
85 |
-
- The number of days remaining in the quarter.
|
86 |
-
|
87 |
-
Raises:
|
88 |
-
ValueError: If the Groq client is not initialized or if the API call fails.
|
89 |
-
RateLimitError: If the Groq API rate limit is exceeded.
|
90 |
-
"""
|
91 |
-
if not client:
|
92 |
-
raise ValueError("Groq client is not initialized. Check your API key and library installations.")
|
93 |
-
|
94 |
-
quarter, year, days_remaining, current_date_obj = get_quarter_info()
|
95 |
-
|
96 |
-
# The prompt structure remains the same as it is effective.
|
97 |
-
# We explicitly tell the model its role and the structure we expect.
|
98 |
-
prompt = f"""You are a Time-Aware Task Extraction Specialist, an AI expert in meticulously analyzing strategic insights (e.g., from LinkedIn analytics) and transforming them into a structured set of actionable tasks, organized within an Objectives and Key Results (OKRs) framework.
|
99 |
-
|
100 |
-
Your output MUST be a valid JSON object that strictly conforms to the 'TaskExtractionOutput' Pydantic schema.
|
101 |
-
|
102 |
-
CURRENT CONTEXTUAL INFORMATION:
|
103 |
-
- Use this exact string for the 'current_quarter_info' field: 'Q{quarter} {year}, {days_remaining} days remaining'.
|
104 |
-
|
105 |
-
GENERATION RULES:
|
106 |
-
1. Your primary goal is to identify every distinct, high-level strategic objective from the input text. For each and every distinct objective you find, you must create a corresponding OKR object.
|
107 |
-
2. For each OKR, extract all relevant Key Results. Key Results must be measurable outcomes.
|
108 |
-
3. For each KeyResult, extract all specific and actionable Tasks that are directly derived from the input text.
|
109 |
-
4. Considering the days remaining in the quarter, prioritize tasks with the highest immediate impact where possible.
|
110 |
-
5. Tasks must be specific, actionable, and directly derived from the input text.
|
111 |
-
6. Do not create redundant or repetitive content.
|
112 |
-
7. Ensure the final output is a complete JSON object.
|
113 |
-
|
114 |
-
Now, analyze the following text and generate the structured JSON output:
|
115 |
-
---
|
116 |
-
TEXT TO ANALYZE:
|
117 |
-
{user_text_input}
|
118 |
-
---
|
119 |
-
"""
|
120 |
-
|
121 |
-
try:
|
122 |
-
# Use the instructor-patched client to make the call.
|
123 |
-
# Pass the Pydantic model to `response_model`.
|
124 |
-
# Instructor will handle the validation and parsing automatically.
|
125 |
-
task_output = client.chat.completions.create(
|
126 |
-
model="llama-3.3-70b-versatile", # A powerful model available on Groq
|
127 |
-
response_model=TaskExtractionOutput,
|
128 |
-
messages=[
|
129 |
-
{"role": "user", "content": prompt},
|
130 |
-
],
|
131 |
-
temperature=0.1,
|
132 |
-
top_p=0.8,
|
133 |
-
max_retries=3, # Instructor can automatically retry on validation errors
|
134 |
-
)
|
135 |
-
return task_output, quarter, year, days_remaining
|
136 |
-
|
137 |
-
except RateLimitError as e:
|
138 |
-
print(f"Error: Groq API rate limit exceeded. Please wait and try again. Details: {e}")
|
139 |
-
raise # Re-raise the specific error
|
140 |
-
except Exception as e:
|
141 |
-
# This can catch Pydantic validation errors or other API issues.
|
142 |
-
print(f"An unexpected error occurred during the Groq API call or data validation: {e}")
|
143 |
-
raise ValueError(f"Failed to process text with Groq: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/coordinators/employer_branding_coordinator.py
DELETED
@@ -1,331 +0,0 @@
|
|
1 |
-
# coordinators/employer_branding_coordinator.py
|
2 |
-
import json
|
3 |
-
import logging
|
4 |
-
from typing import Optional, Dict, Any # Added Dict, Any
|
5 |
-
from dataclasses import asdict # For converting dataclasses (like AgentMetrics) to dict
|
6 |
-
import os
|
7 |
-
from datetime import datetime
|
8 |
-
|
9 |
-
from google.adk.agents import LlmAgent
|
10 |
-
from google.adk.runners import InMemoryRunner
|
11 |
-
from google.genai import types as genai_types # For ADK agent inputs
|
12 |
-
|
13 |
-
# Project-specific imports
|
14 |
-
from features.insight_and_tasks.agents.follower_agent import EnhancedFollowerAnalysisAgent
|
15 |
-
from features.insight_and_tasks.agents.post_agent import EnhancedPostPerformanceAgent
|
16 |
-
from features.insight_and_tasks.agents.mentions_agent import EnhancedMentionsAnalysisAgent
|
17 |
-
from features.insight_and_tasks.data_models.metrics import AgentMetrics # To type hint inputs
|
18 |
-
from features.insight_and_tasks.utils.retry_mechanism import RetryMechanism # If ADK calls need retry
|
19 |
-
|
20 |
-
# Configure logger for this module
|
21 |
-
logger = logging.getLogger(__name__)
|
22 |
-
|
23 |
-
DEFAULT_COORDINATOR_MODEL = "gemini-2.5-flash-preview-05-20" # Use a more capable model for synthesis
|
24 |
-
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
|
25 |
-
GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")
|
26 |
-
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
|
27 |
-
|
28 |
-
class EnhancedEmployerBrandingCoordinator:
|
29 |
-
"""
|
30 |
-
Enhanced coordinator for synthesizing insights from multiple agent metrics,
|
31 |
-
identifying correlations, and generating integrated strategic recommendations.
|
32 |
-
"""
|
33 |
-
COORDINATOR_AGENT_NAME = "employer_branding_coordinator"
|
34 |
-
COORDINATOR_AGENT_DESCRIPTION = (
|
35 |
-
"Strategic coordinator that analyzes metrics from Follower, Post Performance, and Mentions agents "
|
36 |
-
"to find correlations, suggest potential causal links, and generate integrated strategies."
|
37 |
-
)
|
38 |
-
COORDINATOR_AGENT_INSTRUCTION = """
|
39 |
-
You are the Enhanced Employer Branding Coordinator. Your primary mission is to synthesize analyses and
|
40 |
-
structured metrics (TimeSeries, Aggregate, Categorical) from three specialized agents: Follower Analysis,
|
41 |
-
Post Performance, and Mentions Analysis. Your goal is to provide a holistic, integrated understanding of
|
42 |
-
the LinkedIn employer branding performance.
|
43 |
-
|
44 |
-
You MUST focus on:
|
45 |
-
1. Cross-Agent Correlations: Analyze how metrics from different agents relate to each other over time.
|
46 |
-
Pay close attention to the 'time_series_metrics' provided by each agent.
|
47 |
-
- Identify positive or negative correlations (e.g., "Follower growth rate increased by X% when posts about 'company culture' (Post Agent) were published, coinciding with a Y% rise in positive mentions (Mentions Agent)").
|
48 |
-
- Note any leading or lagging indicators (e.g., "A spike in negative mentions often preceded a dip in follower growth by approximately 2 weeks.").
|
49 |
-
- Look for relationships between specific content types/topics (from Post Agent) and follower engagement/growth (Follower Agent) or brand sentiment (Mentions Agent).
|
50 |
-
2. Potential Causal Insights & Hypotheses: Based on observed correlations and temporal sequences, suggest plausible causal relationships.
|
51 |
-
These are hypotheses, not definitive conclusions.
|
52 |
-
- Example: "The Q2 campaign focusing on 'employee testimonials' (Post Agent data) likely contributed to the observed 15% increase in organic follower acquisition (Follower Agent data) and the shift towards more positive sentiment in mentions (Mentions Agent data) during the same period."
|
53 |
-
3. Root Cause Analysis (Conceptual): For significant performance changes (e.g., sudden engagement drops, unexpected follower spikes, sharp sentiment shifts), attempt to identify potential root causes by cross-referencing data and summaries from all three agents.
|
54 |
-
4. Predictive Insights (High-Level): Based on established trends and correlations, what are potential future performance trajectories or risks?
|
55 |
-
- Example: "If the current trend of declining engagement on text-only posts continues, overall reach may decrease by X% next quarter unless content strategy is diversified."
|
56 |
-
5. Integrated Strategic Recommendations: Formulate actionable, strategic advice that leverages insights from ALL THREE data sources to optimize overall employer branding.
|
57 |
-
- Recommendations should be specific (e.g., "Increase frequency of video posts related to 'Team Achievements' as this format shows high engagement and correlates with positive mention spikes.").
|
58 |
-
- Prioritize recommendations based on their potential impact, supported by the cross-agent analysis.
|
59 |
-
- Suggest A/B tests or further investigations where appropriate.
|
60 |
-
|
61 |
-
INPUT: You will receive structured 'AgentMetrics' data (JSON format) from each of the three agents. This includes their own analysis summaries, time-series data, aggregate figures, and categorical breakdowns.
|
62 |
-
|
63 |
-
OUTPUT: A comprehensive, well-structured report covering:
|
64 |
-
I. Overall Executive Summary: A brief (2-3 paragraph) overview of the most critical findings and strategic implications derived from the integrated analysis.
|
65 |
-
II. Detailed Cross-Agent Correlation Analysis: Elaborate on specific correlations found, with examples.
|
66 |
-
III.Key Causal Hypotheses: Present the most compelling potential causal links.
|
67 |
-
IV. Noteworthy Performance Shifts & Potential Root Causes: Discuss any major changes and their likely drivers.
|
68 |
-
V. Forward-Looking Predictive Insights: Offer high-level predictions.
|
69 |
-
VI. Actionable Integrated Strategic Recommendations: Provide clear, prioritized recommendations.
|
70 |
-
|
71 |
-
Your analysis must be grounded in the provided data. Refer to specific metrics and agent findings to support your conclusions.
|
72 |
-
Be insightful and strategic. The goal is to provide a unified view that is more valuable than the sum of the individual agent analyses.
|
73 |
-
"""
|
74 |
-
|
75 |
-
def __init__(self, api_key: str, model_name: Optional[str] = None):
|
76 |
-
self.api_key = api_key # Stored for LlmAgent or if agents need it passed explicitly
|
77 |
-
self.model_name = model_name or DEFAULT_COORDINATOR_MODEL
|
78 |
-
|
79 |
-
# Initialize individual agents. The coordinator will use their output.
|
80 |
-
# These agents are internal to the coordinator's process of getting data to synthesize.
|
81 |
-
self.follower_agent = EnhancedFollowerAnalysisAgent(api_key=api_key, model_name=model_name) # Pass down model if needed
|
82 |
-
self.post_agent = EnhancedPostPerformanceAgent(api_key=api_key, model_name=model_name)
|
83 |
-
self.mentions_agent = EnhancedMentionsAnalysisAgent(api_key=api_key, model_name=model_name)
|
84 |
-
|
85 |
-
# The LLM agent for the coordinator itself, responsible for synthesis
|
86 |
-
self.coordinator_llm_agent = LlmAgent(
|
87 |
-
name=self.COORDINATOR_AGENT_NAME,
|
88 |
-
model=self.model_name, # Use the coordinator's (potentially more powerful) model
|
89 |
-
description=self.COORDINATOR_AGENT_DESCRIPTION,
|
90 |
-
instruction=self.COORDINATOR_AGENT_INSTRUCTION
|
91 |
-
)
|
92 |
-
self.retry_mechanism = RetryMechanism()
|
93 |
-
logger.info(f"{self.COORDINATOR_AGENT_NAME} initialized with model {self.model_name}.")
|
94 |
-
logger.info(f"It internally uses: Follower Agent ({self.follower_agent.model_name}), "
|
95 |
-
f"Post Agent ({self.post_agent.model_name}), Mentions Agent ({self.mentions_agent.model_name}).")
|
96 |
-
|
97 |
-
|
98 |
-
async def generate_comprehensive_analysis(
|
99 |
-
self,
|
100 |
-
follower_metrics: AgentMetrics,
|
101 |
-
post_metrics: AgentMetrics,
|
102 |
-
mentions_metrics: AgentMetrics
|
103 |
-
) -> str:
|
104 |
-
"""
|
105 |
-
Generates a comprehensive analysis by synthesizing metrics from all specialized agents.
|
106 |
-
|
107 |
-
Args:
|
108 |
-
follower_metrics: Metrics from the EnhancedFollowerAnalysisAgent.
|
109 |
-
post_metrics: Metrics from the EnhancedPostPerformanceAgent.
|
110 |
-
mentions_metrics: Metrics from the EnhancedMentionsAnalysisAgent.
|
111 |
-
|
112 |
-
Returns:
|
113 |
-
A string containing the comprehensive analysis report.
|
114 |
-
"""
|
115 |
-
|
116 |
-
# Prepare the input prompt for the coordinator's LlmAgent
|
117 |
-
# Serialize the AgentMetrics objects (which are dataclasses) to dictionaries
|
118 |
-
# then to JSON strings for clean inclusion in the prompt.
|
119 |
-
try:
|
120 |
-
follower_metrics_dict = asdict(follower_metrics)
|
121 |
-
post_metrics_dict = asdict(post_metrics)
|
122 |
-
mentions_metrics_dict = asdict(mentions_metrics)
|
123 |
-
except Exception as e:
|
124 |
-
logger.error(f"Error converting AgentMetrics to dict: {e}", exc_info=True)
|
125 |
-
return "Error: Could not process input metrics for coordination."
|
126 |
-
|
127 |
-
# Truncate individual agent summaries if they are too long to avoid overly large prompts
|
128 |
-
max_summary_len = 500 # Max characters for individual agent summaries in the prompt
|
129 |
-
follower_metrics_dict['analysis_summary'] = follower_metrics_dict.get('analysis_summary', '')[:max_summary_len]
|
130 |
-
post_metrics_dict['analysis_summary'] = post_metrics_dict.get('analysis_summary', '')[:max_summary_len]
|
131 |
-
mentions_metrics_dict['analysis_summary'] = mentions_metrics_dict.get('analysis_summary', '')[:max_summary_len]
|
132 |
-
|
133 |
-
|
134 |
-
synthesis_prompt = f"""
|
135 |
-
Please synthesize the following LinkedIn analytics insights, which are structured as 'AgentMetrics'
|
136 |
-
from three specialized agents. Your primary task is to identify cross-metric correlations,
|
137 |
-
deduce potential causal relationships, and provide integrated strategic recommendations based on
|
138 |
-
your core instructions.
|
139 |
-
|
140 |
-
DATA FROM SPECIALIZED AGENTS:
|
141 |
-
|
142 |
-
1. Follower Analysis Agent Metrics:
|
143 |
-
- Agent Name: {follower_metrics_dict.get('agent_name')}
|
144 |
-
- Agent's Analysis Summary: {follower_metrics_dict.get('analysis_summary')}
|
145 |
-
- Time Series Metrics: {json.dumps([asdict(m) for m in follower_metrics.time_series_metrics], indent=2, default=str)}
|
146 |
-
- Aggregate Metrics: {json.dumps(follower_metrics_dict.get('aggregate_metrics'), indent=2, default=str)}
|
147 |
-
- Categorical Metrics: {json.dumps(follower_metrics_dict.get('categorical_metrics'), indent=2, default=str)}
|
148 |
-
- Time Periods Covered: {json.dumps(follower_metrics_dict.get('time_periods_covered'), default=str)}
|
149 |
-
- Key Insights by Agent: {json.dumps(follower_metrics_dict.get('key_insights'), default=str)}
|
150 |
-
|
151 |
-
2. Post Performance Agent Metrics:
|
152 |
-
- Agent Name: {post_metrics_dict.get('agent_name')}
|
153 |
-
- Agent's Analysis Summary: {post_metrics_dict.get('analysis_summary')}
|
154 |
-
- Time Series Metrics: {json.dumps([asdict(m) for m in post_metrics.time_series_metrics], indent=2, default=str)}
|
155 |
-
- Aggregate Metrics: {json.dumps(post_metrics_dict.get('aggregate_metrics'), indent=2, default=str)}
|
156 |
-
- Categorical Metrics: {json.dumps(post_metrics_dict.get('categorical_metrics'), indent=2, default=str)}
|
157 |
-
- Time Periods Covered: {json.dumps(post_metrics_dict.get('time_periods_covered'), default=str)}
|
158 |
-
- Key Insights by Agent: {json.dumps(post_metrics_dict.get('key_insights'), default=str)}
|
159 |
-
|
160 |
-
3. Mentions Analysis Agent Metrics:
|
161 |
-
- Agent Name: {mentions_metrics_dict.get('agent_name')}
|
162 |
-
- Agent's Analysis Summary: {mentions_metrics_dict.get('analysis_summary')}
|
163 |
-
- Time Series Metrics: {json.dumps([asdict(m) for m in mentions_metrics.time_series_metrics], indent=2, default=str)}
|
164 |
-
- Aggregate Metrics: {json.dumps(mentions_metrics_dict.get('aggregate_metrics'), indent=2, default=str)}
|
165 |
-
- Categorical Metrics: {json.dumps(mentions_metrics_dict.get('categorical_metrics'), indent=2, default=str)}
|
166 |
-
- Time Periods Covered: {json.dumps(mentions_metrics_dict.get('time_periods_covered'), default=str)}
|
167 |
-
- Key Insights by Agent: {json.dumps(mentions_metrics_dict.get('key_insights'), default=str)}
|
168 |
-
|
169 |
-
COORDINATION TASK:
|
170 |
-
Based on ALL the data presented above from the three agents, generate a comprehensive synthesis report.
|
171 |
-
Follow your core instructions meticulously, focusing on cross-agent correlations (especially using the
|
172 |
-
time_series_metrics), causal hypotheses, root cause considerations for major shifts, predictive insights,
|
173 |
-
and actionable, integrated strategic recommendations.
|
174 |
-
Structure your output as a detailed report with the specified sections.
|
175 |
-
"""
|
176 |
-
|
177 |
-
user_input_content = genai_types.Content(
|
178 |
-
role="user",
|
179 |
-
parts=[genai_types.Part(text=synthesis_prompt)]
|
180 |
-
)
|
181 |
-
|
182 |
-
runner = InMemoryRunner(agent=self.coordinator_llm_agent, app_name=f"{self.COORDINATOR_AGENT_NAME}Runner")
|
183 |
-
user_id = f"system_user_coordinator_{int(datetime.utcnow().timestamp())}" # Unique ID for the run
|
184 |
-
|
185 |
-
session = await runner.session_service.create_session(
|
186 |
-
app_name=f"{self.COORDINATOR_AGENT_NAME}Runner",
|
187 |
-
user_id=user_id
|
188 |
-
)
|
189 |
-
|
190 |
-
result_text_parts = []
|
191 |
-
try:
|
192 |
-
logger.info(f"Running {self.COORDINATOR_AGENT_NAME} for synthesis. User ID: {user_id}, Session ID: {session.id}")
|
193 |
-
# Using retry for the ADK runner execution part
|
194 |
-
async def run_adk_coordinator():
|
195 |
-
temp_result_parts = []
|
196 |
-
async for event in runner.run(
|
197 |
-
user_id=user_id,
|
198 |
-
session_id=session.id,
|
199 |
-
new_message=user_input_content
|
200 |
-
):
|
201 |
-
if hasattr(event, 'content') and event.content and event.content.parts:
|
202 |
-
for part in event.content.parts:
|
203 |
-
if hasattr(part, 'text'):
|
204 |
-
temp_result_parts.append(part.text)
|
205 |
-
if not temp_result_parts:
|
206 |
-
# This could happen if the LLM returns no content or an error not caught by ADK
|
207 |
-
logger.warning(f"{self.COORDINATOR_AGENT_NAME} produced no text output from ADK run.")
|
208 |
-
# Consider raising a specific error or returning a default message
|
209 |
-
# For now, it will result in an empty string if no parts are collected.
|
210 |
-
return "".join(temp_result_parts)
|
211 |
-
|
212 |
-
# The retry_with_backoff expects a synchronous function.
|
213 |
-
# For async, you'd typically handle retries within the async logic or use an async retry library.
|
214 |
-
# For simplicity here, we'll run it once. If retries are critical for ADK calls,
|
215 |
-
# the ADK runner itself might have retry mechanisms, or this part needs adjustment.
|
216 |
-
# The original code didn't show retry for this ADK call, so keeping it direct.
|
217 |
-
|
218 |
-
# Direct call without retry for the async ADK runner:
|
219 |
-
for event in runner.run(
|
220 |
-
user_id=user_id,
|
221 |
-
session_id=session.id,
|
222 |
-
new_message=user_input_content
|
223 |
-
):
|
224 |
-
if hasattr(event, 'content') and event.content and event.content.parts:
|
225 |
-
for part in event.content.parts:
|
226 |
-
if hasattr(part, 'text'):
|
227 |
-
result_text_parts.append(part.text)
|
228 |
-
|
229 |
-
final_result_text = "".join(result_text_parts)
|
230 |
-
if not final_result_text.strip():
|
231 |
-
logger.warning(f"{self.COORDINATOR_AGENT_NAME} synthesis resulted in an empty string.")
|
232 |
-
final_result_text = "Coordinator analysis did not produce output. Please check logs."
|
233 |
-
|
234 |
-
|
235 |
-
except Exception as e:
|
236 |
-
logger.error(f"Error during {self.COORDINATOR_AGENT_NAME} LLM agent execution: {e}", exc_info=True)
|
237 |
-
final_result_text = f"Error in coordinator synthesis: {str(e)}"
|
238 |
-
finally:
|
239 |
-
try:
|
240 |
-
await runner.session_service.delete_session(
|
241 |
-
app_name=f"{self.COORDINATOR_AGENT_NAME}Runner", user_id=user_id, session_id=session.id
|
242 |
-
)
|
243 |
-
except Exception as session_del_e:
|
244 |
-
logger.error(f"Error deleting coordinator session: {session_del_e}")
|
245 |
-
|
246 |
-
return final_result_text
|
247 |
-
|
248 |
-
if __name__ == '__main__':
|
249 |
-
import asyncio
|
250 |
-
import pandas as pd # For creating dummy data
|
251 |
-
from datetime import datetime # For dummy data AgentMetrics
|
252 |
-
|
253 |
-
try:
|
254 |
-
from utils.logging_config import setup_logging
|
255 |
-
setup_logging()
|
256 |
-
logger.info("Logging setup for EnhancedEmployerBrandingCoordinator test.")
|
257 |
-
except ImportError:
|
258 |
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
259 |
-
logger.warning("logging_config.py not found, using basicConfig for logging.")
|
260 |
-
|
261 |
-
MOCK_API_KEY = os.environ.get("GOOGLE_API_KEY", "test_api_key_coordinator")
|
262 |
-
MODEL_NAME = DEFAULT_COORDINATOR_MODEL # Or a specific test model
|
263 |
-
|
264 |
-
# Create dummy AgentMetrics data for testing
|
265 |
-
dummy_ts_metric = TimeSeriesMetric(metric_name="dummy_visits", values=[10.0,20.0], timestamps=["2023-01","2023-02"])
|
266 |
-
|
267 |
-
follower_metrics_data = AgentMetrics(
|
268 |
-
agent_name="follower_analyst_test",
|
269 |
-
analysis_summary="Followers grew steadily. Demographic: Young professionals.",
|
270 |
-
time_series_metrics=[dummy_ts_metric],
|
271 |
-
aggregate_metrics={"avg_growth_rate": 0.05},
|
272 |
-
categorical_metrics={"top_industry": "Tech"},
|
273 |
-
time_periods_covered=["2023-01", "2023-02"],
|
274 |
-
key_insights=["Organic growth is strong."]
|
275 |
-
)
|
276 |
-
post_metrics_data = AgentMetrics(
|
277 |
-
agent_name="post_analyst_test",
|
278 |
-
analysis_summary="Video posts performed best. Engagement rate is 3%.",
|
279 |
-
time_series_metrics=[TimeSeriesMetric(metric_name="dummy_engagement", values=[0.03,0.035], timestamps=["2023-01","2023-02"], unit="%")],
|
280 |
-
aggregate_metrics={"avg_engagement_rate_overall": 0.032},
|
281 |
-
categorical_metrics={"top_media_type": "VIDEO"},
|
282 |
-
time_periods_covered=["2023-01", "2023-02"],
|
283 |
-
key_insights=["Video content is key for engagement."]
|
284 |
-
)
|
285 |
-
mentions_metrics_data = AgentMetrics(
|
286 |
-
agent_name="mentions_analyst_test",
|
287 |
-
analysis_summary="Mentions are mostly neutral. Sentiment score avg 0.1.",
|
288 |
-
time_series_metrics=[TimeSeriesMetric(metric_name="dummy_sentiment_score", values=[0.1,0.12], timestamps=["2023-01","2023-02"])],
|
289 |
-
aggregate_metrics={"overall_avg_sentiment": 0.11},
|
290 |
-
categorical_metrics={"dominant_sentiment": "Neutral"},
|
291 |
-
time_periods_covered=["2023-01", "2023-02"],
|
292 |
-
key_insights=["Brand perception is stable but not overly positive."]
|
293 |
-
)
|
294 |
-
|
295 |
-
coordinator = EnhancedEmployerBrandingCoordinator(api_key=MOCK_API_KEY, model_name=MODEL_NAME)
|
296 |
-
|
297 |
-
async def run_coordination():
|
298 |
-
logger.info("Generating comprehensive analysis from dummy metrics...")
|
299 |
-
# For local tests without real API calls, the LlmAgent might behave as a mock.
|
300 |
-
if MOCK_API_KEY == "test_api_key_coordinator":
|
301 |
-
logger.warning("Using a mock API key. Coordinator LlmAgent behavior might be limited or mocked.")
|
302 |
-
# Mock the ADK runner for the coordinator's LLM agent if needed
|
303 |
-
class MockCoordinatorADKRunner:
|
304 |
-
def __init__(self, agent, app_name): self.agent = agent
|
305 |
-
async def session_service_create_session(self, app_name, user_id):
|
306 |
-
class MockSession: id = "mock_coord_session_id"
|
307 |
-
return MockSession()
|
308 |
-
async def run(self, user_id, session_id, new_message):
|
309 |
-
# Simulate a response from the coordinator LLM
|
310 |
-
yield genai_types.Content(parts=[genai_types.Part(text="Mock Coordinator Synthesis Report: Blah blah correlation. Recommendation: Do X.")])
|
311 |
-
async def session_service_delete_session(self, app_name, user_id, session_id): pass
|
312 |
-
|
313 |
-
global InMemoryRunner # Make sure we are modifying the correct InMemoryRunner
|
314 |
-
OriginalInMemoryRunnerCoord = InMemoryRunner
|
315 |
-
InMemoryRunner = MockCoordinatorADKRunner
|
316 |
-
|
317 |
-
|
318 |
-
report = await coordinator.generate_comprehensive_analysis(
|
319 |
-
follower_metrics_data,
|
320 |
-
post_metrics_data,
|
321 |
-
mentions_metrics_data
|
322 |
-
)
|
323 |
-
|
324 |
-
if MOCK_API_KEY == "test_api_key_coordinator" and 'OriginalInMemoryRunnerCoord' in globals():
|
325 |
-
InMemoryRunner = OriginalInMemoryRunnerCoord # Restore
|
326 |
-
|
327 |
-
print("\n--- EnhancedEmployerBrandingCoordinator Report ---")
|
328 |
-
print(report)
|
329 |
-
|
330 |
-
if __name__ == '__main__': # Inner check
|
331 |
-
asyncio.run(run_coordination())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/data_models/__init__.py
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
# data_models/__init__.py
|
2 |
-
|
3 |
-
# This file makes the 'data_models' directory a Python package.
|
4 |
-
|
5 |
-
# Expose key models at the package level for easier importing.
|
6 |
-
from .metrics import TimeSeriesMetric, AgentMetrics, MetricType, TimeGranularity
|
7 |
-
from .tasks import (
|
8 |
-
EffortLevel,
|
9 |
-
TaskType,
|
10 |
-
DataSubject,
|
11 |
-
TimelineCategory,
|
12 |
-
PriorityLevel,
|
13 |
-
Task,
|
14 |
-
KeyResult,
|
15 |
-
OKR,
|
16 |
-
TaskExtractionOutput
|
17 |
-
)
|
18 |
-
|
19 |
-
__all__ = [
|
20 |
-
# From metrics.py
|
21 |
-
"TimeSeriesMetric",
|
22 |
-
"AgentMetrics",
|
23 |
-
"MetricType",
|
24 |
-
"TimeGranularity",
|
25 |
-
# From tasks.py
|
26 |
-
"EffortLevel",
|
27 |
-
"TaskType",
|
28 |
-
"DataSubject",
|
29 |
-
"TimelineCategory",
|
30 |
-
"PriorityLevel",
|
31 |
-
"Task",
|
32 |
-
"KeyResult",
|
33 |
-
"OKR",
|
34 |
-
"TaskExtractionOutput"
|
35 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/data_models/metrics.py
DELETED
@@ -1,50 +0,0 @@
|
|
1 |
-
# data_models/metrics.py
|
2 |
-
from dataclasses import dataclass, field
|
3 |
-
from typing import List, Dict, Any, Literal, Optional
|
4 |
-
from datetime import datetime
|
5 |
-
|
6 |
-
|
7 |
-
# Define literal types for more specific type hinting
|
8 |
-
MetricType = Literal['time_series', 'aggregate', 'categorical']
|
9 |
-
TimeGranularity = Literal['daily', 'weekly', 'monthly', 'yearly', 'other'] # Added 'yearly' and 'other'
|
10 |
-
|
11 |
-
@dataclass
|
12 |
-
class TimeSeriesMetric:
|
13 |
-
"""Structure for time-series based metrics"""
|
14 |
-
metric_name: str
|
15 |
-
values: List[float] = field(default_factory=list)
|
16 |
-
timestamps: List[str] = field(default_factory=list) # Consider using datetime objects or ISO format strings
|
17 |
-
metric_type: MetricType = 'time_series'
|
18 |
-
time_granularity: TimeGranularity = 'monthly'
|
19 |
-
unit: Optional[str] = None # e.g., 'count', '%', 'USD'
|
20 |
-
description: Optional[str] = None # Optional description of the metric
|
21 |
-
|
22 |
-
def __post_init__(self):
|
23 |
-
if len(self.values) != len(self.timestamps):
|
24 |
-
# Or log a warning, or handle as appropriate for your application
|
25 |
-
raise ValueError(f"Length of values ({len(self.values)}) and timestamps ({len(self.timestamps)}) must match for metric '{self.metric_name}'.")
|
26 |
-
|
27 |
-
@dataclass
|
28 |
-
class AgentMetrics:
|
29 |
-
"""
|
30 |
-
Enhanced structure for agent metrics with time-awareness and more details.
|
31 |
-
"""
|
32 |
-
agent_name: str
|
33 |
-
analysis_summary: str # Summary text from the agent's analysis
|
34 |
-
|
35 |
-
# Specific metric categories
|
36 |
-
time_series_metrics: List[TimeSeriesMetric] = field(default_factory=list)
|
37 |
-
aggregate_metrics: Dict[str, float] = field(default_factory=dict) # Key-value pairs for single value metrics
|
38 |
-
categorical_metrics: Dict[str, Any] = field(default_factory=dict) # For distributions, counts by category, etc.
|
39 |
-
# Example: {'industry_distribution': {'Tech': 100, 'Finance': 50}}
|
40 |
-
|
41 |
-
# Contextual information
|
42 |
-
time_periods_covered: List[str] = field(default_factory=list) # e.g., ["2023-01", "2023-02"] or ["Q1 2023", "Q2 2023"]
|
43 |
-
data_sources_used: List[str] = field(default_factory=list) # Information about the input data
|
44 |
-
generation_timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat()) # When these metrics were generated
|
45 |
-
|
46 |
-
# Optional fields for richer reporting
|
47 |
-
key_insights: List[str] = field(default_factory=list) # Bullet points of key findings
|
48 |
-
potential_errors_or_warnings: List[str] = field(default_factory=list) # Any issues encountered during analysis
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/data_models/tasks.py
DELETED
@@ -1,197 +0,0 @@
|
|
1 |
-
# data_models/tasks.py
|
2 |
-
from enum import Enum
|
3 |
-
from typing import List, Optional, Literal
|
4 |
-
from pydantic import BaseModel, Field, field_validator
|
5 |
-
from datetime import datetime
|
6 |
-
|
7 |
-
# Using Literal for more precise string enums if preferred over Enum class for Pydantic
|
8 |
-
# However, Enum provides better structure and can be used with Field choices.
|
9 |
-
|
10 |
-
class EffortLevel(str, Enum):
|
11 |
-
"""Estimated effort level for a task."""
|
12 |
-
SMALL = "Small"
|
13 |
-
MEDIUM = "Medium"
|
14 |
-
LARGE = "Large"
|
15 |
-
|
16 |
-
class TaskType(str, Enum):
|
17 |
-
"""Type of task, indicating its nature."""
|
18 |
-
INITIATIVE = "initiative" # Action-oriented, new projects/changes
|
19 |
-
TRACKING = "tracking" # Measurement-focused, monitoring existing metrics/processes
|
20 |
-
|
21 |
-
class KeyResultType(str, Enum):
|
22 |
-
PERFORMANCE = "performance"
|
23 |
-
COMPLETION = "completion"
|
24 |
-
|
25 |
-
class DataSubject(str, Enum):
|
26 |
-
"""Specifies the primary data domain a tracking task relates to."""
|
27 |
-
FOLLOWER_STATS = "follower_stats"
|
28 |
-
POSTS = "posts"
|
29 |
-
MENTIONS = "mentions"
|
30 |
-
GENERAL = "general" # For initiatives or tasks not tied to a single data type
|
31 |
-
|
32 |
-
class TimelineCategory(str, Enum):
|
33 |
-
"""Categorization of task timelines."""
|
34 |
-
IMMEDIATE = "Immediate" # (e.g., 1-2 weeks)
|
35 |
-
SHORT_TERM = "Short-term" # (e.g., rest of current quarter, up to 3 months)
|
36 |
-
MEDIUM_TERM = "Medium-term" # (e.g., next quarter, 3-6 months)
|
37 |
-
LONG_TERM = "Long-term" # (e.g., 6+ months)
|
38 |
-
|
39 |
-
class PriorityLevel(str, Enum):
|
40 |
-
"""Priority level for tasks."""
|
41 |
-
HIGH = "High"
|
42 |
-
MEDIUM = "Medium"
|
43 |
-
LOW = "Low"
|
44 |
-
|
45 |
-
class Task(BaseModel):
|
46 |
-
"""
|
47 |
-
Represents a single actionable task derived from analysis.
|
48 |
-
"""
|
49 |
-
task_category: str = Field(
|
50 |
-
description="The broader category or theme of the task (e.g., Content Strategy, Audience Engagement, Reputation Management, Performance Monitoring)."
|
51 |
-
)
|
52 |
-
task_description: str = Field( # Renamed from 'task' for clarity
|
53 |
-
description="A concise yet clear description of the specific action to be taken."
|
54 |
-
)
|
55 |
-
objective_deliverable: str = Field(
|
56 |
-
description="The clear, measurable objective this task aims to achieve and the specific deliverable(s) expected upon completion."
|
57 |
-
)
|
58 |
-
effort: EffortLevel = Field(
|
59 |
-
description="Estimated effort required to complete the task (Small, Medium, Large)."
|
60 |
-
)
|
61 |
-
timeline: TimelineCategory = Field(
|
62 |
-
description="Projected timeline for task completion, considering urgency and dependencies."
|
63 |
-
)
|
64 |
-
responsible_party: str = Field(
|
65 |
-
description="The team, role, or individual suggested to be responsible for executing this task (e.g., Marketing Team, Content Creation Lead, Social Media Manager)."
|
66 |
-
)
|
67 |
-
success_criteria_metrics: str = Field(
|
68 |
-
description="Specific, measurable criteria and metrics that will be used to determine if the task was successfully completed and achieved its objective."
|
69 |
-
)
|
70 |
-
dependencies_prerequisites: Optional[str] = Field(
|
71 |
-
default=None,
|
72 |
-
description="Any other tasks, resources, or conditions that must be met before this task can begin or be completed."
|
73 |
-
)
|
74 |
-
priority: PriorityLevel = Field(
|
75 |
-
description="The priority level of the task (High, Medium, Low)."
|
76 |
-
)
|
77 |
-
priority_justification: str = Field(
|
78 |
-
description="A brief explanation for the assigned priority level, linking it to impact or urgency."
|
79 |
-
)
|
80 |
-
why_proposed: str = Field(
|
81 |
-
description="The rationale behind proposing this task, clearly linking it back to specific findings or insights from the data analysis."
|
82 |
-
)
|
83 |
-
task_type: TaskType = Field(
|
84 |
-
description="Indicates whether this task is a new 'initiative' or ongoing 'tracking' of performance/metrics."
|
85 |
-
)
|
86 |
-
data_subject: Optional[DataSubject] = Field(
|
87 |
-
default=None,
|
88 |
-
description="For 'tracking' tasks, specifies the primary data subject (e.g., follower_stats, posts, mentions). Can be 'general' or null for 'initiative' tasks."
|
89 |
-
)
|
90 |
-
|
91 |
-
@field_validator('data_subject')
|
92 |
-
@classmethod
|
93 |
-
def check_data_subject_for_tracking(cls, value: Optional[DataSubject], values) -> Optional[DataSubject]:
|
94 |
-
# Pydantic v2 uses `values.data` to get other field values if needed before validation
|
95 |
-
# For Pydantic v1, it would be `values.get('task_type')`
|
96 |
-
# This example assumes Pydantic v2 structure for `values` if needed, but here we only need `task_type`
|
97 |
-
# which should already be validated or available.
|
98 |
-
# For simplicity, accessing it via `values.data.get('task_type')` in Pydantic v2 context.
|
99 |
-
# If using Pydantic v1, it's `values.get('task_type')`.
|
100 |
-
# Let's assume `values` is a dict-like object containing other fields.
|
101 |
-
|
102 |
-
# The validator structure depends on Pydantic version.
|
103 |
-
# For Pydantic v2, it's `info: ValidationInfo` and `info.data.get('task_type')`
|
104 |
-
# For Pydantic v1, `values` is a dict.
|
105 |
-
# For this example, let's assume `values` is a dict of the fields.
|
106 |
-
task_type_value = None
|
107 |
-
if hasattr(values, 'data'): # Pydantic v2 way
|
108 |
-
task_type_value = values.data.get('task_type')
|
109 |
-
elif isinstance(values, dict): # Pydantic v1 way (or if it's passed as a dict)
|
110 |
-
task_type_value = values.get('task_type')
|
111 |
-
|
112 |
-
|
113 |
-
if task_type_value == TaskType.TRACKING and value is None:
|
114 |
-
raise ValueError("For 'tracking' tasks, 'data_subject' must be specified.")
|
115 |
-
if task_type_value == TaskType.INITIATIVE and value is DataSubject.GENERAL:
|
116 |
-
# This is acceptable, or you might want to enforce it to be None
|
117 |
-
pass
|
118 |
-
return value
|
119 |
-
|
120 |
-
class KeyResult(BaseModel):
|
121 |
-
"""
|
122 |
-
A measurable outcome that contributes to an Objective.
|
123 |
-
"""
|
124 |
-
key_result_description: str = Field( # Renamed from 'key_result'
|
125 |
-
description="A clear, specific, and measurable description of the key result."
|
126 |
-
)
|
127 |
-
tasks: List[Task] = Field(
|
128 |
-
default_factory=list,
|
129 |
-
description="A list of specific tasks that will be undertaken to achieve this key result."
|
130 |
-
)
|
131 |
-
target_metric: Optional[str] = Field(
|
132 |
-
default=None,
|
133 |
-
description="The primary metric used to measure the achievement of this key result (e.g., 'Follower Growth Rate', 'Average Engagement Rate')."
|
134 |
-
)
|
135 |
-
target_value: Optional[str] = Field( # Can be numeric or descriptive (e.g., "Increase by 10%", "Achieve 5%")
|
136 |
-
default=None,
|
137 |
-
description="The specific target value for the metric (e.g., '5%', '1000 new followers')."
|
138 |
-
)
|
139 |
-
key_result_type: KeyResultType = Field(
|
140 |
-
description=(
|
141 |
-
"Indicates the nature of the Key Result. "
|
142 |
-
"PERFORMANCE: Focused on achieving a specific, measurable level for a defined metric. "
|
143 |
-
"Its core metric can typically be extracted and monitored directly via a data source, such as the LinkedIn API "
|
144 |
-
"(e.g., monthly post count, engagement rate, follower gains, impressions, CTR, mention volume). "
|
145 |
-
"The goal is to hit or exceed a target for this metric. "
|
146 |
-
"COMPLETION: Focused on finishing a distinct project, delivering a specific output, or establishing a new process. "
|
147 |
-
"Progress is primarily tracked by the successful completion of the defined scope of work. "
|
148 |
-
"Generally, the primary outcome of a COMPLETION Key Result is not a metric continuously tracked via an automated "
|
149 |
-
"data source like the LinkedIn API, or the 'metric' itself describes the state of completion (e.g., 'report delivered', 'process established')."
|
150 |
-
)
|
151 |
-
)
|
152 |
-
data_subject: Optional[DataSubject] = Field(
|
153 |
-
default=None,
|
154 |
-
description="For 'performance' key results, specifies the primary data subject (e.g., follower_stats, posts, mentions). Can be 'general' or null for 'completion' tasks."
|
155 |
-
)
|
156 |
-
|
157 |
-
class OKR(BaseModel):
|
158 |
-
"""
|
159 |
-
Defines an Objective and its associated Key Results (OKRs).
|
160 |
-
"""
|
161 |
-
objective_description: str = Field( # Renamed from 'objective'
|
162 |
-
description="A high-level, qualitative goal that the team aims to achieve. Should be aspirational and motivating."
|
163 |
-
)
|
164 |
-
key_results: List[KeyResult] = Field(
|
165 |
-
default_factory=list,
|
166 |
-
description="A list of 2-5 specific, measurable, achievable, relevant, and time-bound (SMART) key results that define success for the objective."
|
167 |
-
)
|
168 |
-
objective_timeline: TimelineCategory = Field(
|
169 |
-
description="The overall timeline category for achieving this objective."
|
170 |
-
)
|
171 |
-
objective_owner: Optional[str] = Field(
|
172 |
-
default=None,
|
173 |
-
description="The team name",
|
174 |
-
max_length=50
|
175 |
-
)
|
176 |
-
|
177 |
-
|
178 |
-
class TaskExtractionOutput(BaseModel):
|
179 |
-
"""
|
180 |
-
Structured output from the TaskExtractionAgent, including context and OKRs.
|
181 |
-
"""
|
182 |
-
current_quarter_info: str = Field(
|
183 |
-
description="Information about the current quarter and days remaining (e.g., 'Q2 2025, 45 days remaining')."
|
184 |
-
)
|
185 |
-
okrs: List[OKR] = Field(
|
186 |
-
default_factory=list,
|
187 |
-
description="A list of Objectives and Key Results (OKRs) derived from the analysis."
|
188 |
-
)
|
189 |
-
overall_strategic_focus: Optional[str] = Field(
|
190 |
-
default=None,
|
191 |
-
description="A brief summary of the main strategic focus areas identified from the tasks."
|
192 |
-
)
|
193 |
-
generation_timestamp: str = Field(
|
194 |
-
default_factory=lambda: datetime.utcnow().isoformat(),
|
195 |
-
description="Timestamp of when this task extraction output was generated."
|
196 |
-
)
|
197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/orchestrators/linkedin_analytics_orchestrator.py
DELETED
@@ -1,299 +0,0 @@
|
|
1 |
-
# orchestrators/linkedin_analytics_orchestrator.py
|
2 |
-
import pandas as pd
|
3 |
-
import logging
|
4 |
-
from typing import Dict, Any, Optional
|
5 |
-
from datetime import date, datetime # For TaskExtractionAgent date
|
6 |
-
from dataclasses import asdict # For converting AgentMetrics to dict if needed for final output
|
7 |
-
import os
|
8 |
-
|
9 |
-
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
|
10 |
-
GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")
|
11 |
-
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
|
12 |
-
|
13 |
-
# Project-specific imports
|
14 |
-
from features.insight_and_tasks.utils.pandasai_setup import configure_pandasai # Centralized PandasAI config
|
15 |
-
from features.insight_and_tasks.coordinators.employer_branding_coordinator import EnhancedEmployerBrandingCoordinator
|
16 |
-
from features.insight_and_tasks.agents.task_extraction_agent import TaskExtractionAgent
|
17 |
-
from features.insight_and_tasks.data_models.metrics import AgentMetrics # For type hinting
|
18 |
-
from features.insight_and_tasks.data_models.tasks import TaskExtractionOutput # For type hinting
|
19 |
-
from features.insight_and_tasks.agents.task_extraction_model_groq import extract_tasks_from_text_groq
|
20 |
-
|
21 |
-
# Configure logger for this module
|
22 |
-
logger = logging.getLogger(__name__)
|
23 |
-
|
24 |
-
class EnhancedLinkedInAnalyticsOrchestrator:
|
25 |
-
"""
|
26 |
-
Orchestrates the end-to-end LinkedIn analytics process, from data input through
|
27 |
-
specialized agent analysis, coordinator synthesis, and actionable task extraction.
|
28 |
-
"""
|
29 |
-
|
30 |
-
def __init__(self, api_key: str, llm_model_name: Optional[str] = None, current_date_for_tasks: Optional[date] = None):
|
31 |
-
"""
|
32 |
-
Initializes the orchestrator.
|
33 |
-
Args:
|
34 |
-
api_key: The API key for Google services (used by PandasAI and LlmAgents).
|
35 |
-
llm_model_name: Optional. The primary LLM model name to be used by agents.
|
36 |
-
Specific agents/coordinator might override with their defaults if not set.
|
37 |
-
current_date_for_tasks: Optional. The date to be used by TaskExtractionAgent for quarter calculations. Defaults to today.
|
38 |
-
"""
|
39 |
-
self.api_key = api_key
|
40 |
-
self.llm_model_name = llm_model_name # Can be passed down or agents use their defaults
|
41 |
-
|
42 |
-
# Configure PandasAI globally at the start of orchestration.
|
43 |
-
# Pass the model_name if specified, otherwise pandasai_setup might use its own default.
|
44 |
-
try:
|
45 |
-
configure_pandasai(api_key=self.api_key, model_name=self.llm_model_name)
|
46 |
-
logger.info(f"PandasAI configured by orchestrator with model hint: {self.llm_model_name or 'default'}.")
|
47 |
-
except Exception as e:
|
48 |
-
logger.error(f"Failed to configure PandasAI in orchestrator: {e}", exc_info=True)
|
49 |
-
# Decide if this is a critical failure or if agents can proceed (they might try to reconfigure)
|
50 |
-
|
51 |
-
# Initialize the coordinator, which in turn initializes its specialized agents.
|
52 |
-
# Pass the model_name hint to the coordinator.
|
53 |
-
self.coordinator = EnhancedEmployerBrandingCoordinator(api_key=self.api_key, model_name=self.llm_model_name)
|
54 |
-
|
55 |
-
# Initialize the TaskExtractionAgent.
|
56 |
-
# It uses its own default model unless overridden here.
|
57 |
-
self.task_extractor = TaskExtractionAgent(
|
58 |
-
api_key=self.api_key,
|
59 |
-
model_name=self.llm_model_name, # Pass model hint
|
60 |
-
current_date=current_date_for_tasks # Defaults to today if None
|
61 |
-
)
|
62 |
-
logger.info("EnhancedLinkedInAnalyticsOrchestrator initialized.")
|
63 |
-
|
64 |
-
async def generate_full_analysis_and_tasks(
|
65 |
-
self,
|
66 |
-
follower_stats_df: pd.DataFrame,
|
67 |
-
post_df: pd.DataFrame,
|
68 |
-
mentions_df: pd.DataFrame
|
69 |
-
) -> Dict[str, Any]:
|
70 |
-
"""
|
71 |
-
Executes the full pipeline: agent analyses, coordinator synthesis, and task extraction.
|
72 |
-
Args:
|
73 |
-
follower_stats_df: DataFrame containing follower statistics.
|
74 |
-
post_df: DataFrame containing post performance data.
|
75 |
-
mentions_df: DataFrame containing brand mentions data.
|
76 |
-
Returns:
|
77 |
-
A dictionary containing the comprehensive analysis text, actionable tasks (OKRs),
|
78 |
-
and the detailed metrics from each specialized agent.
|
79 |
-
"""
|
80 |
-
logger.info("Starting full analysis and task generation pipeline...")
|
81 |
-
|
82 |
-
# Step 1: Get analyses and metrics from specialized agents.
|
83 |
-
# The coordinator's internal agents are used here.
|
84 |
-
logger.info("Running follower analysis...")
|
85 |
-
follower_agent_metrics: AgentMetrics = self.coordinator.follower_agent.analyze_follower_data(follower_stats_df)
|
86 |
-
logger.info(f"Follower analysis complete. Summary: {follower_agent_metrics.analysis_summary[:100]}...")
|
87 |
-
|
88 |
-
logger.info("Running post performance analysis...")
|
89 |
-
post_agent_metrics: AgentMetrics = self.coordinator.post_agent.analyze_post_data(post_df)
|
90 |
-
logger.info(f"Post analysis complete. Summary: {post_agent_metrics.analysis_summary[:100]}...")
|
91 |
-
|
92 |
-
logger.info("Running mentions analysis...")
|
93 |
-
mentions_agent_metrics: AgentMetrics = self.coordinator.mentions_agent.analyze_mentions_data(mentions_df)
|
94 |
-
logger.info(f"Mentions analysis complete. Summary: {mentions_agent_metrics.analysis_summary[:100]}...")
|
95 |
-
|
96 |
-
# Step 2: Coordinator synthesizes these metrics into a comprehensive analysis text.
|
97 |
-
logger.info("Running coordinator for synthesis...")
|
98 |
-
comprehensive_analysis_text: str = await self.coordinator.generate_comprehensive_analysis(
|
99 |
-
follower_agent_metrics, post_agent_metrics, mentions_agent_metrics
|
100 |
-
)
|
101 |
-
logger.info(f"Coordinator synthesis complete. Report length: {len(comprehensive_analysis_text)} chars.")
|
102 |
-
if not comprehensive_analysis_text or comprehensive_analysis_text.startswith("Error"):
|
103 |
-
logger.error(f"Coordinator synthesis failed or produced an error message: {comprehensive_analysis_text}")
|
104 |
-
# Potentially stop here or proceed with task extraction on whatever text was generated.
|
105 |
-
|
106 |
-
# Step 3: TaskExtractionAgent extracts actionable tasks (OKRs) from the comprehensive text.
|
107 |
-
logger.info("Running task extraction...")
|
108 |
-
#actionable_tasks_okrs, quarter, year, days_remaining = extract_tasks_from_text(comprehensive_analysis_text, GOOGLE_API_KEY)
|
109 |
-
actionable_tasks_okrs, quarter, year, days_remaining = extract_tasks_from_text_groq(comprehensive_analysis_text)
|
110 |
-
logger.info(f"Task extraction complete. Number of OKRs: {len(actionable_tasks_okrs.okrs) if actionable_tasks_okrs else 'Error'}.")
|
111 |
-
|
112 |
-
# Step 4: Compile and return all results.
|
113 |
-
# Convert Pydantic/dataclass objects to dicts for easier JSON serialization if the final output needs it.
|
114 |
-
# The `actionable_tasks_okrs` is already a Pydantic model, which can be serialized with .model_dump() / .json().
|
115 |
-
# `AgentMetrics` are dataclasses, use `asdict`.
|
116 |
-
|
117 |
-
final_results = {
|
118 |
-
"comprehensive_analysis_report": comprehensive_analysis_text,
|
119 |
-
"actionable_okrs_and_tasks": actionable_tasks_okrs.model_dump() if actionable_tasks_okrs else None, # Pydantic v2
|
120 |
-
"quarter": quarter,
|
121 |
-
"year": year,
|
122 |
-
"days_remaining": days_remaining,
|
123 |
-
# "actionable_okrs_and_tasks": actionable_tasks_okrs.dict() if actionable_tasks_okrs else None, # Pydantic v1
|
124 |
-
"detailed_metrics": {
|
125 |
-
"follower_agent": asdict(follower_agent_metrics) if follower_agent_metrics else None,
|
126 |
-
"post_agent": asdict(post_agent_metrics) if post_agent_metrics else None,
|
127 |
-
"mentions_agent": asdict(mentions_agent_metrics) if mentions_agent_metrics else None,
|
128 |
-
}
|
129 |
-
}
|
130 |
-
logger.info("Full analysis and task generation pipeline finished successfully.")
|
131 |
-
return final_results
|
132 |
-
|
133 |
-
# Example usage (similar to the original script's main execution block)
|
134 |
-
if __name__ == '__main__':
|
135 |
-
import asyncio
|
136 |
-
import os
|
137 |
-
from utils.logging_config import setup_logging
|
138 |
-
from utils.data_fetching import fetch_linkedin_data_from_bubble, VALID_DATA_TYPES
|
139 |
-
|
140 |
-
setup_logging() # Configure logging for the application
|
141 |
-
|
142 |
-
# --- Configuration ---
|
143 |
-
# Attempt to get API key from environment variable
|
144 |
-
# IMPORTANT: Set GOOGLE_API_KEY and BUBBLE_API_KEY in your environment for this to run.
|
145 |
-
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
146 |
-
BUBBLE_API_KEY_ENV = os.environ.get("BUBBLE_API_KEY") # Used by data_fetching
|
147 |
-
|
148 |
-
if not GOOGLE_API_KEY:
|
149 |
-
logger.critical("GOOGLE_API_KEY environment variable not set. Orchestrator cannot initialize LLM agents.")
|
150 |
-
exit(1)
|
151 |
-
if not BUBBLE_API_KEY_ENV: # data_fetching will also check, but good to note here
|
152 |
-
logger.warning("BUBBLE_API_KEY environment variable not set. Data fetching from Bubble will fail.")
|
153 |
-
# You might want to exit or use mock data if Bubble is essential.
|
154 |
-
|
155 |
-
# Set the Google Vertex AI environment variable if not using Vertex AI (as in original)
|
156 |
-
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
|
157 |
-
|
158 |
-
# Orchestrator settings
|
159 |
-
ORG_URN_EXAMPLE = "urn:li:organization:19010008" # Example, replace with actual
|
160 |
-
# Specify a model or let orchestrator/agents use their defaults
|
161 |
-
# LLM_MODEL_FOR_ORCHESTRATION = "gemini-2.5-flash-preview-05-20" # Example: use a powerful model
|
162 |
-
LLM_MODEL_FOR_ORCHESTRATION = None # Let agents use their defaults or pass a specific one
|
163 |
-
|
164 |
-
# --- Initialize Orchestrator ---
|
165 |
-
orchestrator = EnhancedLinkedInAnalyticsOrchestrator(
|
166 |
-
api_key=GOOGLE_API_KEY,
|
167 |
-
llm_model_name=LLM_MODEL_FOR_ORCHESTRATION,
|
168 |
-
current_date_for_tasks=datetime.utcnow().date() # Use today for task planning
|
169 |
-
)
|
170 |
-
|
171 |
-
# --- Data Fetching ---
|
172 |
-
logger.info(f"Fetching data for organization URN: {ORG_URN_EXAMPLE}")
|
173 |
-
|
174 |
-
# Helper to fetch and log
|
175 |
-
def get_data(data_type: VALID_DATA_TYPES, org_urn: str) -> pd.DataFrame:
|
176 |
-
df, error = fetch_linkedin_data_from_bubble(org_urn=org_urn, data_type=data_type)
|
177 |
-
if error:
|
178 |
-
logger.error(f"Error fetching {data_type}: {error}. Using empty DataFrame.")
|
179 |
-
return pd.DataFrame()
|
180 |
-
if df is None: # Should not happen if error is None, but as a safeguard
|
181 |
-
logger.warning(f"Fetched {data_type} is None but no error reported. Using empty DataFrame.")
|
182 |
-
return pd.DataFrame()
|
183 |
-
logger.info(f"Successfully fetched {data_type} with {len(df)} rows.")
|
184 |
-
return df
|
185 |
-
|
186 |
-
follower_stats_df_raw = get_data("li_follower_stats", ORG_URN_EXAMPLE)
|
187 |
-
posts_df_raw = get_data("LI_posts", ORG_URN_EXAMPLE) # Contains post content, media_type, etc.
|
188 |
-
mentions_df_raw = get_data("Li_mentions", ORG_URN_EXAMPLE)
|
189 |
-
post_stats_df_raw = get_data("LI_post_stats", ORG_URN_EXAMPLE) # Contains engagement numbers for posts
|
190 |
-
|
191 |
-
# --- Data Preprocessing/Merging (as in original example) ---
|
192 |
-
|
193 |
-
# Select relevant columns for follower_stats_df
|
194 |
-
if not follower_stats_df_raw.empty:
|
195 |
-
follower_stats_df = follower_stats_df_raw[[
|
196 |
-
'category_name', "follower_count_organic", "follower_count_paid", "follower_count_type"
|
197 |
-
]].copy()
|
198 |
-
else:
|
199 |
-
follower_stats_df = pd.DataFrame() # Ensure it's an empty DF if raw is empty
|
200 |
-
|
201 |
-
# Merge posts_df and post_stats_df
|
202 |
-
# This logic assumes 'id' in posts_df_raw and 'post_id' in post_stats_df_raw
|
203 |
-
merged_posts_df = pd.DataFrame()
|
204 |
-
if not posts_df_raw.empty and not post_stats_df_raw.empty:
|
205 |
-
if 'id' in posts_df_raw.columns and 'post_id' in post_stats_df_raw.columns:
|
206 |
-
# Ensure 'id' in posts_df_raw is unique before merge if it's a left table key
|
207 |
-
# posts_df_raw.drop_duplicates(subset=['id'], keep='first', inplace=True)
|
208 |
-
merged_posts_df = pd.merge(posts_df_raw, post_stats_df_raw, left_on='id', right_on='post_id', how='left', suffixes=('', '_stats'))
|
209 |
-
logger.info(f"Merged posts_df ({len(posts_df_raw)}) and post_stats_df ({len(post_stats_df_raw)}) into merged_posts_df ({len(merged_posts_df)}).")
|
210 |
-
else:
|
211 |
-
logger.warning("Cannot merge posts_df and post_stats_df due to missing 'id' or 'post_id'. Using posts_df_raw.")
|
212 |
-
merged_posts_df = posts_df_raw.copy() # Fallback to posts_df_raw
|
213 |
-
elif not posts_df_raw.empty:
|
214 |
-
logger.info("post_stats_df is empty. Using posts_df_raw for post analysis.")
|
215 |
-
merged_posts_df = posts_df_raw.copy()
|
216 |
-
else:
|
217 |
-
logger.warning("Both posts_df_raw and post_stats_df_raw are empty.")
|
218 |
-
merged_posts_df = pd.DataFrame() # Empty DF
|
219 |
-
|
220 |
-
# Select and ensure essential columns for merged_posts_df
|
221 |
-
# These are columns expected by EnhancedPostPerformanceAgent
|
222 |
-
expected_post_cols = [
|
223 |
-
'li_eb_label', 'media_type', 'is_ad', 'id', 'published_at', 'sentiment',
|
224 |
-
'engagement', 'impressionCount', 'clickCount', 'likeCount', 'commentCount', 'shareCount'
|
225 |
-
]
|
226 |
-
if not merged_posts_df.empty:
|
227 |
-
final_post_df_cols = {}
|
228 |
-
for col in expected_post_cols:
|
229 |
-
if col in merged_posts_df.columns:
|
230 |
-
final_post_df_cols[col] = merged_posts_df[col]
|
231 |
-
elif f"{col}_stats" in merged_posts_df.columns: # Check for suffixed columns from merge
|
232 |
-
final_post_df_cols[col] = merged_posts_df[f"{col}_stats"]
|
233 |
-
else:
|
234 |
-
logger.debug(f"Expected column '{col}' not found in merged_posts_df. Will be created as empty/default by agent if needed.")
|
235 |
-
# Agent preprocessing should handle missing columns by creating them with defaults (0 or 'Unknown')
|
236 |
-
|
237 |
-
# Create the final DataFrame with only the selected/available columns
|
238 |
-
# This ensures that if a column is missing, it doesn't cause an error here,
|
239 |
-
# but the agent's preprocessing will handle it.
|
240 |
-
# However, it's better to ensure they exist with NAs if the agent expects them.
|
241 |
-
temp_post_df = pd.DataFrame(final_post_df_cols)
|
242 |
-
# Ensure all expected columns are present, filling with NA if missing from selection
|
243 |
-
for col in expected_post_cols:
|
244 |
-
if col not in temp_post_df.columns:
|
245 |
-
temp_post_df[col] = pd.NA # Or appropriate default like 0 for numeric, 'Unknown' for categorical
|
246 |
-
merged_posts_df = temp_post_df[expected_post_cols].copy() # Ensure correct order and all columns
|
247 |
-
|
248 |
-
else: # If merged_posts_df started empty and stayed empty
|
249 |
-
merged_posts_df = pd.DataFrame(columns=expected_post_cols)
|
250 |
-
|
251 |
-
|
252 |
-
# Mentions DataFrame - select relevant columns if necessary, or pass as is
|
253 |
-
# Assuming mentions_df_raw is already in the correct shape or agent handles it.
|
254 |
-
# For example, if it needs specific columns:
|
255 |
-
# mentions_df = mentions_df_raw[['date', 'sentiment_label', 'mention_content']].copy() if not mentions_df_raw.empty else pd.DataFrame()
|
256 |
-
mentions_df = mentions_df_raw.copy() # Pass as is, agent will preprocess
|
257 |
-
|
258 |
-
|
259 |
-
# --- Run Orchestration ---
|
260 |
-
async def main_orchestration():
|
261 |
-
if follower_stats_df.empty and merged_posts_df.empty and mentions_df.empty:
|
262 |
-
logger.error("All input DataFrames are empty. Aborting orchestration.")
|
263 |
-
return None
|
264 |
-
|
265 |
-
logger.info("Orchestrator starting generate_full_analysis_and_tasks...")
|
266 |
-
results = await orchestrator.generate_full_analysis_and_tasks(
|
267 |
-
follower_stats_df=follower_stats_df,
|
268 |
-
post_df=merged_posts_df,
|
269 |
-
mentions_df=mentions_df
|
270 |
-
)
|
271 |
-
return results
|
272 |
-
|
273 |
-
orchestration_results = asyncio.run(main_orchestration())
|
274 |
-
|
275 |
-
# --- Output Results ---
|
276 |
-
if orchestration_results:
|
277 |
-
print("\n\n" + "="*30 + " COMPREHENSIVE ANALYSIS REPORT " + "="*30)
|
278 |
-
print(orchestration_results.get("comprehensive_analysis_report", "Report not generated."))
|
279 |
-
|
280 |
-
print("\n\n" + "="*30 + " ACTIONABLE TASKS (OKRs) " + "="*30)
|
281 |
-
okrs_data = orchestration_results.get("actionable_okrs_and_tasks")
|
282 |
-
if okrs_data:
|
283 |
-
# okrs_data is already a dict from .model_dump()
|
284 |
-
print(json.dumps(okrs_data, indent=2))
|
285 |
-
else:
|
286 |
-
print("No actionable tasks (OKRs) generated or an error occurred.")
|
287 |
-
|
288 |
-
print("\n\n" + "="*30 + " DETAILED AGENT METRICS " + "="*30)
|
289 |
-
detailed_metrics = orchestration_results.get("detailed_metrics", {})
|
290 |
-
for agent_name, metrics_dict in detailed_metrics.items():
|
291 |
-
print(f"\n--- {agent_name.replace('_', ' ').title()} Metrics ---")
|
292 |
-
if metrics_dict:
|
293 |
-
print(json.dumps(metrics_dict, indent=2, default=str)) # default=str for any non-serializable types
|
294 |
-
else:
|
295 |
-
print("Metrics not available for this agent.")
|
296 |
-
else:
|
297 |
-
logger.info("Orchestration did not produce results (likely due to empty input data).")
|
298 |
-
|
299 |
-
logger.info("Orchestration example finished.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/utils/__init__.py
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
# utils/__init__.py
|
2 |
-
|
3 |
-
# This file makes the 'utils' directory a Python package.
|
4 |
-
# You can choose to expose certain classes or functions directly at the package level
|
5 |
-
# for easier importing, if desired.
|
6 |
-
|
7 |
-
# For example:
|
8 |
-
# from .retry_mechanism import RetryMechanism
|
9 |
-
# from .pandasai_setup import configure_pandasai
|
10 |
-
# from .data_fetching import fetch_linkedin_data_from_bubble
|
11 |
-
# from .logging_config import setup_logging
|
12 |
-
|
13 |
-
# Or, you can let users import them directly from the modules:
|
14 |
-
# from utils.retry_mechanism import RetryMechanism
|
15 |
-
|
16 |
-
# For now, keeping it simple and allowing module-level imports.
|
17 |
-
# setup_logging() # Optionally call setup_logging() when the utils package is imported.
|
18 |
-
# However, it's often better to call this explicitly at the application entry point.
|
19 |
-
|
20 |
-
__all__ = [
|
21 |
-
"RetryMechanism",
|
22 |
-
"configure_pandasai",
|
23 |
-
"fetch_linkedin_data_from_bubble",
|
24 |
-
"setup_logging"
|
25 |
-
]
|
26 |
-
|
27 |
-
# Import them here to make them available when 'from utils import *' is used,
|
28 |
-
# or for direct access like 'utils.RetryMechanism'.
|
29 |
-
from .retry_mechanism import RetryMechanism
|
30 |
-
from .pandasai_setup import configure_pandasai
|
31 |
-
from .logging_config import setup_logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/utils/logging_config.py
DELETED
@@ -1,28 +0,0 @@
|
|
1 |
-
# utils/logging_config.py
|
2 |
-
import logging
|
3 |
-
import os
|
4 |
-
|
5 |
-
def setup_logging():
|
6 |
-
"""
|
7 |
-
Configures basic logging for the application.
|
8 |
-
Logs to console.
|
9 |
-
"""
|
10 |
-
log_level_str = os.environ.get("LOG_LEVEL", "INFO").upper()
|
11 |
-
log_level = getattr(logging, log_level_str, logging.INFO)
|
12 |
-
|
13 |
-
logging.basicConfig(
|
14 |
-
level=log_level,
|
15 |
-
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
16 |
-
datefmt="%Y-%m-%d %H:%M:%S"
|
17 |
-
)
|
18 |
-
# You can also direct logs to a file if needed:
|
19 |
-
# file_handler = logging.FileHandler("app.log")
|
20 |
-
# file_handler.setLevel(log_level)
|
21 |
-
# file_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
|
22 |
-
# logging.getLogger().addHandler(file_handler)
|
23 |
-
|
24 |
-
# Silence overly verbose libraries if necessary
|
25 |
-
# logging.getLogger("some_verbose_library").setLevel(logging.WARNING)
|
26 |
-
|
27 |
-
logger = logging.getLogger(__name__)
|
28 |
-
logger.info(f"Logging configured with level: {log_level_str}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/utils/pandasai_setup.py
DELETED
@@ -1,54 +0,0 @@
|
|
1 |
-
# utils/pandasai_setup.py
|
2 |
-
import os
|
3 |
-
import logging
|
4 |
-
import pandasai as pai
|
5 |
-
from pandasai_litellm import LiteLLM # Ensure this import matches your installed library
|
6 |
-
|
7 |
-
# Configure logger for this module
logger = logging.getLogger(__name__)

# It's good practice to define constants at the top or in a config file
DEFAULT_PANDASAI_MODEL = "gemini/gemini-2.5-flash-preview-05-20"  # Using a common default

def configure_pandasai(api_key: str, model_name: str = None):
    """
    Configures PandasAI with LiteLLM using the provided API key and model.

    Args:
        api_key: The Google API key. If empty/falsy, an error is logged and
            configuration is skipped (no exception is raised).
        model_name: The specific model to use (e.g., "gemini/gemini-1.5-flash-latest").
            If None, uses DEFAULT_PANDASAI_MODEL.

    Side effects:
        Sets GOOGLE_GENAI_USE_VERTEXAI and GOOGLE_API_KEY in os.environ and
        mutates the global pandasai configuration via pai.config.set.
    """
    if not api_key:
        logger.error("PandasAI Configuration Error: API key is missing.")
        # Depending on strictness, you might raise an error or just log
        return

    os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "False"
    os.environ["GOOGLE_API_KEY"] = api_key

    selected_model = model_name if model_name else DEFAULT_PANDASAI_MODEL

    try:
        # BUG FIX: the model argument was previously hard-coded to
        # DEFAULT_PANDASAI_MODEL, silently ignoring the caller-supplied
        # model_name (while the success log below claimed selected_model).
        llm = LiteLLM(
            model=selected_model,
            api_key=api_key
        )

        # PandasAI configuration
        pai.config.set({
            "llm": llm,
            "temperature": 0.3,  # Lower temperature for more consistent results
            "max_retries": 3
        })
        logger.info(f"PandasAI configured successfully with model: {selected_model}")
        logger.info(f"PandasAI LLM object: {llm}")

    except ImportError:
        logger.error("PandasAI or pandasai_litellm is not installed. Please install the required packages.")
    except Exception as e:
        logger.error(f"Error configuring PandasAI: {e}", exc_info=True)
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
features/insight_and_tasks/utils/retry_mechanism.py
DELETED
@@ -1,61 +0,0 @@
|
|
1 |
-
# utils/retry_mechanism.py
|
2 |
-
import time
|
3 |
-
import logging
|
4 |
-
from typing import Callable, Any, Tuple
|
5 |
-
|
6 |
-
# Configure logger for this module
logger = logging.getLogger(__name__)

class RetryMechanism:
    """External retry mechanism with exponential backoff"""

    @staticmethod
    def retry_with_backoff(
        func: Callable,
        max_retries: int = 3,
        base_delay: float = 1.0,
        exceptions: Tuple[type[Exception], ...] = (Exception,)
    ) -> Any:
        """
        Retries a function call with exponential backoff.

        Args:
            func: The function to call.
            max_retries: Maximum number of retries (total attempts = max_retries + 1).
            base_delay: Base delay in seconds; doubled after each failed attempt.
            exceptions: A tuple of exception types to catch and retry on.

        Returns:
            The result of the function call if successful.

        Raises:
            The last exception encountered if all retries fail.
        """
        total_attempts = max_retries + 1
        delay = base_delay
        caught = None  # last exception seen, re-raised if every attempt fails

        for attempt_no in range(1, total_attempts + 1):
            try:
                logger.info(f"Attempt {attempt_no}/{total_attempts} for function {func.__name__}")
                outcome = func()
            except exceptions as err:
                caught = err
                logger.warning(f"Attempt {attempt_no} for {func.__name__} failed: {str(err)}")
                if attempt_no < total_attempts:
                    logger.info(f"Waiting {delay:.2f} seconds before retrying {func.__name__}...")
                    time.sleep(delay)
                    delay *= 2  # exponential backoff
                else:
                    logger.error(f"All {total_attempts} attempts for {func.__name__} failed.")
            else:
                if attempt_no > 1:  # note when a retry (not the first try) succeeded
                    logger.info(f"Function {func.__name__} succeeded on attempt {attempt_no}")
                return outcome

        # All attempts exhausted: surface the last failure to the caller.
        if caught is not None:
            raise caught
        # Defensive: unreachable if func either returns or raises.
        raise RuntimeError(f"Function {func.__name__} failed after all retries without a specific exception.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|