File size: 34,520 Bytes
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
621afd7
f4ff4c1
 
d73150d
f4ff4c1
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdcb123
f4ff4c1
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
 
 
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
bdcb123
f4ff4c1
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
bdcb123
f4ff4c1
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
bdcb123
f4ff4c1
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
840c57d
 
 
 
 
 
 
 
 
f4ff4c1
 
 
 
 
840c57d
f4ff4c1
840c57d
 
 
 
bdcb123
 
840c57d
 
 
 
f4ff4c1
 
 
 
840c57d
f4ff4c1
840c57d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4ff4c1
840c57d
f4ff4c1
840c57d
f4ff4c1
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
840c57d
f4ff4c1
 
 
840c57d
f4ff4c1
 
 
840c57d
 
f4ff4c1
 
840c57d
f4ff4c1
 
 
840c57d
8290c25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
8290c25
 
 
 
f4ff4c1
 
 
8290c25
f4ff4c1
 
 
 
 
 
 
 
840c57d
 
f4ff4c1
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
 
 
840c57d
f4ff4c1
 
 
 
 
840c57d
 
 
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
f4ff4c1
840c57d
 
f4ff4c1
 
c3741ac
f4ff4c1
 
 
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
 
 
f4ff4c1
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
f4ff4c1
621afd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4ff4c1
 
 
 
 
840c57d
f4ff4c1
c3741ac
f4ff4c1
 
 
 
 
c3741ac
f4ff4c1
840c57d
 
f4ff4c1
 
 
2cabd2d
 
 
 
 
 
f4ff4c1
 
 
 
eddfbb6
f4ff4c1
 
 
 
 
c3741ac
 
f4ff4c1
 
 
 
 
840c57d
f4ff4c1
840c57d
 
 
f4ff4c1
 
840c57d
 
f4ff4c1
 
840c57d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4ff4c1
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
import gradio as gr
import json
import re
import datetime
import pandas as pd
import pysolr
import google.generativeai as genai
from sshtunnel import SSHTunnelForwarder
import matplotlib.pyplot as plt
import seaborn as sns
import io
import os
import logging
import concurrent.futures
from IPython.display import display, Markdown


# --- Suppress Matplotlib Debug Logs ---
# Matplotlib's font-manager logs at DEBUG are extremely noisy; cap at WARNING.
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# --- SSH Tunnel Configuration ---
# It's recommended to load secrets securely, e.g., from environment variables
SSH_HOST = os.environ.get('SSH_HOST')
SSH_PORT = 5322  # NOTE(review): hard-coded; consider moving to env like the host
SSH_USER = os.environ.get('SSH_USER')
SSH_PASS = os.environ.get('SSH_PASS')

# --- Solr Configuration ---
# NOTE(review): remote Solr host/ports are hard-coded; fine for a single
# deployment, but should come from config/env if environments multiply.
REMOTE_SOLR_HOST = '69.167.186.48'
REMOTE_SOLR_PORT = 8983
LOCAL_BIND_PORT = 8983
SOLR_CORE_NAME = 'news'
SOLR_USER = os.environ.get('SOLR_USER')
SOLR_PASS = os.environ.get('SOLR_PASS')

# --- Google Gemini Configuration ---
# genai.configure raises if the key is missing/invalid; we only warn here and
# let the later model-initialization step fail loudly if the key is unusable.
try:
    genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))
except Exception as e:
    # NOTE(review): the leading characters in these status strings look like
    # mojibake (mis-encoded emoji) — confirm source-file encoding.
    print(f"❌ Gemini API Key Error: {e}. Please ensure 'GEMINI_API_KEY' is set in your environment.")

# --- Global Variables ---
# Populated by the setup block below; consumers check `is_initialized`.
ssh_tunnel_server = None   # SSHTunnelForwarder instance (or None on failure)
solr_client = None         # pysolr.Solr bound through the tunnel
llm_model = None           # genai.GenerativeModel used by all LLM helpers
is_initialized = False     # True only if every setup step succeeded

# Module-level side effect: opens an SSH tunnel and network connections at
# import time. All three steps must succeed for `is_initialized` to be True.
try:
    # 1. Start the SSH Tunnel
    ssh_tunnel_server = SSHTunnelForwarder(
        (SSH_HOST, SSH_PORT),
        ssh_username=SSH_USER,
        ssh_password=SSH_PASS,
        remote_bind_address=(REMOTE_SOLR_HOST, REMOTE_SOLR_PORT),
        local_bind_address=('127.0.0.1', LOCAL_BIND_PORT)
    )
    ssh_tunnel_server.start()
    print(f"πŸš€ SSH tunnel established: Local Port {ssh_tunnel_server.local_bind_port} -> Remote Solr.")

    # 2. Initialize the pysolr client
    # Solr is reached via the local end of the tunnel, not the remote host.
    solr_url = f'http://127.0.0.1:{ssh_tunnel_server.local_bind_port}/solr/{SOLR_CORE_NAME}'
    solr_client = pysolr.Solr(solr_url, auth=(SOLR_USER, SOLR_PASS), always_commit=True)
    solr_client.ping()  # fail fast if auth/core/tunnel is wrong
    print(f"βœ… Solr connection successful on core '{SOLR_CORE_NAME}'.")

    # 3. Initialize the LLM
    # temperature=0 keeps query generation deterministic.
    llm_model = genai.GenerativeModel('gemini-1.5-flash', generation_config=genai.types.GenerationConfig(temperature=0))
    print(f"βœ… LLM Model '{llm_model.model_name}' initialized.")

    print("βœ… System Initialized Successfully.")
    is_initialized = True

except Exception as e:
    print(f"\n❌ An error occurred during setup: {e}")
    # Tear down a half-open tunnel so the process doesn't leak the port.
    if ssh_tunnel_server and ssh_tunnel_server.is_active:
        ssh_tunnel_server.stop()


# Schema catalog for the Solr 'news' core, embedded verbatim into LLM prompts.
# Convention visible in the definitions below: fields ending in `_s` are
# multi-valued synonym fields meant for `query` searches, while their
# suffix-less counterparts are canonical single values meant for `terms`
# faceting. Keep the two lists in sync when the Solr schema changes.
field_metadata = [
    {
        "field_name": "business_model",
        "type": "string (categorical)",
        "example_values": ["pharma/bio", "drug delivery", "pharma services"],
        "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments."
    },
    {
        "field_name": "news_type",
        "type": "string (categorical)",
        "example_values": ["product news", "financial news", "regulatory news"],
        "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported."
    },
    {
        "field_name": "event_type",
        "type": "string (categorical)",
        "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
        "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases."
    },
    {
        "field_name": "source",
        "type": "string (categorical)",
        "example_values": ["Press Release", "PR Newswire", "Business Wire"],
        "definition": "The original source of the news article, such as a newswire or official report."
    },
    {
        "field_name": "company_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
        "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
    },
    {
        "field_name": "company_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
        "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
    },
    {
        "field_name": "territory_hq_s",
        "type": "string (multi-valued, hierarchical)",
        "example_values": ["united states of america", "europe", "europe western"],
        "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location."
    },
    {
        "field_name": "therapeutic_category",
        "type": "string (specific)",
        "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
        "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
    },
    {
        "field_name": "therapeutic_category_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cancer", "oncology", "infections", "cns"],
        "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
    },
    {
        "field_name": "compound_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["opdivo injection solution", "keytruda injection solution"],
        "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
    },
    {
        "field_name": "compound_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
        "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
    },
    {
        "field_name": "molecule_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
        "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
    },
    {
        "field_name": "molecule_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cbd", "s1-220", "a1002n5s"],
        "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
    },
    {
        "field_name": "highest_phase",
        "type": "string (categorical)",
        "example_values": ["marketed", "phase 2", "phase 1"],
        "definition": "The highest stage of development a drug has ever reached."
    },
    {
        "field_name": "drug_delivery_branch_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
        "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
    },
    {
        "field_name": "drug_delivery_branch",
        "type": "string (categorical, specific, for faceting)",
        "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
        "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
    },
    {
        "field_name": "route_branch",
        "type": "string (categorical)",
        "example_values": ["injection", "oral", "topical", "inhalation"],
        "definition": "The primary route of drug administration. Good for faceting on exact routes."
    },
    {
        "field_name": "molecule_api_group",
        "type": "string (categorical)",
        "example_values": ["small molecules", "biologics", "nucleic acids"],
        "definition": "High-level classification of the drug's molecular type."
    },
    {
        "field_name": "content",
        "type": "text (full-text search)",
        "example_values": ["The largest study to date...", "balstilimab..."],
        "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields."
    },
    {
        "field_name": "date",
        "type": "date",
        "example_values": ["2020-10-22T00:00:00Z"],
        "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
    },
    {
        "field_name": "date_year",
        "type": "number (year)",
        "example_values": [2020, 2021, 2022],
        "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
    },
    {
        "field_name": "total_deal_value_in_million",
        "type": "number (metric)",
        "example_values": [50, 120.5, 176.157, 1000],
        "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'."
    }
]

# Helper function to format the metadata for the prompt
def format_metadata_for_prompt(metadata):
    """Render field-metadata dicts as a Markdown bullet list for LLM prompts.

    Args:
        metadata: Iterable of dicts, each with 'field_name', 'type',
            'definition', and 'example_values' keys (example values may be
            any type; they are stringified).

    Returns:
        A single string with one Markdown block per field, each block
        terminated by a blank line. Empty string for empty input.
    """
    # str.join over a generator avoids the quadratic cost of repeated
    # string concatenation in a loop; output is byte-identical.
    return "".join(
        f"- **{field['field_name']}**\n"
        f"  - **Type**: {field['type']}\n"
        f"  - **Definition**: {field['definition']}\n"
        f"  - **Examples**: {', '.join(map(str, field['example_values']))}\n\n"
        for field in metadata
    )
# Pre-render the schema catalog once at import time; every prompt below
# embeds this same string.
formatted_field_info = format_metadata_for_prompt(field_metadata)


def parse_suggestions_from_report(report_text):
    """Pull the numbered follow-up suggestions out of a Markdown report.

    Looks for the suggestions heading (either wording used by the report
    prompt), then collects every numbered-list item under it.

    Returns:
        List of suggestion strings (whitespace-trimmed); empty list when
        no suggestions section is present.
    """
    heading_pattern = r"### (?:Deeper Dive: Suggested Follow-up Analyses|Suggestions for Further Exploration)\s*\n(.*?)$"
    section = re.search(heading_pattern, report_text, re.DOTALL | re.IGNORECASE)
    if not section:
        return []
    item_pattern = r"^\s*\d+\.\s*(.*)"
    items = re.findall(item_pattern, section.group(1), re.MULTILINE)
    return [item.strip() for item in items]


def llm_generate_solr_query_with_history(natural_language_query, field_metadata, chat_history):
    """Generates a Solr query and facet JSON from a natural language query, considering the conversation history.

    Args:
        natural_language_query: The user's current question in plain English.
        field_metadata: Unused in the body; the prompt embeds the module-level
            `formatted_field_info` instead. Kept for signature stability.
        chat_history: Sequence of (user_msg, bot_msg) pairs; only the user
            side is folded into the prompt as context.

    Returns:
        dict parsed from the model's JSON output (expected keys: 'query' and
        'json.facet'), or None on any generation/parsing error.
    """
    # Format the chat history for the prompt
    formatted_history = ""
    for user_msg, bot_msg in chat_history:
        # We only need the user's queries for context, not the bot's detailed responses.
        if user_msg:
            # CORRECTED: Properly formatted f-string with a newline character
            formatted_history += f"- User: \"{user_msg}\"\n"

    prompt = f"""
You are an expert Solr query engineer who converts natural language questions into precise Solr JSON Facet API query objects. Your primary goal is to create a valid JSON object with `query` and `json.facet` keys.

---
### CONVERSATIONAL CONTEXT & RULES

1.  **Today's Date for Calculations**: 2025-07-16
2.  **Allowed Facet Types**: The `type` key for any facet MUST be one of the following: `terms`, `query`, or `range`. **Do not use `date_histogram`**. For time-series analysis, use a `range` facet on a date field.
3.  **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field.
4.  **Facet vs. Query Field Distinction**: This is critical.
    *   For searching in the main `query` parameter, ALWAYS use the multi-valued search fields (ending in `_s`, like `company_name_s`) to get comprehensive results.
    *   For grouping in a `terms` facet, ALWAYS use the canonical, single-value field (e.g., `company_name`, `molecule_name`) to ensure unique and accurate grouping.
5.  **No `count(*)`**: Do NOT use functions like `count(*)`. The default facet bucket count is sufficient for counting documents.
6.  **Allowed Aggregations**: For statistical facets, only use these functions: `sum`, `avg`, `min`, `max`, `unique`. The primary metric field is `total_deal_value_in_million`. The aggregation MUST be a simple string like `"sum(total_deal_value_in_million)"` and not a nested JSON object.
7.  **Term Facet Limits**: Every `terms` facet MUST include a `limit` key. Default to `limit: 10` unless the user specifies a different number of top results.
8.  **Output Format**: Your final output must be a single, raw JSON object and nothing else. Do not add comments, explanations, or markdown formatting like ```json.

---
### FIELD DEFINITIONS (Your Source of Truth)

`{formatted_field_info}`
---
### CHAT HISTORY
`{formatted_history}`
---
### EXAMPLE OF A FOLLOW-UP QUERY

**Initial User Query:** "What are the infections news in this year?"
```json
{{
  "query": "date_year:2025 AND therapeutic_category_s:infections",
  "json.facet": {{
  "infections_news_by_type": {{
    "type": "terms",
    "field": "news_type",
    "limit": 10
  }}
}}
}}
```

**Follow-up User Query:** "Compare deal values for injection vs oral."

**Correct JSON Output for the Follow-up:**
```json
{{
  "query": "therapeutic_category_s:infections AND date_year:2025 AND total_deal_value_in_million:[0 TO *]",
  "json.facet": {{
    "injection_deals": {{
      "type": "query",
      "q": "route_branch:injection",
      "facet": {{
        "total_deal_value": "sum(total_deal_value_in_million)"
      }}
    }},
    "oral_deals": {{
      "type": "query",
      "q": "route_branch:oral",
      "facet": {{
        "total_deal_value": "sum(total_deal_value_in_million)"
      }}
    }}
  }}
}}
```
---
### YOUR TASK

Now, convert the following user query into a single, raw JSON object with 'query' and 'json.facet' keys, strictly following all rules and field definitions provided above and considering the chat history.

**Current User Query:** `{natural_language_query}`
"""
    try:
        response = llm_model.generate_content(prompt)
        # Using a more robust regex to clean the response
        # Strips any ```json fences the model adds despite rule 8.
        cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
        return json.loads(cleaned_text)
    except Exception as e:
        # `response` is unbound if generate_content() itself raised, so guard
        # the attribute access before logging.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_solr_query_with_history: {e}\nRaw Response:\n{raw_response_text}")
        return None


def llm_generate_visualization_code(query_context, facet_data):
    """Generates Python code for visualization based on query and data.

    Args:
        query_context: The user's analytical question, embedded in the prompt.
        facet_data: Solr facet-response dict, serialized into the prompt so
            the model can pick the right parsing template.

    Returns:
        Raw Python source (markdown fences stripped) intended for
        execute_viz_code_and_get_path(), or None on any generation error.
    """
    prompt = f"""
You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.

**User's Analytical Goal:**
"{query_context}"

**Aggregated Data (from Solr Facets):**
```json
{json.dumps(facet_data, indent=2)}
```

---
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
You MUST follow these rules to avoid errors.

**1. Identify the Data Structure FIRST:**
Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.

   *   **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
   *   **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
   *   **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.

**2. Use the Correct Parsing Template:**

---
**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))

# Dynamically find the main facet key (the one with 'buckets')
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key:
    buckets = facet_data[facet_key].get('buckets', [])
    # Check if buckets contain data
    if buckets:
        df = pd.DataFrame(buckets)
        # Check for a nested metric or use 'count'
        if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc):
             # Example for nested sum metric
             df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
             y_axis_label = 'Sum of Total Deal Value'
        else:
             df.rename(columns={{'count': 'value'}}, inplace=True)
             y_axis_label = 'Count'

        sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
        ax.set_xlabel('Category')
        ax.set_ylabel(y_axis_label)
    else:
        ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')


ax.set_title('Your Insightful Title Here')
# Correct way to rotate labels to prevent errors
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

labels = []
values = []
# Iterate through top-level keys, skipping the 'count'
for key, data_dict in facet_data.items():
    if key == 'count' or not isinstance(data_dict, dict):
        continue
    # Extract the label (e.g., 'oral_deals' -> 'Oral')
    label = key.replace('_deals', '').replace('_', ' ').title()
    # Find the metric value, which is NOT 'count'
    metric_value = 0
    for sub_key, sub_value in data_dict.items():
        if sub_key != 'count':
            metric_value = sub_value
            break # Found the metric
    labels.append(label)
    values.append(metric_value)

if labels:
    sns.barplot(x=labels, y=values, ax=ax, palette='mako')
    ax.set_ylabel('Total Deal Value') # Or other metric name
    ax.set_xlabel('Category')
else:
    ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')


ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(14, 8))

# Find the key that has the buckets
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key and facet_data[facet_key].get('buckets'):
    # This list comprehension is robust for parsing nested metrics
    plot_data = []
    for bucket in facet_data[facet_key]['buckets']:
        category = bucket['val']
        # Find all nested metrics (e.g., total_deal_value_2025)
        for sub_key, sub_value in bucket.items():
            if isinstance(sub_value, dict) and 'sum' in sub_value:
                # Extracts year from 'total_deal_value_2025' -> '2025'
                year = sub_key.split('_')[-1]
                value = sub_value['sum']
                plot_data.append({{'Category': category, 'Year': year, 'Value': value}})

    if plot_data:
        df = pd.DataFrame(plot_data)
        sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
        ax.set_ylabel('Total Deal Value')
        ax.set_xlabel('Business Model')
        # Correct way to rotate labels to prevent errors
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    else:
        ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
else:
    ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**3. Final Code Generation:**
- **DO NOT** include `plt.show()`.
- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.

**Your Task:**
Now, generate the Python code.
"""
    try:
        # Increase the timeout for potentially complex generation
        generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048)
        response = llm_model.generate_content(prompt, generation_config=generation_config)
        # Clean the response to remove markdown formatting
        code = re.sub(r'^```python\s*|\s*```$', '', response.text, flags=re.MULTILINE)
        return code
    except Exception as e:
        # BUG FIX: if generate_content() itself raised, `response` is unbound
        # and the old handler's f-string raised NameError, masking the real
        # error. Guard the access the same way
        # llm_generate_solr_query_with_history does.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
        return None

def execute_viz_code_and_get_path(viz_code, facet_data):
    """Executes visualization code and returns the path to the saved plot image.

    Args:
        viz_code: Python source expected to create a Matplotlib figure bound
            to a variable named `fig` (as the generation templates require).
        facet_data: Solr facet dict exposed to the executed code as
            `facet_data`.

    Returns:
        Filesystem path to the saved PNG, or None when viz_code is empty,
        no `fig` was produced, or execution failed.
    """
    if not viz_code:
        return None
    try:
        # exist_ok avoids the check-then-create race of the old
        # os.path.exists + makedirs pair.
        os.makedirs('/tmp/plots', exist_ok=True)
        plot_path = f"/tmp/plots/plot_{datetime.datetime.now().timestamp()}.png"
        # SECURITY NOTE: exec() runs LLM-generated code with full privileges.
        # Tolerable only because the code comes from our own prompt, never
        # directly from end users — do not relax that invariant.
        # The exec environment needs access to the required libraries and the data.
        exec_globals = {'facet_data': facet_data, 'plt': plt, 'sns': sns, 'pd': pd}
        exec(viz_code, exec_globals)
        fig = exec_globals.get('fig')
        if fig:
            fig.savefig(plot_path, bbox_inches='tight')
            plt.close(fig)  # Important to free up memory
            return plot_path
        return None
    except Exception as e:
        print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}")
        return None


def llm_generate_summary_and_suggestions_stream(query_context, facet_data):
    """
    Yields a streaming analytical report and strategic, context-aware suggestions for further exploration.

    Args:
        query_context: The user's original analytical question.
        facet_data: Aggregated Solr facet results, serialized into the prompt.

    Yields:
        str chunks of the Markdown report as the model streams them; on any
        error a single fallback apology string is yielded instead. The final
        section's numbered list is what parse_suggestions_from_report()
        extracts downstream.
    """
    prompt = f"""
You are a leading business intelligence analyst and strategist. Your audience is an executive or decision-maker who relies on you to not just present data, but to uncover its meaning and suggest smart next steps.

Your task is to analyze the provided data, deliver a concise, insightful report, and then propose logical follow-up analyses that could uncover deeper trends or causes.

**Today's Date for Context:** {datetime.datetime.now().strftime('%Y-%m-%d')}

**Analysis Context:**
*   **User's Core Question:** "{query_context}"
*   **Structured Data (Your Evidence):**
    ```json
    {json.dumps(facet_data, indent=2)}
    ```

**--- INSTRUCTIONS ---**

**PART 1: THE ANALYTICAL REPORT**
Structure your report using Markdown. Your tone should be insightful, data-driven, and forward-looking.

*   `## Executive Summary`: A 1-2 sentence, top-line answer to the user's core question. Get straight to the point.

*   `### Key Findings & Insights`: Use bullet points. Don't just state the data; interpret it.
    *   Highlight the most significant figures, patterns, or anomalies.
    *   Where relevant, calculate key differences or growth rates (e.g., "X is 25% higher than Y").
    *   Pinpoint what the visualization or data reveals about the core business question.
    *   **Data Note:** Briefly mention any important caveats if apparent from the data (e.g., a short time frame, a small sample size).

*   `### Context & Implications`: Briefly explain the "so what?" of these findings. What might this mean for our strategy, the market, or operations?

**PART 2: DEEPER DIVE: SUGGESTED FOLLOW-UP ANALYSES**
After the report, create a final section titled `### Deeper Dive: Suggested Follow-up Analyses`.

*   **Think like a strategist.** Based on the findings, what would you ask next to validate a trend, understand a change, or uncover a root cause?
*   **Propose 2-3 logical next questions.** These should be concise and framed as natural language questions that inspire further exploration.
*   **Focus on comparative and trend analysis.** For example:
    *   If the user asked for "this year," suggest a comparison: *"How does this year's performance in [X] compare to last year?"*
    *   If a category is a clear leader, suggest breaking it down: *"What are the top sub-categories driving the growth in [Leading Category]?"*
    *   If there's a time-based trend, suggest exploring correlations: *"Is the decline in [Metric Z] correlated with changes in any other category during the same period?"*
*   Format them as a numbered list.
*   Ensure your suggestions are answerable using the available field definitions below.

### FIELD DEFINITIONS (Your Source of Truth)
{formatted_field_info}

**--- YOUR TASK ---**
Generate the full report and the strategic suggestions based on the user's question and the data provided.
"""
    try:
        # stream=True lets the UI render the report incrementally.
        response_stream = llm_model.generate_content(prompt, stream=True)
        for chunk in response_stream:
            yield chunk.text
    except Exception as e:
        print(f"Error in llm_generate_summary_and_suggestions_stream: {e}")
        yield "Sorry, I was unable to generate a summary for this data."

# CORRECTED: Only one, correctly implemented version of this function remains.
def process_analysis_flow(user_input, history, state):
    """
    A generator that manages the conversation and yields tuples of UI updates for Gradio.
    This version treats any user input as a new query and considers conversation history.

    Args:
        user_input: Raw text from the message textbox.
        history: Gradio chatbot history as a list of (user, bot) message pairs,
            or None after a reset.
        state: Session dict carrying 'query_count' and 'last_suggestions',
            or None on the very first run.

    Yields:
        6-tuples of (chatbot history, state, plot update, report update,
        Solr-query display update, Solr-data display update). The shape and
        order must match the `outputs` list wired to `msg_textbox.submit`
        elsewhere in this file.
    """
    # Initialize state on the first run
    if state is None:
        state = {'query_count': 0, 'last_suggestions': []}

    # If history is None (from a reset), initialize it as an empty list
    if history is None:
        history = []

    # Reset UI components for the new analysis, but keep chat history
    yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False))

    query_context = user_input.strip()
    if not query_context:
        # Empty/whitespace-only input: prompt the user and stop this run.
        history.append((user_input, "Please enter a question to analyze."))
        yield (history, state, None, None, None, None)
        return

    # 1. Acknowledge and start the process
    history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating Solr query...*"))
    yield (history, state, None, None, None, None)

    # 2. Generate Solr Query with history
    # The LLM must return a dict with both 'query' and 'json.facet' keys;
    # anything else is treated as a generation failure.
    llm_solr_obj = llm_generate_solr_query_with_history(query_context, field_metadata, history)
    if not llm_solr_obj or 'query' not in llm_solr_obj or 'json.facet' not in llm_solr_obj:
        history.append((None, "I'm sorry, I couldn't generate a valid Solr query for that request. Please try rephrasing your question."))
        yield (history, state, None, None, None, None)
        return

    solr_q, solr_facet = llm_solr_obj.get('query'), llm_solr_obj.get('json.facet')
    history.append((None, "βœ… Solr query generated!"))
    # Render the generated query + facet JSON into the "Generated Solr Query" accordion.
    formatted_query = f"**Query:**\n```\n{solr_q}\n```\n\n**Facet JSON:**\n```json\n{json.dumps(solr_facet, indent=2)}\n```"
    yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)

    # 3. Execute Query
    try:
        history.append((None, "*Executing query against the database...*"))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)

        # rows=0: we only want the aggregated facet counts, not individual documents.
        search_params = {"rows": 0, "json.facet": json.dumps(solr_facet)}
        results = solr_client.search(q=solr_q, **search_params)
        facet_data = results.raw_response.get("facets", {})

        formatted_data = f"**Facet Data:**\n```json\n{json.dumps(facet_data, indent=2)}\n```"


        # A missing facets section or a zero 'count' means the query matched nothing.
        if not facet_data or facet_data.get('count', 0) == 0:
            history.append((None, "No data was found for your query. Please try a different question."))
            yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            return

        # 4. Generate Visualization
        history.append((None, "βœ… Data retrieved. Generating visualization..."))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Start visualization generation in the background
            # so the report can stream concurrently with plot-code generation.
            viz_future = executor.submit(llm_generate_visualization_code, query_context, facet_data)

            # 5. Generate and Stream Report
            # NOTE(review): this message is appended before viz_future resolves,
            # so "Plot created" can display before any plot actually exists — confirm intended.
            history.append((None, "βœ… Plot created. Streaming final report..."))
            output_report = gr.update(value="", visible=True) # Make it visible before streaming
            yield (history, state, None, output_report, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

            report_text = ""
            # The history object is not modified during streaming, so we pass it once
            # The yield statement for streaming only updates the report text
            stream_history = history[:] # Make a copy
            for chunk in llm_generate_summary_and_suggestions_stream(query_context, facet_data):
                report_text += chunk
                yield (stream_history, state, None, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

            # Update the main history with the final report text
            history.append((None, report_text))

            # Get the visualization code from the future
            # (blocks until the background LLM call finishes).
            viz_code = viz_future.result()
            plot_path = execute_viz_code_and_get_path(viz_code, facet_data)

            # Show the image component only when a plot file was actually produced.
            output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
            if not plot_path:
                history.append((None, "*I was unable to generate a plot for this data.*\n"))
            yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

        # 6. Finalize and prompt for next action
        state['query_count'] += 1
        state['last_suggestions'] = parse_suggestions_from_report(report_text)

        next_prompt = "Analysis complete. What would you like to explore next? You can ask a follow-up question, or ask something new."
        history.append((None, next_prompt))
        yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

    except Exception as e:
        # Broad catch is deliberate: any failure mid-pipeline is surfaced in the
        # chat instead of crashing the Gradio event handler.
        error_message = f"An unexpected error occurred during analysis: {e}"
        history.append((None, error_message))
        print(f"Error during analysis execution: {e}")
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)


# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
    # Per-session state consumed/updated by process_analysis_flow
    # ({'query_count', 'last_suggestions'}); starts as None.
    state = gr.State()

    # Header row: title on the left, reset button on the right.
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("# πŸ’Š PharmaCircle AI Data Analyst")
        with gr.Column(scale=1):
            clear_button = gr.Button("πŸ”„ Start New Analysis", variant="primary")

    gr.Markdown("Ask a question to begin your analysis. I will generate a Solr query, retrieve the data, create a visualization, and write a report. You can then ask follow-up questions freely.")

    # Main layout: chat log + input on the left, query/data/plot/report panels on the right.
    with gr.Row():
        with gr.Column(scale=1):
            chatbot = gr.Chatbot(label="Analysis Chat Log", height=700, show_copy_button=True)
            msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True)

        with gr.Column(scale=2):
            with gr.Accordion("Generated Solr Query", open=False):
                # NOTE(review): this panel starts visible=True while the data panel
                # below starts visible=False — confirm the asymmetry is intended.
                solr_query_display = gr.Markdown("Query will appear here...", visible=True)
            with gr.Accordion("Retrieved Solr Data", open=False):
                solr_data_display = gr.Markdown("Data will appear here...", visible=False)
            plot_display = gr.Image(label="Visualization", type="filepath", visible=False)
            report_display = gr.Markdown("Report will be streamed here...", visible=False)
    # --- Event Wiring ---
    def reset_all():
        """Return the component values that put the whole UI back to a fresh session.

        Order matches clear_button.click's outputs: chatbot, state, msg_textbox,
        plot_display, report_display, solr_query_display, solr_data_display.
        """
        # The four output panels (plot, report, query, data) are all cleared
        # and hidden the same way.
        cleared_panels = [gr.update(value=None, visible=False) for _ in range(4)]
        return ([], None, "", *cleared_panels)

    # Main event handler for all user queries
    # Outputs must line up positionally with the 6-tuples yielded by
    # process_analysis_flow.
    msg_textbox.submit(
        fn=process_analysis_flow,
        inputs=[msg_textbox, chatbot, state],
        outputs=[chatbot, state, plot_display, report_display, solr_query_display, solr_data_display],
    ).then(
        # After the analysis generator finishes, clear the input box immediately
        # (queue=False bypasses the request queue for this small update).
        lambda: gr.update(value=""),
        None,
        [msg_textbox],
        queue=False,
    )

    # Reset button: wipes chat, state, input, and all output panels via reset_all.
    clear_button.click(
        fn=reset_all, 
        inputs=None, 
        outputs=[chatbot, state, msg_textbox, plot_display, report_display, solr_query_display, solr_data_display], 
        queue=False
    )

# Guard clause: skip launching entirely when startup failed
# (is_initialized is set earlier in this file).
if not is_initialized:
    print("\nSkipping Gradio launch due to initialization errors.")
else:
    demo.queue().launch(debug=True, share=True)