File size: 34,520 Bytes
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
621afd7
f4ff4c1
 
d73150d
f4ff4c1
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdcb123
f4ff4c1
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
 
 
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
bdcb123
f4ff4c1
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
bdcb123
f4ff4c1
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
bdcb123
f4ff4c1
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
bdcb123
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
840c57d
 
 
 
 
 
 
 
 
f4ff4c1
 
 
 
 
840c57d
f4ff4c1
840c57d
 
 
 
bdcb123
 
840c57d
 
 
 
f4ff4c1
 
 
 
840c57d
f4ff4c1
840c57d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4ff4c1
840c57d
f4ff4c1
840c57d
f4ff4c1
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
840c57d
f4ff4c1
 
 
840c57d
f4ff4c1
 
 
840c57d
 
f4ff4c1
 
840c57d
f4ff4c1
 
 
840c57d
8290c25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
8290c25
 
 
 
f4ff4c1
 
 
8290c25
f4ff4c1
 
 
 
 
 
 
 
840c57d
 
f4ff4c1
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
840c57d
f4ff4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840c57d
f4ff4c1
 
 
840c57d
f4ff4c1
 
 
 
 
840c57d
 
 
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
f4ff4c1
840c57d
 
f4ff4c1
 
c3741ac
f4ff4c1
 
 
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
 
 
f4ff4c1
 
 
c3741ac
f4ff4c1
 
 
 
c3741ac
f4ff4c1
621afd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4ff4c1
 
 
 
 
840c57d
f4ff4c1
c3741ac
f4ff4c1
 
 
 
 
c3741ac
f4ff4c1
840c57d
 
f4ff4c1
 
 
2cabd2d
 
 
 
 
 
f4ff4c1
 
 
 
eddfbb6
f4ff4c1
 
 
 
 
c3741ac
 
f4ff4c1
 
 
 
 
840c57d
f4ff4c1
840c57d
 
 
f4ff4c1
 
840c57d
 
f4ff4c1
 
840c57d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4ff4c1
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
import gradio as gr
import json
import re
import datetime
import pandas as pd
import pysolr
import google.generativeai as genai
from sshtunnel import SSHTunnelForwarder
import matplotlib.pyplot as plt
import seaborn as sns
import io
import os
import logging
import concurrent.futures
from IPython.display import display, Markdown


# --- Suppress Matplotlib Debug Logs ---
# Matplotlib's font-manager logs at DEBUG are extremely noisy; cap at WARNING.
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# --- SSH Tunnel Configuration ---
# It's recommended to load secrets securely, e.g., from environment variables
SSH_HOST = os.environ.get('SSH_HOST')
SSH_PORT = 5322  # NOTE(review): hard-coded; consider moving to env like the host
SSH_USER = os.environ.get('SSH_USER')
SSH_PASS = os.environ.get('SSH_PASS')

# --- Solr Configuration ---
# NOTE(review): remote Solr host/ports are hard-coded; fine for a single
# deployment, but should come from config/env if environments multiply.
REMOTE_SOLR_HOST = '69.167.186.48'
REMOTE_SOLR_PORT = 8983
LOCAL_BIND_PORT = 8983
SOLR_CORE_NAME = 'news'
SOLR_USER = os.environ.get('SOLR_USER')
SOLR_PASS = os.environ.get('SOLR_PASS')

# --- Google Gemini Configuration ---
# genai.configure raises if the key is missing/invalid; we only warn here and
# let the later model-initialization step fail loudly if the key is unusable.
try:
    genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))
except Exception as e:
    # NOTE(review): the leading characters in these status strings look like
    # mojibake (mis-encoded emoji) — confirm source-file encoding.
    print(f"❌ Gemini API Key Error: {e}. Please ensure 'GEMINI_API_KEY' is set in your environment.")

# --- Global Variables ---
# Populated by the setup block below; consumers check `is_initialized`.
ssh_tunnel_server = None   # SSHTunnelForwarder instance (or None on failure)
solr_client = None         # pysolr.Solr bound through the tunnel
llm_model = None           # genai.GenerativeModel used by all LLM helpers
is_initialized = False     # True only if every setup step succeeded

# Module-level side effect: opens an SSH tunnel and network connections at
# import time. All three steps must succeed for `is_initialized` to be True.
try:
    # 1. Start the SSH Tunnel
    ssh_tunnel_server = SSHTunnelForwarder(
        (SSH_HOST, SSH_PORT),
        ssh_username=SSH_USER,
        ssh_password=SSH_PASS,
        remote_bind_address=(REMOTE_SOLR_HOST, REMOTE_SOLR_PORT),
        local_bind_address=('127.0.0.1', LOCAL_BIND_PORT)
    )
    ssh_tunnel_server.start()
    print(f"πŸš€ SSH tunnel established: Local Port {ssh_tunnel_server.local_bind_port} -> Remote Solr.")

    # 2. Initialize the pysolr client
    # Solr is reached via the local end of the tunnel, not the remote host.
    solr_url = f'http://127.0.0.1:{ssh_tunnel_server.local_bind_port}/solr/{SOLR_CORE_NAME}'
    solr_client = pysolr.Solr(solr_url, auth=(SOLR_USER, SOLR_PASS), always_commit=True)
    solr_client.ping()  # fail fast if auth/core/tunnel is wrong
    print(f"βœ… Solr connection successful on core '{SOLR_CORE_NAME}'.")

    # 3. Initialize the LLM
    # temperature=0 keeps query generation deterministic.
    llm_model = genai.GenerativeModel('gemini-1.5-flash', generation_config=genai.types.GenerationConfig(temperature=0))
    print(f"βœ… LLM Model '{llm_model.model_name}' initialized.")

    print("βœ… System Initialized Successfully.")
    is_initialized = True

except Exception as e:
    print(f"\n❌ An error occurred during setup: {e}")
    # Tear down a half-open tunnel so the process doesn't leak the port.
    if ssh_tunnel_server and ssh_tunnel_server.is_active:
        ssh_tunnel_server.stop()


# Schema catalog for the Solr 'news' core, embedded verbatim into LLM prompts.
# Convention visible in the definitions below: fields ending in `_s` are
# multi-valued synonym fields meant for `query` searches, while their
# suffix-less counterparts are canonical single values meant for `terms`
# faceting. Keep the two lists in sync when the Solr schema changes.
field_metadata = [
    {
        "field_name": "business_model",
        "type": "string (categorical)",
        "example_values": ["pharma/bio", "drug delivery", "pharma services"],
        "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments."
    },
    {
        "field_name": "news_type",
        "type": "string (categorical)",
        "example_values": ["product news", "financial news", "regulatory news"],
        "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported."
    },
    {
        "field_name": "event_type",
        "type": "string (categorical)",
        "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
        "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases."
    },
    {
        "field_name": "source",
        "type": "string (categorical)",
        "example_values": ["Press Release", "PR Newswire", "Business Wire"],
        "definition": "The original source of the news article, such as a newswire or official report."
    },
    {
        "field_name": "company_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
        "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
    },
    {
        "field_name": "company_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
        "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
    },
    {
        "field_name": "territory_hq_s",
        "type": "string (multi-valued, hierarchical)",
        "example_values": ["united states of america", "europe", "europe western"],
        "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location."
    },
    {
        "field_name": "therapeutic_category",
        "type": "string (specific)",
        "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
        "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
    },
    {
        "field_name": "therapeutic_category_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cancer", "oncology", "infections", "cns"],
        "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
    },
    {
        "field_name": "compound_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["opdivo injection solution", "keytruda injection solution"],
        "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
    },
    {
        "field_name": "compound_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
        "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
    },
    {
        "field_name": "molecule_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
        "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
    },
    {
        "field_name": "molecule_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cbd", "s1-220", "a1002n5s"],
        "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
    },
    {
        "field_name": "highest_phase",
        "type": "string (categorical)",
        "example_values": ["marketed", "phase 2", "phase 1"],
        "definition": "The highest stage of development a drug has ever reached."
    },
    {
        "field_name": "drug_delivery_branch_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
        "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
    },
    {
        "field_name": "drug_delivery_branch",
        "type": "string (categorical, specific, for faceting)",
        "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
        "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
    },
    {
        "field_name": "route_branch",
        "type": "string (categorical)",
        "example_values": ["injection", "oral", "topical", "inhalation"],
        "definition": "The primary route of drug administration. Good for faceting on exact routes."
    },
    {
        "field_name": "molecule_api_group",
        "type": "string (categorical)",
        "example_values": ["small molecules", "biologics", "nucleic acids"],
        "definition": "High-level classification of the drug's molecular type."
    },
    {
        "field_name": "content",
        "type": "text (full-text search)",
        "example_values": ["The largest study to date...", "balstilimab..."],
        "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields."
    },
    {
        "field_name": "date",
        "type": "date",
        "example_values": ["2020-10-22T00:00:00Z"],
        "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
    },
    {
        "field_name": "date_year",
        "type": "number (year)",
        "example_values": [2020, 2021, 2022],
        "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
    },
    {
        "field_name": "total_deal_value_in_million",
        "type": "number (metric)",
        "example_values": [50, 120.5, 176.157, 1000],
        "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'."
    }
]

# Helper function to format the metadata for the prompt
def format_metadata_for_prompt(metadata):
    """Render field-metadata dicts as a Markdown bullet list for LLM prompts.

    Args:
        metadata: Iterable of dicts, each with 'field_name', 'type',
            'definition', and 'example_values' keys (example values may be
            any type; they are stringified).

    Returns:
        A single string with one Markdown block per field, each block
        terminated by a blank line. Empty string for empty input.
    """
    # str.join over a generator avoids the quadratic cost of repeated
    # string concatenation in a loop; output is byte-identical.
    return "".join(
        f"- **{field['field_name']}**\n"
        f"  - **Type**: {field['type']}\n"
        f"  - **Definition**: {field['definition']}\n"
        f"  - **Examples**: {', '.join(map(str, field['example_values']))}\n\n"
        for field in metadata
    )
# Pre-render the schema catalog once at import time; every prompt below
# embeds this same string.
formatted_field_info = format_metadata_for_prompt(field_metadata)


def parse_suggestions_from_report(report_text):
    """Pull the numbered follow-up suggestions out of a Markdown report.

    Looks for the suggestions heading (either wording used by the report
    prompt), then collects every numbered-list item under it.

    Returns:
        List of suggestion strings (whitespace-trimmed); empty list when
        no suggestions section is present.
    """
    heading_pattern = r"### (?:Deeper Dive: Suggested Follow-up Analyses|Suggestions for Further Exploration)\s*\n(.*?)$"
    section = re.search(heading_pattern, report_text, re.DOTALL | re.IGNORECASE)
    if not section:
        return []
    item_pattern = r"^\s*\d+\.\s*(.*)"
    items = re.findall(item_pattern, section.group(1), re.MULTILINE)
    return [item.strip() for item in items]


def llm_generate_solr_query_with_history(natural_language_query, field_metadata, chat_history):
    """Generates a Solr query and facet JSON from a natural language query, considering the conversation history.

    Args:
        natural_language_query: The user's current question in plain English.
        field_metadata: Unused in the body; the prompt embeds the module-level
            `formatted_field_info` instead. Kept for signature stability.
        chat_history: Sequence of (user_msg, bot_msg) pairs; only the user
            side is folded into the prompt as context.

    Returns:
        dict parsed from the model's JSON output (expected keys: 'query' and
        'json.facet'), or None on any generation/parsing error.
    """
    # Format the chat history for the prompt
    formatted_history = ""
    for user_msg, bot_msg in chat_history:
        # We only need the user's queries for context, not the bot's detailed responses.
        if user_msg:
            # CORRECTED: Properly formatted f-string with a newline character
            formatted_history += f"- User: \"{user_msg}\"\n"

    prompt = f"""
You are an expert Solr query engineer who converts natural language questions into precise Solr JSON Facet API query objects. Your primary goal is to create a valid JSON object with `query` and `json.facet` keys.

---
### CONVERSATIONAL CONTEXT & RULES

1.  **Today's Date for Calculations**: 2025-07-16
2.  **Allowed Facet Types**: The `type` key for any facet MUST be one of the following: `terms`, `query`, or `range`. **Do not use `date_histogram`**. For time-series analysis, use a `range` facet on a date field.
3.  **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field.
4.  **Facet vs. Query Field Distinction**: This is critical.
    *   For searching in the main `query` parameter, ALWAYS use the multi-valued search fields (ending in `_s`, like `company_name_s`) to get comprehensive results.
    *   For grouping in a `terms` facet, ALWAYS use the canonical, single-value field (e.g., `company_name`, `molecule_name`) to ensure unique and accurate grouping.
5.  **No `count(*)`**: Do NOT use functions like `count(*)`. The default facet bucket count is sufficient for counting documents.
6.  **Allowed Aggregations**: For statistical facets, only use these functions: `sum`, `avg`, `min`, `max`, `unique`. The primary metric field is `total_deal_value_in_million`. The aggregation MUST be a simple string like `"sum(total_deal_value_in_million)"` and not a nested JSON object.
7.  **Term Facet Limits**: Every `terms` facet MUST include a `limit` key. Default to `limit: 10` unless the user specifies a different number of top results.
8.  **Output Format**: Your final output must be a single, raw JSON object and nothing else. Do not add comments, explanations, or markdown formatting like ```json.

---
### FIELD DEFINITIONS (Your Source of Truth)

`{formatted_field_info}`
---
### CHAT HISTORY
`{formatted_history}`
---
### EXAMPLE OF A FOLLOW-UP QUERY

**Initial User Query:** "What are the infections news in this year?"
```json
{{
  "query": "date_year:2025 AND therapeutic_category_s:infections",
  "json.facet": {{
  "infections_news_by_type": {{
    "type": "terms",
    "field": "news_type",
    "limit": 10
  }}
}}
}}
```

**Follow-up User Query:** "Compare deal values for injection vs oral."

**Correct JSON Output for the Follow-up:**
```json
{{
  "query": "therapeutic_category_s:infections AND date_year:2025 AND total_deal_value_in_million:[0 TO *]",
  "json.facet": {{
    "injection_deals": {{
      "type": "query",
      "q": "route_branch:injection",
      "facet": {{
        "total_deal_value": "sum(total_deal_value_in_million)"
      }}
    }},
    "oral_deals": {{
      "type": "query",
      "q": "route_branch:oral",
      "facet": {{
        "total_deal_value": "sum(total_deal_value_in_million)"
      }}
    }}
  }}
}}
```
---
### YOUR TASK

Now, convert the following user query into a single, raw JSON object with 'query' and 'json.facet' keys, strictly following all rules and field definitions provided above and considering the chat history.

**Current User Query:** `{natural_language_query}`
"""
    try:
        response = llm_model.generate_content(prompt)
        # Using a more robust regex to clean the response
        # Strips any ```json fences the model adds despite rule 8.
        cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
        return json.loads(cleaned_text)
    except Exception as e:
        # `response` is unbound if generate_content() itself raised, so guard
        # the attribute access before logging.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_solr_query_with_history: {e}\nRaw Response:\n{raw_response_text}")
        return None


def llm_generate_visualization_code(query_context, facet_data):
    """Generates Python code for visualization based on query and data.

    Args:
        query_context: The user's analytical question, embedded in the prompt.
        facet_data: Solr facet-response dict, serialized into the prompt so
            the model can pick the right parsing template.

    Returns:
        Raw Python source (markdown fences stripped) intended for
        execute_viz_code_and_get_path(), or None on any generation error.
    """
    prompt = f"""
You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.

**User's Analytical Goal:**
"{query_context}"

**Aggregated Data (from Solr Facets):**
```json
{json.dumps(facet_data, indent=2)}
```

---
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
You MUST follow these rules to avoid errors.

**1. Identify the Data Structure FIRST:**
Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.

   *   **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
   *   **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
   *   **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.

**2. Use the Correct Parsing Template:**

---
**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))

# Dynamically find the main facet key (the one with 'buckets')
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key:
    buckets = facet_data[facet_key].get('buckets', [])
    # Check if buckets contain data
    if buckets:
        df = pd.DataFrame(buckets)
        # Check for a nested metric or use 'count'
        if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc):
             # Example for nested sum metric
             df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
             y_axis_label = 'Sum of Total Deal Value'
        else:
             df.rename(columns={{'count': 'value'}}, inplace=True)
             y_axis_label = 'Count'

        sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
        ax.set_xlabel('Category')
        ax.set_ylabel(y_axis_label)
    else:
        ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')


ax.set_title('Your Insightful Title Here')
# Correct way to rotate labels to prevent errors
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

labels = []
values = []
# Iterate through top-level keys, skipping the 'count'
for key, data_dict in facet_data.items():
    if key == 'count' or not isinstance(data_dict, dict):
        continue
    # Extract the label (e.g., 'oral_deals' -> 'Oral')
    label = key.replace('_deals', '').replace('_', ' ').title()
    # Find the metric value, which is NOT 'count'
    metric_value = 0
    for sub_key, sub_value in data_dict.items():
        if sub_key != 'count':
            metric_value = sub_value
            break # Found the metric
    labels.append(label)
    values.append(metric_value)

if labels:
    sns.barplot(x=labels, y=values, ax=ax, palette='mako')
    ax.set_ylabel('Total Deal Value') # Or other metric name
    ax.set_xlabel('Category')
else:
    ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')


ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(14, 8))

# Find the key that has the buckets
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key and facet_data[facet_key].get('buckets'):
    # This list comprehension is robust for parsing nested metrics
    plot_data = []
    for bucket in facet_data[facet_key]['buckets']:
        category = bucket['val']
        # Find all nested metrics (e.g., total_deal_value_2025)
        for sub_key, sub_value in bucket.items():
            if isinstance(sub_value, dict) and 'sum' in sub_value:
                # Extracts year from 'total_deal_value_2025' -> '2025'
                year = sub_key.split('_')[-1]
                value = sub_value['sum']
                plot_data.append({{'Category': category, 'Year': year, 'Value': value}})

    if plot_data:
        df = pd.DataFrame(plot_data)
        sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
        ax.set_ylabel('Total Deal Value')
        ax.set_xlabel('Business Model')
        # Correct way to rotate labels to prevent errors
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    else:
        ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
else:
    ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**3. Final Code Generation:**
- **DO NOT** include `plt.show()`.
- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.

**Your Task:**
Now, generate the Python code.
"""
    try:
        # Increase the timeout for potentially complex generation
        generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048)
        response = llm_model.generate_content(prompt, generation_config=generation_config)
        # Clean the response to remove markdown formatting
        code = re.sub(r'^```python\s*|\s*```$', '', response.text, flags=re.MULTILINE)
        return code
    except Exception as e:
        # BUG FIX: if generate_content() itself raised, `response` is unbound
        # and the old handler's f-string raised NameError, masking the real
        # error. Guard the access the same way
        # llm_generate_solr_query_with_history does.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
        return None

def execute_viz_code_and_get_path(viz_code, facet_data):
    """Executes visualization code and returns the path to the saved plot image.

    Args:
        viz_code: Python source expected to create a Matplotlib figure bound
            to a variable named `fig` (as the generation templates require).
        facet_data: Solr facet dict exposed to the executed code as
            `facet_data`.

    Returns:
        Filesystem path to the saved PNG, or None when viz_code is empty,
        no `fig` was produced, or execution failed.
    """
    if not viz_code:
        return None
    try:
        # exist_ok avoids the check-then-create race of the old
        # os.path.exists + makedirs pair.
        os.makedirs('/tmp/plots', exist_ok=True)
        plot_path = f"/tmp/plots/plot_{datetime.datetime.now().timestamp()}.png"
        # SECURITY NOTE: exec() runs LLM-generated code with full privileges.
        # Tolerable only because the code comes from our own prompt, never
        # directly from end users — do not relax that invariant.
        # The exec environment needs access to the required libraries and the data.
        exec_globals = {'facet_data': facet_data, 'plt': plt, 'sns': sns, 'pd': pd}
        exec(viz_code, exec_globals)
        fig = exec_globals.get('fig')
        if fig:
            fig.savefig(plot_path, bbox_inches='tight')
            plt.close(fig)  # Important to free up memory
            return plot_path
        return None
    except Exception as e:
        print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}")
        return None


def llm_generate_summary_and_suggestions_stream(query_context, facet_data):
    """
    Yields a streaming analytical report and strategic, context-aware suggestions for further exploration.

    Args:
        query_context: The user's original analytical question.
        facet_data: Aggregated Solr facet results, serialized into the prompt.

    Yields:
        str chunks of the Markdown report as the model streams them; on any
        error a single fallback apology string is yielded instead. The final
        section's numbered list is what parse_suggestions_from_report()
        extracts downstream.
    """
    prompt = f"""
You are a leading business intelligence analyst and strategist. Your audience is an executive or decision-maker who relies on you to not just present data, but to uncover its meaning and suggest smart next steps.

Your task is to analyze the provided data, deliver a concise, insightful report, and then propose logical follow-up analyses that could uncover deeper trends or causes.

**Today's Date for Context:** {datetime.datetime.now().strftime('%Y-%m-%d')}

**Analysis Context:**
*   **User's Core Question:** "{query_context}"
*   **Structured Data (Your Evidence):**
    ```json
    {json.dumps(facet_data, indent=2)}
    ```

**--- INSTRUCTIONS ---**

**PART 1: THE ANALYTICAL REPORT**
Structure your report using Markdown. Your tone should be insightful, data-driven, and forward-looking.

*   `## Executive Summary`: A 1-2 sentence, top-line answer to the user's core question. Get straight to the point.

*   `### Key Findings & Insights`: Use bullet points. Don't just state the data; interpret it.
    *   Highlight the most significant figures, patterns, or anomalies.
    *   Where relevant, calculate key differences or growth rates (e.g., "X is 25% higher than Y").
    *   Pinpoint what the visualization or data reveals about the core business question.
    *   **Data Note:** Briefly mention any important caveats if apparent from the data (e.g., a short time frame, a small sample size).

*   `### Context & Implications`: Briefly explain the "so what?" of these findings. What might this mean for our strategy, the market, or operations?

**PART 2: DEEPER DIVE: SUGGESTED FOLLOW-UP ANALYSES**
After the report, create a final section titled `### Deeper Dive: Suggested Follow-up Analyses`.

*   **Think like a strategist.** Based on the findings, what would you ask next to validate a trend, understand a change, or uncover a root cause?
*   **Propose 2-3 logical next questions.** These should be concise and framed as natural language questions that inspire further exploration.
*   **Focus on comparative and trend analysis.** For example:
    *   If the user asked for "this year," suggest a comparison: *"How does this year's performance in [X] compare to last year?"*
    *   If a category is a clear leader, suggest breaking it down: *"What are the top sub-categories driving the growth in [Leading Category]?"*
    *   If there's a time-based trend, suggest exploring correlations: *"Is the decline in [Metric Z] correlated with changes in any other category during the same period?"*
*   Format them as a numbered list.
*   Ensure your suggestions are answerable using the available field definitions below.

### FIELD DEFINITIONS (Your Source of Truth)
{formatted_field_info}

**--- YOUR TASK ---**
Generate the full report and the strategic suggestions based on the user's question and the data provided.
"""
    try:
        # stream=True lets the UI render the report incrementally.
        response_stream = llm_model.generate_content(prompt, stream=True)
        for chunk in response_stream:
            yield chunk.text
    except Exception as e:
        print(f"Error in llm_generate_summary_and_suggestions_stream: {e}")
        yield "Sorry, I was unable to generate a summary for this data."

# CORRECTED: Only one, correctly implemented version of this function remains.
def process_analysis_flow(user_input, history, state):
    """
    A generator that manages the conversation and yields tuples of UI updates for Gradio.
    This version treats any user input as a new query and considers conversation history.

    Args:
        user_input: Raw text from the message textbox.
        history: Gradio chatbot history as a list of (user, bot) message pairs,
            or None after a reset.
        state: Session dict carrying 'query_count' and 'last_suggestions',
            or None on the very first run.

    Yields:
        6-tuples of (chatbot history, state, plot update, report update,
        Solr-query display update, Solr-data display update). The shape and
        order must match the `outputs` list wired to `msg_textbox.submit`
        elsewhere in this file.
    """
    # Initialize state on the first run
    if state is None:
        state = {'query_count': 0, 'last_suggestions': []}

    # If history is None (from a reset), initialize it as an empty list
    if history is None:
        history = []

    # Reset UI components for the new analysis, but keep chat history
    yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False))

    query_context = user_input.strip()
    if not query_context:
        # Empty/whitespace-only input: prompt the user and stop this run.
        history.append((user_input, "Please enter a question to analyze."))
        yield (history, state, None, None, None, None)
        return

    # 1. Acknowledge and start the process
    history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating Solr query...*"))
    yield (history, state, None, None, None, None)

    # 2. Generate Solr Query with history
    # The LLM must return a dict with both 'query' and 'json.facet' keys;
    # anything else is treated as a generation failure.
    llm_solr_obj = llm_generate_solr_query_with_history(query_context, field_metadata, history)
    if not llm_solr_obj or 'query' not in llm_solr_obj or 'json.facet' not in llm_solr_obj:
        history.append((None, "I'm sorry, I couldn't generate a valid Solr query for that request. Please try rephrasing your question."))
        yield (history, state, None, None, None, None)
        return

    solr_q, solr_facet = llm_solr_obj.get('query'), llm_solr_obj.get('json.facet')
    history.append((None, "βœ… Solr query generated!"))
    # Render the generated query + facet JSON into the "Generated Solr Query" accordion.
    formatted_query = f"**Query:**\n```\n{solr_q}\n```\n\n**Facet JSON:**\n```json\n{json.dumps(solr_facet, indent=2)}\n```"
    yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)

    # 3. Execute Query
    try:
        history.append((None, "*Executing query against the database...*"))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)

        # rows=0: we only want the aggregated facet counts, not individual documents.
        search_params = {"rows": 0, "json.facet": json.dumps(solr_facet)}
        results = solr_client.search(q=solr_q, **search_params)
        facet_data = results.raw_response.get("facets", {})

        formatted_data = f"**Facet Data:**\n```json\n{json.dumps(facet_data, indent=2)}\n```"


        # A missing facets section or a zero 'count' means the query matched nothing.
        if not facet_data or facet_data.get('count', 0) == 0:
            history.append((None, "No data was found for your query. Please try a different question."))
            yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            return

        # 4. Generate Visualization
        history.append((None, "βœ… Data retrieved. Generating visualization..."))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Start visualization generation in the background
            # so the report can stream concurrently with plot-code generation.
            viz_future = executor.submit(llm_generate_visualization_code, query_context, facet_data)

            # 5. Generate and Stream Report
            # NOTE(review): this message is appended before viz_future resolves,
            # so "Plot created" can display before any plot actually exists — confirm intended.
            history.append((None, "βœ… Plot created. Streaming final report..."))
            output_report = gr.update(value="", visible=True) # Make it visible before streaming
            yield (history, state, None, output_report, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

            report_text = ""
            # The history object is not modified during streaming, so we pass it once
            # The yield statement for streaming only updates the report text
            stream_history = history[:] # Make a copy
            for chunk in llm_generate_summary_and_suggestions_stream(query_context, facet_data):
                report_text += chunk
                yield (stream_history, state, None, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

            # Update the main history with the final report text
            history.append((None, report_text))

            # Get the visualization code from the future
            # (blocks until the background LLM call finishes).
            viz_code = viz_future.result()
            plot_path = execute_viz_code_and_get_path(viz_code, facet_data)

            # Show the image component only when a plot file was actually produced.
            output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
            if not plot_path:
                history.append((None, "*I was unable to generate a plot for this data.*\n"))
            yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

        # 6. Finalize and prompt for next action
        state['query_count'] += 1
        state['last_suggestions'] = parse_suggestions_from_report(report_text)

        next_prompt = "Analysis complete. What would you like to explore next? You can ask a follow-up question, or ask something new."
        history.append((None, next_prompt))
        yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

    except Exception as e:
        # Broad catch is deliberate: any failure mid-pipeline is surfaced in the
        # chat instead of crashing the Gradio event handler.
        error_message = f"An unexpected error occurred during analysis: {e}"
        history.append((None, error_message))
        print(f"Error during analysis execution: {e}")
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)


# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
    # Per-session state consumed/updated by process_analysis_flow
    # ({'query_count', 'last_suggestions'}); starts as None.
    state = gr.State()

    # Header row: title on the left, reset button on the right.
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("# πŸ’Š PharmaCircle AI Data Analyst")
        with gr.Column(scale=1):
            clear_button = gr.Button("πŸ”„ Start New Analysis", variant="primary")

    gr.Markdown("Ask a question to begin your analysis. I will generate a Solr query, retrieve the data, create a visualization, and write a report. You can then ask follow-up questions freely.")

    # Main layout: chat log + input on the left, query/data/plot/report panels on the right.
    with gr.Row():
        with gr.Column(scale=1):
            chatbot = gr.Chatbot(label="Analysis Chat Log", height=700, show_copy_button=True)
            msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True)

        with gr.Column(scale=2):
            with gr.Accordion("Generated Solr Query", open=False):
                # NOTE(review): this panel starts visible=True while the data panel
                # below starts visible=False — confirm the asymmetry is intended.
                solr_query_display = gr.Markdown("Query will appear here...", visible=True)
            with gr.Accordion("Retrieved Solr Data", open=False):
                solr_data_display = gr.Markdown("Data will appear here...", visible=False)
            plot_display = gr.Image(label="Visualization", type="filepath", visible=False)
            report_display = gr.Markdown("Report will be streamed here...", visible=False)
    # --- Event Wiring ---
    def reset_all():
        """Return the component values that put the whole UI back to a fresh session.

        Order matches clear_button.click's outputs: chatbot, state, msg_textbox,
        plot_display, report_display, solr_query_display, solr_data_display.
        """
        # The four output panels (plot, report, query, data) are all cleared
        # and hidden the same way.
        cleared_panels = [gr.update(value=None, visible=False) for _ in range(4)]
        return ([], None, "", *cleared_panels)

    # Main event handler for all user queries
    # Outputs must line up positionally with the 6-tuples yielded by
    # process_analysis_flow.
    msg_textbox.submit(
        fn=process_analysis_flow,
        inputs=[msg_textbox, chatbot, state],
        outputs=[chatbot, state, plot_display, report_display, solr_query_display, solr_data_display],
    ).then(
        # After the analysis generator finishes, clear the input box immediately
        # (queue=False bypasses the request queue for this small update).
        lambda: gr.update(value=""),
        None,
        [msg_textbox],
        queue=False,
    )

    # Reset button: wipes chat, state, input, and all output panels via reset_all.
    clear_button.click(
        fn=reset_all, 
        inputs=None, 
        outputs=[chatbot, state, msg_textbox, plot_display, report_display, solr_query_display, solr_data_display], 
        queue=False
    )

# Guard clause: skip launching entirely when startup failed
# (is_initialized is set earlier in this file).
if not is_initialized:
    print("\nSkipping Gradio launch due to initialization errors.")
else:
    demo.queue().launch(debug=True, share=True)