# Spaces:
# Sleeping
# Sleeping
"""
Stores the detailed metadata for the Solr index fields.
This information is crucial for the LLM to understand the data schema,
enabling it to construct accurate and efficient Solr queries. Separating it
into its own module keeps the main application logic cleaner.
"""
# Schema catalogue for the Solr index. Each entry is a dict with exactly four
# keys, in this order:
#   "field_name"     - name of the Solr field
#   "type"           - free-text description of the field's type and role
#   "example_values" - a short list of sample values (strings or numbers)
#   "definition"     - guidance text telling the LLM when and how to use the field
# Several fields (company, compound, molecule, drug delivery branch) come in
# pairs: per their own definitions, the unsuffixed name is meant for `terms`
# faceting and the `_s` variant for `query` searches.
field_metadata = [
    {
        "field_name": "business_model",
        "type": "string (categorical)",
        "example_values": ["pharma/bio", "drug delivery", "pharma services"],
        "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments.",
    },
    {
        "field_name": "news_type",
        "type": "string (categorical)",
        "example_values": ["product news", "financial news", "regulatory news"],
        "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported.",
    },
    {
        "field_name": "event_type",
        "type": "string (categorical)",
        "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
        "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases.",
    },
    {
        "field_name": "source",
        "type": "string (categorical)",
        "example_values": ["Press Release", "PR Newswire", "Business Wire"],
        "definition": "The original source of the news article, such as a newswire or official report.",
    },
    {
        "field_name": "company_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
        "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching.",
    },
    {
        "field_name": "company_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
        "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting.",
    },
    {
        "field_name": "territory_hq_s",
        "type": "string (multi-valued, hierarchical)",
        "example_values": ["united states of america", "europe", "europe western"],
        "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location.",
    },
    {
        "field_name": "therapeutic_category",
        "type": "string (specific)",
        "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
        "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries.",
    },
    {
        "field_name": "therapeutic_category_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cancer", "oncology", "infections", "cns"],
        "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter.",
    },
    {
        "field_name": "compound_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["opdivo injection solution", "keytruda injection solution"],
        "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds.",
    },
    {
        "field_name": "compound_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
        "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name.",
    },
    {
        "field_name": "molecule_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
        "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules.",
    },
    {
        "field_name": "molecule_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cbd", "s1-220", "a1002n5s"],
        "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name.",
    },
    {
        "field_name": "highest_phase",
        "type": "string (categorical)",
        "example_values": ["marketed", "phase 2", "phase 1"],
        "definition": "The highest stage of development a drug has ever reached.",
    },
    {
        "field_name": "drug_delivery_branch_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
        "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms.",
    },
    {
        "field_name": "drug_delivery_branch",
        "type": "string (categorical, specific, for faceting)",
        "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
        "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies.",
    },
    {
        "field_name": "route_branch",
        "type": "string (categorical)",
        "example_values": ["injection", "oral", "topical", "inhalation"],
        "definition": "The primary route of drug administration. Good for faceting on exact routes.",
    },
    {
        "field_name": "molecule_api_group",
        "type": "string (categorical)",
        "example_values": ["small molecules", "biologics", "nucleic acids"],
        "definition": "High-level classification of the drug's molecular type.",
    },
    {
        "field_name": "content",
        "type": "text (full-text search)",
        "example_values": ["The largest study to date...", "balstilimab..."],
        "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields.",
    },
    {
        "field_name": "date",
        "type": "date",
        "example_values": ["2020-10-22T00:00:00Z"],
        "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries.",
    },
    {
        "field_name": "date_year",
        "type": "number (year)",
        "example_values": [2020, 2021, 2022],
        "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020').",
    },
    {
        "field_name": "total_deal_value_in_million",
        "type": "number (metric)",
        "example_values": [50, 120.5, 176.157, 1000],
        "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'.",
    },
]
def format_metadata_for_prompt(fields=None):
    """Format field metadata into a bullet-list string for the LLM prompt.

    Args:
        fields: Optional iterable of metadata dicts, each providing the keys
            "field_name", "type", "definition" and "example_values" (an
            iterable of values). When omitted (None), the module-level
            ``field_metadata`` list is used.

    Returns:
        One string containing, per field, a "- **name**" line followed by
        Type, Definition and Examples lines and a blank line. Example values
        are converted with ``str`` and joined with ", ". An empty iterable
        yields "".

    Raises:
        KeyError: If an entry lacks one of the four required keys.
    """
    if fields is None:
        # Looked up at call time, so the default always reflects the current
        # contents of the module-level list.
        fields = field_metadata

    # Render each entry as one chunk and join once, rather than growing a
    # string with repeated `+=`.
    return "".join(
        f"- **{field['field_name']}**\n"
        f" - **Type**: {field['type']}\n"
        f" - **Definition**: {field['definition']}\n"
        f" - **Examples**: {', '.join(map(str, field['example_values']))}\n\n"
        for field in fields
    )