# Spaces: Running
import json
from collections import Counter

import pandas as pd
import plotly.express as px
import streamlit as st
from huggingface_hub import hf_hub_download
# Page-level Streamlit settings: browser-tab title and icon, wide layout.
st.set_page_config(
    layout="wide",
    page_icon="πΏ",
    page_title="PhytoAI Assistant",
)
def load_phytoai_data():
    """Load the PhytoAI dataset from the Hugging Face Hub.

    Fetches ``mega_final_dataset.json`` from the
    ``Gatescrispy/phytoai-mega-dataset`` dataset repository with
    ``hf_hub_download`` and parses the file as JSON.

    Returns:
        The parsed JSON content, or ``None`` when the download, the file
        read or the JSON parsing raises. In that case the exception text is
        shown to the user through ``st.error``.
    """
    try:
        dataset_path = hf_hub_download(
            repo_id="Gatescrispy/phytoai-mega-dataset",
            filename="mega_final_dataset.json",
            repo_type="dataset",
        )
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale-dependent default text encoding.
        with open(dataset_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        # UI boundary: report the failure on the page rather than crash it.
        st.error(f"Data loading error: {e}")
        return None
def main():
    """Render the PhytoAI Assistant page.

    Draws the title, loads the dataset, and then either shows a static
    dataset preview (when loading returned ``None``) or the sidebar search,
    the statistics and the charts, followed by the footer.
    """
    st.title("πΏ PhytoAI Assistant")
    st.markdown("### AI Assistant for Phytotherapy Research")
    st.markdown("---")

    # Fetch the dataset while a spinner is displayed.
    with st.spinner("Loading PhytoAI data..."):
        dataset = load_phytoai_data()

    if dataset is None:
        # Nothing to search: show a static description and stop here.
        st.error("β Unable to load PhytoAI data")
        st.info("The dataset will be available once uploaded to Hugging Face")
        st.subheader("π PhytoAI Dataset Preview")
        for preview_line in (
            "**Dataset content:**",
            "β’ 352 unique natural compounds",
            "β’ 1,314 documented bioactivities",
            "β’ Sources: PubChem, ChEMBL, scientific literature",
        ):
            st.write(preview_line)
        return

    # Sidebar: pick a search mode, then show the matching input widget.
    sidebar = st.sidebar
    sidebar.header("π Compound Search")
    mode = sidebar.selectbox(
        "Search type:",
        ["Compound name", "Therapeutic activity"],
    )

    if mode == "Therapeutic activity":
        activity_options = [
            "", "anti-inflammatory", "antioxidant", "cardiovascular",
            "neuroprotective", "anti-cancer", "antimicrobial",
        ]
        chosen_activity = sidebar.selectbox("Select an activity:", activity_options)
        if chosen_activity:
            search_by_therapeutic_activity(dataset, chosen_activity)
    elif mode == "Compound name":
        name_query = sidebar.text_input(
            "Compound name",
            placeholder="curcumin, resveratrol, quercetin...",
        )
        if name_query:
            search_compounds_by_name(dataset, name_query)

    # Dataset-wide overview shown below any search results.
    display_main_statistics(dataset)
    create_visualizations(dataset)

    # Footer with a link to the dataset page.
    st.markdown("---")
    st.markdown("**πΏ PhytoAI** - AI Assistant for Phytotherapy Research")
    st.markdown("π [PhytoAI Dataset](https://huggingface.co/datasets/Gatescrispy/phytoai-mega-dataset) | π¬ Research & Development")
def search_compounds_by_name(data, search_term):
    """Show up to five compounds whose name contains ``search_term``.

    Matching is a case-insensitive substring test against each record's
    ``compound_name``. Every hit is rendered in its own expander with its
    molecular properties and at most five bioactivity types.

    Args:
        data: Mapping whose values are compound records read with ``.get``
            (keys used here: ``compound_name``, ``molecular_formula``,
            ``smiles``, ``pubchem_cid``, ``bioactivities``).
        search_term: Text to look for in the compound names.
    """
    st.subheader(f"π Results for '{search_term}'")

    needle = search_term.lower()
    # ``or ''`` also covers a key that is present but holds JSON null; the
    # ``.get`` default alone would return None and ``.lower()`` would raise.
    results = [
        compound_data
        for compound_data in data.values()
        if needle in (compound_data.get('compound_name') or '').lower()
    ]

    if not results:
        st.info("No compounds found for this search")
        return

    # Only the first five hits are displayed.
    for compound_data in results[:5]:
        with st.expander(f"𧬠{compound_data.get('compound_name', 'Unknown compound')}"):
            col1, col2 = st.columns(2)

            with col1:
                st.write("**Molecular Properties:**")
                st.write(f"β’ Formula: `{compound_data.get('molecular_formula', 'N/A')}`")
                st.write(f"β’ SMILES: `{compound_data.get('smiles', 'N/A')}`")
                st.write(f"β’ PubChem CID: `{compound_data.get('pubchem_cid', 'N/A')}`")

            with col2:
                st.write("**Bioactivities:**")
                # A null ``bioactivities`` value is treated as "none listed".
                bioactivities = compound_data.get('bioactivities') or []
                for activity in bioactivities[:5]:
                    st.write(f"β’ {activity.get('activity_type', 'N/A')}")
                # Summarise whatever did not fit in the five shown lines.
                if len(bioactivities) > 5:
                    st.write(f"... and {len(bioactivities) - 5} others")
def search_by_therapeutic_activity(data, activity_type):
    """List the compounds that have a bioactivity matching ``activity_type``.

    A compound matches when the lower-cased ``activity_type`` of one of its
    bioactivities contains the requested term, either as given or with its
    hyphens removed. Each matching compound contributes one table row, built
    from its first matching bioactivity.

    Args:
        data: Mapping whose values are compound records read with ``.get``
            (keys used here: ``compound_name``, ``molecular_formula``,
            ``pubchem_cid``, ``bioactivities``).
        activity_type: Activity term to search for, e.g. ``"anti-cancer"``.
    """
    st.subheader(f"π― Compounds with activity: {activity_type}")

    wanted = activity_type.lower()
    # Also accept the hyphen-less spelling so that e.g. "anti-cancer" finds
    # "anticancer" records, which create_visualizations counts as well.
    needles = (wanted, wanted.replace('-', ''))

    matching_compounds = []
    for compound_data in data.values():
        # ``or []`` / ``or ''`` cover keys that are present but hold null.
        for activity in compound_data.get('bioactivities') or []:
            label = (activity.get('activity_type') or '').lower()
            if any(needle in label for needle in needles):
                matching_compounds.append({
                    'Compound': compound_data.get('compound_name', 'N/A'),
                    'Formula': compound_data.get('molecular_formula', 'N/A'),
                    'Activity': activity.get('activity_type', 'N/A'),
                    'CID': compound_data.get('pubchem_cid', 'N/A'),
                })
                # One row per compound: stop at the first matching activity.
                break

    if matching_compounds:
        df = pd.DataFrame(matching_compounds)
        st.dataframe(df, use_container_width=True)
        st.info(f"π {len(matching_compounds)} compounds found with this activity")
    else:
        st.warning("No compounds found for this activity")
def display_main_statistics(data):
    """Show four headline metrics for the dataset in a single row.

    The metrics are: number of compounds, total number of bioactivity
    entries, number of distinct therapeutic areas found among the activity
    types, and the percentage of compounds with a truthy ``pubchem_cid``.

    Args:
        data: Mapping whose values are compound records read with ``.get``
            (keys used here: ``bioactivities``, ``pubchem_cid``).
    """
    st.header("π PhytoAI Dataset Statistics")

    # Therapeutic area -> substrings that identify it in an activity label.
    area_needles = (
        ('anti-inflammatory', ('anti-inflammatory',)),
        ('antioxidant', ('antioxidant',)),
        ('cardiovascular', ('cardiovascular',)),
        ('neuroprotective', ('neuroprotective',)),
        ('anti-cancer', ('anti-cancer', 'anticancer')),
        ('antimicrobial', ('antimicrobial',)),
    )

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("𧬠Total compounds", len(data))

    with col2:
        # ``or []`` covers a ``bioactivities`` key that holds null.
        total_bioactivities = sum(
            len(comp.get('bioactivities') or []) for comp in data.values()
        )
        st.metric("π¬ Total bioactivities", f"{total_bioactivities:,}")

    with col3:
        therapeutic_areas = set()
        for compound_data in data.values():
            for activity in compound_data.get('bioactivities') or []:
                label = (activity.get('activity_type') or '').lower()
                # Record the area that matched, not the first word of the
                # label, so "potent antioxidant" counts as "antioxidant".
                therapeutic_areas.update(
                    area
                    for area, needles in area_needles
                    if any(needle in label for needle in needles)
                )
        st.metric("π― Therapeutic areas", len(therapeutic_areas))

    with col4:
        compounds_with_pubchem = sum(
            1 for comp in data.values() if comp.get('pubchem_cid')
        )
        # An empty dataset has no coverage; avoid dividing by zero.
        coverage = (compounds_with_pubchem / len(data)) * 100 if data else 0.0
        st.metric("π PubChem coverage", f"{coverage:.1f}%")
def create_visualizations(data):
    """Plot how bioactivity entries are spread over therapeutic categories.

    Every bioactivity whose lower-cased ``activity_type`` contains one of
    the known terms is counted under the first category that matches.
    When at least one entry was categorised, a bar chart and a pie chart of
    the counts are rendered side by side; otherwise only the header shows.

    Args:
        data: Mapping whose values are compound records read with ``.get``
            (key used here: ``bioactivities``).
    """
    st.header("π Interactive Visualizations")

    # Display name -> substrings identifying the category. Order matters:
    # a label is counted under the first category that matches.
    categories = (
        ('Anti-inflammatory', ('anti-inflammatory',)),
        ('Antioxidant', ('antioxidant',)),
        ('Cardiovascular', ('cardiovascular',)),
        ('Neuroprotective', ('neuroprotective',)),
        ('Anti-cancer', ('anti-cancer', 'anticancer')),
        ('Antimicrobial', ('antimicrobial',)),
    )

    activity_counts = Counter()
    for compound_data in data.values():
        # ``or []`` / ``or ''`` cover keys that are present but hold null.
        for activity in compound_data.get('bioactivities') or []:
            label = (activity.get('activity_type') or '').lower()
            for category, needles in categories:
                if any(needle in label for needle in needles):
                    activity_counts[category] += 1
                    break

    if not activity_counts:
        return

    names = list(activity_counts.keys())
    counts = list(activity_counts.values())

    col1, col2 = st.columns(2)

    with col1:
        # Bar chart.
        # NOTE(review): the y-axis label says "Compounds", but the counts are
        # per bioactivity entry - confirm which wording is intended.
        fig_bar = px.bar(
            x=names,
            y=counts,
            title="Distribution of Therapeutic Activities",
            labels={'x': 'Activity Type', 'y': 'Number of Compounds'},
            color=counts,
            color_continuous_scale="Viridis",
        )
        fig_bar.update_layout(showlegend=False)
        st.plotly_chart(fig_bar, use_container_width=True)

    with col2:
        # Pie chart of the same counts.
        fig_pie = px.pie(
            values=counts,
            names=names,
            title="Therapeutic Areas Distribution",
        )
        st.plotly_chart(fig_pie, use_container_width=True)
# Build the page only when this file is executed as the main script.
if __name__ == "__main__":
    main()