TANTCHEU Noussi Cédric
Initial space upload: Interactive PhytoAI Assistant
7603b2e
import streamlit as st
import json
import pandas as pd
from huggingface_hub import hf_hub_download
import plotly.express as px
# Page-level settings: browser-tab title, tab icon and wide layout.
st.set_page_config(page_title="PhytoAI Assistant", page_icon="🌿", layout="wide")
@st.cache_data
def _fetch_phytoai_dataset():
    """Fetch the dataset file via ``hf_hub_download`` and parse it as JSON.

    Any download, I/O or parsing error propagates to the caller. Keeping the
    error handling out of this cached function means a failed attempt never
    produces a return value (``None``) that the cache would keep serving.
    """
    dataset_path = hf_hub_download(
        repo_id="Gatescrispy/phytoai-mega-dataset",
        filename="mega_final_dataset.json",
        repo_type="dataset",
    )
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(dataset_path, "r", encoding="utf-8") as f:
        return json.load(f)


def load_phytoai_data():
    """Load PhytoAI data from HF dataset.

    Returns:
        The parsed JSON content of ``mega_final_dataset.json``, or ``None``
        when loading fails (the error is also shown with ``st.error``).
    """
    try:
        return _fetch_phytoai_dataset()
    except Exception as e:  # UI boundary: report the problem instead of crashing
        st.error(f"Data loading error: {e}")
        return None
def main():
    """Render the PhytoAI Assistant page.

    Loads the dataset, then shows the sidebar search (by compound name or by
    therapeutic activity), the dataset statistics, the charts and a footer.
    When no data is available, a static preview of the dataset is shown
    instead and the function returns early.
    """
    st.title("🌿 PhytoAI Assistant")
    st.markdown("### AI Assistant for Phytotherapy Research")
    st.markdown("---")

    # Load data
    with st.spinner("Loading PhytoAI data..."):
        data = load_phytoai_data()

    # ``None`` signals a load failure; an empty dataset is handled the same
    # way because there is nothing to search, count or chart in it.
    if not data:
        st.error("❌ Unable to load PhytoAI data")
        st.info("The dataset will be available once uploaded to Hugging Face")
        # Demo data
        st.subheader("📊 PhytoAI Dataset Preview")
        st.write("**Dataset content:**")
        st.write("• 352 unique natural compounds")
        st.write("• 1,314 documented bioactivities")
        st.write("• Sources: PubChem, ChEMBL, scientific literature")
        return

    # Search interface
    st.sidebar.header("🔍 Compound Search")
    search_type = st.sidebar.selectbox(
        "Search type:",
        ["Compound name", "Therapeutic activity"],
    )

    if search_type == "Compound name":
        compound_search = st.sidebar.text_input(
            "Compound name",
            placeholder="curcumin, resveratrol, quercetin...",
        )
        if compound_search:
            search_compounds_by_name(data, compound_search)
    elif search_type == "Therapeutic activity":
        # The leading empty option means "nothing selected yet".
        activity_search = st.sidebar.selectbox(
            "Select an activity:",
            ["", "anti-inflammatory", "antioxidant", "cardiovascular",
             "neuroprotective", "anti-cancer", "antimicrobial"],
        )
        if activity_search:
            search_by_therapeutic_activity(data, activity_search)

    # Main statistics
    display_main_statistics(data)

    # Visualizations
    create_visualizations(data)

    # Footer
    st.markdown("---")
    st.markdown("**🌿 PhytoAI** - AI Assistant for Phytotherapy Research")
    st.markdown("📊 [PhytoAI Dataset](https://huggingface.co/datasets/Gatescrispy/phytoai-mega-dataset) | 🔬 Research & Development")
def search_compounds_by_name(data, search_term):
    """Search by compound name and render up to five matching compounds.

    Args:
        data: Mapping of compound id to compound record. Each record is read
            through its 'compound_name', 'molecular_formula', 'smiles',
            'pubchem_cid' and 'bioactivities' keys.
        search_term: Text to look for. Surrounding whitespace is ignored and
            the term is matched case-insensitively as a substring of the
            compound name.
    """
    max_results = 5
    max_activities = 5

    st.subheader(f"🔍 Results for '{search_term}'")

    # Normalise once, outside the loop. A blank term matches nothing.
    needle = search_term.strip().lower()
    results = []
    if needle:
        for compound_data in data.values():
            # ``or ''`` also covers a name stored as null in the JSON.
            compound_name = str(compound_data.get('compound_name') or '').lower()
            if needle in compound_name:
                results.append(compound_data)

    if not results:
        st.info("No compounds found for this search")
        return

    for compound_data in results[:max_results]:
        title = compound_data.get('compound_name') or 'Unknown compound'
        with st.expander(f"🧬 {title}"):
            col1, col2 = st.columns(2)
            with col1:
                st.write("**Molecular Properties:**")
                st.write(f"• Formula: `{compound_data.get('molecular_formula', 'N/A')}`")
                st.write(f"• SMILES: `{compound_data.get('smiles', 'N/A')}`")
                st.write(f"• PubChem CID: `{compound_data.get('pubchem_cid', 'N/A')}`")
            with col2:
                st.write("**Bioactivities:**")
                bioactivities = compound_data.get('bioactivities') or []
                for activity in bioactivities[:max_activities]:
                    st.write(f"• {activity.get('activity_type', 'N/A')}")
                # Tell the user how many entries were left out of the list.
                hidden = len(bioactivities) - max_activities
                if hidden > 0:
                    st.write(f"... and {hidden} others")
def _activity_key(text):
    """Return *text* lower-cased with hyphens removed ('' for a null value).

    Dropping hyphens makes spellings such as 'anti-cancer' and 'anticancer'
    compare equal.
    """
    return str(text or '').lower().replace('-', '')


def search_by_therapeutic_activity(data, activity_type):
    """Search by therapeutic activity and render the matches as a table.

    A compound is listed once, with the first of its bioactivities whose
    'activity_type' contains *activity_type* (case- and hyphen-insensitive).

    Args:
        data: Mapping of compound id to compound record. Each record is read
            through its 'compound_name', 'molecular_formula', 'pubchem_cid'
            and 'bioactivities' keys.
        activity_type: Activity text to look for, e.g. 'antioxidant'.
    """
    st.subheader(f"🎯 Compounds with activity: {activity_type}")

    wanted = _activity_key(activity_type)
    matching_compounds = []
    for compound_data in data.values():
        for activity in compound_data.get('bioactivities') or []:
            if wanted in _activity_key(activity.get('activity_type')):
                matching_compounds.append({
                    'Compound': compound_data.get('compound_name', 'N/A'),
                    'Formula': compound_data.get('molecular_formula', 'N/A'),
                    'Activity': activity.get('activity_type', 'N/A'),
                    'CID': compound_data.get('pubchem_cid', 'N/A'),
                })
                break  # one row per compound: stop at its first match

    if matching_compounds:
        df = pd.DataFrame(matching_compounds)
        st.dataframe(df, use_container_width=True)
        st.info(f"📊 {len(matching_compounds)} compounds found with this activity")
    else:
        st.warning("No compounds found for this activity")
def display_main_statistics(data):
    """Display main statistics as four metrics.

    Shows the number of compounds, the total number of bioactivity entries,
    the number of distinct therapeutic areas found in the activity texts and
    the share of compounds with a truthy 'pubchem_cid'.

    Args:
        data: Mapping of compound id to compound record. Each record is read
            through its 'bioactivities' and 'pubchem_cid' keys. An empty
            mapping is accepted and reports 0.0% coverage.
    """
    # Therapeutic area -> substrings that identify it in an activity text.
    area_terms = {
        'anti-inflammatory': ('anti-inflammatory',),
        'antioxidant': ('antioxidant',),
        'cardiovascular': ('cardiovascular',),
        'neuroprotective': ('neuroprotective',),
        'anti-cancer': ('anti-cancer', 'anticancer'),
        'antimicrobial': ('antimicrobial',),
    }

    st.header("📈 PhytoAI Dataset Statistics")

    total_compounds = len(data)
    total_bioactivities = 0
    compounds_with_pubchem = 0
    therapeutic_areas = set()
    for compound_data in data.values():
        bioactivities = compound_data.get('bioactivities') or []
        total_bioactivities += len(bioactivities)
        if compound_data.get('pubchem_cid'):
            compounds_with_pubchem += 1
        for activity in bioactivities:
            activity_type = str(activity.get('activity_type') or '').lower()
            # Record the matched area itself, not the first word of the text,
            # so "potent antioxidant" counts as 'antioxidant'.
            therapeutic_areas.update(
                area
                for area, terms in area_terms.items()
                if any(term in activity_type for term in terms)
            )

    # Guard the percentage against an empty dataset (division by zero).
    if total_compounds:
        coverage = (compounds_with_pubchem / total_compounds) * 100
    else:
        coverage = 0.0

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("🧬 Total compounds", total_compounds)
    with col2:
        st.metric("🔬 Total bioactivities", f"{total_bioactivities:,}")
    with col3:
        st.metric("🎯 Therapeutic areas", len(therapeutic_areas))
    with col4:
        st.metric("📊 PubChem coverage", f"{coverage:.1f}%")
def create_visualizations(data):
    """Create interactive visualizations of the therapeutic activities.

    Every bioactivity entry is assigned to the first category whose search
    term appears in its 'activity_type'; entries matching no category are
    ignored. The per-category counts are drawn as a bar chart and a pie
    chart. Nothing is drawn below the header when no entry matches.

    Args:
        data: Mapping of compound id to compound record. Each record is read
            through its 'bioactivities' key.
    """
    # Ordered (label, search terms): the first matching category wins.
    categories = (
        ('Anti-inflammatory', ('anti-inflammatory',)),
        ('Antioxidant', ('antioxidant',)),
        ('Cardiovascular', ('cardiovascular',)),
        ('Neuroprotective', ('neuroprotective',)),
        ('Anti-cancer', ('anti-cancer', 'anticancer')),
        ('Antimicrobial', ('antimicrobial',)),
    )

    st.header("📊 Interactive Visualizations")

    # Therapeutic activity analysis
    activity_counts = {}
    for compound_data in data.values():
        for activity in compound_data.get('bioactivities') or []:
            activity_type = str(activity.get('activity_type') or '').lower()
            for label, terms in categories:
                if any(term in activity_type for term in terms):
                    activity_counts[label] = activity_counts.get(label, 0) + 1
                    break

    if not activity_counts:
        return

    names = list(activity_counts.keys())
    counts = list(activity_counts.values())

    col1, col2 = st.columns(2)
    with col1:
        # Bar chart. Counts are per bioactivity entry, so the axis says so.
        fig_bar = px.bar(
            x=names,
            y=counts,
            title="Distribution of Therapeutic Activities",
            labels={'x': 'Activity Type', 'y': 'Number of Bioactivities'},
            color=counts,
            color_continuous_scale="Viridis",
        )
        fig_bar.update_layout(showlegend=False)
        st.plotly_chart(fig_bar, use_container_width=True)
    with col2:
        # Pie chart
        fig_pie = px.pie(
            values=counts,
            names=names,
            title="Therapeutic Areas Distribution",
        )
        st.plotly_chart(fig_pie, use_container_width=True)
# Script entry point: build the page only when this file is executed as
# the main module, not when it is imported.
if __name__ == "__main__":
    main()