import json

import pandas as pd
import plotly.express as px
import streamlit as st
from huggingface_hub import hf_hub_download

st.set_page_config(
    page_title="PhytoAI Assistant",
    page_icon="🌿",
    layout="wide"
)

# Location of the dataset on the Hugging Face Hub.
DATASET_REPO_ID = "Gatescrispy/phytoai-mega-dataset"
DATASET_FILENAME = "mega_final_dataset.json"

# Therapeutic area -> lowercase spellings that identify it inside an
# ``activity_type`` string. The keys are also the options offered in the
# sidebar. "anticancer" is accepted as an alternate spelling of "anti-cancer".
THERAPEUTIC_AREAS = {
    "anti-inflammatory": ("anti-inflammatory",),
    "antioxidant": ("antioxidant",),
    "cardiovascular": ("cardiovascular",),
    "neuroprotective": ("neuroprotective",),
    "anti-cancer": ("anti-cancer", "anticancer"),
    "antimicrobial": ("antimicrobial",),
}

# Maximum number of compounds (and of bioactivities per compound) listed in
# the name-search results.
MAX_LISTED = 5


def _as_text(value):
    """Return *value* as a string, mapping ``None`` (JSON null) to ``''``."""
    return "" if value is None else str(value)


def _bioactivities(compound_data):
    """Return a compound's bioactivity records; missing or null gives ``[]``."""
    return compound_data.get('bioactivities') or []


def _activity_matches(activity, spellings):
    """Return True if the record's activity type contains any of *spellings*.

    The comparison is case-insensitive; *spellings* must already be lowercase.
    """
    activity_type = _as_text(activity.get('activity_type')).lower()
    return any(spelling in activity_type for spelling in spellings)


@st.cache_data
def _read_dataset():
    """Download and parse the dataset, raising on any failure.

    Errors are deliberately NOT handled here: an exception raised inside the
    cached function is not stored in the cache, so a transient failure is
    retried on the next rerun instead of a ``None`` result being served until
    the cache is cleared.

    Raises:
        ValueError: if the JSON top level is not an object (every consumer in
            this file iterates it with ``.items()`` / ``.values()``).
        Exception: whatever the download or JSON parsing raises.
    """
    dataset_path = hf_hub_download(
        repo_id=DATASET_REPO_ID,
        filename=DATASET_FILENAME,
        repo_type="dataset"
    )
    # Explicit encoding: the default for open() depends on the locale.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError(
            f"expected a JSON object keyed by compound id, got {type(data).__name__}"
        )
    return data


def load_phytoai_data():
    """Load PhytoAI data from HF dataset.

    Returns:
        The parsed dataset (a dict), or ``None`` if it could not be loaded, in
        which case the error has already been reported with ``st.error``.
    """
    try:
        return _read_dataset()
    except Exception as e:  # UI boundary: report the failure, don't crash the page
        st.error(f"Data loading error: {e}")
        return None


def main():
    """Render the page: search sidebar, dataset statistics and charts."""
    st.title("🌿 PhytoAI Assistant")
    st.markdown("### AI Assistant for Phytotherapy Research")
    st.markdown("---")

    # Load data
    with st.spinner("Loading PhytoAI data..."):
        data = load_phytoai_data()

    if data is None:
        st.error("❌ Unable to load PhytoAI data")
        st.info("The dataset will be available once uploaded to Hugging Face")

        # Demo data
        st.subheader("📊 PhytoAI Dataset Preview")
        st.write("**Dataset content:**")
        st.write("• 352 unique natural compounds")
        st.write("• 1,314 documented bioactivities")
        st.write("• Sources: PubChem, ChEMBL, scientific literature")
        return

    # Search interface
    st.sidebar.header("🔍 Compound Search")

    search_type = st.sidebar.selectbox(
        "Search type:",
        ["Compound name", "Therapeutic activity"]
    )

    if search_type == "Compound name":
        compound_search = st.sidebar.text_input(
            "Compound name",
            placeholder="curcumin, resveratrol, quercetin..."
        )
        if compound_search:
            search_compounds_by_name(data, compound_search)

    elif search_type == "Therapeutic activity":
        # The leading empty option means "nothing selected yet".
        activity_search = st.sidebar.selectbox(
            "Select an activity:",
            ["", *THERAPEUTIC_AREAS]
        )
        if activity_search:
            search_by_therapeutic_activity(data, activity_search)

    # Main statistics
    display_main_statistics(data)

    # Visualizations
    create_visualizations(data)

    # Footer
    st.markdown("---")
    st.markdown("**🌿 PhytoAI** - AI Assistant for Phytotherapy Research")
    st.markdown("📊 [PhytoAI Dataset](https://huggingface.co/datasets/Gatescrispy/phytoai-mega-dataset) | 🔬 Research & Development")


def search_compounds_by_name(data, search_term):
    """Search by compound name.

    Shows up to ``MAX_LISTED`` compounds whose ``compound_name`` contains
    *search_term* (case-insensitive, surrounding whitespace ignored), each in
    an expander with its molecular properties and first bioactivities.

    Args:
        data: dataset dict mapping compound id to a compound record.
        search_term: text typed by the user.
    """
    st.subheader(f"🔍 Results for '{search_term}'")

    needle = search_term.strip().lower()
    results = []
    if needle:  # a blank query would otherwise match every compound
        for compound_id, compound_data in data.items():
            compound_name = _as_text(compound_data.get('compound_name')).lower()
            if needle in compound_name:
                results.append((compound_id, compound_data))

    if not results:
        st.info("No compounds found for this search")
        return

    if len(results) > MAX_LISTED:
        # Make the truncation visible instead of silently dropping matches.
        st.caption(f"Showing the first {MAX_LISTED} of {len(results)} matching compounds")

    for _compound_id, compound_data in results[:MAX_LISTED]:
        with st.expander(f"🧬 {compound_data.get('compound_name', 'Unknown compound')}"):
            col1, col2 = st.columns(2)

            with col1:
                st.write("**Molecular Properties:**")
                st.write(f"• Formula: `{compound_data.get('molecular_formula', 'N/A')}`")
                st.write(f"• SMILES: `{compound_data.get('smiles', 'N/A')}`")
                st.write(f"• PubChem CID: `{compound_data.get('pubchem_cid', 'N/A')}`")

            with col2:
                st.write("**Bioactivities:**")
                bioactivities = _bioactivities(compound_data)
                for activity in bioactivities[:MAX_LISTED]:
                    st.write(f"• {activity.get('activity_type', 'N/A')}")
                remaining = len(bioactivities) - MAX_LISTED
                if remaining > 0:
                    st.write(f"... and {remaining} others")


def search_by_therapeutic_activity(data, activity_type):
    """Search by therapeutic activity.

    Lists, as a table, every compound with at least one bioactivity whose
    ``activity_type`` contains *activity_type* (case-insensitive). For areas
    listed in ``THERAPEUTIC_AREAS`` the alternate spellings are matched too.
    Each compound appears once, with its first matching activity.

    Args:
        data: dataset dict mapping compound id to a compound record.
        activity_type: therapeutic activity to look for.
    """
    st.subheader(f"🎯 Compounds with activity: {activity_type}")

    wanted = activity_type.lower()
    spellings = THERAPEUTIC_AREAS.get(wanted, (wanted,))

    matching_compounds = []
    for compound_data in data.values():
        first_match = next(
            (
                activity
                for activity in _bioactivities(compound_data)
                if _activity_matches(activity, spellings)
            ),
            None,
        )
        if first_match is None:
            continue
        matching_compounds.append({
            'Compound': compound_data.get('compound_name', 'N/A'),
            'Formula': compound_data.get('molecular_formula', 'N/A'),
            'Activity': first_match.get('activity_type', 'N/A'),
            'CID': compound_data.get('pubchem_cid', 'N/A')
        })

    if matching_compounds:
        df = pd.DataFrame(matching_compounds)
        st.dataframe(df, use_container_width=True)
        st.info(f"📊 {len(matching_compounds)} compounds found with this activity")
    else:
        st.warning("No compounds found for this activity")


def display_main_statistics(data):
    """Display main statistics.

    Renders four metrics: number of compounds, number of bioactivity records,
    number of distinct therapeutic areas (from ``THERAPEUTIC_AREAS``) seen in
    the data, and the share of compounds that have a truthy ``pubchem_cid``.

    Args:
        data: dataset dict mapping compound id to a compound record.
    """
    st.header("📈 PhytoAI Dataset Statistics")

    total_compounds = len(data)
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("🧬 Total compounds", total_compounds)

    with col2:
        total_bioactivities = sum(len(_bioactivities(comp)) for comp in data.values())
        st.metric("🔬 Total bioactivities", f"{total_bioactivities:,}")

    with col3:
        # Record the matched area itself; taking the first word of the
        # activity text would count e.g. "strong antioxidant" as "strong".
        therapeutic_areas = set()
        for compound_data in data.values():
            for activity in _bioactivities(compound_data):
                for area, spellings in THERAPEUTIC_AREAS.items():
                    if _activity_matches(activity, spellings):
                        therapeutic_areas.add(area)
        st.metric("🎯 Therapeutic areas", len(therapeutic_areas))

    with col4:
        compounds_with_pubchem = sum(1 for comp in data.values() if comp.get('pubchem_cid'))
        # An empty dataset has 0% coverage rather than a division by zero.
        coverage = (compounds_with_pubchem / total_compounds) * 100 if total_compounds else 0.0
        st.metric("📊 PubChem coverage", f"{coverage:.1f}%")


# Chart label -> lowercase substrings that place an activity in that category.
# Order matters: an activity is counted under the FIRST category it matches.
_CHART_CATEGORIES = (
    ('Anti-inflammatory', ('anti-inflammatory',)),
    ('Antioxidant', ('antioxidant',)),
    ('Cardiovascular', ('cardiovascular',)),
    ('Neuroprotective', ('neuroprotective',)),
    ('Anti-cancer', ('anti-cancer', 'anticancer')),
    ('Antimicrobial', ('antimicrobial',)),
)


def count_activity_categories(data):
    """Count bioactivity records per therapeutic category.

    Every bioactivity of every compound is matched (case-insensitively)
    against ``_CHART_CATEGORIES`` and counted under the first category whose
    substring it contains; records matching no category are ignored. Missing
    or null ``bioactivities`` / ``activity_type`` fields are treated as empty.

    Args:
        data: dataset dict mapping compound id to a compound record.

    Returns:
        dict mapping category label to number of matching records, in
        first-seen order. Categories with no match are absent.
    """
    counts = {}
    for compound_data in data.values():
        for activity in compound_data.get('bioactivities') or []:
            activity_type = str(activity.get('activity_type') or '').lower()
            for label, needles in _CHART_CATEGORIES:
                if any(needle in activity_type for needle in needles):
                    counts[label] = counts.get(label, 0) + 1
                    break
    return counts


def create_visualizations(data):
    """Create interactive visualizations.

    Draws a bar chart and a pie chart of the per-category bioactivity counts
    side by side, or an info message when no activity falls in any category.

    Args:
        data: dataset dict mapping compound id to a compound record.
    """
    st.header("📊 Interactive Visualizations")

    # Therapeutic activity analysis
    activity_counts = count_activity_categories(data)
    if not activity_counts:
        st.info("No categorized therapeutic activities to display")
        return

    categories = list(activity_counts.keys())
    totals = list(activity_counts.values())

    col1, col2 = st.columns(2)

    with col1:
        # Bar chart
        # NOTE(review): the y values are bioactivity records, not distinct
        # compounds, so the 'Number of Compounds' label may be misleading;
        # kept as in the original, confirm the intended wording.
        fig_bar = px.bar(
            x=categories,
            y=totals,
            title="Distribution of Therapeutic Activities",
            labels={'x': 'Activity Type', 'y': 'Number of Compounds'},
            color=totals,
            color_continuous_scale="Viridis"
        )
        fig_bar.update_layout(showlegend=False)
        st.plotly_chart(fig_bar, use_container_width=True)

    with col2:
        # Pie chart
        fig_pie = px.pie(
            values=totals,
            names=categories,
            title="Therapeutic Areas Distribution"
        )
        st.plotly_chart(fig_pie, use_container_width=True)


if __name__ == "__main__":
    main()