Spaces:

sagawa
/

ReactionT5

Running

App Files Files Community

sagawa committed 10 days ago

Commit

4a51867

verified ·

1 Parent(s): 8a0130f

Update app.py

Browse files

Files changed (1) hide show

app.py +227 -65

app.py CHANGED Viewed

@@ -2,14 +2,15 @@ import gc
 import os
 import sys
 import warnings
 import pandas as pd
 import streamlit as st
 import torch
 from torch.utils.data import DataLoader
-from tqdm import tqdm
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 sys.path.append(
     os.path.abspath(os.path.join(os.path.dirname(__file__), "task_forward"))
 )
@@ -23,76 +24,218 @@ from utils import seed_everything
 warnings.filterwarnings("ignore")
-st.title("ReactionT5 task forward")
-st.markdown("""
-##### Predict reaction products from your inputs.
-##### Upload a CSV that contains a `REACTANT` column. Optionally include `REAGENT`, `SOLVENT`, and/or `CATALYST`.
-##### If a field lists multiple compounds, separate them with a dot (`.`). For details, download **demo_reaction_data.csv** and check its contents.
-##### The output shows product SMILES and the sum of log-likelihoods for each prediction, sorted by log-likelihood (index 0 is the most probable).
-""")
 st.download_button(
     label="Download demo_reaction_data.csv",
-    data=pd.read_csv("data/demo_reaction_data.csv").to_csv(index=False),
     file_name="demo_reaction_data.csv",
     mime="text/csv",
 )
-class CFG:
-    num_beams = st.number_input(
-        label="num beams", min_value=1, max_value=10, value=5, step=1
     )
-    num_return_sequences = num_beams
-    input_data = st.file_uploader("Choose a CSV file")
-    model_name_or_path = "sagawa/ReactionT5v2-forward"
-    input_column = "input"
-    input_max_length = 400
-    output_max_length = 300
-    output_min_length = -1
-    model = "t5"
-    seed = 42
-    batch_size = 1
-if st.button("predict"):
-    with st.spinner(
-        "Now processing. If num beams=5, this process takes about 15 seconds per reaction."
-    ):
-        CFG.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        seed_everything(seed=CFG.seed)
-        CFG.tokenizer = AutoTokenizer.from_pretrained(
-            os.path.abspath(CFG.model_name_or_path)
-            if os.path.exists(CFG.model_name_or_path)
-            else CFG.model_name_or_path,
-            return_tensors="pt",
         )
-        model = AutoModelForSeq2SeqLM.from_pretrained(
-            os.path.abspath(CFG.model_name_or_path)
-            if os.path.exists(CFG.model_name_or_path)
-            else CFG.model_name_or_path
-        ).to(CFG.device)
-        model.eval()
-        input_data = pd.read_csv(CFG.input_data)
-        input_data = preprocess_df(input_data, drop_duplicates=False)
-        dataset = ReactionT5Dataset(CFG, input_data)
         dataloader = DataLoader(
             dataset,
             batch_size=CFG.batch_size,
             shuffle=False,
-            num_workers=4,
-            pin_memory=True,
             drop_last=False,
         )
         all_sequences, all_scores = [], []
-        for inputs in tqdm(dataloader, total=len(dataloader)):
-            inputs = {k: v.to(CFG.device) for k, v in inputs.items()}
             with torch.no_grad():
                 output = model.generate(
                     **inputs,
@@ -107,23 +250,42 @@ if st.button("predict"):
             all_sequences.extend(sequences)
             if scores:
                 all_scores.extend(scores)
             del output
-            torch.cuda.empty_cache()
             gc.collect()
-        output_df = save_multiple_predictions(
-            input_data, all_sequences, all_scores, CFG
-        )
-        @st.cache
-        def convert_df(df):
-            return df.to_csv(index=False)
-        csv = convert_df(output_df)
-        st.download_button(
-            label="Download data as CSV",
-            data=csv,
-            file_name="output.csv",
-            mime="text/csv",
-        )

 import os
 import sys
 import warnings
+from types import SimpleNamespace
 import pandas as pd
 import streamlit as st
 import torch
 from torch.utils.data import DataLoader
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+# Local imports
 sys.path.append(
     os.path.abspath(os.path.join(os.path.dirname(__file__), "task_forward"))
 )
 warnings.filterwarnings("ignore")
+# ------------------------------
+# Page setup
+# ------------------------------
+# Configure the browser tab title and select the wide page layout.
+st.set_page_config(
+    page_title="ReactionT5 — Product Prediction",
+    page_icon=None,
+    layout="wide",
+)
+# Page heading followed by a one-line description of the app.
+st.title("ReactionT5 — Product Prediction")
+st.caption(
+    "Predict reaction products from your inputs using a pretrained ReactionT5 model."
+)
+# Help panel (collapsed by default) describing the expected CSV columns and the output.
+with st.expander("How to format your CSV", expanded=False):
+    st.markdown(
+        """
+- Include a required `REACTANT` column.
+- Optional columns: `REAGENT`, `SOLVENT`, `CATALYST`.
+- If a field lists multiple compounds, separate them with a dot (`.`).
+- For details, download **demo_reaction_data.csv** and check its contents.
+- Output contains predicted product SMILES and the sum of log-likelihoods for each prediction, sorted by log-likelihood (index 0 is most probable).
+"""
+    )
+# ------------------------------
+# Demo data download
+# ------------------------------
+@st.cache_data(show_spinner=False)
+def load_demo_csv_as_bytes() -> bytes:
+    demo_df = pd.read_csv("data/demo_reaction_data.csv")
+    return demo_df.to_csv(index=False).encode("utf-8")
+# Offer the demo CSV (as produced by load_demo_csv_as_bytes) for download,
+# then draw a horizontal divider below the button.
 st.download_button(
     label="Download demo_reaction_data.csv",
+    data=load_demo_csv_as_bytes(),
     file_name="demo_reaction_data.csv",
     mime="text/csv",
+    use_container_width=True,
 )
+st.divider()
+# ------------------------------
+# Sidebar: configuration
+# ------------------------------
+# All user-tunable settings are collected in the sidebar; the resulting values
+# are read later when the run configuration and the DataLoader are built.
+with st.sidebar:
+    st.header("Configuration")
+    # Model identifier: defaults to the sagawa/ReactionT5v2-forward repo.
+    model_name_or_path = st.text_input(
+        "Model",
+        value="sagawa/ReactionT5v2-forward",
+        help="Hugging Face model repo or a local path.",
+    )
+    # Beam width, 1-10 (default 5).
+    num_beams = st.slider(
+        "Beam size",
+        min_value=1, max_value=10, value=5, step=1,
+        help="Number of beams for beam search.",
+    )
+    # Seed later passed to seed_everything; limited to the unsigned 32-bit range.
+    seed = st.number_input(
+        "Random seed",
+        min_value=0, max_value=2**32 - 1, value=42, step=1,
+        help="Seed for reproducibility.",
     )
+    # Less frequently changed settings, hidden in a collapsed expander.
+    with st.expander("Advanced generation", expanded=False):
+        input_max_length = st.number_input(
+            "Input max length", min_value=8, max_value=1024, value=400, step=8
         )
+        output_max_length = st.number_input(
+            "Output max length", min_value=8, max_value=1024, value=300, step=8
+        )
+        output_min_length = st.number_input(
+            "Output min length", min_value=-1, max_value=1024, value=-1, step=1,
+            help="Use -1 to let the model decide.",
+        )
+        batch_size = st.number_input(
+            "Batch size", min_value=1, max_value=16, value=1, step=1
+        )
+        num_workers = st.number_input(
+            "DataLoader workers", min_value=0, max_value=8, value=4, step=1,
+            help="Set to 0 if multiprocessing is restricted in your environment.",
+        )
+    # Use CUDA when PyTorch reports it as available, otherwise fall back to CPU,
+    # and show the chosen device type in the sidebar.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    st.caption(f"Detected device: **{device.type.upper()}**")
+# ------------------------------
+# Cached loaders
+# ------------------------------
+@st.cache_resource(show_spinner=False)
+def load_tokenizer(model_ref: str):
+    resolved = os.path.abspath(model_ref) if os.path.exists(model_ref) else model_ref
+    return AutoTokenizer.from_pretrained(resolved, return_tensors="pt")
+@st.cache_resource(show_spinner=True)
+def load_model(model_ref: str, device_str: str):
+    resolved = os.path.abspath(model_ref) if os.path.exists(model_ref) else model_ref
+    model = AutoModelForSeq2SeqLM.from_pretrained(resolved)
+    model.to(torch.device(device_str))
+    model.eval()
+    return model
+@st.cache_data(show_spinner=False)
+def df_to_csv_bytes(df: pd.DataFrame) -> bytes:
+    return df.to_csv(index=False).encode("utf-8")
+# ------------------------------
+# Main interaction
+# ------------------------------
+# Two top-aligned columns: the wider left one holds the upload form and the
+# input preview, the right one holds static notes.
+left, right = st.columns([1.4, 1.0], vertical_alignment="top")
+with left:
+    # The uploader and the "Predict" button share one form; `run` holds the
+    # submit button's return value and gates the inference section below.
+    with st.form("predict_form", clear_on_submit=False):
+        uploaded = st.file_uploader(
+            "Upload a CSV file with reactions",
+            type=["csv"],
+            accept_multiple_files=False,
+            help="Must contain a REACTANT column. Optional: REAGENT, SOLVENT, CATALYST.",
+        )
+        run = st.form_submit_button("Predict", use_container_width=True)
+    # Preview the first 20 rows of the upload; a parse failure is reported
+    # on the page instead of being raised.
+    if uploaded is not None:
+        try:
+            raw_df = pd.read_csv(uploaded)
+            st.subheader("Input preview")
+            st.dataframe(raw_df.head(20), use_container_width=True)
+        except Exception as e:
+            st.error(f"Failed to read CSV: {e}")
+with right:
+    # Guidance text; only the beam-size line is filled in from the sidebar value.
+    st.subheader("Notes")
+    st.markdown(
+        f"""
+- Beam size: **{num_beams}**
+- Approximate time: about **15 seconds per reaction** when `beam size = 5` (varies by hardware).
+- Results include the **sum of log-likelihoods** per prediction and are **sorted** by that value.
+"""
+    )
+    st.info(
+        "If you encounter CUDA OOM issues, reduce max lengths or beam size, or switch to CPU."
+    )
+# ------------------------------
+# Inference
+# ------------------------------
+if 'results_df' not in st.session_state:
+    st.session_state['results_df'] = None
+if 'last_error' not in st.session_state:
+    st.session_state['last_error'] = None
+# Everything below runs only in the script run triggered by the "Predict" button.
+if run:
+    if uploaded is None:
+        st.warning("Please upload a CSV file before running prediction.")
+    else:
+        # Build config object expected by your dataset/utils
+        # Sidebar values are cast to int here; num_return_sequences is set
+        # to the same value as num_beams.
+        CFG = SimpleNamespace(
+            num_beams=int(num_beams),
+            num_return_sequences=int(num_beams),  # tie to beams by default
+            model_name_or_path=model_name_or_path,
+            input_column="input",
+            input_max_length=int(input_max_length),
+            output_max_length=int(output_max_length),
+            output_min_length=int(output_min_length),
+            model="t5",
+            seed=int(seed),
+            batch_size=int(batch_size),
+        )
+        # Seed with the configured value before any model work starts.
+        seed_everything(seed=CFG.seed)
+        # Load model & tokenizer
+        with st.status("Loading model and tokenizer...", expanded=False) as status:
+            try:
+                tokenizer = load_tokenizer(CFG.model_name_or_path)
+                model = load_model(CFG.model_name_or_path, device.type)
+                status.update(label="Model ready.", state="complete")
+            except Exception as e:
+                st.session_state['last_error'] = f"Failed to load model: {e}"
+                status.update(label="Model load failed.", state="error")
+                st.stop()
+        # Prepare data
+        try:
+            input_df = pd.read_csv(uploaded)
+            input_df = preprocess_df(input_df, drop_duplicates=False)
+        except Exception as e:
+            st.error(f"Failed to preprocess input: {e}")
+            st.stop()
+        # Dataset & loader
+        # Batches are produced in input order (shuffle=False) and the final
+        # partial batch is kept (drop_last=False).
+        dataset = ReactionT5Dataset(CFG, input_df)
         dataloader = DataLoader(
             dataset,
             batch_size=CFG.batch_size,
             shuffle=False,
+            # worker count comes from the sidebar setting
+            num_workers=int(num_workers),
+            # pinned memory is requested only when the detected device is CUDA
+            pin_memory=(device.type == "cuda"),
             drop_last=False,
         )
+        # Generation loop with progress
         all_sequences, all_scores = [], []
+        total = len(dataloader)
+        progress = st.progress(0, text="Generating predictions...")
+        info_placeholder = st.empty()
+        for i, inputs in enumerate(dataloader, start=1):
+            inputs = {k: v.to(device) for k, v in inputs.items()}
             with torch.no_grad():
                 output = model.generate(
                     **inputs,
             all_sequences.extend(sequences)
             if scores:
                 all_scores.extend(scores)
             del output
+            if device.type == "cuda":
+                torch.cuda.empty_cache()
             gc.collect()
+            progress.progress(i / total, text=f"Generating predictions... {i}/{total}")
+            info_placeholder.caption(f"Processed batch {i} of {total}")
+        progress.empty()
+        info_placeholder.empty()
+        # Save predictions
+        # On success the assembled table is kept in session state so the
+        # results section can display it; on failure the message is stored,
+        # shown immediately, and the script run is halted.
+        try:
+            output_df = save_multiple_predictions(input_df, all_sequences, all_scores, CFG)
+            st.session_state['results_df'] = output_df
+            st.success("Prediction complete.")
+        except Exception as e:
+            st.session_state['last_error'] = f"Failed to assemble output: {e}"
+            st.error(st.session_state['last_error'])
+            st.stop()
+# ------------------------------
+# Results
+# ------------------------------
+# Show the prediction table held in session state (first 50 rows) and offer
+# the full table as a CSV download.
+if st.session_state.get('results_df') is not None:
+    st.subheader("Results preview")
+    st.dataframe(st.session_state['results_df'].head(50), use_container_width=True)
+    st.download_button(
+        label="Download predictions as CSV",
+        data=df_to_csv_bytes(st.session_state['results_df']),
+        file_name="output.csv",
+        mime="text/csv",
+        use_container_width=True,
+    )
+# Show the error message held in session state, if there is one.
+if st.session_state.get('last_error'):
+    st.error(st.session_state['last_error'])