Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,14 +2,15 @@ import gc
|
|
2 |
import os
|
3 |
import sys
|
4 |
import warnings
|
|
|
5 |
|
6 |
import pandas as pd
|
7 |
import streamlit as st
|
8 |
import torch
|
9 |
from torch.utils.data import DataLoader
|
10 |
-
from tqdm import tqdm
|
11 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
12 |
|
|
|
13 |
sys.path.append(
|
14 |
os.path.abspath(os.path.join(os.path.dirname(__file__), "task_forward"))
|
15 |
)
|
@@ -23,76 +24,218 @@ from utils import seed_everything
|
|
23 |
|
24 |
warnings.filterwarnings("ignore")
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
st.
|
28 |
-
st.markdown(
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
st.download_button(
|
36 |
label="Download demo_reaction_data.csv",
|
37 |
-
data=
|
38 |
file_name="demo_reaction_data.csv",
|
39 |
mime="text/csv",
|
|
|
40 |
)
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
)
|
47 |
-
num_return_sequences = num_beams
|
48 |
-
input_data = st.file_uploader("Choose a CSV file")
|
49 |
-
model_name_or_path = "sagawa/ReactionT5v2-forward"
|
50 |
-
input_column = "input"
|
51 |
-
input_max_length = 400
|
52 |
-
output_max_length = 300
|
53 |
-
output_min_length = -1
|
54 |
-
model = "t5"
|
55 |
-
seed = 42
|
56 |
-
batch_size = 1
|
57 |
-
|
58 |
-
|
59 |
-
if st.button("predict"):
|
60 |
-
with st.spinner(
|
61 |
-
"Now processing. If num beams=5, this process takes about 15 seconds per reaction."
|
62 |
-
):
|
63 |
-
|
64 |
-
CFG.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
65 |
-
|
66 |
-
seed_everything(seed=CFG.seed)
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
else CFG.model_name_or_path,
|
72 |
-
return_tensors="pt",
|
73 |
)
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
dataloader = DataLoader(
|
85 |
dataset,
|
86 |
batch_size=CFG.batch_size,
|
87 |
shuffle=False,
|
88 |
-
num_workers=
|
89 |
-
pin_memory=
|
90 |
drop_last=False,
|
91 |
)
|
92 |
|
|
|
93 |
all_sequences, all_scores = [], []
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
96 |
with torch.no_grad():
|
97 |
output = model.generate(
|
98 |
**inputs,
|
@@ -107,23 +250,42 @@ if st.button("predict"):
|
|
107 |
all_sequences.extend(sequences)
|
108 |
if scores:
|
109 |
all_scores.extend(scores)
|
|
|
110 |
del output
|
111 |
-
|
|
|
112 |
gc.collect()
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
|
|
|
|
117 |
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
-
|
125 |
-
|
126 |
-
data=csv,
|
127 |
-
file_name="output.csv",
|
128 |
-
mime="text/csv",
|
129 |
-
)
|
|
|
2 |
import os
|
3 |
import sys
|
4 |
import warnings
|
5 |
+
from types import SimpleNamespace
|
6 |
|
7 |
import pandas as pd
|
8 |
import streamlit as st
|
9 |
import torch
|
10 |
from torch.utils.data import DataLoader
|
|
|
11 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
12 |
|
13 |
+
# Local imports
|
14 |
sys.path.append(
|
15 |
os.path.abspath(os.path.join(os.path.dirname(__file__), "task_forward"))
|
16 |
)
|
|
|
24 |
|
25 |
warnings.filterwarnings("ignore")
|
26 |
|
27 |
+
# ------------------------------
|
28 |
+
# Page setup
|
29 |
+
# ------------------------------
|
30 |
+
st.set_page_config(
|
31 |
+
page_title="ReactionT5 — Product Prediction",
|
32 |
+
page_icon=None,
|
33 |
+
layout="wide",
|
34 |
+
)
|
35 |
+
|
36 |
+
st.title("ReactionT5 — Product Prediction")
|
37 |
+
st.caption(
|
38 |
+
"Predict reaction products from your inputs using a pretrained ReactionT5 model."
|
39 |
+
)
|
40 |
|
41 |
+
with st.expander("How to format your CSV", expanded=False):
|
42 |
+
st.markdown(
|
43 |
+
"""
|
44 |
+
- Include a required `REACTANT` column.
|
45 |
+
- Optional columns: `REAGENT`, `SOLVENT`, `CATALYST`.
|
46 |
+
- If a field lists multiple compounds, separate them with a dot (`.`).
|
47 |
+
- For details, download **demo_reaction_data.csv** and check its contents.
|
48 |
+
- Output contains predicted product SMILES and the sum of log-likelihoods for each prediction, sorted by log-likelihood (index 0 is most probable).
|
49 |
+
"""
|
50 |
+
)
|
51 |
+
|
52 |
+
# ------------------------------
|
53 |
+
# Demo data download
|
54 |
+
# ------------------------------
|
55 |
+
@st.cache_data(show_spinner=False)
|
56 |
+
def load_demo_csv_as_bytes() -> bytes:
|
57 |
+
demo_df = pd.read_csv("data/demo_reaction_data.csv")
|
58 |
+
return demo_df.to_csv(index=False).encode("utf-8")
|
59 |
|
60 |
st.download_button(
|
61 |
label="Download demo_reaction_data.csv",
|
62 |
+
data=load_demo_csv_as_bytes(),
|
63 |
file_name="demo_reaction_data.csv",
|
64 |
mime="text/csv",
|
65 |
+
use_container_width=True,
|
66 |
)
|
67 |
|
68 |
+
st.divider()
|
69 |
+
|
70 |
+
# ------------------------------
|
71 |
+
# Sidebar: configuration
|
72 |
+
# ------------------------------
|
73 |
+
with st.sidebar:
|
74 |
+
st.header("Configuration")
|
75 |
|
76 |
+
model_name_or_path = st.text_input(
|
77 |
+
"Model",
|
78 |
+
value="sagawa/ReactionT5v2-forward",
|
79 |
+
help="Hugging Face model repo or a local path.",
|
80 |
+
)
|
81 |
+
|
82 |
+
num_beams = st.slider(
|
83 |
+
"Beam size",
|
84 |
+
min_value=1, max_value=10, value=5, step=1,
|
85 |
+
help="Number of beams for beam search.",
|
86 |
+
)
|
87 |
+
|
88 |
+
seed = st.number_input(
|
89 |
+
"Random seed",
|
90 |
+
min_value=0, max_value=2**32 - 1, value=42, step=1,
|
91 |
+
help="Seed for reproducibility.",
|
92 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
+
with st.expander("Advanced generation", expanded=False):
|
95 |
+
input_max_length = st.number_input(
|
96 |
+
"Input max length", min_value=8, max_value=1024, value=400, step=8
|
|
|
|
|
97 |
)
|
98 |
+
output_max_length = st.number_input(
|
99 |
+
"Output max length", min_value=8, max_value=1024, value=300, step=8
|
100 |
+
)
|
101 |
+
output_min_length = st.number_input(
|
102 |
+
"Output min length", min_value=-1, max_value=1024, value=-1, step=1,
|
103 |
+
help="Use -1 to let the model decide.",
|
104 |
+
)
|
105 |
+
batch_size = st.number_input(
|
106 |
+
"Batch size", min_value=1, max_value=16, value=1, step=1
|
107 |
+
)
|
108 |
+
num_workers = st.number_input(
|
109 |
+
"DataLoader workers", min_value=0, max_value=8, value=4, step=1,
|
110 |
+
help="Set to 0 if multiprocessing is restricted in your environment.",
|
111 |
+
)
|
112 |
+
|
113 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
114 |
+
st.caption(f"Detected device: **{device.type.upper()}**")
|
115 |
+
|
116 |
+
# ------------------------------
|
117 |
+
# Cached loaders
|
118 |
+
# ------------------------------
|
119 |
+
@st.cache_resource(show_spinner=False)
|
120 |
+
def load_tokenizer(model_ref: str):
|
121 |
+
resolved = os.path.abspath(model_ref) if os.path.exists(model_ref) else model_ref
|
122 |
+
return AutoTokenizer.from_pretrained(resolved, return_tensors="pt")
|
123 |
+
|
124 |
+
@st.cache_resource(show_spinner=True)
|
125 |
+
def load_model(model_ref: str, device_str: str):
|
126 |
+
resolved = os.path.abspath(model_ref) if os.path.exists(model_ref) else model_ref
|
127 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(resolved)
|
128 |
+
model.to(torch.device(device_str))
|
129 |
+
model.eval()
|
130 |
+
return model
|
131 |
+
|
132 |
+
@st.cache_data(show_spinner=False)
|
133 |
+
def df_to_csv_bytes(df: pd.DataFrame) -> bytes:
|
134 |
+
return df.to_csv(index=False).encode("utf-8")
|
135 |
+
|
136 |
+
# ------------------------------
|
137 |
+
# Main interaction
|
138 |
+
# ------------------------------
|
139 |
+
left, right = st.columns([1.4, 1.0], vertical_alignment="top")
|
140 |
+
|
141 |
+
with left:
|
142 |
+
with st.form("predict_form", clear_on_submit=False):
|
143 |
+
uploaded = st.file_uploader(
|
144 |
+
"Upload a CSV file with reactions",
|
145 |
+
type=["csv"],
|
146 |
+
accept_multiple_files=False,
|
147 |
+
help="Must contain a REACTANT column. Optional: REAGENT, SOLVENT, CATALYST.",
|
148 |
+
)
|
149 |
+
run = st.form_submit_button("Predict", use_container_width=True)
|
150 |
+
|
151 |
+
if uploaded is not None:
|
152 |
+
try:
|
153 |
+
raw_df = pd.read_csv(uploaded)
|
154 |
+
st.subheader("Input preview")
|
155 |
+
st.dataframe(raw_df.head(20), use_container_width=True)
|
156 |
+
except Exception as e:
|
157 |
+
st.error(f"Failed to read CSV: {e}")
|
158 |
+
|
159 |
+
with right:
|
160 |
+
st.subheader("Notes")
|
161 |
+
st.markdown(
|
162 |
+
f"""
|
163 |
+
- Beam size: **{num_beams}**
|
164 |
+
- Approximate time: about **15 seconds per reaction** when `beam size = 5` (varies by hardware).
|
165 |
+
- Results include the **sum of log-likelihoods** per prediction and are **sorted** by that value.
|
166 |
+
"""
|
167 |
+
)
|
168 |
+
st.info(
|
169 |
+
"If you encounter CUDA OOM issues, reduce max lengths or beam size, or switch to CPU."
|
170 |
+
)
|
171 |
+
|
172 |
+
# ------------------------------
|
173 |
+
# Inference
|
174 |
+
# ------------------------------
|
175 |
+
if 'results_df' not in st.session_state:
|
176 |
+
st.session_state['results_df'] = None
|
177 |
+
|
178 |
+
if 'last_error' not in st.session_state:
|
179 |
+
st.session_state['last_error'] = None
|
180 |
+
|
181 |
+
if run:
|
182 |
+
if uploaded is None:
|
183 |
+
st.warning("Please upload a CSV file before running prediction.")
|
184 |
+
else:
|
185 |
+
# Build config object expected by your dataset/utils
|
186 |
+
CFG = SimpleNamespace(
|
187 |
+
num_beams=int(num_beams),
|
188 |
+
num_return_sequences=int(num_beams), # tie to beams by default
|
189 |
+
model_name_or_path=model_name_or_path,
|
190 |
+
input_column="input",
|
191 |
+
input_max_length=int(input_max_length),
|
192 |
+
output_max_length=int(output_max_length),
|
193 |
+
output_min_length=int(output_min_length),
|
194 |
+
model="t5",
|
195 |
+
seed=int(seed),
|
196 |
+
batch_size=int(batch_size),
|
197 |
+
)
|
198 |
+
|
199 |
+
seed_everything(seed=CFG.seed)
|
200 |
+
|
201 |
+
# Load model & tokenizer
|
202 |
+
with st.status("Loading model and tokenizer...", expanded=False) as status:
|
203 |
+
try:
|
204 |
+
tokenizer = load_tokenizer(CFG.model_name_or_path)
|
205 |
+
model = load_model(CFG.model_name_or_path, device.type)
|
206 |
+
status.update(label="Model ready.", state="complete")
|
207 |
+
except Exception as e:
|
208 |
+
st.session_state['last_error'] = f"Failed to load model: {e}"
|
209 |
+
status.update(label="Model load failed.", state="error")
|
210 |
+
st.stop()
|
211 |
+
|
212 |
+
# Prepare data
|
213 |
+
try:
|
214 |
+
input_df = pd.read_csv(uploaded)
|
215 |
+
input_df = preprocess_df(input_df, drop_duplicates=False)
|
216 |
+
except Exception as e:
|
217 |
+
st.error(f"Failed to preprocess input: {e}")
|
218 |
+
st.stop()
|
219 |
+
|
220 |
+
# Dataset & loader
|
221 |
+
dataset = ReactionT5Dataset(CFG, input_df)
|
222 |
dataloader = DataLoader(
|
223 |
dataset,
|
224 |
batch_size=CFG.batch_size,
|
225 |
shuffle=False,
|
226 |
+
num_workers=int(num_workers),
|
227 |
+
pin_memory=(device.type == "cuda"),
|
228 |
drop_last=False,
|
229 |
)
|
230 |
|
231 |
+
# Generation loop with progress
|
232 |
all_sequences, all_scores = [], []
|
233 |
+
total = len(dataloader)
|
234 |
+
progress = st.progress(0, text="Generating predictions...")
|
235 |
+
info_placeholder = st.empty()
|
236 |
+
|
237 |
+
for i, inputs in enumerate(dataloader, start=1):
|
238 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
239 |
with torch.no_grad():
|
240 |
output = model.generate(
|
241 |
**inputs,
|
|
|
250 |
all_sequences.extend(sequences)
|
251 |
if scores:
|
252 |
all_scores.extend(scores)
|
253 |
+
|
254 |
del output
|
255 |
+
if device.type == "cuda":
|
256 |
+
torch.cuda.empty_cache()
|
257 |
gc.collect()
|
258 |
|
259 |
+
progress.progress(i / total, text=f"Generating predictions... {i}/{total}")
|
260 |
+
info_placeholder.caption(f"Processed batch {i} of {total}")
|
261 |
+
|
262 |
+
progress.empty()
|
263 |
+
info_placeholder.empty()
|
264 |
|
265 |
+
# Save predictions
|
266 |
+
try:
|
267 |
+
output_df = save_multiple_predictions(input_df, all_sequences, all_scores, CFG)
|
268 |
+
st.session_state['results_df'] = output_df
|
269 |
+
st.success("Prediction complete.")
|
270 |
+
except Exception as e:
|
271 |
+
st.session_state['last_error'] = f"Failed to assemble output: {e}"
|
272 |
+
st.error(st.session_state['last_error'])
|
273 |
+
st.stop()
|
274 |
|
275 |
+
# ------------------------------
|
276 |
+
# Results
|
277 |
+
# ------------------------------
|
278 |
+
if st.session_state.get('results_df') is not None:
|
279 |
+
st.subheader("Results preview")
|
280 |
+
st.dataframe(st.session_state['results_df'].head(50), use_container_width=True)
|
281 |
+
|
282 |
+
st.download_button(
|
283 |
+
label="Download predictions as CSV",
|
284 |
+
data=df_to_csv_bytes(st.session_state['results_df']),
|
285 |
+
file_name="output.csv",
|
286 |
+
mime="text/csv",
|
287 |
+
use_container_width=True,
|
288 |
+
)
|
289 |
|
290 |
+
if st.session_state.get('last_error'):
|
291 |
+
st.error(st.session_state['last_error'])
|
|
|
|
|
|
|
|