Spaces:
Running
Running
Zhaohan Meng
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,81 +1,36 @@
|
|
1 |
-
|
2 |
-
import
|
3 |
-
|
4 |
-
|
5 |
-
_orig_get_type = _gc_utils.get_type
|
6 |
-
_orig_json2py = _gc_utils._json_schema_to_python_type
|
7 |
-
|
8 |
-
def _patched_get_type(schema):
    """Call the saved original ``get_type`` with boolean schemas replaced by ``{}``.

    Any ``bool`` schema is swapped for an empty dict before delegating;
    every other value is forwarded unchanged.
    """
    effective_schema = {} if isinstance(schema, bool) else schema
    return _orig_get_type(effective_schema)
|
13 |
-
|
14 |
-
def _patched_json_schema_to_python_type(schema, defs=None):
    """Call the saved original converter, treating a boolean schema as ``{}``.

    ``defs`` is passed through to the original function untouched.
    """
    if not isinstance(schema, bool):
        return _orig_json2py(schema, defs)
    # A bare True/False schema is handled as if it were an empty dict.
    return _orig_json2py({}, defs)
|
19 |
-
|
20 |
-
_gc_utils.get_type = _patched_get_type
|
21 |
-
_gc_utils._json_schema_to_python_type = _patched_json_schema_to_python_type
|
22 |
-
|
23 |
-
# โโโ now itโs safe to import Gradio and build your interface โโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
24 |
-
import gradio as gr
|
25 |
-
import os
|
26 |
-
import sys
|
27 |
-
import argparse
|
28 |
-
import tempfile
|
29 |
-
import shutil
|
30 |
-
import base64
|
31 |
-
import io
|
32 |
-
|
33 |
-
import torch
|
34 |
import selfies
|
35 |
from rdkit import Chem
|
|
|
|
|
|
|
36 |
import matplotlib
|
37 |
matplotlib.use("Agg")
|
38 |
import matplotlib.pyplot as plt
|
39 |
from matplotlib import cm
|
40 |
from typing import Optional
|
41 |
|
42 |
-
from transformers import EsmForMaskedLM, EsmTokenizer, AutoModel
|
43 |
-
from torch.utils.data import DataLoader
|
44 |
-
from Bio.PDB import PDBParser, MMCIFParser
|
45 |
-
from Bio.Data import IUPACData
|
46 |
-
|
47 |
from utils.drug_tokenizer import DrugTokenizer
|
|
|
48 |
from utils.metric_learning_models_att_maps import Pre_encoded, FusionDTI
|
49 |
from utils.foldseek_util import get_struc_seq
|
50 |
|
51 |
-
# โโโโโ
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
def simple_seq_from_structure(path: str) -> str:
    """Return a residue string for the chain with the most residues in *path*.

    Files whose name ends in ``.cif`` are read with ``MMCIFParser``; everything
    else is read with ``PDBParser``. Each residue name is upper-cased and
    looked up in ``three2one``; names missing from that mapping become ``"X"``.
    An empty string is returned when the structure has no chains.
    """
    if path.endswith(".cif"):
        parser = MMCIFParser(QUIET=True)
    else:
        parser = PDBParser(QUIET=True)

    all_chains = list(parser.get_structure("P", path).get_chains())
    if len(all_chains) == 0:
        return ""

    def residue_count(candidate):
        return len(list(candidate.get_residues()))

    # max() keeps the first chain on ties, same as the lambda-keyed original.
    longest = max(all_chains, key=residue_count)
    letters = [three2one.get(res.get_resname().upper(), "X") for res in longest]
    return "".join(letters)
|
63 |
-
|
64 |
-
def smiles_to_selfies(smiles: str) -> Optional[str]:
    """Encode a SMILES string as SELFIES, or return ``None`` on any failure.

    ``None`` is returned when RDKit's ``Chem.MolFromSmiles`` yields ``None``
    or when parsing/encoding raises any ``Exception``.
    """
    try:
        parsed = Chem.MolFromSmiles(smiles)
        encoded = None if parsed is None else selfies.encoder(smiles)
    except Exception:
        encoded = None
    return encoded
|
72 |
|
73 |
def parse_config():
|
74 |
p = argparse.ArgumentParser()
|
|
|
75 |
p.add_argument("--prot_encoder_path", default="westlake-repl/SaProt_650M_AF2")
|
76 |
p.add_argument("--drug_encoder_path", default="HUBioDataLab/SELFormer")
|
77 |
-
p.add_argument("--agg_mode", type=str,
|
78 |
p.add_argument("--group_size", type=int, default=1)
|
|
|
79 |
p.add_argument("--fusion", default="CAN")
|
80 |
p.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
|
81 |
p.add_argument("--save_path_prefix", default="save_model_ckp/")
|
@@ -85,13 +40,16 @@ def parse_config():
|
|
85 |
args = parse_config()
|
86 |
DEVICE = args.device
|
87 |
|
88 |
-
# โโโโโ
|
89 |
prot_tokenizer = EsmTokenizer.from_pretrained(args.prot_encoder_path)
|
90 |
prot_model = EsmForMaskedLM.from_pretrained(args.prot_encoder_path)
|
91 |
-
|
|
|
92 |
drug_model = AutoModel.from_pretrained(args.drug_encoder_path)
|
93 |
-
encoding = Pre_encoded(prot_model, drug_model, args).to(DEVICE)
|
94 |
|
|
|
|
|
|
|
95 |
def collate_fn(batch):
|
96 |
query1, query2, scores = zip(*batch)
|
97 |
|
@@ -117,8 +75,20 @@ def collate_fn(batch):
|
|
117 |
attention_mask2 = query_encodings2["attention_mask"].bool()
|
118 |
|
119 |
return query_encodings1["input_ids"], attention_mask1, query_encodings2["input_ids"], attention_mask2, scores
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
|
|
|
122 |
def get_case_feature(model, loader):
|
123 |
model.eval()
|
124 |
with torch.no_grad():
|
@@ -130,12 +100,17 @@ def get_case_feature(model, loader):
|
|
130 |
p_ids.cpu(), d_ids.cpu(),
|
131 |
p_mask.cpu(), d_mask.cpu(), None)]
|
132 |
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
135 |
def visualize_attention(model, feats, drug_idx: Optional[int] = None) -> str:
|
136 |
"""
|
137 |
Render a Protein โ Drug cross-attention heat-map and, optionally, a
|
138 |
-
Top-
|
139 |
|
140 |
The token index shown on the x-axis (and accepted via *drug_idx*) is **the
|
141 |
position of that token in the *original* drug sequence**, *after* the
|
@@ -234,8 +209,8 @@ def visualize_attention(model, feats, drug_idx: Optional[int] = None) -> str:
|
|
234 |
plt.close(fig)
|
235 |
html = f'<img src="data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}" />'
|
236 |
|
237 |
-
# โโโโโโโโโโโโโโโโโโโโโ Top-
|
238 |
-
table_html = ""
|
239 |
if drug_idx is not None:
|
240 |
# map original 0-based drug_idx โ current column position
|
241 |
if (drug_idx + 1) in d_indices:
|
@@ -247,7 +222,7 @@ def visualize_attention(model, feats, drug_idx: Optional[int] = None) -> str:
|
|
247 |
|
248 |
if col_pos is not None:
|
249 |
col_vec = attn[:, col_pos]
|
250 |
-
topk = torch.topk(col_vec, k=min(
|
251 |
|
252 |
rank_hdr = "".join(f"<th>{r+1}</th>" for r in range(len(topk)))
|
253 |
res_row = "".join(f"<td>{p_tokens[i]}</td>" for i in topk)
|
@@ -255,58 +230,24 @@ def visualize_attention(model, feats, drug_idx: Optional[int] = None) -> str:
|
|
255 |
|
256 |
drug_tok_text = d_tokens[col_pos]
|
257 |
orig_idx = d_indices[col_pos]
|
258 |
-
|
259 |
-
# 1) build the header row: leading โRankโ, then 1โฆ30
|
260 |
-
header_cells = (
|
261 |
-
"<th style='border:1px solid #ccc; padding:6px; "
|
262 |
-
"background:#f7f7f7; text-align:center;'>Rank</th>"
|
263 |
-
+ "".join(
|
264 |
-
f"<th style='border:1px solid #ccc; padding:6px; "
|
265 |
-
f"background:#f7f7f7; text-align:center'>{r+1}</th>"
|
266 |
-
for r in range(len(topk))
|
267 |
-
)
|
268 |
-
)
|
269 |
-
|
270 |
-
# 2) build the residue row: leading โResidueโ, then the residue tokens
|
271 |
-
residue_cells = (
|
272 |
-
"<th style='border:1px solid #ccc; padding:6px; "
|
273 |
-
"background:#f7f7f7; text-align:center;'>Residue</th>"
|
274 |
-
+ "".join(
|
275 |
-
f"<td style='border:1px solid #ccc; padding:6px; "
|
276 |
-
f"text-align:center'>{p_tokens[i]}</td>"
|
277 |
-
for i in topk
|
278 |
-
)
|
279 |
-
)
|
280 |
-
|
281 |
-
# 3) build the position row: leading โPositionโ, then the residue positions
|
282 |
-
position_cells = (
|
283 |
-
"<th style='border:1px solid #ccc; padding:6px; "
|
284 |
-
"background:#f7f7f7; text-align:center;'>Position</th>"
|
285 |
-
+ "".join(
|
286 |
-
f"<td style='border:1px solid #ccc; padding:6px; "
|
287 |
-
f"text-align:center'>{p_indices[i]}</td>"
|
288 |
-
for i in topk
|
289 |
-
)
|
290 |
-
)
|
291 |
-
|
292 |
-
# 4) assemble your table_html
|
293 |
-
table_html = (
|
294 |
-
f"<h4 style='margin-bottom:12px'>"
|
295 |
-
f"Drug atom #{orig_idx} <code>{drug_tok_text}</code> โ Top-30 Protein residues"
|
296 |
-
f"</h4>"
|
297 |
-
f"<table style='border-collapse:collapse; margin:0 auto 24px;'>"
|
298 |
-
f"<tr>{header_cells}</tr>"
|
299 |
-
f"<tr>{residue_cells}</tr>"
|
300 |
-
f"<tr>{position_cells}</tr>"
|
301 |
-
f"</table>"
|
302 |
-
)
|
303 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
304 |
buf_png = io.BytesIO()
|
305 |
-
fig.savefig(buf_png, format="png", dpi=140)
|
306 |
buf_png.seek(0)
|
307 |
|
308 |
buf_pdf = io.BytesIO()
|
309 |
-
fig.savefig(buf_pdf, format="pdf")
|
310 |
buf_pdf.seek(0)
|
311 |
plt.close(fig)
|
312 |
|
@@ -314,253 +255,228 @@ def visualize_attention(model, feats, drug_idx: Optional[int] = None) -> str:
|
|
314 |
pdf_b64 = base64.b64encode(buf_pdf.getvalue()).decode()
|
315 |
|
316 |
html_heat = (
|
317 |
-
f"<
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
"text-decoration: none;'>"
|
325 |
-
"Download PDF"
|
326 |
-
"</a>"
|
327 |
-
# the clickable heatโmap image
|
328 |
-
f"<a href='data:image/png;base64,{png_b64}' target='_blank' title='Click to enlarge'>"
|
329 |
-
f"<img src='data:image/png;base64,{png_b64}' "
|
330 |
-
"style='display: block; width: 100%; height: auto; cursor: zoom-in;'/>"
|
331 |
-
"</a>"
|
332 |
-
"</div>"
|
333 |
)
|
334 |
|
|
|
335 |
return table_html + html_heat
|
336 |
-
|
337 |
-
# โโโโโ Gradio Callbacks โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
338 |
-
|
339 |
-
ROOT = os.path.dirname(os.path.abspath(__file__))
|
340 |
-
FOLDSEEK_BIN = os.path.join(ROOT, "bin", "foldseek")
|
341 |
-
|
342 |
-
def extract_sequence_cb(structure_file):
    """Extract a sequence from an uploaded structure file via ``get_struc_seq``.

    Args:
        structure_file: Upload object exposing the on-disk path as ``.name``,
            or ``None`` when nothing was uploaded.

    Returns:
        The third element of the tuple ``get_struc_seq`` reports for the first
        chain (presumably the structure-aware sequence -- confirm against
        ``utils.foldseek_util``), or ``""`` when there is no file on disk or
        no chain was parsed.
    """
    if structure_file is None or not os.path.exists(structure_file.name):
        return ""
    parsed = get_struc_seq(FOLDSEEK_BIN, structure_file.name, None, plddt_mask=False)
    # An empty result used to escape as StopIteration from next(iter(...));
    # treat it like the other "nothing to extract" cases instead.
    first_chain = next(iter(parsed), None)
    if first_chain is None:
        return ""
    _, _, struct_seq = parsed[first_chain]
    return struct_seq
|
349 |
-
|
350 |
-
def inference_cb(prot_seq, drug_seq, atom_idx):
    """Validate the inputs, run the encoder + FusionDTI model, and return HTML.

    Args:
        prot_seq: Protein sequence text; a falsy value yields an error message.
        drug_seq: Drug string. If, after stripping, it does not start with
            ``"["`` it is passed through ``smiles_to_selfies`` first.
        atom_idx: 1-based drug token index, or a falsy value for "none".

    Returns:
        A red error ``<p>`` string for invalid input, otherwise whatever
        ``visualize_attention`` returns.
    """
    if not prot_seq:
        return "<p style='color:red'>Please extract or enter a protein sequence first.</p>"

    stripped = drug_seq.strip()
    if not stripped:
        return "<p style='color:red'>Please enter a drug sequence.</p>"

    if not stripped.startswith("["):
        drug_seq = smiles_to_selfies(stripped)
        if drug_seq is None:
            return "<p style='color:red'>SMILESโSELFIES conversion failed.</p>"

    samples = [(prot_seq, drug_seq, 1)]
    loader = DataLoader(samples, batch_size=1, collate_fn=collate_fn)
    feats = get_case_feature(encoding, loader)

    model = FusionDTI(446, 768, args).to(DEVICE)
    ckpt_dir = f"{args.save_path_prefix}{args.dataset}_{args.fusion}"
    ckpt = os.path.join(ckpt_dir, "best_model.ckpt")
    # The checkpoint is optional here: when the file is absent the freshly
    # constructed model is used as-is.
    if os.path.isfile(ckpt):
        model.load_state_dict(torch.load(ckpt, map_location=DEVICE))

    if atom_idx:
        zero_based_idx = int(atom_idx) - 1
    else:
        zero_based_idx = None
    return visualize_attention(model, feats, zero_based_idx)
|
367 |
-
|
368 |
-
def clear_cb():
    """Return the five reset values: ``None``, three empty strings and ``None``
    in the order ``(None, "", "", None, "")``."""
    blank = ""
    return (None, blank, blank, None, blank)
|
370 |
-
|
371 |
-
# โโโโโ Gradio Interface Definition โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
372 |
-
|
373 |
-
css = """
|
374 |
-
:root {
|
375 |
-
--bg: #f3f4f6;
|
376 |
-
--card: #ffffff;
|
377 |
-
--border: #e5e7eb;
|
378 |
-
--primary: #6366f1;
|
379 |
-
--primary-dark: #4f46e5;
|
380 |
-
--text: #111827;
|
381 |
-
}
|
382 |
-
* { box-sizing: border-box; margin: 0; padding: 0; }
|
383 |
-
body { background: var(--bg); color: var(--text); font-family: Inter,system-ui,Arial,sans-serif; }
|
384 |
-
h1 { font-family: Poppins,Inter,sans-serif; font-weight: 600; font-size: 2rem; text-align: center; margin: 24px 0; }
|
385 |
-
button, .gr-button { font-family: Inter,sans-serif; font-weight: 600; }
|
386 |
-
#project-links { text-align: center; margin-bottom: 32px; }
|
387 |
-
#project-links .gr-button { margin: 0 8px; min-width: 160px; }
|
388 |
-
#project-links .gr-button:nth-child(1) { background: #10b981; }
|
389 |
-
#project-links .gr-button:nth-child(2) { background: #ef4444; }
|
390 |
-
#project-links .gr-button:nth-child(3) { background: #3b82f6; }
|
391 |
-
#project-links .gr-button:hover { opacity: 0.9; }
|
392 |
-
.link-btn{display:inline-block;margin:0 8px;padding:10px 20px;border-radius:8px;
|
393 |
-
color:white;font-weight:600;text-decoration:none;box-shadow:0 2px 6px rgba(0,0,0,0.12);
|
394 |
-
transition:all .2s ease-in-out;}
|
395 |
-
.link-btn:hover{opacity:.9;}
|
396 |
-
.link-btn.project{background:linear-gradient(to right,#10b981,#059669);}
|
397 |
-
.link-btn.arxiv {background:linear-gradient(to right,#ef4444,#dc2626);}
|
398 |
-
.link-btn.github {background:linear-gradient(to right,#3b82f6,#2563eb);}
|
399 |
-
|
400 |
-
/* make *all* gradio buttons a bit taller */
|
401 |
-
.gr-button { min-height: 10px !important; }
|
402 |
-
|
403 |
-
/* now target just our two big action buttons */
|
404 |
-
#extract-btn, #inference-btn {
|
405 |
-
width: 5px !important;
|
406 |
-
min-height: 36px !important;
|
407 |
-
margin-top: 12px !important;
|
408 |
-
}
|
409 |
-
|
410 |
-
/* and make clear button full width but shorter */
|
411 |
-
#clear-btn {
|
412 |
-
width: 10px !important;
|
413 |
-
min-height: 36px !important;
|
414 |
-
margin-top: 12px !important;
|
415 |
-
}
|
416 |
-
|
417 |
-
#input-card label {
|
418 |
-
font-weight: 600 !important; /* make the text bold */
|
419 |
-
color: var(--text) !important; /* use your standard text color */
|
420 |
-
}
|
421 |
-
|
422 |
-
.card {
|
423 |
-
background: var(--card);
|
424 |
-
border: 1px solid var(--border);
|
425 |
-
border-radius: 12px;
|
426 |
-
padding: 24px;
|
427 |
-
max-width: 1000px;
|
428 |
-
margin: 0 auto 32px;
|
429 |
-
box-shadow: 0 2px 6px rgba(0,0,0,0.05);
|
430 |
-
}
|
431 |
-
|
432 |
-
#guidelines-card h2 {
|
433 |
-
font-size: 1.4rem;
|
434 |
-
margin-bottom: 16px;
|
435 |
-
text-align: center;
|
436 |
-
}
|
437 |
-
#guidelines-card ol {
|
438 |
-
margin-left: 20px;
|
439 |
-
line-height: 1.6;
|
440 |
-
font-size: 1rem;
|
441 |
-
}
|
442 |
-
#input-card .gr-row, #input-card .gr-cols {
|
443 |
-
gap: 16px;
|
444 |
-
}
|
445 |
-
#input-card .gr-button {
|
446 |
-
flex: 1;
|
447 |
-
}
|
448 |
-
#output-card {
|
449 |
-
padding-top: 0;
|
450 |
-
}
|
451 |
-
"""
|
452 |
-
|
453 |
-
with gr.Blocks(css=css) as demo:
|
454 |
-
# โโโโโโโโโโโโโ Title โโโโโโโโโโโโโ
|
455 |
-
gr.Markdown("<h1>Token-level Visualiser for Drug-Target Interaction</h1>")
|
456 |
-
|
457 |
-
# โโโโโโโโโโโโโ Project Links โโโโโโโโโโโโโ
|
458 |
-
gr.Markdown("""
|
459 |
-
<div style="text-align:center;margin-bottom:32px;">
|
460 |
-
<a class="link-btn project" href="https://zhaohanm.github.io/FusionDTI.github.io/" target="_blank">๐ Project Page</a>
|
461 |
-
<a class="link-btn arxiv" href="https://arxiv.org/abs/2406.01651" target="_blank">๐ ArXiv: 2406.01651</a>
|
462 |
-
<a class="link-btn github" href="https://github.com/ZhaohanM/FusionDTI" target="_blank">๐ป GitHub Repo</a>
|
463 |
-
</div>
|
464 |
-
""")
|
465 |
-
# โโโโโโโโโโโโโ Guidelines Card โโโโโโโโโโโโโ
|
466 |
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
565 |
if __name__ == "__main__":
|
566 |
-
|
|
|
1 |
+
import os, sys, argparse, tempfile, shutil, base64, io
|
2 |
+
from flask import Flask, request, render_template_string
|
3 |
+
from werkzeug.utils import secure_filename
|
4 |
+
from torch.utils.data import DataLoader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
import selfies
|
6 |
from rdkit import Chem
|
7 |
+
import app as gr
|
8 |
+
|
9 |
+
import torch
|
10 |
import matplotlib
|
11 |
matplotlib.use("Agg")
|
12 |
import matplotlib.pyplot as plt
|
13 |
from matplotlib import cm
|
14 |
from typing import Optional
|
15 |
|
|
|
|
|
|
|
|
|
|
|
16 |
from utils.drug_tokenizer import DrugTokenizer
|
17 |
+
from transformers import EsmForMaskedLM, EsmTokenizer, AutoModel
|
18 |
from utils.metric_learning_models_att_maps import Pre_encoded, FusionDTI
|
19 |
from utils.foldseek_util import get_struc_seq
|
20 |
|
21 |
+
# โโโโโ global paths / args โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
22 |
+
FOLDSEEK_BIN = shutil.which("foldseek")
|
23 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
24 |
+
sys.path.append("..")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def parse_config():
|
27 |
p = argparse.ArgumentParser()
|
28 |
+
p.add_argument("-f")
|
29 |
p.add_argument("--prot_encoder_path", default="westlake-repl/SaProt_650M_AF2")
|
30 |
p.add_argument("--drug_encoder_path", default="HUBioDataLab/SELFormer")
|
31 |
+
p.add_argument("--agg_mode", default="mean_all_tok", type=str, help="{cls|mean|mean_all_tok}")
|
32 |
p.add_argument("--group_size", type=int, default=1)
|
33 |
+
p.add_argument("--lr", type=float, default=1e-4)
|
34 |
p.add_argument("--fusion", default="CAN")
|
35 |
p.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
|
36 |
p.add_argument("--save_path_prefix", default="save_model_ckp/")
|
|
|
40 |
args = parse_config()
|
41 |
DEVICE = args.device
|
42 |
|
43 |
+
# โโโโโ tokenisers & encoders โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
44 |
prot_tokenizer = EsmTokenizer.from_pretrained(args.prot_encoder_path)
|
45 |
prot_model = EsmForMaskedLM.from_pretrained(args.prot_encoder_path)
|
46 |
+
|
47 |
+
drug_tokenizer = DrugTokenizer() # SELFIES
|
48 |
drug_model = AutoModel.from_pretrained(args.drug_encoder_path)
|
|
|
49 |
|
50 |
+
encoding = Pre_encoded(prot_model, drug_model, args).to(DEVICE)
|
51 |
+
|
52 |
+
# โโโ collate fn โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
53 |
def collate_fn(batch):
|
54 |
query1, query2, scores = zip(*batch)
|
55 |
|
|
|
75 |
attention_mask2 = query_encodings2["attention_mask"].bool()
|
76 |
|
77 |
return query_encodings1["input_ids"], attention_mask1, query_encodings2["input_ids"], attention_mask2, scores
|
78 |
+
# def collate_fn_batch_encoding(batch):
|
79 |
+
|
80 |
+
def smiles_to_selfies(smiles: str) -> Optional[str]:
    """Convert *smiles* to a SELFIES string.

    Returns ``None`` when ``Chem.MolFromSmiles`` cannot parse the input
    (returns ``None``) or when any ``Exception`` is raised along the way.
    """
    try:
        is_valid = Chem.MolFromSmiles(smiles) is not None
        return selfies.encoder(smiles) if is_valid else None
    except Exception:
        return None
|
89 |
|
90 |
|
91 |
+
# โโโโโ single-case embedding โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
92 |
def get_case_feature(model, loader):
|
93 |
model.eval()
|
94 |
with torch.no_grad():
|
|
|
100 |
p_ids.cpu(), d_ids.cpu(),
|
101 |
p_mask.cpu(), d_mask.cpu(), None)]
|
102 |
|
103 |
+
# โโโโโ helper๏ผ่ฟๆปค็นๆฎ token โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
104 |
+
def clean_tokens(ids, tokenizer):
    """Convert token ids to token strings and drop the tokenizer's special tokens.

    Args:
        ids: Sequence-like object exposing ``.tolist()`` (e.g. a tensor of ids).
        tokenizer: Object providing ``convert_ids_to_tokens`` and
            ``all_special_tokens``.

    Returns:
        The non-special tokens, in their original order.
    """
    # Read the special-token list once and use a set, instead of re-reading
    # the attribute and scanning a list for every token.
    special = set(tokenizer.all_special_tokens)
    return [tok for tok in tokenizer.convert_ids_to_tokens(ids.tolist())
            if tok not in special]
|
107 |
+
|
108 |
+
# โโโโโ visualisation โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
109 |
+
|
110 |
def visualize_attention(model, feats, drug_idx: Optional[int] = None) -> str:
|
111 |
"""
|
112 |
Render a Protein โ Drug cross-attention heat-map and, optionally, a
|
113 |
+
Top-20 protein-residue table for a chosen drug-token index.
|
114 |
|
115 |
The token index shown on the x-axis (and accepted via *drug_idx*) is **the
|
116 |
position of that token in the *original* drug sequence**, *after* the
|
|
|
209 |
plt.close(fig)
|
210 |
html = f'<img src="data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}" />'
|
211 |
|
212 |
+
# โโโโโโโโโโโโโโโโโโโโโ ็ๆ Top-20 ่กจ๏ผ่ฅ้่ฆ๏ผ โโโโโโโโโโโโโโโโโโโโโ
|
213 |
+
table_html = "" # ๅ
่ฎพ็ฉบไธฒ๏ผๆนไพฟๅ้ข็ปไธๆผๆฅ
|
214 |
if drug_idx is not None:
|
215 |
# map original 0-based drug_idx โ current column position
|
216 |
if (drug_idx + 1) in d_indices:
|
|
|
222 |
|
223 |
if col_pos is not None:
|
224 |
col_vec = attn[:, col_pos]
|
225 |
+
topk = torch.topk(col_vec, k=min(20, len(col_vec))).indices.tolist()
|
226 |
|
227 |
rank_hdr = "".join(f"<th>{r+1}</th>" for r in range(len(topk)))
|
228 |
res_row = "".join(f"<td>{p_tokens[i]}</td>" for i in topk)
|
|
|
230 |
|
231 |
drug_tok_text = d_tokens[col_pos]
|
232 |
orig_idx = d_indices[col_pos]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
|
234 |
+
table_html = (
|
235 |
+
f"<h4 style='margin-bottom:6px'>"
|
236 |
+
f"Drug token #{orig_idx} <code>{drug_tok_text}</code> "
|
237 |
+
f"โ Top-20 Protein residues</h4>"
|
238 |
+
"<table class='tg' style='margin-bottom:8px'>"
|
239 |
+
f"<tr><th>Rank</th>{rank_hdr}</tr>"
|
240 |
+
f"<tr><td>Residue</td>{res_row}</tr>"
|
241 |
+
f"<tr><td>Position</td>{pos_row}</tr>"
|
242 |
+
"</table>")
|
243 |
+
|
244 |
+
# โโโโโโโโโโโโโโโโโโ ็ๆๅฏๆพๅคง + ๅฏไธ่ฝฝ็็ญๅพ โโโโโโโโโโโโโโโโโโโโ
|
245 |
buf_png = io.BytesIO()
|
246 |
+
fig.savefig(buf_png, format="png", dpi=140) # ้ข่ง๏ผๅ
ๆ
๏ผ
|
247 |
buf_png.seek(0)
|
248 |
|
249 |
buf_pdf = io.BytesIO()
|
250 |
+
fig.savefig(buf_pdf, format="pdf") # ้ซๆธ
ไธ่ฝฝ๏ผ็ข้๏ผ
|
251 |
buf_pdf.seek(0)
|
252 |
plt.close(fig)
|
253 |
|
|
|
255 |
pdf_b64 = base64.b64encode(buf_pdf.getvalue()).decode()
|
256 |
|
257 |
html_heat = (
|
258 |
+
f"<a href='data:image/png;base64,{png_b64}' target='_blank' "
|
259 |
+
f"title='Click to enlarge'>"
|
260 |
+
f"<img src='data:image/png;base64,{png_b64}' "
|
261 |
+
f"style='max-width:100%;height:auto;cursor:zoom-in' /></a>"
|
262 |
+
f"<div style='margin-top:6px'>"
|
263 |
+
f"<a href='data:application/pdf;base64,{pdf_b64}' "
|
264 |
+
f"download='attention_heatmap.pdf'>Download PDF</a></div>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
)
|
266 |
|
267 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโ ่ฟๅๆ็ป HTML โโโโโโโโโโโโโโโโโโโโโโโโโ
|
268 |
return table_html + html_heat
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
|
270 |
+
def inference(protein_seq, drug_seq, drug_idx, structure_file):
    """Extract a protein sequence from an uploaded structure file with Foldseek.

    Args:
        protein_seq: Overwritten by the sequence extracted from the file.
        drug_seq: Not used by the visible body.
        drug_idx: Not used by the visible body.
        structure_file: Upload object exposing the on-disk path as ``.name``
            (Gradio-style), or ``None``.

    Returns:
        A red error ``<p>`` string when the file is missing or Foldseek fails;
        otherwise ``None`` (see the review note at the end of the body).
    """
    # Use the Gradio-style file path carried on the upload object.
    if structure_file is None or not os.path.exists(structure_file.name):
        # Restored from mis-encoded text: "Please upload a valid .pdb or .cif file first."
        return "<p style='color:red'>请先上传一个有效的 .pdb 或 .cif 文件。</p>"
    tmp_structure_path = structure_file.name

    # Call Foldseek, requesting chain "A".
    try:
        parsed = get_struc_seq(FOLDSEEK_BIN, tmp_structure_path, ["A"], plddt_mask=False)
        chain = next(iter(parsed))
        protein_seq = parsed[chain][2]
    except Exception as e:
        # Restored from mis-encoded text: "Foldseek extraction failed: ..."
        return f"<p style='color:red'>Foldseek 提取失败：{e}</p>"

    # NOTE(review): the success path ends here, so the function returns None
    # and the extracted protein_seq is never used or returned. This looks like
    # an unfinished port of the old Gradio callback -- confirm whether it
    # should continue into inference or be removed.
|
284 |
+
|
285 |
+
# โโโโโ Flask app โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
286 |
+
app = Flask(__name__)
|
287 |
+
|
288 |
+
def _resolve_structure_path(candidate):
    """Return the real path of *candidate* if it is a file directly inside one
    of this app's ``foldseek_*`` temp directories, otherwise ``""``."""
    if not candidate:
        return ""
    tmp_root = os.path.realpath(tempfile.gettempdir())
    real = os.path.realpath(candidate)
    try:
        parts = os.path.relpath(real, tmp_root).split(os.sep)
    except ValueError:
        # e.g. a path on a different drive than the temp directory
        return ""
    if len(parts) == 2 and parts[0].startswith("foldseek_") and os.path.isfile(real):
        return real
    return ""


def _drug_to_selfies(raw):
    """Normalise a raw drug string to SELFIES.

    Returns ``(drug_seq, error_html)``: input that starts with ``"["`` after
    stripping is returned unchanged, anything else goes through
    ``smiles_to_selfies``. Blank input yields ``("", None)``; a failed
    conversion yields ``("", <red error paragraph>)``.
    """
    text = raw.strip()
    if not text:
        return "", None
    if text.startswith("["):
        return raw, None
    converted = smiles_to_selfies(text)
    if converted:
        return converted, None
    return "", ("<p style='color:red'><strong>Failed to convert SMILES to SELFIES. "
                "Please check the input string.</strong></p>")


@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the single-page UI and handle its three form actions.

    On POST the submitted button name selects the action:

    * ``clear`` -- reset every field.
    * ``confirm_structure`` -- run ``get_struc_seq`` on the uploaded (or
      previously uploaded) structure file for chain ``"A"`` and normalise the
      drug string to SELFIES.
    * ``Inference`` -- encode the protein/drug pair, build a ``FusionDTI``
      model, load ``best_model.ckpt`` if it exists, and render the attention
      visualisation.

    Returns:
        The rendered HTML page.
    """
    from html import escape  # local import: only needed for error rendering

    protein_seq = drug_seq = structure_seq = ""
    result_html = None
    tmp_structure_path = ""
    drug_idx = None

    if request.method == "POST":
        drug_idx_raw = request.form.get("drug_idx", "")
        # "0" passes isdigit() but is not a valid 1-based index (it used to
        # turn into -1); treat it like a missing value.
        if drug_idx_raw.isdigit() and int(drug_idx_raw) >= 1:
            drug_idx = int(drug_idx_raw) - 1

        struct = request.files.get("structure_file")
        if struct and struct.filename:
            # NOTE(review): this directory is never removed; uploads
            # accumulate under the system temp dir.
            tmp_dir = tempfile.mkdtemp(prefix="foldseek_")
            safe_name = secure_filename(struct.filename)
            tmp_structure_path = os.path.join(tmp_dir, safe_name)
            struct.save(tmp_structure_path)
        else:
            # The hidden form field is client-controlled, so only accept paths
            # that point into one of our own upload directories.
            tmp_structure_path = _resolve_structure_path(
                request.form.get("tmp_structure_path", ""))

        if "clear" in request.form:
            protein_seq = drug_seq = structure_seq = ""
            tmp_structure_path = ""

        elif "confirm_structure" in request.form and tmp_structure_path:
            try:
                parsed_seqs = get_struc_seq(
                    FOLDSEEK_BIN, tmp_structure_path, ["A"], plddt_mask=False)["A"]
                _, _, structure_seq = parsed_seqs
            except Exception as e:
                # The message is rendered with |safe below, so escape it.
                result_html = (
                    "<p style='color:red'><strong>Foldseek failed to extract sequence "
                    f"from structure: {escape(str(e))}</strong></p>")
                structure_seq = ""

            protein_seq = structure_seq
            drug_seq, drug_error = _drug_to_selfies(request.form.get("drug_sequence", ""))
            if drug_error:
                result_html = drug_error

        elif "Inference" in request.form:
            protein_seq = request.form.get("protein_sequence", "")
            # The page promises SMILES support, so convert here as well and
            # not only when "Confirm Structure" is clicked.
            drug_seq, drug_error = _drug_to_selfies(request.form.get("drug_sequence", ""))
            if drug_error:
                result_html = drug_error
            if protein_seq and drug_seq:
                loader = DataLoader([(protein_seq, drug_seq, 1)], batch_size=1,
                                    collate_fn=collate_fn)
                feats = get_case_feature(encoding, loader)
                model = FusionDTI(446, 768, args).to(DEVICE)
                ckpt = os.path.join(f"{args.save_path_prefix}{args.dataset}_{args.fusion}",
                                    "best_model.ckpt")
                # NOTE(review): when the checkpoint is missing the freshly
                # constructed model is used without any loaded weights.
                if os.path.isfile(ckpt):
                    model.load_state_dict(torch.load(ckpt, map_location=DEVICE))
                result_html = visualize_attention(model, feats, drug_idx)

    return render_template_string(
        # HTML template (original UI plus the new index input box)
        """
<!doctype html>
<html lang="en"><head><meta charset="utf-8"><title>FusionDTI </title>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&family=Poppins:wght@500;600&display=swap" rel="stylesheet">

<style>
:root{--bg:#f3f4f6;--card:#fff;--primary:#6366f1;--primary-dark:#4f46e5;--text:#111827;--border:#e5e7eb;}
*{box-sizing:border-box;margin:0;padding:0}
body{background:var(--bg);color:var(--text);font-family:Inter,system-ui,Arial,sans-serif;line-height:1.5;padding:32px 12px;}
h1{font-family:Poppins,Inter,sans-serif;font-weight:600;font-size:1.7rem;text-align:center;margin-bottom:28px;letter-spacing:-.2px;}
.card{max-width:1000px;margin:0 auto;background:var(--card);border:1px solid var(--border);
border-radius:12px;box-shadow:0 2px 6px rgba(0,0,0,.05);padding:32px 36px;}
label{font-weight:500;margin-bottom:6px;display:block}
textarea,input[type=file]{width:100%;font-size:.9rem;font-family:monospace;padding:10px 12px;
border:1px solid var(--border);border-radius:8px;background:#fff;resize:vertical;}
textarea{min-height:90px}
.btn{appearance:none;border:none;cursor:pointer;padding:12px 22px;border-radius:8px;font-weight:500;
font-family:Inter,sans-serif;transition:all .18s ease;color:#fff;}
.btn-primary{background:var(--primary)}.btn-primary:hover{background:var(--primary-dark)}
.btn-neutral{background:#9ca3af;}.btn-neutral:hover{background:#6b7280}
.grid{display:grid;gap:22px}.grid-2{grid-template-columns:1fr 1fr}
.vis-box{margin-top:28px;border:1px solid var(--border);border-radius:10px;overflow:auto;max-height:72vh;}
pre{white-space:pre-wrap;word-break:break-all;font-family:monospace;margin-top:8px}

/* โโ tidy table for Top-20 list โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ */
table.tg{border-collapse:collapse;margin-top:4px;font-size:0.83rem}
table.tg th,table.tg td{border:1px solid var(--border);padding:6px 8px;text-align:left}
table.tg th{background:var(--bg);font-weight:600}
</style>
</head>
<body>
<h1> Token-level Visualiser for Drug-Target Interaction</h1>

<!-- โโโโโโโโโโโโโ Project Links (larger + spaced) โโโโโโโโโโโโโ -->
<div style="margin-top:24px; text-align:center;">
<a href="https://zhaohanm.github.io/FusionDTI.github.io/" target="_blank"
style="display:inline-block;margin:8px 18px;padding:10px 20px;
background:linear-gradient(to right,#10b981,#059669);color:white;
font-weight:600;border-radius:8px;font-size:0.9rem;
font-family:Inter,sans-serif;text-decoration:none;
box-shadow:0 2px 6px rgba(0,0,0,0.12);transition:all 0.2s ease-in-out;"
onmouseover="this.style.opacity='0.9'" onmouseout="this.style.opacity='1'">
๐ Project Page
</a>

<a href="https://arxiv.org/abs/2406.01651" target="_blank"
style="display:inline-block;margin:8px 18px;padding:10px 20px;
background:linear-gradient(to right,#ef4444,#dc2626);color:white;
font-weight:600;border-radius:8px;font-size:0.9rem;
font-family:Inter,sans-serif;text-decoration:none;
box-shadow:0 2px 6px rgba(0,0,0,0.12);transition:all 0.2s ease-in-out;"
onmouseover="this.style.opacity='0.9'" onmouseout="this.style.opacity='1'">
๐ ArXiv: 2406.01651
</a>

<a href="https://github.com/ZhaohanM/FusionDTI" target="_blank"
style="display:inline-block;margin:8px 18px;padding:10px 20px;
background:linear-gradient(to right,#3b82f6,#2563eb);color:white;
font-weight:600;border-radius:8px;font-size:0.9rem;
font-family:Inter,sans-serif;text-decoration:none;
box-shadow:0 2px 6px rgba(0,0,0,0.12);transition:all 0.2s ease-in-out;"
onmouseover="this.style.opacity='0.9'" onmouseout="this.style.opacity='1'">
๐ป GitHub Repo
</a>
</div>

<!-- โโโโโโโโโโโโโ Guidelines for Use โโโโโโโโโโโโโ -->
<div class="card" style="margin-bottom:24px">
<h2 style="font-size:1.2rem;margin-bottom:14px">Guidelines for Use</h2>
<ul style="margin-left:18px;line-height:1.55;list-style:decimal;">
<li><strong>Convert protein structure into a structure-aware sequence:</strong>
Upload a <code>.pdb</code> or <code>.cif</code> file. A structure-aware
sequence will be generated using
<a href="https://github.com/steineggerlab/foldseek" target="_blank">Foldseek</a>,
based on 3D structures from
<a href="https://alphafold.ebi.ac.uk" target="_blank">AlphaFold DB</a> or the
<a href="https://www.rcsb.org" target="_blank">Protein Data Bank (PDB)</a>.</li>

<li><strong>If you only have an amino acid sequence or a UniProt ID,</strong>
you must first visit the
<a href="https://www.rcsb.org" target="_blank">Protein Data Bank (PDB)</a>
or <a href="https://alphafold.ebi.ac.uk" target="_blank">AlphaFold DB</a>
to search and download the corresponding <code>.cif</code> or <code>.pdb</code> file.</li>

<li><strong>Drug input supports both SELFIES and SMILES:</strong><br>
You can enter a SELFIES string directly, or paste a SMILES string.
SMILES will be automatically converted to SELFIES using
<a href="https://github.com/aspuru-guzik-group/selfies" target="_blank">SELFIES encoder</a>.
If conversion fails, a red error message will be displayed.</li>

<li>Optionally enter a <strong>1-based</strong> drug atom or substructure index
to highlight the Top-20 interacting protein residues.</li>

<li>After inference, you can use the
โDownload PDFโ link to export a high-resolution vector version.</li>
</ul>
</div>

<div class="card">
<form method="POST" enctype="multipart/form-data" class="grid">

<div><label>Protein Structure (.pdb / .cif)</label>
<input type="file" name="structure_file">
<input type="hidden" name="tmp_structure_path" value="{{ tmp_structure_path }}"></div>

<div><label>Protein Sequence</label>
<textarea name="protein_sequence" placeholder="Confirm / paste sequenceโฆ">{{ protein_seq }}</textarea></div>

<div><label>Drug Sequence (SELFIES/SMILES)</label>
<textarea name="drug_sequence" placeholder="[C][C][O]/cco โฆ">{{ drug_seq }}</textarea></div>

<label>Drug atom/substructure index (1-based) โ show Top-20 related protein residue</label>
<input type="number" name="drug_idx" min="1" style="width:120px">

<div class="grid grid-2">
<button class="btn btn-primary" type="submit" name="confirm_structure">Confirm Structure</button>
<button class="btn btn-primary" type="submit" name="Inference">Inference</button>
</div>
<button class="btn btn-neutral" style="width:100%" type="submit" name="clear">Clear</button>
</form>

{% if structure_seq %}
<div style="margin-top:18px"><strong>Structure-aware sequence:</strong><pre>{{ structure_seq }}</pre></div>
{% endif %}
{% if result_html %}
<div class="vis-box" style="margin-top:26px">{{ result_html|safe }}</div>
{% endif %}
</div></body></html>
""",
        protein_seq=protein_seq, drug_seq=drug_seq, structure_seq=structure_seq,
        result_html=result_html, tmp_structure_path=tmp_structure_path)
|
479 |
+
|
480 |
+
# โโโโโ run โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
481 |
if __name__ == "__main__":
    # The server listens on all interfaces, so Werkzeug's interactive debugger
    # (which allows arbitrary code execution) must not be on by default.
    # Set FLASK_DEBUG=1 to opt in during local development.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1", host="0.0.0.0", port=7860)
|