|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pathlib import Path |
|
|
|
import matplotlib |
|
import seaborn as sn |
|
|
|
# Global plotting defaults for every figure produced further down.
matplotlib.style.use("fivethirtyeight")

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases removed the old names. Keep the legacy name where it is
# still shipped and fall back to the renamed sheet otherwise.
talk_style = "seaborn-talk"
if talk_style not in matplotlib.style.available:
    talk_style = "seaborn-v0_8-talk"
matplotlib.style.use(talk_style)

matplotlib.rcParams["font.family"] = "monospace"
# Go through the top-level rcParams mapping: this file never imports
# `matplotlib.pyplot` itself, so `matplotlib.pyplot.rcParams` only resolved
# when another import happened to load pyplot first.
matplotlib.rcParams["savefig.facecolor"] = "white"

sn.set_context("poster")
|
|
|
|
|
# Input corpora (read line by line below, one SMILES string per line) and the
# path of the length-filtered output file.
# NOTE(review): ~/.dgl/jtvae/train.txt is presumably the ZINC training split
# downloaded by DGL's JTVAE example - confirm.
zinc_dgl = Path.home() / ".dgl" / "jtvae" / "train.txt"
lincs_trapnell = Path.cwd().parent / "lincs_trapnell.smiles"
outfile = Path.cwd().parent / "lincs_trapnell.smiles.short"

# Fail fast with an explicit error instead of `assert`: assertions are removed
# under `python -O`, and the message should say which input is missing.
missing = [str(path) for path in (zinc_dgl, lincs_trapnell) if not path.exists()]
if missing:
    raise FileNotFoundError(f"Missing input file(s): {', '.join(missing)}")
|
|
|
|
|
# Report the longest (whitespace-stripped) line of each input file.
for p in [zinc_dgl, lincs_trapnell]:
    with open(p) as f:
        # default=0 reproduces the result for an empty file.
        max_length = max((len(smile.strip()) for smile in f), default=0)
    print(f"Max length of {p} is {max_length}")
|
|
|
|
|
# Count the LINCS entries whose stripped length reaches the 200-character
# cut-off (the complement of what the ".short" export below keeps).
with open(lincs_trapnell) as f:
    count = sum(1 for smile in f if len(smile.strip()) >= 200)
print(f"There are {count} SMILES >= 200")
|
|
|
|
|
# Collect the stripped length of every LINCS line; `h` is the data for the
# histogram drawn next.
with open(lincs_trapnell) as f:
    h = [len(smile.strip()) for smile in f]
|
|
|
|
|
# Histogram of the line lengths gathered in `h`, titled for the LINCS set.
ax = sn.histplot(data=h)
ax.set_title("SMILES-length in LINCS")
|
|
|
|
|
|
|
|
|
|
|
|
|
# Write every LINCS line shorter than 200 characters to `outfile`.
# The output handle gets its own name: binding it to `outfile` would replace
# the module-level Path with a closed file object, so the path would be lost
# and re-running this step would fail.
with open(outfile, "w") as short_file, open(lincs_trapnell) as infile:
    for line in infile:
        line = line.strip()
        if len(line) < 200:
            # Lines are re-terminated with "\n" after stripping whitespace.
            short_file.write(line + "\n")
|
|
|
|
|
# Write every LINCS line of at most 120 characters to a separate ".mini" file.
# The handle is deliberately not named `outfile`, so the module-level
# `outfile` Path (the ".short" target) is not overwritten by a file object.
mini_path = Path.cwd().parent / "lincs_trapnell.smiles.mini"
with open(mini_path, "w") as mini_file, open(lincs_trapnell) as infile:
    for line in infile:
        line = line.strip()
        if len(line) <= 120:
            # Lines are re-terminated with "\n" after stripping whitespace.
            mini_file.write(line + "\n")
|
|