""" |
|
The vocabulary building scripts. |
|
""" |
import os

from grover.data.torchvocab import MolVocab


def build():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        default="../../dataset/grover_new_dataset/druglike_merged_refine2.csv",
        type=str,
    )
    parser.add_argument(
        "--vocab_save_folder", default="../../dataset/grover_new_dataset", type=str
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        default=None,
        help="Will be the first part of the vocab file name. If it is None, "
        "the vocab files will be: atom_vocab.pkl and bond_vocab.pkl.",
    )
parser.add_argument("--vocab_max_size", type=int, default=None) |
|
parser.add_argument("--vocab_min_freq", type=int, default=1) |
|
args = parser.parse_args() |
|
|
|
|
|
|
|
|
|
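    # Build an atom-level and a bond-level vocabulary from the same input file.
    # Each is saved as "<dataset_name>_<vocab_type>_vocab.pkl" when --dataset_name
    # is given, and as "<vocab_type>_vocab.pkl" otherwise.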
    for vocab_type in ["atom", "bond"]:
        vocab_file = f"{vocab_type}_vocab.pkl"
        if args.dataset_name is not None:
            vocab_file = args.dataset_name + "_" + vocab_file
        vocab_save_path = os.path.join(args.vocab_save_folder, vocab_file)

        os.makedirs(os.path.dirname(vocab_save_path), exist_ok=True)
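        # Build the vocabulary of this type from the molecules in data_path.
        # max_size caps the vocabulary size, min_freq sets the minimum frequency
        # for an entry to be kept, and num_workers (hard-coded here) is the
        # number of parallel workers.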
        vocab = MolVocab(
            file_path=args.data_path,
            max_size=args.vocab_max_size,
            min_freq=args.vocab_min_freq,
            num_workers=100,
            vocab_type=vocab_type,
        )
        print(f"{vocab_type} vocab size", len(vocab))
        vocab.save_vocab(vocab_save_path)


if __name__ == "__main__":
    build()