File size: 4,906 Bytes
a48f0ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
Computes and saves molecular features for a dataset.
"""
import os
import shutil
import sys
from argparse import ArgumentParser, Namespace
from multiprocessing import Pool
from typing import List, Tuple

from tqdm import tqdm

sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

from grover.data.molfeaturegenerator import (
    get_available_features_generators,
    get_features_generator,
)
from grover.data.task_labels import rdkit_functional_group_label_features_generator
from grover.util.utils import get_data, load_features, makedirs, save_features


def load_temp(temp_dir: str) -> Tuple[List[List[float]], int]:
    """
    Collects the features stored in the numbered .npz files found in temp_dir.

    Files are read in the order 0.npz, 1.npz, ... and reading stops at the first
    index for which no file exists, so a gap in the numbering hides later files.

    :param temp_dir: Directory in which temporary .npz files containing features are stored.
    :return: A tuple holding the concatenation of everything load_features returned
    for each file (in file order) and the number of temporary files that were read.
    """
    collected = []
    file_count = 0

    while True:
        chunk_path = os.path.join(temp_dir, f"{file_count}.npz")
        if not os.path.exists(chunk_path):
            # First missing index marks the end of the checkpoint sequence.
            break
        collected.extend(load_features(chunk_path))
        file_count += 1

    return collected, file_count


def _compute_and_checkpoint(
    features_map, total, save_frequency, temp_save_dir, features, temp_num
):
    """
    Consumes features_map, periodically flushing computed features to numbered temp files.

    :param features_map: Iterable yielding one feature vector per molecule, in data order.
    :param total: Number of items features_map is expected to yield.
    :param save_frequency: Flush a temp file whenever (index + 1) is a multiple of this value.
    :param temp_save_dir: Directory receiving the numbered {temp_num}.npz checkpoint files.
    :param features: List extended in place with every flushed chunk.
    :param temp_num: Index to use for the next checkpoint file.
    """
    temp_features = []
    for i, feats in tqdm(enumerate(features_map), total=total):
        temp_features.append(feats)

        # Flush every save_frequency items, and always on the final item so
        # nothing computed is left only in memory.
        if (i > 0 and (i + 1) % save_frequency == 0) or i == total - 1:
            save_features(os.path.join(temp_save_dir, f"{temp_num}.npz"), temp_features)
            features.extend(temp_features)
            temp_features = []
            temp_num += 1


def generate_and_save_features(args: Namespace):
    """
    Computes and saves features for a dataset of molecules as a 2D array in a .npz file.

    Work is checkpointed into the directory "<save_path>_temp" so that an interrupted
    run can be resumed; pass args.restart to discard any previous output and checkpoints.

    :param args: Arguments. Reads save_path, data_path, features_generator, restart,
    sequential and save_frequency, plus max_data_size when present.
    :raises ValueError: If args.save_path already exists and args.restart is False.
    """
    # Create directory for save_path
    makedirs(args.save_path, isfile=True)

    # Get data and features function.
    # Forward the optional size limit instead of silently ignoring it; a
    # Namespace built without that attribute falls back to "no limit".
    data = get_data(
        path=args.data_path, max_data_size=getattr(args, "max_data_size", None)
    )
    features_generator = get_features_generator(args.features_generator)
    temp_save_dir = args.save_path + "_temp"

    # Start from scratch unless checkpoints from a previous run are found below.
    features, temp_num = [], 0

    if args.restart:
        if os.path.exists(args.save_path):
            os.remove(args.save_path)
        if os.path.exists(temp_save_dir):
            shutil.rmtree(temp_save_dir)
    elif os.path.exists(args.save_path):
        raise ValueError(
            f'"{args.save_path}" already exists and args.restart is False.'
        )
    elif os.path.exists(temp_save_dir):
        # Load partially complete data
        features, temp_num = load_temp(temp_save_dir)

    if not os.path.exists(temp_save_dir):
        makedirs(temp_save_dir)

    # Restrict to data for which features have not been computed yet
    data = data[len(features) :]
    mols = (d.smiles for d in data)

    if args.sequential:
        _compute_and_checkpoint(
            map(features_generator, mols),
            len(data),
            args.save_frequency,
            temp_save_dir,
            features,
            temp_num,
        )
    else:
        # The context manager tears the worker processes down once all results
        # have been consumed (or on error), instead of leaking the pool.
        with Pool(30) as pool:
            _compute_and_checkpoint(
                pool.imap(features_generator, mols),
                len(data),
                args.save_frequency,
                temp_save_dir,
                features,
                temp_num,
            )

    try:
        # Save all features
        save_features(args.save_path, features)

        # Remove temporary features
        shutil.rmtree(temp_save_dir)
    except OverflowError:
        # The temp directory is deliberately left in place as the result.
        print(
            "Features array is too large to save as a single file. Instead keeping features as a directory of files."
        )


def default_save_path(data_path: str) -> str:
    """
    Derives the default output path by swapping the data file's extension for ".npz".

    Only the final extension is replaced, so a "csv" substring elsewhere in the
    path (for example in a directory name) is left untouched.

    :param data_path: Path to the input data file.
    :return: data_path with its extension (if any) replaced by ".npz".
    """
    return os.path.splitext(data_path)[0] + ".npz"


if __name__ == "__main__":

    parser = ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True, help="Path to data CSV")
    parser.add_argument(
        "--features_generator",
        type=str,
        required=True,
        choices=get_available_features_generators(),
        help="Type of features to generate",
    )
    parser.add_argument(
        "--save_path",
        type=str,
        default=None,
        help="Path to .npz file where features will be saved as a compressed numpy archive",
    )
    parser.add_argument(
        "--save_frequency",
        type=int,
        default=10000,
        help="Frequency with which to save the features",
    )
    parser.add_argument(
        "--restart",
        action="store_true",
        default=False,
        help="Whether to not load partially complete featurization and instead start from scratch",
    )
    parser.add_argument(
        "--max_data_size", type=int, help="Maximum number of data points to load"
    )
    parser.add_argument(
        "--sequential",
        action="store_true",
        default=False,
        help="Whether to task sequentially rather than in parallel",
    )
    args = parser.parse_args()
    if args.save_path is None:
        # Default to writing next to the input file, with an .npz extension.
        args.save_path = default_save_path(args.data_path)
    generate_and_save_features(args)