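"""
Stacking ensemble search: load per-model training features, evaluate a LightGBM
classifier with k-fold cross-validation (macro-F1), and tune its hyperparameters
with Bayesian optimization.
"""
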
import argparse
import warnings

import lightgbm as lgb
import numpy as np
from bayes_opt import BayesianOptimization


def read_labels(dataset_path):
    """Read the "label" column from a tab-separated file with a header row."""
    with open(dataset_path, mode="r", encoding="utf-8") as f:
        columns, labels = {}, []
        for line_id, line in enumerate(f):
            if line_id == 0:
                # The first line is the header: map column names to indices.
                for i, column_name in enumerate(line.rstrip("\r\n").split("\t")):
                    columns[column_name] = i
                continue
            line = line.rstrip("\r\n").split("\t")
            labels.append(int(line[columns["label"]]))
    return labels


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--train_path", type=str, required=True,
                        help="Path of the trainset.")
    parser.add_argument("--train_features_path", type=str, required=True,
                        help="Path prefix of the train feature files used for stacking.")

    # Model options.
    parser.add_argument("--models_num", type=int, default=64,
                        help="Number of models in the ensemble.")
    parser.add_argument("--folds_num", type=int, default=5,
                        help="Number of folds for cross-validation.")
    parser.add_argument("--labels_num", type=int, default=2,
                        help="Number of labels.")

    # Bayesian optimization options.
    parser.add_argument("--epochs_num", type=int, default=100,
                        help="Number of Bayesian optimization iterations.")

    args = parser.parse_args()

    labels = read_labels(args.train_path)

    def lgb_cv(num_leaves, min_data_in_leaf, learning_rate, feature_fraction, lambda_l1, lambda_l2, max_depth):
        # Objective for Bayesian optimization: k-fold cross-validation of a LightGBM
        # classifier on the stacked features, returning the mean macro-F1 over folds.
        # train_features is assigned below in main() and resolved when lgb_cv is called.
        num_leaves = int(num_leaves)
        min_data_in_leaf = int(min_data_in_leaf)
        max_depth = int(max_depth)
        param = {
            "num_leaves": num_leaves,
            "min_data_in_leaf": min_data_in_leaf,
            "learning_rate": learning_rate,
            "feature_fraction": feature_fraction,
            "lambda_l1": lambda_l1,
            "lambda_l2": lambda_l2,
            "max_depth": max_depth,
            "save_binary": True,
            "objective": "multiclass",
            "num_class": args.labels_num,
            "verbose": -1,
            "metric": "multi_error"
        }
        scores = []
        instances_num_per_fold = len(labels) // args.folds_num + 1
        for fold_id in range(args.folds_num):
            # Hold out the fold_id-th slice for validation and train on the rest.
            x_train = np.concatenate((train_features[0: fold_id * instances_num_per_fold], train_features[(fold_id + 1) * instances_num_per_fold:]), axis=0)
            x_val = train_features[fold_id * instances_num_per_fold: (fold_id + 1) * instances_num_per_fold]
            y_train = labels[0: fold_id * instances_num_per_fold] + labels[(fold_id + 1) * instances_num_per_fold:]
            y_val = labels[fold_id * instances_num_per_fold: (fold_id + 1) * instances_num_per_fold]

            lgb_train = lgb.Dataset(x_train, y_train)
            lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)
            # Note: verbose_eval is the pre-4.0 LightGBM training API; newer versions
            # replace it with a log_evaluation callback.
            model = lgb.train(param, lgb_train, valid_sets=lgb_eval, verbose_eval=0)
            pred = model.predict(x_val)
            val_pred = np.argmax(pred, axis=1)

            # Confusion matrix: rows are predicted labels, columns are gold labels.
            confusion = np.zeros((args.labels_num, args.labels_num))
            for i in range(len(pred)):
                confusion[val_pred[i], y_val[i]] += 1
            correct = np.sum(val_pred == y_val)

            # Macro-F1 over labels on the validation fold.
            macro_f1 = []
            eps = 1e-9
            for i in range(args.labels_num):
                p = confusion[i, i].item() / (confusion[i, :].sum().item() + eps)
                r = confusion[i, i].item() / (confusion[:, i].sum().item() + eps)
                f1 = 2 * p * r / (p + r + eps)
                macro_f1.append(f1)
            scores.append(np.mean(macro_f1))
        return np.mean(scores)

    # Load the per-model train features and concatenate them along the feature axis.
    train_features = []
    for i in range(args.models_num):
        train_features.append(np.load(args.train_features_path + "train_features_" + str(i) + ".npy"))
    train_features = np.concatenate(train_features, axis=-1)

    # Search bounds for the tuned LightGBM hyperparameters.
    bounds = {
        "num_leaves": (10, 100),
        "min_data_in_leaf": (10, 100),
        "learning_rate": (0.005, 0.5),
        "feature_fraction": (0.001, 0.5),
        "lambda_l1": (0, 10),
        "lambda_l2": (0, 10),
        "max_depth": (3, 200)
    }

    lgb_bo = BayesianOptimization(lgb_cv, bounds)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        lgb_bo.maximize(n_iter=args.epochs_num)
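    # Assumption (not in the original script): bayes_opt exposes the best result
    # found via the .max property; uncomment to report it after the search.
    # print(lgb_bo.max)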


if __name__ == "__main__":
    main()
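

# A hypothetical invocation (script name and paths are illustrative, not from the
# original source). --train_features_path is used as a string prefix for files named
# train_features_<i>.npy, so it should end with a directory separator:
#
#   python3 run_lgb_cv_bayesopt.py --train_path train.tsv \
#       --train_features_path features/ --models_num 64 --epochs_num 100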