import _init_paths import argparse import os import copy import random import numpy as np from PIL import Image import scipy.io as scio import scipy.misc import numpy.ma as ma import math import torch import torch.nn as nn import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.optim as optim import torch.utils.data import torchvision.datasets as dset import torchvision.transforms as transforms import torchvision.utils as vutils import torch.nn.functional as F from torch.autograd import Variable from datasets.ycb.dataset import PoseDataset from lib.network import PoseNet, PoseRefineNet from lib.transformations import euler_matrix, quaternion_matrix, quaternion_from_matrix parser = argparse.ArgumentParser() parser.add_argument('--dataset_root', type=str, default = '', help='dataset root dir') parser.add_argument('--model', type=str, default = '', help='resume PoseNet model') parser.add_argument('--refine_model', type=str, default = '', help='resume PoseRefineNet model') opt = parser.parse_args() norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) border_list = [-1, 40, 80, 120, 160, 200, 240, 280, 320, 360, 400, 440, 480, 520, 560, 600, 640, 680] xmap = np.array([[j for i in range(640)] for j in range(480)]) ymap = np.array([[i for i in range(640)] for j in range(480)]) cam_cx = 312.9869 cam_cy = 241.3109 cam_fx = 1066.778 cam_fy = 1067.487 cam_scale = 10000.0 num_obj = 21 img_width = 480 img_length = 640 num_points = 1000 num_points_mesh = 500 iteration = 2 bs = 1 dataset_config_dir = 'datasets/ycb/dataset_config' ycb_toolbox_dir = 'YCB_Video_toolbox' result_wo_refine_dir = 'experiments/eval_result/ycb/Densefusion_wo_refine_result' result_refine_dir = 'experiments/eval_result/ycb/Densefusion_iterative_result' def get_bbox(posecnn_rois): rmin = int(posecnn_rois[idx][3]) + 1 rmax = int(posecnn_rois[idx][5]) - 1 cmin = int(posecnn_rois[idx][2]) + 1 cmax = int(posecnn_rois[idx][4]) - 1 r_b = rmax - rmin for tt in range(len(border_list)): if r_b > border_list[tt] and r_b < border_list[tt + 1]: r_b = border_list[tt + 1] break c_b = cmax - cmin for tt in range(len(border_list)): if c_b > border_list[tt] and c_b < border_list[tt + 1]: c_b = border_list[tt + 1] break center = [int((rmin + rmax) / 2), int((cmin + cmax) / 2)] rmin = center[0] - int(r_b / 2) rmax = center[0] + int(r_b / 2) cmin = center[1] - int(c_b / 2) cmax = center[1] + int(c_b / 2) if rmin < 0: delt = -rmin rmin = 0 rmax += delt if cmin < 0: delt = -cmin cmin = 0 cmax += delt if rmax > img_width: delt = rmax - img_width rmax = img_width rmin -= delt if cmax > img_length: delt = cmax - img_length cmax = img_length cmin -= delt return rmin, rmax, cmin, cmax estimator = PoseNet(num_points = num_points, num_obj = num_obj) estimator.cuda() estimator.load_state_dict(torch.load(opt.model)) estimator.eval() refiner = PoseRefineNet(num_points = num_points, num_obj = num_obj) refiner.cuda() refiner.load_state_dict(torch.load(opt.refine_model)) refiner.eval() testlist = [] input_file = open('{0}/test_data_list.txt'.format(dataset_config_dir)) while 1: input_line = input_file.readline() if not input_line: break if input_line[-1:] == '\n': input_line = input_line[:-1] testlist.append(input_line) input_file.close() print(len(testlist)) class_file = open('{0}/classes.txt'.format(dataset_config_dir)) class_id = 1 cld = {} while 1: class_input = class_file.readline() if not class_input: break class_input = class_input[:-1] input_file = open('{0}/models/{1}/points.xyz'.format(opt.dataset_root, class_input)) cld[class_id] = [] while 1: input_line = input_file.readline() if not input_line: break input_line = input_line[:-1] input_line = input_line.split(' ') cld[class_id].append([float(input_line[0]), float(input_line[1]), float(input_line[2])]) input_file.close() cld[class_id] = np.array(cld[class_id]) class_id += 1 for now in range(0, 2949): img = Image.open('{0}/{1}-color.png'.format(opt.dataset_root, testlist[now])) depth = np.array(Image.open('{0}/{1}-depth.png'.format(opt.dataset_root, testlist[now]))) posecnn_meta = scio.loadmat('{0}/results_PoseCNN_RSS2018/{1}.mat'.format(ycb_toolbox_dir, '%06d' % now)) label = np.array(posecnn_meta['labels']) posecnn_rois = np.array(posecnn_meta['rois']) lst = posecnn_rois[:, 1:2].flatten() my_result_wo_refine = [] my_result = [] for idx in range(len(lst)): itemid = lst[idx] try: rmin, rmax, cmin, cmax = get_bbox(posecnn_rois) mask_depth = ma.getmaskarray(ma.masked_not_equal(depth, 0)) mask_label = ma.getmaskarray(ma.masked_equal(label, itemid)) mask = mask_label * mask_depth choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0] if len(choose) > num_points: c_mask = np.zeros(len(choose), dtype=int) c_mask[:num_points] = 1 np.random.shuffle(c_mask) choose = choose[c_mask.nonzero()] else: choose = np.pad(choose, (0, num_points - len(choose)), 'wrap') depth_masked = depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis].astype(np.float32) xmap_masked = xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis].astype(np.float32) ymap_masked = ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis].astype(np.float32) choose = np.array([choose]) pt2 = depth_masked / cam_scale pt0 = (ymap_masked - cam_cx) * pt2 / cam_fx pt1 = (xmap_masked - cam_cy) * pt2 / cam_fy cloud = np.concatenate((pt0, pt1, pt2), axis=1) img_masked = np.array(img)[:, :, :3] img_masked = np.transpose(img_masked, (2, 0, 1)) img_masked = img_masked[:, rmin:rmax, cmin:cmax] cloud = torch.from_numpy(cloud.astype(np.float32)) choose = torch.LongTensor(choose.astype(np.int32)) img_masked = norm(torch.from_numpy(img_masked.astype(np.float32))) index = torch.LongTensor([itemid - 1]) cloud = Variable(cloud).cuda() choose = Variable(choose).cuda() img_masked = Variable(img_masked).cuda() index = Variable(index).cuda() cloud = cloud.view(1, num_points, 3) img_masked = img_masked.view(1, 3, img_masked.size()[1], img_masked.size()[2]) pred_r, pred_t, pred_c, emb = estimator(img_masked, cloud, choose, index) pred_r = pred_r / torch.norm(pred_r, dim=2).view(1, num_points, 1) pred_c = pred_c.view(bs, num_points) how_max, which_max = torch.max(pred_c, 1) pred_t = pred_t.view(bs * num_points, 1, 3) points = cloud.view(bs * num_points, 1, 3) my_r = pred_r[0][which_max[0]].view(-1).cpu().data.numpy() my_t = (points + pred_t)[which_max[0]].view(-1).cpu().data.numpy() my_pred = np.append(my_r, my_t) my_result_wo_refine.append(my_pred.tolist()) for ite in range(0, iteration): T = Variable(torch.from_numpy(my_t.astype(np.float32))).cuda().view(1, 3).repeat(num_points, 1).contiguous().view(1, num_points, 3) my_mat = quaternion_matrix(my_r) R = Variable(torch.from_numpy(my_mat[:3, :3].astype(np.float32))).cuda().view(1, 3, 3) my_mat[0:3, 3] = my_t new_cloud = torch.bmm((cloud - T), R).contiguous() pred_r, pred_t = refiner(new_cloud, emb, index) pred_r = pred_r.view(1, 1, -1) pred_r = pred_r / (torch.norm(pred_r, dim=2).view(1, 1, 1)) my_r_2 = pred_r.view(-1).cpu().data.numpy() my_t_2 = pred_t.view(-1).cpu().data.numpy() my_mat_2 = quaternion_matrix(my_r_2) my_mat_2[0:3, 3] = my_t_2 my_mat_final = np.dot(my_mat, my_mat_2) my_r_final = copy.deepcopy(my_mat_final) my_r_final[0:3, 3] = 0 my_r_final = quaternion_from_matrix(my_r_final, True) my_t_final = np.array([my_mat_final[0][3], my_mat_final[1][3], my_mat_final[2][3]]) my_pred = np.append(my_r_final, my_t_final) my_r = my_r_final my_t = my_t_final # Here 'my_pred' is the final pose estimation result after refinement ('my_r': quaternion, 'my_t': translation) my_result.append(my_pred.tolist()) except ZeroDivisionError: print("PoseCNN Detector Lost {0} at No.{1} keyframe".format(itemid, now)) my_result_wo_refine.append([0.0 for i in range(7)]) my_result.append([0.0 for i in range(7)]) scio.savemat('{0}/{1}.mat'.format(result_wo_refine_dir, '%04d' % now), {'poses':my_result_wo_refine}) scio.savemat('{0}/{1}.mat'.format(result_refine_dir, '%04d' % now), {'poses':my_result}) print("Finish No.{0} keyframe".format(now))