#!/usr/bin/env python3 # Copyright 2023-2024 NXP # SPDX-License-Identifier: MIT import cv2 import tensorflow as tf import numpy as np import time import random random.seed(42) OBJECT_DETECTOR_TFLITE = 'yolov4-tiny_416_quant.tflite' LABELS_FILE = 'coco-labels-2014_2017.txt' IMAGE_FILENAME = 'example_input.jpg' SCORE_THRESHOLD = 0.20 NMS_IOU_THRESHOLD = 0.5 INFERENCE_IMG_SIZE = 416 MAX_DETS = 100 ANCHORS = [[[81, 82], [135, 169], [344, 319]], [[23, 27], [37, 58], [81, 82]]] SIGMOID_FACTOR = [1.05, 1.05] NUM_ANCHORS = 3 STRIDES = [32, 16] GRID_SIZES = [int(INFERENCE_IMG_SIZE / s) for s in STRIDES] with open(LABELS_FILE, 'r') as f: COCO_CLASSES = [line.strip() for line in f.readlines()] interpreter = tf.lite.Interpreter(OBJECT_DETECTOR_TFLITE) interpreter.allocate_tensors() def gen_box_colors(): colors = [] for _ in range(len(COCO_CLASSES)): r = random.randint(100, 255) g = random.randint(100, 255) b = random.randint(100, 255) colors.append((r, g, b)) return colors BOX_COLORS = gen_box_colors() def load_image(filename): orig_image = cv2.imread(filename, 1) image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB) image = cv2.resize(image, (INFERENCE_IMG_SIZE, INFERENCE_IMG_SIZE)) image = np.expand_dims(image, axis=0) image = image / 255.0 return orig_image, image def np_sigmoid(x): return 1 / (1 + np.exp(-x)) def reciprocal_sigmoid(x): return -np.log(1 / x - 1) def decode_boxes_prediction(yolo_output): # Each output level represents a grid of predictions. # The first output level is a 26x26 grid and the second 13x13. # Each cell of each grid is assigned to 3 anchor bounding boxes. # The bounding box predictions are regressed # relatively to these anchor boxes. # Thus, the model predicts 3 bounding boxes per cell per output level. # The output is structured as follows: # For each cell [[x, y, w, h, conf, cl_0, cl_1, ..., cl_79], # anchor 1 # [x, y, w, h, conf, cl_0, cl_1, ..., cl_79], # anchor 2 # [x, y, w, h, conf, cl_0, cl_1, ..., cl_79]] # anchor 3 # Hence, we have 85 values per anchor box, and thus 255 values per cell. # The decoding of the output bounding boxes is described in Figure 2 of # the YOLOv3 paper https://arxiv.org/pdf/1804.02767.pdf; boxes_list = [] scores_list = [] classes_list = [] for idx, feats in enumerate(yolo_output): features = np.reshape(feats, (NUM_ANCHORS * GRID_SIZES[idx] ** 2, 85)) anchor = np.array(ANCHORS[idx]) factor = SIGMOID_FACTOR[idx] grid_size = GRID_SIZES[idx] stride = STRIDES[idx] cell_confidence = features[..., 4] logit_threshold = reciprocal_sigmoid(SCORE_THRESHOLD) over_threshold_list = np.where(cell_confidence > logit_threshold) if over_threshold_list[0].size > 0: indices = np.array(over_threshold_list[0]) box_positions = np.floor_divide(indices, 3) list_xy = np.array(np.divmod(box_positions, grid_size)).T list_xy = list_xy[..., ::-1] boxes_xy = np.reshape(list_xy, (int(list_xy.size / 2), 2)) outxy = features[indices, :2] # boxes center coordinates centers = np_sigmoid(outxy * factor) - 0.5 * (factor - 1) centers += boxes_xy centers *= stride # boxes width and height width_height = np.exp(features[indices, 2:4]) width_height *= anchor[np.divmod(indices, NUM_ANCHORS)[1]] boxes_list.append(np.stack([centers[:, 0] - width_height[:, 0]/2, centers[:, 1] - width_height[:, 1]/2, centers[:, 0] + width_height[:, 0]/2, centers[:, 1] + width_height[:, 1]/2], axis=1)) # confidence that cell contains an object scores_list.append(np_sigmoid(features[indices, 4:5])) # class with the highest probability in this cell classes_list.append(np.argmax(features[indices, 5:], axis=1)) if len(boxes_list) > 0: boxes = np.concatenate(boxes_list, axis=0) scores = np.concatenate(scores_list, axis=0)[:, 0] classes = np.concatenate(classes_list, axis=0) return boxes, scores, classes else: return np.zeros((0, 4)), np.zeros((0)), np.zeros((0)) def decode_output(yolo_outputs, score_threshold=SCORE_THRESHOLD, iou_threshold=NMS_IOU_THRESHOLD): ''' Decode output from YOLOv4 tiny in inference size referential (416x416) ''' boxes, scores, classes = decode_boxes_prediction(yolo_outputs) # apply NMS from tensorflow inds = tf.image.non_max_suppression(boxes, scores, MAX_DETS, score_threshold=score_threshold, iou_threshold=iou_threshold) # keep only selected boxes boxes = tf.gather(boxes, inds) scores = tf.gather(scores, inds) classes = tf.gather(classes, inds) return scores, boxes, classes def run_inference(interpreter, image, threshold=SCORE_THRESHOLD): input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() input_scale, input_zero_point = input_details[0]["quantization"] image = image / input_scale + input_zero_point image = image.astype(np.int8) interpreter.set_tensor(input_details[0]['index'], image) interpreter.invoke() boxes = interpreter.get_tensor(output_details[0]['index']) boxes2 = interpreter.get_tensor(output_details[1]['index']) return [boxes, boxes2] if __name__ == "__main__": orig_image, processed_image = load_image(IMAGE_FILENAME) start = time.time() yolo_output = run_inference(interpreter, processed_image) end = time.time() scores, boxes, classes = decode_output(yolo_output) # rescale boxes for display shp = orig_image.shape boxes = boxes.numpy() boxes /= INFERENCE_IMG_SIZE boxes *= np.array([shp[1], shp[0], shp[1], shp[0]]) boxes = boxes.astype(np.int32) print("Inference time", end - start, "ms") print("Detected", boxes.shape[0], "object(s)") print("Box coordinates:") for i in range(boxes.shape[0]): box = boxes[i, :] print(box, end=" ") class_name = COCO_CLASSES[classes[i].numpy()] score = scores[i].numpy() color = BOX_COLORS[classes[i]] print("class", class_name, end=" ") print("score", score) cv2.rectangle(orig_image, (box[0], box[1]), (box[2], box[3]), color, 3) cv2.putText(orig_image, f"{class_name} {score:.2f}", (box[0], box[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2) cv2.imwrite('example_output.jpg', orig_image) cv2.imshow('', orig_image) cv2.waitKey()