/** Modifed version of knn-CUDA from https://github.com/vincentfpgarcia/kNN-CUDA | |
* The modifications are | |
* removed texture memory usage | |
* removed split query KNN computation | |
* added feature extraction with bilinear interpolation | |
* | |
* Last modified by Christopher B. Choy <chrischoy@ai.stanford.edu> 12/23/2016 | |
*/ | |
// Includes | |
// Constants used by the program | |
/** | |
* Computes the distance between two matrix A (reference points) and | |
* B (query points) containing respectively wA and wB points. | |
* | |
* @param A pointer on the matrix A | |
* @param wA width of the matrix A = number of points in A | |
* @param B pointer on the matrix B | |
* @param wB width of the matrix B = number of points in B | |
* @param dim dimension of points = height of matrices A and B | |
* @param AB pointer on the matrix containing the wA*wB distances computed | |
*/ | |
__global__ void cuComputeDistanceGlobal( float* A, int wA, | |
float* B, int wB, int dim, float* AB){ | |
// Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B | |
__shared__ float shared_A[BLOCK_DIM][BLOCK_DIM]; | |
__shared__ float shared_B[BLOCK_DIM][BLOCK_DIM]; | |
// Sub-matrix of A (begin, step, end) and Sub-matrix of B (begin, step) | |
__shared__ int begin_A; | |
__shared__ int begin_B; | |
__shared__ int step_A; | |
__shared__ int step_B; | |
__shared__ int end_A; | |
// Thread index | |
int tx = threadIdx.x; | |
int ty = threadIdx.y; | |
// Other variables | |
float tmp; | |
float ssd = 0; | |
// Loop parameters | |
begin_A = BLOCK_DIM * blockIdx.y; | |
begin_B = BLOCK_DIM * blockIdx.x; | |
step_A = BLOCK_DIM * wA; | |
step_B = BLOCK_DIM * wB; | |
end_A = begin_A + (dim-1) * wA; | |
// Conditions | |
int cond0 = (begin_A + tx < wA); // used to write in shared memory | |
int cond1 = (begin_B + tx < wB); // used to write in shared memory & to computations and to write in output matrix | |
int cond2 = (begin_A + ty < wA); // used to computations and to write in output matrix | |
// Loop over all the sub-matrices of A and B required to compute the block sub-matrix | |
for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) { | |
// Load the matrices from device memory to shared memory; each thread loads one element of each matrix | |
if (a/wA + ty < dim){ | |
shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0; | |
shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0; | |
} | |
else{ | |
shared_A[ty][tx] = 0; | |
shared_B[ty][tx] = 0; | |
} | |
// Synchronize to make sure the matrices are loaded | |
__syncthreads(); | |
// Compute the difference between the two matrixes; each thread computes one element of the block sub-matrix | |
if (cond2 && cond1){ | |
for (int k = 0; k < BLOCK_DIM; ++k){ | |
tmp = shared_A[k][ty] - shared_B[k][tx]; | |
ssd += tmp*tmp; | |
} | |
} | |
// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration | |
__syncthreads(); | |
} | |
// Write the block sub-matrix to device memory; each thread writes one element | |
if (cond2 && cond1) | |
AB[(begin_A + ty) * wB + begin_B + tx] = ssd; | |
} | |
/** | |
* Gathers k-th smallest distances for each column of the distance matrix in the top. | |
* | |
* @param dist distance matrix | |
* @param ind index matrix | |
* @param width width of the distance matrix and of the index matrix | |
* @param height height of the distance matrix and of the index matrix | |
* @param k number of neighbors to consider | |
*/ | |
__global__ void cuInsertionSort(float *dist, long *ind, int width, int height, int k){ | |
// Variables | |
int l, i, j; | |
float *p_dist; | |
long *p_ind; | |
float curr_dist, max_dist; | |
long curr_row, max_row; | |
unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x; | |
if (xIndex<width){ | |
// Pointer shift, initialization, and max value | |
p_dist = dist + xIndex; | |
p_ind = ind + xIndex; | |
max_dist = p_dist[0]; | |
p_ind[0] = 1; | |
// Part 1 : sort kth firt elementZ | |
for (l=1; l<k; l++){ | |
curr_row = l * width; | |
curr_dist = p_dist[curr_row]; | |
if (curr_dist<max_dist){ | |
i=l-1; | |
for (int a=0; a<l-1; a++){ | |
if (p_dist[a*width]>curr_dist){ | |
i=a; | |
break; | |
} | |
} | |
for (j=l; j>i; j--){ | |
p_dist[j*width] = p_dist[(j-1)*width]; | |
p_ind[j*width] = p_ind[(j-1)*width]; | |
} | |
p_dist[i*width] = curr_dist; | |
p_ind[i*width] = l+1; | |
} else { | |
p_ind[l*width] = l+1; | |
} | |
max_dist = p_dist[curr_row]; | |
} | |
// Part 2 : insert element in the k-th first lines | |
max_row = (k-1)*width; | |
for (l=k; l<height; l++){ | |
curr_dist = p_dist[l*width]; | |
if (curr_dist<max_dist){ | |
i=k-1; | |
for (int a=0; a<k-1; a++){ | |
if (p_dist[a*width]>curr_dist){ | |
i=a; | |
break; | |
} | |
} | |
for (j=k-1; j>i; j--){ | |
p_dist[j*width] = p_dist[(j-1)*width]; | |
p_ind[j*width] = p_ind[(j-1)*width]; | |
} | |
p_dist[i*width] = curr_dist; | |
p_ind[i*width] = l+1; | |
max_dist = p_dist[max_row]; | |
} | |
} | |
} | |
} | |
/** | |
* Computes the square root of the first line (width-th first element) | |
* of the distance matrix. | |
* | |
* @param dist distance matrix | |
* @param width width of the distance matrix | |
* @param k number of neighbors to consider | |
*/ | |
__global__ void cuParallelSqrt(float *dist, int width, int k){ | |
unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x; | |
unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y; | |
if (xIndex<width && yIndex<k) | |
dist[yIndex*width + xIndex] = sqrt(dist[yIndex*width + xIndex]); | |
} | |
//-----------------------------------------------------------------------------------------------// | |
// K-th NEAREST NEIGHBORS // | |
//-----------------------------------------------------------------------------------------------// | |
/** | |
* K nearest neighbor algorithm | |
* - Initialize CUDA | |
* - Allocate device memory | |
* - Copy point sets (reference and query points) from host to device memory | |
* - Compute the distances + indexes to the k nearest neighbors for each query point | |
* - Copy distances from device to host memory | |
* | |
* @param ref_host reference points ; pointer to linear matrix | |
* @param ref_nb number of reference points ; width of the matrix | |
* @param query_host query points ; pointer to linear matrix | |
* @param query_nb number of query points ; width of the matrix | |
* @param dim dimension of points ; height of the matrices | |
* @param k number of neighbor to consider | |
* @param dist_host distances to k nearest neighbors ; pointer to linear matrix | |
* @param dist_host indexes of the k nearest neighbors ; pointer to linear matrix | |
* | |
*/ | |
void knn_device(float* ref_dev, int ref_nb, float* query_dev, int query_nb, | |
int dim, int k, float* dist_dev, long* ind_dev, cudaStream_t stream){ | |
// Grids and threads | |
dim3 g_16x16(query_nb/16, ref_nb/16, 1); | |
dim3 t_16x16(16, 16, 1); | |
if (query_nb%16 != 0) g_16x16.x += 1; | |
if (ref_nb %16 != 0) g_16x16.y += 1; | |
// | |
dim3 g_256x1(query_nb/256, 1, 1); | |
dim3 t_256x1(256, 1, 1); | |
if (query_nb%256 != 0) g_256x1.x += 1; | |
dim3 g_k_16x16(query_nb/16, k/16, 1); | |
dim3 t_k_16x16(16, 16, 1); | |
if (query_nb%16 != 0) g_k_16x16.x += 1; | |
if (k %16 != 0) g_k_16x16.y += 1; | |
// Kernel 1: Compute all the distances | |
cuComputeDistanceGlobal<<<g_16x16, t_16x16, 0, stream>>>(ref_dev, ref_nb, | |
query_dev, query_nb, dim, dist_dev); | |
// Kernel 2: Sort each column | |
cuInsertionSort<<<g_256x1, t_256x1, 0, stream>>>(dist_dev, ind_dev, | |
query_nb, ref_nb, k); | |
// Kernel 3: Compute square root of k first elements | |
// cuParallelSqrt<<<g_k_16x16,t_k_16x16, 0, stream>>>(dist_dev, query_nb, k); | |
unsigned int size_of_float = sizeof(float); | |
unsigned long size_of_long = sizeof(long); | |
float* dist_host = new float[query_nb * k]; | |
long* idx_host = new long[query_nb * k]; | |
// Memory copy of output from device to host | |
cudaMemcpy(&dist_host[0], dist_dev, | |
query_nb * k *size_of_float, cudaMemcpyDeviceToHost); | |
cudaMemcpy(&idx_host[0], ind_dev, | |
query_nb * k * size_of_long, cudaMemcpyDeviceToHost); | |
int i = 0; | |
for(i = 0; i < 100; i++){ | |
printf("IDX[%d]: %d\n", i, (int)idx_host[i]); | |
} | |
} | |