Spaces:

AlexchIA
/

text-generation-webui

Build error

App Files Files Community

text-generation-webui / repositories /exllama /exllama_ext /cuda_func /rope.cu

AlexchIA

Upload folder using huggingface_hub

452b173 almost 2 years ago

raw

history blame contribute delete

3.68 kB

	#include "rope.cuh"
	#include "../util.cuh"
	#include "../matrix.cuh"

	// Thread-block shape used for the RoPE launch: THREADS_X threads along the
	// column axis and THREADS_Y threads along the row axis.
	constexpr int THREADS_X = 32;
	constexpr int THREADS_Y = 4;

	// Nominal height passed to the sin/cos matrix views. Actual number doesn't matter.
	constexpr int MAX_POS_EMBEDDINGS = 32768;

	// Pointer to one instantiation of rope_cuda_kernel. The argument order
	// mirrors the kernel's parameter list.
	using fp_rope_cuda_kernel = void (*)
	(
	    half*,        // x
	    const half*,  // sin
	    const half*,  // cos
	    int,          // rows_per_batch
	    int,          // head_dim
	    int,          // num_heads
	    int           // past_len
	);

	// Applies rotary position embedding to x in place.
	//
	// Thread mapping: the x index selects a column in the lower half of the row
	// (two adjacent columns when use_half2 is set), the y index selects a row
	// within the batch, and blockIdx.z selects the batch. Indexing assumes the
	// block shape is (THREADS_X, THREADS_Y, 1).
	//
	// For a column c in the lower half and its partner c + head_dim / 2:
	//   x[c]              = x[c]              * cos[c]              - x[c + head_dim/2] * sin[c]
	//   x[c + head_dim/2] = x[c + head_dim/2] * cos[c + head_dim/2] + x[c]              * sin[c + head_dim/2]
	// where the sin/cos row is past_len + row / num_heads.
	template<bool use_half2>
	__global__ void rope_cuda_kernel
	(
	    half* __restrict__ x,
	    const half* __restrict__ sin,
	    const half* __restrict__ cos,
	    int rows_per_batch,
	    int head_dim,
	    int num_heads,
	    int past_len
	)
	{
	    // The heights given to the views are not used, so they may be wrong.
	    MatrixView_half_rw x_view(x, rows_per_batch, head_dim);
	    MatrixView_half sin_view(sin, MAX_POS_EMBEDDINGS, head_dim);
	    MatrixView_half cos_view(cos, MAX_POS_EMBEDDINGS, head_dim);

	    // Column handled by this thread; the half2 variant covers two columns per thread.
	    const int thread_x = blockIdx.x * THREADS_X + threadIdx.x;
	    const int col_lo = use_half2 ? thread_x * 2 : thread_x;
	    const int half_dim = head_dim / 2;
	    if (col_lo >= half_dim) return;

	    const int local_row = blockIdx.y * THREADS_Y + threadIdx.y;
	    if (local_row >= rows_per_batch) return;

	    const int col_hi = col_lo + half_dim;
	    const int x_row = blockIdx.z * rows_per_batch + local_row;

	    // Row of the sin/cos tables to read for this row of x.
	    const int table_row = past_len + local_row / num_heads;

	    if constexpr (use_half2)
	    {
	        const half2 c_lo = cos_view.item_half2(table_row, col_lo);
	        const half2 c_hi = cos_view.item_half2(table_row, col_hi);
	        const half2 neg_s_lo = __hneg2(sin_view.item_half2(table_row, col_lo));
	        const half2 s_hi = sin_view.item_half2(table_row, col_hi);

	        // Both outputs are computed from the values loaded here, before any store.
	        const half2 v_lo = x_view.item_half2(x_row, col_lo);
	        const half2 v_hi = x_view.item_half2(x_row, col_hi);

	        const half2 out_lo = __hfma2(v_lo, c_lo, __hmul2(v_hi, neg_s_lo));
	        const half2 out_hi = __hfma2(v_hi, c_hi, __hmul2(v_lo, s_hi));

	        x_view.set_half2(x_row, col_lo, out_lo);
	        x_view.set_half2(x_row, col_hi, out_hi);
	    }
	    else
	    {
	        const half c_lo = cos_view.item(table_row, col_lo);
	        const half c_hi = cos_view.item(table_row, col_hi);
	        const half neg_s_lo = __hneg(sin_view.item(table_row, col_lo));
	        const half s_hi = sin_view.item(table_row, col_hi);

	        // Both outputs are computed from the values loaded here, before any store.
	        const half v_lo = x_view.item(x_row, col_lo);
	        const half v_hi = x_view.item(x_row, col_hi);

	        const half out_lo = __hfma(v_lo, c_lo, __hmul(v_hi, neg_s_lo));
	        const half out_hi = __hfma(v_hi, c_hi, __hmul(v_lo, s_hi));

	        x_view.set(x_row, col_lo, out_lo);
	        x_view.set(x_row, col_hi, out_hi);
	    }
	}

	// Returns the rope_cuda_kernel instantiation to launch for the given tuning
	// parameters: the scalar variant when rope_no_half2 is set, otherwise the
	// half2 variant.
	//
	// The choice is keyed on rope_no_half2 because rope_cuda sizes its grid from
	// that same flag. Selecting on a different flag (previously matmul_no_half2)
	// lets the scalar kernel run on a grid sized for two columns per thread,
	// which leaves half of the columns unprocessed.
	fp_rope_cuda_kernel rope_cuda_kernel_pick(ExLlamaTuning* tuningParams)
	{
	    // <bool use_half2>
	    if (tuningParams->rope_no_half2) {
	        return rope_cuda_kernel<false>;
	    } else {
	        return rope_cuda_kernel<true>;
	    }
	}

	// Launches the RoPE kernel on alt_stream, rotating x in place.
	//
	// tuningParams    Tuning flags; rope_no_half2 decides how many columns each
	//                 thread covers (1 when set, 2 otherwise).
	// x               Data rotated in place; the kernel indexes it as
	//                 bsz * rows_per_batch rows of head_dim columns.
	// sin, cos        Lookup tables, indexed by the kernel with head_dim columns per row.
	// bsz             Number of batches (grid z dimension).
	// rows_per_batch  Rows of x per batch.
	// head_dim        Columns per row of x.
	// num_heads       Divisor the kernel applies to the row index to pick the sin/cos row.
	// past_len        Offset added to the sin/cos row index.
	// alt_stream      Stream the kernel is enqueued on.
	//
	// The launch is asynchronous and no launch error is checked here.
	// NOTE(review): the half2 path reads and writes pairs of adjacent columns, so it
	// presumably requires head_dim / 2 to be even - confirm against MatrixView.
	void rope_cuda
	(
	    ExLlamaTuning* tuningParams,
	    half* x,
	    const half* sin,
	    const half* cos,
	    const int bsz,
	    const int rows_per_batch,
	    const int head_dim,
	    const int num_heads,
	    const int past_len,
	    cudaStream_t alt_stream
	)
	{
	    // Nothing to rotate; also avoids launching a grid with a zero dimension.
	    if (bsz <= 0 || rows_per_batch <= 0 || head_dim < 2) return;

	    dim3 threads(THREADS_X, THREADS_Y, 1);

	    // Each thread covers one column of the lower half of a row, or two when
	    // the half2 kernel is used. Work out the thread count first and round up
	    // once: rounding head_dim up to whole blocks and then dividing truncates
	    // to zero blocks for small head_dim (e.g. 64) and under-covers sizes that
	    // are not a multiple of the block width.
	    const int half_dim = head_dim / 2;
	    const int columns_per_thread = tuningParams->rope_no_half2 ? 1 : 2;
	    const int threads_x_needed = (half_dim + columns_per_thread - 1) / columns_per_thread;

	    dim3 blocks
	    (
	        (threads_x_needed + THREADS_X - 1) / THREADS_X,
	        (rows_per_batch + THREADS_Y - 1) / THREADS_Y,
	        int(bsz)
	    );

	    // The kernel variant returned here must match columns_per_thread above.
	    fp_rope_cuda_kernel kernel = rope_cuda_kernel_pick(tuningParams);
	    kernel<<<blocks, threads, 0, alt_stream>>>(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len);
	}