How Do We Implement a Permute/Transpose Operator 6x Faster Than PyTorch's?
x = flow.randn(2, 3)
y = x.permute(1, 0)
y.shape
(3, 2)
for row in range(y.shape[0]):
    for col in range(y.shape[1]):
        y[row][col] = x[col][row]
1. Compute the corresponding multi-dimensional index from the one-dimensional offset of the current output element.
2. Rearrange that output index according to the dims parameter to obtain the corresponding input index.
3. Convert the input index back into a one-dimensional input offset.
4. Finally, move the data. The whole process is illustrated in the diagram below:
2 NdIndexOffsetHelper
The NdIndexToOffset method converts a multi-dimensional index into a one-dimensional offset.
The OffsetToNdIndex method converts a one-dimensional offset into a multi-dimensional index.
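To make these two conversions concrete, here is a minimal sketch of what such a helper can look like for a row-major layout. It is only an illustration: the class name, constructor, and stride layout below are assumptions, and OneFlow's actual NdIndexOffsetHelper additionally applies tricks such as fast integer division.

// Simplified sketch of an N-d index <-> linear offset helper (row-major layout).
// Not OneFlow's real NdIndexOffsetHelper; the name and strides_ member are assumptions.
template<typename IndexType, size_t num_dims>
class SimpleNdIndexOffsetHelper {
 public:
  __host__ __device__ explicit SimpleNdIndexOffsetHelper(const IndexType* dims) {
    // The last dim has stride 1; each preceding stride is the product of the dims after it.
    strides_[num_dims - 1] = 1;
    for (int i = static_cast<int>(num_dims) - 2; i >= 0; --i) {
      strides_[i] = strides_[i + 1] * dims[i + 1];
    }
  }

  __host__ __device__ IndexType NdIndexToOffset(const IndexType* index) const {
    IndexType offset = 0;
    for (size_t i = 0; i < num_dims; ++i) { offset += index[i] * strides_[i]; }
    return offset;
  }

  __host__ __device__ void OffsetToNdIndex(IndexType offset, IndexType* index) const {
    IndexType remaining = offset;
    for (size_t i = 0; i < num_dims; ++i) {
      index[i] = remaining / strides_[i];
      remaining -= index[i] * strides_[i];
    }
  }

 private:
  IndexType strides_[num_dims];
};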
template<size_t num_dims, size_t movement_size, typename IndexType>
__global__ void PermuteKernel(PermuteKernelParams<num_dims, IndexType> params) {
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  const T* src = reinterpret_cast<const T*>(params.src);
  T* dst = reinterpret_cast<T*>(params.dst);

  IndexType src_index[num_dims];
  IndexType dst_index[num_dims];
  CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) {
    params.dst_index_helper.OffsetToNdIndex(i, dst_index);
#pragma unroll
    for (size_t dim = 0; dim < num_dims; ++dim) {
      src_index[params.permutation[dim]] = dst_index[dim];
    }
    IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index);
    dst[i] = src[src_offset];
  }
}
PermuteKernelParams is a struct that holds a pre-initialized NdIndexOffsetHelper for src and one for dst, the total element count count, and the permuted dimension order permutation.
First we obtain dst_index, the multi-dimensional index of the output element currently being processed, and scatter it into the input index src_index according to the permutation.
We then convert the input index into the one-dimensional offset src_offset, load the input element, and write it to the corresponding output position.
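To show how these pieces fit together on the host side, here is a rough sketch of a params struct and its setup, reusing the SimpleNdIndexOffsetHelper sketched earlier. The struct name, field layout, and helper functions are assumptions based on the description above, not OneFlow's exact PermuteKernelParams.

#include <cstdint>

// Sketch of a params struct matching the fields described above; the real
// PermuteKernelParams in OneFlow may be laid out differently.
template<size_t num_dims, typename IndexType>
struct PermuteKernelParamsSketch {
  SimpleNdIndexOffsetHelper<IndexType, num_dims> src_index_helper;
  SimpleNdIndexOffsetHelper<IndexType, num_dims> dst_index_helper;
  int permutation[num_dims];
  IndexType count;
  const void* src;
  void* dst;
};

// Host-side setup for a 4-d permute, e.g. (0, 1, 2, 3) -> (0, 2, 3, 1).
// Convention (same as the kernel): output dim i takes its extent from input dim permutation[i].
template<typename IndexType>
PermuteKernelParamsSketch<4, IndexType> MakeParamsSketch(const IndexType* src_dims,
                                                         const int* permutation,
                                                         const void* src, void* dst) {
  IndexType dst_dims[4];
  IndexType count = 1;
  for (int i = 0; i < 4; ++i) {
    dst_dims[i] = src_dims[permutation[i]];
    count *= src_dims[i];
  }
  return PermuteKernelParamsSketch<4, IndexType>{
      SimpleNdIndexOffsetHelper<IndexType, 4>(src_dims),
      SimpleNdIndexOffsetHelper<IndexType, 4>(dst_dims),
      {permutation[0], permutation[1], permutation[2], permutation[3]},
      count, src, dst};
}

// The kernel above would then be launched roughly as:
// PermuteKernel<4, sizeof(float), int64_t><<<grid, block>>>(params);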
3 Optimizations for the General Case
1. Dimensions of size 1 can simply be removed.
2. Dimensions that remain adjacent and in the same order after the permutation can be merged into a single dimension (a minimal sketch of this simplification pass follows the example below).
# (0, 1, 2, 3) -> (2, 3, 0, 1)
x = flow.randn(3, 4, 5, 6)
y = x.permute(2, 3, 0, 1)
y.shape
(5, 6, 3, 4)
# (0, 1, 2, 3) -> ((2, 3), (0, 1))
x2 = x.reshape(x.shape[0] * x.shape[1], x.shape[2] * x.shape[3])
y = x2.permute(1, 0)
y = y.reshape(x.shape[2], x.shape[3], x.shape[0], x.shape[1])
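Below is a minimal sketch of how an operator might apply rules 1 and 2 automatically before launching the kernel. The function and variable names are made up for illustration, and OneFlow's actual simplification pass may be organized differently.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Sketch: drop size-1 dims, then merge dims that are adjacent both in the source
// layout and in the permutation. Convention: output dim i takes source dim perm[i].
inline void SimplifyPermutationSketch(const std::vector<int64_t>& dims, const std::vector<int>& perm,
                                      std::vector<int64_t>* out_dims, std::vector<int>* out_perm) {
  // Step 1: remove size-1 dims and remap the permutation indices accordingly.
  std::vector<int64_t> d;
  std::vector<int> remap(dims.size(), -1);
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] != 1) {
      remap[i] = static_cast<int>(d.size());
      d.push_back(dims[i]);
    }
  }
  std::vector<int> p;
  for (int axis : perm) {
    if (remap[axis] != -1) { p.push_back(remap[axis]); }
  }
  // Step 2: walk the permutation and fold runs of consecutive source dims into one dim.
  std::vector<int64_t> run_size;  // merged size of each run, in output order
  std::vector<int> run_start;     // first source dim of each run
  for (size_t i = 0; i < p.size(); ++i) {
    if (i > 0 && p[i] == p[i - 1] + 1) {
      run_size.back() *= d[p[i]];  // extend the current run
    } else {
      run_size.push_back(d[p[i]]);
      run_start.push_back(p[i]);
    }
  }
  // Step 3: order the runs by source position to get the new source dims; the position
  // each run lands at gives the new, smaller permutation.
  std::vector<int> order(run_start.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), [&](int a, int b) { return run_start[a] < run_start[b]; });
  std::vector<int> new_pos(run_start.size());
  out_dims->assign(run_start.size(), 0);
  for (size_t i = 0; i < order.size(); ++i) {
    new_pos[order[i]] = static_cast<int>(i);
    (*out_dims)[i] = run_size[order[i]];
  }
  out_perm->resize(run_start.size());
  for (size_t i = 0; i < run_start.size(); ++i) { (*out_perm)[i] = new_pos[i]; }
}

// For the example above: dims (3, 4, 5, 6) with perm (2, 3, 0, 1) becomes
// dims (12, 30) with perm (1, 0).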
3. Use a larger access granularity
CUDA supports access granularities of 1B, 2B, 4B, 8B, and 16B; within this range, a larger granularity generally achieves higher effective bandwidth. Switching to a larger granularity requires that:
The last dimension is moved as a whole, i.e. it is still the last dimension after the permutation (permutation[n-1] == n-1), and its size in bytes is a multiple of the new access granularity.
The src and dst data pointers satisfy the alignment requirement of the new access granularity. A minimal sketch of such a check follows.
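Here is a minimal sketch of how the largest usable movement size could be chosen under these two conditions. The helper name is made up, and the logic only illustrates the checks described above rather than OneFlow's exact dispatch code.

#include <cstddef>
#include <cstdint>

// Sketch: choose the largest movement size (in bytes) allowed by the last dim and the
// pointer alignment. Assumes the last dim is unchanged by the permutation and that
// elem_size is a power of two (1, 2, 4, or 8).
inline size_t PickMovementSizeSketch(size_t elem_size, int64_t last_dim_size,
                                     const void* src, void* dst) {
  const uintptr_t src_addr = reinterpret_cast<uintptr_t>(src);
  const uintptr_t dst_addr = reinterpret_cast<uintptr_t>(dst);
  const int64_t last_dim_bytes = last_dim_size * static_cast<int64_t>(elem_size);
  for (size_t size = 16; size > elem_size; size /= 2) {  // try 16B, 8B, 4B, 2B in turn
    if (last_dim_bytes % size == 0 && src_addr % size == 0 && dst_addr % size == 0) {
      return size;  // the last dim can be viewed as packs of `size` bytes
    }
  }
  return elem_size;  // fall back to the element's natural granularity
}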
(0, 1, 2, 3) -> (0, 2, 1, 3)
Here Permute actually shows slightly higher bandwidth than the native Copy. That is because the Copy kernel does not unroll its loop to exploit instruction-level parallelism, whereas the Permute kernel does, so the comparison is for reference only.
(0, 1) -> (1, 0)
(0, 1, 2) -> (0, 2, 1)
template<size_t num_dims, size_t movement_size, size_t tile_size, typename IndexType>
__global__ void BatchTransposeKernel(const void* src_ptr, void* dst_ptr, IndexType H, IndexType W,
                                     IndexType num_tile_rows, IndexType num_tile_cols,
                                     int32_t block_nums) {
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  __shared__ T tile[tile_size][tile_size + 1];  // To avoid bank conflict.

  const T* src = reinterpret_cast<const T*>(src_ptr);
  T* dst = reinterpret_cast<T*>(dst_ptr);

  IndexType batch_num_tile = num_tile_rows * num_tile_cols;
  for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) {
    const IndexType batch_index = i / batch_num_tile;  // the index of batch.
    const IndexType flatten_index =
        i - batch_index * batch_num_tile;  // the flatten index of tile in a batch.
    const IndexType row_index = flatten_index / num_tile_cols;  // the row index of tile in a batch.
    const IndexType col_index =
        flatten_index - row_index * num_tile_cols;  // the col index of tile in a batch.
    const IndexType offset = batch_index * H * W;

    // Cooperatively load a tile_size x tile_size tile from src into shared memory.
    IndexType x = col_index * tile_size + threadIdx.x;
    IndexType y = row_index * tile_size + threadIdx.y;
    if (x < W) {
      IndexType y_range =
          ((tile_size - threadIdx.y) < (H - y)) ? (tile_size - threadIdx.y) : (H - y);
#pragma unroll
      // `i < y_range` equals to: `threadIdx.y + i < tile_size && y + i < H`.
      for (int i = 0; i < y_range; i += kBlockRows) {
        tile[threadIdx.y + i][threadIdx.x] = src[offset + (y + i) * W + x];
      }
    }
    __syncthreads();

    // Write the tile back transposed: rows of shared memory become columns of dst.
    x = row_index * tile_size + threadIdx.x;
    y = col_index * tile_size + threadIdx.y;
    if (x < H) {
      IndexType x_range =
          ((tile_size - threadIdx.y) < (W - y)) ? (tile_size - threadIdx.y) : (W - y);
#pragma unroll
      // `i < x_range` equals to: `threadIdx.y + i < tile_size && y + i < W`.
      for (int i = 0; i < x_range; i += kBlockRows) {
        dst[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
      }
    }
    __syncthreads();
  }
}
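For context, here is a rough sketch of how this kernel could be launched. The tile size of 32, the 32x8 thread block (kBlockRows = 8), and the grid cap are assumptions that are merely consistent with the indexing inside the kernel; they are not necessarily OneFlow's actual launch parameters.

#include <cuda_runtime.h>

// Assumed launch configuration: each block covers a 32x32 tile with 32x8 threads,
// so every thread copies tile_size / kBlockRows = 4 elements per tile.
constexpr size_t kTileSize = 32;
constexpr int kBlockRows = 8;  // assumed value of the constant used inside the kernel

// Sketch of a launcher for the (0, 1, 2) -> (0, 2, 1) case on float data.
inline void LaunchBatchTransposeSketch(const float* src, float* dst, int64_t num_batches,
                                       int64_t H, int64_t W, cudaStream_t stream) {
  const int64_t tile = static_cast<int64_t>(kTileSize);
  const int64_t num_tile_rows = (H + tile - 1) / tile;  // tiles along H
  const int64_t num_tile_cols = (W + tile - 1) / tile;  // tiles along W
  const int32_t block_nums = static_cast<int32_t>(num_batches * num_tile_rows * num_tile_cols);
  const dim3 block_dim(kTileSize, kBlockRows);
  // Launch at most 4096 blocks (an arbitrary cap); the grid-stride loop in the
  // kernel walks over any remaining tiles.
  const int grid_dim = block_nums < 4096 ? block_nums : 4096;
  BatchTransposeKernel<3, sizeof(float), kTileSize, int64_t>
      <<<grid_dim, block_dim, 0, stream>>>(src, dst, H, W, num_tile_rows, num_tile_cols,
                                           block_nums);
}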
Explicitly unrolling the loop
In an earlier version, our for loop was written as follows:
#pragma unroll
for (int i = 0; threadIdx.y + i < tile_size && y + i < H; i += kBlockRows) {
...
}
Even with the #pragma unroll directive, the assembly shown in Nsight Compute contains only two related instructions, which means this part of the loop was not actually unrolled. After the loop bound is computed up front as y_range, the trip count no longer depends on the compound condition, and the compiler can unroll the loop:
IndexType y_range = ((tile_size - threadIdx.y) < (H - y)) ? (tile_size - threadIdx.y) : (H - y);
#pragma unroll
for (int i = 0; i < y_range; i += kBlockRows) {
...
}
Optimizing the half2 version
template<size_t num_dims, size_t tile_size, typename IndexType>
__global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr, IndexType rows,
                                              IndexType cols, IndexType num_tile_rows,
                                              IndexType num_tile_cols, int32_t block_nums) {
  static_assert(tile_size % 2 == 0);
  using T_MOV2 = typename std::aligned_storage<2, 2>::type;
  using T_MOV4 = typename std::aligned_storage<4, 4>::type;

  const T_MOV4* src = reinterpret_cast<const T_MOV4*>(src_ptr);
  T_MOV4* dst = reinterpret_cast<T_MOV4*>(dst_ptr);

  // Use union structure to process Load and Store.
  __shared__ union {
    T_MOV2 tile_m2[tile_size][tile_size + 2];      // half [64][66]
    T_MOV4 tile_m4[tile_size][tile_size / 2 + 1];  // half2 [64][33]
  } tile_mem;

  IndexType batch_num_tile = num_tile_rows * num_tile_cols;
  for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) {
    const IndexType batch_index = i / batch_num_tile;  // the index of batch.
    const IndexType flatten_index =
        i - batch_index * batch_num_tile;  // the flatten index of tile in a batch.
    const IndexType row_index = flatten_index / num_tile_cols;  // the row index of tile in a batch.
    const IndexType col_index =
        flatten_index - row_index * num_tile_cols;  // equal to i % num_tile_cols. the col index of tile in a batch.
    const IndexType offset = batch_index * rows * cols;

    IndexType x = col_index * tile_size
                  + threadIdx.x * 2;  // each thread processes a half2 element, so threadIdx.x is multiplied by 2.
    IndexType y = row_index * tile_size + threadIdx.y;
    if (x < cols) {
      // each thread processes 4 elements.
      IndexType y_range =
          ((tile_size - threadIdx.y) < (rows - y)) ? (tile_size - threadIdx.y) : (rows - y);
#pragma unroll
      // `i < y_range` equals to: `threadIdx.y + i < tile_size && y + i < rows`.
      for (int i = 0; i < y_range; i += kBlockRows) {
        // each thread loads a half2.
        tile_mem.tile_m4[threadIdx.y + i][threadIdx.x] = src[(offset + (y + i) * cols + x) / 2];
      }
    }
    __syncthreads();

    x = row_index * tile_size
        + threadIdx.x * 2;  // each thread processes a half2 element, so threadIdx.x is multiplied by 2.
    y = col_index * tile_size + threadIdx.y;
    if (x < rows) {
      IndexType x_range =
          ((tile_size - threadIdx.y) < (cols - y)) ? (tile_size - threadIdx.y) : (cols - y);
#pragma unroll
      // `i < x_range` equals to: `threadIdx.y + i < tile_size && y + i < cols`.
      for (int i = 0; i < x_range; i += kBlockRows) {
        /*
        When writing back along a column, the two halves are no longer contiguous in dst's
        source layout, so they cannot be read from shared memory as one half2 directly.
        We read them as 2 separate half elements and combine them before the vectorized store.
        */
        union {
          T_MOV4 m4;
          T_MOV2 m2[2];
        } tmp_storage;
        tmp_storage.m2[0] = tile_mem.tile_m2[threadIdx.x * 2][threadIdx.y + i];
        tmp_storage.m2[1] = tile_mem.tile_m2[threadIdx.x * 2 + 1][threadIdx.y + i];
        dst[(offset + (y + i) * rows + x) / 2] = tmp_storage.m4;
      }
    }
    __syncthreads();
  }
}
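For completeness, here is a rough launch sketch for the half2 kernel under the same assumptions as before: a 64x64 half tile (as suggested by the comments in the shared-memory union), blockDim.x = tile_size / 2 because each thread handles one half2, and blockDim.y matching the kernel's kBlockRows (assumed to be 8). Even rows and cols are also assumed so that every half2 access stays aligned; this is illustrative rather than OneFlow's exact dispatch logic.

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Assumed tile configuration for the half2 kernel: a 64x64 tile of half values,
// processed by 32x8 threads, each thread moving one half2 (two halves) per step.
constexpr size_t kMov2TileSize = 64;

inline void LaunchBatchTransposeHalf2Sketch(const half* src, half* dst, int64_t num_batches,
                                            int64_t rows, int64_t cols, cudaStream_t stream) {
  // rows and cols are assumed to be even so that every half2 load/store stays aligned.
  const int64_t tile = static_cast<int64_t>(kMov2TileSize);
  const int64_t num_tile_rows = (rows + tile - 1) / tile;
  const int64_t num_tile_cols = (cols + tile - 1) / tile;
  const int32_t block_nums = static_cast<int32_t>(num_batches * num_tile_rows * num_tile_cols);
  // blockDim.x covers half the tile width (each thread owns a half2);
  // blockDim.y must match the kBlockRows constant used inside the kernel (assumed 8).
  const dim3 block_dim(kMov2TileSize / 2, 8);
  BatchTransposeMovement2Kernel<3, kMov2TileSize, int64_t>
      <<<block_nums, block_dim, 0, stream>>>(src, dst, rows, cols, num_tile_rows,
                                             num_tile_cols, block_nums);
}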
5 Future Optimization Directions
6 Outlook
https://developer.nvidia.com/blog/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
https://on-demand.gputechconf.com/gtc/2018/presentation/s81006-volta-architecture-and-performance-optimization.pdf