cuda learning material
Published:
cuda learning notes
Introduction
I recently read "Hands-On GPU-Accelerated Computer Vision with OpenCV and CUDA" (its code is hosted on GitHub). The book introduces the common concepts of NVIDIA CUDA, such as grids, blocks, threads, global memory, shared memory, constant memory, texture memory, CUDA events, and CUDA streams, together with their typical usage patterns. It also covers the commonly used CUDA-accelerated functions in OpenCV, including basic image preprocessing, thresholding, filtering, feature point detection and matching, and color- and shape-based object detection & tracking, as well as deploying these CUDA algorithms on embedded devices such as the NVIDIA Jetson TX1. Overall it is a good book and fairly easy to read.
Examples
Below are a few simple examples:
- GPU Addition
__global__ void add_kernel(int* da, int* db, int* dc, int N) {
    // grid-stride loop: each thread handles elements gridDim.x * blockDim.x
    // apart, so any grid size can cover an arbitrarily large array
    int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    while (tidx < N) {
        dc[tidx] = da[tidx] + db[tidx];
        tidx += gridDim.x * blockDim.x;
    }
}
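A minimal host-side launch might look like the sketch below (test_add is a hypothetical helper name, and the array length and 256 x 256 launch configuration are illustrative assumptions; the grid-stride loop makes any grid size correct):

void test_add() {
    const int N = 1 << 20;
    int *da, *db, *dc;
    cudaMalloc((void**)&da, N * sizeof(int));
    cudaMalloc((void**)&db, N * sizeof(int));
    cudaMalloc((void**)&dc, N * sizeof(int));
    // ... copy the input arrays into da and db with cudaMemcpy ...
    add_kernel<<<256, 256>>>(da, db, dc, N);  // 256 blocks of 256 threads
    cudaDeviceSynchronize();
    // ... copy dc back to the host, then cudaFree da/db/dc ...
}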
- GPU Rank Sort
__global__ void ranksortKernel(int* da, int* db) {
    int count = 0;
    int tid = threadIdx.x;
    int ttid = blockIdx.x * threadPerBlock + tid;
    int val = da[ttid];  // the element whose rank this thread computes
    __shared__ int cache[threadPerBlock];
    // Stream the whole array through shared memory one tile at a time.
    // Every block caches the same tiles in parallel, and each thread counts
    // how many elements are smaller than its own value (values are assumed
    // distinct, otherwise two threads would write the same output slot).
    for (int i = tid; i < arraySize; i += threadPerBlock) {
        cache[tid] = da[i];
        __syncthreads();  // wait until the tile is fully loaded
        for (int j = 0; j < threadPerBlock; j++) {
            if (val > cache[j])
                count++;
        }
        __syncthreads();  // don't overwrite the tile while others still read it
    }
    // count is the rank of val, i.e. its position in the sorted array
    db[count] = val;
}
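The kernel relies on two compile-time constants that the excerpt does not define. A minimal host-side sketch, under the assumption that arraySize is a multiple of threadPerBlock (the sizes below are illustrative):

#define threadPerBlock 512
#define arraySize 1024

void test_ranksort() {
    int *da, *db;
    cudaMalloc((void**)&da, arraySize * sizeof(int));
    cudaMalloc((void**)&db, arraySize * sizeof(int));
    // ... copy the unsorted (distinct-valued) input into da ...
    // one thread per element: each thread computes the rank of its element
    ranksortKernel<<<arraySize / threadPerBlock, threadPerBlock>>>(da, db);
    // ... copy db back to the host: it now holds the array in sorted order ...
    cudaFree(da);
    cudaFree(db);
}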
- GPU Matrix Multiply
__global__ void gpu_matrix_mul_kernel(float* da, float* db, float* dc, const int size) {
    // each block owns one TileSize x TileSize tile of the output;
    // shared_a and shared_b cache the matching tiles of the two inputs
    __shared__ float shared_a[TileSize][TileSize];
    __shared__ float shared_b[TileSize][TileSize];
    int col = threadIdx.x + blockIdx.x * TileSize;
    int row = threadIdx.y + blockIdx.y * TileSize;
    float sum = 0.0f;  // accumulate locally rather than in global memory
    for (int i = 0; i < size / TileSize; i++) {  // assumes TileSize divides size
        shared_a[threadIdx.y][threadIdx.x] = da[row * size + (i * TileSize + threadIdx.x)];
        shared_b[threadIdx.y][threadIdx.x] = db[col + (i * TileSize + threadIdx.y) * size];
        __syncthreads();  // all threads of the block fill the tiles before use
        for (int j = 0; j < TileSize; j++) {
            sum += shared_a[threadIdx.y][j] * shared_b[j][threadIdx.x];
        }
        __syncthreads();  // finish reading before the next tiles are loaded
    }
    dc[row * size + col] = sum;
}
A typical host-side invocation looks like this:
void test() {
    const int size = 4;  // TileSize must divide size for the kernel above
    float ha[size][size], hb[size][size], h_result[size][size];
    float *da, *db, *d_result;
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            ha[i][j] = i;
            hb[i][j] = j;
        }
    }
    // note the address-of operator &: cudaMalloc writes the device pointer back
    cudaMalloc((void**)&da, sizeof(float) * size * size);
    cudaMalloc((void**)&db, sizeof(float) * size * size);
    cudaMalloc((void**)&d_result, sizeof(float) * size * size);
    cudaMemcpy(da, ha, sizeof(float) * size * size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, sizeof(float) * size * size, cudaMemcpyHostToDevice);
    dim3 block(TileSize, TileSize);
    dim3 grid(size / TileSize, size / TileSize);
    gpu_matrix_mul_kernel<<<grid, block>>>(da, db, d_result, size);  // kernel launch
    cudaMemcpy(h_result, d_result, sizeof(float) * size * size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            std::cout << h_result[i][j] << " ";
        }
        std::cout << std::endl;
    }
    cudaFree(da);
    cudaFree(db);
    cudaFree(d_result);
}
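As a quick sanity check: with ha[i][j] = i and hb[i][j] = j, every output element is the sum over k of i * j, i.e. size * i * j. For size = 4 the printed matrix should therefore be 4 * i * j per element: row 0 is all zeros, row 1 reads 0 4 8 12, and row 3 reads 0 12 24 36.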
- GPU RGB Image To Gray
__global__ void gray(uchar4* d_in, uchar* d_out, int rows, int cols) {
    // flatten the 2D grid and 2D block into one linear pixel index
    int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    int threadid = blockDim.x * blockDim.y * blockid + threadIdx.y * blockDim.x + threadIdx.x;
    if (threadid < rows * cols) {
        uchar4 rgba = d_in[threadid];  // assumes RGBA channel order
        // standard luminance weights: Y = 0.299 R + 0.587 G + 0.114 B
        d_out[threadid] = 0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z;
    }
}
cv::Mat cudafunc(cv::Mat rgba_image) {
    int rows = rgba_image.rows;
    int cols = rgba_image.cols;
    uchar4* h_in = (uchar4*)rgba_image.data;
    cv::Mat out(rows, cols, CV_8UC1);  // owns the output buffer, so nothing leaks
    uchar4* d_in;
    uchar* d_out;
    cudaMalloc((void**)&d_in, sizeof(uchar4) * rows * cols);
    cudaMalloc((void**)&d_out, sizeof(uchar) * rows * cols);
    // copy data from host to device
    cudaMemcpy(d_in, h_in, sizeof(uchar4) * rows * cols, cudaMemcpyHostToDevice);
    // launch enough 32x32 blocks to cover every pixel (ceiling division)
    const dim3 blocksize(32, 32, 1);
    const dim3 gridsize((rows + 31) / 32, (cols + 31) / 32, 1);
    gray<<<gridsize, blocksize>>>(d_in, d_out, rows, cols);
    cudaMemcpy(out.data, d_out, sizeof(uchar) * rows * cols, cudaMemcpyDeviceToHost);
    cudaFree(d_in);
    cudaFree(d_out);
    return out;
}
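Because the kernel reads channels in RGBA order while cv::imread returns BGR, the image must be converted before calling cudafunc. A minimal usage sketch (the file names are illustrative):

cv::Mat bgr = cv::imread("input.png");
cv::Mat rgba;
cv::cvtColor(bgr, rgba, cv::COLOR_BGR2RGBA);
cv::Mat gray_image = cudafunc(rgba);
cv::imwrite("gray.png", gray_image);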
- GPU Histogram Calculation
__global__ void histo_kernel(unsigned char* buffer, long size, unsigned int* histo) {
    // one shared-memory histogram per block (requires blockDim.x == 256)
    __shared__ unsigned int temp[256];
    temp[threadIdx.x] = 0;
    __syncthreads();
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    // grid-stride loop: each block accumulates its share of pixels locally
    while (i < size) {
        atomicAdd(&temp[buffer[i]], 1);
        i += offset;
    }
    __syncthreads();
    // merge the per-block counts into the global histogram
    atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
}
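histo_kernel only works when launched with exactly 256 threads per block (one per intensity bin) and a zero-initialized output histogram. A minimal launch sketch, where d_buffer is an assumed device pointer to the image pixels, size is its length, and the block count of 64 is illustrative:

unsigned int* d_histo;
cudaMalloc((void**)&d_histo, 256 * sizeof(unsigned int));
cudaMemset(d_histo, 0, 256 * sizeof(unsigned int));  // bins must start at zero
histo_kernel<<<64, 256>>>(d_buffer, size, d_histo);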
- GPU MEAN FILTER
#define TILE_W 16
#define TILE_H 16
#define R 2                            // filter radius
#define BLOCK_W (TILE_W + (2 * R))     // tile plus halo on both sides
#define BLOCK_H (TILE_H + (2 * R))
#define S ((2 * R + 1) * (2 * R + 1))  // filter window area (5 x 5 = 25)
__global__ void filter(unsigned char* d_in, unsigned char* d_out, int width, int height) {
    __shared__ unsigned char smem[BLOCK_W * BLOCK_H];
    // map the thread index to an image coordinate, shifted by -R for the halo
    int x = blockIdx.x * TILE_W + threadIdx.x - R;
    int y = blockIdx.y * TILE_H + threadIdx.y - R;
    // clamp to the image border
    x = max(0, x);
    x = min(x, width - 1);
    y = max(0, y);
    y = min(y, height - 1);
    int index = y * width + x;
    int bindex = threadIdx.y * blockDim.x + threadIdx.x;
    // every thread (tile and halo alike) loads one pixel into shared memory
    smem[bindex] = d_in[index];
    __syncthreads();
    // only interior (non-halo) threads compute and write an output pixel
    if (threadIdx.x >= R && threadIdx.x < (BLOCK_W - R) &&
        threadIdx.y >= R && threadIdx.y < (BLOCK_H - R)) {
        float sum = 0;
        for (int dy = -R; dy <= R; dy++) {
            for (int dx = -R; dx <= R; dx++) {
                sum += smem[bindex + dy * blockDim.x + dx];
            }
        }
        d_out[index] = int(sum / S);  // average over the window
    }
}
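Note that the launch configuration must include the halo: the block is (BLOCK_W, BLOCK_H) threads while the grid is sized by the inner tile, so each block produces a TILE_W x TILE_H patch of output. A minimal launch sketch, with d_in/d_out assumed to be device buffers of a width x height grayscale image:

dim3 block(BLOCK_W, BLOCK_H);
dim3 grid((width + TILE_W - 1) / TILE_W, (height + TILE_H - 1) / TILE_H);
filter<<<grid, block>>>(d_in, d_out, width, height);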