CUDA Learning Material


Introduction

I recently read the book "Hands-On GPU-Accelerated Computer Vision with OpenCV and CUDA", whose code is hosted on GitHub. It introduces the common concepts in NVIDIA CUDA, such as grids, blocks, threads, global memory, shared memory, constant memory, texture memory, CUDA events, and CUDA streams, along with their typical usage. It also covers the CUDA-accelerated functions in OpenCV, including basic image preprocessing, thresholding, filtering, feature detection and matching, and color- and shape-based object detection & tracking, and shows how to deploy these CUDA algorithms on embedded devices such as the NVIDIA Jetson TX1. Overall it is a good book that is fairly easy to read.

Examples

A few simple examples are walked through below:

  • GPU Addition
__global__ void add_kernel(int* da, int* db, int* dc, int N) {
    // Grid-stride loop: each thread processes elements one full grid apart,
    // so any launch configuration covers all N elements.
    int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    while(tidx < N) {
        dc[tidx] = da[tidx] + db[tidx];
        tidx += gridDim.x * blockDim.x;
    }
}
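
A typical host-side launch might look like the sketch below; the element count N and the launch configuration (128 blocks of 256 threads) are illustrative assumptions rather than values from the book. Because of the grid-stride loop, the grid does not have to cover all N elements exactly:

void test_add() {
    const int N = 1 << 20;
    int *da, *db, *dc;
    cudaMalloc((void**)&da, sizeof(int) * N);
    cudaMalloc((void**)&db, sizeof(int) * N);
    cudaMalloc((void**)&dc, sizeof(int) * N);
    // ... fill da and db from the host with cudaMemcpy ...
    add_kernel<<<128, 256>>>(da, db, dc, N);
    cudaDeviceSynchronize();
    // ... copy dc back with cudaMemcpy(..., cudaMemcpyDeviceToHost) ...
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
}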

  • GPU Rank Sort
__global__ void ranksortKernel(int* da, int* db) {
    int count = 0;
    int tid = threadIdx.x;
    int ttid = blockIdx.x * threadPerBlock + tid;
    int val = da[ttid];
    __shared__ int cache[threadPerBlock];
    for(int i = tid; i < arraySize; i += threadPerBlock) {
        // Each block stages one chunk of the data in shared memory, computes
        // on it, then stages the next chunk; the blocks run the outer loop in
        // parallel, each staging the same chunks in its own shared memory.
        cache[tid] = da[i];
        __syncthreads();
        // Count how many elements of this chunk are smaller than val.
        for(int j = 0; j < threadPerBlock; j++) {
            if(val > cache[j])
              count++;
        }
        __syncthreads(); // ensure the chunk is no longer being read before reloading
    }
    // count is the rank of val, i.e. its position in sorted order.
    db[count] = val;
}
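
Note that rank sort writes each value directly to its rank db[count], so the input values must be distinct; equal values would compute the same rank and overwrite each other. A minimal host-side sketch follows; the concrete values of arraySize and threadPerBlock are assumptions (the kernel only requires that the grid supplies one thread per element):

#define arraySize 1024
#define threadPerBlock 256

void test_ranksort(int* ha, int* h_sorted) {
    int *da, *db;
    cudaMalloc((void**)&da, sizeof(int) * arraySize);
    cudaMalloc((void**)&db, sizeof(int) * arraySize);
    cudaMemcpy(da, ha, sizeof(int) * arraySize, cudaMemcpyHostToDevice);
    // One thread per element: arraySize / threadPerBlock blocks.
    ranksortKernel<<<arraySize / threadPerBlock, threadPerBlock>>>(da, db);
    cudaMemcpy(h_sorted, db, sizeof(int) * arraySize, cudaMemcpyDeviceToHost);
    cudaFree(da);
    cudaFree(db);
}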

  • GPU Matrix Multiply
__global__ void gpu_matrix_mul_kernel(float* da, float* db, float* dc, const int size) {
    int row, col;
    col = threadIdx.x + blockIdx.x * TileSize;
    row = threadIdx.y + blockIdx.y * TileSize;
    // Each block has its own copy of shared_a and shared_b.
    __shared__ float shared_a[TileSize][TileSize];
    __shared__ float shared_b[TileSize][TileSize];
    float sum = 0.0f; // accumulate in a register instead of global memory
    for(int i = 0; i < size / TileSize; i++) {
      // All threads in the block cooperatively load one tile of each input.
      shared_a[threadIdx.y][threadIdx.x] = da[row * size + (i * TileSize + threadIdx.x)];
      shared_b[threadIdx.y][threadIdx.x] = db[col + (i * TileSize + threadIdx.y) * size];
      __syncthreads(); // wait until the whole tile is loaded
      for(int j = 0; j < TileSize; j++) {
          sum += shared_a[threadIdx.y][j] * shared_b[j][threadIdx.x];
      }
      __syncthreads(); // wait before the tiles are overwritten in the next iteration
    }
    dc[row * size + col] = sum;
}

A typical way of calling it looks like this:

void test() {
    const int size = 4;
    float ha[size][size], hb[size][size], h_result[size][size];
    float* da, * db, * d_result;
    for(int i = 0; i < size; i++) {
        for(int j = 0; j < size; j++) {
            ha[i][j] = i;
            hb[i][j] = j;
        }
    }
    cudaMalloc((void**)&da, sizeof(float) * size * size); // note the address-of operator &
    cudaMalloc((void**)&db, sizeof(float) * size * size);
    cudaMalloc((void**)&d_result, sizeof(float) * size * size);
    cudaMemcpy(da, ha, sizeof(float) * size * size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, sizeof(float) * size * size, cudaMemcpyHostToDevice);

    gpu_matrix_mul(da, db, d_result, size); // the kernel launch happens inside this function
    cudaMemcpy(h_result, d_result, sizeof(float) * size * size, cudaMemcpyDeviceToHost);
    for(int i = 0; i < size; i++) {
        for(int j = 0; j < size; j++) {
            std::cout << h_result[i][j] << " ";
        }
        std::cout << std::endl;
    }
    cudaFree(da);
    cudaFree(db);
    cudaFree(d_result);
}
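
The gpu_matrix_mul wrapper itself is not shown above; a minimal sketch, assuming TileSize evenly divides size (which the kernel requires), could be:

void gpu_matrix_mul(float* da, float* db, float* dc, const int size) {
    // One thread per output element, tiled into TileSize x TileSize blocks.
    dim3 block(TileSize, TileSize, 1);
    dim3 grid(size / TileSize, size / TileSize, 1);
    gpu_matrix_mul_kernel<<<grid, block>>>(da, db, dc, size);
    cudaDeviceSynchronize();
}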

  • GPU RGB Image To Gray
__global__ void gray(uchar4* d_in, uchar* d_out, int rows, int cols) {
    // Flatten the 2D grid/block layout into a linear pixel index.
    int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    int threadid = blockDim.x * blockDim.y * blockid + threadIdx.y * blockDim.x + threadIdx.x;

    if(threadid < rows * cols) {
        uchar4 rgba = d_in[threadid];
        // Standard luminance weights for the R, G and B channels.
        d_out[threadid] = 0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z;
    }
}

cv::Mat cudafunc(cv::Mat rgba_image) {
    int rows = rgba_image.rows;
    int cols = rgba_image.cols;

    uchar4* h_in = (uchar4*)rgba_image.data;
    uchar* h_out = (uchar*)malloc(sizeof(uchar) * rows * cols);

    uchar4* d_in;
    uchar* d_out;
    cudaMalloc((void**)&d_in, sizeof(uchar4) * rows * cols);
    cudaMalloc((void**)&d_out, sizeof(uchar) * rows * cols);

    // copy data from host to device
    cudaMemcpy(d_in, h_in, sizeof(uchar4) * rows * cols, cudaMemcpyHostToDevice);

    // launch enough 32x32 blocks to cover every pixel
    const dim3 blocksize(32, 32, 1);
    const dim3 gridsize((rows + 32 - 1) / 32, (cols + 32 - 1) / 32, 1);
    gray<<<gridsize, blocksize>>>(d_in, d_out, rows, cols);
    cudaMemcpy(h_out, d_out, sizeof(uchar) * rows * cols, cudaMemcpyDeviceToHost);

    cudaFree(d_in);
    cudaFree(d_out);

    // clone so the returned Mat owns its pixels, then release the staging buffer
    cv::Mat out = cv::Mat(rows, cols, CV_8UC1, h_out).clone();
    free(h_out);
    return out;
}
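
A hypothetical caller is sketched below (the file names are placeholders): cudafunc expects a 4-channel RGBA image, so a BGR image loaded with cv::imread has to be converted first:

int main() {
    cv::Mat bgr = cv::imread("input.jpg");
    cv::Mat rgba;
    cv::cvtColor(bgr, rgba, cv::COLOR_BGR2RGBA);
    cv::Mat gray_img = cudafunc(rgba);
    cv::imwrite("gray.jpg", gray_img);
    return 0;
}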

  • GPU Histogram Calculation
__global__ void histo_kernel(unsigned char* buffer, long size, unsigned int* histo) {
    // One 256-bin partial histogram per block, zeroed cooperatively
    // (the kernel must be launched with exactly 256 threads per block).
    __shared__ unsigned int temp[256];
    temp[threadIdx.x] = 0;
    __syncthreads();

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    // Each block accumulates pixel counts into its shared-memory copy.
    while(i < size){
        atomicAdd(&temp[buffer[i]], 1);
        i += offset;
    }
    __syncthreads();
    // Merge each block's partial histogram into the global histogram.
    atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
}
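
A minimal host-side sketch for this kernel follows; the block count of 32 is an illustrative assumption, but the 256 threads per block and the zero-initialized histogram are hard requirements of the kernel:

void test_histogram(unsigned char* d_buffer, long size) {
    unsigned int* d_histo;
    cudaMalloc((void**)&d_histo, 256 * sizeof(unsigned int));
    cudaMemset(d_histo, 0, 256 * sizeof(unsigned int)); // bins must start at zero
    histo_kernel<<<32, 256>>>(d_buffer, size, d_histo);
    unsigned int h_histo[256];
    cudaMemcpy(h_histo, d_histo, 256 * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    cudaFree(d_histo);
}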

  • GPU Mean Filter

#define TILE_W 16
#define TILE_H 16
#define R 2 // filter radius
#define BLOCK_W (TILE_W + (2 * R))
#define BLOCK_H (TILE_H + (2 * R))
#define S ((2 * R + 1) * (2 * R + 1)) // number of pixels in the filter window

__global__ void filter(unsigned char* d_in, unsigned char* d_out, int width, int height) {
  __shared__ unsigned char ssem[BLOCK_W * BLOCK_H];
  // Map thread indices to image coordinates, shifted by the apron radius R.
  int x = blockIdx.x * TILE_W + threadIdx.x - R;
  int y = blockIdx.y * TILE_H + threadIdx.y - R;

  // Clamp to the image border so apron threads read valid pixels.
  x = max(0, x);
  x = min(x, width - 1);
  y = max(0, y);
  y = min(y, height - 1);

  int index = y * width + x;
  int bindex = threadIdx.y * blockDim.x + threadIdx.x;

  // Every thread (tile and apron alike) stages one pixel in shared memory.
  ssem[bindex] = d_in[index];
  __syncthreads();

  // Only interior (non-apron) threads compute and write an output pixel.
  if(threadIdx.x >= R && threadIdx.x < (BLOCK_W - R) &&
     threadIdx.y >= R && threadIdx.y < (BLOCK_H - R)) {
      float sum = 0;
      for(int dy = -R; dy <= R; dy++) {
          for(int dx = -R; dx <= R; dx++) {
              sum += ssem[bindex + dy * blockDim.x + dx];
          }
      }
      d_out[index] = int(sum / S);
  }
}
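
The launch configuration is implied by the tile layout: each block must be BLOCK_W x BLOCK_H threads (tile plus apron) and produces one TILE_W x TILE_H tile of output. A minimal sketch:

void mean_filter(unsigned char* d_in, unsigned char* d_out, int width, int height) {
    dim3 block(BLOCK_W, BLOCK_H, 1);
    dim3 grid((width + TILE_W - 1) / TILE_W, (height + TILE_H - 1) / TILE_H, 1);
    filter<<<grid, block>>>(d_in, d_out, width, height);
    cudaDeviceSynchronize();
}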