center line extraction

7 minute read

Published:

图像中心线提取

一种中心线提取方法

最近偶然看到了一种图像上的中心线段提取方法,其提取结果如下图所示:

算法为基于hessian矩阵的Steger方法,其在CPU上的实现代码如下所示: 方法的大致流程包括高斯滤波,一阶偏导,二阶偏导,求解hessian矩阵的最大特征值对应的特征向量,最终将满足条件的图像像素放入中心线像素集合


// lane center extraction func
// Steger C. An unbiased detector of curvilinear structures. IEEE Trans Pattern Anal Mach Intell[J].
// IEEE Transactions on Pattern Analysis & Machine Intelligence, 1998, 20(2):113-125.
// https://blog.csdn.net/Dangkie/article/details/78996761
cv::Mat StegerLine(const cv::Mat image) {
  cv::Mat img;
  //高斯滤波
  cv::GaussianBlur(image, img, cv::Size(31, 11), 5, 2);
  //一阶偏导数
  cv::Mat m1, m2;
  m1 = (cv::Mat_<float>(1, 2) << 1 ,-1);  //x偏导
  m2 = (cv::Mat_<float>(2, 1) << 1 ,-1);  //y偏导

  cv::Mat dx, dy;
  filter2D(img, dx, CV_32FC1, m1);
  filter2D(img, dy, CV_32FC1, m2);

  //二阶偏导数
  cv::Mat m3, m4, m5;
  m3 = (cv::Mat_<float>(1, 3) << 1, -2, 1);   //二阶x偏导
  m4 = (cv::Mat_<float>(3, 1) << 1, -2, 1);   //二阶y偏导
  m5 = (cv::Mat_<float>(2, 2) << 1, -1, -1, 1);   //二阶xy偏导

  cv::Mat dxx, dyy, dxy;
  filter2D(img, dxx, CV_32FC1, m3);
  filter2D(img, dyy, CV_32FC1, m4);
  filter2D(img, dxy, CV_32FC1, m5);

  //hessian矩阵
  std::vector<double> Pt;
  for (int i = 0; i < img.cols; i++) {
    for (int j = 0; j < img.rows; j++) {
      if (image.at<float>(j, i) > 200 / 255.0) {
        cv::Mat hessian(2, 2, CV_32FC1);
        hessian.at<float>(0, 0) = dxx.at<float>(j, i);
        hessian.at<float>(0, 1) = dxy.at<float>(j, i);
        hessian.at<float>(1, 0) = dxy.at<float>(j, i);
        hessian.at<float>(1, 1) = dyy.at<float>(j, i);

        cv::Mat eValue;
        cv::Mat eVectors;
        eigen(hessian, eValue, eVectors);

        double nx, ny;
        if (fabs(eValue.at<float>(0, 0)) >= fabs(eValue.at<float>(1, 0)))  //求特征值最大时对应的特征向量
        {
          nx = eVectors.at<float>(0, 0);
          ny = eVectors.at<float>(0, 1);
        } else {
          nx = eVectors.at<float>(1, 0);
          ny = eVectors.at<float>(1, 1);
        }

//        std::cout<< "row, col, a, b, c, nx, ny: " << j << " "
//                 << i << " " << dxx.at<float>(j, i)<< " "
//                 << dxy.at<float>(j, i) << " " << dyy.at<float>(j, i)
//                 << " " << nx <<" " << ny <<" " << std::endl;

        double t = -(nx * dx.at<float>(j, i) + ny * dy.at<float>(j, i)) /
                   (nx * nx * dxx.at<float>(j, i) + 2 * nx * ny * dxy.at<float>(j, i) +
                    ny * ny * dyy.at<float>(j, i));

        if (fabs(t * nx) <= 0.5 && fabs(t * ny) <= 0.5) {
          Pt.push_back(i);
          Pt.push_back(j);
        }
      }
    }
  }
  cv::Mat center_line = cv::Mat::zeros(image.rows, image.cols, CV_32FC1);
  for (int k = 0; k < static_cast<int>(Pt.size() / 2); k++) {
    int i = Pt[2 * k + 0];
    int j = Pt[2 * k + 1];
    center_line.ptr<float>(j)[i] = 1.0;
  }
  return center_line;
}

上述方法在CPU上的执行速度较低,将其代码实现转为在GPU上进行执行,主要分为三个步骤:

  1. void GaussianBlur();
  2. void ComputeDeriv();
  3. void ComputeHessian();

其中第一步高斯滤波可以在global_memory上执行,也可以把filter kernel的参数放到constant memory上以便加快内存访问效率,也可以用shared_memory的方式来进一步加快卷积运算,同时,二维高斯滤波是一种可分离滤波器,可以分解为一维row filter和col filter的组合,这样的实现在nvidia cuda官方的sample代码中已有实现,如果你已经在机器上安装了cuda的化,其位于”/usr/local/cuda/samples/3_Imaging/convolutionSeparable”.

这里先给出利用constant memory和global memory进行卷积运算的代码:

首先需要先生成Gaussian Kernel:

void GaussianMatrix2d(float *array, int radius_w, int radius_h,
                      float sigma_x_square, float sigma_y_square) {
  int kernel_size_w = radius_w * 2 + 1;
  float normalizationFactor = 0.0;
  for (int i = -radius_w; i <= radius_w; i++) {
    for (int j = -radius_h; j <= radius_h; j++) {
      array[(j + radius_h) * kernel_size_w + i + radius_w] =
          exp((-1.0 * i * i) / (2 * sigma_x_square) +
              (-1.0 * j * j) / (2 * sigma_y_square));
      normalizationFactor +=
          array[(j + radius_h) * kernel_size_w + i + radius_w];
    }
  }
  // Normalize, since the Gaussian is truncated, we would like to integrate to 1
  for (int i = -radius_w; i <= radius_w; i++) {
    for (int j = -radius_h; j <= radius_h; j++) {
      array[(j + radius_h) * kernel_size_w + i + radius_w] /=
          normalizationFactor;
    }
  }
}
const int kernel_w = 31;
const int kernel_h = 11;
__device__ __constant__ float d_cFilterKernel[kernel_w * kernel_h];

void setConvolutionKernel(float *h_Kernel)
{
  cudaMemcpyToSymbol(d_cFilterKernel, h_Kernel, kernel_w * kernel_h * sizeof(float),
                     0, cudaMemcpyHostToDevice);
}

__global__ void GaussianBlurKernel_Constant(float* src, float* dst,
                                            int width, int height) {
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  const int kernel_radius_w = (kernel_w - 1) / 2;
  const int kernel_radius_h = (kernel_h - 1) / 2;

  if(col < width && row < height) {
    float convolved = 0.f; // temp variable for result
    int locX, locY; // Local x and y

    // Main loop. Iterates over all pixels in its domain, and then convolves the
    // Gaussian matrix with the submatrix around the pixel.
    for (int off_y = -kernel_radius_h ; off_y <= kernel_radius_h; off_y++)
    {
      for (int off_x = -kernel_radius_w; off_x <= kernel_radius_w; off_x++)
      {
        locX = min ( width-1, max(0, col + off_x));
        locY = min ( height-1,max(0, row + off_y));
        //convolved += src[locY* width + locX] *
        //             device_gaussian[(off_y + kernel_radius_h) * kernel_size_w + kernel_radius_w + off_x];
        convolved += src[locY * width + locX] *
                     d_cFilterKernel[(off_y + kernel_radius_h) * kernel_w + off_x + kernel_radius_w];
        //printf("%f ", d_cFilterKernel[(off_y + kernel_radius_h) * kernel_w + off_x + kernel_radius_w]);
      }
    }
    dst[row * width + col] = min(convolved, 1.0);
  }
}

对于第二步图像一阶微分和二阶微分的计算,由于滤波kernel比较简单, 直接采用global memory 进行计算:

__global__ void Filter2D_dxy_kernel(float* src, float* dst, int width, int height) {
  // dxy kernel
  // m5 = (cv::Mat_<float>(2, 2) << 1, -1, -1, 1);   //二阶xy偏导
  int col = threadIdx.x + blockIdx.x * blockDim.x;
  int row = threadIdx.y + blockIdx.y * blockDim.y;
  if(col < (width - 1) && row < (height - 1)) {
    float value = src[row * width + col] -
                  src[row * width + col + 1] -
                  src[(row + 1) * width + col] +
                  src[(row + 1) * width + col + 1];
    dst[row * width + col] = value;
  }
}

第三步需要进行二维矩阵的特征值和特征向量计算,这里直接给出实现代码:

__global__ void ComputeHessianKernel(float* src, float* dx, float* dy,
                                     float* dxx, float* dyy, float* dxy,
                                     float* result,
                                     int width, int height) {
  int col = threadIdx.x + blockIdx.x * blockDim.x;
  int row = threadIdx.y + blockIdx.y * blockDim.y;
  if(col < width && row < height && src[row * width + col] >  0.75) {
    // cal eigen vector and eigen value
    /*
     *   a  b
     *   b  c
     */
    float a = dxx[row * width + col];
    float b = dxy[row * width + col];
    float c = dyy[row * width + col];

    //求特征值最大时对应的特征向量
    float lambda1 = (a + c + sqrt((a - c) * (a - c) + 4 * b * b)) / 2.0;
    float lambda2 = (a + c - sqrt((a - c) * (a - c) + 4 * b * b)) / 2.0;
    double nx, ny;
    if(fabs(b) < 1e-15) {
      if(fabs(lambda1) >= fabs(lambda2)) {
        nx = 1.0;
        ny = 0.0;
      } else {
        nx = 0.0;
        ny = 1.0;
      }
    } else {
      if(fabs(lambda1) >= fabs(lambda2)) {
        float tmp_nx = 1.0;
        float tmp_ny = (lambda1 - a) / b;
        nx = tmp_nx / sqrt(1.0 + tmp_ny * tmp_ny);
        ny = tmp_ny / sqrt(1.0 + tmp_ny * tmp_ny);
      } else {
        float tmp_nx = 1.0;
        float tmp_ny = (lambda2 - a) / b;
        nx = tmp_nx / sqrt(1.0 + tmp_ny * tmp_ny);
        ny = tmp_ny / sqrt(1.0 + tmp_ny * tmp_ny);
      }
    }

//    if(row == 240 && col == 639) {
//      printf("a, b, c, nx, ny: %f, %f, %f, %f, %f\n", a, b, c, nx, ny);
//    }

    double t = -(nx * dx[row * width + col] + ny * dy[row * width + col]) /
              (nx * nx * dxx[row * width + col] + 2 * nx * ny * dxy[row * width + col] +
               ny * ny * dyy[row * width + col]);

    if(fabs(t * nx) <= 0.5 && fabs(t * ny) <= 0.5) {
      result[row * width + col] = 1.0;
    }
  }
}

经过上面的优化,可以达到约7-8倍的速度提升,同时中心线提取的效果与原有的一致