This post is kept mostly as an archive: Caffe is not the best way to run CenterNet from C++, but it is still an option, and this write-up continues from Running CenterNet Inference with Caffe (Part 1). In the previous post we implemented CenterNet's post-processing by hooking in an external libpostprocess.so shared library. That approach is clearly not very elegant: repeatedly allocating and freeing GPU memory can hurt the stability and throughput of the inference pipeline, so it is worth running the post-processing as a proper Caffe layer instead.
Moving the post-processing into a Caffe layer
Moving it into a Caffe layer means adding a new custom layer, and a new layer has to be declared with protobuf, so the first step is to modify caffe.proto.
Modifying caffe.proto
Here I define a CenternetOutput layer that performs CenterNet's post-processing. Two additions are needed in caffe.proto: a parameter field inside message LayerParameter (pick a field ID that is not already taken; here 209), and a new message describing the layer's parameters:
optional CenternetOutputParameter centernet_output_param = 209;
message CenternetOutputParameter {
  // Number of classes that are actually predicted. Required!
  optional uint32 num_classes = 1;
  optional uint32 kernel_size = 2 [default = 3];
  optional float vis_threshold = 3 [default = 0.3];
  optional bool apply_nms = 4 [default = false];
  optional uint32 feature_map_h = 5 [default = 0];
  optional uint32 feature_map_w = 6 [default = 0];
}
Then append the following to the end of the res50.prototxt from before. The three bottoms are CenterNet's last three outputs: hm, wh and reg:
layer {
  name: "centernet_output"
  type: "CenternetOutput"
  bottom: "conv_blob55"
  bottom: "conv_blob57"
  bottom: "conv_blob59"
  top: "result_out"
  centernet_output_param {
    num_classes: 2
    kernel_size: 3
    vis_threshold: 0.3
  }
}
After modifying the prototxt, the last few layers of the model look like this (CenternetOutput is the post-processing layer we just defined):
After the change, remember to regenerate caffe.pb.cc and caffe.pb.h, otherwise you will get Error parsing text-format caffe.NetParameter: 2715:26: Message type "caffe.LayerParameter" has no field named "centernet_output_param". It is best to run make clean and then rebuild.
For this post-processing layer we only need the forward pass, not the backward pass, so both Backward functions are simply stubbed out:
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  NOT_IMPLEMENTED;
}
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  NOT_IMPLEMENTED;
}
Our centernet_output_layer.hpp then looks like this:
#ifndef CAFFE_CENTERNET_OUTPUT_LAYER_H
#define CAFFE_CENTERNET_OUTPUT_LAYER_H

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

struct Box {
  float x1;
  float y1;
  float x2;
  float y2;
};

struct Detection {
  // x1 y1 x2 y2
  Box bbox;
  int classId;
  float prob;
};

namespace caffe {

/**
 * @brief Combine CenterNet (hm|wh|reg) layers to BoxOutput
 */
template <typename Dtype>
class CenternetOutputLayer : public Layer<Dtype> {
 public:
  explicit CenternetOutputLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  // Name of the layer type
  virtual inline const char* type() const { return "CenternetOutput"; }
  // Three input blobs are required; one output blob is produced
  virtual inline int ExactNumBottomBlobs() const { return 3; }
  virtual inline int ExactNumTopBlobs() const { return 1; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  /// @brief Not implemented
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
    NOT_IMPLEMENTED;
  }
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
    NOT_IMPLEMENTED;
  }

  int classes_;
  int kernel_size_;
  float vis_thresh_;
  int h_;
  int w_;
  Blob<float> output_bufs_;
};

}  // namespace caffe

#endif  // CAFFE_CENTERNET_OUTPUT_LAYER_H
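One detail the GPU path below relies on without saying so is that a Detection record occupies exactly one 6-float row of output_bufs_. A compile-time check along these lines (my own addition, not part of the original header) makes that assumption explicit:

#include <cstddef>

#include "caffe/layers/centernet_output_layer.hpp"  // for Box / Detection

// The GPU kernel packs Detection records into a float buffer whose rows are
// 6 floats wide, so the struct must be exactly 24 bytes with no padding and
// classId must sit directly after the four bbox floats.
static_assert(sizeof(Detection) == 6 * sizeof(float),
              "Detection must pack into one 6-float row of output_bufs_");
static_assert(offsetof(Detection, classId) == offsetof(Detection, bbox) + sizeof(Box),
              "classId is expected to follow the bbox with no padding");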
Next we write the two implementations that go with this header: a CPU version (.cpp) and a GPU version (.cu).
The CPU version of the layer
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

#include "caffe/layers/centernet_output_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

double sigmoid(double p) {
  return 1.0 / (1 + exp(-p * 1.0));
}

template <typename Dtype>
void CenternetOutputLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const CenternetOutputParameter& centernet_output_param =
      this->layer_param_.centernet_output_param();
  CHECK(centernet_output_param.has_num_classes()) << "Must specify num_classes";
  classes_ = centernet_output_param.num_classes();
  kernel_size_ = centernet_output_param.kernel_size();
  vis_thresh_ = centernet_output_param.vis_threshold();
  // The first float of the first row stores the detection count (the other
  // five floats stay 0); actual results start from the second row, up to 100
  // of them, hence 1 + 100 rows of 6 floats each.
  output_bufs_.Reshape(101, 6, 1, 1);
}

template <typename Dtype>
void CenternetOutputLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  CHECK_EQ(bottom.size(), 3);
  Blob<Dtype>* hm_blob = bottom[0];
  vector<int> hm_shape = hm_blob->shape();
  h_ = hm_shape[2];
  w_ = hm_shape[3];
  vector<int> top_shape;
  // The number of boxes is unknown before the forward pass, so reserve the
  // fixed maximum here (1 count row + 100 detections); Forward reshapes the
  // top blob to the actual count.
  top_shape.push_back(101);
  // Each row is a 6 dimension vector, which stores
  // [class, score, xmin, ymin, xmax, ymax]
  top_shape.push_back(6);
  top[0]->Reshape(top_shape);
}

template <typename Dtype>
void CenternetOutputLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* hm_data = bottom[0]->cpu_data();
  vector<int> hm_shape = bottom[0]->shape();
  const Dtype* wh_data = bottom[1]->cpu_data();
  const Dtype* reg_data = bottom[2]->cpu_data();

  int classes = classes_;
  int feature_size = hm_shape[2] * hm_shape[3];
  vector<vector<float>> fscore_max;
  for (int j = 0; j < classes; j++) {        // class
    for (int k = 0; k < feature_size; k++) {
      vector<float> temp_inx;
      temp_inx.push_back(j * feature_size + k);        // flat index
      temp_inx.push_back(hm_data[j * feature_size + k]);  // raw score
      fscore_max.push_back(temp_inx);
    }
  }
  std::sort(fscore_max.begin(), fscore_max.end(),
      [](const std::vector<float>& a, const std::vector<float>& b) {
        return a[1] > b[1];
      });
  // Examine the highest-scoring candidates (the output buffer holds at most
  // 100; only the top 10 are examined here).
  int iters = std::min<int>(fscore_max.size(), 10);
  int only_threshbox = 0;
  for (int i = 0; i < iters; i++) {
    // Apply sigmoid and filter by the visualization threshold.
    fscore_max[i][1] = sigmoid(fscore_max[i][1]);
    if (fscore_max[i][1] < vis_thresh_) {
      break;
    }
    only_threshbox++;
  }
  vector<vector<float>> boxes;
  for (int i = 0; i < only_threshbox; i++) {
    vector<float> box;
    int index = ((int)fscore_max[i][0]) / (hm_shape[2] * hm_shape[3]);
    int center_index = ((int)fscore_max[i][0]) % (hm_shape[2] * hm_shape[3]) - hm_shape[3];
    int cls = index;
    float xs = center_index % hm_shape[3];
    float ys = int(center_index / hm_shape[3]) % hm_shape[2];
    // Add the sub-pixel offset from reg (batch size 1).
    xs += reg_data[(int)(((int)ys) * hm_shape[3] + xs)];
    ys += reg_data[(int)(hm_shape[3] * hm_shape[2] + ((int)ys) * hm_shape[3] + xs)];
    float w = wh_data[(int)(((int)ys) * hm_shape[3] + xs)];
    float h = wh_data[(int)(hm_shape[2] * hm_shape[3] + ((int)ys) * hm_shape[3] + xs)];
    box.push_back((float)cls);
    box.push_back((float)fscore_max[i][1]);
    // Box corners. NOTE: unlike the GPU path below, these stay in feature-map
    // coordinates; multiply by the output stride (4) if input-image
    // coordinates are needed.
    box.push_back((float)(xs - w / 2.0));
    box.push_back((float)(ys - h / 2.0));
    box.push_back((float)(xs + w / 2.0));
    box.push_back((float)(ys + h / 2.0));
    boxes.push_back(box);
  }
  vector<int> top_shape;
  top_shape.push_back(boxes.size());
  // Each row is a 6 dimension vector, which stores
  // [class, score, xmin, ymin, xmax, ymax]
  top_shape.push_back(6);
  Dtype* top_data;
  top[0]->Reshape(top_shape);
  top_data = top[0]->mutable_cpu_data();
  printf("boxes size: %d \n", (int)boxes.size());
  for (size_t i = 0; i < boxes.size(); i++) {
    vector<float> box = boxes[i];
    top_data[i * 6] = box[0];
    top_data[i * 6 + 1] = box[1];
    top_data[i * 6 + 2] = box[2];
    top_data[i * 6 + 3] = box[3];
    top_data[i * 6 + 4] = box[4];
    top_data[i * 6 + 5] = box[5];
  }
}

#ifdef CPU_ONLY
STUB_GPU_FORWARD(CenternetOutputLayer, Forward);
#endif

INSTANTIATE_CLASS(CenternetOutputLayer);
REGISTER_LAYER_CLASS(CenternetOutput);

}  // namespace caffe
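To make the index arithmetic in Forward_cpu easier to follow, here is a tiny standalone sketch of how a flat heatmap index is meant to decode into a class, a cell and finally a box via the wh and reg values at that cell. All numbers are made up for illustration and are not taken from the model:

#include <cstdio>

// Standalone illustration of the decode step, assuming a 2-class model with a
// 128x128 feature map (hypothetical numbers, not from the article).
int main() {
  const int h = 128, w = 128;
  const int feature_size = h * w;

  // Suppose the top-scoring heatmap element has this flat index.
  int flat = 1 * feature_size + 60 * w + 45;  // class 1, row 60, col 45
  int cls  = flat / feature_size;
  int cell = flat % feature_size;
  float ys = cell / w;                        // integer row of the peak
  float xs = cell % w;                        // integer column of the peak

  // reg holds the sub-pixel offset, wh the box size, both read at the peak cell.
  float reg_x = 0.3f, reg_y = 0.7f, box_w = 12.f, box_h = 20.f;  // made-up values
  xs += reg_x;
  ys += reg_y;
  printf("class %d  box: [%.1f, %.1f, %.1f, %.1f]\n",
         cls, xs - box_w / 2, ys - box_h / 2, xs + box_w / 2, ys + box_h / 2);
  return 0;
}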
The GPU version is below; its post-processing kernel borrows from https://github.com/CaoWGG/TensorRT-CenterNet.
#include <iostream>
#include <vector>

#include "caffe/layers/centernet_output_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

#ifndef BLOCK
#define BLOCK 512
#endif

template <typename Dtype>
__device__ float Logist(Dtype data) {
  return 1. / (1. + exp(-data));
}

template <typename Dtype>
__global__ void CTdetforward_kernel(const Dtype *hm, const Dtype *reg,
    const Dtype *wh, float *output, const int w, const int h,
    const int classes, const int kernel_size, const float visthresh) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= w * h * classes) return;
  int padding = (kernel_size - 1) / 2;
  int offset = -padding;
  int stride = w * h;
  int grid_x = idx % w;
  int grid_y = (idx / w) % h;
  int cls = idx / w / h;
  int l, m;
  int reg_index = idx - cls * stride;
  float c_x, c_y;
  float objProb = Logist(hm[idx]);
  if (objProb > visthresh) {
    float max = -1;
    int max_index = 0;
    // Keep this cell only if it is the maximum of its kernel_size x
    // kernel_size neighbourhood (CenterNet's max-pool style NMS).
    for (l = 0; l < kernel_size; ++l)
      for (m = 0; m < kernel_size; ++m) {
        int cur_x = offset + l + grid_x;
        int cur_y = offset + m + grid_y;
        int cur_index = cur_y * w + cur_x + stride * cls;
        int valid = (cur_x >= 0 && cur_x < w && cur_y >= 0 && cur_y < h);
        float val = (valid != 0) ? Logist(hm[cur_index]) : -1;
        max_index = (val > max) ? cur_index : max_index;
        max = (val > max) ? val : max;
      }
    if (idx == max_index) {
      // Atomically read-and-increment the detection count stored in the first
      // float of the output buffer; the old value is this detection's slot.
      int resCount = (int) atomicAdd(output, 1);
      // The buffer only has room for 100 detections; drop any extras so we
      // never write past the end of output_bufs_.
      if (resCount >= 100) return;
      // Skip the first row (6 floats, the same size as one Detection), which
      // holds the count, then jump to this detection's slot.
      char *data = (char *) output + sizeof(Detection) + resCount * sizeof(Detection);
      Detection *det = (Detection *) (data);
      c_x = grid_x + reg[reg_index];
      c_y = grid_y + reg[reg_index + stride];
      // The network's output stride is 4, so scale back to input-image pixels.
      det->bbox.x1 = (c_x - wh[reg_index] / 2) * 4;
      det->bbox.y1 = (c_y - wh[reg_index + stride] / 2) * 4;
      det->bbox.x2 = (c_x + wh[reg_index] / 2) * 4;
      det->bbox.y2 = (c_y + wh[reg_index + stride] / 2) * 4;
      det->classId = cls;
      det->prob = objProb;
    }
  }
}

template <typename Dtype>
void CenternetOutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  LOG(INFO) << "begin gpu";
  Blob<Dtype>* hm_blob = bottom[0];
  const Dtype* hm_data = hm_blob->gpu_data();
  vector<int> hm_shape = hm_blob->shape();
  Blob<Dtype>* wh_blob = bottom[1];
  const Dtype* wh_data = wh_blob->gpu_data();
  Blob<Dtype>* reg_blob = bottom[2];
  const Dtype* reg_data = reg_blob->gpu_data();

  // GPU memory allocated by Caffe is zero-initialized by default; still,
  // reset the detection count (the first float) before every forward pass.
  float* output_bufs = output_bufs_.mutable_gpu_data();
  caffe_gpu_memset(sizeof(float), 0, output_bufs);

  unsigned int num = w_ * h_ * classes_;
  int nbrBlocks = ceil((float)num / (float)BLOCK);
  CTdetforward_kernel<<<nbrBlocks, BLOCK>>>(hm_data, reg_data, wh_data,
      output_bufs, w_, h_, classes_, kernel_size_, vis_thresh_);

  // output_bufs_results is a pointer to const: the data cannot be modified
  // through it, but the pointer itself can be advanced.
  const float* output_bufs_results = output_bufs_.cpu_data();
  int num_det = static_cast<int>(output_bufs_results[0]);
  // The kernel caps the number of stored detections at 100, so cap the count
  // read back here as well.
  if (num_det > 100) num_det = 100;
  std::cout << "num_det: " << num_det << std::endl;

  vector<int> top_shape;
  top_shape.push_back(num_det);
  top_shape.push_back(6);
  Dtype* top_data;
  top[0]->Reshape(top_shape);
  top_data = top[0]->mutable_cpu_data();

  // Skip the count row (6 floats), then read each packed Detection record and
  // emit rows of [class, score, x1, y1, x2, y2].
  const float* temp_bufs_result = output_bufs_results + 6;
  for (int i = 0; i < num_det; i++) {
    const Detection* det = reinterpret_cast<const Detection*>(temp_bufs_result);
    top_data[i * 6]     = static_cast<Dtype>(det->classId);  // classId is stored as int
    top_data[i * 6 + 1] = det->prob;
    top_data[i * 6 + 2] = det->bbox.x1;
    top_data[i * 6 + 3] = det->bbox.y1;
    top_data[i * 6 + 4] = det->bbox.x2;
    top_data[i * 6 + 5] = det->bbox.y2;
    temp_bufs_result += 6;
  }
  CUDA_POST_KERNEL_CHECK;
}

INSTANTIATE_LAYER_GPU_FUNCS(CenternetOutputLayer);

}  // namespace caffe
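The if (idx == max_index) test in the kernel is CenterNet's usual max-pooling style NMS: a heatmap cell only survives if it is the maximum of its kernel_size x kernel_size neighbourhood. Written as plain CPU-side C++ (a sketch with my own function name, not part of the layer), the same test looks like this:

#include <vector>

// Returns true if heatmap cell (y, x) of class cls is the maximum of its
// kernel x kernel neighbourhood -- the max-pool style NMS used by CenterNet.
// hm is laid out as [classes][h][w] and holds raw logits; comparing logits is
// equivalent to comparing sigmoid outputs because sigmoid is monotonic.
bool IsLocalPeak(const std::vector<float>& hm, int w, int h, int cls,
                 int y, int x, int kernel = 3) {
  const int pad = (kernel - 1) / 2;
  const float center = hm[cls * w * h + y * w + x];
  for (int dy = -pad; dy <= pad; ++dy) {
    for (int dx = -pad; dx <= pad; ++dx) {
      const int cy = y + dy;
      const int cx = x + dx;
      if (cx < 0 || cx >= w || cy < 0 || cy >= h) continue;
      if (hm[cls * w * h + cy * w + cx] > center) return false;
    }
  }
  return true;
}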
With the CUDA post-processing moved into the Caffe layer, the inference code changes as follows:
std::vector<vector<float> > CenterNet_Detector::Detect(const cv::Mat& img) {
  int count = 1;
  std::vector<vector<float> > rlt;
  Blob<float>* input_layer = net_->input_blobs()[0];

  typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
  auto total_t0 = std::chrono::high_resolution_clock::now();
  auto t0 = std::chrono::high_resolution_clock::now();

  // Resize the model input to the actual width/height of the incoming image.
  input_layer->Reshape(1, num_channels_, img.rows, img.cols);
  input_geometry_ = cv::Size(input_layer->width(), input_layer->height());
  net_->Reshape();

  auto t1 = std::chrono::high_resolution_clock::now();
  double reshape_time = std::chrono::duration_cast<ms>(t1 - t0).count();
  std::cout << "Caffe Reshape time: " << std::fixed << std::setprecision(2)
            << reshape_time << " ms" << std::endl;

  t0 = std::chrono::high_resolution_clock::now();
  std::vector<cv::Mat> input_channels;
  WrapInputLayer(&input_channels);
  cv::Mat tm = Preprocess(img, &input_channels);
  t1 = std::chrono::high_resolution_clock::now();
  double preprocess_time = std::chrono::duration_cast<ms>(t1 - t0).count();
  std::cout << "Preprocess time: " << std::fixed << std::setprecision(2)
            << preprocess_time << " ms" << std::endl;

  while (count--) {
    t0 = std::chrono::high_resolution_clock::now();
    net_->Forward();
    t1 = std::chrono::high_resolution_clock::now();
    double net_time = std::chrono::duration_cast<ms>(t1 - t0).count();
    std::cout << "Net processing time: " << std::fixed << std::setprecision(2)
              << net_time << " ms" << std::endl;

    Blob<float>* result_blob = net_->output_blobs()[0];
    const float* result = result_blob->cpu_data();
    std::cout << "result shape: " << result_blob->shape_string() << std::endl;
    const int num_det = result_blob->num();
    std::cout << "num_det: " << num_det << std::endl;

    vector<vector<float> > predictions;
    for (int i = 0; i < num_det; ++i) {
      vector<float> prediction;
      prediction.push_back(result[0]);  // class
      prediction.push_back(result[1]);  // score
      printf("score: %f ", result[1]);
      prediction.push_back(result[2]);  // x1
      prediction.push_back(result[3]);  // y1
      prediction.push_back(result[4]);  // x2
      prediction.push_back(result[5]);  // y2
      // printf("after x1 y1 x2 y2: %f %f %f %f\n", result[2], result[3], result[4], result[5]);
      predictions.push_back(prediction);
      result += 6;
    }

    // Clone so that drawing does not modify the caller's image.
    cv::Mat temp = img.clone();
    std::cout << "boxes nums " << predictions.size() << std::endl;
    double total_time = std::chrono::duration_cast<ms>(t1 - total_t0).count();
    std::cout << "Total time: " << std::fixed << std::setprecision(2)
              << total_time << " ms" << std::endl;

    for (size_t i = 0; i < predictions.size(); i++) {
      vector<float> prediction = predictions[i];
      cv::rectangle(temp,
                    cv::Point((int)(prediction[2]), (int)(prediction[3])),
                    cv::Point((int)(prediction[4]), (int)(prediction[5])),
                    cv::Scalar(0, 0, 255), 1, 8, 0);
    }
    cv::imwrite("image_result.jpg", temp);
  }
  return rlt;
}
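Since each output row is [class, score, x1, y1, x2, y2] in the network-input coordinate frame (see the * 4 scaling in the GPU kernel), a small helper like the following (my own naming, not part of the original project) is handy for pulling out the boxes of one class before drawing or further processing:

#include <vector>
#include <opencv2/core.hpp>

// Hypothetical helper: collect the boxes of a single class from the layer
// output. Each prediction row is [class, score, x1, y1, x2, y2].
std::vector<cv::Rect> BoxesForClass(const std::vector<std::vector<float> >& predictions,
                                    int wanted_class, float min_score) {
  std::vector<cv::Rect> rects;
  for (const std::vector<float>& p : predictions) {
    if (static_cast<int>(p[0]) != wanted_class || p[1] < min_score) continue;
    rects.push_back(cv::Rect(cv::Point(static_cast<int>(p[2]), static_cast<int>(p[3])),
                             cv::Point(static_cast<int>(p[4]), static_cast<int>(p[5]))));
  }
  return rects;
}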
And that's it~
If compiling the .cu file fails with identifier "nullptr" is undefined, that is because the .cu code uses nullptr, so nvcc needs the -std=c++11 flag. In Caffe's Makefile, simply append -std=c++11 to NVCCFLAGS:
NVCCFLAGS += -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS) -std=c++11