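2. 那么 80*80*255、40*40*255、20*20*255 这些数字分别代表什么意思?(其中B是batch)80*80、40*40、20*20 是三个检测头输出特征图的高和宽;255 = 3*85,即每个位置有3个anchor,每个anchor对应85个值(4个框坐标 + 1个目标置信度 + 80个类别分数)。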
3. 上图输出的3个head并不是最终的输出,还需要做很多工作。如果直接这样输出,后续的解码代码会很麻烦,因此需要进一步处理这三个头,以方便后面的代码进行解码操作,具体做以下工作:
3.1 需要做sigmoid激活函数
3.2 xy*2-0.5
3.3 (wh*2)**2*anchor
3.4 拿到640尺度下的框
从中可以看到需要很多种操作,很麻烦,这些可以让onnx来做。为了让数据在连续的内存空间中更方便地访问,可以变换一下输出的通道顺序,即把原来的B*3*85*80*80变换为B*3*80*80*85。得到这样的tensor后可以很容易地进行操作,但是因为存在三个头,还是很麻烦,所以还可以继续合并,即B*19200*85(19200 = 3*80*80),其他两个头类似:
def forward(self, x):
    """Run the detection head over every feature level and decode the boxes.

    For each of the ``self.nl`` levels, ``x[i]`` is passed through ``self.m[i]``
    and rearranged from ``(bs, na*no, ny, nx)`` to ``(bs, na, ny, nx, no)``.
    In inference mode the result is additionally passed through a sigmoid,
    the xy/wh entries are decoded with the grid, stride and anchors, and all
    levels are flattened and concatenated along dimension 1.

    Args:
        x: list of per-level feature tensors; it is modified in place.

    Returns:
        ``x`` when ``self.training`` is true, otherwise the tuple
        ``(torch.cat(z, 1), x)`` where the first element has shape
        ``(bs, total_predictions, self.no)``.
    """
    z = []  # inference output
    for i in range(self.nl):
        x[i] = self.m[i](x[i])  # conv
        # Original: bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
        # The dims are converted to plain Python ints here (presumably so that
        # they are exported as constants -- confirm against the export path).
        bs, _, ny, nx = map(int, x[i].shape)  # x(bs,255,20,20) to x(bs,3,20,20,85)
        # Original: x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
        # The batch dimension is left as -1 so it is inferred by view().
        x[i] = x[i].view(-1, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

        if not self.training:  # inference
            # Rebuild the cached grid when its (ny, nx) no longer matches.
            if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
                self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

            y = x[i].sigmoid()
            if self.inplace:
                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)  # wh
                y = torch.cat((xy, wh, y[..., 4:]), -1)
            # Flatten (na, ny, nx) into a single prediction axis.
            z.append(y.view(bs, -1, self.no))

    return x if self.training else (torch.cat(z, 1), x)
// 1. Preprocessing kernel

/**
 * Affine-warps an interleaved 3-channel 8-bit image with bilinear
 * interpolation, normalizes it and writes the result as planar float data.
 *
 * Each thread handles one destination pixel: `position` is the flat pixel
 * index, decomposed into (dx, dy) with `dst_width`.
 *
 * @param src                    source image; pixels are read as 3 consecutive
 *                               bytes at `y * src_line_size + x * 3`
 * @param src_line_size          byte offset between consecutive source rows
 * @param src_width,src_height   source size in pixels
 * @param dst                    output, three consecutive planes of
 *                               dst_width * dst_height floats each
 * @param dst_width,dst_height   destination size in pixels
 * @param const_value_st         fill value used for samples outside the source
 * @param warp_affine_matrix_2_3 six floats (row-major 2x3) mapping destination
 *                               coordinates to source coordinates
 * @param norm                   normalization settings (type declared elsewhere)
 * @param edge                   number of destination pixels to process;
 *                               threads with position >= edge exit immediately
 */
static __global__ void warp_affine_bilinear_and_normalize_plane_kernel(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    uint8_t const_value_st, float* warp_affine_matrix_2_3, Norm norm, int edge) {

    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;

    float m_x1 = warp_affine_matrix_2_3[0];
    float m_y1 = warp_affine_matrix_2_3[1];
    float m_z1 = warp_affine_matrix_2_3[2];
    float m_x2 = warp_affine_matrix_2_3[3];
    float m_y2 = warp_affine_matrix_2_3[4];
    float m_z2 = warp_affine_matrix_2_3[5];

    int dx = position % dst_width;
    int dy = position / dst_width;

    // Project the destination pixel back into the source image.
    float src_x = m_x1 * dx + m_y1 * dy + m_z1;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        // Neighbours that fall outside the image keep pointing at this fill pixel.
        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        // Bilinear blend, rounded to the nearest integer value.
        c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
        c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
        c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
    }

    // Optionally swap the first and third channel.
    if (norm.channel_type == ChannelType::SwapRB) {
        float t = c2;
        c2 = c0;
        c0 = t;
    }

    if (norm.type == NormType::MeanStd) {
        c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
        c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
        c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
    } else if (norm.type == NormType::AlphaBeta) {
        c0 = c0 * norm.alpha + norm.beta;
        c1 = c1 * norm.alpha + norm.beta;
        c2 = c2 * norm.alpha + norm.beta;
    }

    // Planar output: one full plane per channel.
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

/**
 * Host-side launcher for warp_affine_bilinear_and_normalize_plane_kernel.
 *
 * Launches the kernel over dst_width * dst_height jobs on `stream`; the launch
 * is asynchronous with respect to the host. `grid_dims`, `block_dims` and
 * `checkCudaKernel` are defined elsewhere in the project.
 * NOTE(review): src, dst and matrix_2_3 are dereferenced by the kernel, so
 * they are assumed to be device-accessible pointers -- confirm at call sites.
 */
static void warp_affine_bilinear_and_normalize_plane(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    float* matrix_2_3, uint8_t const_value, const Norm& norm,
    cudaStream_t stream) {

    int jobs = dst_width * dst_height;
    auto grid = grid_dims(jobs);
    auto block = block_dims(jobs);

    // No dynamic shared memory; run on the caller's stream.
    checkCudaKernel(warp_affine_bilinear_and_normalize_plane_kernel<<<grid, block, 0, stream>>>(
        src, src_line_size,
        src_width, src_height, dst,
        dst_width, dst_height, const_value, matrix_2_3, norm, jobs
    ));
}

// 2. Decode kernel
// Number of floats stored per output box:
// left, top, right, bottom, confidence, class, keepflag
const int NUM_BOX_ELEMENT = 7;

/**
 * Applies a row-major 2x3 affine matrix to the point (x, y) and stores the
 * result in (*ox, *oy). The inputs are passed by value, so the outputs may
 * alias the variables that supplied x and y.
 */
static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy) {
    *ox = matrix[0] * x + matrix[1] * y + matrix[2];
    *oy = matrix[3] * x + matrix[4] * y + matrix[5];
}

/**
 * Decodes raw predictions into boxes, one thread per candidate.
 *
 * @param predict               num_bboxes rows of (5 + num_classes) floats:
 *                              cx, cy, width, height, objectness, class scores
 * @param num_bboxes            number of candidate rows
 * @param num_classes           number of class scores per row
 * @param confidence_threshold  minimum objectness and minimum
 *                              objectness * best class score
 * @param invert_affine_matrix  row-major 2x3 matrix applied to box corners
 * @param parray                output: parray[0] is a float counter that is
 *                              atomically incremented per accepted box,
 *                              followed by records of NUM_BOX_ELEMENT floats
 * @param max_objects           capacity of parray in records
 *
 * The counter is incremented even when the box is dropped for lack of space,
 * so parray[0] can exceed max_objects; readers must clamp it.
 * NOTE(review): parray[0] is presumably zeroed by the caller before launch --
 * this file section does not show it.
 */
static __global__ void decode_kernel(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float* invert_affine_matrix, float* parray, int max_objects) {

    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= num_bboxes) return;

    float* pitem = predict + (5 + num_classes) * position;
    float objectness = pitem[4];
    if (objectness < confidence_threshold)
        return;

    // Arg-max over the class scores.
    float* class_confidence = pitem + 5;
    float confidence = *class_confidence++;
    int label = 0;
    for (int i = 1; i < num_classes; ++i, ++class_confidence) {
        if (*class_confidence > confidence) {
            confidence = *class_confidence;
            label = i;
        }
    }

    confidence *= objectness;
    if (confidence < confidence_threshold)
        return;

    // Reserve an output slot.
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    float cx = *pitem++;
    float cy = *pitem++;
    float width = *pitem++;
    float height = *pitem++;
    float left = cx - width * 0.5f;
    float top = cy - height * 0.5f;
    float right = cx + width * 0.5f;
    float bottom = cy + height * 0.5f;
    affine_project(invert_affine_matrix, left, top, &left, &top);
    affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

    float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}

/**
 * Intersection-over-union of two boxes given as (left, top, right, bottom).
 * Returns 0 when the boxes do not overlap.
 */
static __device__ float box_iou(
    float aleft, float atop, float aright, float abottom,
    float bleft, float btop, float bright, float bbottom) {

    float cleft = max(aleft, bleft);
    float ctop = max(atop, btop);
    float cright = min(aright, bright);
    float cbottom = min(abottom, bbottom);

    float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
    if (c_area == 0.0f)
        return 0.0f;

    float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return c_area / (a_area + b_area - c_area);
}

/**
 * Non-maximum suppression over the boxes written by decode_kernel, one thread
 * per box.
 *
 * A box clears its own keepflag when another box of the same class with
 * greater-or-equal confidence overlaps it by more than `threshold`. For equal
 * confidences the comparison is skipped when the other box has a lower index,
 * so only one box of a tied pair is suppressed. Only element [6] is written
 * and only elements [0..5] are read, so threads do not depend on each other's
 * writes.
 *
 * @param bboxes       same layout as decode_kernel's parray
 * @param max_objects  record capacity; the stored count is clamped to it
 * @param threshold    IoU above which the weaker box is suppressed
 */
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold) {

    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    int count = min((int)*bboxes, max_objects);
    if (position >= count)
        return;

    // left, top, right, bottom, confidence, class, keepflag
    float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
    for (int i = 0; i < count; ++i) {
        float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
        if (i == position || pcurrent[5] != pitem[5]) continue;

        if (pitem[4] >= pcurrent[4]) {
            if (pitem[4] == pcurrent[4] && i < position)
                continue;

            float iou = box_iou(
                pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
                pitem[0],    pitem[1],    pitem[2],    pitem[3]
            );

            if (iou > threshold) {
                pcurrent[6] = 0;  // 1 = keep, 0 = ignore
                return;
            }
        }
    }
}

/**
 * Launches decode_kernel followed by fast_nms_kernel on `stream`.
 *
 * Both launches are issued to the same stream, so the NMS pass runs after the
 * decode pass has finished. Nothing is synchronized with the host here.
 * `grid_dims`, `block_dims` and `checkCudaKernel` are defined elsewhere in the
 * project.
 */
static void decode_kernel_invoker(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects,
    cudaStream_t stream) {

    auto grid = grid_dims(num_bboxes);
    auto block = block_dims(num_bboxes);
    checkCudaKernel(decode_kernel<<<grid, block, 0, stream>>>(
        predict, num_bboxes, num_classes, confidence_threshold,
        invert_affine_matrix, parray, max_objects));

    // The NMS pass is sized by the output capacity, not by the candidate count.
    grid = grid_dims(max_objects);
    block = block_dims(max_objects);
    checkCudaKernel(fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold));
}