Search code examples
c++ windows arm

Code conversion from ARM (NEON intrinsics) to x86/x64 Windows


Here is the function using arm_neon.h

void NeonMeanScale(const float *din, float *dout, int size,
                   const std::vector<float> mean,
                   const std::vector<float> scale) {
  if (mean.size() != 3 || scale.size() != 3) {
    std::cerr << "[ERROR] mean or scale size must equal to 3" << std::endl;
    exit(1);
  }
  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  float32x4_t vscale0 = vdupq_n_f32(scale[0]);
  float32x4_t vscale1 = vdupq_n_f32(scale[1]);
  float32x4_t vscale2 = vdupq_n_f32(scale[2]);

  float *dout_c0 = dout;
  float *dout_c1 = dout + size;
  float *dout_c2 = dout + size * 2;

  int i = 0;
  for (; i < size - 3; i += 4) {
    float32x4x3_t vin3 = vld3q_f32(din);
    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);

    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) * scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) * scale[2];
  }
}

Function converted for windows x86/x64 platform

// Scalar mean/scale normalization over three planes of `size` floats.
//
// din   : input, read as three consecutive planes: [0, size) for channel 0,
//         [size, 2*size) for channel 1, [2*size, 3*size) for channel 2.
//         Note that this is a planar read; the NEON version above consumes
//         its input as interleaved triples instead.
// dout  : output, written with the same three-plane layout.
// size  : number of elements per plane.
// mean  : per-channel value subtracted from the input; must have 3 entries.
// scale : per-channel multiplier applied after the subtraction; must have
//         3 entries.
//
// Prints an error and terminates the process with exit code 1 when mean or
// scale does not have exactly 3 entries.
void NeonMeanScale(const float *din, float *dout, int size,
                   const std::vector<float> mean,
                   const std::vector<float> scale) {
  const bool sizes_ok = (mean.size() == 3) && (scale.size() == 3);
  if (!sizes_ok) {
    std::cerr << "[ERROR] mean or scale size must equal to 3" << std::endl;
    exit(1);
  }

  int idx = 0;
  while (idx < size) {
    // Offsets of the same element within the second and third planes.
    const int in_plane1 = size + idx;
    const int in_plane2 = size * 2 + idx;

    dout[idx] = (din[idx] - mean[0]) * scale[0];
    dout[in_plane1] = (din[in_plane1] - mean[1]) * scale[1];
    dout[in_plane2] = (din[in_plane2] - mean[2]) * scale[2];
    ++idx;
  }
}

This is how functions are called

// Example call site: feeds image data through NeonMeanScale into a tensor
// buffer.
cv::Mat img_fp;
// Load an image...
// Destination buffer for the normalized output, obtained from the tensor.
auto *data0 = input_tensor0->mutable_data<float>();
// Per-channel constants; NeonMeanScale requires exactly three entries each.
std::vector<float> mean = {0.485f, 0.456f, 0.406f};
std::vector<float> scale = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
// Reinterpret the raw image bytes as floats.
// NOTE(review): assumes img_fp holds 3-channel 32-bit float data — confirm.
const float *dimg = reinterpret_cast<const float *>(img_fp.data);
// The size argument is rows * cols.
NeonMeanScale(dimg, data0, img_fp.rows * img_fp.cols, mean, scale);

I'm a bit skeptical about this conversion. I cannot test the ARM code myself, and I believe the Windows version was not converted correctly and is causing failures in my program.


Solution

  • I'm not sure about the intrinsic functions used, but the last loop looks like it takes the input in 3-element blocks and assigns the three elements of each block to the three parts of the output.

    assignment of elements

    Implementation of this operation can be like this:

    // Scalar mean/scale normalization from interleaved triples to three planes.
    //
    // din   : input, read as `size` consecutive triples (c0, c1, c2).
    // dout  : output, written as three planes of `size` floats each:
    //         [0, size) for channel 0, [size, 2*size) for channel 1,
    //         [2*size, 3*size) for channel 2.
    // size  : number of triples to process.
    // mean  : per-channel value subtracted from the input; must have 3 entries.
    // scale : per-channel multiplier applied after the subtraction; must have
    //         3 entries.
    //
    // Prints an error and terminates the process with exit code 1 when mean or
    // scale does not have exactly 3 entries.
    void NeonMeanScale(const float *din, float *dout, int size,
                       const std::vector<float> mean,
                       const std::vector<float> scale) {
      const bool sizes_ok = (mean.size() == 3) && (scale.size() == 3);
      if (!sizes_ok) {
        std::cerr << "[ERROR] mean or scale size must equal to 3" << std::endl;
        exit(1);
      }

      // Start of each output plane.
      float *const plane0 = dout;
      float *const plane1 = dout + size;
      float *const plane2 = dout + size * 2;

      for (int px = 0; px < size; ++px) {
        // The three interleaved input values belonging to element px.
        const float *triple = din + px * 3;
        plane0[px] = (triple[0] - mean[0]) * scale[0];
        plane1[px] = (triple[1] - mean[1]) * scale[1];
        plane2[px] = (triple[2] - mean[2]) * scale[2];
      }
    }