I am working on a highly performance-critical image processing pipeline on a Jetson TX2 (with an ARM processor), which involves reading a set of images and then performing deep learning based object detection through Darknet. Darknet, written in C, has its own representation of how images are stored, which is different from how OpenCV's IplImage or a Python numpy array would store the images.
In my application, I am required to interface with Darknet through Python. So, as of now, I am reading a 'batch' of images (usually 16) into a numpy array and then passing it to Darknet as a contiguous array using ctypes. Within Darknet, I then have to rearrange the ordering of the pixels to go from the numpy format to Darknet's format.
While the input array is one contiguous block, arranged column-wise, then row-wise, then channel-wise, and then by image, the Darknet format needs to be arranged by channel first, then by column, then by row, and it holds one row per image in the batch instead of one contiguous block. The picture below tries to demonstrate the difference. In this example, I assume a single i x j image: (0,0), (0,1), etc. indicate (row, col), whereas at the top, C0, C1, C2, etc. indicate the column in the corresponding row. Note that in the case of multiple images in a batch, the input format arranges them sequentially one after the other, but Darknet needs them on separate rows, each row containing data from only one image.
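To make the mapping concrete, here is a small standalone sketch (illustrative only, assuming the input is a C-contiguous uint8 array of shape (batch, height, width, channel), which is how the shape array is indexed in my conversion code below). For a toy 2x3 RGB image it prints where each pixel sits in the input block and where it has to go within that image's row in the Darknet matrix.

#include <stdio.h>

/* Toy illustration only: a 2x3 RGB image (batch of 1), assuming a
 * C-contiguous (height, width, channel) uint8 source.
 * src_idx is the pixel's flat offset in the input block;
 * dst_idx is its flat offset within that image's row in Darknet's matrix. */
int main(void)
{
    const int h = 2, w = 3, c = 3;

    for (int k = 0; k < c; ++k)
        for (int i = 0; i < h; ++i)
            for (int j = 0; j < w; ++j) {
                int src_idx = i*w*c + j*c + k;  /* row, then column, then channel  */
                int dst_idx = k*w*h + i*w + j;  /* channel plane, then row, column */
                printf("(row %d, col %d, ch %d): src %2d -> dst %2d\n",
                       i, j, k, src_idx, dst_idx);
            }
    return 0;
}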
As of now, my code in C that converts the input array to the Darknet format looks like this. It visits every pixel of every channel and copies it to its new position, normalizing the pixel values along the way.
matrix ndarray_to_matrix(unsigned char* src, long* shape, long* strides)
{
    int nb = shape[0];    // Batch size
    int h  = shape[1];    // Height of each image
    int w  = shape[2];    // Width of each image
    int c  = shape[3];    // No. of channels in each image
    matrix X = make_matrix(nb, h*w*c);    // Output array format: 2D
    int step_b = strides[0];
    int step_h = strides[1];
    int step_w = strides[2];
    int step_c = strides[3];
    int b, i, j, k;
    int index1, index2 = 0;

    for(b = 0; b < nb; ++b) {
        for(i = 0; i < h; ++i) {
            for(k = 0; k < c; ++k) {
                for(j = 0; j < w; ++j) {
                    index1 = k*w*h + i*w + j;
                    index2 = step_b*b + step_h*i + step_w*j + step_c*k;
                    X.vals[b][index1] = src[index2]/255.;
                }
            }
        }
    }
    return X;
}
Is there a more efficient way of doing this rearranging and normalization in C (with memcpy, perhaps)?
/*
 *  Created on: 2018. 5. 5.
 *      Author: Jake 'Alquimista' Lee
 */

        .arch   armv8-a
        .text
        .global alquimista_ndarray_to_matrix

// void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);

pDst    .req    x0
pRed    .req    x1
pGrn    .req    x2
pBlu    .req    x3
count   .req    w4

        .balign 64
        .func
alquimista_ndarray_to_matrix:
        mov     x16, #(640*360) & 0xffff
        str     q8, [sp, #-16]!
        movk    x16, #((640*360)>>16), lsl #16
        mov     count, #(640*360)/128
        add     pGrn, pRed, x16
        add     pBlu, pRed, x16, lsl #1
        b       1f

        .balign 64
1:
        ldp     q0, q3, [pRed], #32
        ldp     q1, q4, [pGrn], #32
        ldp     q2, q5, [pBlu], #32
        ldp     q6, q16, [pRed], #32
        ldp     q7, q17, [pGrn], #32
        ldp     q8, q18, [pBlu], #32
        ldp     q19, q22, [pRed], #32
        ldp     q20, q23, [pGrn], #32
        ldp     q21, q24, [pBlu], #32
        ldp     q25, q28, [pRed], #32
        ldp     q26, q29, [pGrn], #32
        ldp     q27, q30, [pBlu], #32

        subs    count, count, #1

        st3     {v0.16b, v1.16b, v2.16b}, [pDst], #48
        st3     {v3.16b, v4.16b, v5.16b}, [pDst], #48
        st3     {v6.16b, v7.16b, v8.16b}, [pDst], #48
        st3     {v16.16b, v17.16b, v18.16b}, [pDst], #48
        st3     {v19.16b, v20.16b, v21.16b}, [pDst], #48
        st3     {v22.16b, v23.16b, v24.16b}, [pDst], #48
        st3     {v25.16b, v26.16b, v27.16b}, [pDst], #48
        st3     {v28.16b, v29.16b, v30.16b}, [pDst], #48

        b.gt    1b

        .balign 8
        ldr     q8, [sp], #16
        ret
        .endfunc
        .end
For maximum performance and minimum power consumption, you might want to align the source pointer to 32 bytes and the destination to 16 bytes.
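For instance, a minimal allocation sketch with that alignment, assuming C11 aligned_alloc is available and the 640x360x3 frame size the routine is hard-coded for (the buffer and function names here are only illustrative):

#include <stdint.h>
#include <stdlib.h>

enum { FRAME_BYTES = 640 * 360 * 3 };   /* one 640x360 RGB frame */

/* aligned_alloc requires the size to be a multiple of the alignment,
 * which holds here (691200 is divisible by 32). */
uint8_t *src_buf = NULL;
uint8_t *dst_buf = NULL;

static int alloc_buffers(void)
{
    src_buf = aligned_alloc(32, FRAME_BYTES);   /* 32-byte aligned source      */
    dst_buf = aligned_alloc(16, FRAME_BYTES);   /* 16-byte aligned destination */
    return (src_buf && dst_buf) ? 0 : -1;
}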
The function prototype is:
void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);
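As a hedged usage sketch (the wrapper and its name are illustrative, not part of the routine): the pointer setup in the code reads three consecutive 640*360 channel planes starting at pSrc and writes 640*360*3 interleaved bytes to pDst, so a whole batch can be handled with a plain loop, one call per image.

#include <stddef.h>
#include <stdint.h>

void alquimista_ndarray_to_matrix(uint8_t *pDst, uint8_t *pSrc);

/* Illustrative wrapper: one call per 640x360 image; each image occupies
 * 640*360*3 bytes in both the planar source and the interleaved destination. */
static void convert_batch_u8(uint8_t *dst, uint8_t *src, int batch)
{
    const size_t frame = (size_t)640 * 360 * 3;

    for (int b = 0; b < batch; ++b)
        alquimista_ndarray_to_matrix(dst + (size_t)b * frame,
                                     src + (size_t)b * frame);
}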
Below is the function that does the conversion to float on the fly. And I added the batch number as a parameter so that you don't have to do a function call for every image.
/*
 *  Created on: 2018. 5. 5.
 *   Copyright: Jake 'Alquimista' Lee. All rights reserved
 */

        .arch   armv8-a
        .text
        .global alquimista_ndarray_to_matrix_float

// void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);

pDst    .req    x0
pRed    .req    x1
batch   .req    w2
pGrn    .req    x3
pBlu    .req    x4
stride  .req    x5
count   .req    w7

        .balign 64
        .func
alquimista_ndarray_to_matrix_float:
        mov     stride, #((640*360)<<1) & 0xffff
        stp     q8, q15, [sp, #-32]!
        movk    stride, #((640*360)>>15), lsl #16
        mov     count, #(640*360)/32
        add     pGrn, pRed, stride, lsr #1
        add     pBlu, pRed, stride
        b       1f

        .balign 64
1:
        ldp     q0, q1, [pRed], #32
        ldp     q2, q3, [pGrn], #32
        ldp     q4, q5, [pBlu], #32

        subs    count, count, #1

        ushll   v20.8h, v0.8b, #7
        ushll2  v23.8h, v0.16b, #7
        ushll   v26.8h, v1.8b, #7
        ushll2  v29.8h, v1.16b, #7
        ushll   v21.8h, v2.8b, #7
        ushll2  v24.8h, v2.16b, #7
        ushll   v27.8h, v3.8b, #7
        ushll2  v30.8h, v3.16b, #7
        ushll   v22.8h, v4.8b, #7
        ushll2  v25.8h, v4.16b, #7
        ushll   v28.8h, v5.8b, #7
        ushll2  v31.8h, v5.16b, #7

        ursra   v20.8h, v20.8h, #8
        ursra   v21.8h, v21.8h, #8
        ursra   v22.8h, v22.8h, #8
        ursra   v23.8h, v23.8h, #8
        ursra   v24.8h, v24.8h, #8
        ursra   v25.8h, v25.8h, #8
        ursra   v26.8h, v26.8h, #8
        ursra   v27.8h, v27.8h, #8
        ursra   v28.8h, v28.8h, #8
        ursra   v29.8h, v29.8h, #8
        ursra   v30.8h, v30.8h, #8
        ursra   v31.8h, v31.8h, #8

        uxtl    v0.4s, v20.4h
        uxtl    v1.4s, v21.4h
        uxtl    v2.4s, v22.4h
        uxtl2   v3.4s, v20.8h
        uxtl2   v4.4s, v21.8h
        uxtl2   v5.4s, v22.8h
        uxtl    v6.4s, v23.4h
        uxtl    v7.4s, v24.4h
        uxtl    v8.4s, v25.4h
        uxtl2   v15.4s, v23.8h
        uxtl2   v16.4s, v24.8h
        uxtl2   v17.4s, v25.8h
        uxtl    v18.4s, v26.4h
        uxtl    v19.4s, v27.4h
        uxtl    v20.4s, v28.4h
        uxtl2   v21.4s, v26.8h
        uxtl2   v22.4s, v27.8h
        uxtl2   v23.4s, v28.8h
        uxtl    v24.4s, v29.4h
        uxtl    v25.4s, v30.4h
        uxtl    v26.4s, v31.4h
        uxtl2   v27.4s, v29.8h
        uxtl2   v28.4s, v30.8h
        uxtl2   v29.4s, v31.8h

        ucvtf   v0.4s, v0.4s, #15
        ucvtf   v1.4s, v1.4s, #15
        ucvtf   v2.4s, v2.4s, #15
        ucvtf   v3.4s, v3.4s, #15
        ucvtf   v4.4s, v4.4s, #15
        ucvtf   v5.4s, v5.4s, #15
        ucvtf   v6.4s, v6.4s, #15
        ucvtf   v7.4s, v7.4s, #15
        ucvtf   v8.4s, v8.4s, #15
        ucvtf   v15.4s, v15.4s, #15
        ucvtf   v16.4s, v16.4s, #15
        ucvtf   v17.4s, v17.4s, #15
        ucvtf   v18.4s, v18.4s, #15
        ucvtf   v19.4s, v19.4s, #15
        ucvtf   v20.4s, v20.4s, #15
        ucvtf   v21.4s, v21.4s, #15
        ucvtf   v22.4s, v22.4s, #15
        ucvtf   v23.4s, v23.4s, #15
        ucvtf   v24.4s, v24.4s, #15
        ucvtf   v25.4s, v25.4s, #15
        ucvtf   v26.4s, v26.4s, #15
        ucvtf   v27.4s, v27.4s, #15
        ucvtf   v28.4s, v28.4s, #15
        ucvtf   v29.4s, v29.4s, #15

        st3     {v0.4s - v2.4s}, [pDst], #48
        st3     {v3.4s - v5.4s}, [pDst], #48
        st3     {v6.4s - v8.4s}, [pDst], #48
        st3     {v15.4s - v17.4s}, [pDst], #48
        st3     {v18.4s - v20.4s}, [pDst], #48
        st3     {v21.4s - v23.4s}, [pDst], #48
        st3     {v24.4s - v26.4s}, [pDst], #48
        st3     {v27.4s - v29.4s}, [pDst], #48

        b.gt    1b
        add     pRed, pRed, stride      // each plane pointer skips the other two planes,
        add     pGrn, pGrn, stride      // landing on its own plane in the next image
        add     pBlu, pBlu, stride
        subs    batch, batch, #1
        mov     count, #(640*360)/32
        b.gt    1b

        .balign 8
        ldp     q8, q15, [sp], #32
        ret
        .endfunc
        .end
It's quite a long one, and it will take considerably longer than the uint8 one above.
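For reference, here is a scalar model of the per-pixel arithmetic that the USHLL/URSRA/UCVTF sequence performs (a hedged reading of the instructions, not part of the routine): each byte x becomes (x*128 + round(x/2)) / 32768, which matches x/255.0 exactly at 0 and 255 and to within about 1.5e-5 everywhere else.

#include <stdint.h>

/* Scalar equivalent of the NEON sequence above, for reference:
 *   USHLL  #7  -> x * 128
 *   URSRA  #8  -> add round(x * 128 / 256), i.e. round(x / 2)
 *   UCVTF  #15 -> interpret as fixed point and divide by 32768 */
static inline float u8_to_unit_float(uint8_t x)
{
    uint16_t v = (uint16_t)(x << 7);      /* USHLL #7  */
    v += (uint16_t)((v + 128) >> 8);      /* URSRA #8  */
    return (float)v / 32768.0f;           /* UCVTF #15 */
}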
Please note that it will scale extremely well to multi-core execution.
The function prototype is:
void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);
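As a hedged illustration of the multi-core point (the wrapper, its name, and the use of OpenMP are mine, not part of the routine): every image is an independent 640*360*3-byte source block and a 640*360*3-float destination block, so the batch can be split into contiguous slices, each thread converting its own slice with a single call.

#include <stddef.h>
#include <stdint.h>
#include <omp.h>

void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);

/* Illustrative multi-core driver: each thread converts a contiguous slice
 * of the batch; per image there are 640*360*3 source bytes and the same
 * number of destination floats. */
static void convert_batch_float_mt(float *dst, uint8_t *src, uint32_t batch)
{
    const size_t per_image = (size_t)640 * 360 * 3;

    #pragma omp parallel
    {
        uint32_t nthreads = (uint32_t)omp_get_num_threads();
        uint32_t tid      = (uint32_t)omp_get_thread_num();
        uint32_t chunk    = (batch + nthreads - 1) / nthreads;
        uint32_t first    = tid * chunk;

        if (first < batch) {
            uint32_t n = (batch - first < chunk) ? (batch - first) : chunk;
            alquimista_ndarray_to_matrix_float(dst + first * per_image,
                                               src + first * per_image,
                                               n);
        }
    }
}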