Faster way to append to PyList in c++

I'm new to C++ and looking for a faster way to append pixel values to a Python list. Currently, looping over the pixels takes around 0.1 seconds to process one frame at a resolution of 854x480. Does anyone have any ideas?

I would like to avoid using third-party modules if possible.

Here is what I've got so far:

PyObject* byte_list = PyList_New(static_cast<Py_ssize_t>(0));

AVFrame *pFrameRGB = av_frame_alloc();
av_frame_copy_props(pFrameRGB, this->pFrame);
pFrameRGB->width = this->pFrame->width;
pFrameRGB->height = this->pFrame->height;
pFrameRGB->format = AV_PIX_FMT_RGB24;
av_frame_get_buffer(pFrameRGB, 0);

sws_scale(this->swsCtx, this->pFrame->data, this->pFrame->linesize, 0, 
        this->pCodecContext->height, pFrameRGB->data, pFrameRGB->linesize);

if (this->_debug) {
    std::cout << "Frame linesize " << pFrameRGB->linesize[0] << "\n";
    std::cout << "Frame width " << pFrameRGB->width << "\n";
    std::cout << "Frame height " << pFrameRGB->height << "\n";
}

// This looping method seems slow
for(int y = 0; y < pFrameRGB->height; ++y) {
    for(int x = 0; x < pFrameRGB->width; ++x) {
        int p = x * 3 + y * pFrameRGB->linesize[0];
        int r = pFrameRGB->data[0][p];
        int g = pFrameRGB->data[0][p+1];
        int b = pFrameRGB->data[0][p+2];
        PyList_Append(byte_list, PyLong_FromLong(r));
        PyList_Append(byte_list, PyLong_FromLong(g));
        PyList_Append(byte_list, PyLong_FromLong(b));
    }
}

av_frame_free(&pFrameRGB);

Thanks!

Solution

After looking around, I decided to use Python's built-in array module, whose buffer can be filled with a single memcpy, instead of a PyList, which requires inserting the values one by one.

In my tests, this improved the speed by a factor of 2 to 10, depending on the data.

// Convert a byte vector into a Python array.array('B') holding a copy of the
// vector's contents.
//
// vec: bytes to copy; it is only read.
// Returns a new reference to the array, or NULL with a Python exception set
// if any step fails.
PyObject *vec_to_array(std::vector<uint8_t>& vec) {
    // One-element array('B', [0]), created on first use and kept for the
    // lifetime of the process; it is the template that gets multiplied below.
    static PyObject *single_array;
    if (!single_array) {
        PyObject *array_module = PyImport_ImportModule("array");
        if (!array_module)
            return NULL;
        PyObject *array_type = PyObject_GetAttrString(array_module, "array");
        Py_DECREF(array_module);
        if (!array_type)
            return NULL;
        single_array = PyObject_CallFunction(array_type, "s[B]", "B", 0);
        Py_DECREF(array_type);
        if (!single_array)
            return NULL;
    }
    // extra-fast way to create an empty array of count elements:
    //   array = single_element_array * count
    PyObject *pysize = PyLong_FromSsize_t(static_cast<Py_ssize_t>(vec.size()));
    if (!pysize)
        return NULL;
    PyObject *array = PyNumber_Multiply(single_array, pysize);
    Py_DECREF(pysize);
    if (!array)
        return NULL;

    // now, obtain the address of the array's buffer
    PyObject *buffer_info = PyObject_CallMethod(array, "buffer_info", "");
    if (!buffer_info) {
        Py_DECREF(array);
        return NULL;
    }
    // PyTuple_GetItem() returns a borrowed reference, so only the tuple
    // itself has to be released once the address has been extracted.
    PyObject *pyaddr = PyTuple_GetItem(buffer_info, 0);
    void *addr = pyaddr ? PyLong_AsVoidPtr(pyaddr) : NULL;
    Py_DECREF(buffer_info);
    // A NULL address alone is not an error (an empty array may report
    // address 0); it is a failure only when an exception was raised.
    if (!addr && PyErr_Occurred()) {
        Py_DECREF(array);
        return NULL;
    }

    // and, finally, copy the data.
    if (!vec.empty())
        memcpy(addr, vec.data(), vec.size() * sizeof(uint8_t));

    return array;
}

After that, I pass the vector to that function:

std::vector<uint8_t> rgb_arr;

// Copy data from AV Frame
uint8_t* rgb_data[4];  int rgb_linesize[4];
av_image_alloc(rgb_data, rgb_linesize, this->pFrame->width, this->pFrame->height, AV_PIX_FMT_RGB24, 32); 
sws_scale(this->swsCtx, this->pFrame->data, this->pFrame->linesize, 0, this->pFrame->height, rgb_data, rgb_linesize);

// Put the data into vector
int rgb_size = pFrame->height * rgb_linesize[0];
std::vector<uint8_t> rgb_vector(rgb_size);
memcpy(rgb_vector.data(), rgb_data[0], rgb_size);

// Transfer the data from vector to rgb_arr
for(int y = 0; y < pFrame->height; ++y) {
    rgb_arr.insert(
        rgb_arr.end(), 
        rgb_vector.begin() + y * rgb_linesize[0],
        rgb_vector.begin() + y * rgb_linesize[0] + 3 * pFrame->width
    );
}


PyObject* arr = vec_to_array(rgb_arr);

The resulting array can then be accessed from Python.