
How to make Cython faster?


This is a portion of my code. I tried it in both Python and Cython. Cython is about 2 s faster in this case (but only when the return type is annotated; otherwise it is almost 3.5 s slower than the Python version). Is there any chance to make it faster? Any help or discussion would be appreciated. Thank you.

%%cython

# %%cython --compile-args=-fopenmp --link-args=-fopenmp --force

cimport cython
cimport numpy as cnp
import numpy as np
from cython.parallel import parallel, prange

ctypedef cnp.int_t DTYPE

@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
@cython.nogil
@cython.cfunc
@cython.exceptval(-1)
@cython.returns(list)
cdef list sub_mat_extract(cnp.ndarray[DTYPE, ndim=3] mat, cython.int neibors):
    
#     print('sub_mat_extract: ', np.shape(mat)  )

#     temp = []
    cdef:
        Py_ssize_t M = 0, N = 0, x =0
        Py_ssize_t i
        Py_ssize_t j
        Py_ssize_t row = np.shape(mat)[0] 
        Py_ssize_t col = np.shape(mat)[1] 
        
        list temp = []       
        list temp1 = []
        list dup1 = []  
        list dup2 = []
        
   
    for i in range(  ((neibors-1)/2) , row - ((neibors-1)/2) ):
        N = 0
        temp1 = []
        for j in range( col  ):
            temp1.extend(mat[ j + M ][ 0 + N : neibors + N])
    #         print(i,M, mat[i+M][0+N :3+N])
    #             print(temp1)


            if j + M == neibors + M-1:
                M = M + 1
                break
        temp.append(temp1)
        N += 1    
        if M == col:
            break

    dup1 = []
     

    for i in range(len(temp) ):
        x = 0
        while (x <= col - neibors):

            dup2 = []
            for j in range(len(temp[i])):
    #                 print([temp[i][j][0], temp[i][j][1]+x] )
                dup2.append([temp[i][j][0], temp[i][j][1]+x] )
            dup1.append(dup2)    
            x = x+1

        
    return (dup1)

def action(mat, neibor):
    return (sub_mat_extract(np.array(mat), neibor ))


The timing for the Python version:

CPU times: total: 5.23 s
Wall time: 5.77 s

The same for Cython:

CPU times: total: 3.14 s
Wall time: 4.78 s

I am trying to convert all my code from conventional Python to Cython. I want to see whether Cython can be faster than Python in all cases. My ultimate goal is to understand how fast a piece of code can run when fully utilizing the hardware (Numba + multiprocessing) and Python-like compilers. I am running the code in a Jupyter notebook only.


Solution

  • Here are some things, in arbitrary order, that crossed my mind while looking at your code snippet:

    • You don't need decorators like @cython.cfunc and @cython.returns. Both are already known from the function signature.

    • I'd recommend using typed memoryviews instead of the old np.ndarray buffer syntax. The syntax is cleaner and they are slightly faster.

    • You don't need to check the length of each (inner) list, since every list contains the same number of elements. The number of loop iterations is therefore already (partially) known.

    • Your main bottleneck is that you're using Python lists as data structures, which is a bad choice in your case. All of your lists contain only integers, so you can use homogeneous data containers such as np.arrays, C arrays, or C++ vectors.
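    To illustrate the last point, here is a minimal pure-Python/NumPy sketch (hypothetical helper names, not the code from the question) contrasting nested lists of integer pairs with one preallocated homogeneous array:

    ```python
    import numpy as np

    def pairs_as_lists(n):
        # each element is a boxed Python int inside a nested list
        out = []
        for i in range(n):
            out.append([i, i + 1])
        return out

    def pairs_as_array(n):
        # one homogeneous, preallocated block of int32 values
        out = np.empty((n, 2), dtype=np.int32)
        out[:, 0] = np.arange(n, dtype=np.int32)
        out[:, 1] = out[:, 0] + 1
        return out

    lists = pairs_as_lists(5)
    arr = pairs_as_array(5)
    print(np.array_equal(np.array(lists, dtype=np.int32), arr))  # True
    ```

    Beyond raw speed, the array version also hands Cython a typed buffer it can iterate over without touching the interpreter, which is what makes nogil loops possible in the first place.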

    Timing your code snippet on my machine yields:

    In [15]: M = np.ones((1000, 250, 250), dtype=np.int32)
    In [16]: %timeit res1 = action(M, 25)
    3.8 s ± 19.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    

    Here's an improved version of your code that needs to be compiled in C++ mode. The main advantage is that it uses a C++ STL vector and memoryviews, instead of lists, to copy the relevant rows of your mat array. In addition, it returns a NumPy array instead of nested lists.

    %%cython -f -a -c=-O3
    
    #distutils: language = c++
    
    from cython cimport wraparound, boundscheck, cdivision
    cimport numpy as np
    import numpy as np
    from libcpp.vector cimport vector
    
    @boundscheck(False)
    @wraparound(False)
    cdef void copy_elements(vector[int]& out, int[:, ::1] mv):
        cdef int n_rows = mv.shape[0]
        cdef int n_cols = mv.shape[1]
        cdef int i, j
        for i in range(n_rows):
            for j in range(n_cols):
                out.push_back(mv[i, j])
    
    @boundscheck(False)
    @cdivision(True)
    @wraparound(False)
    def sub_mat_extract(int[:, :, ::1] mat, int neibors):
        cdef int M = 0
        cdef int x = 0
        cdef int i, j, k
        cdef int r = 0
        cdef int count = 0
        cdef int n_rows = mat.shape[0]
        cdef int n_cols = mat.shape[1]
        cdef int n_tmps = n_rows - neibors + 2
        cdef int neibors_pow2 = neibors*neibors
        cdef vector[int] temp
        cdef int[::1] out_mv
    
        for i in range(n_tmps):
            count += 1
            for j in range(n_cols):
                copy_elements(temp, mat[j+M][:neibors])
                if j == neibors - 1:
                    M += 1
                    break
            if M == n_cols:
                break
        # allocate memory
        out_mv = np.zeros(count*neibors_pow2*(n_cols-neibors+1)*2, dtype=np.int32)
        
        for i in range(count):
            for x in range(n_cols - neibors + 1):
                for j in range(neibors*neibors):
                    out_mv[r] = temp[i + j*n_tmps + 0]
                    r += 1
                    out_mv[r] = temp[i + j*n_tmps + 1] + x
                    r += 1
        return np.asarray(out_mv).reshape(-1, neibors_pow2, 2)
    

    Timing again on my machine (with the same input as above) shows that this approach is nearly 20 times faster:

    In [18]: %timeit res2 = sub_mat_extract(M, 25)
    198 ms ± 3.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
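    As an aside: if the end goal is simply to enumerate every neibors × neibors window of a 2-D array, NumPy's sliding_window_view (added in NumPy 1.20) gives all windows as a zero-copy view with no compilation step. This is a sketch of that idea, not a drop-in replacement for sub_mat_extract above, which returns index pairs rather than the window contents:

    ```python
    import numpy as np
    from numpy.lib.stride_tricks import sliding_window_view

    a = np.arange(36).reshape(6, 6)
    neibors = 3

    # every neibors x neibors window of `a`, as a zero-copy view
    windows = sliding_window_view(a, (neibors, neibors))
    print(windows.shape)   # (4, 4, 3, 3)

    # flatten to one window per row, matching a "list of sub-matrices" layout
    flat = windows.reshape(-1, neibors, neibors)
    print(flat.shape)      # (16, 3, 3)
    ```

    Whether this beats the compiled version depends on the workload: the view itself is free, but materializing or post-processing all windows still touches every element.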