Custom conv2d operation in PyTorch

I have tried to write a custom Conv2d function that works like nn.Conv2d, but with the multiplication and addition used inside nn.Conv2d replaced by mymult(num1,num2) and myadd(num1,num2).

Following insights from two very helpful forum threads (1, 2), what I can do is unfold the image and then perform a matrix multiplication. The @ step in the code below can then be done using loops with mymult() and myadd(), since I believe @ performs a matmul.

def convcheck(batch_size=2, channels=2, h=2, w=2, out_channels=3,
              kh=1, kw=1, dh=1, dw=1, padding=0):
    """Compare nn.Conv2d with an unfold + matmul version of the same convolution.

    A random image and a bias-free nn.Conv2d layer are created from a fixed
    seed. The image is unfolded into patch columns with F.unfold and
    multiplied by the flattened conv kernels. Called with no arguments this
    reproduces the original hard-coded setup (batch 2, 2 channels, 2x2 image,
    3 output channels, 1x1 kernel, stride 1, no padding).

    Args:
        batch_size: Number of images in the random input batch.
        channels: Number of input channels.
        h, w: Input height and width.
        out_channels: Number of convolution filters.
        kh, kw: Kernel height and width.
        dh, dw: Stride along height and width.
        padding: Zero padding applied on every side of the input.

    Returns:
        A tuple ``(out, res)``: ``out`` is the detached nn.Conv2d output and
        ``res`` is the unfold + matmul result, both shaped
        ``(batch_size, out_channels, out_h, out_w)``.
    """
    torch.manual_seed(123)
    image = torch.randn(batch_size, channels, h, w)  # input image

    # Output size per axis: floor((in + 2*padding - kernel) / stride) + 1.
    # Height and width are computed separately so that non-square images,
    # kernels and strides reshape correctly.
    out_h = (h + 2 * padding - kh) // dh + 1
    out_w = (w + 2 * padding - kw) // dw + 1

    conv = nn.Conv2d(in_channels=channels, out_channels=out_channels,
                     kernel_size=(kh, kw), padding=padding, stride=(dh, dw),
                     bias=False)
    out = conv(image)
    filt = conv.weight.data

    # Same kernel/stride/padding as the conv layer, so the patch columns
    # line up with the flattened kernels below.
    imageunfold = F.unfold(image, kernel_size=(kh, kw), padding=padding,
                           stride=(dh, dw))
    print("Unfolded image","\n",imageunfold,"\n",imageunfold.shape)

    kernels_flat = filt.view(out_channels,-1)
    print("Kernel Flat=","\n",kernels_flat,"\n",kernels_flat.shape)

    # This @ (matmul broadcast over the batch dimension) is the operation
    # that is meant to be replaced with mymult() and myadd().
    res = kernels_flat @ imageunfold
    print(res,"\n",res.shape)

    res = res.view(-1, out_channels, out_h, out_w)
    return out.detach(), res

res = kernels_flat @ imageunfold can be replaced with the loops below, although there may be a more efficient implementation, which is what I am looking for help with.

     # Loop-based equivalent of `kernels_flat @ imageunfold`.
     # Accumulator shaped [batch, rows of kernels_flat, columns of imageunfold].
     result = imageunfold.new_zeros(imageunfold.size(0), kernels_flat.size(0), imageunfold.size(2))
     for m_batch in range(imageunfold.size(0)):
         # iterate through rows of kernels_flat
         for i in range(kernels_flat.size(0)):
             # iterate through columns of imageunfold[m_batch]
             for j in range(imageunfold.size(2)):
                 # iterate through rows of imageunfold[m_batch] (the shared inner dimension)
                 for k in range(imageunfold.size(1)):
                     result[m_batch][i][j] += kernels_flat[i][k] * imageunfold[m_batch][k][j]

Can someone please help me vectorize these loops for faster execution?

Solution

The problem was with the dimensions: with kernels_flat of shape [dim0_1,dim1_1] and imageunfold of shape [batch,dim0_2,dim1_2], the result should have shape [batch,dim0_1,dim1_2].

res = kernels_flat @ imageunfold can be replaced with the following, although there may be a more efficient implementation.

     # Loop-based equivalent of `kernels_flat @ imageunfold`.
     # Accumulator shaped [batch, rows of kernels_flat, columns of imageunfold].
     result = imageunfold.new_zeros(imageunfold.size(0), kernels_flat.size(0), imageunfold.size(2))
     for m_batch in range(imageunfold.size(0)):
         # iterate through rows of kernels_flat
         for i in range(kernels_flat.size(0)):
             # iterate through columns of imageunfold[m_batch]
             for j in range(imageunfold.size(2)):
                 # iterate through rows of imageunfold[m_batch] (the shared inner dimension)
                 for k in range(imageunfold.size(1)):
                     result[m_batch][i][j] += kernels_flat[i][k] * imageunfold[m_batch][k][j]