Tags: c++, cuda, matrix-multiplication, functor

CUDA: Using device functors in kernels


I tried to write a device functor that performs (unoptimized) matrix-vector multiplication, like so:

namespace cusolve
{

template <class value_type,
          class matrix_type = value_type*,
          class vector_type = value_type*>
struct linear_operator 
{
    const matrix_type matrix;
    const size_t width;
    __device__
    linear_operator(const matrix_type matrix, size_t width)
        : matrix(matrix), width(width) { }

    __device__
    void operator()(const vector_type x, vector_type x_out)
    {
        auto col = blockIdx.x * blockDim.x + threadIdx.x;
        auto row = blockIdx.y * blockDim.y + threadIdx.y;
        if (row < width)
        {
            x_out[row] = 0;
            for (size_t i = 0; i < width; i++)
            {
                x_out[row] += matrix[row*width + i] * x[i];
            }
        }
        return;              
    }
};

} // namespace cusolve

This assumes that matrix, x, and x_out are device pointers. To test it, I call it from a simple kernel:

__global__
void
operateKernel(double *d_matrix,
        double *d_vector, double *d_vector_out,
        size_t width)
{
    cusolve::linear_operator<double> matmul(d_matrix, width);
    matmul(d_vector, d_vector_out);
}


void
operate(double *matrix, double *vector, double *vector_out, size_t width)
{
    const dim3 blockConfig(16, 16);
    const size_t gridWidth = (size_t) ((double) width) / 16.0l;
    const dim3 gridConfig(gridWidth, gridWidth);

    double *d_matrix, *d_vector, *d_vector_out;
    auto mem_vector = width * sizeof(double);
    auto mem_matrix = mem_vector * width;
    cudaMalloc((void **) &d_matrix, mem_matrix);
    cudaMalloc((void **) &d_vector, mem_vector);
    cudaMalloc((void **) &d_vector_out, mem_vector);

    cudaMemcpy(d_matrix, matrix, mem_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector, vector, mem_vector, cudaMemcpyHostToDevice);

    operateKernel<<<gridConfig, blockConfig>>>(d_matrix, d_vector, d_vector_out, width);
    cudaMemcpy(vector_out, d_vector_out, mem_vector, cudaMemcpyDeviceToHost);
    cudaFree(d_vector);
    cudaFree(d_matrix);
    cudaFree(d_vector_out);
}

But when I call operate() from main() with a matrix and vectors that are allocated and initialized to non-zero values, the output is all zeros. I have been banging my head against this for quite a while and cannot figure out what I am doing wrong. P.S.: I am deliberately doing this without Thrust, as a learning exercise.
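
To give a concrete picture, a minimal main() of the kind I am describing looks roughly like this (the width, identity matrix, and ramp vector here are just placeholders, not my real data):

#include <cstdio>
#include <cstdlib>

// operate() is defined in the same .cu file as the kernel above.
void operate(double *matrix, double *vector, double *vector_out, size_t width);

int main()
{
    const size_t width = 100;  // deliberately not a multiple of 16

    double *matrix     = (double *) malloc(width * width * sizeof(double));
    double *vector     = (double *) malloc(width * sizeof(double));
    double *vector_out = (double *) malloc(width * sizeof(double));

    // Identity matrix and a ramp vector, so the expected output is a copy of the input.
    for (size_t r = 0; r < width; r++)
    {
        for (size_t c = 0; c < width; c++)
            matrix[r * width + c] = (r == c) ? 1.0 : 0.0;
        vector[r] = (double) (r + 1);
    }

    operate(matrix, vector, vector_out, width);

    for (size_t r = 0; r < 5; r++)
        printf("vector_out[%zu] = %f\n", r, vector_out[r]);

    free(matrix);
    free(vector);
    free(vector_out);
    return 0;
}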


Solution

  • Forgot to use ceil when calculating the grid dimensions. The integer truncation makes the grid too small to cover the whole matrix whenever width is not a multiple of 16, and for width < 16 it yields a zero-sized grid, so the kernel never writes (all of) x_out.

    const size_t gridWidth = ceil( ((double) width) / 16.0l );
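
For context, here is a sketch of operate() with that one-line fix applied, plus optional error reporting after the launch (the cudaGetLastError / cudaDeviceSynchronize lines are an addition for debugging, not part of the original code):

#include <cmath>   // for ceil
#include <cstdio>

void
operate(double *matrix, double *vector, double *vector_out, size_t width)
{
    const dim3 blockConfig(16, 16);
    // Round up so the grid covers every row even when width is not a
    // multiple of 16 (and is never zero for width < 16).
    const size_t gridWidth = (size_t) ceil(((double) width) / 16.0l);
    const dim3 gridConfig(gridWidth, gridWidth);

    double *d_matrix, *d_vector, *d_vector_out;
    auto mem_vector = width * sizeof(double);
    auto mem_matrix = mem_vector * width;
    cudaMalloc((void **) &d_matrix, mem_matrix);
    cudaMalloc((void **) &d_vector, mem_vector);
    cudaMalloc((void **) &d_vector_out, mem_vector);

    cudaMemcpy(d_matrix, matrix, mem_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector, vector, mem_vector, cudaMemcpyHostToDevice);

    operateKernel<<<gridConfig, blockConfig>>>(d_matrix, d_vector, d_vector_out, width);

    // Report launch-configuration or runtime errors instead of silently
    // copying back an untouched buffer.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();

    cudaMemcpy(vector_out, d_vector_out, mem_vector, cudaMemcpyDeviceToHost);
    cudaFree(d_vector);
    cudaFree(d_matrix);
    cudaFree(d_vector_out);
}

An equivalent way to round up without going through floating point is the usual integer idiom: gridWidth = (width + 15) / 16.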