Is thrust::max_element run on host or device?

I need to find the maximum element in a long array which is stored on my device. I thought I could use thrust::max_element to do so. I am calling thrust::max_element in the while loop in the code bellow. I simply give it two device pointers (note that real is just a typedef for float). Can I not just pass thrust::max_element device pointers. Is it trying to find the max element on the host? I am asking this because my code is failing with a seg fault after that.

int main()
{
    cuda_error(cudaSetDevice(1), "set device");
    const size_t DIM = 50;

    real* grid_d;
    real* next_grid_d;
    cuda_error(cudaMalloc(&grid_d, sizeof(real) * DIM * DIM * DIM), "malloc grid");
    cuda_error(cudaMalloc(&next_grid_d, sizeof(real) * DIM * DIM * DIM), "malloc next grid");
    cuda_error(cudaMemset(grid_d, 0, sizeof(real) * DIM * DIM * DIM), "memset grid");

    ConstantSum point_charge(0.3, DIM / 2, DIM / 2, DIM / 2);
    ConstantSum* point_charge_d;
    cuda_error(cudaMalloc(&point_charge_d, sizeof(ConstantSum)), "malloc constant sum");
    cuda_error(cudaMemcpy(point_charge_d, &point_charge, sizeof(ConstantSum), cudaMemcpyHostToDevice), "memset constant sum");

    real max_err;
    do
    {
        compute_next_grid_kernel<<< DIM, dim3(16, 16) >>>(grid_d, next_grid_d, DIM, point_charge_d, 1);
        cuda_error(cudaGetLastError(), "kernel launch");
        max_err = *thrust::max_element(grid_d, grid_d + DIM * DIM * DIM);

        std::swap(grid_d, next_grid_d);
    }
    while(max_err > 0.1);

    real* frame = new real[DIM * DIM];
    cuda_error(cudaMemcpy(frame, grid_d + DIM * DIM * (DIM / 2), DIM * DIM * sizeof(real), cudaMemcpyDeviceToHost), "memcpy frame");

    cuda_error(cudaFree(grid_d), "free grid");
    cuda_error(cudaFree(next_grid_d), "free next grid");
    cuda_error(cudaFree(point_charge_d), "free point charge");

    for(int i = 0; i < DIM; i++)
    {
        for(int j = 0; j < DIM; j++)
        {
            std::cout << frame[DIM * i + j] << "\t";
        }
        std::cout << "\n";
    }

    delete[] frame;
    return 0;
}

Solution

In general, thrust uses the types of the passed iterators to determine whether the algorithm back-end will run on host or device (there is also tag free and explicit execution policy selection in the latest versions, but that is a different discussion).

In your case, because grid_d is a host pointer (whether its value is a host or device address is irrelevant), thrust will try and run the algorithm on the host. This is the source of the segfault, you are trying to access device addresses on the host.

To make this work, you need to cast the pointer to a thrust::dev_ptr, something like:

thrust::device_ptr<real> grid_start = thrust::device_pointer_cast(grid_d);
thrust::device_ptr<real> grid_end= thrust::device_pointer_cast(grid_d + DIM * DIM * DIM);

auto max_it = thrust::max_element(grid_start, grid_end);
max_error = *max_it;

[warning, written in browser, never compiled or tested, use at own risk]

By passing thrust::dev_ptr, the correct tag selection occurs and the closure will run on the device.

Another solution without casting is to specify the execution policy device:

thrust::max_element(thrust::device, grid_d, grid_d + DIM * DIM * DIM);

Nothe that explicit execution policy control is only supported on Thrust 1.7 and later.