Is there a possibility to copy data strided by a constant (or even non-constant) value to and from the CUDA device efficiently?
I want to diagonalize a large symmetric matrix.
Using the jacobi algorithm there is a bunch of operations using two rows and two columns within each iteration.
Since the Matrix itself is too big to be copied to the device entirely i am looking for a way to copy the two rows and columns to the device.
It would be nice to use the triangular matrix form to store the data but additional downsides like
arise. [edit: Even using triangular form it is still impossible to store the whole Matrix on the GPU.]
I looked at some timings and recognized that copying strided values one by one is very slow (synchronous as well as async.).
// edit: removed solution - added an answer
Thanks to Robert Crovella for giving the right hint to use cudamemcpy2d. I'll append my test code to give everyone the possibility to comprehend...
If anyone Comes up with suggestions for solving the copy problem using row-major-ordered triangular matrices - feel free to write another answer please.
__global__ void setValues (double *arr, double value)
{
arr[blockIdx.x] = value;
}
int main( void )
{
// define consts
static size_t const R = 10, C = 10, RC = R*C;
// create matrices and initialize
double * matrix = (double*) malloc(RC*sizeof(double)),
*final_matrix = (double*) malloc(RC*sizeof(double));
for (size_t i=0; i<RC; ++i) matrix[i] = rand()%R+10;
memcpy(final_matrix, matrix, RC*sizeof(double));
// create vectors on the device
double *dev_col, *dev_row,
*h_row = (double*) malloc(C*sizeof(double)),
*h_col = (double*) malloc(R*sizeof(double));
cudaMalloc((void**)&dev_row, C * sizeof(double));
cudaMalloc((void**)&dev_col, R * sizeof(double));
// choose row / col to copy
size_t selected_row = 7, selected_col = 3;
// since we are in row-major order we can copy the row at once
cudaMemcpy(dev_row, &matrix[selected_row*C],
C * sizeof(double), cudaMemcpyHostToDevice);
// the colum needs to be copied using cudaMemcpy2D
// with Columnsize*sizeof(type) as source pitch
cudaMemcpy2D(dev_col, sizeof(double), &matrix[selected_col],
C*sizeof(double), sizeof(double), R, cudaMemcpyHostToDevice);
// copy back to host to check whether we got the right column and row
cudaMemcpy(h_row, dev_row, C * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(h_col, dev_col, R * sizeof(double), cudaMemcpyDeviceToHost);
// change values to evaluate backcopy
setValues<<<R, 1>>>(dev_col, 88.0); // column should be 88
setValues<<<C, 1>>>(dev_row, 99.0); // row should be 99
// backcopy
cudaMemcpy(&final_matrix[selected_row*C], dev_row,
C * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy2D(&final_matrix[selected_col], C*sizeof(double), dev_col,
sizeof(double), sizeof(double), R, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
// output for checking functionality
printf("Initial Matrix:\n");
for (size_t i=0; i<R; ++i)
{
for (size_t j=0; j<C; ++j) printf(" %lf", matrix[i*C+j]);
printf("\n");
}
printf("\nRow %u values: ", selected_row);
for (size_t i=0; i<C; ++i) printf(" %lf", h_row[i]);
printf("\nCol %u values: ", selected_col);
for (size_t i=0; i<R; ++i) printf(" %lf", h_col[i]);
printf("\n\n");
printf("Final Matrix:\n");
for (size_t i=0; i<R; ++i)
{
for (size_t j=0; j<C; ++j) printf(" %lf", final_matrix[i*C+j]);
printf("\n");
}
cudaFree(dev_col);
cudaFree(dev_row);
free(matrix);
free(final_matrix);
free(h_row);
free(h_col);
cudaDeviceReset();
return 0;
}