Tags: c++, c, matrix, cuda, memcpy

Efficient way to copy strided data (to and from a CUDA Device)?


Is there an efficient way to copy data with a constant (or even non-constant) stride to and from a CUDA device?

I want to diagonalize a large symmetric matrix.

With the Jacobi algorithm, each iteration performs a set of operations on two rows and two columns.

Since the matrix itself is too big to be copied to the device in its entirety, I am looking for a way to copy just those two rows and columns to the device.

It would be nice to store the data in triangular matrix form, but that introduces additional complications:

  • non-constant row length [not that much of a problem]
  • non-constant stride of the column values [the stride increases by 1 with each row]

[edit: Even in triangular form, the matrix is still too large to be stored entirely on the GPU.]

I looked at some timings and found that copying strided values one by one is very slow (both synchronously and asynchronously).

// edit: removed solution - added an answer


Solution

  • Thanks to Robert Crovella for the hint to use cudaMemcpy2D. I'll append my test code so everyone can follow along.

    If anyone comes up with a suggestion for solving the copy problem for row-major-ordered triangular matrices, please feel free to write another answer.

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <cuda_runtime.h>
    
    // kernel: one block per element; write `value` into arr[blockIdx.x]
    __global__ void setValues (double *arr, double value)
    {
      arr[blockIdx.x] = value;
    }
    
    int main( void ) 
    {
      // define consts
      static size_t const R = 10, C = 10, RC = R*C;
    
      // create matrices and initialize
      double * matrix = (double*) malloc(RC*sizeof(double)), 
        *final_matrix = (double*) malloc(RC*sizeof(double));
      for (size_t i=0; i<RC; ++i) matrix[i] = rand()%R+10;
      memcpy(final_matrix, matrix, RC*sizeof(double));
    
      // create vectors on the device
      double *dev_col, *dev_row, 
        *h_row = (double*) malloc(C*sizeof(double)), 
        *h_col = (double*) malloc(R*sizeof(double));
      cudaMalloc((void**)&dev_row, C * sizeof(double));
      cudaMalloc((void**)&dev_col, R * sizeof(double));
    
      // choose row / col to copy
      size_t selected_row = 7, selected_col = 3;
    
      // since we are in row-major order we can copy the row at once 
      cudaMemcpy(dev_row, &matrix[selected_row*C], 
        C * sizeof(double), cudaMemcpyHostToDevice);
      // the column must be copied with cudaMemcpy2D, using the
      // row width (C*sizeof(double)) as the source pitch
      cudaMemcpy2D(dev_col, sizeof(double), &matrix[selected_col], 
        C*sizeof(double), sizeof(double), R, cudaMemcpyHostToDevice);
    
      // copy back to host to check whether we got the right column and row
      cudaMemcpy(h_row, dev_row, C * sizeof(double), cudaMemcpyDeviceToHost);
      cudaMemcpy(h_col, dev_col, R * sizeof(double), cudaMemcpyDeviceToHost);
      // change values to evaluate backcopy
      setValues<<<R, 1>>>(dev_col, 88.0); // column should be 88
      setValues<<<C, 1>>>(dev_row, 99.0); // row should be 99
      // backcopy
      cudaMemcpy(&final_matrix[selected_row*C], dev_row, 
        C * sizeof(double), cudaMemcpyDeviceToHost);
      cudaMemcpy2D(&final_matrix[selected_col], C*sizeof(double), dev_col, 
        sizeof(double), sizeof(double), R, cudaMemcpyDeviceToHost);
    
      cudaDeviceSynchronize();
      // output for checking functionality
    
      printf("Initial Matrix:\n");
      for (size_t i=0; i<R; ++i)
      {
        for (size_t j=0; j<C; ++j) printf(" %lf", matrix[i*C+j]);
        printf("\n");
      }
      printf("\nRow %zu values: ", selected_row);
      for (size_t i=0; i<C; ++i) printf(" %lf", h_row[i]);
      printf("\nCol %zu values: ", selected_col);
      for (size_t i=0; i<R; ++i) printf(" %lf", h_col[i]);
      printf("\n\n");
    
      printf("Final Matrix:\n");
      for (size_t i=0; i<R; ++i)
      {
        for (size_t j=0; j<C; ++j) printf(" %lf", final_matrix[i*C+j]);
        printf("\n");
      }
    
      cudaFree(dev_col);
      cudaFree(dev_row);
      free(matrix);
      free(final_matrix);
      free(h_row);
      free(h_col);
      cudaDeviceReset();
      return 0;
    
    }