cuda 3D texture Interpolation

I am trying to interpolate a 3D array with cuda using texture memory with the code below. I have plotted the input f[x][y][z] to a fixed z value, then I interpolate my array for x and y and plot i again and they look totally different. I also tried this in 1 dimension (with a different code) and there it works so i assume that there must be an error in my code. Can you help me finding it?

#include <cuda_runtime.h>
#include <cuda.h>
#include <iostream>
#include <fstream>

typedef float myType;

texture<myType, 3> tex;
cudaArray *d_volumeArray = 0;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

__global__ void getInterpolatedFunctionValue(double x, double y, double z){
//http://stackoverflow.com/questions/10643790/texture-memory-tex2d-basics
    printf("%f \n", tex3D(tex, x+0.5f, y+0.5f, z+0.5f));
}

using namespace std;

int main(){

int nx=100, ny=100, nz=10;
myType f[nx][ny][nz];
for(int i=0; i<nx; i++)
  for(int j=0; j<ny; j++)
    for(int k=0; k<nz; k++){ 
      f[i][j][k] = sin(i/10.0)*cos(j/10.0)+k;
    }

const cudaExtent extend = make_cudaExtent(nx, ny, nz);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<myType>();
gpuErrchk(cudaMalloc3DArray(&d_volumeArray, &channelDesc, extend));

cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr   = make_cudaPitchedPtr((void*)f, extend.width*sizeof(myType), extend.width, extend.height);
copyParams.dstArray = d_volumeArray;
copyParams.extent   = extend;
copyParams.kind     = cudaMemcpyHostToDevice;
gpuErrchk(cudaMemcpy3D(&copyParams));

tex.normalized = false;                      
tex.filterMode = cudaFilterModeLinear;      
tex.addressMode[0] = cudaAddressModeClamp;   
tex.addressMode[1] = cudaAddressModeClamp;
tex.addressMode[2] = cudaAddressModeClamp;

gpuErrchk(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));

for(int i=0; i<nx*2; i++){
  for(int j=0; j<ny*2; j++){
    getInterpolatedFunctionValue <<<1, 1>>> (float(i)/2, float(j)/2, 3.0);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
  }
}

gpuErrchk(cudaUnbindTexture(tex));
gpuErrchk(cudaFreeArray(d_volumeArray));

return 0;
}

Update: @Robert Crovella: In my opinion you can see my problem better if one does plot the output and compare the interpolation with the original. I will add them below. The integer division was not planed and i fixed it, but that was not the reason for my problem

@JackOLantern: i know this post and your code there was the template for my version. But it seems to me that it does not work as i would have expected.

Since i have not enough reputation to upload images here i will link the two images. Number 1 shows a plot of my input values for a fix z value and figure 2 the interpolation done by my code. The original data are in a range of [2,4] while the interpolated are in [-2,10] and the structure are totally different. I hope this helps understanding my problem better.

Solution

The principal issue seems to be that you have your underlying texture storage index order reversed. The x dimension is the rapidly-varying matrix dimension (3rd subscript, in this case) and the rapidly varying thread dimension within a warp (although irrelevant for this example). In your code, I think the following summarizes the necessary changes:

myType f[nz][ny][nx];
for(int i=0; i<nx; i++)
  for(int j=0; j<ny; j++)
    for(int k=0; k<nz; k++){
      f[k][j][i] = sin(i/10.0f)*cos(j/10.0f)+k;
    }

There's quite a bit more that can be said about texturing with linear interpolation, so if you dig into it further, I suggest a solid understanding of the material presented here. For linear filtering with non-normalized coordinates, the range of interpolation (not including the clamp regions) will be of dimension N-1, for a texture with N data points in a particular direction. This relationship would normally be handled by careful application of the table lookup equation in the previously linked material, however for your example, to make the minimum number of changes, we can dispense with that and simply be careful about how we compute our expected functional value as well as the x,y, and z values passed to the texture lookup.

Here's an example with the primary modification being the storage order. Since I didn't want to plot the data, I chose to modify your code to inject validation checking.

#include <iostream>
#include <fstream>
#define NX 100
#define NY 100
#define NZ 10
#define TOL 0.003f
#define I_FACT 2
typedef float myType;

texture<myType, 3> tex;
cudaArray *d_volumeArray = 0;

__global__ void getInterpolatedFunctionValue(myType x, myType y, myType z, myType *result){
    *result = tex3D(tex, x+0.5f, y+0.5f, z+0.5f);
}

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

using namespace std;

int main(){

int nx=NX, ny=NY, nz=NZ;
myType f[nz][ny][nx];

for(int ix=0; ix<nx; ix++)
  for(int iy=0; iy<ny; iy++)
    for(int iz=0; iz<nz; iz++){
      f[iz][iy][ix] = sin(ix/(float)10)*cos(iy/(float)10)+iz;
    }

const cudaExtent extent = make_cudaExtent(nx, ny, nz);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<myType>();
cudaMalloc3DArray(&d_volumeArray, &channelDesc, extent);
cudaCheckErrors("cudaMalloc3D error");

cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr   = make_cudaPitchedPtr((void*)f, extent.width*sizeof(myType), extent.width, extent.height);
copyParams.dstArray = d_volumeArray;
copyParams.extent   = extent;
copyParams.kind     = cudaMemcpyHostToDevice;
cudaMemcpy3D(&copyParams);
cudaCheckErrors("cudaMemcpy3D fail");

tex.normalized = false;
tex.filterMode = cudaFilterModeLinear;
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.addressMode[2] = cudaAddressModeClamp;

cudaBindTextureToArray(tex, d_volumeArray, channelDesc);
cudaCheckErrors("bind fail");
myType my_result;
myType *d_result, *h_result = &my_result;
cudaMalloc(&d_result, sizeof(myType));


for(int i=0; i<(nx-1)*I_FACT; i++)
  for(int j=0; j<(ny-1)*I_FACT; j++)
    for (int k = 0; k <(nz-1)*I_FACT; k++){
    myType test_val = sin(i/(float)(10*I_FACT))*cos(j/(float)(10*I_FACT)) + k/(float)(I_FACT);
    getInterpolatedFunctionValue <<<1, 1>>> (i/(float)I_FACT, j/(float)I_FACT, k/(float)I_FACT, d_result);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    cudaMemcpy(h_result, d_result, sizeof(myType), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");
    if (fabs(my_result - test_val) > TOL) {printf("mismatch at x:%f, y:%f, z:%f, was:%f, should be: %f\n", i/(float)I_FACT,j/(float)I_FACT,k/(float)I_FACT, my_result, test_val); return 1;}
  }
printf("success!\n");

cudaUnbindTexture(tex);
cudaCheckErrors("unbind fail");
cudaFreeArray(d_volumeArray);
cudaCheckErrors("free fail");

return 0;
}

This code seems to run correctly for me, it takes about 30 seconds on a K40c, with CUDA 6.5. In the future, rather than expecting others to plot your data to determine validity, it would be helpful if you build validation checking into your request for help. This makes it easy on others to help you, and also explicitly declares the nature of the results you are expecting.

The tolerance built into the code above is probably incorrect to cover a wide range of cases. The texture hardware has coefficients that are stored with 8 bits of fractional precision (refer to previous link), and in the 3D case you'll be multiplying 3 of these coefficients together. So at most the tolerance probably needs to be perhaps 0.005 times the maximum value of the data stored in the texture, but I haven't done a careful tolerance analysis.

Increasing the I_FACT parameter will dramatically increase the runtime of the test code above.