Search code examples
visual-studio-2008cudaptx

load function parameters in inlined ptx


I have the following function with inline assembly that works fine on debug mode in 32 bit Visual Studio 2008:

__device__ void add(int* pa, int* pb)
{
  asm(".reg .u32   s<3>;"::);
  asm(".reg .u32   r<14>;"::);

  asm("ld.global.b32    s0, [%0];"::"r"(&pa));      //load addresses of pa, pb
  printf(...);
  asm("ld.global.b32    s1, [%0];"::"r"(&pb));
  printf(...);
  asm("ld.global.b32    r1, [s0+8];"::);
  printf(...);  
  asm("ld.global.b32    r2, [s1+8];"::);
  printf(...);

  ...// perform some operations
}

pa and pb are globally allocated on the device such as

__device__ int pa[3] = {0, 0x927c0000, 0x20000011};  
__device__ int pb[3] = {0, 0xbb900000, 0x2000000b};

However, this code fails on release mode, on line asm("ld.global.b32 r1, [s0+8];"::); How can I load function parameters correctly with inline ptx on release mode?

P.S. building the release mode with -G flag (Generates GPU debug info) causes the code to run correctly on release mode. Thank you,


Solution

  • Hopefully this code will help. I'm still guessing at what you are trying to do exactly, but I started with your code and decided to add some values in the pa and pb arrays and store them back into pa[0] and pb[0].

    This code is written for a 64 bit machine but converting it to 32 bit pointers should not be difficult. I have marked the lines that need to be changed for 32 bit pointers with a comment. Hopefully this will answer your question about how to use function parameters that are pointers to device memory:

    #include <stdio.h>
    
    __device__ int pa[3] = {0, 0x927c0000, 0x20000011};
    __device__ int pb[3] = {0, 0xbb900000, 0x2000000b};
    
    __device__ void add(int* mpa, int* mpb)
    {
      asm(".reg .u64   s<2>;"::);  // change to .u32 for 32 bit pointers
      asm(".reg .u32   r<6>;"::);
    
      asm("mov.u64    s0, %0;"::"l"(mpa));      //change to .u32 and "r" for 32 bit
      asm("mov.u64    s1, %0;"::"l"(mpb));      //change to .u32 and "r" for 32 bit
      asm("ld.global.u32    r0, [s0+4];"::);
      asm("ld.global.u32    r1, [s1+4];"::);
      asm("ld.global.u32    r2, [s0+8];"::);
      asm("ld.global.u32    r3, [s1+8];"::);
      asm("add.u32    r4, r0, r2;"::);
      asm("add.u32    r5, r1, r3;"::);
      asm("st.global.u32    [s0], r4;"::);
      asm("st.global.u32   [s1], r5;"::);
    }
    
    __global__ void mykernel(){
      printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]);
      add(pa, pb);
      printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]);
    }
    
    int  main() {
      mykernel<<<1,1>>>();
      cudaDeviceSynchronize();
      return 0;
    }
    

    When I run this code I get:

    $ ./t128
    pa[0] = 0, pb[0] = 0
    pa[0] = b27c0011, pb[0] = db90000b
    $
    

    which I believe is correct output.

    I compiled it with:

    nvcc -O3 -arch=sm_20 -o t128 t128.cu