I implemented a RNS Montgomery exponentiation in Cuda.
Everything nice everything fine. It runs on just one SM.
BUT, so far I focus on parallelization of just a single exp. What I want to do now is test with several exp on fly. That is, I want that the i-th next exp is assign to a free SM.
I tried, and the final time was always growing linearly, that is all the exp were assign to the same SM.
Then I switched to streams, but nothing changed.
However I have never used them, so maybe I am doing something wrong..
This is the code:
void __smeWrapper() {
cudaEvent_t start, stop;
cudaStream_t stream0, stream1, stream2;
float time;
unsigned int j, i, tmp;
cudaEventCreate(&start);
cudaEventCreate(&stop);
dim3 threadsPerBlock(SET_SIZE, (SET_SIZE+1)/2);
setCudaDevice();
s_transferDataToGPU();
if(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1) != cudaSuccess)
printf("cudaDeviceSetCacheConfig ERROR!");
cudaEventRecord( start, 0 );
//for(i=0; i<EXPONENTIATION_NUMBER; i++) {
i=0;
__me<<< 1, threadsPerBlock, 0, stream0 >>>(&__s_x[i*(2*SET_SIZE + 1)], __B2modN, __bases, __mmi_NinB, __mmi_Bimodbi, __Bi_inAUar, __dbg, __NinAUar,
__mmi_BinAUar, __mmi_Ajmodaj, __Ajmodar, __mmi_Armodar, __AjinB, __minusAinB, &__z[i*(2*SET_SIZE + 1)], __e);
i=1;
__me<<< 1, threadsPerBlock, 0, stream1 >>>(&__s_x[i*(2*SET_SIZE + 1)], __B2modN, __bases, __mmi_NinB, __mmi_Bimodbi, __Bi_inAUar, __dbg, __NinAUar,
__mmi_BinAUar, __mmi_Ajmodaj, __Ajmodar, __mmi_Armodar, __AjinB, __minusAinB, &__z[i*(2*SET_SIZE + 1)], __e);
i=2;
__me<<< 1, threadsPerBlock, 0, stream2 >>>(&__s_x[i*(2*SET_SIZE + 1)], __B2modN, __bases, __mmi_NinB, __mmi_Bimodbi, __Bi_inAUar, __dbg, __NinAUar, __mmi_BinAUar,
__mmi_Ajmodaj, __Ajmodar, __mmi_Armodar, __AjinB, __minusAinB, &__z[i*(2*SET_SIZE + 1)], __e);
//printf("\n%s\n\n", cudaGetErrorString(cudaGetLastError()));
//}
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
printf("GPU %f µs : %f ms\n", time*1000, time);
cudaEventDestroy( start );
cudaEventDestroy( stop );
Ubuntu 11.04 64b, Cuda 5 RC, 560 Ti (8 SM)
All threads from a block always run on a same SM. You need to start more then one block to use other SMs.
There seems to be something wrong with your streams - do you call cudaStreamCreate for every stream? On my system it crashes with SEGFAULT if I don't use one though.