Making a thread start its work only after the previous thread in the same block has finished (CUDA)

Hello, I am a beginner in CUDA programming. I use the lock.lock() function to make a thread wait until the previous thread has finished its work. This is my code:

#include "book.h"
#include <cuda.h>
#include <conio.h>
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include <string>
#include <curand.h>
#include <curand_kernel.h>
#include "lock.h"
// Problem dimensions: `pop` rows (one thread block per row) of `gen` elements
// (one thread per element).
#define pop 10
#define gen 10
// Total element count. Parenthesized so that the macro expands as a single
// operand in expressions such as `x / pg` or `x % pg`.
#define pg (pop*gen)
using namespace std;
// Kernel that fills `a` using one global thread index per element.
//
// Parameters:
//   lock - Lock object (declared in lock.h); only used around the shared-memory
//          writes in the loop at the end of the kernel.
//   a    - output array in device memory, indexed by the global thread id.
//
// What the code below actually does:
//   * threads with tid <  gen            store 7 into a[tid];
//   * threads with tid >= gen, tid%gen==0 store tid+4 into a[tid];
//   * all remaining threads              store temp+1 into a[tid], where `temp`
//     has not been assigned on that path (uninitialized read).
//
// NOTE(review): the first __syncthreads() sits inside the inner else-branch, so
// threads taking the `tid%gen==0` branch never reach it. When the kernel is
// launched with blockDim.x == gen (as main does with <<<pop,gen>>>), thread 0 of
// every block other than block 0 takes that branch, i.e. the barrier is not
// reached by all threads of the block.
__global__ void hold(Lock lock,float* a )
{
    // Per-block scratch array; it is written below but never read back or
    // copied into `a`, so it cannot influence the kernel's output.
    __shared__ int cache[gen];
// Global thread index across the whole grid.
int tid=blockIdx.x * blockDim.x+threadIdx.x;
// Index of this thread within its block; used as this thread's slot in `cache`.
int cacheIndex = threadIdx.x;
if(tid<gen)
{
    a[tid]=7;//this number example but in my case this random number
}
else
{
    //cache[cacheIndex]=a[tid];
    // Thread-local variable (not shared between threads); deliberately left as
    // in the original, i.e. without an initial value.
    int temp;
        if(tid%gen==0)
        {

            a[tid]=tid+4;//this example number but in my case this random number if tid==tid%gen
            // Reads the value back; the float stored in `a` is converted to int.
            temp=a[tid];
            // `tid` is not read again after this point, so this update has no effect.
            tid+=blockIdx.x*gridDim.x;

        }
        else
        {
            // NOTE(review): barrier inside a branch that threads with
            // tid%gen==0 do not take (see the header comment).
            __syncthreads();
            // NOTE(review): `temp` is read here before this thread has ever
            // assigned it; nothing passes the neighbouring thread's value in.
            a[tid]=temp+1;//this must a[tid]=a[tid-1]+1;
            temp=a[tid];
            // `tid` is not read again after this point, so this update has no effect.
            tid+=blockIdx.x*gridDim.x;

        }

    // Each thread stores its own `temp` into its own slot of the shared array.
    cache[cacheIndex]=temp;
    __syncthreads();
    // Walks i over 0..gen-1; the thread whose block-local index equals i takes
    // the lock and rewrites its own cache slot with the same value as above.
    // NOTE(review): Lock's semantics are defined in lock.h, which is not shown
    // here. The loop only touches `cache`, never `a`.
    for (int i=0;i<gen;i++)
    {
        if(cacheIndex==i)
        {
            lock. lock();
            cache[cacheIndex]=temp;
            lock.unlock();
        }
    }


}

}
// Host driver: allocates a pop x gen float array on the device, launches the
// `hold` kernel with `pop` blocks of `gen` threads, copies the result back,
// prints it as `pop` rows of `gen` values and reports the kernel time.
//
// Returns 0 on completion; any CUDA failure is reported through HANDLE_ERROR.
int main()
{
    float elapsedMs = 0.0f;
    float* a = new float[pg];
    float* dev_a;

    // The buffer holds floats, so size it with sizeof(float) to match the
    // element type used by the kernel and by the copy back to the host.
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a, pg * sizeof(float) ) );
    Lock lock;

    cudaEvent_t start, stop;
    HANDLE_ERROR( cudaEventCreate(&start) );
    HANDLE_ERROR( cudaEventCreate(&stop) );

    HANDLE_ERROR( cudaEventRecord(start, 0) );
    hold<<<pop,gen>>>(lock, dev_a);
    // A kernel launch returns no status itself; launch-configuration errors
    // only become visible through cudaGetLastError().
    HANDLE_ERROR( cudaGetLastError() );
    // Record the stop event right after the launch so the reported time covers
    // the kernel only, not the device-to-host copy below.
    HANDLE_ERROR( cudaEventRecord(stop, 0) );

    HANDLE_ERROR( cudaMemcpy( a, dev_a, pg * sizeof(float), cudaMemcpyDeviceToHost ) );

    HANDLE_ERROR( cudaEventSynchronize(stop) );
    HANDLE_ERROR( cudaEventElapsedTime(&elapsedMs, start, stop) );

    // Print the result as `pop` rows of `gen` values.
    for (int i = 0; i < pop; i++)
    {
        for (int j = 0; j < gen; j++)
        {
            std::cout << a[(i * gen) + j] << " ";
        }
        std::cout << std::endl;
    }
    printf("hold:  %3.1f ms \n", elapsedMs);

    HANDLE_ERROR( cudaFree(dev_a) );
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    // Release the host buffer allocated with new[].
    delete[] a;

    system("pause");
    return 0;
}

And this is the result:

7 7 7 7 7 7 7 7 7 7

14 0 0 0 0 0 0 0 0 0

24 0 0 0 0 0 0 0 0 0

34 0 0 0 0 0 0 0 0 0

44 0 0 0 0 0 0 0 0 0

54 0 0 0 0 0 0 0 0 0

64 0 0 0 0 0 0 0 0 0

74 0 0 0 0 0 0 0 0 0

84 0 0 0 0 0 0 0 0 0

94 0 0 0 0 0 0 0 0 0

My expected result:

7 7 7 7 7 7 7 7 7 7

14 15 16 17 18 19 20 21 22 23

24 25 26 27 28 29 30 31 32 33

34 35 36 37 38 39 40 41 42 43

44 45 46 47 48 49 50 51 52 53

54 55 56 57 58 59 60 61 62 63

64 65 66 67 68 69 70 71 72 73

74 75 76 77 78 79 80 81 82 83

84 85 86 87 88 89 90 91 92 93

94 95 96 97 98 99 100 101 102 103

Can anyone please help me correct my code? Thanks.

Solution

If you want help, it would be useful to point out that some of your code (e.g. lock.h and book.h) comes from the book "CUDA by Example". These headers are not a standard part of CUDA, so if you don't indicate where they come from, it may be confusing.

I see the following issues in your code:

You are using a __syncthreads() in a conditional block where not all threads will meet the __syncthreads() barrier:
```
if(tid%gen==0)
{
  ...
}
else
{
    __syncthreads();  // illegal

}
```
The usage of __syncthreads() in this way is illegal because not all threads will be able to reach the __syncthreads() barrier:

__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.

You are using the temp local variable without initializing it first:
```
    a[tid]=temp+1;//this must a[tid]=a[tid-1]+1;
```
Note that temp is a thread-local variable. It is not shared amongst threads. Therefore the above line of code (for threads in the else block) is using an uninitialized value of temp.
The remainder of your kernel code:
```
    cache[cacheIndex]=temp;
    __syncthreads();
    for (int i=0;i<gen;i++)
    {
      if(cacheIndex==i)
      {
        lock. lock();
        cache[cacheIndex]=temp;
        lock.unlock();
      }
    }


}
```
does nothing useful because it is updating shared memory locations (i.e. cache) which are never transferred back to the dev_a variable, i.e. global memory. Therefore none of this code could affect the results you print out.

It's difficult to follow what you are trying to accomplish in your code. However if you change this line (the uninitialized value):

    int temp;

to this:

    int temp=tid+3;

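your code will print the expected output you have shown, because every element with tid >= gen then ends up as tid+4. Note that this change does not remove the illegal __syncthreads() usage described above.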