// Block-level sum reduction (interleaved addressing): every thread contributes
// i + 1, partial sums are combined pairwise inside tmp, and thread 0 adds the
// block total to *result with one atomicAdd.
// NOTE(review): assumes blockDim.x == threadsPerBlock and that every thread of
// the block reaches this code (the __syncthreads() calls require it) - confirm
// against the enclosing kernel.
//
// tmp has to be __shared__: as a plain local array each thread would own a
// private copy, and tmp[threadIdx.x + s] would read uninitialized memory
// instead of another thread's partial sum.
__shared__ unsigned long long tmp[threadsPerBlock];
tmp[threadIdx.x] = i + 1;
__syncthreads();  // every contribution must be stored before the first combine

for (uint32_t s = 1; s < threadsPerBlock; s *= 2) {
    // Threads whose index is a multiple of 2*s absorb the partial sum located
    // s slots to the right. The bounds check keeps the read inside tmp when
    // threadsPerBlock is not a power of two.
    if (threadIdx.x % (2 * s) == 0 && threadIdx.x + s < threadsPerBlock) {
        tmp[threadIdx.x] += tmp[threadIdx.x + s];
    }
    __syncthreads();  // finish this level before the next level reads it
}

// tmp[0] now holds the sum over the whole block; publish it once.
if (threadIdx.x == 0) {
    atomicAdd(result, tmp[0]);
}
// Block-level sum reduction (sequential addressing): every thread contributes
// i + 1, the upper half of the live range is folded onto the lower half until
// one value remains, and thread 0 adds the block total to *result with one
// atomicAdd.
// NOTE(review): assumes blockDim.x == threadsPerBlock and that every thread of
// the block reaches this code (the __syncthreads() calls require it) - confirm
// against the enclosing kernel.
//
// tmp has to be __shared__: as a plain local array each thread would own a
// private copy, and tmp[threadIdx.x + offset] would read uninitialized memory
// instead of another thread's partial sum.
__shared__ unsigned long long tmp[threadsPerBlock];
tmp[threadIdx.x] = i + 1;
__syncthreads();  // every contribution must be stored before the first combine

// `active` is the number of leading tmp slots that still carry partial sums.
// Each pass folds slots [half, active) onto [0, active / 2). With an odd
// `active` the middle slot (half - 1) is simply carried over, so no element is
// lost when threadsPerBlock is not a power of two. For power-of-two sizes this
// performs exactly the same additions as halving from threadsPerBlock / 2.
for (uint32_t active = threadsPerBlock; active > 1;) {
    const uint32_t half = (active + 1) / 2;  // ceil(active / 2)
    if (threadIdx.x < active / 2) {
        tmp[threadIdx.x] += tmp[threadIdx.x + half];
    }
    __syncthreads();  // finish this level before the next level reads it
    active = half;
}

// tmp[0] now holds the sum over the whole block; publish it once.
if (threadIdx.x == 0) {
    atomicAdd(result, tmp[0]);
}