SumBeforeWarp<blockSize>(sum);
if (threadIdx.x < 32) {
SumWarp<blockSize>(sum);
if (threadIdx.x == 0) output[blockIdx.x] = sum[0] * multiplyFactor;
}
}
void KernelSumSmallArray(cudaStream_t stream, int blockSize, cudafloat * inputs, cudafloat * output, int numInputs, cudafloat multiplyFactor) {
switch(blockSize) {
#ifdef FERMI
case 1024:
SumSmallArray<1024><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
#endif
case 512:
SumSmallArray<512><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
case 256:
SumSmallArray<256><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
case 128:
SumSmallArray<128><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
case 64:
SumSmallArray<64><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
case 32:
SumSmallArray<32><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
case 16:
SumSmallArray<16><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
case 8:
SumSmallArray<8><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
case 4:
SumSmallArray<4><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
case 2:
SumSmallArray<2><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
case 1:
SumSmallArray<1><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(inputs, output, numInputs, multiplyFactor);
break;
}
}