			// Final step of the reduction: compute the RMS from the accumulated sum
			// and guard against Inf/NaN before writing the result.
			cudafloat fRMS = CUDA_SQRT(shared_rms[0] / numberPatternsNeurons) / CUDA_VALUE(2.0);
			if (IsInfOrNaN(fRMS)) fRMS = numberPatternsNeurons;
			*rmsF = fRMS;
		}
	}
}
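// Host-side dispatcher for the CalculateRMS reduction: launches a single block whose
// size matches one of the template specializations below, with one cudafloat of
// dynamic shared memory per thread.
//
// Usage sketch (illustrative only, not part of the original file): a typical host-side
// call, assuming d_rmsPerPattern holds one partial error sum per pattern and d_rms
// receives the final value; the buffer names and sizes here are hypothetical.
//
//	int patterns = 1024;
//	int outputNeurons = 10;
//	KernelCalculateRMS(stream, 512, d_rmsPerPattern, d_rms, patterns,
//		(cudafloat) (patterns * outputNeurons));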
void KernelCalculateRMS(cudaStream_t stream, int blockSize, cudafloat * rms, cudafloat * rmsOut, int numberPatterns, cudafloat numberPatternsNeurons) {
	switch(blockSize) {
		#ifdef FERMI
		case 1024:
			CalculateRMS<1024><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;
		#endif
		case 512:
			CalculateRMS<512><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;
		case 256:
			CalculateRMS<256><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;
		case 128:
			CalculateRMS<128><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;
		case 64:
			CalculateRMS<64><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;
		case 32:
			CalculateRMS<32><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;
		case 16:
			CalculateRMS<16><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;
		case 8:
			CalculateRMS<8><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;
		case 4:
			CalculateRMS<4><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;
		case 2:
			CalculateRMS<2><<<1, blockSize, blockSize * sizeof(cudafloat), stream>>>(rms, rmsOut, numberPatterns, numberPatternsNeurons);
			break;