// Stage the next layer's local gradients for this SAMPLE: only the threads with
// NEURON == 0 perform the copy, one element per OUTPUT_NEURON.
// NOTE(review): lgNextLayer is declared outside this view; the barrier below implies
// it is shared between the threads of the block -- confirm at the declaration.
if (NEURON == 0) lgNextLayer[OUTPUT_NEURON] = localGradientNextLayer[SAMPLE * NUM_OUTPUTS + OUTPUT_NEURON];
// Index of the weight linking this NEURON to OUTPUT_NEURON: weights is addressed in rows
// of NUM_INPUTS_OUTPUT_NEURON entries, one row per OUTPUT_NEURON. The "+ 1" skips the
// first entry of the row (presumably the bias weight -- TODO confirm).
int connection = OUTPUT_NEURON * NUM_INPUTS_OUTPUT_NEURON + NEURON + 1;
// Slot of this thread in lg: each NEURON owns NUM_OUTPUTS consecutive entries,
// indexed by OUTPUT_NEURON.
int threadId = (NEURON * NUM_OUTPUTS + OUTPUT_NEURON);
// Barrier: the lgNextLayer values written by the NEURON == 0 threads must be visible
// to every thread before they are read on the next line.
__syncthreads();
// Contribution of this connection: weight times the next layer's local gradient.
lg[threadId] = weights[connection] * lgNextLayer[OUTPUT_NEURON];
// Barrier: all products must be stored before the reduction reads other threads' slots.
__syncthreads();
// In-place pairwise reduction over the NUM_OUTPUTS entries owned by each NEURON.
// Each pass folds the upper part of the active range onto the lower part. When the
// active count is odd, the middle element is left untouched and is folded in by a
// later pass. Once the loop ends, lg[NEURON * NUM_OUTPUTS] (the OUTPUT_NEURON == 0
// slot) holds the sum of all products for that NEURON.
// NOTE(review): this assumes OUTPUT_NEURON covers 0 .. NUM_OUTPUTS - 1 within the block
// and that NUM_OUTPUTS is the same for every thread, so that all threads execute the
// same number of __syncthreads() calls -- confirm against the launch configuration.
int numberElemSum = NUM_OUTPUTS;
for(int sumUpTo = (numberElemSum >> 1); numberElemSum > 1; sumUpTo = (numberElemSum >> 1)) {
// Number of elements that remain active after this pass: ceil(numberElemSum / 2).
int nextNumberElemSum = sumUpTo;
if (numberElemSum & 1) nextNumberElemSum++;
// The first sumUpTo threads each absorb their partner from the upper part. Reads start
// at offset nextNumberElemSum and writes stay below sumUpTo, so they never overlap
// within a pass.
if (OUTPUT_NEURON < sumUpTo) lg[threadId] += lg[threadId + nextNumberElemSum];
numberElemSum = nextNumberElemSum;
// Barrier placed outside the conditional so every thread reaches it; it makes this
// pass's partial sums visible before the next pass reads them.
__syncthreads();
}
if (OUTPUT_NEURON == 0) {
cudafloat lgn = CUDA_VALUE(0.0);
int n = SAMPLE * NUM_NEURONS + NEURON;
cudafloat i = inputs[n];