float scale_B = (float)hist_size/(max_B-min_B);
if(tid < array_size){
s_binned_A[threadIdx.x] = floor( (d_array_A[tid] - min_A) * scale_A );
s_binned_B[threadIdx.x] = floor( (d_array_B[tid] - min_B) * scale_B );
if (s_binned_A[threadIdx.x] < 0)
s_binned_A[threadIdx.x] = 0;
if (s_binned_B[threadIdx.x] < 0)
s_binned_B[threadIdx.x] = 0;
if (s_binned_A[threadIdx.x] >= hist_size)
s_binned_A[threadIdx.x] = hist_size-1;
if (s_binned_B[threadIdx.x] >= hist_size)
s_binned_B[threadIdx.x] = hist_size-1;
__syncthreads();
}
if (threadIdx.x == 0)
for(int i=0;i< blockDim.x; i++)
if (blockIdx.x*blockDim.x + i < array_size)
d_joint_hist[ s_binned_A[i] * hist_size + s_binned_B[i] ] ++;
return;
}