// Run the FFT-based pipeline once per slice. The padded buffers, the two
// spectrum buffers and both cuFFT plans are reused by every iteration.
for (int slice=0; slice<n_slices; slice++)
{
// Select this slice's input, kernel and output by pointer offset: data and
// result advance by data_slice_size per slice, the kernel by kernel_slice_size
// (offsets are in elements of the pointed-to type, not bytes).
d_Data = (*d_data) + slice * data_slice_size;
d_Kernel = (*d_kernel) + slice * kernel_slice_size;
d_Result = (*d_result) + slice * data_slice_size;
// Zero fftH * fftW floats in each padded buffer; d_PaddedData still holds the
// inverse-FFT output of the previous iteration at this point.
cutilSafeCall( cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)) );
cutilSafeCall( cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float)) );
// Presumably places the kernelH x kernelW kernel into the fftH x fftW buffer,
// positioned using (kernelY, kernelX) -- confirm against padKernel.
padKernel(d_PaddedKernel,d_Kernel,fftH,fftW,kernelH,kernelW,kernelY,kernelX);
// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime API;
// cudaDeviceSynchronize() is the documented replacement.
cutilSafeCall( cudaThreadSynchronize() );
// NOTE(review): this check runs after both pointers have already been handed
// to cudaMemset (and d_PaddedKernel to padKernel), and it only logs -- the
// loop continues regardless.
if (!d_PaddedData || !d_PaddedKernel) fprintf_verbose("NULL arguments!\n");
// Presumably copies the dataH x dataW slice into the padded buffer and fills
// the margin by clamping to the border, as the name suggests -- confirm.
padDataClampToBorder(d_PaddedData,d_Data,fftH,fftW,dataH,dataW,kernelH,kernelW,kernelY,kernelX);
cutilSafeCall( cudaThreadSynchronize() );
// Forward real-to-complex transforms: padded kernel -> d_KernelSpectrum,
// padded data -> d_DataSpectrum, both using fftPlanFwd.
cufftSafeCall( cufftExecR2C(fftPlanFwd, d_PaddedKernel, (cufftComplex *)d_KernelSpectrum) );
cutilSafeCall( cudaThreadSynchronize() );
cufftSafeCall( cufftExecR2C(fftPlanFwd, d_PaddedData, (cufftComplex *)d_DataSpectrum) );
// Combines the two spectra; the result is expected in d_DataSpectrum, since
// that is what the inverse transform reads next. Presumably a pointwise
// complex multiply plus FFT normalisation -- confirm against the helper.
// NOTE(review): no host-side synchronize separates the forward FFT, this call
// and the inverse FFT; assumes they are ordered on the same stream -- confirm.
modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW);
// Inverse complex-to-real transform; overwrites d_PaddedData with the result.
cufftSafeCall( cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, d_PaddedData) );
cutilSafeCall( cudaThreadSynchronize() );
// Writes this slice's output into d_Result from the padded result buffer
// (presumably extracting the dataH x dataW region -- confirm). No explicit
// synchronize follows this call inside the loop.
crop_image(d_Result,d_PaddedData,fftH,fftW,dataH,dataW,kernelH,kernelW);
}
// Teardown: destroy both cuFFT plans, then free the four scratch device
// buffers used by the slice loop. The buffers behind d_data, d_kernel and
// d_result are not freed here.
cufftSafeCall( cufftDestroy(fftPlanInv) );
cufftSafeCall( cufftDestroy(fftPlanFwd) );
cutilSafeCall( cudaFree(d_DataSpectrum) );
cutilSafeCall( cudaFree(d_KernelSpectrum) );
cutilSafeCall( cudaFree(d_PaddedData) );
cutilSafeCall( cudaFree(d_PaddedKernel) );
// This path always reports 0. NOTE(review): failure handling is presumably
// left to the cutilSafeCall / cufftSafeCall wrappers -- confirm what they do
// on error.
status = 0;
return status;
}