Explore summation and loops using @simd and @inbounds with Julia
2018-08-11 Jeff Fessler, University of Michigan
Inspired by:
https://juliacomputing.com/blog/2017/09/27/auto-vectorization-in-julia.html
2020-08-05 Julia 1.5.0
2021-08-23 Julia 1.6.2
using BenchmarkTools
# Baseline: delegate to the built-in `sum` so the hand-written loops
# below can be compared against it.
sum0(a::Vector) = sum(a);
# Sum via a "for x in" element loop annotated with @simd.
# The accumulator starts at zero(eltype(a)), so it has the element type of `a`.
function sum_in_simd(a::Vector)
    acc = zero(eltype(a))
    @simd for val in a
        acc = acc + val
    end
    return acc
end;
# Sum via a plain "for x in" element loop (no @simd annotation).
# The accumulator starts at zero(eltype(a)), so it has the element type of `a`.
function sum_in(a::Vector)
    acc = zero(eltype(a))
    for val in a
        acc = acc + val
    end
    return acc
end;
# Sum via an index loop over 1:N annotated with @simd (no @inbounds).
# The 1:length(a) range is kept deliberately: this loop form is what is being timed.
function sum_range_simd(a::Vector)
    acc = zero(eltype(a))
    n = length(a)
    @simd for k in 1:n
        acc = acc + a[k]
    end
    return acc
end;
# Sum via a plain index loop over 1:N (neither @simd nor @inbounds).
# The 1:length(a) range is kept deliberately: this loop form is what is being timed.
function sum_range(a::Vector)
    acc = zero(eltype(a))
    n = length(a)
    for k in 1:n
        acc = acc + a[k]
    end
    return acc
end;
# Sum via an index loop over 1:N annotated with both @inbounds and @simd.
# Indices come from 1:length(a) on a Vector, so every a[k] access is in range.
function sum_range_inbounds_simd(a::Vector)
    acc = zero(eltype(a))
    n = length(a)
    @inbounds @simd for k in 1:n
        acc = acc + a[k]
    end
    return acc
end;
# Sum via an index loop over 1:N annotated with @inbounds only (no @simd).
# Indices come from 1:length(a) on a Vector, so every a[k] access is in range.
function sum_range_inbounds(a::Vector)
    acc = zero(eltype(a))
    n = length(a)
    @inbounds for k in 1:n
        acc = acc + a[k]
    end
    return acc
end;
# Test data: a vector of 10^6 random values.
x = rand(10^6);

# Test and time each function.
# `sum_range` (the plain "for i=1:N" loop) is included so that the range-loop
# variants with @simd and/or @inbounds have an unaccelerated baseline.
flist = (
    sum0,
    sum_in_simd,
    sum_in,
    sum_range_simd,
    sum_range,
    sum_range_inbounds,
    sum_range_inbounds_simd,
)

s0 = sum0(x) # reference value from the built-in sum

for f in flist # warm-up (compile) and check each against the reference
    # Explicit check instead of @assert: assertions may be disabled at some
    # optimization levels, which would also skip the warm-up call.
    s0 ≈ f(x) || error("$f(x) does not match sum0(x)")
end

for f in flist
    @show f
    # `$` interpolation passes the function and the data to the benchmark as
    # values, so access to the non-constant globals is not part of the timing.
    @btime $f($x)
end
f = sum0
  174.792 μs (0 allocations: 0 bytes)
f = sum_in_simd
  163.322 μs (0 allocations: 0 bytes)
f = sum_in
  908.208 μs (0 allocations: 0 bytes)
f = sum_range_simd
  162.350 μs (0 allocations: 0 bytes)
f = sum_range_inbounds
  907.931 μs (0 allocations: 0 bytes)
f = sum_range_inbounds_simd
  163.678 μs (0 allocations: 0 bytes)
The results above are for a 2017 iMac with 4.2GHz quad-core Intel Core i7
with macOS Mojave 10.14.6 and Julia 1.6.2.
The @simd "for x in vector" loop is just as fast as calling sum().
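However, the loops without @simd are more than 5 times slower in these timings:
the "for i=1:N" loop with @inbounds alone is as slow as the plain "for x in" loop,
whereas with @simd it is just as fast as sum(), with or without @inbounds.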