using BenchmarkTools


# Reference implementation: forwards straight to Base's `sum`.
# Its result is what the hand-written loop variants are checked against.
sum0(a::Vector) = sum(a);


# Sum the elements of `a` by iterating over its values (`for v in a`),
# with the loop annotated by `@simd`.
# The accumulator starts at `zero(eltype(a))`, which is also what an empty
# vector returns.
function sum_in_simd(a::Vector)
    T = eltype(a)
    acc = zero(T)
    @simd for v in a
        acc = acc + v
    end
    return acc
end;


# Sum the elements of `a` by iterating over its values (`for v in a`),
# without any `@simd` annotation.
# The accumulator starts at `zero(eltype(a))`, which is also what an empty
# vector returns.
function sum_in(a::Vector)
    T = eltype(a)
    acc = zero(T)
    for v in a
        acc = acc + v
    end
    return acc
end;


# Sum the elements of `a` with an index loop over `1:length(a)`,
# annotated by `@simd` (bounds checking is left as-is; no `@inbounds`).
# The accumulator starts at `zero(eltype(a))`, which is also what an empty
# vector returns.
function sum_range_simd(a::Vector)
    acc = zero(eltype(a))
    n = length(a)
    @simd for i in 1:n
        acc = acc + a[i]
    end
    return acc
end;


# Sum the elements of `a` with a plain index loop over `1:length(a)`:
# neither `@simd` nor `@inbounds` is applied.
# The accumulator starts at `zero(eltype(a))`, which is also what an empty
# vector returns.
function sum_range(a::Vector)
    acc = zero(eltype(a))
    n = length(a)
    for i in 1:n
        acc = acc + a[i]
    end
    return acc
end;


# Sum the elements of `a` with an index loop over `1:length(a)`,
# annotated by both `@inbounds` and `@simd`.
# Every index comes from `1:length(a)`, so each `a[i]` is within the vector.
# The accumulator starts at `zero(eltype(a))`, which is also what an empty
# vector returns.
function sum_range_inbounds_simd(a::Vector)
    acc = zero(eltype(a))
    n = length(a)
    @inbounds @simd for i in 1:n
        acc = acc + a[i]
    end
    return acc
end;


# Sum the elements of `a` with an index loop over `1:length(a)`,
# annotated by `@inbounds` only (no `@simd`).
# Every index comes from `1:length(a)`, so each `a[i]` is within the vector.
# The accumulator starts at `zero(eltype(a))`, which is also what an empty
# vector returns.
function sum_range_inbounds(a::Vector)
    acc = zero(eltype(a))
    n = length(a)
    @inbounds for i in 1:n
        acc = acc + a[i]
    end
    return acc
end;


# Benchmark input: a vector of 10^6 values produced by `rand`.
x = rand(10^6);


# Test and time each function.
# Every summation variant defined in this file is listed. `sum_range` (the
# plain index loop without `@simd` or `@inbounds`) was previously missing, so
# it was neither checked nor timed.
flist = (
    sum0,
    sum_in_simd,
    sum_in,
    sum_range_simd,
    sum_range,
    sum_range_inbounds,
    sum_range_inbounds_simd,
)

# Reference value that every variant must reproduce (up to `≈` tolerance).
s0 = sum0(x)
for f in flist # warm-up and test each
    @assert s0 ≈ f(x)
end

# `$f` and `$x` are interpolated into `@btime`, as recommended for
# benchmarking with BenchmarkTools.
for f in flist
    @show f
    @btime $f($x)
end

f = sum0
  174.792 μs (0 allocations: 0 bytes)
f = sum_in_simd
  163.322 μs (0 allocations: 0 bytes)
f = sum_in
  908.208 μs (0 allocations: 0 bytes)
f = sum_range_simd
  162.350 μs (0 allocations: 0 bytes)
f = sum_range_inbounds
  907.931 μs (0 allocations: 0 bytes)
f = sum_range_inbounds_simd
  163.678 μs (0 allocations: 0 bytes)

# Test and time