2018-08-14 Jeff Fessler, University of Michigan
2020-08-05 Julia 1.5
2023-08-23 Julia 1.6.2
using BenchmarkTools
# conventional high-level matrix-vector multiply function
function mult0(A::Matrix, x::Vector)
    @boundscheck size(A,2) == length(x) || throw(DimensionMismatch("size(A,2) must equal length(x)"))
    return A * x # dispatches to an optimized BLAS gemv for dense Float64 arrays
end;
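For comparison, LinearAlgebra offers an in-place mul! that writes into a preallocated output, avoiding even the one small allocation of A * x. A minimal sketch (the name mult0! is illustrative, not part of the benchmark below):
using LinearAlgebra: mul!
# in-place variant (illustrative): store A*x in a preallocated y
function mult0!(y::Vector, A::Matrix, x::Vector)
    return mul!(y, A, x) # calls an optimized BLAS gemv for dense Float64 arrays
end;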
This is the textbook version: a double loop with the row index m outermost.
function mult_mn(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        inprod = zero(eltype(x)) # accumulator
        for n=1:N
            inprod += A[m,n] * x[n]
        end
        y[m] = inprod
    end
    return y
end;
# with @inbounds
function mult_mn_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        inprod = zero(eltype(x)) # accumulator
        for n=1:N
            @inbounds inprod += A[m,n] * x[n]
        end
        @inbounds y[m] = inprod
    end
    return y
end;
We expect this loop order (n outer, m inner) to be faster because Julia stores matrices in column-major order, so the inner loop over m accesses contiguous memory.
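A tiny check of the storage order (B here is just a toy example, not part of the benchmark): vec returns the elements in the order they are stored.
B = [1 3; 2 4]
@assert vec(B) == [1, 2, 3, 4] # columns [1,2] and [3,4] are contiguous in memory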
function mult_nm(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        for m=1:M
            y[m] += A[m,n] * x[n]
        end
    end
    return y
end;
# with @inbounds
function mult_nm_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        for m=1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;
# and with @simd
function mult_nm_inbounds_simd(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @simd for m=1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;
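One way to verify that @simd actually vectorized the inner loop is to inspect the generated LLVM IR and look for vector types such as <4 x double>; the output is machine- and Julia-version-dependent, so treat this as a diagnostic sketch:
using InteractiveUtils: @code_llvm
# dump the LLVM IR for a small test case; a vectorized loop shows <N x double> operations
@code_llvm debuginfo=:none mult_nm_inbounds_simd(randn(8,8), randn(8))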
# and with @views (a no-op here, because scalar indexing creates no slices to turn into views)
function mult_nm_inbounds_simd_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @simd for m=1:M
            @inbounds @views y[m] += A[m,n] * x[n]
        end
    end
    return y
end;
Loop over m, computing the inner product of each row of A with x.
function mult_row(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        y[m] = transpose(A[m,:]) * x # A[m,:] copies the row into a new vector
    end
    return y
end;
# with @inbounds
function mult_row_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @inbounds y[m] = transpose(A[m,:]) * x
    end
    return y
end;
# with @views
function mult_row_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @views y[m] = transpose(A[m,:]) * x
    end
    return y
end;
# with both
function mult_row_inbounds_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @inbounds @views y[m] = transpose(A[m,:]) * x
    end
    return y
end;
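An equivalent row-oriented version could use dot from LinearAlgebra with a view, avoiding the row copy, though the access across a row is still strided rather than contiguous. A sketch (mult_row_dot is our name and is not part of the benchmark below):
using LinearAlgebra: dot
function mult_row_dot(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        y[m] = dot(view(A, m, :), x) # no row copy; still strided access across columns
    end
    return y
end;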
Loop over n, accumulating the columns of A scaled by the elements of x.
function mult_col(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        y += A[:,n] * x[n] # allocates a column copy, a product, and a new y every iteration
    end
    return y
end;
# with "dots" (broadcast) to coalesce
function mult_col_dot(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        y .+= A[:,n] .* x[n] # updates y in place, but A[:,n] still copies the column
    end
    return y
end;
# and @views
function mult_col_dot_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @views y .+= A[:,n] .* x[n] # the view avoids the column copy
        # @inbounds @views y .+= A[:,n] .* x[n] # did not help
    end
    return y
end;
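One could also update y with the BLAS-style axpy! from LinearAlgebra, which computes y .+= a .* x in place. This sketch (mult_col_axpy is our name and is not benchmarked below) is roughly equivalent to the @views broadcast version:
using LinearAlgebra: axpy!
function mult_col_axpy(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        axpy!(x[n], view(A, :, n), y) # y += x[n] * A[:,n] without temporaries
    end
    return y
end;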
M = 2^11
N = M-4
A = randn(M, N)
x = randn(N)
flist = (mult0,
    mult_mn, mult_mn_inbounds,
    mult_nm, mult_nm_inbounds, mult_nm_inbounds_simd, mult_nm_inbounds_simd_views,
    mult_row, mult_row_inbounds, mult_row_views, mult_row_inbounds_views,
    mult_col, mult_col_dot, mult_col_dot_views,
)
for f in flist # warm-up and test each version
    @assert A * x ≈ f(A, x)
end
for f in flist # benchmark timing for each
    # @show f
    # @btime $f($A, $x)
    b = @benchmark $f($A,$x)
    tim = round(median(b.times)/10^6, digits=1) # median time in ms (b.times is in ns)
    tim = lpad(tim, 4)
    name = rpad(f, 27)
    alloc = lpad(b.allocs, 5)
    mem = round(b.memory/2^10, digits=1) # memory in KiB
    println("$name : $tim ms $alloc alloc $mem KiB")
end
mult0                       :  0.9 ms     1 alloc 16.1 KiB
mult_mn                     : 22.6 ms     1 alloc 16.1 KiB
mult_mn_inbounds            : 22.2 ms     1 alloc 16.1 KiB
mult_nm                     :  3.1 ms     1 alloc 16.1 KiB
mult_nm_inbounds            :  1.5 ms     1 alloc 16.1 KiB
mult_nm_inbounds_simd       :  1.5 ms     1 alloc 16.1 KiB
mult_nm_inbounds_simd_views :  1.5 ms     1 alloc 16.1 KiB
mult_row                    : 32.9 ms  2049 alloc 33040.1 KiB
mult_row_inbounds           : 32.9 ms  2049 alloc 33040.1 KiB
mult_row_views              : 22.4 ms     1 alloc 16.1 KiB
mult_row_inbounds_views     : 22.5 ms     1 alloc 16.1 KiB
mult_col                    : 16.7 ms  6133 alloc 98894.6 KiB
mult_col_dot                :  7.2 ms  2045 alloc 32975.6 KiB
mult_col_dot_views          :  1.5 ms     1 alloc 16.1 KiB
#=
mult0 : 0.9 ms 1 alloc 16.1 KiB
mult_mn : 22.5 ms 1 alloc 16.1 KiB
mult_mn_inbounds : 22.0 ms 1 alloc 16.1 KiB
mult_nm : 3.1 ms 1 alloc 16.1 KiB
mult_nm_inbounds : 1.5 ms 1 alloc 16.1 KiB
mult_nm_inbounds_simd : 1.5 ms 1 alloc 16.1 KiB
mult_nm_inbounds_simd_views : 1.5 ms 1 alloc 16.1 KiB
mult_row : 32.8 ms 2049 alloc 33040.1 KiB
mult_row_inbounds : 32.7 ms 2049 alloc 33040.1 KiB
mult_row_views : 22.4 ms 1 alloc 16.1 KiB
mult_row_inbounds_views : 22.4 ms 1 alloc 16.1 KiB
mult_col : 16.0 ms 6133 alloc 98894.6 KiB
mult_col_dot : 7.0 ms 2045 alloc 32975.6 KiB
mult_col_dot_views : 1.5 ms 1 alloc 16.1 KiB
=#
The results above are for a
2017 iMac with a 4.2 GHz quad-core Intel Core i7,
running macOS Mojave 10.14.6 and Julia 1.6.2.
As expected, the simple A * x
is the fastest,
because it calls an optimized BLAS library routine.
But one can come quite close to it in plain Julia
by using the proper (column-major) double-loop order
with @inbounds,
or by using "dots" (broadcast) and @views
to coalesce operations and avoid temporary arrays.
Without @views,
the slice-based versions allocate a new vector
on every iteration, causing huge memory overhead!