# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .jl
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.3.3
#   kernelspec:
#     display_name: Julia 1.6.2
#     language: julia
#     name: julia-1.6
# ---

# ## Explore matrix-vector multiplication with Julia
# 2018-08-14 Jeff Fessler, University of Michigan
# 2020-08-05 Julia 1.5
# 2023-08-23 Julia 1.6.2

using BenchmarkTools

# conventional high-level matrix-vector multiply function
function mult0(A::Matrix, x::Vector)
    @boundscheck size(A,2) == length(x) ||
        throw(DimensionMismatch("size(A,2) ≠ length(x)"))
    return A * x
end;

# ## Double loop over m,n
# This is the textbook version.

function mult_mn(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        inprod = zero(eltype(x)) # accumulator
        for n=1:N
            inprod += A[m,n] * x[n]
        end
        y[m] = inprod
    end
    return y
end;

# with @inbounds

function mult_mn_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        inprod = zero(eltype(x)) # accumulator
        for n=1:N
            @inbounds inprod += A[m,n] * x[n]
        end
        @inbounds y[m] = inprod
    end
    return y
end;

# ## Double loop over n,m
# We expect this loop order to be faster because Julia stores arrays
# in column-major order, so the inner loop over m
# accesses consecutive (cache-friendly) memory locations in A.

function mult_nm(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        for m=1:M
            y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

# with @inbounds

function mult_nm_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        for m=1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

# and with @simd

function mult_nm_inbounds_simd(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @simd for m=1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

# and with @views

function mult_nm_inbounds_simd_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @simd for m=1:M
            @inbounds @views y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

# ## Row versions
# Loop over m, computing y[m] as the inner product of the mth row of A with x.

function mult_row(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        y[m] = transpose(A[m,:]) * x # A[m,:] allocates a copy of the row
    end
    return y
end;

# with @inbounds

function mult_row_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @inbounds y[m] = transpose(A[m,:]) * x
    end
    return y
end;

# with @views

function mult_row_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @views y[m] = transpose(A[m,:]) * x # the view avoids the row copy
    end
    return y
end;

# with both

function mult_row_inbounds_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @inbounds @views y[m] = transpose(A[m,:]) * x
    end
    return y
end;
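# Aside: a quick way to confirm Julia's column-major layout,
# which explains why the row slices above stride across memory
# while the column versions below access it sequentially.
# (This snippet is an added illustration, not part of the original timing study.)

B = collect(reshape(1:6, 2, 3)) # 2×3 Matrix
@assert vec(B) == 1:6 # memory order walks down each column first
@assert strides(B) == (1, 2) # unit stride along m; stride M=2 along n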
# ## Col versions
# Loop over n, accumulating x[n] times the nth column of A into y.

function mult_col(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        y += A[:,n] * x[n] # allocates both the column copy and a new y
    end
    return y
end;

# with "dots" (broadcast) to coalesce

function mult_col_dot(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        y .+= A[:,n] .* x[n] # updates y in place, but A[:,n] still copies
    end
    return y
end;

# and @views

function mult_col_dot_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @views y .+= A[:,n] .* x[n]
#       @inbounds @views y .+= A[:,n] .* x[n] # did not help
    end
    return y
end;

# ## Test and time each version

# +
M = 2^11
N = M-4
A = randn(M, N)
x = randn(N)

flist = (mult0,
    mult_mn, mult_mn_inbounds,
    mult_nm, mult_nm_inbounds, mult_nm_inbounds_simd, mult_nm_inbounds_simd_views,
    mult_row, mult_row_inbounds, mult_row_views, mult_row_inbounds_views,
    mult_col, mult_col_dot, mult_col_dot_views,
)

for f in flist # warm-up and test each version
    @assert A * x ≈ f(A, x)
end

for f in flist # benchmark timing for each
    # @show f
    # @btime $f($A, $x)
    b = @benchmark $f($A,$x)
    tim = round(median(b.times)/10^6, digits=1) # in ms
    tim = lpad(tim, 4)
    name = rpad(f, 27)
    alloc = lpad(b.allocs, 5)
    mem = round(b.memory/2^10, digits=1) # in KiB
    println("$name : $tim ms $alloc alloc $mem KiB")
end
# -

#=
mult0                       :  0.9 ms     1 alloc    16.1 KiB
mult_mn                     : 22.5 ms     1 alloc    16.1 KiB
mult_mn_inbounds            : 22.0 ms     1 alloc    16.1 KiB
mult_nm                     :  3.1 ms     1 alloc    16.1 KiB
mult_nm_inbounds            :  1.5 ms     1 alloc    16.1 KiB
mult_nm_inbounds_simd       :  1.5 ms     1 alloc    16.1 KiB
mult_nm_inbounds_simd_views :  1.5 ms     1 alloc    16.1 KiB
mult_row                    : 32.8 ms  2049 alloc 33040.1 KiB
mult_row_inbounds           : 32.7 ms  2049 alloc 33040.1 KiB
mult_row_views              : 22.4 ms     1 alloc    16.1 KiB
mult_row_inbounds_views     : 22.4 ms     1 alloc    16.1 KiB
mult_col                    : 16.0 ms  6133 alloc 98894.6 KiB
mult_col_dot                :  7.0 ms  2045 alloc 32975.6 KiB
mult_col_dot_views          :  1.5 ms     1 alloc    16.1 KiB
=#

# The results above are for a 2017 iMac
# with a 4.2 GHz quad-core Intel Core i7,
# running macOS Mojave 10.14.6 and Julia 1.6.2.
#
# As expected, the built-in ``A*x`` is the fastest,
# but one can come quite close to it by using the proper (n,m) double-loop order
# with ``@inbounds``, or by using "dots" (broadcast) together with ``@views``.
# Without ``@views``, the versions that slice rows or columns out of ``A``
# have huge memory overhead!
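# Aside (an added sketch, not one of the variants benchmarked above):
# when the output vector can be preallocated,
# ``LinearAlgebra.mul!`` computes ``A*x`` in place,
# avoiding even the 16.1 KiB output allocation of plain ``A*x``.

using LinearAlgebra
y = zeros(eltype(x), M)
mul!(y, A, x) # in-place y = A*x; no allocations after compilation
@assert y ≈ A * x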