2018-08-14 Jeff Fessler, University of Michigan
2020-08-05 Julia 1.5
2023-08-23 Julia 1.6.2
using BenchmarkTools
# conventional high-level matrix-vector multiply function
function mult0(A::Matrix, x::Vector)
    @boundscheck size(A,2) == length(x) || throw(DimensionMismatch("size(A,2) must equal length(x)"))
    return A * x # dispatches to an optimized BLAS gemv for dense Float64 arrays
end;
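For comparison, LinearAlgebra offers an in-place mul! that writes into a preallocated output, avoiding even the one small allocation of A * x. A minimal sketch (the name mult0! is illustrative, not part of the benchmark below):
using LinearAlgebra: mul!
# in-place variant (illustrative): store A*x in a preallocated y
function mult0!(y::Vector, A::Matrix, x::Vector)
    return mul!(y, A, x) # calls an optimized BLAS gemv for dense Float64 arrays
end;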
This is the textbook version: a double loop with the row index m outermost.
function mult_mn(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        inprod = zero(eltype(x)) # accumulator
        for n=1:N
            inprod += A[m,n] * x[n]
        end
        y[m] = inprod
    end
    return y
end;
# with @inbounds
function mult_mn_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        inprod = zero(eltype(x)) # accumulator
        for n=1:N
            @inbounds inprod += A[m,n] * x[n]
        end
        @inbounds y[m] = inprod
    end
    return y
end;
We expect this loop order (n outer, m inner) to be faster because Julia stores matrices in column-major order, so the inner loop over m accesses contiguous memory.
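A tiny check of the storage order (B here is just a toy example, not part of the benchmark): vec returns the elements in the order they are stored.
B = [1 3; 2 4]
@assert vec(B) == [1, 2, 3, 4] # columns [1,2] and [3,4] are contiguous in memory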
function mult_nm(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        for m=1:M
            y[m] += A[m,n] * x[n]
        end
    end
    return y
end;
# with @inbounds
function mult_nm_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        for m=1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;
# and with @simd
function mult_nm_inbounds_simd(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @simd for m=1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;
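One way to verify that @simd actually vectorized the inner loop is to inspect the generated LLVM IR and look for vector types such as <4 x double>; the output is machine- and Julia-version-dependent, so treat this as a diagnostic sketch:
using InteractiveUtils: @code_llvm
# dump the LLVM IR for a small test case; a vectorized loop shows <N x double> operations
@code_llvm debuginfo=:none mult_nm_inbounds_simd(randn(8,8), randn(8))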
# and with @views (a no-op here, because scalar indexing creates no slices to turn into views)
function mult_nm_inbounds_simd_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @simd for m=1:M
            @inbounds @views y[m] += A[m,n] * x[n]
        end
    end
    return y
end;
Loop over m, computing the inner product of each row of A with x.
function mult_row(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        y[m] = transpose(A[m,:]) * x # A[m,:] copies the row into a new vector
    end
    return y
end;
# with @inbounds
function mult_row_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @inbounds y[m] = transpose(A[m,:]) * x
    end
    return y
end;
# with @views
function mult_row_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @views y[m] = transpose(A[m,:]) * x
    end
    return y
end;
# with both
function mult_row_inbounds_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @inbounds @views y[m] = transpose(A[m,:]) * x
    end
    return y
end;
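An equivalent row-oriented version could use dot from LinearAlgebra with a view, avoiding the row copy, though the access across a row is still strided rather than contiguous. A sketch (mult_row_dot is our name and is not part of the benchmark below):
using LinearAlgebra: dot
function mult_row_dot(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        y[m] = dot(view(A, m, :), x) # no row copy; still strided access across columns
    end
    return y
end;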
Loop over n, accumulating the columns of A scaled by the elements of x.
function mult_col(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        y += A[:,n] * x[n] # allocates a column copy, a product, and a new y every iteration
    end
    return y
end;
# with "dots" (broadcast) to coalesce
function mult_col_dot(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        y .+= A[:,n] .* x[n] # updates y in place, but A[:,n] still copies the column
    end
    return y
end;
# and @views
function mult_col_dot_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @views y .+= A[:,n] .* x[n] # the view avoids the column copy
        # @inbounds @views y .+= A[:,n] .* x[n] # did not help
    end
    return y
end;
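One could also update y with the BLAS-style axpy! from LinearAlgebra, which computes y .+= a .* x in place. This sketch (mult_col_axpy is our name and is not benchmarked below) is roughly equivalent to the @views broadcast version:
using LinearAlgebra: axpy!
function mult_col_axpy(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        axpy!(x[n], view(A, :, n), y) # y += x[n] * A[:,n] without temporaries
    end
    return y
end;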
M = 2^11
N = M-4
A = randn(M, N)
x = randn(N)
flist = (mult0,
    mult_mn, mult_mn_inbounds,
    mult_nm, mult_nm_inbounds, mult_nm_inbounds_simd, mult_nm_inbounds_simd_views,
    mult_row, mult_row_inbounds, mult_row_views, mult_row_inbounds_views,
    mult_col, mult_col_dot, mult_col_dot_views,
)
for f in flist # warm-up and test each version
    @assert A * x ≈ f(A, x)
end
for f in flist # benchmark timing for each
    # @show f
    # @btime $f($A, $x)
    b = @benchmark $f($A,$x)
    tim = round(median(b.times)/10^6, digits=1) # median time in ms (b.times is in ns)
    tim = lpad(tim, 4)
    name = rpad(f, 27)
    alloc = lpad(b.allocs, 5)
    mem = round(b.memory/2^10, digits=1) # memory in KiB
    println("$name : $tim ms $alloc alloc $mem KiB")
end
mult0                       :  0.9 ms     1 alloc 16.1 KiB
mult_mn                     : 22.6 ms     1 alloc 16.1 KiB
mult_mn_inbounds            : 22.2 ms     1 alloc 16.1 KiB
mult_nm                     :  3.1 ms     1 alloc 16.1 KiB
mult_nm_inbounds            :  1.5 ms     1 alloc 16.1 KiB
mult_nm_inbounds_simd       :  1.5 ms     1 alloc 16.1 KiB
mult_nm_inbounds_simd_views :  1.5 ms     1 alloc 16.1 KiB
mult_row                    : 32.9 ms  2049 alloc 33040.1 KiB
mult_row_inbounds           : 32.9 ms  2049 alloc 33040.1 KiB
mult_row_views              : 22.4 ms     1 alloc 16.1 KiB
mult_row_inbounds_views     : 22.5 ms     1 alloc 16.1 KiB
mult_col                    : 16.7 ms  6133 alloc 98894.6 KiB
mult_col_dot                :  7.2 ms  2045 alloc 32975.6 KiB
mult_col_dot_views          :  1.5 ms     1 alloc 16.1 KiB
#=
mult0 : 0.9 ms 1 alloc 16.1 KiB
mult_mn : 22.5 ms 1 alloc 16.1 KiB
mult_mn_inbounds : 22.0 ms 1 alloc 16.1 KiB
mult_nm : 3.1 ms 1 alloc 16.1 KiB
mult_nm_inbounds : 1.5 ms 1 alloc 16.1 KiB
mult_nm_inbounds_simd : 1.5 ms 1 alloc 16.1 KiB
mult_nm_inbounds_simd_views : 1.5 ms 1 alloc 16.1 KiB
mult_row : 32.8 ms 2049 alloc 33040.1 KiB
mult_row_inbounds : 32.7 ms 2049 alloc 33040.1 KiB
mult_row_views : 22.4 ms 1 alloc 16.1 KiB
mult_row_inbounds_views : 22.4 ms 1 alloc 16.1 KiB
mult_col : 16.0 ms 6133 alloc 98894.6 KiB
mult_col_dot : 7.0 ms 2045 alloc 32975.6 KiB
mult_col_dot_views : 1.5 ms 1 alloc 16.1 KiB
=#
The results above are for a
2017 iMac with a 4.2 GHz quad-core Intel Core i7,
running macOS Mojave 10.14.6 and Julia 1.6.2.
As expected, the simple A * x
is the fastest,
because it calls an optimized BLAS library routine.
But one can come quite close to it in plain Julia
by using the proper (column-major) double-loop order
with @inbounds,
or by using "dots" (broadcast) and @views
to coalesce operations and avoid temporary arrays.
Without @views,
the slice-based versions allocate a new vector
on every iteration, causing huge memory overhead!