# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .jl
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.3.3
#   kernelspec:
#     display_name: Julia 1.6.2
#     language: julia
#     name: julia-1.6
# ---

# ## Explore matrix-vector multiplication with Julia
# 2018-08-14 Jeff Fessler, University of Michigan
# 2020-08-05 Julia 1.5
# 2023-08-23 Julia 1.6.2

using BenchmarkTools

# conventional high-level matrix-vector multiply function
function mult0(A::Matrix, x::Vector)
    @boundscheck size(A,2) == length(x) ||
        throw(DimensionMismatch("size(A,2) ≠ length(x)"))
    return A * x
end;

# ## Double loop over m,n
# This is the textbook version.

function mult_mn(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        inprod = zero(eltype(x)) # accumulator
        for n=1:N
            inprod += A[m,n] * x[n]
        end
        y[m] = inprod
    end
    return y
end;

# with @inbounds

function mult_mn_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        inprod = zero(eltype(x)) # accumulator
        for n=1:N
            @inbounds inprod += A[m,n] * x[n]
        end
        @inbounds y[m] = inprod
    end
    return y
end;

# ## Double loop over n,m
# We expect this loop order to be faster because Julia stores arrays
# in column-major order, so the inner loop over m
# accesses consecutive (cache-friendly) memory locations in A.

function mult_nm(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        for m=1:M
            y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

# with @inbounds

function mult_nm_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        for m=1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

# and with @simd

function mult_nm_inbounds_simd(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @simd for m=1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

# and with @views

function mult_nm_inbounds_simd_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @simd for m=1:M
            @inbounds @views y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

# ## Row versions
# Loop over m, computing y[m] as the inner product of the mth row of A with x.

function mult_row(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        y[m] = transpose(A[m,:]) * x # A[m,:] allocates a copy of the row
    end
    return y
end;

# with @inbounds

function mult_row_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @inbounds y[m] = transpose(A[m,:]) * x
    end
    return y
end;

# with @views

function mult_row_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @views y[m] = transpose(A[m,:]) * x # the view avoids the row copy
    end
    return y
end;

# with both

function mult_row_inbounds_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m=1:M
        @inbounds @views y[m] = transpose(A[m,:]) * x
    end
    return y
end;
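# Aside: a quick way to confirm Julia's column-major layout,
# which explains why the row slices above stride across memory
# while the column versions below access it sequentially.
# (This snippet is an added illustration, not part of the original timing study.)

B = collect(reshape(1:6, 2, 3)) # 2×3 Matrix
@assert vec(B) == 1:6 # memory order walks down each column first
@assert strides(B) == (1, 2) # unit stride along m; stride M=2 along n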
# ## Col versions
# Loop over n, accumulating x[n] times the nth column of A into y.

function mult_col(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        y += A[:,n] * x[n] # allocates both the column copy and a new y
    end
    return y
end;

# with "dots" (broadcast) to coalesce

function mult_col_dot(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        y .+= A[:,n] .* x[n] # updates y in place, but A[:,n] still copies
    end
    return y
end;

# and @views

function mult_col_dot_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n=1:N
        @views y .+= A[:,n] .* x[n]
#       @inbounds @views y .+= A[:,n] .* x[n] # did not help
    end
    return y
end;

# ## Test and time each version

# +
M = 2^11
N = M-4
A = randn(M, N)
x = randn(N)

flist = (mult0,
    mult_mn, mult_mn_inbounds,
    mult_nm, mult_nm_inbounds, mult_nm_inbounds_simd, mult_nm_inbounds_simd_views,
    mult_row, mult_row_inbounds, mult_row_views, mult_row_inbounds_views,
    mult_col, mult_col_dot, mult_col_dot_views,
)

for f in flist # warm-up and test each version
    @assert A * x ≈ f(A, x)
end

for f in flist # benchmark timing for each
    # @show f
    # @btime $f($A, $x)
    b = @benchmark $f($A,$x)
    tim = round(median(b.times)/10^6, digits=1) # in ms
    tim = lpad(tim, 4)
    name = rpad(f, 27)
    alloc = lpad(b.allocs, 5)
    mem = round(b.memory/2^10, digits=1) # in KiB
    println("$name : $tim ms $alloc alloc $mem KiB")
end
# -

#=
mult0                       :  0.9 ms     1 alloc    16.1 KiB
mult_mn                     : 22.5 ms     1 alloc    16.1 KiB
mult_mn_inbounds            : 22.0 ms     1 alloc    16.1 KiB
mult_nm                     :  3.1 ms     1 alloc    16.1 KiB
mult_nm_inbounds            :  1.5 ms     1 alloc    16.1 KiB
mult_nm_inbounds_simd       :  1.5 ms     1 alloc    16.1 KiB
mult_nm_inbounds_simd_views :  1.5 ms     1 alloc    16.1 KiB
mult_row                    : 32.8 ms  2049 alloc 33040.1 KiB
mult_row_inbounds           : 32.7 ms  2049 alloc 33040.1 KiB
mult_row_views              : 22.4 ms     1 alloc    16.1 KiB
mult_row_inbounds_views     : 22.4 ms     1 alloc    16.1 KiB
mult_col                    : 16.0 ms  6133 alloc 98894.6 KiB
mult_col_dot                :  7.0 ms  2045 alloc 32975.6 KiB
mult_col_dot_views          :  1.5 ms     1 alloc    16.1 KiB
=#

# The results above are for a 2017 iMac
# with a 4.2 GHz quad-core Intel Core i7,
# running macOS Mojave 10.14.6 and Julia 1.6.2.
#
# As expected, the built-in ``A*x`` is the fastest,
# but one can come quite close to it by using the proper (n,m) double-loop order
# with ``@inbounds``, or by using "dots" (broadcast) together with ``@views``.
# Without ``@views``, the versions that slice rows or columns out of ``A``
# have huge memory overhead!
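# Aside (an added sketch, not one of the variants benchmarked above):
# when the output vector can be preallocated,
# ``LinearAlgebra.mul!`` computes ``A*x`` in place,
# avoiding even the 16.1 KiB output allocation of plain ``A*x``.

using LinearAlgebra
y = zeros(eltype(x), M)
mul!(y, A, x) # in-place y = A*x; no allocations after compilation
@assert y ≈ A * x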