{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Explore matrix-vector multiplication with Julia \n", "2018-08-14 Jeff Fessler, University of Michigan \n", "2020-08-05 Julia 1.5 \n", "2023-08-23 Julia 1.6.2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "using BenchmarkTools" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# conventional high-level matrix-vector multiply function\n", "function mult0(A::Matrix, x::Vector)\n", " @boundscheck size(A,2) == length(x) || throw(\"DimensionMismatch(A,x)\")\n", " return A * x\n", "end;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Double loop over m,n\n", "This is the textbook version." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "function mult_mn(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = similar(x, M)\n", " for m=1:M\n", " inprod = zero(eltype(x)) # accumulator\n", " for n=1:N\n", " inprod += A[m,n] * x[n]\n", " end\n", " y[m] = inprod\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with @inbounds\n", "function mult_mn_inbounds(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = similar(x, M)\n", " for m=1:M\n", " inprod = zero(x[1]) # accumulator\n", " for n=1:N\n", " @inbounds inprod += A[m,n] * x[n]\n", " end\n", " @inbounds y[m] = inprod\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Double loop over n,m \n", "We expect this way to be faster because of cache access over m." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "function mult_nm(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = zeros(eltype(x), M)\n", " for n=1:N\n", " for m=1:M\n", " y[m] += A[m,n] * x[n]\n", " end\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with @inbounds\n", "function mult_nm_inbounds(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = zeros(eltype(x), M)\n", " for n=1:N\n", " for m=1:M\n", " @inbounds y[m] += A[m,n] * x[n]\n", " end\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# and with @simd\n", "function mult_nm_inbounds_simd(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = zeros(eltype(x), M)\n", " for n=1:N\n", " @simd for m=1:M\n", " @inbounds y[m] += A[m,n] * x[n]\n", " end\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# and with @views\n", "function mult_nm_inbounds_simd_views(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = zeros(eltype(x), M)\n", " for n=1:N\n", " @simd for m=1:M\n", " @inbounds @views y[m] += A[m,n] * x[n]\n", " end\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Row versions\n", "Loop over m." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "function mult_row(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = similar(x, M)\n", " for m=1:M\n", " y[m] = transpose(A[m,:]) * x\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with @inbounds\n", "function mult_row_inbounds(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = similar(x, M)\n", " for m=1:M\n", " @inbounds y[m] = transpose(A[m,:]) * x\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with @views\n", "function mult_row_views(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = similar(x, M)\n", " for m=1:M\n", " @views y[m] = transpose(A[m,:]) * x\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with both\n", "function mult_row_inbounds_views(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = similar(x, M)\n", " for m=1:M\n", " @inbounds @views y[m] = transpose(A[m,:]) * x\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Col versions\n", "Loop over n." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "function mult_col(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = zeros(eltype(x), M)\n", " for n=1:N\n", " y += A[:,n] * x[n]\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with \"dots\" (broadcast) to coalesce\n", "function mult_col_dot(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = zeros(eltype(x), M)\n", " for n=1:N\n", " y .+= A[:,n] .* x[n]\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "# and @views\n", "function mult_col_dot_views(A::Matrix, x::Vector)\n", " (M,N) = size(A)\n", " y = zeros(eltype(x), M)\n", " for n=1:N\n", " @views y .+= A[:,n] .* x[n]\n", "# @inbounds @views y .+= A[:,n] .* x[n] # did not help\n", " end\n", " return y\n", "end;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test and time each version" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "M = 2^11\n", "N = M-4\n", "A = randn(M, N)\n", "x = randn(N)\n", "\n", "flist = (mult0,\n", " mult_mn, mult_mn_inbounds,\n", " mult_nm, mult_nm_inbounds, mult_nm_inbounds_simd, mult_nm_inbounds_simd_views,\n", " mult_row, mult_row_inbounds, mult_row_views, mult_row_inbounds_views,\n", " mult_col, mult_col_dot, mult_col_dot_views,\n", ")\n", "\n", "for f in flist # warm-up and test each version\n", " @assert A * x ≈ f(A, x)\n", "end\n", "\n", "for f in flist # benchmark timing for each\n", "# @show f\n", "# @btime $f($A, $x)\n", " b = @benchmark $f($A,$x)\n", " tim = round(median(b.times)/10^6, digits=1) # in ms\n", " tim = lpad(tim, 4)\n", " name = rpad(f, 27)\n", "\talloc = lpad(b.allocs, 5)\n", "\tmem = round(b.memory/2^10, digits=1)\n", " println(\"$name : $tim ms $alloc alloc $mem KiB\")\n", "end" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#=\n", "mult0 : 0.9 ms 1 alloc 16.1 KiB\n", "mult_mn : 22.5 ms 1 alloc 16.1 KiB\n", "mult_mn_inbounds : 22.0 ms 1 alloc 16.1 KiB\n", "mult_nm : 3.1 ms 1 alloc 16.1 KiB\n", "mult_nm_inbounds : 1.5 ms 1 alloc 16.1 KiB\n", "mult_nm_inbounds_simd : 1.5 ms 1 alloc 16.1 KiB\n", "mult_nm_inbounds_simd_views : 1.5 ms 1 alloc 16.1 KiB\n", "mult_row : 32.8 ms 2049 alloc 33040.1 KiB\n", "mult_row_inbounds : 32.7 ms 2049 alloc 33040.1 KiB\n", "mult_row_views : 22.4 ms 1 alloc 16.1 KiB\n", "mult_row_inbounds_views : 22.4 ms 1 alloc 16.1 KiB\n", "mult_col : 16.0 ms 6133 alloc 98894.6 KiB\n", "mult_col_dot : 7.0 ms 2045 alloc 32975.6 KiB\n", "mult_col_dot_views : 1.5 ms 1 alloc 16.1 KiB\n", "=#" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The results above are for a\n", "2017 iMac with 4.2GHz quad-core Intel Core i7\n", "with macOS Mojave 10.14.6 and Julia 1.6.2.\n", "As expected, simple ``A*x`` is the fastest,\n", "but one can come quite close to that using proper double loop order\n", "with ``@inbounds`` or using \"dots\" and ``@views`` to coalesce.\n", "Without ``@views`` the vector versions have huge memory overhead! " ] } ], "metadata": { "kernelspec": { "display_name": "Julia 1.6.2", "language": "julia", "name": "julia-1.6" } }, "nbformat": 4, "nbformat_minor": 4 }