# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .jl
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.3.3
#   kernelspec:
#     display_name: Julia 1.6.2
#     language: julia
#     name: julia-1.6
# ---

# ## in-place.jl
# Illustrate the memory and speed benefits of in-place operations
# for GD (gradient descent) iteration.
# 2020-06-15 Jeff Fessler, University of Michigan
# 2021-08-23 Julia 1.6.2, faster closure versions

using BenchmarkTools: @btime
using LinearAlgebra: mul!, opnorm
using SparseArrays: sprandn

# textbook gradient descent implementation
function gd1(x0, grad::Function ; α::Real = 1, niter::Int = 100)
    x = copy(x0) # "copy" not actually necessary here
    for iter in 1:niter
        x = x - α * grad(x)
    end
    return x
end

# in-place update version of GD
function gd2(x0, grad::Function ; α::Real = 1, niter::Int = 100)
    x = copy(x0) # "copy" necessary here!
    for iter in 1:niter
        # x .-= α * grad(x) # better than gd1, but not best
        x .-= α .* grad(x) # the ".-=" causes an in-place x update
    end
    return x
end

# in-place gradient version of GD
function gd3(x0, grad!::Function ; α::Real = 1, niter::Int = 100)
    x = copy(x0)
    g = similar(x0) # allocate working memory for the gradient
    for iter in 1:niter
        x .-= α .* grad!(g, x) # no memory allocation here!
    end
    return x
end

# fully in-place version of GD
# input "g" is work space for the gradient
function gd4!(x, x0, g, grad!::Function ; α::Real = 1, niter::Int = 100)
    x .= x0
    for iter in 1:niter
        x .-= α .* grad!(g, x) # no memory allocation here!
    end
    # There is no requirement to return anything, because the input "x" was updated,
    return x # but we return it anyway for convenience.
end

# fully in-place version of GD using a convenient 1-input version of grad!
function gd5!(x, x0, grad!::Function ; α::Real = 1, niter::Int = 100)
    x .= x0
    for iter in 1:niter
        x .-= α .* grad!(x) # no memory allocation here!
    end
    return x
end

# ## Simple gradient case

grad = x -> x.^3 # gradient of 0.25 * ‖x‖^4

function grad!(g, x)
    g .= x.^3
    return g # for convenience
end

x0 = randn(2^10)
g2 = similar(x0);

# The following "closure" is faster than using a global variable for g5;
# see the sketch after the simple-case GD timings below.
function make_grad5(x0)
    g5 = similar(x0) # captures this variable in scope, improving speed
    return x -> grad!(g5, x) # "closure"
end
grad5! = make_grad5(x0)

# check gradients
grad(x0); grad!(g2, x0); grad5!(x0) # warm-up to compile
display(@allocated g1 = grad(x0))
display(@allocated grad!(g2, x0))
display(@allocated grad5!(x0))
@assert g1 == g2 == grad5!(x0)

# time gradients
# (grad! and grad5! are much faster)
if true
    println("Time simple gradients")
    @btime g2 = grad(x0) # 1 alloc
    @btime g2 .= grad(x0) # 3 alloc
    @btime grad!(g2, x0) # 0 alloc
    @btime grad5!(x0) # 0 alloc
end;

#=
Times on 2017 iMac (4.2 GHz Intel Core i7) with Mojave 10.14.6 and Julia 1.6.2:
380.831 ns (1 allocation: 8.12 KiB)
986.308 ns (3 allocations: 8.16 KiB)
 80.472 ns (0 allocations: 0 bytes)
 81.540 ns (0 allocations: 0 bytes)
=#

# check GD for simple case
α = 1e-4
x4 = similar(x0)
x5 = similar(x0)
gd5!(x5, x0, grad5! ; α)
gd4!(x4, x0, g2, grad! ; α)
x3 = gd3(x0, grad! ; α)
x2 = gd2(x0, grad ; α)
x1 = gd1(x0, grad ; α)
@assert x1 ≈ x2 ≈ x3 ≈ x4 ≈ x5

# ### Time GD for simple case
println("Time GD in simple case")
@btime gd5!(x5, x0, grad5! ; α);
@btime gd4!(x4, x0, g2, grad! ; α);
@btime gd3(x0, grad! ; α);
@btime gd2(x0, grad ; α);
@btime gd1(x0, grad ; α);

# Generally, when more steps are done in place,
# it runs faster and uses less memory.
# In particular, gd2 avoids allocating a new x each iteration,
# but each call to grad(x) still allocates a fresh gradient vector,
# which is why gd3, gd4!, and gd5! (which reuse preallocated work space) are fastest.

#=
 16.393 μs (  3 allocations: 48 bytes)
 16.145 μs (  3 allocations: 48 bytes)
 17.074 μs (  5 allocations: 16.30 KiB)
 64.550 μs (104 allocations: 820.67 KiB)
165.640 μs (304 allocations: 2.39 MiB)
=#
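# ### Aside: closure vs. global work array
# A minimal sketch contrasting the closure `grad5!` above with the global-variable
# alternative it avoids; the names `g5_global` and `grad5_global!` are illustrative
# additions. Because `g5_global` is a non-const global, its type cannot be inferred
# inside `grad5_global!`, so every call pays for dynamic dispatch;
# the closure captures `g5` with a concrete type and avoids that cost.
g5_global = similar(x0)
grad5_global!(x) = grad!(g5_global, x) # reads a non-const global: type-unstable
grad5_global!(x0) # warm-up to compile
@btime grad5_global!(x0); # typically slower than grad5!(x0)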
# ## More complicated gradient (LS)

N = 2^10
M = 9N
x0 = randn(N)
g1 = similar(x0) # work space
A = sprandn(M, N, 10/N)
y = randn(M)
grad0 = x -> A' * (A*x - y) # LS gradient

function make_grad1(A, y)
    # capture these:
    Ax = similar(y)
    Ac = copy(A)
    Ah = A'
    yc = copy(y)
    return (g, x) -> begin # closure version
        mul!(Ax, Ac, x)
        Ax .-= yc # Ax now holds the residual "r" = A*x - y
        mul!(g, Ah, Ax)
    end
end

# check gradient code
grad1! = make_grad1(A, y)
grad0(x0); grad1!(g1, x0); # warm-up to compile
display(@allocated g0 = grad0(x0))
display(@allocated grad1!(g1, x0))
@assert g0 ≈ g1

if true
    println("Time LS gradient")
    @btime g0 = grad0(x0) # 6 alloc
    @btime g0 .= grad0(x0) # 8 alloc
    @btime grad1!(g1, x0) # 0 alloc
end;

#=
156 μs (6 allocations: 152.33 KiB)
158 μs (8 allocations: 152.36 KiB)
149 μs (0 allocations: 0 bytes)
=#

L = opnorm(A, 1) * opnorm(A, Inf) # ‖A‖₁ ‖A‖∞ ≥ ‖A‖₂², a Lipschitz constant for the LS gradient
α = 1/L # step size

# check GD code
x4 = similar(x0)
gd4!(x4, x0, g1, grad1! ; α)
x3 = gd3(x0, grad1! ; α)
x2 = gd2(x0, grad0 ; α)
x1 = gd1(x0, grad0 ; α)
@assert x1 ≈ x2 ≈ x3 ≈ x4

# ### Time GD for LS
println("Time GD for LS")
@btime gd4!(x4, x0, g1, grad1! ; α)
@btime gd3(x0, grad1! ; α)
@btime gd2(x0, grad0 ; α)
@btime gd1(x0, grad0 ; α);

#=
15.0 ms (   3 allocations: 48 bytes)
15.0 ms (   5 allocations: 16.30 KiB)
19.8 ms (1104 allocations: 14.90 MiB)
19.8 ms ( 904 allocations: 16.47 MiB)
=#
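# ### Sanity check: allocations of one fully in-place GD call
# A minimal sketch that measures the allocations of a full gd4! call from inside
# a function, where benchmarking overhead from non-const globals is absent.
# The helper name `count_gd4_bytes` is an illustrative addition.
count_gd4_bytes(x, x0, g, grad!::F, α) where {F} =
    @allocated gd4!(x, x0, g, grad! ; α)
count_gd4_bytes(x4, x0, g1, grad1!, α) # warm-up to compile
@show count_gd4_bytes(x4, x0, g1, grad1!, α) # expect zero (or very few) bytes: no per-iteration allocations

nothing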