Illustrate the memory and speed benefits of in-place operations
for GD iteration
2020-06-15 Jeff Fessler, University of Michigan
2021-08-23 Julia 1.6.2, faster closure versions
using BenchmarkTools: @btime
using LinearAlgebra: mul!, opnorm
using SparseArrays: sprandn
# textbook gradient descent implementation
function gd1(x0, grad::Function ; α::Real = 1, niter::Int = 100)
x = copy(x0) # "copy" not actually necessary here
for iter in 1:niter
x = x + α * grad(x)
end
return x
end
gd1 (generic function with 1 method)
# in-place update version of GD
function gd2(x0, grad::Function ; α::Real = 1, niter::Int = 100)
x = copy(x0) # "copy" necessary here!
for iter=1:niter
# x .+= α * grad(x) # better than gd1, but not best
x .+= α .* grad(x) # the ".+=" causes in-place x update
end
return x
end
gd2 (generic function with 1 method)
# in-place gradient version of GD
function gd3(x0, grad!::Function ; α::Real = 1, niter::Int = 100)
x = copy(x0)
g = similar(x0) # allocate working memory for gradient
for iter=1:niter
x .+= α .* grad!(g, x) # no memory allocation here!
end
return x
end
gd3 (generic function with 1 method)
# fully in-place version of GD
# input "g" is work space for gradient
function gd4!(x, x0, g, grad!::Function ; α::Real = 1, niter::Int = 100)
x .= x0
for iter=1:niter
x .+= α .* grad!(g, x) # no memory allocation here!
end
# no requirement to return anything, because input "x" was updated,
return x # but we return it anyway for convenience
end
gd4! (generic function with 1 method)
# fully in-place version of GD using convenient 1-input version of grad!
function gd5!(x, x0, grad!::Function ; α::Real = 1, niter::Int = 100)
x .= x0
for iter=1:niter
x .+= α .* grad!(x) # no memory allocation here!
end
return x
end
gd5! (generic function with 1 method)
grad = x -> x.^3 # gradient of 0.25*\norm{x}^4
function grad!(g, x)
g .= x.^3
return g # for convenience
end
grad! (generic function with 1 method)
x0 = randn(2^10)
g2 = similar(x0);
# the following "closure" is faster than using a global variable for g5
function make_grad5(x0)
g5 = similar(x0) # captures this variable in scope, improving speed
return x -> grad!(g5, x) # "closure"
end
grad5! = make_grad5(x0)
#8 (generic function with 1 method)
check gradients
grad(x0); grad!(g2, x0); grad5!(x0) # warm-up to compile
display(@allocated g1 = grad(x0))
display(@allocated grad!(g2, x0))
display(@allocated grad5!(x0))
@assert g1 == g2 == grad5!(x0)
8320
0
0
# time gradients
# (grad! and grad5! are much faster)
if true
println("Time simple gradients")
@btime g2 = grad(x0) # 1 alloc
@btime g2 .= grad(x0) # 3 alloc
@btime grad!(g2, x0) # 0 alloc
@btime grad5!(x0) # 0 alloc
end;
Time simple gradients 364.024 ns (1 allocation: 8.12 KiB) 986.311 ns (3 allocations: 8.16 KiB) 79.670 ns (0 allocations: 0 bytes) 79.412 ns (0 allocations: 0 bytes)
#=
Times on 2017 iMac (4.2 GHz Intel Core i7) with Mojave 10.14.6 and Julia 1.6.2
380.831 ns (1 allocation: 8.12 KiB)
986.308 ns (3 allocations: 8.16 KiB)
80.472 ns (0 allocations: 0 bytes)
81.540 ns (0 allocations: 0 bytes)
=#
# check GD for simple case
α = 1e-4
x4 = similar(x0)
x5 = similar(x0)
gd5!(x5, x0, grad5! ; α)
gd4!(x4, x0, g2, grad! ; α)
x3 = gd3(x0, grad! ; α)
x2 = gd2(x0, grad ; α)
x1 = gd1(x0, grad ; α)
@assert x1 ≈ x2 ≈ x3 ≈ x4 ≈ x5
println("Time GD in simple case")
@btime gd5!(x5, x0, grad5! ; α);
@btime gd4!(x4, x0, g2, grad! ; α);
@btime gd3(x0, grad! ; α);
@btime gd2(x0, grad ; α);
@btime gd1(x0, grad ; α);
Time GD in simple case 16.898 μs (3 allocations: 48 bytes) 16.877 μs (3 allocations: 48 bytes) 16.945 μs (5 allocations: 16.30 KiB) 63.860 μs (104 allocations: 820.67 KiB) 173.981 μs (304 allocations: 2.39 MiB)
Generally, when more steps are done in place, it runs faster and uses less memory.
#=
16.393 μs (3 allocations: 48 bytes)
16.145 μs (3 allocations: 48 bytes)
17.074 μs (5 allocations: 16.30 KiB)
64.550 μs (104 allocations: 820.67 KiB)
165.640 μs (304 allocations: 2.39 MiB)
=#
N = 2^10
M = 9N
x0 = randn(N)
g1 = similar(x0) # work space
A = sprandn(M, N, 10/N)
y = randn(M)
grad0 = x -> A' * (A*x - y) # LS gradient
#10 (generic function with 1 method)
function make_grad1(A, y)
# capture these:
Ax = similar(y)
Ac = copy(A)
Ah = A'
yc = copy(y)
return (g,x) -> begin # closure version
mul!(Ax, Ac, x)
Ax .-= yc # now "r"
mul!(g, Ah, Ax)
end
end
make_grad1 (generic function with 1 method)
# check gradient code
grad1! = make_grad1(A, y)
grad0(x0); grad1!(g1, x0);
display(@allocated g0 = grad0(x0))
display(@allocated grad1!(g1, x0))
@assert g0 ≈ g1
155984
0
if true
println("Time LS gradient")
@btime g0 = grad0(x0) # 6 alloc
@btime g0 .= grad0(x0) # 8 alloc
@btime grad1!(g1, x0) # 0 alloc
end;
Time LS gradient 157.463 μs (6 allocations: 152.33 KiB) 158.012 μs (8 allocations: 152.36 KiB) 150.260 μs (0 allocations: 0 bytes)
#=
156 μs (6 allocations: 152.33 KiB)
158 μs (8 allocations: 152.36 KiB)
149 μs (0 allocations: 0 bytes)
=#
L = opnorm(A, 1) * opnorm(A, Inf)
α = 1/L # step size
0.0004032796035695085
# check GD code
x4 = similar(x0)
gd4!(x4, x0, g1, grad1! ; α)
x3 = gd3(x0, grad1! ; α)
x2 = gd2(x0, grad0 ; α)
x1 = gd1(x0, grad0 ; α)
@assert x1 ≈ x2 ≈ x3 ≈ x4
println("Time GD for LS")
@btime gd4!(x4, x0, g2, grad1! ; α)
@btime gd3(x0, grad1! ; α)
@btime gd2(x0, grad0 ; α)
@btime gd1(x0, grad0 ; α);
Time GD for LS 15.174 ms (3 allocations: 48 bytes) 15.171 ms (5 allocations: 16.30 KiB) 20.003 ms (1104 allocations: 14.90 MiB) 19.906 ms (904 allocations: 16.47 MiB)
#=
15.0 ms (3 allocations: 48 bytes)
15.0 ms (5 allocations: 16.30 KiB)
19.8 ms (1104 allocations: 14.90 MiB)
19.8 ms ( 904 allocations: 16.47 MiB)
=#
nothing