# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .jl
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.3.3
#   kernelspec:
#     display_name: Julia 1.6.2
#     language: julia
#     name: julia-1.6
# ---

# ## in-place.jl
# Illustrate the memory and speed benefits of in-place operations
# for GD (gradient descent) iteration.
# 2020-06-15 Jeff Fessler, University of Michigan
# 2021-08-23 Julia 1.6.2, faster closure versions

using BenchmarkTools: @btime
using LinearAlgebra: mul!, opnorm
using SparseArrays: sprandn

# textbook gradient descent implementation
function gd1(x0, grad::Function ; α::Real = 1, niter::Int = 100)
    x = copy(x0) # "copy" not actually necessary here
    for iter in 1:niter
        x = x - α * grad(x)
    end
    return x
end

# in-place update version of GD
function gd2(x0, grad::Function ; α::Real = 1, niter::Int = 100)
    x = copy(x0) # "copy" necessary here!
    for iter in 1:niter
        # x .-= α * grad(x) # better than gd1, but not best
        x .-= α .* grad(x) # the ".-=" causes an in-place x update
    end
    return x
end

# in-place gradient version of GD
function gd3(x0, grad!::Function ; α::Real = 1, niter::Int = 100)
    x = copy(x0)
    g = similar(x0) # allocate working memory for the gradient
    for iter in 1:niter
        x .-= α .* grad!(g, x) # no memory allocation here!
    end
    return x
end

# fully in-place version of GD
# input "g" is work space for the gradient
function gd4!(x, x0, g, grad!::Function ; α::Real = 1, niter::Int = 100)
    x .= x0
    for iter in 1:niter
        x .-= α .* grad!(g, x) # no memory allocation here!
    end
    # There is no requirement to return anything, because the input "x" was updated,
    return x # but we return it anyway for convenience.
end

# fully in-place version of GD using a convenient 1-input version of grad!
function gd5!(x, x0, grad!::Function ; α::Real = 1, niter::Int = 100)
    x .= x0
    for iter in 1:niter
        x .-= α .* grad!(x) # no memory allocation here!
    end
    return x
end

# ## Simple gradient case

grad = x -> x.^3 # gradient of 0.25 * ‖x‖^4

function grad!(g, x)
    g .= x.^3
    return g # for convenience
end

x0 = randn(2^10)
g2 = similar(x0);

# The following "closure" is faster than using a global variable for g5;
# see the sketch after the simple-case GD timings below.
function make_grad5(x0)
    g5 = similar(x0) # captures this variable in scope, improving speed
    return x -> grad!(g5, x) # "closure"
end
grad5! = make_grad5(x0)

# check gradients
grad(x0); grad!(g2, x0); grad5!(x0) # warm-up to compile
display(@allocated g1 = grad(x0))
display(@allocated grad!(g2, x0))
display(@allocated grad5!(x0))
@assert g1 == g2 == grad5!(x0)

# time gradients
# (grad! and grad5! are much faster)
if true
    println("Time simple gradients")
    @btime g2 = grad(x0) # 1 alloc
    @btime g2 .= grad(x0) # 3 alloc
    @btime grad!(g2, x0) # 0 alloc
    @btime grad5!(x0) # 0 alloc
end;

#=
Times on 2017 iMac (4.2 GHz Intel Core i7) with Mojave 10.14.6 and Julia 1.6.2:
380.831 ns (1 allocation: 8.12 KiB)
986.308 ns (3 allocations: 8.16 KiB)
 80.472 ns (0 allocations: 0 bytes)
 81.540 ns (0 allocations: 0 bytes)
=#

# check GD for simple case
α = 1e-4
x4 = similar(x0)
x5 = similar(x0)
gd5!(x5, x0, grad5! ; α)
gd4!(x4, x0, g2, grad! ; α)
x3 = gd3(x0, grad! ; α)
x2 = gd2(x0, grad ; α)
x1 = gd1(x0, grad ; α)
@assert x1 ≈ x2 ≈ x3 ≈ x4 ≈ x5

# ### Time GD for simple case
println("Time GD in simple case")
@btime gd5!(x5, x0, grad5! ; α);
@btime gd4!(x4, x0, g2, grad! ; α);
@btime gd3(x0, grad! ; α);
@btime gd2(x0, grad ; α);
@btime gd1(x0, grad ; α);

# Generally, when more steps are done in place,
# it runs faster and uses less memory.
# In particular, gd2 avoids allocating a new x each iteration,
# but each call to grad(x) still allocates a fresh gradient vector,
# which is why gd3, gd4!, and gd5! (which reuse preallocated work space) are fastest.

#=
 16.393 μs (  3 allocations: 48 bytes)
 16.145 μs (  3 allocations: 48 bytes)
 17.074 μs (  5 allocations: 16.30 KiB)
 64.550 μs (104 allocations: 820.67 KiB)
165.640 μs (304 allocations: 2.39 MiB)
=#
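# ### Aside: closure vs. global work array
# A minimal sketch contrasting the closure `grad5!` above with the global-variable
# alternative it avoids; the names `g5_global` and `grad5_global!` are illustrative
# additions. Because `g5_global` is a non-const global, its type cannot be inferred
# inside `grad5_global!`, so every call pays for dynamic dispatch;
# the closure captures `g5` with a concrete type and avoids that cost.
g5_global = similar(x0)
grad5_global!(x) = grad!(g5_global, x) # reads a non-const global: type-unstable
grad5_global!(x0) # warm-up to compile
@btime grad5_global!(x0); # typically slower than grad5!(x0)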
# ## More complicated gradient (LS)

N = 2^10
M = 9N
x0 = randn(N)
g1 = similar(x0) # work space
A = sprandn(M, N, 10/N)
y = randn(M)
grad0 = x -> A' * (A*x - y) # LS gradient

function make_grad1(A, y)
    # capture these:
    Ax = similar(y)
    Ac = copy(A)
    Ah = A'
    yc = copy(y)
    return (g, x) -> begin # closure version
        mul!(Ax, Ac, x)
        Ax .-= yc # Ax now holds the residual "r" = A*x - y
        mul!(g, Ah, Ax)
    end
end

# check gradient code
grad1! = make_grad1(A, y)
grad0(x0); grad1!(g1, x0); # warm-up to compile
display(@allocated g0 = grad0(x0))
display(@allocated grad1!(g1, x0))
@assert g0 ≈ g1

if true
    println("Time LS gradient")
    @btime g0 = grad0(x0) # 6 alloc
    @btime g0 .= grad0(x0) # 8 alloc
    @btime grad1!(g1, x0) # 0 alloc
end;

#=
156 μs (6 allocations: 152.33 KiB)
158 μs (8 allocations: 152.36 KiB)
149 μs (0 allocations: 0 bytes)
=#

L = opnorm(A, 1) * opnorm(A, Inf) # ‖A‖₁ ‖A‖∞ ≥ ‖A‖₂², a Lipschitz constant for the LS gradient
α = 1/L # step size

# check GD code
x4 = similar(x0)
gd4!(x4, x0, g1, grad1! ; α)
x3 = gd3(x0, grad1! ; α)
x2 = gd2(x0, grad0 ; α)
x1 = gd1(x0, grad0 ; α)
@assert x1 ≈ x2 ≈ x3 ≈ x4

# ### Time GD for LS
println("Time GD for LS")
@btime gd4!(x4, x0, g1, grad1! ; α)
@btime gd3(x0, grad1! ; α)
@btime gd2(x0, grad0 ; α)
@btime gd1(x0, grad0 ; α);

#=
15.0 ms (   3 allocations: 48 bytes)
15.0 ms (   5 allocations: 16.30 KiB)
19.8 ms (1104 allocations: 14.90 MiB)
19.8 ms ( 904 allocations: 16.47 MiB)
=#
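# ### Sanity check: allocations of one fully in-place GD call
# A minimal sketch that measures the allocations of a full gd4! call from inside
# a function, where benchmarking overhead from non-const globals is absent.
# The helper name `count_gd4_bytes` is an illustrative addition.
count_gd4_bytes(x, x0, g, grad!::F, α) where {F} =
    @allocated gd4!(x, x0, g, grad! ; α)
count_gd4_bytes(x4, x0, g1, grad1!, α) # warm-up to compile
@show count_gd4_bytes(x4, x0, g1, grad1!, α) # expect zero (or very few) bytes: no per-iteration allocations

nothing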