using BenchmarkTools: @btime
using LinearAlgebra: mul!, opnorm
using SparseArrays: sprandn


# textbook gradient descent implementation
function gd1(x0, grad::Function ; α::Real = 1, niter::Int = 100)
    x = copy(x0) # "copy" not actually necessary here
    for iter in 1:niter
        x = x + α * grad(x)
    end
    return x
end

gd1 (generic function with 1 method)


# in-place update version of GD
function gd2(x0, grad::Function ; α::Real = 1, niter::Int = 100)
    x = copy(x0) # "copy" necessary here!
    for iter=1:niter
#       x .+= α * grad(x) # better than gd1, but not best
        x .+= α .* grad(x) # the ".+=" causes in-place x update
    end
    return x
end

gd2 (generic function with 1 method)


# in-place gradient version of GD
function gd3(x0, grad!::Function ; α::Real = 1, niter::Int = 100)
    x = copy(x0)
    g = similar(x0) # allocate working memory for gradient
    for iter=1:niter
        x .+= α .* grad!(g, x) # no memory allocation here!
    end
    return x
end

gd3 (generic function with 1 method)


# fully in-place version of GD
# input "g" is work space for gradient
function gd4!(x, x0, g, grad!::Function ; α::Real = 1, niter::Int = 100)
    x .= x0
    for iter=1:niter
        x .+= α .* grad!(g, x) # no memory allocation here!
    end
    # no requirement to return anything, because input "x" was updated,
    return x # but we return it anyway for convenience
end

gd4! (generic function with 1 method)


# fully in-place version of GD using convenient 1-input version of grad!
function gd5!(x, x0, grad!::Function ; α::Real = 1, niter::Int = 100)
    x .= x0
    for iter=1:niter
        x .+= α .* grad!(x) # no memory allocation here!
    end
    return x
end

gd5! (generic function with 1 method)


grad = x -> x.^3 # gradient of 0.25*\norm{x}^4
function grad!(g, x)
    g .= x.^3
    return g # for convenience
end

grad! (generic function with 1 method)


x0 = randn(2^10)
g2 = similar(x0);


# the following "closure" is faster than using a global variable for g5
function make_grad5(x0)
    g5 = similar(x0) # captures this variable in scope, improving speed
    return x -> grad!(g5, x) # "closure"
end
grad5! = make_grad5(x0)

#8 (generic function with 1 method)


grad(x0); grad!(g2, x0); grad5!(x0) # warm-up to compile
display(@allocated g1 = grad(x0))
display(@allocated grad!(g2, x0))
display(@allocated grad5!(x0))
@assert g1 == g2 == grad5!(x0)

8320

0

0


# time gradients
# (grad! and grad5! are much faster)
if true
    println("Time simple gradients")
    @btime g2 = grad(x0) # 1 alloc
    @btime g2 .= grad(x0) # 3 alloc
    @btime grad!(g2, x0) # 0 alloc
    @btime grad5!(x0) # 0 alloc
end;

Time simple gradients
  364.024 ns (1 allocation: 8.12 KiB)
  986.311 ns (3 allocations: 8.16 KiB)
  79.670 ns (0 allocations: 0 bytes)
  79.412 ns (0 allocations: 0 bytes)


#=
Times on 2017 iMac (4.2 GHz Intel Core i7) with Mojave 10.14.6 and Julia 1.6.2
  380.831 ns (1 allocation: 8.12 KiB)
  986.308 ns (3 allocations: 8.16 KiB)
  80.472 ns (0 allocations: 0 bytes)
  81.540 ns (0 allocations: 0 bytes)
=#


# check GD for simple case
α = 1e-4
x4 = similar(x0)
x5 = similar(x0)
gd5!(x5, x0, grad5! ; α)
gd4!(x4, x0, g2, grad! ; α)
x3 = gd3(x0, grad! ; α)
x2 = gd2(x0, grad ; α)
x1 = gd1(x0, grad ; α)
@assert x1 ≈ x2 ≈ x3 ≈ x4 ≈ x5


println("Time GD in simple case")
@btime gd5!(x5, x0, grad5! ; α);
@btime gd4!(x4, x0, g2, grad! ; α);
@btime gd3(x0, grad! ; α);
@btime gd2(x0, grad ; α);
@btime gd1(x0, grad ; α);

Time GD in simple case
  16.898 μs (3 allocations: 48 bytes)
  16.877 μs (3 allocations: 48 bytes)
  16.945 μs (5 allocations: 16.30 KiB)
  63.860 μs (104 allocations: 820.67 KiB)
  173.981 μs (304 allocations: 2.39 MiB)


#=
  16.393 μs (3 allocations: 48 bytes)
  16.145 μs (3 allocations: 48 bytes)
  17.074 μs (5 allocations: 16.30 KiB)
  64.550 μs (104 allocations: 820.67 KiB)
  165.640 μs (304 allocations: 2.39 MiB)
=#


N = 2^10
M = 9N
x0 = randn(N)
g1 = similar(x0) # work space
A = sprandn(M, N, 10/N)
y = randn(M)
grad0 = x -> A' * (A*x - y) # LS gradient

#10 (generic function with 1 method)


function make_grad1(A, y)
    # capture these:
    Ax = similar(y)
    Ac = copy(A)
    Ah = A'
    yc = copy(y)
    return (g,x) -> begin # closure version
        mul!(Ax, Ac, x)
        Ax .-= yc # now "r"
        mul!(g, Ah, Ax)
    end
end

make_grad1 (generic function with 1 method)


# check gradient code
grad1! = make_grad1(A, y)
grad0(x0); grad1!(g1, x0);
display(@allocated g0 = grad0(x0))
display(@allocated grad1!(g1, x0))
@assert g0 ≈ g1

155984

0


if true
    println("Time LS gradient")
    @btime g0 = grad0(x0) # 6 alloc
    @btime g0 .= grad0(x0) # 8 alloc
    @btime grad1!(g1, x0) # 0 alloc
end;

Time LS gradient
  157.463 μs (6 allocations: 152.33 KiB)
  158.012 μs (8 allocations: 152.36 KiB)
  150.260 μs (0 allocations: 0 bytes)


#=
  156 μs (6 allocations: 152.33 KiB)
  158 μs (8 allocations: 152.36 KiB)
  149 μs (0 allocations: 0 bytes)
=#


L = opnorm(A, 1) * opnorm(A, Inf)
α = 1/L # step size

0.0004032796035695085


# check GD code
x4 = similar(x0)
gd4!(x4, x0, g1, grad1! ; α)
x3 = gd3(x0, grad1! ; α)
x2 = gd2(x0, grad0 ; α)
x1 = gd1(x0, grad0 ; α)
@assert x1 ≈ x2 ≈ x3 ≈ x4


println("Time GD for LS")
@btime gd4!(x4, x0, g2, grad1! ; α)
@btime gd3(x0, grad1! ; α)
@btime gd2(x0, grad0 ; α)
@btime gd1(x0, grad0 ; α);

Time GD for LS
  15.174 ms (3 allocations: 48 bytes)
  15.171 ms (5 allocations: 16.30 KiB)
  20.003 ms (1104 allocations: 14.90 MiB)
  19.906 ms (904 allocations: 16.47 MiB)


#=
  15.0 ms (3 allocations: 48 bytes)
  15.0 ms (5 allocations: 16.30 KiB)

  19.8 ms (1104 allocations: 14.90 MiB)
  19.8 ms ( 904 allocations: 16.47 MiB)
=#
nothing

in-place.jl¶

Simple gradient case¶

Time GD for simple case¶

More complicated gradient (LS)¶

Time GD for LS¶