{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## in-place.jl\n",
    "Illustrate the memory and speed benefits of in-place operations\n",
    "for GD iteration  \n",
    "2020-06-15 Jeff Fessler, University of Michigan  \n",
    "2021-08-23 Julia 1.6.2, faster closure versions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "using BenchmarkTools: @btime\n",
    "using LinearAlgebra: mul!, opnorm\n",
    "using SparseArrays: sprandn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# textbook gradient descent implementation\n",
    "function gd1(x0, grad::Function ; α::Real = 1, niter::Int = 100)\n",
    "    x = copy(x0) # \"copy\" not actually necessary here\n",
    "    for iter in 1:niter\n",
    "        x = x + α * grad(x)\n",
    "    end\n",
    "    return x\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# in-place update version of GD\n",
    "function gd2(x0, grad::Function ; α::Real = 1, niter::Int = 100)\n",
    "    x = copy(x0) # \"copy\" necessary here!\n",
    "    for iter=1:niter\n",
    "#       x .+= α * grad(x) # better than gd1, but not best\n",
    "        x .+= α .* grad(x) # the \".+=\" causes in-place x update\n",
    "    end\n",
    "    return x\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# in-place gradient version of GD\n",
    "function gd3(x0, grad!::Function ; α::Real = 1, niter::Int = 100)\n",
    "    x = copy(x0)\n",
    "    g = similar(x0) # allocate working memory for gradient\n",
    "    for iter=1:niter\n",
    "        x .+= α .* grad!(g, x) # no memory allocation here!\n",
    "    end\n",
    "    return x\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fully in-place version of GD\n",
    "# input \"g\" is work space for gradient\n",
    "function gd4!(x, x0, g, grad!::Function ; α::Real = 1, niter::Int = 100)\n",
    "    x .= x0\n",
    "    for iter=1:niter\n",
    "        x .+= α .* grad!(g, x) # no memory allocation here!\n",
    "    end\n",
    "    # no requirement to return anything, because input \"x\" was updated,\n",
    "    return x # but we return it anyway for convenience\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "# fully in-place version of GD using convenient 1-input version of grad!\n",
    "function gd5!(x, x0, grad!::Function ; α::Real = 1, niter::Int = 100)\n",
    "    x .= x0\n",
    "    for iter=1:niter\n",
    "        x .+= α .* grad!(x) # no memory allocation here!\n",
    "    end\n",
    "    return x\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simple gradient case"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grad = x -> x.^3 # gradient of 0.25*\\norm{x}^4\n",
    "function grad!(g, x)\n",
    "    g .= x.^3\n",
    "    return g # for convenience\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x0 = randn(2^10)\n",
    "g2 = similar(x0);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the following \"closure\" is faster than using a global variable for g5\n",
    "function make_grad5(x0)\n",
    "    g5 = similar(x0) # captures this variable in scope, improving speed\n",
    "    return x -> grad!(g5, x) # \"closure\"\n",
    "end\n",
    "grad5! = make_grad5(x0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "check gradients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "grad(x0); grad!(g2, x0); grad5!(x0) # warm-up to compile\n",
    "display(@allocated g1 = grad(x0))\n",
    "display(@allocated grad!(g2, x0))\n",
    "display(@allocated grad5!(x0))\n",
    "@assert g1 == g2 == grad5!(x0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# time gradients\n",
    "# (grad! and grad5! are much faster)\n",
    "if true\n",
    "    println(\"Time simple gradients\")\n",
    "    @btime g2 = grad(x0) # 1 alloc\n",
    "    @btime g2 .= grad(x0) # 3 alloc\n",
    "    @btime grad!(g2, x0) # 0 alloc\n",
    "    @btime grad5!(x0) # 0 alloc\n",
    "end;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "#=\n",
    "Times on 2017 iMac (4.2 GHz Intel Core i7) with Mojave 10.14.6 and Julia 1.6.2\n",
    "  380.831 ns (1 allocation: 8.12 KiB)\n",
    "  986.308 ns (3 allocations: 8.16 KiB)\n",
    "  80.472 ns (0 allocations: 0 bytes)\n",
    "  81.540 ns (0 allocations: 0 bytes)\n",
    "=#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "# check GD for simple case\n",
    "α = 1e-4\n",
    "x4 = similar(x0)\n",
    "x5 = similar(x0)\n",
    "gd5!(x5, x0, grad5! ; α)\n",
    "gd4!(x4, x0, g2, grad! ; α)\n",
    "x3 = gd3(x0, grad! ; α)\n",
    "x2 = gd2(x0, grad ; α)\n",
    "x1 = gd1(x0, grad ; α)\n",
    "@assert x1 ≈ x2 ≈ x3 ≈ x4 ≈ x5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Time GD for simple case"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "println(\"Time GD in simple case\")\n",
    "@btime gd5!(x5, x0, grad5! ; α);\n",
    "@btime gd4!(x4, x0, g2, grad! ; α);\n",
    "@btime gd3(x0, grad! ; α);\n",
    "@btime gd2(x0, grad ; α);\n",
    "@btime gd1(x0, grad ; α);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generally, when more steps are done in place,\n",
    "it runs faster and uses less memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "#=\n",
    "  16.393 μs (3 allocations: 48 bytes)\n",
    "  16.145 μs (3 allocations: 48 bytes)\n",
    "  17.074 μs (5 allocations: 16.30 KiB)\n",
    "  64.550 μs (104 allocations: 820.67 KiB)\n",
    "  165.640 μs (304 allocations: 2.39 MiB)\n",
    "=#"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## More complicated gradient (LS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "N = 2^10\n",
    "M = 9N\n",
    "x0 = randn(N)\n",
    "g1 = similar(x0) # work space\n",
    "A = sprandn(M, N, 10/N)\n",
    "y = randn(M)\n",
    "grad0 = x -> A' * (A*x - y) # LS gradient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "function make_grad1(A, y)\n",
    "    # capture these:\n",
    "    Ax = similar(y)\n",
    "    Ac = copy(A)\n",
    "    Ah = A'\n",
    "    yc = copy(y)\n",
    "    return (g,x) -> begin # closure version\n",
    "        mul!(Ax, Ac, x)\n",
    "        Ax .-= yc # now \"r\"\n",
    "        mul!(g, Ah, Ax)\n",
    "    end\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check gradient code\n",
    "grad1! = make_grad1(A, y)\n",
    "grad0(x0); grad1!(g1, x0);\n",
    "display(@allocated g0 = grad0(x0))\n",
    "display(@allocated grad1!(g1, x0))\n",
    "@assert g0 ≈ g1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "if true\n",
    "    println(\"Time LS gradient\")\n",
    "    @btime g0 = grad0(x0) # 6 alloc\n",
    "    @btime g0 .= grad0(x0) # 8 alloc\n",
    "    @btime grad1!(g1, x0) # 0 alloc\n",
    "end;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#=\n",
    "  156 μs (6 allocations: 152.33 KiB)\n",
    "  158 μs (8 allocations: 152.36 KiB)\n",
    "  149 μs (0 allocations: 0 bytes)\n",
    "=#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "L = opnorm(A, 1) * opnorm(A, Inf)\n",
    "α = 1/L # step size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "# check GD code\n",
    "x4 = similar(x0)\n",
    "gd4!(x4, x0, g1, grad1! ; α)\n",
    "x3 = gd3(x0, grad1! ; α)\n",
    "x2 = gd2(x0, grad0 ; α)\n",
    "x1 = gd1(x0, grad0 ; α)\n",
    "@assert x1 ≈ x2 ≈ x3 ≈ x4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Time GD for LS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "println(\"Time GD for LS\")\n",
    "@btime gd4!(x4, x0, g2, grad1! ; α)\n",
    "@btime gd3(x0, grad1! ; α)\n",
    "@btime gd2(x0, grad0 ; α)\n",
    "@btime gd1(x0, grad0 ; α);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#=\n",
    "  15.0 ms (3 allocations: 48 bytes)\n",
    "  15.0 ms (5 allocations: 16.30 KiB)\n",
    "\n",
    "  19.8 ms (1104 allocations: 14.90 MiB)\n",
    "  19.8 ms ( 904 allocations: 16.47 MiB)\n",
    "=#\n",
    "nothing"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 1.6.2",
   "language": "julia",
   "name": "julia-1.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}