{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## in-place.jl\n", "Illustrate the memory and speed benefits of in-place operations\n", "for GD iteration \n", "2020-06-15 Jeff Fessler, University of Michigan \n", "2021-08-23 Julia 1.6.2, faster closure versions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "using BenchmarkTools: @btime\n", "using LinearAlgebra: mul!, opnorm\n", "using SparseArrays: sprandn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# textbook gradient descent implementation\n", "function gd1(x0, grad::Function ; α::Real = 1, niter::Int = 100)\n", " x = copy(x0) # \"copy\" not actually necessary here\n", " for iter in 1:niter\n", " x = x + α * grad(x)\n", " end\n", " return x\n", "end" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# in-place update version of GD\n", "function gd2(x0, grad::Function ; α::Real = 1, niter::Int = 100)\n", " x = copy(x0) # \"copy\" necessary here!\n", " for iter=1:niter\n", "# x .+= α * grad(x) # better than gd1, but not best\n", " x .+= α .* grad(x) # the \".+=\" causes in-place x update\n", " end\n", " return x\n", "end" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# in-place gradient version of GD\n", "function gd3(x0, grad!::Function ; α::Real = 1, niter::Int = 100)\n", " x = copy(x0)\n", " g = similar(x0) # allocate working memory for gradient\n", " for iter=1:niter\n", " x .+= α .* grad!(g, x) # no memory allocation here!\n", " end\n", " return x\n", "end" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fully in-place version of GD\n", "# input \"g\" is work space for gradient\n", "function gd4!(x, x0, g, grad!::Function ; α::Real = 1, niter::Int = 100)\n", " x .= x0\n", " for iter=1:niter\n", " x .+= α .* grad!(g, x) # no memory allocation here!\n", " end\n", " # no requirement to return anything, because input \"x\" was updated,\n", " return x # but we return it anyway for convenience\n", "end" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "# fully in-place version of GD using convenient 1-input version of grad!\n", "function gd5!(x, x0, grad!::Function ; α::Real = 1, niter::Int = 100)\n", " x .= x0\n", " for iter=1:niter\n", " x .+= α .* grad!(x) # no memory allocation here!\n", " end\n", " return x\n", "end" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Simple gradient case" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "grad = x -> x.^3 # gradient of 0.25*\\norm{x}^4\n", "function grad!(g, x)\n", " g .= x.^3\n", " return g # for convenience\n", "end" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x0 = randn(2^10)\n", "g2 = similar(x0);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# the following \"closure\" is faster than using a global variable for g5\n", "function make_grad5(x0)\n", " g5 = similar(x0) # captures this variable in scope, improving speed\n", " return x -> grad!(g5, x) # \"closure\"\n", "end\n", "grad5! = make_grad5(x0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "check gradients" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "grad(x0); grad!(g2, x0); grad5!(x0) # warm-up to compile\n", "display(@allocated g1 = grad(x0))\n", "display(@allocated grad!(g2, x0))\n", "display(@allocated grad5!(x0))\n", "@assert g1 == g2 == grad5!(x0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# time gradients\n", "# (grad! and grad5! are much faster)\n", "if true\n", " println(\"Time simple gradients\")\n", " @btime g2 = grad(x0) # 1 alloc\n", " @btime g2 .= grad(x0) # 3 alloc\n", " @btime grad!(g2, x0) # 0 alloc\n", " @btime grad5!(x0) # 0 alloc\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "#=\n", "Times on 2017 iMac (4.2 GHz Intel Core i7) with Mojave 10.14.6 and Julia 1.6.2\n", " 380.831 ns (1 allocation: 8.12 KiB)\n", " 986.308 ns (3 allocations: 8.16 KiB)\n", " 80.472 ns (0 allocations: 0 bytes)\n", " 81.540 ns (0 allocations: 0 bytes)\n", "=#" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "# check GD for simple case\n", "α = 1e-4\n", "x4 = similar(x0)\n", "x5 = similar(x0)\n", "gd5!(x5, x0, grad5! ; α)\n", "gd4!(x4, x0, g2, grad! ; α)\n", "x3 = gd3(x0, grad! ; α)\n", "x2 = gd2(x0, grad ; α)\n", "x1 = gd1(x0, grad ; α)\n", "@assert x1 ≈ x2 ≈ x3 ≈ x4 ≈ x5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Time GD for simple case" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "println(\"Time GD in simple case\")\n", "@btime gd5!(x5, x0, grad5! ; α);\n", "@btime gd4!(x4, x0, g2, grad! ; α);\n", "@btime gd3(x0, grad! ; α);\n", "@btime gd2(x0, grad ; α);\n", "@btime gd1(x0, grad ; α);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generally, when more steps are done in place,\n", "it runs faster and uses less memory." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "#=\n", " 16.393 μs (3 allocations: 48 bytes)\n", " 16.145 μs (3 allocations: 48 bytes)\n", " 17.074 μs (5 allocations: 16.30 KiB)\n", " 64.550 μs (104 allocations: 820.67 KiB)\n", " 165.640 μs (304 allocations: 2.39 MiB)\n", "=#" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## More complicated gradient (LS)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "N = 2^10\n", "M = 9N\n", "x0 = randn(N)\n", "g1 = similar(x0) # work space\n", "A = sprandn(M, N, 10/N)\n", "y = randn(M)\n", "grad0 = x -> A' * (A*x - y) # LS gradient" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "function make_grad1(A, y)\n", " # capture these:\n", " Ax = similar(y)\n", " Ac = copy(A)\n", " Ah = A'\n", " yc = copy(y)\n", " return (g,x) -> begin # closure version\n", " mul!(Ax, Ac, x)\n", " Ax .-= yc # now \"r\"\n", " mul!(g, Ah, Ax)\n", " end\n", "end" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# check gradient code\n", "grad1! = make_grad1(A, y)\n", "grad0(x0); grad1!(g1, x0);\n", "display(@allocated g0 = grad0(x0))\n", "display(@allocated grad1!(g1, x0))\n", "@assert g0 ≈ g1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "if true\n", " println(\"Time LS gradient\")\n", " @btime g0 = grad0(x0) # 6 alloc\n", " @btime g0 .= grad0(x0) # 8 alloc\n", " @btime grad1!(g1, x0) # 0 alloc\n", "end;" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#=\n", " 156 μs (6 allocations: 152.33 KiB)\n", " 158 μs (8 allocations: 152.36 KiB)\n", " 149 μs (0 allocations: 0 bytes)\n", "=#" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "L = opnorm(A, 1) * opnorm(A, Inf)\n", "α = 1/L # step size" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "# check GD code\n", "x4 = similar(x0)\n", "gd4!(x4, x0, g1, grad1! ; α)\n", "x3 = gd3(x0, grad1! ; α)\n", "x2 = gd2(x0, grad0 ; α)\n", "x1 = gd1(x0, grad0 ; α)\n", "@assert x1 ≈ x2 ≈ x3 ≈ x4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Time GD for LS" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "println(\"Time GD for LS\")\n", "@btime gd4!(x4, x0, g2, grad1! ; α)\n", "@btime gd3(x0, grad1! ; α)\n", "@btime gd2(x0, grad0 ; α)\n", "@btime gd1(x0, grad0 ; α);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#=\n", " 15.0 ms (3 allocations: 48 bytes)\n", " 15.0 ms (5 allocations: 16.30 KiB)\n", "\n", " 19.8 ms (1104 allocations: 14.90 MiB)\n", " 19.8 ms ( 904 allocations: 16.47 MiB)\n", "=#\n", "nothing" ] } ], "metadata": { "kernelspec": { "display_name": "Julia 1.6.2", "language": "julia", "name": "julia-1.6" } }, "nbformat": 4, "nbformat_minor": 4 }