Basic Introduction to Machine Learning: 01-overview

Jeff Fessler, University of Michigan
2018-10-18 Julia 1.0.1 version

This is a Jupyter notebook based on the Julia language
[https://julialang.org/].
To run it you will need Julia 1.0.1 or later and several installed packages.
For tutorials on using Julia, see:
[http://web.eecs.umich.edu/~fessler/course/551/julia/tutor/]
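
The packages loaded in the first cell can be installed with Julia's package
manager; a minimal sketch (package names inferred from the using statements
below; LinearAlgebra and Random are standard libraries and need no installation):

using Pkg
Pkg.add("IJulia")       # to run this notebook in Jupyter
Pkg.add("Plots")        # plotting front end
Pkg.add("PyPlot")       # backend selected by pyplot() below
Pkg.add("LaTeXStrings") # pretty plot labels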

In [1]:
# packages used
using LinearAlgebra
using Random
using LaTeXStrings # pretty plot labels
using Plots
if true
    pyplot()
    pyplot(markersize=7, markerstrokecolor=:auto)
    fnt = font("DejaVu Sans", 15) # larger default font
    pyplot(guidefont=fnt, xtickfont=fnt, ytickfont=fnt, legendfont=fnt)
end
Out[1]:
Plots.PyPlotBackend()
In [2]:
# plot for supervised learning - classification
Random.seed!(0)
n1 = 50; n2 = n1
rot = phi -> [cos(phi) sin(-phi); sin(phi) cos(phi)]
data1 = rot(pi/8) * ([3 0; 0 1] * randn(2,n1) .+ [8;2])
data2 = rot(pi/4) * ([2 0; 0 1] * randn(2,n2) .+ [9;3])
scatter(data1[1,:], data1[2,:], color=[:blue], label="class1")
scatter!(data2[1,:], data2[2,:], color=[:red], label="class2")
plot!(xlabel=L"x_1", ylabel=L"x_2")
plot!(xlim=[0,14], ylim=[0,14])
plot!(aspect_ratio=1, xtick=[0, 14], ytick=[0, 14])
#savefig("ml-super1.pdf")

# add decision boundary
x = LinRange(0,14,101)
y = 2 .+ x - 0.03 * x.^2
plot!(x, y, color=:magenta, label="")
#savefig("ml-super2.pdf")
Out[2]:
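
The magenta curve is the decision boundary y = 2 + x - 0.03 x^2.
As an illustration (not part of the original notebook), a new feature vector
could be classified by checking which side of that curve it falls on:

boundary = x1 -> 2 + x1 - 0.03 * x1^2
classify = (x1, x2) -> x2 > boundary(x1) ? "class2" : "class1" # above the curve -> class2
classify(7, 4) # "class1" (typical blue point)
classify(4, 9) # "class2" (typical red point)
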
In [3]:
# plot for supervised learning - regression
Random.seed!(0)
N = 40
f = (x) -> 10. / (x + 1)
xt = 10 * rand(N)
yt = f.(xt) + 0.4 * randn(N)
x = LinRange(0,10,101)
y = f.(x)
scatter(xt, yt, color=[:blue], label="training data")
plot!(xlabel=L"x", ylabel=L"y")
#plot!(x, y, line=:red)
plot!(xlim=[0,10], ylim=[0,8])
plot!(xtick=0:5:10, ytick=0:4:8)
#savefig("ml-super-regress1.pdf")

Afun = (tt) -> [t.^i for t in tt, i in 0:3] # matrix of monomials
A = Afun(xt)
coef = A \ yt
y = Afun(x) * coef

plot!(x, y, line=:magenta, label="cubic regression")
#savefig("ml-super-regress2.pdf")
Out[3]:
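
The backslash in coef = A \ yt returns the least-squares solution, i.e., the
coefficient vector minimizing ||A * coef - yt||_2 (computed via a QR
factorization for a tall matrix). A quick sanity check against the normal
equations, shown only for illustration:

coef_ne = (A'A) \ (A'yt) # normal-equations solution
coef ≈ coef_ne           # true (up to numerical precision)
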
In [4]:
# plot for unsupervised learning
Random.seed!(0)
n1 = 50; n2 = n1; n3=n1
rot = phi -> [cos(phi) sin(-phi); sin(phi) cos(phi)]
data1 = rot(pi/4) * ([2 0; 0 0.7] * randn(2,n1) .+ [9;3])
data2 = rot(pi/8) * ([3 0; 0 0.6] * randn(2,n2) .+ [8;2])
data3 = rot(0   ) * ([2 0; 0 0.5] * randn(2,n3) .+ [9;1])

plot(xlabel=L"x_1", ylabel=L"x_2")
scatter!(data1[1,:], data1[2,:], color=[:black], label="training data")
scatter!(data2[1,:], data2[2,:], color=[:black], label="")
scatter!(data3[1,:], data3[2,:], color=[:black], label="")
plot!(xlim=[0,14], ylim=[0,14])
plot!(aspect_ratio=1, xtick=[0, 14], ytick=[0, 14])

#savefig("ml-unsup1.pdf")
Out[4]:
In [5]:
# after (oracle) "clustering"
plot(xlabel=L"x_1", ylabel=L"x_2")
scatter!(data1[1,:], data1[2,:], color=[:blue], label="cluster1")
scatter!(data2[1,:], data2[2,:], color=[:red], label="cluster2")
scatter!(data3[1,:], data3[2,:], color=[:orange], label="cluster3")
plot!(xlim=[0,14], ylim=[0,14])
plot!(aspect_ratio=1, xtick=[0, 14], ytick=[0, 14])
#savefig("ml-unsup2.pdf")
Out[5]:
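
The coloring above uses the known ("oracle") labels; a real clustering method
must estimate them from the features alone. Below is a minimal k-means sketch;
the function kmeans_simple is hypothetical (not part of the original notebook),
and packages such as Clustering.jl provide polished implementations.

function kmeans_simple(X, K; iters=20)
    n = size(X, 2)
    centroids = X[:, randperm(n)[1:K]] # initialize with K random data points
    labels = ones(Int, n)
    for _ in 1:iters
        for j in 1:n # assignment step: nearest centroid
            labels[j] = argmin([norm(X[:,j] - centroids[:,k]) for k in 1:K])
        end
        for k in 1:K # update step: centroid = mean of assigned points
            idx = findall(labels .== k)
            isempty(idx) || (centroids[:,k] = vec(sum(X[:,idx], dims=2)) / length(idx))
        end
    end
    return labels, centroids
end

X = [data1 data2 data3] # 2 × 150 feature matrix
labels, centroids = kmeans_simple(X, 3)
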
In [6]:
# illustrate novelty detection
plot(xlabel=L"x_1", ylabel=L"x_2")
scatter!(data1[1,:], data1[2,:], color=[:black], label="")
scatter!(data2[1,:], data2[2,:], color=[:black], label="")
scatter!(data3[1,:], data3[2,:], color=[:black], label="")
scatter!([10], [11], color=[:red], label="")
plot!(xlim=[0,14], ylim=[0,14])
plot!(aspect_ratio=1, xtick=[0, 14], ytick=[0, 14])
#savefig("ml-unsup3.pdf")
Out[6]:
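
The red point resembles none of the training samples. A very simple novelty
score is the distance to the nearest training point (a sketch for illustration;
practical methods use density estimates or one-class models):

X = [data1 data2 data3] # all (unlabeled) training features
novelty = x -> minimum(norm(x - X[:,j]) for j in 1:size(X,2))
novelty([10, 11]) # large value: candidate novelty
novelty([9, 2])   # small value: consistent with the training data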

The utility of nonlinearity

In [7]:
# 1D plot supervised learning: classification
Random.seed!(0)
n1 = 20; n2 = n1; n3 = n1
data1 = 1 * randn(2,n1) .+ 5
data2 = 1 * randn(2,n2) .+ 0
data3 = 1 * randn(2,n3) .+ (-5)
plot(xlabel=L"x_1", ylabel="")
scatter!(data1[1,:], zeros(n1), color=[:blue], label="class1")
scatter!(data2[1,:], zeros(n2), color=[:red], label="class2")
scatter!(data3[1,:], zeros(n3), color=[:blue], label="")
plot!(xlim=[-8,7], ylim=[-1,1])
plot!(xtick=-6:3:6, ytick=[])
plot!([1, 1]*2, [-1, 1], color=:orange, label="")
#savefig("ml-nonlin1.pdf")
Out[7]:
In [8]:
# a simple nonlinearity, abs(feature), allows linear separation
f = x -> abs(x)
data1[2,:] = f.(data1[1,:])
data2[2,:] = f.(data2[1,:])
data3[2,:] = f.(data3[1,:])
plot(xlabel=L"x_1", ylabel=L"x_2")
scatter!(data1[1,:], data1[2,:], color=[:blue], label="class1")
scatter!(data2[1,:], data2[2,:], color=[:red], label="class2")
scatter!(data3[1,:], data3[2,:], color=[:blue], label="")
plot!(xlim=[-8,7], ylim=[-1,10])
plot!(xtick=-6:3:6, ytick=0:5:10)
plot!([-1, 1]*8, [1, 1]*2.5, color=:orange, label="", legend=:top)
#savefig("ml-nonlin2.pdf")
Out[8]:
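
With the |x| feature appended, a single horizontal threshold separates the
classes. A sketch of the resulting rule on the original scalar feature
(the threshold 2.5 is read off the orange line in the plot):

classify1d = x -> abs(x) < 2.5 ? "class2" : "class1"
classify1d(0.3) # "class2"
classify1d(5.2) # "class1"
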
In [9]:
# 2D example
Random.seed!(0)
n1 = 40; n2 = 120
data1 = randn(2,n1)
rad2 = 3 .+ 3*rand(1,n2)
ang2 = rand(1,n2) * 2π
data2 = [rad2 .* cos.(ang2); rad2 .* sin.(ang2)]
#@show data2
plot(xlabel=L"x_1", ylabel=L"x_2")
scatter!(data1[1,:], data1[2,:], color=[:blue], label="class1")
scatter!(data2[1,:], data2[2,:], color=[:red], label="class2")
plot!(xlim=[-1,1]*6, ylim=[-1,1]*6)
plot!(aspect_ratio=1, xtick=-6:6:6, ytick=-6:6:6)
#savefig("ml-nonlin2d-flat.pdf")
Out[9]:
In [10]:
# nonlinear lifting into 3D
lift_fun = (x) -> sum(abs.(x), dims=1)
lift1 = [data1; lift_fun(data1)]
lift2 = [data2; lift_fun(data2)]
plot(xlabel=L"x_1", ylabel=L"x_2", zlabel=L"$x_3 = |x_1| + |x_2|$")
scatter!(lift1[1,:], lift1[2,:], lift1[3,:], color=[:blue], label="class1")
scatter!(lift2[1,:], lift2[2,:], lift2[3,:], color=[:red], label="class2")
plot!(xlim=[-1,1]*6, ylim=[-1,1]*6)
plot!(xtick=-6:6:6, ytick=-6:6:6)
plot!(camera=(30,10))
#savefig("ml-nonlin2d-lift.pdf")
Out[10]:
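
In the lifted space the two classes can be separated by a horizontal plane
x3 = constant. A sketch of the corresponding rule on the original 2D features
(the threshold of 3 is approximate, chosen by eye from the plot):

classify2d = x -> abs(x[1]) + abs(x[2]) < 3 ? "class1" : "class2"
classify2d([0.5, -0.8]) # "class1" (inner cluster)
classify2d([4.0, 2.0])  # "class2" (outer ring)
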
In [11]:
# nonlinearity for regression
Random.seed!(0)
N = 40
f = (x) -> 10. / (x + 1)
xt = 10 * rand(N)
yt = f.(xt) + 0.4 * randn(N)
x = LinRange(0,10,101)
y = f.(x)
scatter(xt, yt, color=[:blue], label="training data for regression")
plot!(xlabel=L"x", ylabel=L"y")
#plot!(x, y, line=:red)
plot!(xlim=[0,10], ylim=[0,8])
plot!(xtick=0:5:10, ytick=0:4:8)
#savefig("ml-super-regress1.pdf")

Afun = (tt,deg) -> [t.^i for t in tt, i in 0:deg] # matrix of monomials
A3 = Afun(xt,3)
coef3 = A3 \ yt
y3 = Afun(x,3) * coef3

A1 = Afun(xt,1)
coef1 = A1 \ yt
y1 = Afun(x,1) * coef1

plot!(x, y3, line=:magenta,
    label = L"cubic: $y = \alpha_3 x^3 + \alpha_2 x^2 + \alpha_1 x + \alpha_0$")
plot!(x, y1, line=(:dash,:red),
    label = L"linear (affine): $y = \alpha_1 x + \alpha_0$")

#savefig("ml-nonlin3-regress.pdf")
Out[11]:
In [12]:
Random.seed!(0)
n1 = 70; n2 = n1
rot = phi -> [cos(phi) sin(-phi); sin(phi) cos(phi)]
mu1 = [7, 10]
mu2 = [9, 4]
S1 = rot(pi/9) * [3 0; 0 1]
S2 = S1 # for LDA
data1 = S1 * randn(2,n1) .+ mu1
data2 = S2 * randn(2,n2) .+ mu2
plot(xlabel=L"x_1", ylabel=L"x_2")
scatter!(data1[1,:], data1[2,:], color=[:blue], label="class1")
scatter!(data2[1,:], data2[2,:], color=[:red], label="class2")
plot!(xlim=[0,16], ylim=[0,16])
plot!(aspect_ratio=1)
plot!(xtick=0:4:16, ytick=0:4:16)

ϕ = LinRange(0,2π,101)
for r in [1.5 2.5]
    x = r * cos.(ϕ)
    y = r * sin.(ϕ)
    c1 = S1 * [x'; y'] .+ mu1
    c2 = S2 * [x'; y'] .+ mu2
    plot!(c1[1,:], c1[2,:], color=:blue, label="")
    plot!(c2[1,:], c2[2,:], color=:red, label="")
end
x = LinRange(-1,17,11)
w = (S1 * S1') \ (mu2 - mu1) # LDA
c = (norm(S1 \ mu2)^2 - norm(S1 \ mu1)^2)/2
y = (c .- w[1] * x) / (w[2])
plot!(x, y, color=:magenta, label="")
#savefig("ml-lda1.pdf")
Out[12]:
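
The magenta line is the LDA boundary {x : w'x = c}, where w = Σ⁻¹ (mu2 - mu1)
and Σ = S1 * S1'. A sketch of applying the corresponding decision rule to a
feature vector (illustration only, not part of the original notebook):

lda_classify = x -> dot(w, x) > c ? 2 : 1 # class 2 if w'x > c, else class 1
lda_classify(mu1) # 1
lda_classify(mu2) # 2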

Model-order selection

In [13]:
# training and test data for regression (fit below with sinusoidal models)
Random.seed!(0)
Ntrain = 40
Ntest = 30
f = (x) -> 10. / (x + 1)
xtrain = 10 * rand(Ntrain)
ytrain = f.(xtrain) + 0.4 * randn(Ntrain)
xtest = 10 * rand(Ntest)
ytest = f.(xtest) + 0.4 * randn(Ntest)

x = LinRange(0,10,201)
y = f.(x)

plot(xlabel=L"x", ylabel=L"y")
scatter!(xtrain, ytrain, color=[:blue], label="training data")
scatter!(xtest, ytest, color=[:red], label="test data")
plot!(xlim=[0,10], ylim=[0,8])
plot!(xtick=0:5:10, ytick=0:4:8)
#savefig("ml-order0.pdf")
Out[13]:
In [14]:
# show overfit
scatter(xtrain, ytrain, color=[:blue], label="training data")
plot!(xlim=[0,10], ylim=[0,8])
plot!(xtick=0:5:10, ytick=0:4:8)

Afun = (tt,deg) -> [t.^i for t in tt, i in 0:deg] # matrix of monomials (overridden below)
Afun = (tt,deg) -> [cos(2π*t*i/20) for t in tt, i in 0:deg] # matrix of sinusoids (this definition is used)
dlist = [2 9 20]
clist = (:magenta, :red, :orange)
for ii=1:length(dlist)
    deg = dlist[ii]
    A = Afun(xtrain,deg)
    coef = A \ ytrain
    y = Afun(x,deg) * coef
    plot!(x, y, line=clist[ii], label="$deg harmonics")
end
plot!(xlabel=L"x", ylabel=L"y")
#savefig("ml-order29.pdf")
Out[14]:
In [15]:
# training fit improves with more harmonics, of course, but the test error does not
dlist = 0:30
etrain = zeros(length(dlist))
etest = zeros(length(dlist))
errs = zeros(length(dlist))
for ii=1:length(dlist)
    deg = dlist[ii]
    Atrain = Afun(xtrain, deg)
    Atest = Afun(xtest, deg)
#   @show cond(A'*A) # sinusoids is more stable than polynomials
    coef = Atrain \ ytrain
    yh = Atrain * coef
    etrain[ii] = norm(yh - ytrain) 
    etest[ii] = norm(Atest * coef - ytest)
    errs[ii] = norm(yh - f.(xtrain)) # error relative to the noiseless truth
end
scatter(dlist, etrain, color=[:blue], label="fit to training data")
plot!(xlabel = "model order: # of sinusoids")
plot!(ylabel = L"fit: \ || \hat{y} - y ||_2")
plot!(ylim=[0,13], ytick=[0,13])
#savefig("ml-order-fit1.pdf")
#scatter!(dlist, errs, color=[:magenta], label="error to truth")
scatter!(dlist, etest, color=[:red], label="fit to test data")
#savefig("ml-order-fit2.pdf")
Out[15]:
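
For illustration only, the order with the smallest test error could be read off
directly; in practice the test data must not be used for model selection, which
is the role of the validation split in the next section.

dlist[argmin(etest)] # order minimizing the test error (do not select this way in practice)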

Cross-validation

In [16]:
# split the training data into learning and validation subsets
Nlearn = Int(Ntrain / 2)
Nvalid = Ntrain - Nlearn
xlearn = xtrain[1:Nlearn]
ylearn = ytrain[1:Nlearn]
xvalid = xtrain[(Nlearn+1):Ntrain]
yvalid = ytrain[(Nlearn+1):Ntrain]

plot(xlabel=L"x", ylabel=L"y")
scatter!(xlearn, ylearn, color=[:blue], label="training data (fitting)")
scatter!(xvalid, yvalid, color=[:cyan], label="validation data (model selection)")
plot!(xlim=[0,10], ylim=[0,8])
plot!(xtick=0:5:10, ytick=0:4:8)
#savefig("ml-valid0.pdf")
Out[16]:
In [17]:
# learning fit improves with more harmonics; the validation error reveals when to stop
dlist = 0:20
elearn = zeros(length(dlist))
evalid = zeros(length(dlist))
etest = zeros(length(dlist))
errs = zeros(length(dlist))
for ii=1:length(dlist)
    deg = dlist[ii]
    Alearn = Afun(xlearn, deg)
    Avalid = Afun(xvalid, deg)
    Atest = Afun(xtest, deg)
#   @show cond(A'*A) # sinusoids is more stable than polynomials
    coef = Alearn \ ylearn
    elearn[ii] = norm(Alearn * coef - ylearn)
    evalid[ii] = norm(Avalid * coef - yvalid)
    etest[ii] = norm(Atest * coef - ytest)
    errs[ii] = norm([Alearn; Avalid]*coef - f.([xlearn; xvalid]))
end
scatter(dlist, elearn, color=[:blue], label="fit to training data")
scatter!(dlist, evalid, color=[:cyan], label="fit to validation data")
plot!(xlabel = "model order: # of sinusoids")
plot!(ylabel = L"fit: \ || \hat{y} - y ||_2")
plot!(ylim=[0,13], ytick=[0,13])
#dbest = findmin(evalid)[2]
dbest = findall(diff(evalid) .>= 0)[1] # find first increase in validation error
plot!(xtick=[0, dlist[dbest], 20])
#plot!(legend=:top)
#savefig("ml-valid-fit1.pdf")
#scatter!(dlist, errs, color=[:magenta], label="error to truth")
scatter!(dlist, etest, color=[:red], label="fit to test data")
#savefig("ml-valid-fit2.pdf")
Out[17]:
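
The split above is a single hold-out. K-fold cross-validation averages the
validation error over several such splits; a minimal sketch using the variables
defined above (the 5-fold partition and cv_err accumulation below are
illustrative, not part of the original notebook):

K = 5
folds = collect(Iterators.partition(randperm(Ntrain), ceil(Int, Ntrain/K)))
cv_err = zeros(length(dlist))
for (ii, deg) in enumerate(dlist)
    for fold in folds
        train_idx = setdiff(1:Ntrain, fold) # fit using the other K-1 folds
        coef = Afun(xtrain[train_idx], deg) \ ytrain[train_idx]
        cv_err[ii] += norm(Afun(xtrain[fold], deg) * coef - ytrain[fold])^2
    end
end
dlist[argmin(cv_err)] # model order with the smallest cross-validation error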