using Plots
using Random, Distributions
using JuMP, Ipopt, SCIP
using LinearAlgebra
using Colors

# plotting colors
purple = RGB(122/256, 30/256, 71/256)
blue   = RGB(0/256, 39/256, 76/256)
myred  = RGB(220/256, 20/256, 60/256)

cd(dirname(@__FILE__))

# create random data
Random.seed!(128)
n = 100;
x = [ones(n) sort(rand(Uniform(0,1), n))]
t = 1.5 .* x[:,2] .- 1 .+ rand(Normal(0,0.5), n);
Random.seed!()

# plot the data in the prediction space
plo_prediction = plot(frame=:box, xlabel="input x", ylabel="target t")
scatter!(x[:,2], t, label="data", c=:gray)

# parameters for gradient descent
α = 0.1          # learning rate
iter_max = 3000  # number of iterations

################################################
####### Prediction-focused learning ############
################################################

# cost function (mean squared prediction error)
cost_function(w) = 1/(2n) * sum((w'*x[i,:] - t[i])^2 for i in 1:n)

### write down the standard gradient descent
w = zeros(2)
@time for i in 1:iter_max
    ŷ = x * w
    global w = w - α / n * sum("""complete the gradient here""" for j in 1:n)
    if i % 250 == 0
        println("GD iter $i ... prediction cost: $(round(cost_function(w), digits=6))")
    end
end
plot!(x[:,2], x * w, lw=5, c=purple, label="Prediction-focused model")

################################################
############# Decision problem #################
################################################

λ = 0.5

# optimization problem: map a prediction ŷ to an optimal decision z
function z_star(ŷ)
    return """write down the closed-form solution for z"""
end

# collect optimal decisions on the ground-truth targets
z_opt = zeros(n)
for i in 1:n
    z_opt[i] = z_star(t[i])
end

# collect decisions prescribed by the prediction-focused model
z_pre = zeros(n)
ŷ = x * w
for i in 1:n
    z_pre[i] = z_star(ŷ[i])
end

# plot in the decision space
plo_decision = plot(frame=:box, xlabel="input x", ylabel="decision z")
scatter!(x[:,2], z_opt, label="Decision on ground-truth data", c=:gray)
plot!(x[:,2], z_pre, label="Prediction-focused model", c=purple, lw=5)

################################################
######## Decision-focused learning #############
################################################

### solve as a bilevel optimization problem
function bo_sol()
    model = Model(optimizer_with_attributes(SCIP.Optimizer))
    # JuMP.set_silent(model)
    # variables
    @variable(model, w[1:2])
    @variable(model, ŷ[1:n])
    @variable(model, z[1:n])
    @variable(model, μ[1:n])
    @variable(model, u[1:n], Bin)
    # objective function: decision loss
    @objective(model, Min, 1/(2*n) * sum((z[i] - z_opt[i])^2 for i in 1:n))
    # constraints
    @constraint(model, ŷ .== x * w)
    @constraint(model, """add primal feasibility condition""")
    @constraint(model, """add dual feasibility condition""")
    @constraint(model, """add the Lagrange optimality condition""")
    for i in 1:n
        @constraint(model, """first BigM constraint (BigM = 100)""")
        @constraint(model, """second BigM constraint (BigM = 100)""")
    end
    # solve
    optimize!(model)
    return JuMP.value.(z), JuMP.value.(w)
end
z_dfl_bl, w_dfl_bl = bo_sol()

# add the model to the prediction space
plo_prediction = plot(plo_prediction)
plot!(x[:,2], x * w_dfl_bl, lw=5, c=:green, label="Decision-focused model (bilevel opt)", linestyle=:dot)

# add the model to the decision space
plo_decision = plot(plo_decision)
plot!(x[:,2], z_dfl_bl, label="Decision-focused model (bilevel opt)", c=:green, lw=5)

### solve using gradient descent
# function for the gradient ∂z/∂y
function ∂z(y)
    1/(1 + exp(y/(1+2*λ))) * exp(y/(1+2*λ)) + 1/(1+2*λ)
end

w_dfl_gd = zeros(2)
α = 0.1
@time for i in 1:iter_max
    ŷ = x * w_dfl_gd
    global w_dfl_gd = w_dfl_gd - α / n * sum(("""which term is missing?""" - z_opt[j]) * """which term is missing?""" * x[j,:] for j in 1:n)
    if i % 250 == 0
        println("GD iter $i ... prediction cost: $(round(cost_function(w_dfl_gd), digits=6))")
    end
end

# collect decisions prescribed by the gradient-descent model (used in the plots and the gap below)
z_dfl_gd = zeros(n)
ŷ = x * w_dfl_gd
for i in 1:n
    z_dfl_gd[i] = z_star(ŷ[i])
end

# add the model to the prediction space
plo_prediction = plot(plo_prediction)
plot!(x[:,2], x * w_dfl_gd, lw=5, c=myred, label="Decision-focused model (grad descent)")

# add the model to the decision space
plo_decision = plot(plo_decision)
plot!(x[:,2], z_dfl_gd, label="Decision-focused model (gradient descent)", c=myred, lw=5)

# print decision gap between the optimal and prescribed decisions
println("decision sub-optimality")
@show norm(z_opt .- z_pre)^2;
@show norm(z_opt .- z_dfl_bl)^2;
@show norm(z_opt .- z_dfl_gd)^2;
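
################################################
####### Reference sketches (assumptions) #######
################################################

# The two helpers below are only sketches of how the gradient-descent blanks above
# could be completed; they assume the cost/loss definitions given in this script and
# are not part of the exercise template. The names prediction_gradient and
# decision_gradient are illustrative, not from the original code.

# Prediction-focused learning: differentiating the squared-error cost
# 1/(2n) Σᵢ (w'xᵢ - tᵢ)² gives ∇C(w) = 1/n Σᵢ (ŷᵢ - tᵢ) xᵢ, so the sum inside the
# first update plausibly reads  sum((ŷ[j] - t[j]) * x[j,:] for j in 1:n).
function prediction_gradient(w, x, t)
    n = size(x, 1)
    ŷ = x * w
    return 1/n * sum((ŷ[j] - t[j]) * x[j,:] for j in 1:n)
end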
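
# Decision-focused learning: assuming the loss 1/(2n) Σᵢ (z*(ŷᵢ) - z_opt[i])² used in
# the bilevel objective, the chain rule gives ∇L(w) = 1/n Σᵢ (z*(ŷᵢ) - z_opt[i]) ∂z(ŷᵢ) xᵢ,
# so the missing terms in the second update are plausibly z_star(ŷ[j]) and ∂z(ŷ[j]).
# This sketch takes the (student-completed) decision map z_star and its derivative ∂z
# as arguments rather than presuming their closed form.
function decision_gradient(w, x, z_opt, z_star, ∂z)
    n = size(x, 1)
    ŷ = x * w
    return 1/n * sum((z_star(ŷ[j]) - z_opt[j]) * ∂z(ŷ[j]) * x[j,:] for j in 1:n)
end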