using PyPlot
using Random
import Statistics.mean
import LinearAlgebra.norm
import LinearAlgebra.I
import Printf.@printf
Random.seed!(4242);
The objective function is $$ f(w) = \frac{1}{2} \| w \|^2 $$ so the gradient is $$ \nabla f(w) = w. $$
function grad_f(w)
    return w
end
Suppose that our samples are of the form $$\tilde f(w) = \frac{1}{2} \| w + z \|^2$$ where $z$ is a Gaussian random vector scaled so that $\mathbb{E}[\|z\|^2] = 1$. Then $$ \nabla \tilde f(w) = w + z. $$
function sample_grad_f(w)
    # noisy gradient sample: the noise is scaled so that E[norm(z)^2] = 1
    return w + randn(length(w)) / sqrt(length(w))
end
function gradient_descent(w0, alpha, num_iters)
    dist_to_optimum = zeros(num_iters)
    w = w0
    for t = 1:num_iters
        # record the distance to the optimum (w* = 0) before each update
        dist_to_optimum[t] = norm(w)
        w = w - alpha * grad_f(w)
    end
    return dist_to_optimum
end
function stochastic_gradient_descent(w0, alpha, num_iters)
    dist_to_optimum = zeros(num_iters)
    w = w0
    for t = 1:num_iters
        dist_to_optimum[t] = norm(w)
        # update using a noisy gradient sample
        w = w - alpha * sample_grad_f(w)
    end
    return dist_to_optimum
end
function stochastic_gradient_descent_until(w0, alpha, accuracy, num_iters)
    w = w0
    for t = 1:num_iters
        w = w - alpha * sample_grad_f(w)
        # stop as soon as the iterate is within the target accuracy of the optimum
        if norm(w) <= accuracy
            return t
        end
    end
    # never reached the target accuracy; return the iteration budget
    return num_iters
end
w0 = 5 * randn(100);
num_iters = 1000000;
function noise_ball_size(dists)
    # RMS distance to the optimum over the second half of the trajectory,
    # after the iterates have settled into the noise ball
    return sqrt(mean(dists[ceil(Int, length(dists) / 2):end].^2))
end
alphas = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5];
ball_sizes = [noise_ball_size(stochastic_gradient_descent(w0, alpha, num_iters)) for alpha in alphas];
plot(alphas, ball_sizes);
xlabel("step size (alpha)");
ylabel("noise ball size");
convergence_times = [mean([stochastic_gradient_descent_until(w0, alpha, 1.0, num_iters) for k=1:5]) for alpha in alphas];
plot(alphas, convergence_times);
xlabel("step size (alpha)");
ylabel("time to converge to distance of 1");
plot(1 ./ alphas, convergence_times);
xlabel("inverse step size (1/alpha)");
ylabel("time to converge to distance of 1");
function stochastic_gradient_descent_diminishing(w0, num_iters)
    dist_to_optimum = zeros(num_iters)
    alphas = zeros(num_iters)
    w = w0
    mu = 1.0            # strong convexity constant
    M = 1.0             # bound on the second moment of the gradient noise
    rho0 = norm(w0)^2   # initial squared distance to the optimum
    for t = 1:num_iters
        # diminishing step size schedule from the SGD convergence bound
        alpha = (2 * mu * rho0) / (4 * M + mu^2 * rho0 * (t-1))
        alphas[t] = alpha
        dist_to_optimum[t] = norm(w)
        w = w - alpha * sample_grad_f(w)
    end
    return (alphas, dist_to_optimum)
end
w0 = randn(100) / 10;
(alphas, dto_diminishing) = stochastic_gradient_descent_diminishing(w0, 1000);
alpha1 = 0.5 * norm(w0)^2; # same as the first step of the diminishing schedule
alpha2 = 2 / (4 + norm(w0)^2 * 100); # intermediate step size
alpha3 = 2 / (4 + norm(w0)^2 * 1000); # roughly the last step size (norm(w0)^2 ≈ 1 here)
dto1 = stochastic_gradient_descent(w0, alpha1, 1000);
dto2 = stochastic_gradient_descent(w0, alpha2, 1000);
dto3 = stochastic_gradient_descent(w0, alpha3, 1000);
plot(dto_diminishing, label="optimal");
plot(dto1; label = "alpha = $(alpha1)");
plot(dto2; label = "alpha = $(alpha2)");
plot(dto3; label = "alpha = $(alpha3)");
xlabel("iteration");
ylabel("distance to optimum");
legend();
loglog(alphas; label = "alpha_t");
loglog(dto_diminishing.^2; label = "rho_t");
xlabel("iteration");
title("Optimal Step Size vs. Distance to Optimum");
legend();
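The two curves above run parallel by construction. Assuming the schedule comes from the usual diminishing-step-size bound $\bar\rho_t = \frac{4 M \rho_0}{4 M + \mu^2 \rho_0 (t-1)}$ on the expected squared distance, the step size satisfies $\alpha_t = \frac{\mu}{2 M} \bar\rho_t$, so with $\mu = M = 1$ here the squared distance should track $2 \alpha_t$. A quick hedged check:
# hedged check: under the bound above, rho_t should track 2 * alpha_t (mu = M = 1)
loglog(2 .* alphas; linestyle="--", label="2 * alpha_t");
loglog(dto_diminishing.^2; label="rho_t");
xlabel("iteration");
legend();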
x = randn(10); y = sqrt(0.95) * x + sqrt(0.05) * randn(10);
scatter(x, y)
# fit with linear regression
f = [x ones(10)];
w = y' / f';
xplot = collect(-1.5:0.1:1.5);
fplot = [xplot ones(length(xplot))]
plot(xplot, fplot * w'; color="red");
scatter(x, y);
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
# fit with degree 10 polynomial
f = hcat([x.^k for k = 0:10]...);
w = y' / f';
xplot = collect(-1.5:0.01:1.5);
fplot = hcat([xplot.^k for k = 0:10]...)
plot(xplot, fplot * w'; color="red");
scatter(x, y);
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
test_x = randn(20); test_y = sqrt(0.9) * test_x + sqrt(0.1) * randn(20);
# fit with linear regression
f = [x ones(10)];
w = y' / f';
xplot = collect(-1.5:0.1:1.5);
fplot = [xplot ones(length(xplot))]
plot(xplot, fplot * w'; color="red");
scatter(x, y);
scatter(test_x, test_y; color="orange");
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
test_f = [test_x ones(length(test_x))];
test_error = round(mean((test_y - test_f * w').^2); digits=3)
println("test error: $(test_error)")
# fit with degree 10 polynomial
f = hcat([x.^k / factorial(k) for k = 0:10]...);
w = y' / f';
xplot = collect(-1.5:0.01:1.5);
fplot = hcat([xplot.^k / factorial(k) for k = 0:10]...)
plot(xplot, fplot * w'; color="red");
scatter(x, y);
scatter(test_x, test_y; color="orange");
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
test_f = hcat([test_x.^k / factorial(k) for k = 0:10]...)
test_error = round(mean((test_y - test_f * w').^2); digits=3)
@printf("test error: %f", test_error)
Suppose that we use regularization such that for a polynomial of the form $$f_w(x) = \sum_{k = 0}^{10} \frac{w_k x^k}{k!}$$ the loss function is $$h(w) = \sum_{i=1}^N (f_w(x_i) - y_i)^2 + \sigma^2 \| w \|^2.$$ If $A$ is the matrix such that $$ A_{i,j} = \frac{x_i^j}{j!} $$ then equivalently $$h(w) = \left\| \left[\begin{array}{c} A \\ \sigma I \end{array} \right] w - \left[\begin{array}{c} y \\ 0 \end{array} \right] \right\|^2$$ so we can still solve this easily with linear regression.
# fit with regularized degree 10 polynomial
sigma = 0.01
f = hcat([x.^k / factorial(k) for k = 0:10]...);
f_reg = vcat(f, sigma * Matrix{Float64}(I, 11, 11))
y_reg = vcat(y, zeros(11))
w = y_reg' / f_reg';
xplot = collect(-1.5:0.01:1.5);
fplot = hcat([xplot.^k / factorial(k) for k = 0:10]...)
plot(xplot, fplot * w'; color="red");
scatter(x, y);
scatter(test_x, test_y; color="orange");
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
test_f = hcat([test_x.^k / factorial(k) for k = 0:10]...)
test_error = round(mean((test_y - test_f * w').^2); digits=3)
@printf("test error: %f", test_error)
function regularized_fit_error(sigma)
    # fit the regularized degree-10 polynomial to the (global) training data x, y
    # and evaluate it on a fresh, larger test set
    test_x = randn(10000); test_y = sqrt(0.9) * test_x + sqrt(0.1) * randn(10000);
    f = hcat([x.^k / factorial(k) for k = 0:10]...);
    f_reg = vcat(f, sigma * Matrix{Float64}(I, 11, 11))
    y_reg = vcat(y, zeros(11))
    w = y_reg' / f_reg';
    test_f = hcat([test_x.^k / factorial(k) for k = 0:10]...)
    training_error = round(mean((y - f * w').^2); digits=3)
    test_error = round(mean((test_y - test_f * w').^2); digits=3)
    return (training_error, test_error)
end
sigmas = 10 .^ (-3:0.1:2);
training_errors = zeros(length(sigmas));
test_errors = zeros(length(sigmas));
for i = 1:length(sigmas)
    (training_errors[i], test_errors[i]) = regularized_fit_error(sigmas[i]);
end
loglog(sigmas, training_errors, label="training error");
loglog(sigmas, test_errors, label="test error");
xlabel("regularization coefficient");
ylabel("error");
legend();
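To read the best regularization coefficient off this sweep, one can simply pick the $\sigma$ with the smallest measured test error (a small usage sketch; best_sigma is just an illustrative name).
# regularization coefficient with the smallest measured test error in the sweep
best_sigma = sigmas[argmin(test_errors)];
println("best sigma: $(best_sigma), test error: $(minimum(test_errors))")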