using PyPlot
using Random
import Statistics.mean
import LinearAlgebra.norm
import LinearAlgebra.I
import Printf.@printf
Random.seed!(4242);
The objective function is $$ f(w) = \frac{1}{2} \| w \|^2 $$ so the gradient is $$ \nabla f(w) = w. $$
function grad_f(w)
    return w
end
Suppose that our samples are of the form $$\tilde f(w) = \frac{1}{2} \| w + z \|^2$$ where $z$ is a Gaussian random vector scaled so that $\mathbb{E}[\|z\|^2] = 1$. Then $$ \nabla \tilde f(w) = w + z. $$
function sample_grad_f(w)
    # noisy gradient sample: the noise is scaled so that E[norm(z)^2] = 1
    return w + randn(length(w)) / sqrt(length(w))
end
function gradient_descent(w0, alpha, num_iters)
    dist_to_optimum = zeros(num_iters)
    w = w0
    for t = 1:num_iters
        # record the distance to the optimum (w* = 0) before each update
        dist_to_optimum[t] = norm(w)
        w = w - alpha * grad_f(w)
    end
    return dist_to_optimum
end
function stochastic_gradient_descent(w0, alpha, num_iters)
    dist_to_optimum = zeros(num_iters)
    w = w0
    for t = 1:num_iters
        dist_to_optimum[t] = norm(w)
        # update using a noisy gradient sample
        w = w - alpha * sample_grad_f(w)
    end
    return dist_to_optimum
end
function stochastic_gradient_descent_until(w0, alpha, accuracy, num_iters)
    w = w0
    for t = 1:num_iters
        w = w - alpha * sample_grad_f(w)
        # stop as soon as the iterate is within the target accuracy of the optimum
        if norm(w) <= accuracy
            return t
        end
    end
    # never reached the target accuracy; return the iteration budget
    return num_iters
end
w0 = 5 * randn(100);
num_iters = 1000000;
function noise_ball_size(dists)
    # RMS distance to the optimum over the second half of the trajectory,
    # after the iterates have settled into the noise ball
    return sqrt(mean(dists[ceil(Int, length(dists) / 2):end].^2))
end
alphas = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5];
ball_sizes = [noise_ball_size(stochastic_gradient_descent(w0, alpha, num_iters)) for alpha in alphas];
plot(alphas, ball_sizes);
xlabel("step size (alpha)");
ylabel("noise ball size");
convergence_times = [mean([stochastic_gradient_descent_until(w0, alpha, 1.0, num_iters) for k=1:5]) for alpha in alphas];
plot(alphas, convergence_times);
xlabel("step size (alpha)");
ylabel("time to converge to distance of 1");
plot(1 ./ alphas, convergence_times);
xlabel("inverse step size (1/alpha)");
ylabel("time to converge to distance of 1");
function stochastic_gradient_descent_diminishing(w0, num_iters)
    dist_to_optimum = zeros(num_iters)
    alphas = zeros(num_iters)
    w = w0
    mu = 1.0            # strong convexity constant
    M = 1.0             # bound on the second moment of the gradient noise
    rho0 = norm(w0)^2   # initial squared distance to the optimum
    for t = 1:num_iters
        # diminishing step size schedule from the SGD convergence bound
        alpha = (2 * mu * rho0) / (4 * M + mu^2 * rho0 * (t-1))
        alphas[t] = alpha
        dist_to_optimum[t] = norm(w)
        w = w - alpha * sample_grad_f(w)
    end
    return (alphas, dist_to_optimum)
end
w0 = randn(100) / 10;
(alphas, dto_diminishing) = stochastic_gradient_descent_diminishing(w0, 1000);
alpha1 = 0.5 * norm(w0)^2; # same as the first step of the diminishing schedule
alpha2 = 2 / (4 + norm(w0)^2 * 100); # intermediate step size
alpha3 = 2 / (4 + norm(w0)^2 * 1000); # roughly the last step size (norm(w0)^2 ≈ 1 here)
dto1 = stochastic_gradient_descent(w0, alpha1, 1000);
dto2 = stochastic_gradient_descent(w0, alpha2, 1000);
dto3 = stochastic_gradient_descent(w0, alpha3, 1000);
plot(dto_diminishing, label="optimal");
plot(dto1; label = "alpha = $(alpha1)");
plot(dto2; label = "alpha = $(alpha2)");
plot(dto3; label = "alpha = $(alpha3)");
xlabel("iteration");
ylabel("distance to optimum");
legend();
loglog(alphas; label = "alpha_t");
loglog(dto_diminishing.^2; label = "rho_t");
xlabel("iteration");
title("Optimal Step Size vs. Distance to Optimum");
legend();
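The two curves above run parallel by construction. Assuming the schedule comes from the usual diminishing-step-size bound $\bar\rho_t = \frac{4 M \rho_0}{4 M + \mu^2 \rho_0 (t-1)}$ on the expected squared distance, the step size satisfies $\alpha_t = \frac{\mu}{2 M} \bar\rho_t$, so with $\mu = M = 1$ here the squared distance should track $2 \alpha_t$. A quick hedged check:
# hedged check: under the bound above, rho_t should track 2 * alpha_t (mu = M = 1)
loglog(2 .* alphas; linestyle="--", label="2 * alpha_t");
loglog(dto_diminishing.^2; label="rho_t");
xlabel("iteration");
legend();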
x = randn(10); y = sqrt(0.95) * x + sqrt(0.05) * randn(10);
scatter(x, y)
# fit with linear regression
f = [x ones(10)];
w = y' / f';
xplot = collect(-1.5:0.1:1.5);
fplot = [xplot ones(length(xplot))]
plot(xplot, fplot * w'; color="red");
scatter(x, y);
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
# fit with degree 10 polynomial
f = hcat([x.^k for k = 0:10]...);
w = y' / f';
xplot = collect(-1.5:0.01:1.5);
fplot = hcat([xplot.^k for k = 0:10]...)
plot(xplot, fplot * w'; color="red");
scatter(x, y);
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
test_x = randn(20); test_y = sqrt(0.9) * test_x + sqrt(0.1) * randn(20);
# fit with linear regression
f = [x ones(10)];
w = y' / f';
xplot = collect(-1.5:0.1:1.5);
fplot = [xplot ones(length(xplot))]
plot(xplot, fplot * w'; color="red");
scatter(x, y);
scatter(test_x, test_y; color="orange");
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
test_f = [test_x ones(length(test_x))];
test_error = round(mean((test_y - test_f * w').^2); digits=3)
println("test error: $(test_error)")
# fit with degree 10 polynomial
f = hcat([x.^k / factorial(k) for k = 0:10]...);
w = y' / f';
xplot = collect(-1.5:0.01:1.5);
fplot = hcat([xplot.^k / factorial(k) for k = 0:10]...)
plot(xplot, fplot * w'; color="red");
scatter(x, y);
scatter(test_x, test_y; color="orange");
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
test_f = hcat([test_x.^k / factorial(k) for k = 0:10]...)
test_error = round(mean((test_y - test_f * w').^2); digits=3)
@printf("test error: %f", test_error)
Suppose that we use regularization such that for a polynomial of the form $$f_w(x) = \sum_{k = 0}^{10} \frac{w_k x^k}{k!}$$ the loss function is $$h(w) = \sum_{i=1}^N (f_w(x_i) - y_i)^2 + \sigma^2 \| w \|^2.$$ If $A$ is the matrix such that $$ A_{i,j} = \frac{x_i^j}{j!} $$ then equivalently $$h(w) = \left\| \left[\begin{array}{c} A \\ \sigma I \end{array} \right] w - \left[\begin{array}{c} y \\ 0 \end{array} \right] \right\|^2$$ so we can still solve this easily with linear regression.
# fit with regularized degree 10 polynomial
sigma = 0.01
f = hcat([x.^k / factorial(k) for k = 0:10]...);
f_reg = vcat(f, sigma * Matrix{Float64}(I, 11, 11))
y_reg = vcat(y, zeros(11))
w = y_reg' / f_reg';
xplot = collect(-1.5:0.01:1.5);
fplot = hcat([xplot.^k / factorial(k) for k = 0:10]...)
plot(xplot, fplot * w'; color="red");
scatter(x, y);
scatter(test_x, test_y; color="orange");
xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);
training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)")
test_f = hcat([test_x.^k / factorial(k) for k = 0:10]...)
test_error = round(mean((test_y - test_f * w').^2); digits=3)
@printf("test error: %f", test_error)
function regularized_fit_error(sigma)
    # fit the regularized degree-10 polynomial to the (global) training data x, y
    # and evaluate it on a fresh, larger test set
    test_x = randn(10000); test_y = sqrt(0.9) * test_x + sqrt(0.1) * randn(10000);
    f = hcat([x.^k / factorial(k) for k = 0:10]...);
    f_reg = vcat(f, sigma * Matrix{Float64}(I, 11, 11))
    y_reg = vcat(y, zeros(11))
    w = y_reg' / f_reg';
    test_f = hcat([test_x.^k / factorial(k) for k = 0:10]...)
    training_error = round(mean((y - f * w').^2); digits=3)
    test_error = round(mean((test_y - test_f * w').^2); digits=3)
    return (training_error, test_error)
end
sigmas = 10 .^ (-3:0.1:2);
training_errors = zeros(length(sigmas));
test_errors = zeros(length(sigmas));
for i = 1:length(sigmas)
    (training_errors[i], test_errors[i]) = regularized_fit_error(sigmas[i]);
end
loglog(sigmas, training_errors, label="training error");
loglog(sigmas, test_errors, label="test error");
xlabel("regularization coefficient");
ylabel("error");
legend();
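To read the best regularization coefficient off this sweep, one can simply pick the $\sigma$ with the smallest measured test error (a small usage sketch; best_sigma is just an illustrative name).
# regularization coefficient with the smallest measured test error in the sweep
best_sigma = sigmas[argmin(test_errors)];
println("best sigma: $(best_sigma), test error: $(minimum(test_errors))")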