using PyPlot
using LinearAlgebra
using Statistics
Let's consider a simple linear regression optimization problem over $\mathbb{R}^d$ with objective function
$$ f(w; x_i, y_i) = \frac{1}{2} (x_i^T w - y_i)^2. $$
Here, the gradient is
$$ \nabla f(w; x_i, y_i) = x_i (x_i^T w - y_i). $$
We suppose that we have a dataset $(x_1, y_1), (x_2, y_2), \ldots, (x_n, y_n)$, giving us a total empirical risk of
$$ f(w) = \frac{1}{n} \sum_{i=1}^n f(w; x_i, y_i) = \frac{1}{2n} \sum_{i=1}^n (x_i^T w - y_i)^2. $$
The gradient of this is
\begin{align*} \nabla f(w) &= \frac{1}{n} \sum_{i=1}^n \nabla f(w; x_i, y_i) \\&= \frac{1}{n} \sum_{i=1}^n x_i (x_i^T w - y_i) \\&= \left( \frac{1}{n} \sum_{i=1}^n x_i x_i^T \right) w - \frac{1}{n} \sum_{i=1}^n x_i y_i. \end{align*}
Of course, the exact minimum occurs when the gradient is $0$, at
$$ w = \left( \frac{1}{n} \sum_{i=1}^n x_i x_i^T \right)^{-1} \left( \frac{1}{n} \sum_{i=1}^n x_i y_i \right). $$
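Equivalently, if we stack the data points as the columns of a matrix $X \in \mathbb{R}^{d \times n}$ and the labels into a vector $y \in \mathbb{R}^n$, then (and this is the form the matrix code below will use)
$$ \nabla f(w) = \frac{1}{n} X \left( X^T w - y \right), \qquad w_{\mathrm{opt}} = \left( X X^T \right)^{-1} X y. $$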
# let's generate a synthetic dataset, first for the simple case of d = 1.
d = 1;
n = 1024;
X = randn(d, n);
Y = randn(n);
# find the true optimum
A = mean(X[:,i]*X[:,i]' for i = 1:n);
b = mean(X[:,i]*Y[i] for i = 1:n);
w_opt = inv(A)*b;
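As a quick sanity check (a small extra cell, not in the original notebook), the averaged gradient should vanish at this optimum, i.e. $\frac{1}{n} X (X^T w_{\mathrm{opt}} - y) \approx 0$.
# the full gradient at w_opt should be zero up to floating-point error
println(norm(X*(X'*w_opt - Y)/n));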
# component loss function
function loss_fi(w, xi, yi)
    return (dot(xi,w) - yi)^2/2;
end
# component gradient function
function grad_fi(w, xi, yi)
    return xi*(dot(xi,w) - yi);
end
# total loss function
function loss_f(w, X, Y)
    (d, n) = size(X);
    return mean(loss_fi(w, X[:,i], Y[i]) for i = 1:n);
end
# total gradient function
function grad_f(w, X, Y)
    (d, n) = size(X);
    return mean(grad_fi(w, X[:,i], Y[i]) for i = 1:n);
end
# total loss function (matrix form)
function matrix_loss_f(w, X, Y)
    (d, n) = size(X);
    return sum((X'*w - Y).^2)/(2*n);
end
# total gradient function (matrix form)
function matrix_grad_f(w, X, Y)
    (d, n) = size(X);   # get n locally rather than relying on the global variable
    return X*(X'*w - Y)/n;
end
Observe that we can write this loss function and gradient function in terms of matrix operations, which is mathematically equivalent and much faster to run! (This is one of the first examples of a systems trick we can use to make our learning algorithm more scalable...and something you've probably already seen.)
w = randn(d);
println(loss_f(w, X, Y));
println(matrix_loss_f(w, X, Y));
println(grad_f(w, X, Y));
println(matrix_grad_f(w, X, Y));
@time grad_f(w, X, Y);
@time matrix_grad_f(w, X, Y);
Now let's look at what GD and SGD do. Here, we'll measure the distance to the (known) optimum at each step.
function gradient_descent(w0::Vector{Float64}, X::Matrix{Float64}, Y::Vector{Float64},
        w_opt::Vector{Float64}, alpha::Float64, num_iters::Int64)
    dist_to_optimum = zeros(num_iters)
    loss = zeros(num_iters)
    w = w0
    for t = 1:num_iters
        w = w - alpha * matrix_grad_f(w, X, Y);
        dist_to_optimum[t] = norm(w - w_opt);
        loss[t] = matrix_loss_f(w, X, Y);
    end
    return (dist_to_optimum, loss)
end
function stochastic_gradient_descent(w0::Vector{Float64}, X::Matrix{Float64}, Y::Vector{Float64},
        w_opt::Vector{Float64}, alpha::Float64, num_iters::Int64)
    (d, n) = size(X);
    dist_to_optimum = zeros(num_iters)
    loss = zeros(num_iters)
    w = w0
    for t = 1:num_iters
        i = rand(1:n)                                 # pick a random training example
        w = w - alpha * grad_fi(w, X[:,i], Y[i]);     # step using only that example's gradient
        dist_to_optimum[t] = norm(w - w_opt);
        loss[t] = matrix_loss_f(w, X, Y);
    end
    return (dist_to_optimum, loss)
end
w0 = [5.0];
alpha = 0.1;
num_iters = 1000;
(gd_dist, gd_loss) = gradient_descent(w0, X, Y, w_opt, alpha, num_iters);
(sgd_dist, sgd_loss) = stochastic_gradient_descent(w0, X, Y, w_opt, alpha, num_iters);
plot(1:num_iters, gd_dist; label="GD");
plot(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
semilogy(1:num_iters, gd_dist; label="GD");
semilogy(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
plot(1:num_iters, gd_loss; label="GD");
plot(1:num_iters, sgd_loss; label="SGD");
xlabel("iterations");
ylabel("loss");
legend();
ylim((0,1));
Let's try something with a higher dimension!
d = 100;
n = 1024;
X = randn(d, n);
Y = randn(n);
A = mean(X[:,i]*X[:,i]' for i = 1:n);
b = mean(X[:,i]*Y[i] for i = 1:n);
w_opt = inv(A)*b;
w0 = 2 * randn(d);
alpha = 0.01;
num_iters = 5000;
(gd_dist, gd_loss) = gradient_descent(w0, X, Y, w_opt, alpha, num_iters);
(sgd_dist, sgd_loss) = stochastic_gradient_descent(w0, X, Y, w_opt, alpha, num_iters);
plot(1:num_iters, gd_dist; label="GD");
plot(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
plot(1:num_iters, gd_dist; label="GD");
plot(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
ylim([-0.1,2.1]);
semilogy(1:num_iters, gd_dist; label="GD");
semilogy(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
plot(1:num_iters, gd_loss; label="GD");
plot(1:num_iters, sgd_loss; label="SGD");
xlabel("iterations");
ylabel("loss");
legend();
ylim((0,2));
What does this tell us about the convergence of SGD?