using LinearAlgebra
using Statistics
using SparseArrays
using MLDatasets
using PyPlot
d = 100000;
n = 100;
Xs = randn(d,n); # the dataset
epsilon = 0.1;
D = Int64(ceil(8 * log(n) / epsilon^2))
A = randn(D, d) / sqrt(D);
# let's loook at this for a single example
norm(Xs[:,1] - Xs[:,2])
norm(A*Xs[:,1] - A*Xs[:,2])
norm(A*Xs[:,1] - A*Xs[:,2]) / norm(Xs[:,1] - Xs[:,2])
AXs = A*Xs;
# over the whole dataset
extrema((i == j) ? 1.0 : norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) for i = 1:n, j = 1:n)
# how much did we decrease the dimension?
D/d
n = 60000;
d = 28*28;
train_x, train_y = MNIST.traindata();
train_x = Float64.(reshape(train_x, (d, n)));
train_x_mean = mean(train_x; dims=2);
train_x_minus_mean = train_x .- train_x_mean;
Sigma = (train_x_minus_mean * train_x_minus_mean')/n;
# find the eigendecomposition of Sigma
ESigma = eigen(Sigma; sortby = (x -> -x))
# largest eigenvalue
ESigma.values[1]
# corresponding eigenvector
ESigma.vectors[:, 1]
imshow(reshape(ESigma.vectors[:, 1], (28,28))');
imshow(reshape(ESigma.vectors[:, 2], (28,28))');
imshow(reshape(ESigma.vectors[:, 3], (28,28))');
imshow(reshape(ESigma.vectors[:, 4], (28,28))');
imshow(reshape(ESigma.vectors[:, 5], (28,28))');
plot(ESigma.values)
ylabel("eigenvalue")
xlabel("index of eigenvalue")
title("Eigenvalues of MNIST Covariance Matrix");
PCA can represent objects in low dimension without losing information.
D = 5;
A = ESigma.vectors[:, 1:D]';
# original image
i = 1337;
imshow(reshape(train_x[:,i], (28,28))');
# dimension-reduced image
x_dr = A * train_x_minus_mean[:,i]
x_recovered = A' * x_dr + train_x_mean;
# original image
imshow(reshape(x_recovered, (28,28))');
Still enough to classify the image!
n = 4096;
X = sprand(n,n,0.1);
Y = sprand(n,n,0.1);
# visualize it
imshow(X);
# get the size of X in bytes
Base.summarysize(X)
# type of X
typeof(X)
Xdense = Matrix(X);
Ydense = Matrix(Y);
typeof(Xdense)
Base.summarysize(Xdense)
# what's the ratio?
Base.summarysize(X) / Base.summarysize(Xdense)
# what fraction of X's entries are nonzero?
nnz(X) / length(X)
Why are these numbers not equal?
@time X * Y;
dense_mmpy_time = @elapsed Xdense * Ydense
What happened here???
# make X and Y less dense
Xsparser = sprand(n,n,0.01);
Ysparser = sprand(n,n,0.01);
@time Xsparser * Ysparser;
densities = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3];
sparse_mm_times = Float64[];
for d in densities
Xsparse = sprand(n,n,d);
Ysparse = sprand(n,n,d);
push!(sparse_mm_times, @elapsed(Xsparse * Ysparse));
end
loglog(densities, sparse_mm_times; label="sparse mmpy");
loglog(densities, densities * 0 .+ dense_mmpy_time; label="dense mmpy");
xlabel("density");
ylabel("matrix multiply runtime");
legend();