using LinearAlgebra
using Statistics
using SparseArrays
using MLDatasets
using PyPlot
d = 100000;
n = 100;
Xs = randn(d,n); # the dataset
epsilon = 0.1;
D = Int64(ceil(8 * log(n) / epsilon^2))
3685
A = randn(D, d) / sqrt(D);
# let's look at this for a single example
norm(Xs[:,1] - Xs[:,2])
447.3594790493648
norm(A*Xs[:,1] - A*Xs[:,2])
453.60422837944606
norm(A*Xs[:,1] - A*Xs[:,2]) / norm(Xs[:,1] - Xs[:,2])
1.0139591304589126
AXs = A*Xs;
# over the whole dataset
extrema((i == j) ? 1.0 : norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) for i = 1:n, j = 1:n)
(0.9519405550668073, 1.046375370822268)
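As a sanity check (a sketch, not in the original transcript; ratios is a new name), we can verify the Johnson-Lindenstrauss guarantee that every pairwise distance is distorted by at most a factor of 1 ± epsilon; with high probability this should return true.
# collect all off-diagonal distortion ratios and check they lie in [1 - epsilon, 1 + epsilon]
ratios = [norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) for i = 1:n, j = 1:n if i != j];
all(r -> abs(r - 1.0) <= epsilon, ratios)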
# how much did we decrease the dimension?
D/d
0.03685
n = 60000;
d = 28*28;
train_x, train_y = MNIST.traindata();
train_x = Float64.(reshape(train_x, (d, n)));
train_x_mean = mean(train_x; dims=2);
train_x_minus_mean = train_x .- train_x_mean;
Sigma = (train_x_minus_mean * train_x_minus_mean')/n;
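Note that Sigma uses the 1/n normalization; the cov function from Statistics would give the unbiased 1/(n-1) version instead (a sketch, not in the original transcript; Sigma_unbiased is a new name).
# Statistics.cov treats rows as variables when dims=2 and normalizes by 1/(n-1),
# so it should match our Sigma up to a factor of n/(n-1)
Sigma_unbiased = cov(train_x; dims=2);
maximum(abs.(Sigma_unbiased .* ((n-1)/n) .- Sigma))  # should be ~0 up to floating-point error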
# find the eigendecomposition of Sigma
ESigma = eigen(Sigma; sortby = (x -> -x))
Eigen{Float64,Float64,Array{Float64,2},Array{Float64,1}}
values:
784-element Array{Float64,1}:
  5.116787728342092
  3.7413284788648067
  3.2526542396844844
  2.8415733372665466
  2.5670749588127744
  ⋮
 -6.378346028260054e-17
 -9.992685906775396e-17
vectors:
784×784 Array{Float64,2}:
 0.0  0.0  0.0  …  0.0  0.0  0.0
  ⋮              ⋱
 0.0  0.0  0.0  …  0.0  0.0  0.0
# largest eigenvalue
ESigma.values[1]
5.116787728342092
# corresponding eigenvector
ESigma.vectors[:, 1]
784-element Array{Float64,1}:
  0.0
  0.0
  0.0
 -1.1302299848127289e-6
  ⋮
 -4.952733260038533e-7
  1.3784202612423542e-7
  0.0
  0.0
imshow(reshape(ESigma.vectors[:, 1], (28,28))');
imshow(reshape(ESigma.vectors[:, 2], (28,28))');
imshow(reshape(ESigma.vectors[:, 3], (28,28))');
imshow(reshape(ESigma.vectors[:, 4], (28,28))');
imshow(reshape(ESigma.vectors[:, 5], (28,28))');
plot(ESigma.values)
ylabel("eigenvalue")
xlabel("index of eigenvalue")
title("Eigenvalues of MNIST Covariance Matrix");
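The eigenvalues decay rapidly. As a quick follow-up (a sketch, not part of the original transcript; lambda and explained are new names), we can compute the cumulative fraction of variance captured by the leading components.
# clamp the tiny negative eigenvalues (numerical noise) to zero,
# then compute the cumulative fraction of total variance
lambda = max.(ESigma.values, 0.0);
explained = cumsum(lambda) ./ sum(lambda);
explained[5]  # fraction of variance captured by the top 5 components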
PCA can represent objects in a low-dimensional space without losing much information.
D = 5;
A = ESigma.vectors[:, 1:D]';
# original image
i = 1337;
imshow(reshape(train_x[:,i], (28,28))');
# dimension-reduced representation
x_dr = A * train_x_minus_mean[:,i];
x_recovered = A' * x_dr + train_x_mean;
# recovered image
imshow(reshape(x_recovered, (28,28))');
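How much did we lose? A hedged sketch (not in the original run) of the relative reconstruction error:
# relative L2 distance between the original image and its D = 5 reconstruction
norm(vec(x_recovered) - train_x[:,i]) / norm(train_x[:,i])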
Still enough to classify the image!
n = 4096;
X = sprand(n,n,0.1);
Y = sprand(n,n,0.1);
# visualize part of it
imshow(X[1:64,1:64]);
colorbar();
# get the size of X in bytes
Base.summarysize(X)
26896696
# type of X
typeof(X)
SparseMatrixCSC{Float64,Int64}
Xdense = Matrix(X);
Ydense = Matrix(Y);
typeof(Xdense)
Array{Float64,2}
Base.summarysize(Xdense)
134217768
# what's the ratio?
Base.summarysize(X) / Base.summarysize(Xdense)
0.20039594161631416
# what fraction of X's entries are nonzero?
nnz(X) / length(X)
0.10007530450820923
Why are these numbers not equal? Because the CSC format stores an Int64 row index alongside each Float64 value, plus an array of column pointers, each nonzero costs roughly 16 bytes rather than 8, which makes the memory ratio about twice the density.
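We can check this accounting by hand (a sketch; the estimate comes in slightly below Base.summarysize, which also counts the struct headers):
# CSC storage estimate: 8 bytes per Float64 value + 8 bytes per Int64 row index,
# plus 8 bytes for each of the n+1 column pointers
16 * nnz(X) + 8 * (n + 1)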
@time X * Y;
1.312289 seconds (110.34 k allocations: 287.229 MiB, 0.31% gc time)
dense_mmpy_time = @elapsed Xdense * Ydense
1.174083337
What happened here??? The sparse multiply performs far fewer floating-point operations, but its irregular memory access and index bookkeeping are much slower per operation than the highly optimized dense BLAS routine, and at 10% density the product is nearly dense anyway, so the sparse format buys almost nothing.
# make X and Y less dense
Xsparser = sprand(n,n,0.01);
Ysparser = sprand(n,n,0.01);
@time Xsparser * Ysparser;
0.121884 seconds (8 allocations: 94.578 MiB)
densities = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3];
sparse_mm_times = Float64[];
for d in densities
    Xsparse = sprand(n,n,d);
    Ysparse = sprand(n,n,d);
    push!(sparse_mm_times, @elapsed(Xsparse * Ysparse));
end
loglog(densities, sparse_mm_times; label="sparse mmpy");
loglog(densities, densities * 0 .+ dense_mmpy_time; label="dense mmpy");
xlabel("density");
ylabel("matrix multiply runtime");
legend();
spv = X[:,1]
4096-element SparseVector{Float64,Int64} with 427 stored entries:
  [16  ]  =  0.494497
  [18  ]  =  0.826087
  [34  ]  =  0.687849
  [59  ]  =  0.318373
  ⋮
  [4080]  =  0.67434
  [4084]  =  0.384076
  [4089]  =  0.188717
spv.n
4096
spv.nzind
427-element Array{Int64,1}:
   16
   18
   34
   59
    ⋮
 4080
 4084
 4089
spv.nzval
427-element Array{Float64,1}:
 0.4944967485040994
 0.8260867791524058
 0.6878492561841358
 ⋮
 0.3840757164610593
 0.18871747643515846
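These two parallel arrays, together with the length, are the entire representation. As an illustration (a sketch, not in the original transcript; dense_v is a new name), we can rebuild the dense vector from them by hand:
# scatter the stored values into a dense vector of zeros
dense_v = zeros(spv.n);
dense_v[spv.nzind] = spv.nzval;
dense_v == Vector(spv)  # should be true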