import numpy
import scipy
import matplotlib
from matplotlib import pyplot
import time
matplotlib.rcParams.update({'font.size': 18})
Let's consider a setting in which we are using 0-1 loss for our empirical risk, and imagine that our error rate is $p = 0.3$ over the whole dataset of $n = 1000000$ examples. Without loss of generality, suppose that the first $30\%$ of the examples are errors and the remainder are not. We can construct the losses of these examples as follows.
n = 1000000;
p = 0.3;
L = numpy.hstack((numpy.ones(int(n * p)), numpy.zeros(n - int(n * p))));
L
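As a quick sanity check, averaging these 0-1 losses recovers the true empirical risk:
numpy.mean(L)  # should print 0.3: there are 300000 ones among the 1000000 losses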
Next, let's sample some random variables $Z_k$, each drawn uniformly at random (with replacement) from the losses in our dataset.
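Since each $Z_k$ is drawn uniformly from the losses, its expectation is exactly the empirical risk: $$ \mathbf{E}[Z_k] = \frac{1}{n} \sum_{i=1}^n L_i = p, $$ so an average of many such samples should concentrate around $p$.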
Kmax = 100000;
Z = numpy.random.choice(L,Kmax)
Z
Next, we compute the partial averages $$ S_K = \frac{1}{K} \sum_{k=1}^K Z_k. $$
S = numpy.cumsum(Z) / numpy.arange(1,(Kmax+1))  # running averages: S[K-1] = (Z[0] + ... + Z[K-1]) / K
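As a quick check on the cumulative-sum trick, the $K$-th entry of S should agree, up to floating-point roundoff, with a direct average of the first $K$ samples:
# compare the running average against a direct mean over the first K samples
K = 100
print(abs(S[K-1] - numpy.mean(Z[:K])))  # should be essentially zero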
Now we can plot this average and see how it changes as we increase $K$.
Kplot = 100;
pyplot.plot(numpy.arange(Kplot), S[0:Kplot], label="average");
pyplot.plot(numpy.arange(Kplot), p * numpy.ones(Kplot), "k:", label="true empirical risk");
pyplot.legend();
pyplot.show();
# what's the error at the end?
print(f"true empirical risk: {p}");
print(f"approx empirical risk: {S[Kplot-1]}");
print(f"error : {abs(S[Kplot-1]-p)}");
Kplot = 1000;
pyplot.plot(numpy.arange(Kplot), S[0:Kplot], label="average");
pyplot.plot(numpy.arange(Kplot), p * numpy.ones(Kplot), "k:", label="true empirical risk");
pyplot.legend();
pyplot.show();
# what's the error at the end?
print(f"true empirical risk: {p}");
print(f"approx empirical risk: {S[Kplot-1]}");
print(f"error : {abs(S[Kplot-1]-p)}");
Kplot = Kmax;
pyplot.plot(numpy.arange(Kplot), S[0:Kplot], label="average");
pyplot.plot(numpy.arange(Kplot), p * numpy.ones(Kplot), "k:", label="true empirical risk");
pyplot.legend();
pyplot.show();
# what's the error at the end?
print(f"true empirical risk: {p}");
print(f"approx empirical risk: {S[Kplot-1]}");
print(f"error : {abs(S[Kplot-1]-p)}");
Now let's look at a larger example. We'll draw feature vectors $x$ at random, generate labels $y$ that mostly agree with a random linear model $w$ (each label is flipped with probability $1/4$), and evaluate the hypothesis $$h_w(x) = \operatorname{sign}(x^T w).$$
n = 1000000;
d = 256;
Xs = [numpy.random.randn(d) for i in range(n)];
w = numpy.random.randn(d);
Ys = [numpy.sign(numpy.random.choice([-1.0,0.9,1.0,1.1])*numpy.sign(numpy.dot(Xs[i],w))) for i in range(n)];
# the -1.0 choice above flips each label with probability 1/4, so the error should be about 25%
def total_error(Xs, Ys, w):
    # exact empirical risk: the fraction of all n examples where h_w(x) disagrees with the label
    n = len(Ys)
    return numpy.mean([numpy.sign(numpy.dot(Xs[i],w)) != Ys[i] for i in range(n)])
t = time.time()
print(f"total error: {total_error(Xs, Ys, w)}")
print(f"time elapsed: {time.time()-t} seconds")
def estimate_error(Xs, Ys, w, K):
    # subsampled estimate of the error: average the 0-1 loss over K examples drawn uniformly with replacement
    n = len(Ys)
    return numpy.mean([numpy.sign(numpy.dot(Xs[i],w)) != Ys[i] for i in numpy.random.randint(n, size=K)])
t = time.time()
print(f"total error: {estimate_error(Xs, Ys, w, 10)}")
print(f"time elapsed: {time.time()-t} seconds")
t = time.time()
print(f"total error: {estimate_error(Xs, Ys, w, 1000)}")
print(f"time elapsed: {time.time()-t} seconds")
t = time.time()
print(f"total error: {estimate_error(Xs, Ys, w, 10000)}")
print(f"time elapsed: {time.time()-t} seconds")
t = time.time()
print(f"total error: {estimate_error(Xs, Ys, w, 100000)}")
print(f"time elapsed: {time.time()-t} seconds")
t = time.time()
print(f"total error: {estimate_error(Xs, Ys, w, 1000000)}")
print(f"time elapsed: {time.time()-t} seconds")