"""
    float32_as_binary(x::Float32)

Print the 32-character bit string of `x` as three space-separated groups
(the first bit, the next 8 bits, and the remaining 23 bits), followed by a
line labelling those groups as sign, exponent, and mantissa.
"""
function float32_as_binary(x::Float32)
    bits = bitstring(x)
    # Slice the bit string into its 1 / 8 / 23 character groups.
    fields = (bits[1:1], bits[2:9], bits[10:32])
    println(" ", join(fields, " "))
    println("sgn exponent mantissa")
    return nothing
end
"""
    binary_as_float32(sgn, exponent, mantissa) -> Float32

Build a `Float32` from the three bit fields of its 32-bit encoding, each given
as a string of `0`/`1` characters. The fields are concatenated in the order
sign, exponent, mantissa, parsed as a base-2 `UInt32`, and reinterpreted as a
`Float32`.

# Arguments
- `sgn`: the sign bit, exactly 1 character.
- `exponent`: the exponent bits, exactly 8 characters.
- `mantissa`: the mantissa bits, exactly 23 characters.

# Throws
- `ArgumentError` if a field has the wrong length or contains a character
  other than `0` or `1`.
"""
function binary_as_float32(
    sgn::AbstractString, exponent::AbstractString, mantissa::AbstractString
)
    # Validate with explicit throws rather than `@assert`: asserts may be
    # disabled, and a short or malformed field would otherwise be parsed into
    # a silently wrong value.
    for (name, field, width) in
        (("sgn", sgn, 1), ("exponent", exponent, 8), ("mantissa", mantissa, 23))
        length(field) == width || throw(
            ArgumentError("$name must be exactly $width bit(s) long, got $(length(field))"),
        )
        all(in(('0', '1')), field) ||
            throw(ArgumentError("$name must contain only '0' and '1' characters"))
    end
    bits = parse(UInt32, string(sgn, exponent, mantissa); base=2)
    return reinterpret(Float32, bits)
end
x = 3.0f0  # single-precision literal for 3.0
$3 = 11_b = 1.1_b \times 10_b^1 = 1.1_b \times 10_b^{128 - 127} = 1.1_b \times 10_b^{10000000_b - 127}$
# Bit layouts of `x` and of 3.5.
float32_as_binary(x)
float32_as_binary(3.5f0)
# All-ones exponent with a nonzero mantissa decodes to NaN.
binary_as_float32("1", "11111111", "00000000000000000000110")
# Special values produced by arithmetic: 1/0 is Inf, 0/0 is NaN.
1.0f0 / 0.0f0
0.0f0 / 0.0f0
# Note: Julia's `sqrt` throws a DomainError for a negative real argument.
sqrt(-1.0f0)
# Inf - Inf is NaN.
1.0f0 / 0.0f0 - 1.0f0 / 0.0f0
# Negative zero: sign bit set, every other bit clear; 1 / -0 is -Inf.
neg0 = binary_as_float32("1", "00000000", "00000000000000000000000")
1.0f0 / neg0
-0.0f0
# 1/3 is not exactly representable; inspect its bits and its relative error
# against the Float64 value.
1.0f0 / 3.0f0
float32_as_binary(1.0f0 / 3.0f0)
abs(1.0f0 / 3.0f0 - 1.0 / 3.0) / (1 / 3)
machine epsilon: $1.2 \times 10^{-7}$
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
Randomized rounding often does a better job of preserving the mean of a large vector than nearest-neighbor rounding.
Suppose we have a dataset that we quantize to 8-bit integers, and we then measure the mean of the quantized values.
using Statistics
# Draw N normally distributed samples, scaled by 0.1 and shifted to 3.4.
N = 1024;
X_original = 3.4 .+ 0.1 .* randn(N);
mean(X_original)
# Quantize to Int8 by rounding each sample to the nearest integer.
X_8bit_nearest = round.(Int8, X_original);
mean(X_8bit_nearest)
# Quantize to Int8 by adding uniform [0, 1) noise and taking the floor
# (randomized rounding).
X_8bit_randomized = floor.(Int8, X_original .+ rand(N));
mean(X_8bit_randomized)
# Absolute error of each quantized mean relative to the original mean.
abs(mean(X_8bit_randomized) - mean(X_original))
abs(mean(X_8bit_nearest) - mean(X_original))