Accelerators
2024-10-10
To the board!
CPU code calls GPU kernels
“Hello world”: vector addition in CUDA
__global__
void gpu_add(int n, float* x, float* y)
{
    for (int i = 0; i < n; ++i)
        y[i] += x[i];
}
gpu_add<<<1,1>>>(n, d_x, d_y);
__global__
void gpu_add(int n, float* x, float* y)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)
        y[i] += x[i];
}
// Should have n <= n_blocks * block_size
gpu_add<<<n_blocks, block_size>>>(n, d_x, d_y);
void add(const std::vector<float>& x, std::vector<float>& y)
{
    int n = x.size();
    int size = n * sizeof(float);
    float *d_x, *d_y;
    cudaMalloc((void**)&d_x, size); cudaMalloc((void**)&d_y, size);
    cudaMemcpy(d_x, x.data(), size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y.data(), size, cudaMemcpyHostToDevice);
    int bs = 256;
    int nblocks = (n+bs-1)/bs;
    gpu_add<<<nblocks,bs>>>(n, d_x, d_y);
    cudaMemcpy(y.data(), d_y, size, cudaMemcpyDeviceToHost);
    cudaFree(d_x); cudaFree(d_y);
}
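A minimal host-side usage sketch of the add wrapper above; main and the fill values are illustrative, not from the slides:
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    int n = 1 << 20;
    std::vector<float> x(n, 1.0f), y(n, 2.0f);
    add(x, y);  // every y[i] should now be 3.0f
    float max_err = 0.0f;
    for (int i = 0; i < n; ++i)
        max_err = std::max(max_err, std::fabs(y[i] - 3.0f));
    printf("max error: %g\n", max_err);
    return 0;
}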
__global__
void gpu_add(int n, float* x, float* y)
{
    int index = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] += x[i];
}
int blockSize = 256;
int devId, numSMs;
cudaGetDevice(&devId);
cudaDeviceGetAttribute(&numSMs,
                       cudaDevAttrMultiProcessorCount, devId);
gpu_add<<<32*numSMs, blockSize>>>(n, d_x, d_y);
CUDA hierarchy
// Consider time-stepping a 96-by-96 grid equation
dim3 dimGrid(6,6,1);
dim3 dimBlock(16,16,1);
wave2Dkernel<<<dimGrid,dimBlock>>>(...);
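Inside such a kernel, each thread would typically recover its global grid coordinates from the block and thread indices. A hedged sketch follows; the real wave2Dkernel arguments are elided above, so nx, ny and the body are placeholders:
__global__
void wave2Dkernel(int nx, int ny /* plus field arguments */)
{
    // 6 blocks of 16 threads per dimension span the 96-by-96 grid
    int ix = threadIdx.x + blockDim.x * blockIdx.x;
    int iy = threadIdx.y + blockDim.y * blockIdx.y;
    if (ix < nx && iy < ny) {
        // update grid point (ix, iy)
    }
}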
threadIdx.x, blockIdx.x, blockDim.x, and gridDim.x (and the .y, .z analogues) give the thread/block coordinates and dimensions.
gridDim.x in \(1\) to \(2^{31}-1\), others in \(1\) to \(2^{16}-1\).

Continuous: \[ \frac{\partial^2 u}{\partial t^2} = c^2 \frac{\partial^2 u}{\partial x^2} \] with \(u(0) = u(1) = 0\).
(Think 2D-3D — 1D is to make it all fit on slides.)
Discrete: \[ (\Delta t)^{-2} \left( u_{t+1,k}-2u_{t,k}+u_{t-1,k} \right) = c^2 (\Delta x)^{-2} \left( u_{t,k+1}-2u_{t,k}+u_{t,k-1} \right) \] with \(u_{t,0} = u_{t,N} = 0\).
Becomes: \[ u_{t+1,k} = 2 (1-C^2) u_{t,k} - u_{t-1,k} + C^2 (u_{t,k+1} + u_{t,k-1}) \] where \(C = c (\Delta t/\Delta x)\). Let \(D = 2(1-C^2)\).
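Spelling out the rearrangement: multiply both sides by \((\Delta t)^2\) and collect the \(u_{t,k}\) terms,
\[ u_{t+1,k} = 2u_{t,k} - u_{t-1,k} + C^2\left(u_{t,k+1} - 2u_{t,k} + u_{t,k-1}\right) = 2(1-C^2)\,u_{t,k} - u_{t-1,k} + C^2\left(u_{t,k+1} + u_{t,k-1}\right). \]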
void step(int nx, float C2, float D,
          float* restrict unext,  // u_{t+1,:}
          float* restrict u,      // u_{t,:}
          float* restrict uprev)  // u_{t-1,:}
{
    for (int k = 1; k < nx-1; ++k)
        unext[k] = D*u[k] - uprev[k] + C2*(u[k+1] + u[k-1]);
}
u[0] = u[nx-1] = 0 for BCs
void step(int nx, float C2, float D,
          float* restrict unext,  // u_{t+1,:}
          float* restrict u,      // u_{t,:}
          float* restrict uprev)  // u_{t-1,:}
{
    #pragma omp for
    for (int k = 1; k < nx-1; ++k)
        unext[k] = D*u[k] - uprev[k] + C2*(u[k+1] + u[k-1]);
}
Is this a good idea?
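For context, a sketch of how that omp-for version might be driven, assuming the whole time loop runs inside one parallel region. steps_omp and the three explicit buffers are hypothetical names; the point is that the implicit barrier at the end of each omp for is what keeps the time steps in order:
void steps_omp(int nx, int nt, float C2, float D,
               float* u0, float* u1, float* u2)  // u_{t-1}, u_t, scratch
{
    #pragma omp parallel
    {
        // Private pointer copies; every thread rotates them identically
        float* uprev = u0;
        float* u     = u1;
        float* unext = u2;
        for (int t = 0; t < nt; ++t) {
            step(nx, C2, D, unext, u, uprev);  // omp for splits the k loop
            float* tmp = uprev; uprev = u; u = unext; unext = tmp;
        }
    }
}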
void steps(int nx, int nt, float C2, float D, float* uu)
{
    for (int t = 0; t < nt; ++t) {
        float* unext = uu + ((t+1)%3) * nx;
        float* u     = uu + ( t   %3) * nx;
        float* uprev = uu + ((t+2)%3) * nx;
        for (int k = 1; k < nx-1; ++k)
            unext[k] = D*u[k] - uprev[k] + C2*(u[k+1] + u[k-1]);
    }
}
uu is length 3*nx (pay attention to first two)
nt time steps
Tile boundaries at 1+(i*(nx-1))/ntiles; each tile needs data nt points from its edge.
To the board!
Assuming steps_tiled is called in a parallel region:
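steps_tiled itself was worked on the board rather than shown. Here is one hedged reconstruction, assuming one tile per thread of the enclosing parallel region, nt halo points per side, and the same three-row uu layout as steps above; only the tile-boundary formula comes from the notes, the rest is my guess at the intent:
#include <omp.h>
#include <stdlib.h>
#include <string.h>

void steps_tiled(int nx, int nt, float C2, float D, float* uu)
{
    int ntiles = omp_get_num_threads();   // one tile per thread
    int i = omp_get_thread_num();

    int lo = 1 + (i*(nx-1))/ntiles;       // tile i owns interior points [lo, hi)
    int hi = 1 + ((i+1)*(nx-1))/ntiles;
    int wlo = lo-nt > 0 ? lo-nt : 0;      // working window = tile + nt halo per side
    int whi = hi+nt < nx ? hi+nt : nx;
    int w = whi - wlo;

    // Private copies of u_t (row 0) and u_{t-1} (row 2), window only
    float* s = (float*) malloc(3*w*sizeof(float));
    memcpy(s,     uu + wlo,        w*sizeof(float));
    memcpy(s+2*w, uu + 2*nx + wlo, w*sizeof(float));

    // Advance nt steps privately; values within nt of a non-physical window
    // edge go stale, but the owned range [lo, hi) stays correct.
    for (int t = 0; t < nt; ++t) {
        float* unext = s + ((t+1)%3)*w;
        float* u     = s + ( t   %3)*w;
        float* uprev = s + ((t+2)%3)*w;
        for (int k = 1; k < w-1; ++k)
            unext[k] = D*u[k] - uprev[k] + C2*(u[k+1] + u[k-1]);
        unext[0] = unext[w-1] = 0;        // only meaningful at the physical ends
    }

    // Every tile must finish reading uu before anyone writes it back
    #pragma omp barrier
    memcpy(uu + (nt%3)*nx + lo,     s + (nt%3)*w     + (lo-wlo), (hi-lo)*sizeof(float));
    memcpy(uu + ((nt+2)%3)*nx + lo, s + ((nt+2)%3)*w + (lo-wlo), (hi-lo)*sizeof(float));
    free(s);
}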
__device__
void steps(int nx, int nt, float C2, float D, float* uu)
{
    for (int t = 0; t < nt; ++t) {
        float* unext = uu + ((t+1)%3) * nx;
        float* u     = uu + ( t   %3) * nx;
        float* uprev = uu + ((t+2)%3) * nx;
        int k = threadIdx.x+1;
        if (k < nx-1)
            unext[k] = D*u[k] - uprev[k] + C2*(u[k+1] + u[k-1]);
    }
}
What goes wrong?
__device__
void steps(int nx, int nt, float C2, float D, float* uu)
{
    for (int t = 0; t < nt; ++t) {
        float* unext = uu + ((t+1)%3) * nx;
        float* u     = uu + ( t   %3) * nx;
        float* uprev = uu + ((t+2)%3) * nx;
        int k = threadIdx.x+1;
        if (k < nx-1)
            unext[k] = D*u[k] - uprev[k] + C2*(u[k+1] + u[k-1]);
        __syncthreads();
    }
}
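Since steps is a __device__ function, it still needs a __global__ entry point to launch; a minimal sketch, assuming a single block because __syncthreads only synchronizes threads within one block. steps_kernel and d_uu are illustrative names, not from the slides:
__global__
void steps_kernel(int nx, int nt, float C2, float D, float* uu)
{
    steps(nx, nt, C2, D, uu);
}

// One block of nx-2 threads covers the interior points; this assumes
// nx-2 <= 1024 (the per-block thread limit) and that d_uu is a device
// array of length 3*nx.
steps_kernel<<<1, nx-2>>>(nx, nt, C2, D, d_uu);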
Have a great fall break!