Shared memory
2024-10-03
Basic syntax: #pragma omp construct [clause …]
exit()
// in such a block

// All-to-all interaction sum, parallelized over the outer index.
//
// For every i in [0, f.size()), overwrites f[i] with the sum over all j of
// 1/(x[i]-x[j])^2, skipping pairs whose separation is exactly zero (this
// covers j == i as well as coincident points).
//
// f: output, one entry per point; previous contents are discarded.
// x: point coordinates; must hold at least f.size() elements, since it is
//    indexed with the same range as f.
//
// Each iteration writes only its own f[i] and accumulates into a local, so
// the iterations of the outer loop are independent of one another.
void accumulate_a2a(std::vector<double>& f, const std::vector<double>& x)
{
    // Signed loop bound, made explicit rather than an implicit narrowing.
    const int n = static_cast<int>(f.size());
    #pragma omp parallel for
    for (int i = 0; i < n; ++i) {
        double fi = 0.0;
        const double xi = x[i];
        for (int j = 0; j < n; ++j) {
            const double dij = xi-x[j];
            if (dij != 0.0)
                fi += 1.0/(dij*dij);
        }
        f[i] = fi;
    }
}
Aside: tiling transformation on this code?
// All-to-all interaction sum that visits each unordered pair (i, j) once.
//
// For every pair i < j with x[i] != x[j], adds 1/(x[i]-x[j])^2 to both f[i]
// and f[j]. Contributions are accumulated onto the existing contents of f,
// so the caller is responsible for initializing f (e.g. to zero).
//
// f: in/out accumulator, one entry per point.
// x: point coordinates; must hold at least f.size() elements.
//
// Pairs with zero separation are skipped, so coincident points do not
// inject infinities into f.
//
// Note: iteration i of the outer loop also writes f[j] for j > i, so
// different outer iterations update the same entries of f; the outer loop
// is therefore not safe to run concurrently without extra synchronization.
void accumulate_a2a(std::vector<double>& f, const std::vector<double>& x)
{
    const int n = static_cast<int>(f.size());
    for (int i = 0; i < n; ++i) {
        const double xi = x[i];
        for (int j = i+1; j < n; ++j) {
            const double dij = xi-x[j];
            if (dij != 0.0) {
                // Compute the pair term once and apply it to both endpoints.
                const double fij = 1.0/(dij*dij);
                f[i] += fij;
                f[j] += fij;
            }
        }
    }
}
Why would omp parallel for fail here?
sections: like cobegin/coend
single: do only in one thread (e.g. I/O)
master: do only in master thread; others skip
#pragma omp parallel
{
// Each `section` below is a separate unit of work within the sections construct.
#pragma omp sections nowait
{
#pragma omp section
do_something();
#pragma omp section
and_something_else();
#pragma omp section
and_this_too();
// nowait: no implicit barrier at the end of the sections construct
}
// Implicit barrier here, at the end of the parallel region
}
Use sections nowait to remove the implicit barrier at the end of the sections construct.
Task involves:
Tasks are handled by the runtime and complete at barriers or at a taskwait.
#pragma omp parallel
{
// single: only one thread walks the list and generates the tasks;
// nowait: the single construct itself has no barrier at its end.
#pragma omp single nowait
{
for (link_t* link = head; link; link = link->next)
// One task per list node. firstprivate gives each task its own copy of the
// current `link` pointer, because the loop keeps advancing `link`.
#pragma omp task firstprivate(link)
process(link);
}
// Implicit barrier at the end of the parallel region
}
One thread generates tasks, others execute them.
// Return the maximum value in the subtree rooted at n, using one OpenMP task
// per child subtree.
//
// A leaf (n->is_leaf) returns its own n->value. Otherwise the left and right
// subtrees are each searched in a separate task and the larger result is
// returned.
//
// n: root of the subtree to search; dereferenced unconditionally, so it must
//    not be null. The right child is read as n->r
//    (NOTE(review): node_t is declared elsewhere — confirm the field name).
int tree_max(node_t* n)
{
    int lmax, rmax;
    if (n->is_leaf)
        return n->value;
    // lmax/rmax are locals of this call, so they must be shared explicitly
    // for the child tasks' results to be visible here.
    #pragma omp task shared(lmax)
    lmax = tree_max(n->l);
    #pragma omp task shared(rmax)
    rmax = tree_max(n->r);  // right subtree (previously recursed on n->l again)
    // Both results are written by child tasks; wait for them before reading,
    // and before lmax/rmax go out of scope.
    #pragma omp taskwait
    return max(lmax, rmax);
}
The taskwait waits for all child tasks.
// Task-parallel quicksort step over the index range described by lo and hi.
//
// Returns immediately when lo >= hi. Otherwise partitions the range with
// partition(a, lo, hi) and spawns one OpenMP task for each side of the
// returned split point p.
// NOTE(review): whether hi is inclusive, and where p lands relative to the
// two halves, is defined by partition(), which is declared elsewhere.
//
// a:      array being sorted in place.
// lo, hi: bounds of the range to sort.
//
// This function does not wait for the tasks it creates; the caller must
// reach a barrier or taskwait before relying on the result.
void omp_qsort(int* a, int lo, int hi)
{
    if (lo >= hi) return;
    const int p = partition(a, lo, hi);
    // firstprivate(a): each task gets its own copy of the pointer. Sharing
    // the parameter `a` itself would leave the tasks referring to a variable
    // of this call, which may have returned before the tasks run because
    // there is no taskwait here. The array that `a` points to is still
    // shared by all tasks.
    #pragma omp task firstprivate(a)
    omp_qsort(a, lo, p);
    #pragma omp task firstprivate(a)
    omp_qsort(a, p, hi);
}
// Entry point for the task-parallel quicksort: opens a parallel region and
// has a single thread start the recursion in omp_qsort(a, lo, hi).
//
// a:      array to sort in place.
// lo, hi: bounds of the range to sort, passed through to omp_qsort.
void call_qsort(int* a, int lo, int hi)
{
#pragma omp parallel
{
// Only one thread starts the recursion. This single construct has no
// nowait clause, so it ends with an implicit barrier.
#pragma omp single
omp_qsort(a, lo, hi);
// Executed by every thread in the team after the single construct.
// NOTE(review): tasks already complete at the barrier that ends the single
// construct, so this taskwait looks redundant — confirm before removing.
#pragma omp taskwait
}
}
What happens if one task produces what another needs?
// State of a single particle in the simulation, plus an intrusive link used
// to chain particles together in a hash table.
struct particle_t {
float rho; // Density
float x[3]; // Position (three components)
float v[3]; // Velocity (full step)
float vh[3]; // Velocity (half step)
float a[3]; // Accelerations
particle_t* next; // Link for hashing: next particle in the chain (presumably null at the end — confirm)
};
// Whole-simulation state: the particle mass, the particle storage, and a
// hash table of particle pointers.
struct sim_state_t {
float mass; // Particle mass
std::vector<particle_t> part; // Particles (owned storage)
// Hash table of particle pointers; chains continue through particle_t::next.
// NOTE(review): presumably these point into `part`, in which case resizing
// `part` would invalidate them — confirm against the code that fills it.
std::vector<particle_t*> hash; // Hash table
};
Equivalent to height-balanced quadtree/octree
Computation involves several different steps:
How much does each cost? Scaling?
How do we decompose the problem?
How do we synchronize force computation?
No blue cell neighbors another blue.