How can I safely thread the nested for-loops below so that the programme runs in parallel across 8 threads while still writing its output in the correct order? I have tried adding a #pragma omp for directive, but that gives me the error: "work-sharing region may not be closely nested inside of work-sharing, critical or explicit task region".
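In case it matters, this is roughly how I am requesting the 8 threads (a minimal sketch only; my real set-up may differ slightly, and setting OMP_NUM_THREADS=8 in the environment should be equivalent):

#include <omp.h>
#include <iostream>

int main()
{
    // Sketch only: assumed set-up, not the actual programme.
    // Request 8 threads for subsequent parallel regions.
    omp_set_num_threads(8);

    #pragma omp parallel
    {
        // print the team size once
        #pragma omp single
        std::cout << "running with " << omp_get_num_threads() << " threads\n";
    }
    return 0;
}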
Note: the code below comes from an introduction to parallel programming course, so it is deliberately written in a naive, unoptimised style; optimising it is the exercise.
// vi and vr are nx-by-ny grids declared elsewhere in the programme
#pragma omp parallel private(t, i, j) shared(nx, ny, nt)
{
    // main time-step loop
    for (int t = 0; t < nt; t++)
    {
        cout << "\n" << t;
        cout.flush();

        // first block: average each interior point from its four
        // neighbours; edge points use fixed boundary values
        for (int i = 0; i < nx; i++)
        {
            for (int j = 0; j < ny; j++)
            {
                if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1)
                {
                    vr[i][j] = (vi[i+1][j] + vi[i-1][j] + vi[i][j-1] + vi[i][j+1]) / 4.;
                }
                else if (i == 0 && i < nx - 1 && j > 0 && j < ny - 1)
                {
                    vr[i][j] = (vi[i+1][j] + 10. + vi[i][j-1] + vi[i][j+1]) / 4.;
                }
                else if (i > 0 && i == nx - 1 && j > 0 && j < ny - 1)
                {
                    vr[i][j] = (5. + vi[i-1][j] + vi[i][j-1] + vi[i][j+1]) / 4.;
                }
                else if (i > 0 && i < nx - 1 && j == 0 && j < ny - 1)
                {
                    vr[i][j] = (vi[i+1][j] + vi[i-1][j] + 15.45 + vi[i][j+1]) / 4.;
                }
                else if (i > 0 && i < nx - 1 && j > 0 && j == ny - 1)
                {
                    vr[i][j] = (vi[i+1][j] + vi[i-1][j] + vi[i][j-1] - 6.7) / 4.;
                }
            }
        }

        // second block: log points whose old and new magnitudes agree
        // to within 1e-2 (this output must stay in order)
        for (int i = 0; i < nx; i++)
        {
            for (int j = 0; j < ny; j++)
            {
                if (fabs(fabs(vr[i][j]) - fabs(vi[i][j])) < 1e-2)
                {
                    fout << "\n" << t << " " << i << " " << j << " "
                         << fabs(vi[i][j]) << " " << fabs(vr[i][j]);
                }
            }
        }

        // third block: relax vi halfway towards vr
        #pragma omp for schedule(static, 100)
        for (int i = 0; i < nx; i++)
        {
            for (int j = 0; j < ny; j++)
            {
                vi[i][j] = vi[i][j] / 2. + vr[i][j] / 2.;
            }
        }
    }
}
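As I understand it, the "work-sharing region may not be closely nested" message refers to a #pragma omp for that sits directly inside another work-sharing (or critical/task) construct, roughly like the sketch below (my reconstruction of the placement that fails, not the code above):

#include <omp.h>

// Sketch only: an assumed reproduction of the diagnostic, not the real code.
// The compiler rejects this at the inner pragma.
void sketch(int n, double** a)
{
    #pragma omp parallel
    {
        #pragma omp for
        for (int i = 0; i < n; i++)
        {
            // a work-sharing loop closely nested inside another
            // work-sharing loop is what triggers the error
            #pragma omp for
            for (int j = 0; j < n; j++)
            {
                a[i][j] = 0.0;
            }
        }
    }
}

Given that restriction, where should the pragmas go in the code above so that the loops are shared across the 8 threads but the fout output still comes out in t, i, j order?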