I think I managed to obtain a lower bound for the maximum number of iterations a loop can have while serial execution still outperforms parallel execution. However, I'd like you to kindly comment on it, so we can see whether there is a better way to find the right tradeoff — the estimated number of loop-body iterations below which it is not useful to use a parallel for region. (Many thanks!)
Let say:
N: is the number of threads in the parallel region running the loop.
R: is the cost of opening and closing a parallel region (measured in number of loop iterations instead of nanoseconds, for the sake of simplicity).
S: is the break-even (tradeoff) number of iterations: for loops longer than S, serial execution would take at least as much time as parallel execution (including the parallel-region overhead), so parallelizing becomes worthwhile.
So basically we are looking for S such that : parallel execution time = serial execution time. which would be: R + S/N = S.
Solving for S gives: S = R × N / (N − 1). So I tested the idea using the code below:
Where I do the below steps:
- lines 78-96: the cost of creating/forking and finalizing/joining a parallel region is measured.
- lines 99-111: the cost one iteration of the loop is measured.
- lines 113-116 : the tradeoff is obtained using the formula discussed above.
- line 119 : I run a test just to compare the parallel vs serial execution.
#include <iostream>
#include <chrono>
#include <cmath>   // std::sqrt, std::sin — previously relied on a transitive include
#include <cstdlib> // malloc, calloc, free — previously relied on a transitive include
#include <omp.h>
// #include <ctime>
// #define dtype float
// #define NROWS 1350 // float
#define dtype double
#define NUMEL 1000 // double
#define NPAR_REGION 100000
#define TEST_REP 1000
#define NREP 1000
#define printvar(x) std::cout << #x << " = " << x << std::endl;
#define printline std::cout << "LINE: " << __LINE__ << std::endl;
// #pragma omp declare simd
/// The synthetic per-iteration workload used by every benchmark in this file:
/// sqrt(x*sin(x)/(1+y)).
///
/// @param x  input value; x*sin(x)/(1+y) must be >= 0 for a finite result
/// @param y  input value; must not be -1 (division by zero otherwise)
/// @return   sqrt(x*sin(x)/(1+y))
template <typename T>
T loop_body_func(T x, T y){
    // std:: qualification (with <cmath>) picks the correct overload for T;
    // the original's unqualified sqrt/sin compiled only via transitive <math.h>.
    return std::sqrt(x * std::sin(x) / (1 + y));
}
using namespace std;

/// Times the same loop twice — once under `#pragma omp parallel for`, once
/// serially — and prints each elapsed wall time in nanoseconds, so the
/// break-even point computed in main() can be sanity-checked.
///
/// @param numel  iterations per inner loop (also the array length)
/// @param nrep   how many times the whole inner loop is repeated
/// @return 0 always (also on allocation failure, after printing nothing)
int compare_seri_vs_para(int numel, int nrep){
    dtype *m  = (dtype*) malloc(sizeof(dtype)*numel);
    dtype *n  = (dtype*) malloc(sizeof(dtype)*numel);
    dtype *mn = (dtype*) malloc(sizeof(dtype)*numel);
    if (!m || !n || !mn) { free(m); free(n); free(mn); return 0; }
    // BUGFIX: the original timed loop_body_func on *uninitialized* memory —
    // undefined behavior, and garbage inputs can produce NaNs/denormals in
    // sqrt/sin, which makes the two timings incomparable. Use benign inputs
    // (m > 0, n >= 0 so 1+n != 0 and the sqrt argument stays non-negative).
    for(int i = 0; i < numel; i++){
        m[i] = (dtype)(i % 7) + 1;
        n[i] = (dtype)(i % 5);
    }
    // --- parallel timing ---
    chrono::steady_clock::time_point st = chrono::steady_clock::now();
    #pragma noparallel
    for(int k = 0; k < nrep; k++){
        // `if (true)` is the runtime switch this whole experiment is about:
        // replace `true` with `numel > tradeoff` to choose serial vs parallel.
        #pragma omp parallel for if (true)
        #pragma novector
        for(int i = 0; i < numel; i++){
            mn[i] = loop_body_func(m[i], n[i]);
        }
    }
    chrono::steady_clock::time_point en = chrono::steady_clock::now();
    std::cout << "parallel ellapsed time: \t" <<
        chrono::duration_cast<chrono::nanoseconds>(en - st).count() << std::endl;
    // --- serial timing of the identical loop ---
    st = chrono::steady_clock::now();
    #pragma noparallel
    for(int k = 0; k < nrep; k++){
        #pragma noparallel
        #pragma novector
        for(int i = 0; i < numel; i++){
            mn[i] = loop_body_func(m[i], n[i]);
        }
    }
    en = chrono::steady_clock::now();
    std::cout << "serial ellapsed time: \t\t" <<
        chrono::duration_cast<chrono::nanoseconds>(en - st).count() << std::endl;
    free(m); free(n); free(mn);
    return 0;
}
int main(){
printline
int nthread = omp_get_max_threads ();
printvar(nthread)
// chrono::steady_clock::time_point *thread_cost =
// (chrono::steady_clock::time_point *) calloc(nthread, sizeof(chrono::steady_clock::time_point));
double *thread_cost = (double*) calloc(nthread, sizeof(double));
chrono::steady_clock::time_point sc = chrono::steady_clock::now();
#pragma noparallel
#pragma novector
for(int i = 0; i < NPAR_REGION; i++){
#pragma omp parallel
{
int tid = omp_get_thread_num();
thread_cost[tid] += tid; // a dummy task
// chrono::duration_cast<chrono::nanoseconds>(chrono::steady_clock::now() - sc).count()
}
}
printline
double paralell_region_cost = chrono::duration_cast<chrono::nanoseconds>(chrono::steady_clock::now() - sc).count();
// for(int i = 0; i < nthread; i++)
// paralell_region_cost+= thread_cost[i];
paralell_region_cost /= NPAR_REGION;
printvar(paralell_region_cost)
printline
dtype *m = (dtype*) malloc(sizeof(dtype)*TEST_REP) ;
dtype *n = (dtype*) malloc(sizeof(dtype)*TEST_REP) ;
dtype *mn = (dtype*) malloc(sizeof(dtype)*TEST_REP) ;
sc = chrono::steady_clock::now();
#pragma noparallel
#pragma novector
for(int i = 0; i < TEST_REP; i++){
mn[i] = loop_body_func(m[i], n[i]);
}
double loop_body_cost = \
chrono::duration_cast<chrono::nanoseconds>(chrono::steady_clock::now() - sc).count()/TEST_REP;
printvar(loop_body_cost)
printline
int loop_parallel_serial_tradeoff = (paralell_region_cost / loop_body_cost)*(nthread/(double(nthread)-1.0));
printline
printvar(loop_parallel_serial_tradeoff)
printline
compare_seri_vs_para(loop_parallel_serial_tradeoff, NREP);
free(m); free(n); free(mn);
printvar("-=program ended!=-")
}
I compiled my code snippet using the below command line:
icl.exe -Qopt-report:5 -Qopt-report-phase:all /Ob0 -Qopenmp -Qsimd -Qopenmp-simd -arch:avx2 -Qdiag-error-limit:5 -c omp_if_test.cpp -Fo:omp_if_test.obj
Intel(R) C++ Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.2.254 Build 20200623
Copyright (C) 1985-2020 Intel Corporation. All rights reserved.
icl: remark #10397: optimization reports are generated in *.optrpt files in the output location
omp_if_test.cpp
xilink.exe omp_if_test.obj -LIBPATH:../../Debug/lib -out:omp_if_test.exe
xilink: executing 'link'
Microsoft (R) Incremental Linker Version 14.28.29913.0
Copyright (C) Microsoft Corporation. All rights reserved.
omp_if_test.obj
-LIBPATH:../../Debug/lib
-out:omp_if_test.exe
Finally, here is the output, which is a bit odd: serial execution always outperforms parallel execution, by almost 3-fold (lol) — I was expecting the results to fluctuate around the break-even point.
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2053.75
LINE: 97
loop_body_cost = 32
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 73
LINE: 118
parallel ellapsed time: 3392300
serial ellapsed time: 1141000
"-=program ended!=-" = -=program ended!=-
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2304
LINE: 97
loop_body_cost = 33
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 79
LINE: 118
parallel ellapsed time: 3275000
serial ellapsed time: 1216000
"-=program ended!=-" = -=program ended!=-
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2132.9
LINE: 97
loop_body_cost = 32
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 76
LINE: 118
parallel ellapsed time: 3337900
serial ellapsed time: 1123100
"-=program ended!=-" = -=program ended!=-
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2062.77
LINE: 97
loop_body_cost = 32
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 73
LINE: 118
parallel ellapsed time: 3739700
serial ellapsed time: 1118400
"-=program ended!=-" = -=program ended!=-
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2121.74
LINE: 97
loop_body_cost = 91
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 26
LINE: 118
parallel ellapsed time: 5627200
serial ellapsed time: 356300
"-=program ended!=-" = -=program ended!=-
So it seems I obtained a lower bound for the tradeoff value as opposed to the actual value.
I would appreciate if you kindly comment on that for us.
Regards
`#pragma omp parallel for if (...)` — where the condition is a runtime one. So you can decide at run time whether to execute a region in parallel or not.