I am trying to parallelise these recursive functions with openMP tasks,
when I compile with gcc it runs only on 1 thread. When i compile it with clang it runs on multiple threads
The second function calls the first one which doesn't generate new tasks to stop wasting time.
gcc does work when there is only one function that calls itself.
Why is this?
Am I doing something wrong in the code?
Then why does it work with clang?
I am using gcc 9.3 on windows with Msys2. The code was compiled with -O3 -fopenmp
//the program compiled by gcc only runs on one thread
#include<vector>
#include<omp.h>
#include<iostream>
#include<ctime>
using namespace std;
vector<int> vec;
thread_local double steps;
void excalibur(int current_node, int current_depth) {
#pragma omp simd
for( int i = 0 ; i < current_node; i++){
++steps;
excalibur(i, current_depth);
}
if(current_depth > 0){
int new_depth = current_depth - 1;
#pragma omp simd
for(int i = current_node;i <= vec[current_node];i++){
++steps;
excalibur(i + 1,new_depth);
}
}
}
void mario( int current_node, int current_depth) {
#pragma omp task firstprivate(current_node,current_depth)
{
if(current_depth > 0){
int new_depth = current_depth - 1;
for(int i = current_node;i <= vec[current_node];i++){
++steps;
mario(i + 1,new_depth);
}
}
}
#pragma omp simd
for( int i = 0 ; i < current_node; i++){
++steps;
excalibur(i, current_depth);
}
}
int main() {
double total = 0;
clock_t tim = clock();
omp_set_dynamic(0);
int nodes = 10;
int timesteps = 3;
omp_set_num_threads(4);
vec.assign( nodes, nodes - 2 );
#pragma omp parallel
{
steps = 0;
#pragma omp single
{
mario(nodes - 1, timesteps - 1);
}
#pragma omp atomic
total += steps;
}
double time_taken = (double)(tim) / CLOCKS_PER_SEC;
cout <<fixed<<total<<" steps, "<< fixed << time_taken << " seconds"<<endl;
return 0;
}
while this works with gcc
#include<vector>
#include<omp.h>
#include<iostream>
#include<ctime>
using namespace std;
vector<int> vec;
thread_local double steps;
void mario( int current_node, int current_depth) {
#pragma omp task firstprivate(current_node,current_depth)
{
if(current_depth > 0){
int new_depth = current_depth - 1;
for(int i = current_node;i <= vec[current_node];i++){
++steps;
mario(i + 1,new_depth);
}
}
}
#pragma omp simd
for( int i = 0 ; i < current_node; i++){
++steps;
mario(i, current_depth);
}
}
int main() {
double total = 0;
clock_t tim = clock();
omp_set_dynamic(0);
int nodes = 10;
int timesteps = 3;
omp_set_num_threads(4);
vec.assign( nodes, nodes - 2 );
#pragma omp parallel
{
steps = 0;
#pragma omp single
{
mario(nodes - 1, timesteps - 1);
}
#pragma omp atomic
total += steps;
}
double time_taken = (double)(tim) / CLOCKS_PER_SEC;
cout <<fixed<<total<<" steps, "<< fixed << time_taken << " seconds"<<endl;
return 0;
}