Using the benchmark below, the solution of @tstanisl is indeed much better.
```cpp
#include <algorithm> // std::min, used in do_matrix_calculation3
#include <benchmark/benchmark.h>
#include <memory>

#include <omp.h>

// Chunk size for the schedule clauses below. Note that it is counted in
// elements, so 64 doubles span 512 bytes, i.e. 8 cache lines of 64 bytes.
constexpr int cacheline_size{64};

double do_something(const double v)
{
    return v * 2;
}

void do_matrix_calculation(double* A, const int column, const int row)
{
    for (int i = 1; i < row; i++)
    {
        // I put the column iteration in the inner loop to reduce cache
        // misses, and I schedule the same thread on the same cache lines
        // to avoid false sharing.
        #pragma omp parallel for schedule(static, cacheline_size)
        for (int j = 0; j < column; j++)
        {
            A[i * column + j] = do_something(A[(i - 1) * column + j]);
        }
    }
}
void BM_matrix_calculation(benchmark::State& state)
{
    const int n_threads = state.range(0);
    const int cols = state.range(1);
    const int rows = state.range(2);
    std::unique_ptr<double[]> arr = std::make_unique<double[]>(cols * rows);
    omp_set_num_threads(n_threads);
    for (auto _ : state)
    {
        do_matrix_calculation(arr.get(), cols, rows);
    }
}
void do_matrix_calculation2(double* A, const int column, const int row)
{
    // collapse(2) flattens both loops into one iteration space. Beware
    // that this ignores the dependency of row i on row i-1, so the
    // results are not guaranteed to be correct; it is included here
    // only for the timing comparison.
    #pragma omp parallel for schedule(static, cacheline_size) collapse(2)
    for (int i = 1; i < row; i++)
    {
        for (int j = 0; j < column; j++)
        {
            A[i * column + j] = do_something(A[(i - 1) * column + j]);
        }
    }
}
void BM_matrix_calculation2(benchmark::State& state)
{
    const int n_threads = state.range(0);
    const int cols = state.range(1);
    const int rows = state.range(2);
    std::unique_ptr<double[]> arr = std::make_unique<double[]>(cols * rows);
    omp_set_num_threads(n_threads);
    for (auto _ : state)
    {
        do_matrix_calculation2(arr.get(), cols, rows);
    }
}
void do_matrix_calculation3(double* A, const int column, const int row)
{
    // @tstanisl's version: each thread owns a contiguous block of
    // columns and walks down all rows within it, so threads never
    // touch each other's cache lines.
    #pragma omp parallel for
    for (int j0 = 0; j0 < column; j0 += cacheline_size)
    {
        const int j1 = std::min(j0 + cacheline_size, column);
        for (int i = 1; i < row; i++)
        {
            for (int j = j0; j < j1; ++j)
                A[i * column + j] = do_something(A[(i - 1) * column + j]);
        }
    }
}
void BM_matrix_calculation3(benchmark::State& state)
{
    const int n_threads = state.range(0);
    const int cols = state.range(1);
    const int rows = state.range(2);
    std::unique_ptr<double[]> arr = std::make_unique<double[]>(cols * rows);
    omp_set_num_threads(n_threads);
    for (auto _ : state)
    {
        do_matrix_calculation3(arr.get(), cols, rows);
    }
}
// Args: {n_threads, cols, rows}
BENCHMARK(BM_matrix_calculation)
    ->Args({1, 1024, 1024})
    ->Args({2, 1024, 1024})
    ->Args({4, 1024, 1024})
    ->Args({8, 1024, 1024});
BENCHMARK(BM_matrix_calculation2)
    ->Args({1, 1024, 1024})
    ->Args({2, 1024, 1024})
    ->Args({4, 1024, 1024})
    ->Args({8, 1024, 1024});
BENCHMARK(BM_matrix_calculation3)
    ->Args({1, 1024, 1024})
    ->Args({2, 1024, 1024})
    ->Args({4, 1024, 1024})
    ->Args({8, 1024, 1024});
BENCHMARK_MAIN();
```
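(Assuming Google Benchmark and an OpenMP-capable compiler are installed, something along the lines of `g++ -O3 -fopenmp bench.cpp -lbenchmark -lpthread` should build this; the exact flags depend on your setup.)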
The difference in performance between BM_matrix_calculation and BM_matrix_calculation2 seems to be sensitive to the cache-line size; of course, this needs to be tuned for your particular machine.
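If you would rather not hard-code the 64, C++17 exposes a portable hint for it. A minimal sketch (my addition, not part of the benchmark above; it assumes a standard library that actually defines the interference-size constants, e.g. GCC 12 or newer):

```cpp
#include <cstddef>
#include <cstdio>
#include <new> // std::hardware_destructive_interference_size (C++17)

int main()
{
    // Minimum offset between two objects needed to avoid false sharing;
    // dividing by sizeof(double) turns it into a chunk size in elements.
    constexpr std::size_t chunk =
        std::hardware_destructive_interference_size / sizeof(double);
    std::printf("suggested chunk: %zu doubles\n", chunk);
}
```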
From the comments:

> What does `do_something` do? How many rows and columns are there? What is the type of the items in `A`? How long does the sequential loop take (1 µs, 1 ms, 1 s, 1 h)?

> If `do_something` is simple, this code is almost certainly memory-bandwidth bound, so adding threads won't help. (Note, also, that any sane OpenMP runtime will not be creating and destroying threads here, since it will use a thread pool.)
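To put a rough number on the bandwidth-bound claim (my own back-of-the-envelope sketch, with an assumed bandwidth figure): one pass over the 1024×1024 double matrix reads about 8 MiB and writes about 8 MiB, so each pass moves roughly 16 MiB no matter how many threads run.

```cpp
#include <cstdio>

int main()
{
    const double rows = 1024.0, cols = 1024.0;
    const double bytes = 2.0 * rows * cols * sizeof(double); // read + write
    const double bw    = 20e9; // hypothetical 20 GB/s sustained bandwidth
    // Lower bound on the time per pass if the loop is purely bandwidth bound.
    std::printf("~%.2f ms per pass at 20 GB/s\n", 1e3 * bytes / bw);
}
```

If the measured time per iteration is already close to that bound, adding threads cannot buy more throughput.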