Hello all, I recently encountered a strange performance drop in a test code. I have two versions of the same code (I believe so, at least): one parallelized with OpenMP pragmas and the other manually parallelized with pthreads. The test machine has an Intel Core i7 920 processor (x86_64, 4 cores, Hyper-Threading enabled). Both versions are compiled with the -O2 option.
gcc -v gives me:
Utilisation des specs internes.
Target: x86_64-unknown-linux-gnu
Configuré avec: ../configure --prefix=/usr --enable-shared --enable-languages=c,c++,fortran,objc,obj-c++ --enable-threads=posix --mandir=/usr/share/man --infodir=/usr/share/info --enable-__cxa_atexit --disable-multilib --libdir=/usr/lib --libexecdir=/usr/lib --enable-clocale=gnu --disable-libstdcxx-pch --with-tune=generic
Modèle de thread: posix
gcc version 4.4.0 (GCC)
uname -a:
Linux 2.6.29-ARCH #1 SMP PREEMPT Sat May 9 14:09:36 CEST 2009 x86_64 Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz GenuineIntel GNU/Linux
The problem appears when at least one core is heavily loaded. I simulated the machine load using a simple, infinite program that runs multiplications on random parts of an 8MB array. I obtain very consistent results with both versions, except when I use exactly 8 threads. In this special case, the performance of the pthread version is still very consistent, but the OpenMP version suffers from a huge performance drop: the execution time is multiplied by a factor of around 5 compared to the runtime with 7 or 9 threads. What surprised me is that this performance drop appears neither in the pthread version nor when using 7 or 9 threads with the OpenMP version. The problem only appears when at least one core is heavily loaded. Stop me if I'm wrong, but I think that both code versions are pretty similar. Could someone help me with that performance drop? Am I doing something wrong with OpenMP? Why does this performance drop only happen when using exactly 8 threads? Thank you in advance, Benoit Pradelle.
/*
 * OpenMP variant of the row-update benchmark.
 *
 * Usage: prog [nb_threads]
 *
 * A and C are N rows of (N + PAD) doubles.  For each row i, every element
 * A[i][j] is rewritten from A[i + 1][j + 5]; the columns j of a row are
 * shared out between the threads.  The same computation is then replayed
 * sequentially on C and both arrays are compared.
 *
 * Exit status: EXIT_SUCCESS when the parallel and sequential results match,
 * EXIT_FAILURE on a mismatch or on an invalid thread-count argument.
 */
#include <omp.h>
#include <errno.h>
#include <limits.h>
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define N 1024
#define PAD 17

double A[N * (N + PAD)];
double C[N * (N + PAD)];

/*
 * Parse `text` as a strictly positive decimal thread count.
 * Stores the value in *out and returns 0 on success; returns -1 (leaving
 * *out untouched) when the text is empty, has trailing characters, is out
 * of range, or is smaller than 1.
 */
static int parse_thread_count(const char *text, int *out)
{
    char *end;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE
        || value < 1 || value > INT_MAX) {
        return -1;
    }
    *out = (int) value;
    return 0;
}

int main(int argc, char **argv)
{
    int i, j, k;
    int failed = 0;

    /* Keep the team size fixed: the runtime may not shrink it dynamically. */
    omp_set_dynamic(0);

    /*
     * The thread count is only forced when an argument is given; without
     * one, the OpenMP runtime's default team size is used.
     */
    if (argc > 1) {
        int nb_ths;

        /* omp_set_num_threads() requires a positive value, so reject
         * anything else instead of passing it through. */
        if (parse_thread_count(argv[1], &nb_ths) != 0) {
            fprintf(stderr, "usage: %s [nb_threads >= 1]\n", argv[0]);
            return EXIT_FAILURE;
        }
        omp_set_num_threads(nb_ths);
    }

    /* initialization (the PAD columns keep their static zero value) */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            A[i * (N + PAD) + j] = i * j;
            C[i * (N + PAD) + j] = i * j;
        }
    }

    /*
     * Parallel kernel.  Every thread runs the whole i loop; only the j loop
     * is work-shared.  The implicit barrier at the end of each `omp for` is
     * required: row i + 1 is read while processing row i and must not be
     * overwritten by a thread that is already one row ahead.
     *
     * NOTE(review): this means N - 1 barriers per run.  How threads wait at
     * that barrier (spinning vs. sleeping) is chosen by the OpenMP runtime,
     * e.g. through OMP_WAIT_POLICY; presumably this matters when the machine
     * is loaded by other processes -- to be confirmed by measurement.
     */
#pragma omp parallel private(i, j, k) shared(A)
    {
        for (i = 0; i < N - 1; i++) {
#pragma omp for
            for (j = 0; j < N; j++) {
                for (k = 0; k < N; k++) {
                    A[i * (N + PAD) + j] =
                        A[(i + 1) * (N + PAD) + j + 5] + j * i - k;
                }
            }
        }
    }

    /* checking: sequential reference computation on C */
    for (i = 0; i < N - 1; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                C[i * (N + PAD) + j] =
                    C[(i + 1) * (N + PAD) + j + 5] + j * i - k;
            }
        }
    }

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (A[i * (N + PAD) + j] != C[i * (N + PAD) + j]) {
                printf("CHECK FAILED at %d %d\n", i, j);
                failed = 1;
            }
        }
    }

    /* Report a mismatch through the exit status as well as on stdout. */
    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}
/*
 * pthread variant of the row-update benchmark.
 *
 * Usage: prog [nb_threads]     (default: 8 threads)
 *
 * A and C are N rows of (N + PAD) doubles.  For each row i, every element
 * A[i][j] is rewritten from A[i + 1][j + 5]; the columns j of a row are
 * split into one contiguous slice per thread and the threads meet at a
 * barrier after every row.  The same computation is then replayed
 * sequentially on C and both arrays are compared.
 *
 * Exit status: EXIT_SUCCESS when the parallel and sequential results match,
 * EXIT_FAILURE on a mismatch, an invalid argument, or a resource failure.
 */
#include <pthread.h>
#include <errno.h>
#include <limits.h>
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

#define N 1024
#define PAD 17

double A[N * (N + PAD)];
double C[N * (N + PAD)];

/* thread attributes */
typedef struct {
    unsigned int id;     /* index of this thread, 0 .. nb_ths - 1 */
    unsigned int nb_ths; /* total number of worker threads */
} tattrs;

/* barrier stuff */
static unsigned int limit;      /* number of threads expected per round */
static unsigned int count;      /* threads arrived in the current round */
static unsigned int generation; /* incremented each time a round completes */
static pthread_mutex_t lock;
static pthread_cond_t cond;

/* initialize the barrier for `lim` participating threads */
void init_barrier(unsigned int lim)
{
    limit = lim;
    count = 0;
    generation = 0;
    pthread_mutex_init(&lock, NULL);
    pthread_cond_init(&cond, NULL);
}

/*
 * wait for a barrier
 *
 * Blocks until `limit` threads have called this function.  The last thread
 * to arrive opens the barrier and resets it for the next round.
 */
void wait_barrier(void)
{
    pthread_mutex_lock(&lock);
    count++;
    if (count >= limit) {
        /* Last arrival: start a new round and release everybody. */
        count = 0;
        generation++;
        pthread_cond_broadcast(&cond);
    } else {
        /*
         * pthread_cond_wait() may return spuriously, so wait on a predicate:
         * only a change of `generation` means the round really completed.
         */
        unsigned int my_generation = generation;

        while (my_generation == generation) {
            pthread_cond_wait(&cond, &lock);
        }
    }
    pthread_mutex_unlock(&lock);
}

/*
 * pthread thread routine
 *
 * `args` points to the tattrs of this thread.  The thread handles columns
 * [jmin, jmax) of every row; the last thread also takes the remainder left
 * by the integer division.  Always returns NULL.
 */
void *routine(void *args)
{
    const tattrs *ctx = args;
    unsigned int i, j, k;

    /* bound computation: does not depend on the row, so done once */
    const unsigned int chunk = N / ctx->nb_ths;
    const unsigned int jmin = ctx->id * chunk;
    const unsigned int jmax =
        (ctx->id == ctx->nb_ths - 1) ? N : jmin + chunk;

    for (i = 0; i < N - 1; i++) {
        for (j = jmin; j < jmax; j++) {
            for (k = 0; k < N; k++) {
                A[i * (N + PAD) + j] =
                    A[(i + 1) * (N + PAD) + j + 5] + j * i - k;
            }
        }
        /* Row i + 1 is read above; nobody may start rewriting it before
         * every thread is done with row i. */
        wait_barrier();
    }
    return NULL;
}

/*
 * Parse `text` as a strictly positive decimal thread count.
 * Stores the value in *out and returns 0 on success; returns -1 (leaving
 * *out untouched) when the text is empty, has trailing characters, is out
 * of range, or is smaller than 1.
 */
static int parse_thread_count(const char *text, unsigned int *out)
{
    char *end;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE
        || value < 1 || value > INT_MAX) {
        return -1;
    }
    *out = (unsigned int) value;
    return 0;
}

int main(int argc, char **argv)
{
    pthread_t *tids;
    tattrs *ctx;
    unsigned int nb_ths = 8;
    unsigned int i, j, k;
    int failed = 0;

    if (argc > 1 && parse_thread_count(argv[1], &nb_ths) != 0) {
        fprintf(stderr, "usage: %s [nb_threads >= 1]\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* initialization (the PAD columns keep their static zero value) */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            A[i * (N + PAD) + j] = i * j;
            C[i * (N + PAD) + j] = i * j;
        }
    }

    init_barrier(nb_ths);

    /* calloc checks the count * size multiplication for overflow */
    tids = calloc(nb_ths, sizeof(*tids));
    ctx = calloc(nb_ths, sizeof(*ctx));
    if (tids == NULL || ctx == NULL) {
        fprintf(stderr, "cannot allocate data for %u threads\n", nb_ths);
        free(tids);
        free(ctx);
        return EXIT_FAILURE;
    }

    /* run */
    for (i = 0; i < nb_ths; i++) {
        int rc;

        ctx[i].id = i;
        ctx[i].nb_ths = nb_ths;
        rc = pthread_create(&tids[i], NULL, routine, &ctx[i]);
        if (rc != 0) {
            /*
             * The threads already started would wait forever at the barrier
             * for the missing ones, so joining them is not an option:
             * terminate the whole process.
             */
            fprintf(stderr, "pthread_create failed for thread %u (error %d)\n",
                    i, rc);
            exit(EXIT_FAILURE);
        }
    }
    for (i = 0; i < nb_ths; i++) {
        pthread_join(tids[i], NULL);
    }
    free(tids);
    free(ctx);

    /* checking: sequential reference computation on C */
    for (i = 0; i < N - 1; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                C[i * (N + PAD) + j] =
                    C[(i + 1) * (N + PAD) + j + 5] + j * i - k;
            }
        }
    }

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (A[i * (N + PAD) + j] != C[i * (N + PAD) + j]) {
                /* i and j are unsigned: %u is the matching conversion */
                printf("CHECK FAILED at %u %u\n", i, j);
                failed = 1;
            }
        }
    }

    /* Report a mismatch through the exit status as well as on stdout. */
    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}