By default, mpirun binds a.out to a single core, so when you run

OMP_NUM_THREADS=2 mpirun -np 1 a.out

the two OpenMP threads end up time-sharing that one core.


You can confirm that by running

grep Cpus_allowed_list /proc/self/status

mpirun -np 1 grep Cpus_allowed_list /proc/self/status


Here is what I get:

[gilles@c7]$ grep Cpus_allowed_list /proc/self/status
Cpus_allowed_list:    0-3
[gilles@c7]$ ~/local/ompi-v1.8/bin/mpirun -np 1 grep Cpus_allowed_list /proc/self/status
Cpus_allowed_list:    0

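If you want to see the same thing from inside the process, here is a minimal sketch (cpucheck.cpp is just an illustrative name; sched_getcpu() is a GNU extension, so this is glibc/Linux-specific):

// cpucheck.cpp -- each OpenMP thread reports the CPU it is running on
#include <sched.h>    // sched_getcpu(), GNU extension
#include <omp.h>
#include <cstdio>

int main ( )
{
    #pragma omp parallel
    printf("thread %d runs on cpu %d\n", omp_get_thread_num(), sched_getcpu());
    return 0;
}

mpic++ cpucheck.cpp -fopenmp -o cpucheck
OMP_NUM_THREADS=2 mpirun -np 1 ./cpucheck

With the default binding, both threads should report the same cpu; with --bind-to none they can land on different ones.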

The easiest workaround is to disable binding:

mpirun --bind-to none ...

[gilles@c7]$ ~/local/ompi-v1.8/bin/mpirun -np 1 --bind-to none grep Cpus_allowed_list /proc/self/status
Cpus_allowed_list:    0-3

There is definitely a better option (e.g. asking mpirun to allocate 2 cores per MPI task), but I cannot remember it offhand.
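If memory serves, it is the PE=n modifier of --map-by, which reserves and binds n processing elements per MPI task, e.g.

mpirun -np 1 --map-by slot:PE=2 a.out

(the older --cpus-per-proc 2 should be equivalent, but is deprecated in the 1.8 series). Please double-check the mpirun man page for your exact version.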


Cheers,


Gilles

On 6/8/2016 2:37 PM, Maxim Reshetnyak wrote:
Hello!

I have a problem with a hybrid MPI/OpenMP C++ code, which shows no OpenMP speedup on my local 4-core home computer.

Open MPI downloaded from www.open-mpi.org/

mpirun -V
mpirun (Open MPI) 1.8.1
Compiled from source.

Ubuntu 14.04

// ===

//main.c

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <cmath>
#include <iostream>
#include <fstream>
#include <mpi.h>

int main ( int argc, char *argv[] )
{
    int my_rank = 0;

    MPI::Init (argc, argv);
    int proc_num = MPI::COMM_WORLD.Get_size ( );   // unused below
    my_rank = MPI::COMM_WORLD.Get_rank ( );

    double GG;
    int i1, i2, i3, N = 60;
    for (int ITER = 1; ITER <= 10000; ITER++) {

        // split the outer loop among the OpenMP threads
        #pragma omp parallel for private(i1,i2,i3,GG)
        for (i1 = 0; i1 < N; i1++) {
            // on the first iteration, report which thread got the
            // first and the last chunk of the loop
            if ( (i1==0 && ITER==1) || (i1==N-1 && ITER==1) )
                std::cout << my_rank << " " << omp_get_thread_num() << std::endl;
            for (i2 = 0; i2 < N; i2++) {
                for (i3 = 0; i3 < N; i3++)
                    GG = i1 + i2 + i3 + pow(i1+i2+i3, 2);
            }
        }
    }

    MPI::Finalize();
}

// ==

mpic++ main.c -fopenmp

echo "run 1 " ; export OMP_NUM_THREADS=1 ; time mpirun -np 1 a.out ; echo "run 2 " ; export OMP_NUM_THREADS=2 ; time mpirun -np 1 a.out

run 1
0 0
0 0

real    0m44.494s
user    0m43.594s
sys    0m0.320s

run 2
0 0
0 1

real    0m44.796s
user    0m43.813s
sys    0m0.360s

No acceleration!


But if I comment out the MPI lines:

//#include <mpi.h>

//MPI::Init (argc, argv);
//int proc_num = MPI::COMM_WORLD.Get_size ( );
//my_rank  = MPI::COMM_WORLD.Get_rank ( );

//MPI::Finalize();

compile:

g++ main.c -fopenmp

and run

echo "run 1 " ; export OMP_NUM_THREADS=1 ; time a.out ; echo "run 2 " ; export OMP_NUM_THREADS=2 ; time a.out

run 1
0 0
0 0

real    0m45.416s
user    0m45.402s
sys    0m0.025s

run 2
0 0
0 1

real    0m22.783s
user    0m45.517s
sys    0m0.016s

Then I do get acceleration.

Why don't I get it in the first case?

M.

lstopo -v

Machine (P#0 total=12240880KB DMIProductName=S5520SC DMIProductVersion=.................... DMIBoardVendor="Intel Corporation" DMIBoardName=S5520SC DMIBoardVersion="FRU Ver 0.05" DMIBoardAssetTag=.................... DMIChassisVendor=.............................. DMIChassisType=17 DMIChassisVersion=.................. DMIChassisAssetTag=.................... DMIBIOSVendor="Intel Corp." DMIBIOSVersion=S5500.86B.01.00.0038.060120091503 DMIBIOSDate=06/01/2009 DMISysVendor="Intel Corporation" Backend=Linux OSName=Linux OSRelease=3.13.0-87-generic OSVersion="#133-Ubuntu SMP Tue May 24 18:32:09 UTC 2016" HostName=mr Architecture=x86_64)
  NUMANode L#0 (P#0 local=6049180KB total=6049180KB)
Socket L#0 (P#0 CPUModel="Intel(R) Xeon(R) CPU E5502 @ 1.87GHz")
      L3Cache L#0 (size=4096KB linesize=64 ways=16)
        L2Cache L#0 (size=256KB linesize=64 ways=8)
          L1dCache L#0 (size=32KB linesize=64 ways=8)
            L1iCache L#0 (size=32KB linesize=64 ways=4)
              Core L#0 (P#0)
                PU L#0 (P#0)
        L2Cache L#1 (size=256KB linesize=64 ways=8)
          L1dCache L#1 (size=32KB linesize=64 ways=8)
            L1iCache L#1 (size=32KB linesize=64 ways=4)
              Core L#1 (P#2)
                PU L#1 (P#2)
  NUMANode L#1 (P#1 local=6191700KB total=6191700KB)
Socket L#1 (P#1 CPUModel="Intel(R) Xeon(R) CPU E5502 @ 1.87GHz")
      L3Cache L#1 (size=4096KB linesize=64 ways=16)
        L2Cache L#2 (size=256KB linesize=64 ways=8)
          L1dCache L#2 (size=32KB linesize=64 ways=8)
            L1iCache L#2 (size=32KB linesize=64 ways=4)
              Core L#2 (P#0)
                PU L#2 (P#1)
        L2Cache L#3 (size=256KB linesize=64 ways=8)
          L1dCache L#3 (size=32KB linesize=64 ways=8)
            L1iCache L#3 (size=32KB linesize=64 ways=4)
              Core L#3 (P#2)
                PU L#3 (P#3)
  Bridge Host->PCI L#0 (P#0 buses=0000:[00-07])
Bridge PCI->PCI (P#16 busid=0000:00:01.0 id=8086:3408 class=0604(PCI_B) buses=0000:[01-01] PCIVendor="Intel Corporation" PCIDevice="5520/5500/X58 I/O Hub PCI Express Root Port 1") "Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 1" PCI 8086:10a7 (P#4096 busid=0000:01:00.0 class=0200(Ether) PCIVendor="Intel Corporation" PCIDevice="82575EB Gigabit Network Connection") "Intel Corporation 82575EB Gigabit Network Connection"
        Network L#0 (Address=00:15:17:af:94:40) "eth0"
PCI 8086:10a7 (P#4097 busid=0000:01:00.1 class=0200(Ether) PCIVendor="Intel Corporation" PCIDevice="82575EB Gigabit Network Connection") "Intel Corporation 82575EB Gigabit Network Connection"
        Network L#1 (Address=00:15:17:af:94:41) "eth1"
Bridge PCI->PCI (P#48 busid=0000:00:03.0 id=8086:340a class=0604(PCI_B) buses=0000:[02-02] PCIVendor="Intel Corporation" PCIDevice="5520/5500/X58 I/O Hub PCI Express Root Port 3") "Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 3" PCI 1002:6819 (P#8192 busid=0000:02:00.0 class=0300(VGA) PCIVendor="Advanced Micro Devices, Inc. [AMD/ATI]" PCIDevice="Pitcairn PRO [Radeon HD 7850 / R7 265 / R9 270 1024SP]") "Advanced Micro Devices, Inc. [AMD/ATI] Pitcairn PRO [Radeon HD 7850 / R7 265 / R9 270 1024SP]"
        GPU L#2 "card0"
        GPU L#3 "controlD64"
PCI 8086:3a20 (P#498 busid=0000:00:1f.2 class=0101(IDE) PCIVendor="Intel Corporation" PCIDevice="82801JI (ICH10 Family) 4 port SATA IDE Controller #1") "Intel Corporation 82801JI (ICH10 Family) 4 port SATA IDE Controller #1"
      Block L#4 "sda"
      Block L#5 "sdb"
PCI 8086:3a26 (P#501 busid=0000:00:1f.5 class=0101(IDE) PCIVendor="Intel Corporation" PCIDevice="82801JI (ICH10 Family) 2 port SATA IDE Controller #2") "Intel Corporation 82801JI (ICH10 Family) 2 port SATA IDE Controller #2"
depth 0:    1 Machine (type #1)
 depth 1:    2 NUMANode (type #2)
  depth 2:    2 Socket (type #3)
   depth 3:    2 L3Cache (type #4)
    depth 4:    4 L2Cache (type #4)
     depth 5:    4 L1dCache (type #4)
      depth 6:    4 L1iCache (type #4)
       depth 7:    4 Core (type #5)
        depth 8:    4 PU (type #6)
Special depth -3:    3 Bridge (type #9)
Special depth -4:    5 PCI Device (type #10)
Special depth -5:    6 OS Device (type #11)
latency matrix between NUMANodes (depth 1) by logical indexes:
  index     0     1
      0 1,000 2,100
      1 2,100 1,000


