All, I'm trying to get the OpenMP portion of the code below to run multi-core on a couple of 8-core nodes.
Good news: Multiple threads are being spawned on each node in the run.
Bad news: All of the threads on a node run on the same single core, leaving the other 7 cores basically idle.
Sort-of good news: If I provide a rank file, I get the threads running on different cores within each node (a hack).

Here are the first lines of output:

/usr/mpi/gcc/openmpi-1.4/bin/mpirun -host c005,c006 -np 2 -rf rank.file -x OMP_NUM_THREADS=4 hybrid.gcc
Hello from thread 2 out of 4 from process 1 out of 2 on c006.local
another parallel region: name:c006.local MPI_RANK_ID=1 OMP_THREAD_ID=2
Hello from thread 3 out of 4 from process 1 out of 2 on c006.local
another parallel region: name:c006.local MPI_RANK_ID=1 OMP_THREAD_ID=3
Hello from thread 1 out of 4 from process 1 out of 2 on c006.local
another parallel region: name:c006.local MPI_RANK_ID=1 OMP_THREAD_ID=1
Hello from thread 1 out of 4 from process 0 out of 2 on c005.local
another parallel region: name:c005.local MPI_RANK_ID=0 OMP_THREAD_ID=1
Hello from thread 3 out of 4 from process 0 out of 2 on c005.local
Hello from thread 2 out of 4 from process 0 out of 2 on c005.local
another parallel region: name:c005.local MPI_RANK_ID=0 OMP_THREAD_ID=3
another parallel region: name:c005.local MPI_RANK_ID=0 OMP_THREAD_ID=2
Hello from thread 0 out of 4 from process 0 out of 2 on c005.local
another parallel region: name:c005.local MPI_RANK_ID=0 OMP_THREAD_ID=0
Hello from thread 0 out of 4 from process 1 out of 2 on c006.local
another parallel region: name:c006.local MPI_RANK_ID=1 OMP_THREAD_ID=0
another parallel region: name:c005.local MPI_RANK_ID=0 OMP_THREAD_ID=3
another parallel region: name:c005.local MPI_RANK_ID=0 OMP_THREAD_ID=2
another parallel region: name:c005.local MPI_RANK_ID=0 OMP_THREAD_ID=0
another parallel region: name:c006.local MPI_RANK_ID=1 OMP_THREAD_ID=3
another parallel region: name:c005.local MPI_RANK_ID=0 OMP_THREAD_ID=3
another parallel region: name:c005.local MPI_RANK_ID=0 OMP_THREAD_ID=2
another parallel region: name:c006.local MPI_RANK_ID=1 
OMP_THREAD_ID=0 another parallel region: name:c006.local MPI_RANK_ID=1 OMP_THREAD_ID=1 . . . Here's the simple code: #include <stdio.h> #include "mpi.h" #include <omp.h> int main(int argc, char *argv[]) { int numprocs, rank, namelen; char processor_name[MPI_MAX_PROCESSOR_NAME]; int iam = 0, np = 1; char name[MPI_MAX_PROCESSOR_NAME]; /* MPI_MAX_PROCESSOR_NAME == 128 */ int O_ID; /* OpenMP thread ID */ int M_ID; /* MPI rank ID */ int rtn_val; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Get_processor_name(processor_name, &namelen); #pragma omp parallel default(shared) private(iam, np,O_ID) { np = omp_get_num_threads(); iam = omp_get_thread_num(); printf("Hello from thread %d out of %d from process %d out of %d on %s\n", iam, np, rank, numprocs, processor_name); int i=0; int j=0; double counter=0; for(i =0;i<99999999;i++) { O_ID = omp_get_thread_num(); /* get OpenMP thread ID */ MPI_Get_processor_name(name,&namelen); rtn_val = MPI_Comm_rank(MPI_COMM_WORLD,&M_ID); printf("another parallel region: name:%s MPI_RANK_ID=%d OMP_THREAD_ID=%d\n", name,M_ID,O_ID); for(j = 0;j<999999999;j++) { counter=counter+i; } } } MPI_Finalize(); }