Hmm, I was not correct. cr_restart starts with no errors, launches some of the processes, then suspends itself. Running strace on mpirun for this manual invocation shows the same behavior as below.
-Henk [hmeij@swallowtail kflaherty]$ ps -u hmeij PID TTY TIME CMD 29481 ? 00:00:00 res 29485 ? 00:00:00 1458575067.384 29488 ? 00:00:00 1458575067.384. 29508 ? 00:00:00 cr_restart 29509 ? 00:00:00 blcr_watcher 29512 ? 00:00:02 lava.openmpi.wr 29514 ? 00:38:35 mpirun 30313 ? 00:00:01 sshd 30314 pts/1 00:00:00 bash 30458 ? 00:00:00 sleep 30483 ? 00:00:00 sleep 30650 pts/1 00:00:00 cr_restart 30652 pts/1 00:00:00 lava.openmpi.wr 30653 pts/1 00:00:00 mpirun 30729 pts/1 00:00:00 ps [hmeij@swallowtail kflaherty]$ jobs [1]+ Stopped cr_restart --no-restore-pid --no-restore-pgid --no-restore-sid --relocate /sanscratch/383=/sanscratch/000 /sanscratch/checkpoints/383/chk.28244 ________________________________ From: Meij, Henk Sent: Monday, March 21, 2016 12:04 PM To: us...@open-mpi.org Subject: BLCR & openmpi openmpi 1.2 (yes, I know it is old), python 2.6.1, blcr 0.8.5. When I attempt to cr_restart (having performed cr_checkpoint --save-all), I can restart the job manually with blcr on a node. But when I go through my openlava scheduler, cr_restart launches mpirun, then nothing: no orted and none of the python processes that were running. The new scheduler job performing the restart puts in place the old machinefile and the stderr and stdout files. Here is what I see in an strace of mpirun. What problem is this pointing at? 
Thanks, -Henk poll([{fd=5, events=POLLIN}, {fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=11, events=POLLIN}, {fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=9, events=POLLIN}, {fd=10, events=POLLIN}], 8, 1000) = 8 ([{fd=5, revents=POLLNVAL}, {fd=4, revents=POLLNVAL}, {fd=6, revents=POLLNVAL}, {fd=11, revents=POLLNVAL}, {fd=7, revents=POLLNVAL}, {fd=8, revents=POLLNVAL}, {fd=9, revents=POLLNVAL}, {fd=10, revents=POLLNVAL}]) rt_sigprocmask(SIG_BLOCK, [INT USR1 USR2 TERM CHLD], NULL, 8) = 0 rt_sigaction(SIGCHLD, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0 rt_sigaction(SIGTERM, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0 rt_sigaction(SIGINT, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0 rt_sigaction(SIGUSR1, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0 rt_sigaction(SIGUSR2, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0 sched_yield() = 0 rt_sigprocmask(SIG_BLOCK, [INT USR1 USR2 TERM CHLD], NULL, 8) = 0 rt_sigaction(SIGCHLD, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0 rt_sigaction(SIGTERM, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0 rt_sigaction(SIGINT, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0 rt_sigaction(SIGUSR1, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0 rt_sigaction(SIGUSR2, {0x2b7ca19cb30a, [INT USR1 USR2 TERM CHLD], SA_RESTORER|SA_RESTART, 0x397840f790}, NULL, 8) = 0