It seems select can return ERESTARTNOHAND many times when a lot of child 
processes exit.
With strace I can observe long stretches of this behavior. Maybe the loop should back off briefly after ERESTARTNOHAND instead of retrying immediately?

--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27837, si_uid=1000, si_status=0, si_utime=1, si_stime=2} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27837
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]})                 = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be 
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27839, si_uid=1000, si_status=0, si_utime=1, si_stime=2} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27839
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]})                 = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be 
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27823, si_uid=1000, si_status=0, si_utime=4, si_stime=5} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27823
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]})                 = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be 
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27841, si_uid=1000, si_status=0, si_utime=1, si_stime=1} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27841
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]})                 = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be 
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27829, si_uid=1000, si_status=0, si_utime=2, si_stime=4} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27829
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]})                 = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be 
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27843, si_uid=1000, si_status=0, si_utime=1, si_stime=2} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27843
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]})                 = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be 
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27827, si_uid=1000, si_status=0, si_utime=17, si_stime=5} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27827
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]})                 = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be 
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27825, si_uid=1000, si_status=0, si_utime=70, si_stime=12} ---


On 10.07.19 16:08, Ortwin Glück wrote:
Hi,

While browsing I/O statistics recorded by atop on a DB server, I came across this curious 10-minute slot in which init was apparently causing enormous reads on the system:

  PID   TID   RDDSK   WRDSK   WCANCL   DSK   CMD      1/72
    1     -    3.3G    428K       0K   63%   init

Also it is the top source of I/O since boot 76 days ago:
  PID   TID   RDDSK   WRDSK   WCANCL   DSK   CMD      1/19
    1     -    3.8T    2.1T     1.9G   66%   init
30892     -    1.4T    3.1G   139.9M   15%   cron
 2672     -   57.2G    1.0T       0K   12%   oracle

I wonder how that is possible and if it is a bug.

The system is Gentoo with sysvinit-2.93 on linux-4.20.17.

Ortwin


Reply via email to