It seems select() can return ERESTARTNOHAND repeatedly when a lot of child
processes exit.
With strace I can observe extended periods during which this happens. Maybe the loop should back off
briefly after ERESTARTNOHAND instead of retrying immediately?
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27837, si_uid=1000, si_status=0,
si_utime=1, si_stime=2} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27837
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27839, si_uid=1000, si_status=0,
si_utime=1, si_stime=2} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27839
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27823, si_uid=1000, si_status=0,
si_utime=4, si_stime=5} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27823
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27841, si_uid=1000, si_status=0,
si_utime=1, si_stime=1} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27841
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27829, si_uid=1000, si_status=0,
si_utime=2, si_stime=4} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27829
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27843, si_uid=1000, si_status=0,
si_utime=1, si_stime=2} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27843
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27827, si_uid=1000, si_status=0,
si_utime=17, si_stime=5} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 27827
wait4(-1, 0x7ffe7698d2c4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
fstat(10, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
stat("/run/initctl", {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
select(11, [10], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be
restarted if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=27825, si_uid=1000, si_status=0,
si_utime=70, si_stime=12} ---
On 10.07.19 16:08, Ortwin Glück wrote:
Hi,
While browsing I/O statistics collected by atop on a DB server, I came across a curious 10-minute
window in which init was apparently causing enormous reads on the system:
PID    TID  RDDSK  WRDSK  WCANCL  DSK  CMD    1/72
1      -    3.3G   428K   0K      63%  init
It is also the top source of I/O since boot, 76 days ago:
PID    TID  RDDSK  WRDSK  WCANCL  DSK  CMD    1/19
1      -    3.8T   2.1T   1.9G    66%  init
30892  -    1.4T   3.1G   139.9M  15%  cron
2672   -    57.2G  1.0T   0K      12%  oracle
I wonder how that is possible and whether it is a bug.
The system is Gentoo with sysvinit-2.93 on linux-4.20.17.
Ortwin