Embarrassingly, the issue does appear to have been the
/proc/sys/net/core/somaxconn setting all along. It was increased from
128, to 512, to 1024 with full restarts of Dovecot in between. However,
this was apparently not enough to handle our login bursts.
The latest increase, from 1024 to 65536, was only followed by a
`doveadm reload`, which seemingly does not cause the socket to rebind,
so the socket was still using the old setting.
After fully restarting dovecot, the issue -appears- to be gone. At least
we survived through the morning spikes today without errors.
Apologies for the list spam.
On 14/01/2020 13:38, Eirik Rye wrote:
We are unfortunately still seeing a lot of these errors once the machine
reaches a high number of concurrent users/logins (just below 20k
simultaneous IMAP connections on a powerful 24 core machine with 128GB
RAM):
2020-01-14T09:18:58.661349+01:00 dovecot: imap-login: Warning:
net_connect_unix(imap) succeeded only after retrying - took 140330 us
2020-01-14T09:18:58.854692+01:00 dovecot: imap-login: Error:
master(imap): net_connect_unix(imap) failed: Resource temporarily
unavailable - http://wiki2.dovecot.org/SocketUnavailable
(client-pid=107932, client-id=780262, rip=x, created 500 msecs ago,
received 0/4 bytes)
2020-01-14T09:18:58.888228+01:00 dovecot: imap-login: Warning:
net_connect_unix(imap) succeeded only after retrying - took 440762 us
The machine is at insignificant load numbers, and the dovecot process is
somewhere near 25% CPU usage when the problem occurs. It is not close to
saturating its core from what I can tell.
To make sure the issues are not task/fd-limit related, I have set this
in /etc/systemd/system/dovecot.service.d/service.conf:
[Service]
LimitNOFILE=infinity
TasksMax=infinity
~# egrep "processes|files" /proc/`pidof dovecot`/limits
Max processes 514051 514051
Max open files 1048576 1048576
~# cat /proc/sys/net/core/somaxconn
65536
~# cat /proc/sys/kernel/pid_max
278528
Dovecot is configured with NFS backed storage, and locally stored
passwdfile userdb/passdb are used for authentication.
Backends are (now) behind directors. The directors/proxies are having no
issues dealing with the traffic whatsoever.
At the time of failure, the process list looks like this:
~# ps -f --ppid `pidof dovecot` | egrep -v "dovecot/(imap|pop3) "
UID PID PPID C STIME TTY TIME CMD
274264 44753 138506 0 12:43 ? 00:00:00 [imap]
308665 104852 138506 0 13:01 ? 00:00:00 [imap]
308665 104853 138506 0 13:01 ? 00:00:00 [imap]
dovenull 138508 138506 1 10:31 ? 00:03:00 dovecot/pop3-login [6
pre-login + 36 TLS proxies]
dovenull 138509 138506 0 10:31 ? 00:00:07 dovecot/imap-login
dovecot 138510 138506 0 10:31 ? 00:01:10 dovecot/anvil [20
connections]
root 138511 138506 1 10:31 ? 00:02:14 dovecot/log
dovenull 138512 138506 1 10:31 ? 00:01:48 dovecot/pop3-login [1
pre-login + 15 TLS proxies]
dovenull 138513 138506 0 10:31 ? 00:00:08 dovecot/imap-login
[redacted TLS proxy]
dovenull 138514 138506 0 10:31 ? 00:00:07 dovecot/imap-login [0
pre-login + 3 TLS proxies]
dovenull 138515 138506 0 10:31 ? 00:00:10 dovecot/imap-login
[redacted TLS proxy]
dovenull 138516 138506 0 10:31 ? 00:01:14 dovecot/imap-login [27
pre-login + 12 TLS proxies]
dovenull 138517 138506 0 10:31 ? 00:00:31 dovecot/imap-login [2
pre-login + 2 TLS proxies]
dovenull 138518 138506 0 10:31 ? 00:01:28 dovecot/imap-login [56
pre-login + 20 TLS proxies]
dovenull 138519 138506 0 10:31 ? 00:00:09 dovecot/imap-login [0
pre-login + 4 TLS proxies]
dovenull 138520 138506 0 10:31 ? 00:00:06 dovecot/imap-login
[redacted TLS proxy]
dovenull 138521 138506 0 10:31 ? 00:00:11 dovecot/imap-login [0
pre-login + 3 TLS proxies]
dovenull 138522 138506 0 10:31 ? 00:00:16 dovecot/imap-login [2
pre-login + 2 TLS proxies]
dovenull 138523 138506 0 10:31 ? 00:00:13 dovecot/imap-login [1
pre-login + 2 TLS proxies]
dovenull 138524 138506 0 10:31 ? 00:00:24 dovecot/imap-login [1
pre-login + 3 TLS proxies]
dovenull 138525 138506 0 10:31 ? 00:01:13 dovecot/imap-login [36
pre-login + 23 TLS proxies]
dovenull 138526 138506 0 10:31 ? 00:00:41 dovecot/imap-login [10
pre-login + 12 TLS proxies]
dovenull 138527 138506 0 10:31 ? 00:00:20 dovecot/imap-login [1
pre-login + 7 TLS proxies]
root 138528 138506 2 10:31 ? 00:04:45 dovecot/config
dovecot 138529 138506 1 10:31 ? 00:02:22 dovecot/stats [19389
connections]
dovecot 138530 138506 3 10:31 ? 00:05:53 dovecot/auth [137
wait, 0 passdb, 0 userdb]
root 148675 138506 0 13:13 ? 00:00:00 dovecot/doveadm-server
[redacted]
Other stats:
~# ps -f --ppid `pidof dovecot` | grep "dovecot/imap " | wc -l
19328
~# doveadm process status | grep "^imap " | wc -l
19307
~# ss -ntp "( sport = :143 or sport = :993 )" | grep "\"imap\"" | wc -l
19141
~# ss -ntp "( sport = :143 or sport = :993 )" | grep "\"imap-login\"" |
wc -l
333
~# ss -ntp "( sport = :143 or sport = :993 )" | wc -l
19530
~# doveconf -n
# 2.3.9.2 (cf2918cac): /etc/dovecot/dovecot.conf
# OS: Linux 4.9.0-9-amd64 x86_64 Debian 9.11
# Hostname: [redacted]
default_vsz_limit = 768 M
disable_plaintext_auth = no
imap_id_log = *
log_timestamp = "%F %T %z "
login_trusted_networks = [redacted]
mail_fsync = always
mail_location = maildir:~/Maildir
mail_nfs_index = yes
mail_nfs_storage = yes
mmap_disable = yes
namespace inbox {
inbox = yes
location =
mailbox Drafts {
auto = subscribe
special_use = \Drafts
}
mailbox Sent {
auto = subscribe
special_use = \Sent
}
mailbox "Sent Messages" {
auto = no
special_use = \Sent
}
mailbox Spam {
auto = create
special_use = \Junk
}
mailbox Trash {
auto = subscribe
special_use = \Trash
}
prefix =
separator = /
}
passdb {
args = username_format=%Lu /etc/dovecot/aliases
default_fields = noauthenticate
driver = passwd-file
}
passdb {
args = /etc/dovecot/passwd
driver = passwd-file
}
protocols = imap pop3
service doveadm {
inet_listener {
port = 24245
}
inet_listener http {
port = 8080
}
}
service imap-login {
client_limit = 4096
inet_listener imap {
port = 143
}
inet_listener imaps {
port = 993
ssl = yes
}
process_limit = 16
process_min_avail = 16
service_count = 0
vsz_limit = 768 M
}
service imap {
client_limit = 1
process_limit = 65536
}
service pop3-login {
client_limit = 4096
inet_listener pop3 {
port = 110
}
inet_listener pop3s {
port = 995
ssl = yes
}
process_limit = 8
process_min_avail = 2
service_count = 0
}
service pop3 {
process_limit = 16384
}
service stats {
client_limit = 65536
}
ssl_cert = <[redacted]
ssl_cipher_list = ALL:!LOW:!SSLv2:!EXP:!aNULL
ssl_dh = # hidden, use -P to show it
ssl_key = # hidden, use -P to show it
userdb {
args = /etc/dovecot/passwd
driver = passwd-file
}
verbose_proctitle = yes
protocol imap {
mail_max_userip_connections = 20
rawlog_dir = [redacted]
}
Are there any other tunables either in Dovecot or in the kernel that may
relate to this issue that we may have missed?
--
Eirik