Embarrassingly, the issue does appear to have been the /proc/sys/net/core/somaxconn setting all along. It was increased from 128 to 512 to 1024, with full restarts of Dovecot in between. However, this was apparently not enough to handle our login bursts.

The latest increase, from 1024 to 65536, was only followed by a `doveadm reload`, which seemingly does not cause the socket to be rebound, so it was still using the old setting.

After fully restarting dovecot, the issue -appears- to be gone. At least we survived through the morning spikes today without errors.

Apologies for the list spam.

On 14/01/2020 13:38, Eirik Rye wrote:
We are unfortunately still seeing a lot of these errors once the machine reaches a high number of concurrent users/logins (just below 20k simultaneous IMAP connections on a powerful 24 core machine with 128GB RAM):

2020-01-14T09:18:58.661349+01:00 dovecot: imap-login: Warning: net_connect_unix(imap) succeeded only after retrying - took 140330 us
2020-01-14T09:18:58.854692+01:00 dovecot: imap-login: Error: master(imap): net_connect_unix(imap) failed: Resource temporarily unavailable - http://wiki2.dovecot.org/SocketUnavailable (client-pid=107932, client-id=780262, rip=x, created 500 msecs ago, received 0/4 bytes)
2020-01-14T09:18:58.888228+01:00 dovecot: imap-login: Warning: net_connect_unix(imap) succeeded only after retrying - took 440762 us

The machine is at insignificant load numbers, and the dovecot process is somewhere near 25% CPU usage when the problem occurs. It is not close to saturating its core from what I can tell.

To make sure the issues are not task/fd-limit related, I have set this in /etc/systemd/system/dovecot.service.d/service.conf:

[Service]
LimitNOFILE=infinity
TasksMax=infinity

~# egrep "processes|files" /proc/`pidof dovecot`/limits
Max processes             514051               514051
Max open files            1048576              1048576
~# cat /proc/sys/net/core/somaxconn
65536
~# cat /proc/sys/kernel/pid_max
278528

Dovecot is configured with NFS-backed storage, and locally stored passwd-file userdb/passdb are used for authentication.

Backends are (now) behind directors. The directors/proxies are having no issues dealing with the traffic whatsoever.

At the time of failure, the process list looks like this:

~# ps -f --ppid `pidof dovecot` | egrep -v "dovecot/(imap|pop3) "
UID         PID   PPID  C STIME TTY          TIME CMD
274264    44753 138506  0 12:43 ?        00:00:00 [imap]
308665   104852 138506  0 13:01 ?        00:00:00 [imap]
308665   104853 138506  0 13:01 ?        00:00:00 [imap]
dovenull 138508 138506  1 10:31 ?        00:03:00 dovecot/pop3-login [6 pre-login + 36 TLS proxies]
dovenull 138509 138506  0 10:31 ?        00:00:07 dovecot/imap-login
dovecot  138510 138506  0 10:31 ?        00:01:10 dovecot/anvil [20 connections]
root     138511 138506  1 10:31 ?        00:02:14 dovecot/log
dovenull 138512 138506  1 10:31 ?        00:01:48 dovecot/pop3-login [1 pre-login + 15 TLS proxies]
dovenull 138513 138506  0 10:31 ?        00:00:08 dovecot/imap-login [redacted TLS proxy]
dovenull 138514 138506  0 10:31 ?        00:00:07 dovecot/imap-login [0 pre-login + 3 TLS proxies]
dovenull 138515 138506  0 10:31 ?        00:00:10 dovecot/imap-login [redacted TLS proxy]
dovenull 138516 138506  0 10:31 ?        00:01:14 dovecot/imap-login [27 pre-login + 12 TLS proxies]
dovenull 138517 138506  0 10:31 ?        00:00:31 dovecot/imap-login [2 pre-login + 2 TLS proxies]
dovenull 138518 138506  0 10:31 ?        00:01:28 dovecot/imap-login [56 pre-login + 20 TLS proxies]
dovenull 138519 138506  0 10:31 ?        00:00:09 dovecot/imap-login [0 pre-login + 4 TLS proxies]
dovenull 138520 138506  0 10:31 ?        00:00:06 dovecot/imap-login [redacted TLS proxy]
dovenull 138521 138506  0 10:31 ?        00:00:11 dovecot/imap-login [0 pre-login + 3 TLS proxies]
dovenull 138522 138506  0 10:31 ?        00:00:16 dovecot/imap-login [2 pre-login + 2 TLS proxies]
dovenull 138523 138506  0 10:31 ?        00:00:13 dovecot/imap-login [1 pre-login + 2 TLS proxies]
dovenull 138524 138506  0 10:31 ?        00:00:24 dovecot/imap-login [1 pre-login + 3 TLS proxies]
dovenull 138525 138506  0 10:31 ?        00:01:13 dovecot/imap-login [36 pre-login + 23 TLS proxies]
dovenull 138526 138506  0 10:31 ?        00:00:41 dovecot/imap-login [10 pre-login + 12 TLS proxies]
dovenull 138527 138506  0 10:31 ?        00:00:20 dovecot/imap-login [1 pre-login + 7 TLS proxies]
root     138528 138506  2 10:31 ?        00:04:45 dovecot/config
dovecot  138529 138506  1 10:31 ?        00:02:22 dovecot/stats [19389 connections]
dovecot  138530 138506  3 10:31 ?        00:05:53 dovecot/auth [137 wait, 0 passdb, 0 userdb]
root     148675 138506  0 13:13 ?        00:00:00 dovecot/doveadm-server [redacted]

Other stats:

~# ps -f --ppid `pidof dovecot` | grep "dovecot/imap " | wc -l
19328
~# doveadm process status | grep "^imap " | wc -l
19307
~# ss -ntp "( sport = :143 or sport = :993 )" | grep "\"imap\"" | wc -l
19141
~# ss -ntp "( sport = :143 or sport = :993 )" | grep "\"imap-login\"" | wc -l
333
~# ss -ntp "( sport = :143 or sport = :993 )" | wc -l
19530

~# doveconf -n

# 2.3.9.2 (cf2918cac): /etc/dovecot/dovecot.conf
# OS: Linux 4.9.0-9-amd64 x86_64 Debian 9.11
# Hostname: [redacted]
default_vsz_limit = 768 M
disable_plaintext_auth = no
imap_id_log = *
log_timestamp = "%F %T %z "
login_trusted_networks = [redacted]
mail_fsync = always
mail_location = maildir:~/Maildir
mail_nfs_index = yes
mail_nfs_storage = yes
mmap_disable = yes
namespace inbox {
   inbox = yes
   location =
   mailbox Drafts {
     auto = subscribe
     special_use = \Drafts
   }
   mailbox Sent {
     auto = subscribe
     special_use = \Sent
   }
   mailbox "Sent Messages" {
     auto = no
     special_use = \Sent
   }
   mailbox Spam {
     auto = create
     special_use = \Junk
   }
   mailbox Trash {
     auto = subscribe
     special_use = \Trash
   }
   prefix =
   separator = /
}
passdb {
   args = username_format=%Lu /etc/dovecot/aliases
   default_fields = noauthenticate
   driver = passwd-file
}
passdb {
   args = /etc/dovecot/passwd
   driver = passwd-file
}
protocols = imap pop3
service doveadm {
   inet_listener {
     port = 24245
   }
   inet_listener http {
     port = 8080
   }
}
service imap-login {
   client_limit = 4096
   inet_listener imap {
     port = 143
   }
   inet_listener imaps {
     port = 993
     ssl = yes
   }
   process_limit = 16
   process_min_avail = 16
   service_count = 0
   vsz_limit = 768 M
}
service imap {
   client_limit = 1
   process_limit = 65536
}
service pop3-login {
   client_limit = 4096
   inet_listener pop3 {
     port = 110
   }
   inet_listener pop3s {
     port = 995
     ssl = yes
   }
   process_limit = 8
   process_min_avail = 2
   service_count = 0
}
service pop3 {
   process_limit = 16384
}
service stats {
   client_limit = 65536
}
ssl_cert = <[redacted]
ssl_cipher_list = ALL:!LOW:!SSLv2:!EXP:!aNULL
ssl_dh = # hidden, use -P to show it
ssl_key = # hidden, use -P to show it
userdb {
   args = /etc/dovecot/passwd
   driver = passwd-file
}
verbose_proctitle = yes
protocol imap {
   mail_max_userip_connections = 20
   rawlog_dir = [redacted]
}

Are there any other tunables either in Dovecot or in the kernel that may relate to this issue that we may have missed?



--
Eirik

Reply via email to