On Wed, 2021-06-16 at 00:08 +0200, Daniel Gustafsson wrote: > > On 15 Jun 2021, at 00:15, Jacob Champion <pchamp...@vmware.com> wrote: > > Attached is a quick patch; does it work on your machine? > > It does, thanks! I've included it in the attached v37 along with a few tiny > non-functional improvements in comment spelling etc.
Great, thanks! I've been tracking down reference leaks in the client. These open references prevent NSS from shutting down cleanly, which then makes it impossible to open a new context in the future. This probably affects other libpq clients more than it affects psql. The first step to fixing that is not ignoring failures during NSS shutdown, so I've tried a patch to pgtls_close() that pushes any failures through the pqInternalNotice(). That seems to be working well. The tests were still mostly green, so I taught connect_ok() to fail if any stderr showed up, and that exposed quite a few failures. I am currently stuck on one last failing test. This leak seems to only show up when using TLSv1.2 or below. There doesn't seem to be a substantial difference in libpq code coverage between 1.2 and 1.3, so I'm worried that either 1) there's some API we use that "requires" cleanup, but only on 1.2 and below, or 2) there's some bug in my version of NSS. Attached are a few work-in-progress patches. I think the reference cleanups themselves are probably solid, but the rest of it could use some feedback. Are there better ways to test for this? and can anyone reproduce the TLSv1.2 leak? --Jacob
From 6976d15a4be50276ea2f77fd5c37d238493f9447 Mon Sep 17 00:00:00 2001 From: Jacob Champion <pchamp...@vmware.com> Date: Tue, 15 Jun 2021 10:01:14 -0700 Subject: [PATCH 1/3] nss: don't ignore failures during context shutdown The biggest culprit of a shutdown failure so far seems to be object leaks. A failure here may prevent future contexts from being created (and they'll fail in confusing ways), so don't ignore it. There's not a great way to signal errors from this layer of the stack, but a notice will at least give us a chance of seeing the problem. --- src/interfaces/libpq/fe-secure-nss.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/interfaces/libpq/fe-secure-nss.c b/src/interfaces/libpq/fe-secure-nss.c index 1b9bd4f1a5..90f57e0d99 100644 --- a/src/interfaces/libpq/fe-secure-nss.c +++ b/src/interfaces/libpq/fe-secure-nss.c @@ -139,7 +139,16 @@ pgtls_close(PGconn *conn) { if (conn->nss_context) { - NSS_ShutdownContext(conn->nss_context); + SECStatus status; + status = NSS_ShutdownContext(conn->nss_context); + + if (status != SECSuccess) + { + pqInternalNotice(&conn->noticeHooks, + "unable to shut down NSS context: %s", + pg_SSLerrmessage(PR_GetError())); + } + conn->nss_context = NULL; } } -- 2.25.1
From acabc4d51d38124db748a6be9dca0ff83693e039 Mon Sep 17 00:00:00 2001 From: Jacob Champion <pchamp...@vmware.com> Date: Tue, 15 Jun 2021 14:37:28 -0700 Subject: [PATCH 2/3] test: check for empty stderr during connect_ok() ...in a similar manner to command_like(), to catch notices-as-errors coming from NSS. --- src/test/authentication/t/001_password.pl | 2 +- src/test/authentication/t/002_saslprep.pl | 2 +- src/test/kerberos/t/001_auth.pl | 2 +- src/test/ldap/t/001_auth.pl | 2 +- src/test/perl/PostgresNode.pm | 5 ++++- src/test/ssl/t/001_ssltests.pl | 4 ++-- src/test/ssl/t/002_scram.pl | 4 ++-- 7 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/test/authentication/t/001_password.pl b/src/test/authentication/t/001_password.pl index 427a360198..327dfb4ed9 100644 --- a/src/test/authentication/t/001_password.pl +++ b/src/test/authentication/t/001_password.pl @@ -20,7 +20,7 @@ if (!$use_unix_sockets) } else { - plan tests => 23; + plan tests => 32; } diff --git a/src/test/authentication/t/002_saslprep.pl b/src/test/authentication/t/002_saslprep.pl index f080a0ccba..d6508df8bd 100644 --- a/src/test/authentication/t/002_saslprep.pl +++ b/src/test/authentication/t/002_saslprep.pl @@ -17,7 +17,7 @@ if (!$use_unix_sockets) } else { - plan tests => 12; + plan tests => 20; } # Delete pg_hba.conf from the given node, add a new entry to it diff --git a/src/test/kerberos/t/001_auth.pl b/src/test/kerberos/t/001_auth.pl index b5594924ca..62015339a0 100644 --- a/src/test/kerberos/t/001_auth.pl +++ b/src/test/kerberos/t/001_auth.pl @@ -23,7 +23,7 @@ use Time::HiRes qw(usleep); if ($ENV{with_gssapi} eq 'yes') { - plan tests => 44; + plan tests => 54; } else { diff --git a/src/test/ldap/t/001_auth.pl b/src/test/ldap/t/001_auth.pl index 0ae14e4c85..e857c26258 100644 --- a/src/test/ldap/t/001_auth.pl +++ b/src/test/ldap/t/001_auth.pl @@ -9,7 +9,7 @@ use Test::More; if ($ENV{with_ldap} eq 'yes') { - plan tests => 28; + plan tests => 40; } else { diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm index 2027cbf43d..4a1e151d7e 100644 --- a/src/test/perl/PostgresNode.pm +++ b/src/test/perl/PostgresNode.pm @@ -2018,8 +2018,11 @@ sub connect_ok if (defined($params{expected_stdout})) { - like($stdout, $params{expected_stdout}, "$test_name: matches"); + like($stdout, $params{expected_stdout}, "$test_name: stdout matches"); } + + is($stderr, "", "$test_name: no stderr"); + if (@log_like or @log_unlike) { my $log_contents = TestLib::slurp_file($self->logfile, $log_location); diff --git a/src/test/ssl/t/001_ssltests.pl b/src/test/ssl/t/001_ssltests.pl index aec99e7bf6..c80abc9ec9 100644 --- a/src/test/ssl/t/001_ssltests.pl +++ b/src/test/ssl/t/001_ssltests.pl @@ -18,12 +18,12 @@ my $nss; if ($ENV{with_ssl} eq 'openssl') { $openssl = 1; - plan tests => 112; + plan tests => 144; } elsif ($ENV{with_ssl} eq 'nss') { $nss = 1; - plan tests => 112; + plan tests => 138; } else { diff --git a/src/test/ssl/t/002_scram.pl b/src/test/ssl/t/002_scram.pl index adfdc55e1a..7976b2e5b1 100644 --- a/src/test/ssl/t/002_scram.pl +++ b/src/test/ssl/t/002_scram.pl @@ -27,13 +27,13 @@ if ($ENV{with_ssl} eq 'openssl') # Determine whether build supports tls-server-end-point. $supports_tls_server_end_point = check_pg_config("#define HAVE_X509_GET_SIGNATURE_NID 1"); - plan tests => $supports_tls_server_end_point ? 11 : 12; + plan tests => 15; } elsif ($ENV{with_ssl} eq 'nss') { $nss = 1; $supports_tls_server_end_point = 1; - plan tests => 11; + plan tests => 15; } else { -- 2.25.1
From 3141b29053a4b48289ebb7ee26d22e395c07ff31 Mon Sep 17 00:00:00 2001 From: Jacob Champion <pchamp...@vmware.com> Date: Tue, 15 Jun 2021 15:59:39 -0700 Subject: [PATCH 3/3] nss: clean up leaked resources The NSPR file descriptor and a handful of certificate references were preventing NSS from shutting down. At least one leak is still on the loose, when forcing TLSv1.2 or lower during client connections: $ psql 'host=localhost sslmode=require ssl_max_protocol_version=TLSv1.2' psql (14beta1) SSL connection (protocol: TLSv1.2, cipher: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, bits: 128, compression: off) Type "help" for help. postgres=# \q unable to shut down NSS context: NSS could not shutdown. Objects are still in use. --- src/interfaces/libpq/fe-secure-nss.c | 37 ++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/src/interfaces/libpq/fe-secure-nss.c b/src/interfaces/libpq/fe-secure-nss.c index 90f57e0d99..7b92e89ecd 100644 --- a/src/interfaces/libpq/fe-secure-nss.c +++ b/src/interfaces/libpq/fe-secure-nss.c @@ -137,6 +137,22 @@ pgtls_init(PGconn *conn, bool do_ssl, bool do_crypto) void pgtls_close(PGconn *conn) { + /* All NSS references must be cleaned up before we close out the context. */ + if (conn->pr_fd) + { + PRStatus status; + + status = PR_Close(conn->pr_fd); + if (status != PR_SUCCESS) + { + pqInternalNotice(&conn->noticeHooks, + "unable to close NSPR fd: %s", + pg_SSLerrmessage(PR_GetError())); + } + + conn->pr_fd = NULL; + } + if (conn->nss_context) { SECStatus status; @@ -573,16 +589,16 @@ pgtls_get_peer_certificate_hash(PGconn *conn, size_t *len) SECOidTag signature_tag; SECOidTag digest_alg; int digest_len; - PLArenaPool *arena; + PLArenaPool *arena = NULL; SECItem digest; - char *ret; + char *ret = NULL; SECStatus status; *len = 0; server_cert = SSL_PeerCertificate(conn->pr_fd); if (!server_cert) - return NULL; + goto cleanup; signature_tag = SECOID_GetAlgorithmTag(&server_cert->signature); if (!pg_find_signature_algorithm(signature_tag, &digest_alg, &digest_len)) @@ -590,7 +606,7 @@ pgtls_get_peer_certificate_hash(PGconn *conn, size_t *len) printfPQExpBuffer(&conn->errorMessage, libpq_gettext("could not find digest for OID '%s'\n"), SECOID_FindOIDTagDescription(signature_tag)); - return NULL; + goto cleanup; } arena = PORT_NewArena(SEC_ASN1_DEFAULT_ARENA_SIZE); @@ -605,14 +621,18 @@ pgtls_get_peer_certificate_hash(PGconn *conn, size_t *len) printfPQExpBuffer(&conn->errorMessage, libpq_gettext("unable to generate peer certificate digest: %s"), pg_SSLerrmessage(PR_GetError())); - PORT_FreeArena(arena, PR_TRUE); - return NULL; + goto cleanup; } ret = pg_malloc(digest.len); memcpy(ret, digest.data, digest.len); *len = digest_len; - PORT_FreeArena(arena, PR_TRUE); + +cleanup: + if (arena) + PORT_FreeArena(arena, PR_TRUE); + if (server_cert) + CERT_DestroyCertificate(server_cert); return ret; } @@ -712,6 +732,8 @@ done: /* san_list will be freed by freeing the arena it was allocated in */ if (arena) PORT_FreeArena(arena, PR_TRUE); + if (server_cert) + CERT_DestroyCertificate(server_cert); PR_Free(server_hostname); if (status == SECSuccess) @@ -826,6 +848,7 @@ pg_cert_auth_handler(void *arg, PRFileDesc *fd, PRBool checksig, PRBool isServer pg_SSLerrmessage(PR_GetError())); } + CERT_DestroyCertificate(server_cert); return status; } -- 2.25.1