I have an SSL client that connects to an SSL server. The server is able to
process thousands of clients just fine on a variety of platforms
[Windows/Linux/HP/Solaris] for long periods of time.

The problem that is driving me nuts is that from time to time, roughly once
every 24 hours, some client fails to connect to the server during the
handshake phase. This happens only on Linux and HP; the other platforms do not
experience this issue.

Here is a sketch of my client and server code. Please note that I am using
non-blocking sockets:

common code:
---------------------
int ssl_retry(int ret)
{
   int r;
   fd_set fds;
   struct timeval tv, *ptv=0;
   tv.tv_sec =  1;/*do a select for 1 second each time*/
   tv.tv_usec = 0;
   ptv=&tv;
   FD_ZERO(&fds);

   switch(SSL_get_error(m_ssl, ret)
   {
    case SSL_ERROR_NONE:
         r = 1;
         break;
    case SSL_ERROR_WANT_READ:
        FD_SET(m_sock_fd, &fds);
        r=select(m_sock_fd + 1, &fds, 0, 0, ptv);
        if (r <= 0 && (Errno == EAGAIN || Errno == EINTR))/*if we timed out
with EAGAIN try again*/
        {
            r = 1;
        }
        break;
    case SSL_ERROR_WANT_WRITE:/
        FD_SET(m_sock_fd, &fds);
        r=select(m_sock_fd + 1, 0, &fds, 0, ptv);
        if (r <= 0 && (Errno == EAGAIN || Errno == EINTR))/*if we timed out
with EAGAIN try again*/
        {
            r = 1;
        }
        break;
    case SSL_ERROR_ZERO_RETURN:/*The socket closed*/
        r = 0;
        break;
    case SSL_ERROR_SYSCALL:
    case SSL_ERROR_SSL:
         r = -1;
         break;
    default:
        r = -1;
    }
    return r;

client code:
-----------------
/*
 * Client side: drive the non-blocking handshake with SSL_connect(), giving
 * up once `timeout` seconds have elapsed.
 */
time_t time0 = time(NULL);
timeout = 10; /* seconds */
t = 0;        /* elapsed seconds; must start below the timeout */
while (t < timeout)
{
    r = SSL_connect(m_ssl);
    if (r > 0)
    {
        break; /* handshake finished */
    }
    r = ssl_retry(r);
    if (r <= 0)
    {
        break; /* peer closed the connection or a fatal error occurred */
    }
    t = time(NULL) - time0;
}

/*
 * The three outcomes are mutually exclusive. The timeout has to be tested
 * first: when the loop ends because time ran out, r still holds the positive
 * "retry" value from ssl_retry() and must not be mistaken for success.
 */
if (t >= timeout)
{
    /* I timed out :( */
}
else if (r > 0)
{
    /* We are connected. Do work. */
}
else
{
    /* Some kind of an issue. */
}

server code:
-----------------
/*
 * Server side: drive the non-blocking handshake with SSL_accept(), giving
 * up once `timeout` seconds have elapsed.
 */
time_t time0 = time(NULL);
timeout = 10; /* seconds */
t = 0;        /* elapsed seconds; must start below the timeout */
while (t < timeout)
{
    r = SSL_accept(m_ssl);
    if (r > 0)
    {
        break; /* handshake finished */
    }
    r = ssl_retry(r);
    if (r <= 0)
    {
        break; /* peer closed the connection or a fatal error occurred */
    }
    t = time(NULL) - time0;
}

/*
 * The three outcomes are mutually exclusive. The timeout has to be tested
 * first: when the loop ends because time ran out, r still holds the positive
 * "retry" value from ssl_retry() and must not be mistaken for success.
 */
if (t >= timeout)
{
    /* I timed out :( */
}
else if (r > 0)
{
    /* We are connected. Do work. */
}
else
{
    /* Some kind of an issue. */
}


When this problem happens, both the client and the server end up at the line
marked "I timed out" above (highlighted in red in the original message).

With some debugging effort I can see that when this problem hits, both the
client and the server repeatedly enter the section highlighted in green in the
original message: each of them seems to want to perform a read, as the code
returned after both the SSL_connect and the SSL_accept calls is
SSL_ERROR_WANT_READ.

This looks to me like a deadlock situation in which both my server and my
client are waiting to do a READ until both of them time out!

Can someone please suggest what is wrong with the above code and how this
deadlock is possible? I am using openssl-1.0.0a.

Reply via email to