[Spread-cvs] cvs commit: spread/daemon Readme.txt sp.c

jonathan at spread.org jonathan at spread.org
Tue Oct 5 10:08:10 EDT 2004


jonathan    04/10/05 10:08:10

  Modified:    daemon   Readme.txt sp.c
  Log:
  Make SP_connect_timeout call connect() on a non-blocking fd. This prevents
  the client from blocking when a daemon is non-responsive. The connect processing
  was broken out into a new connect_nointr_timeout() function, which both handles
  interrupts/signals during connect and makes the connect non-blocking.
  
  Some old code that was commented out was removed from the SP_connect_timeout
  function and some small readability improvements were made by breaking the
  setsockopt() calls into a separate set_large_socket_buffers() function.
  
  Revision  Changes    Path
  1.66      +4 -0      spread/daemon/Readme.txt
  
  Index: Readme.txt
  ===================================================================
  RCS file: /storage/cvsroot/spread/daemon/Readme.txt,v
  retrieving revision 1.65
  retrieving revision 1.66
  diff -u -r1.65 -r1.66
  --- Readme.txt	3 Oct 2004 05:21:11 -0000	1.65
  +++ Readme.txt	5 Oct 2004 14:08:10 -0000	1.66
  @@ -101,6 +101,10 @@
   16) Fix EVS bug where AGREED messages may be delivered before a transitional signal
       on some daemons and after it on others. Bug found and patch created by 
       Ryan Caudy.
  +17) Make SP_connect_timeout() calls non-blocking for the actual 'connect()' call. 
  +    This should fix the issue reported by Shlomi Yaakobovich where a hung daemon
  +    causes new connections to also hang in connect. Also includes slight cleanup
  +    of connect code path.
   
   SOURCE INSTALL:
   ---------------
  
  
  
  1.14      +143 -178  spread/daemon/sp.c
  
  Index: sp.c
  ===================================================================
  RCS file: /storage/cvsroot/spread/daemon/sp.c,v
  retrieving revision 1.13
  retrieving revision 1.14
  diff -u -r1.13 -r1.14
  --- sp.c	23 Sep 2004 23:15:18 -0000	1.13
  +++ sp.c	5 Oct 2004 14:08:10 -0000	1.14
  @@ -221,6 +221,142 @@
           return(ret);
   }
   
  +/* This calls connect() with the additional features of ignoring syscall interruptions
  + * caused by signals delivered to this application, and of returning in at most *time_out time.
  + * When it returns the *time_out variable will be modified to have contain the value:
  + * old *time_out - time spent in this function.
  + *
  + * If *time_out == {0,0} then the call is made blocking and will NOT timeout.
  + */
  +static  int     connect_nointr_timeout(int s, struct sockaddr *sname, socklen_t slen, sp_time *time_out)
  +{
  +    int         ret, num_ready;
  +    fd_set      rset,fixed_rset,wset;
  +    sp_time     start_time, temp_time, target_time, wait_time;
  +    int         non_blocking = 0;
  +    int         err;
  +    int         on;
  +    int         ret_ioctl;
  +    sockopt_len_t   elen;
  +
  +    if ( E_compare_time(Zero_timeout, *time_out) < 0 )
  +    {
  +        non_blocking = 1;
  +        start_time = E_get_time();
  +        target_time = E_add_time(start_time, *time_out);
  +        wait_time = *time_out;
  +        /* set file descriptor to non-blocking */
  +        on = 1;
  +        ret_ioctl = ioctl( s, FIONBIO, &on);
  +    }
  +    /* Handle EINTR while connecting by waiting with select until the 
  +     * connect completes or fails.  This is a while loop but it is never 
  +     * done more then once. The while is so we can use 'break' 
  +     */
  +    while( ((ret = connect( s, sname, slen ) ) == -1) 
  +           && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK) || (sock_errno == EINPROGRESS)) )
  +    {
  +        FD_ZERO(&fixed_rset);
  +        FD_SET(s, &fixed_rset);
  +        rset = fixed_rset;
  +        wset = rset;
  +        Alarmp( SPLOG_DEBUG, SESSION, "connect_nointr_timeout: connect in progress for socket %d, now wait in select\n", s);
  +        /* wait for select to timeout (num_ready == 0), give a permanent error (num_ready < 0 && sock_errno != transient). If transient error, retry after checking to make sure timeout has not expired */
  +        while( ((num_ready = select(s+1, &rset, &wset, NULL, (struct timeval *)&wait_time)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
  +        {
  +            temp_time = E_get_time();
  +            if (E_compare_time(temp_time, target_time) < 0 ) {
  +                wait_time = E_sub_time(target_time, temp_time);
  +            } else {
  +                Alarmp( SPLOG_WARNING, SESSION, "connect_nointr_timeout: connect interrupted and select wait timesout during transient error: %s\n", sock_strerror(sock_errno));
  +                close(s);
  +                sock_set_errno( ERR_TIMEDOUT );
  +                ret = -1;
  +                goto done_connect_try;
  +            }
  +            rset = fixed_rset;
  +            wset = rset;
  +        }
  +        if ( num_ready == 0 ) {
  +            /* timeout */
  +            close(s);
  +            sock_set_errno( ERR_TIMEDOUT );
  +            ret = -1;
  +            break;
  +        } else if ( num_ready < 0 ) 
  +        {
  +            Alarmp( SPLOG_WARNING, SESSION, "connect_nointr_timeout: connect interrupted and error in select wait: %s\n", sock_strerror(sock_errno));
  +            ret = -1;
  +            break;
  +        } 
  +        if (FD_ISSET(s, &rset) || FD_ISSET( s, &wset))
  +        {
  +            err = 0;
  +            elen = sizeof(err);
  +            if (getsockopt(s, SOL_SOCKET, SO_ERROR, (void *)&err, &elen) < 0)
  +            {
  +                ret = -1;
  +                break;
  +            }
  +            if (err)
  +            {
  +                sock_set_errno( err );
  +                ret = -1;
  +            } else {
  +                ret = 0;
  +            }
  +            break;
  +        } else {
  +            Alarmp( SPLOG_FATAL, SESSION, "connect_nointr_timeout: connect interrupted--but select does not indicate either error or connecting socket ready. Impossible condition (i.e. bug).  ret= %d: %s\n", err, sock_strerror(sock_errno));
  +            ret = -1;
  +            break;
  +        }
  +    } /* while error case for connect */
  +    Alarmp( SPLOG_DEBUG, SESSION, "connect_nointr_timeout: After connect, ret = %d error is:%s\n", ret, sock_strerror(sock_errno));
  +
  +done_connect_try:
  +    if ( non_blocking )
  +    {
  +        /* set file descriptor to blocking */
  +        on = 0;
  +        ret_ioctl = ioctl( s, FIONBIO, &on);
  +        temp_time = E_sub_time(E_get_time(), start_time);
  +        *time_out = E_sub_time(*time_out, temp_time);
  +    }
  +    return(ret);
  +}
  +
  +/* Increase socket buffer size to 200Kb if possible.
  + * Used in SP_connect family when connection is established.
  + */
  +static void set_large_socket_buffers(int s)
  +{
  +    int i, on, ret;
  +    sockopt_len_t onlen;
  +
  +    for( i=10; i <= 200; i+=5 )
  +    {
  +        on = 1024*i;
  +
  +        ret = setsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, 4);
  +        if (ret < 0 ) break;
  +        
  +        ret = setsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, 4);
  +        if (ret < 0 ) break;
  +	
  +        onlen = sizeof(on);
  +        ret= getsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, &onlen );
  +        if( on < i*1024 ) break;
  +        Alarmp( SPLOG_INFO, SESSION, "set_large_socket_buffers: set sndbuf %d, ret is %d\n", on, ret );
  +        
  +        onlen = sizeof(on);
  +        ret= getsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, &onlen );
  +        if( on < i*1024 ) break;
  +        Alarmp( SPLOG_INFO, SESSION, "set_large_socket_buffers: set rcvbuf %d, ret is %d\n", on, ret );
  +    }
  +    Alarmp( SPLOG_INFO, SESSION, "set_large_socket_buffers: set sndbuf/rcvbuf to %d\n", 1024*(i-5) );
  +}
  +
   /* API break 3.15.0. version is no longer a float. return 0 on error, 1 if set version number */
   int	SP_version(int *major_version, int *minor_version, int *patch_version)
   {
  @@ -343,7 +479,6 @@
   	int			sp_v1, sp_v2, sp_v3;
   	char			l;
   	int32			on;
  -        sockopt_len_t           onlen;
   
   	struct	sockaddr_in	inet_addr;
   
  @@ -361,6 +496,7 @@
   
   #ifdef ENABLEDEBUG
           Alarm_set_types(SESSION | DEBUG);
  +        Alarm_set_priority(SPLOG_DEBUG);
   #endif
   
           sp_initialize_locks();
  @@ -429,28 +565,7 @@
   			return( COULD_NOT_CONNECT );
   		}
   
  -		for( i=10; i <= 200; i+=5 )
  -		{
  -
  -		    on = 1024*i;
  -
  -		    ret = setsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, 4);
  -		    if (ret < 0 ) break;
  -
  -		    ret = setsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, 4);
  -		    if (ret < 0 ) break;
  -	
  -		    onlen = sizeof(on);
  -		    ret= getsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, &onlen );
  -		    if( on < i*1024 ) break;
  -		    Alarm( SESSION, "SP_connect: set sndbuf %d, ret is %d\n", on, ret );
  -
  -		    onlen = sizeof(on);
  -		    ret= getsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, &onlen );
  -		    if( on < i*1024 ) break;
  -		    Alarm( SESSION, "SP_connect: set rcvbuf %d, ret is %d\n", on, ret );
  -		}
  -		Alarm( SESSION, "SP_connect: set sndbuf/rcvbuf to %d\n", 1024*(i-5) );
  +                set_large_socket_buffers(s);
   
                   on = 1;
                   ret = setsockopt( s, IPPROTO_TCP, TCP_NODELAY, (void *)&on, 4);
  @@ -462,62 +577,7 @@
   		inet_addr.sin_family = AF_INET;
   		inet_addr.sin_port   = htons( port );
   	        memcpy( &inet_addr.sin_addr, &host_address, sizeof(int32) );
  -                /* Handle EINTR while connecting by waiting with select until the 
  -                 * connect completes or fails.  This is a while loop but it is never 
  -                 * done more then once. The while is so we can use 'break' 
  -                 */
  -		while( ((ret = connect( s, (struct sockaddr *)&inet_addr, sizeof(inet_addr) ) ) == -1) 
  -                       && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
  -                {
  -                        fd_set  rset, wset;
  -                        sp_time tout;
  -                        int     err;
  -			sockopt_len_t   elen;
  -
  -
  -                        FD_ZERO(&rset);
  -                        FD_SET(s, &rset);
  -                        wset = rset;
  -                        err = 0;
  -                        tout.sec = 30;
  -                        tout.usec = 0;
  -                        if ( (err = select( FD_SETSIZE, &rset, &wset, NULL, (struct timeval *)&tout)) == 0 )
  -                        {
  -                                /* timeout */
  -                                close(s);
  -                                sock_set_errno( ERR_TIMEDOUT );
  -                                ret = -1;
  -                                break;
  -                        }
  -                        if (err < 0)
  -                        {
  -                                Alarm( SESSION, "SP_connect: connect interrupted and error in select wait: %s\n", sock_strerror(sock_errno));
  -                                ret = -1;
  -                                break;
  -                        }
  -                        if (FD_ISSET(s, &rset) || FD_ISSET( s, &wset))
  -                        {
  -                                elen = sizeof(err);
  -                                if (getsockopt(s, SOL_SOCKET, SO_ERROR, (void *)&err, &elen) < 0)
  -                                {
  -                                        ret = -1;
  -                                        break;
  -                                }
  -                                if (err)
  -                                {
  -                                        sock_set_errno( err );
  -                                        ret = -1;
  -                                } else {
  -                                        ret = 0;
  -                                }
  -                                break;
  -                        } else {
  -                                Alarm( SESSION, "SP_connect: connect interrupted--but socket not selected ret= %d: %s\n", err, sock_strerror(sock_errno));
  -                                ret = -1;
  -                                break;
  -                        }
  -                } /* while error case for connect */
  -                
  +		ret = connect_nointr_timeout( s, (struct sockaddr *)&inet_addr, sizeof(inet_addr), &time_out);
   	}else{
   
   #ifndef	ARCH_PC_WIN95
  @@ -528,85 +588,14 @@
   			return( COULD_NOT_CONNECT );
   		}
   
  -		for( i=10; i <= 200; i+=5 )
  -		{
  -		    on = 1024*i;
  -
  -		    ret = setsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, 4);
  -		    if (ret < 0 ) break;
  -
  -		    ret = setsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, 4);
  -		    if (ret < 0 ) break;
  -	
  -
  -		    onlen = sizeof(on);
  -		    ret= getsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, &onlen );
  -		    if( on < i*1024 ) break;
  -		    Alarm( SESSION, "SP_connect: set sndbuf %d, ret is %d\n", on, ret );
  -
  -		    onlen = sizeof(on);
  -		    ret= getsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, &onlen );
  -		    if( on < i*1024 ) break;
  -		    Alarm( SESSION, "SP_connect: set rcvbuf %d, ret is %d\n", on, ret );
  -		}
  -		Alarm( SESSION, "SP_connect: set sndbuf/rcvbuf to %d\n", 1024*(i-5) );
  +                set_large_socket_buffers(s);
   
   		unix_addr.sun_family = AF_UNIX;
   		sprintf( unix_addr.sun_path, "/tmp/%d", port );
  -		while( ((ret = connect( s, (struct sockaddr *)&unix_addr, sizeof(unix_addr) )) == -1)
  -                       && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
  -                {
  -                        fd_set  rset, wset;
  -                        sp_time tout;
  -                        int     err;
  -			sockopt_len_t   elen;
  -
  -                        FD_ZERO(&rset);
  -                        FD_SET(s, &rset);
  -                        wset = rset;
  -                        tout.sec = 30;
  -                        tout.usec = 0;
  -                        err = 0;
  -                        if ( (err = select( FD_SETSIZE, &rset, &wset, NULL, (struct timeval *)&tout)) == 0 )
  -                        {
  -                                /* timeout */
  -                                close(s);
  -                                sock_set_errno( ETIMEDOUT );
  -                                ret = -1;
  -                                break;
  -                        }
  -                        if (err < 0)
  -                        {
  -                                Alarm( SESSION, "SP_connect: uniz connect interrupted and error in select wait: %s\n", sock_strerror(sock_errno));
  -                                ret = -1;
  -                                break;
  -                        }
  -                        if (FD_ISSET(s, &rset) || FD_ISSET( s, &wset))
  -                        {
  -                                elen = sizeof(err);
  -                                if (getsockopt(s, SOL_SOCKET, SO_ERROR, (void *)&err, &elen) < 0)
  -                                {
  -                                        ret = -1;
  -                                        break;
  -                                }
  -                                if (err)
  -                                {
  -                                        sock_set_errno( err );
  -                                        ret = -1;
  -                                } else {
  -                                        ret = 0;
  -                                }
  -                                break;
  -                        } else {
  -                                Alarm( SESSION, "SP_connect: unix connect interrupted--but socket not selected ret= %d: %s\n",
  -                                       err, sock_strerror(sock_errno));
  -                                ret = -1;
  -                                break;
  -                        }
  -                } /* while error case for connect */
  -#endif	/* ARCH_PC_WIN95 */
  -
  +		ret = connect_nointr_timeout( s, (struct sockaddr *)&unix_addr, sizeof(unix_addr), &time_out);
  +#endif	/* !ARCH_PC_WIN95 */
   	}
  +
   	if( ret < 0 )
   	{
   		Alarm( SESSION, "SP_connect: unable to connect mailbox %d: %s\n", s, sock_strerror(sock_errno));
  @@ -745,10 +734,6 @@
   
   	l=0;
           ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
  -#if 0
  -	while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
  -                ;
  -#endif
   	if( ret <= 0 )
   	{
   		Alarm( SESSION, "SP_connect: unable to read answer %d: %s\n", ret, sock_strerror(sock_errno));
  @@ -762,10 +747,6 @@
   		return( l );
   	}
           ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
  -#if 0
  -	while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
  -                ;
  -#endif
   	if( ret <= 0 )
   	{
   		Alarm( SESSION, "SP_connect: unable to read version %d: %s\n", ret, sock_strerror(sock_errno));
  @@ -775,10 +756,6 @@
   	sp_v1 = l;
   
           ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
  -#if 0
  -	while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
  -                ;
  -#endif
   	if( ret <= 0 )
   	{
   		Alarm( SESSION, "SP_connect: unable to read subversion %d: %s\n", ret, sock_strerror(sock_errno));
  @@ -788,10 +765,6 @@
   	sp_v2 = l;
   
           ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
  -#if 0
  -	while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
  -                ;
  -#endif
   	if( ret <= 0 )
   	{
   		Alarm( SESSION, "SP_connect: unable to read patch version %d: %s\n", ret, sock_strerror(sock_errno));
  @@ -817,10 +790,6 @@
   	}
   
           ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
  -#if 0
  -	while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
  -                ;
  -#endif
   	if( ret <= 0 )
   	{
   		Alarm( SESSION, "SP_connect: unable to read size of group %d: %s\n", ret, sock_strerror(sock_errno));
  @@ -829,10 +798,6 @@
   	}
   	len = l;
           ret = recv_nointr_timeout( s, private_group, len, 0, &time_out);
  -#if 0
  -	while(((ret = recv( s, private_group, len, 0 )) == -1 ) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
  -                ;
  -#endif
   	if( ret <= 0 )
   	{
   		Alarm( SESSION, "SP_connect: unable to read private group %d:  %s\n", ret, sock_strerror(sock_errno));
  
  
  




More information about the Spread-cvs mailing list