[Spread-cvs] cvs commit: spread/daemon Readme.txt sp.c
jonathan at spread.org
jonathan at spread.org
Tue Oct 5 10:08:10 EDT 2004
jonathan 04/10/05 10:08:10
Modified: daemon Readme.txt sp.c
Log:
Make SP_connect_timeout call connect() on a non-blocking fd. This prevents
the client from blocking when a daemon is non-responsive. The connect processing
was broken out into a new connect_nointr_timeout() function which both handles
interrupts/signals during connect and makes the connect non-blocking.
Some old code that was commented out was removed from the SP_connect_timeout
function and some small readability improvements were made by breaking the
setsockopt() calls into a separate set_large_socket_buffers() function.
Revision Changes Path
1.66 +4 -0 spread/daemon/Readme.txt
Index: Readme.txt
===================================================================
RCS file: /storage/cvsroot/spread/daemon/Readme.txt,v
retrieving revision 1.65
retrieving revision 1.66
diff -u -r1.65 -r1.66
--- Readme.txt 3 Oct 2004 05:21:11 -0000 1.65
+++ Readme.txt 5 Oct 2004 14:08:10 -0000 1.66
@@ -101,6 +101,10 @@
16) Fix EVS bug where AGREED messages may be delivered before a transitional signal
on some daemons and after it on others. Bug found and patch created by
Ryan Caudy.
+17) Make SP_connect_timeout() calls non-blocking for the actual 'connect()' call.
+ This should fix the issue reported by Shlomi Yaakobovich where a hung daemon
+ causes new connections to also hang in connect. Also includes slight cleanup
+ of connect code path.
SOURCE INSTALL:
---------------
1.14 +143 -178 spread/daemon/sp.c
Index: sp.c
===================================================================
RCS file: /storage/cvsroot/spread/daemon/sp.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -r1.13 -r1.14
--- sp.c 23 Sep 2004 23:15:18 -0000 1.13
+++ sp.c 5 Oct 2004 14:08:10 -0000 1.14
@@ -221,6 +221,142 @@
return(ret);
}
+/* This calls connect() with the additional features of ignoring syscall interruptions
+ * caused by signals delivered to this application, and of returning in at most *time_out time.
+ * When it returns, the *time_out variable will have been modified to contain the value:
+ * old *time_out - time spent in this function.
+ *
+ * If *time_out == {0,0} then the call is made blocking and will NOT timeout.
+ */
+static int connect_nointr_timeout(int s, struct sockaddr *sname, socklen_t slen, sp_time *time_out)
+{
+ int ret, num_ready;
+ fd_set rset,fixed_rset,wset;
+ sp_time start_time, temp_time, target_time, wait_time;
+ int non_blocking = 0;
+ int err;
+ int on;
+ int ret_ioctl;
+ sockopt_len_t elen;
+
+ if ( E_compare_time(Zero_timeout, *time_out) < 0 )
+ {
+ non_blocking = 1;
+ start_time = E_get_time();
+ target_time = E_add_time(start_time, *time_out);
+ wait_time = *time_out;
+ /* set file descriptor to non-blocking */
+ on = 1;
+ ret_ioctl = ioctl( s, FIONBIO, &on);
+ }
+ /* Handle EINTR while connecting by waiting with select until the
+ * connect completes or fails. This is a while loop but it is never
+ * done more than once. The while is so we can use 'break'
+ */
+ while( ((ret = connect( s, sname, slen ) ) == -1)
+ && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK) || (sock_errno == EINPROGRESS)) )
+ {
+ FD_ZERO(&fixed_rset);
+ FD_SET(s, &fixed_rset);
+ rset = fixed_rset;
+ wset = rset;
+ Alarmp( SPLOG_DEBUG, SESSION, "connect_nointr_timeout: connect in progress for socket %d, now wait in select\n", s);
+ /* wait for select to timeout (num_ready == 0), give a permanent error (num_ready < 0 && sock_errno != transient). If transient error, retry after checking to make sure timeout has not expired */
+ while( ((num_ready = select(s+1, &rset, &wset, NULL, (struct timeval *)&wait_time)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
+ {
+ temp_time = E_get_time();
+ if (E_compare_time(temp_time, target_time) < 0 ) {
+ wait_time = E_sub_time(target_time, temp_time);
+ } else {
+ Alarmp( SPLOG_WARNING, SESSION, "connect_nointr_timeout: connect interrupted and select wait timesout during transient error: %s\n", sock_strerror(sock_errno));
+ close(s);
+ sock_set_errno( ERR_TIMEDOUT );
+ ret = -1;
+ goto done_connect_try;
+ }
+ rset = fixed_rset;
+ wset = rset;
+ }
+ if ( num_ready == 0 ) {
+ /* timeout */
+ close(s);
+ sock_set_errno( ERR_TIMEDOUT );
+ ret = -1;
+ break;
+ } else if ( num_ready < 0 )
+ {
+ Alarmp( SPLOG_WARNING, SESSION, "connect_nointr_timeout: connect interrupted and error in select wait: %s\n", sock_strerror(sock_errno));
+ ret = -1;
+ break;
+ }
+ if (FD_ISSET(s, &rset) || FD_ISSET( s, &wset))
+ {
+ err = 0;
+ elen = sizeof(err);
+ if (getsockopt(s, SOL_SOCKET, SO_ERROR, (void *)&err, &elen) < 0)
+ {
+ ret = -1;
+ break;
+ }
+ if (err)
+ {
+ sock_set_errno( err );
+ ret = -1;
+ } else {
+ ret = 0;
+ }
+ break;
+ } else {
+ Alarmp( SPLOG_FATAL, SESSION, "connect_nointr_timeout: connect interrupted--but select does not indicate either error or connecting socket ready. Impossible condition (i.e. bug). ret= %d: %s\n", err, sock_strerror(sock_errno));
+ ret = -1;
+ break;
+ }
+ } /* while error case for connect */
+ Alarmp( SPLOG_DEBUG, SESSION, "connect_nointr_timeout: After connect, ret = %d error is:%s\n", ret, sock_strerror(sock_errno));
+
+done_connect_try:
+ if ( non_blocking )
+ {
+ /* set file descriptor to blocking */
+ on = 0;
+ ret_ioctl = ioctl( s, FIONBIO, &on);
+ temp_time = E_sub_time(E_get_time(), start_time);
+ *time_out = E_sub_time(*time_out, temp_time);
+ }
+ return(ret);
+}
+
+/* Increase socket buffer size to 200Kb if possible.
+ * Used in SP_connect family when connection is established.
+ */
+static void set_large_socket_buffers(int s)
+{
+ int i, on, ret;
+ sockopt_len_t onlen;
+
+ for( i=10; i <= 200; i+=5 )
+ {
+ on = 1024*i;
+
+ ret = setsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, 4);
+ if (ret < 0 ) break;
+
+ ret = setsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, 4);
+ if (ret < 0 ) break;
+
+ onlen = sizeof(on);
+ ret= getsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, &onlen );
+ if( on < i*1024 ) break;
+ Alarmp( SPLOG_INFO, SESSION, "set_large_socket_buffers: set sndbuf %d, ret is %d\n", on, ret );
+
+ onlen = sizeof(on);
+ ret= getsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, &onlen );
+ if( on < i*1024 ) break;
+ Alarmp( SPLOG_INFO, SESSION, "set_large_socket_buffers: set rcvbuf %d, ret is %d\n", on, ret );
+ }
+ Alarmp( SPLOG_INFO, SESSION, "set_large_socket_buffers: set sndbuf/rcvbuf to %d\n", 1024*(i-5) );
+}
+
/* API break 3.15.0. version is no longer a float. return 0 on error, 1 if set version number */
int SP_version(int *major_version, int *minor_version, int *patch_version)
{
@@ -343,7 +479,6 @@
int sp_v1, sp_v2, sp_v3;
char l;
int32 on;
- sockopt_len_t onlen;
struct sockaddr_in inet_addr;
@@ -361,6 +496,7 @@
#ifdef ENABLEDEBUG
Alarm_set_types(SESSION | DEBUG);
+ Alarm_set_priority(SPLOG_DEBUG);
#endif
sp_initialize_locks();
@@ -429,28 +565,7 @@
return( COULD_NOT_CONNECT );
}
- for( i=10; i <= 200; i+=5 )
- {
-
- on = 1024*i;
-
- ret = setsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, 4);
- if (ret < 0 ) break;
-
- ret = setsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, 4);
- if (ret < 0 ) break;
-
- onlen = sizeof(on);
- ret= getsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, &onlen );
- if( on < i*1024 ) break;
- Alarm( SESSION, "SP_connect: set sndbuf %d, ret is %d\n", on, ret );
-
- onlen = sizeof(on);
- ret= getsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, &onlen );
- if( on < i*1024 ) break;
- Alarm( SESSION, "SP_connect: set rcvbuf %d, ret is %d\n", on, ret );
- }
- Alarm( SESSION, "SP_connect: set sndbuf/rcvbuf to %d\n", 1024*(i-5) );
+ set_large_socket_buffers(s);
on = 1;
ret = setsockopt( s, IPPROTO_TCP, TCP_NODELAY, (void *)&on, 4);
@@ -462,62 +577,7 @@
inet_addr.sin_family = AF_INET;
inet_addr.sin_port = htons( port );
memcpy( &inet_addr.sin_addr, &host_address, sizeof(int32) );
- /* Handle EINTR while connecting by waiting with select until the
- * connect completes or fails. This is a while loop but it is never
- * done more then once. The while is so we can use 'break'
- */
- while( ((ret = connect( s, (struct sockaddr *)&inet_addr, sizeof(inet_addr) ) ) == -1)
- && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
- {
- fd_set rset, wset;
- sp_time tout;
- int err;
- sockopt_len_t elen;
-
-
- FD_ZERO(&rset);
- FD_SET(s, &rset);
- wset = rset;
- err = 0;
- tout.sec = 30;
- tout.usec = 0;
- if ( (err = select( FD_SETSIZE, &rset, &wset, NULL, (struct timeval *)&tout)) == 0 )
- {
- /* timeout */
- close(s);
- sock_set_errno( ERR_TIMEDOUT );
- ret = -1;
- break;
- }
- if (err < 0)
- {
- Alarm( SESSION, "SP_connect: connect interrupted and error in select wait: %s\n", sock_strerror(sock_errno));
- ret = -1;
- break;
- }
- if (FD_ISSET(s, &rset) || FD_ISSET( s, &wset))
- {
- elen = sizeof(err);
- if (getsockopt(s, SOL_SOCKET, SO_ERROR, (void *)&err, &elen) < 0)
- {
- ret = -1;
- break;
- }
- if (err)
- {
- sock_set_errno( err );
- ret = -1;
- } else {
- ret = 0;
- }
- break;
- } else {
- Alarm( SESSION, "SP_connect: connect interrupted--but socket not selected ret= %d: %s\n", err, sock_strerror(sock_errno));
- ret = -1;
- break;
- }
- } /* while error case for connect */
-
+ ret = connect_nointr_timeout( s, (struct sockaddr *)&inet_addr, sizeof(inet_addr), &time_out);
}else{
#ifndef ARCH_PC_WIN95
@@ -528,85 +588,14 @@
return( COULD_NOT_CONNECT );
}
- for( i=10; i <= 200; i+=5 )
- {
- on = 1024*i;
-
- ret = setsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, 4);
- if (ret < 0 ) break;
-
- ret = setsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, 4);
- if (ret < 0 ) break;
-
-
- onlen = sizeof(on);
- ret= getsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *)&on, &onlen );
- if( on < i*1024 ) break;
- Alarm( SESSION, "SP_connect: set sndbuf %d, ret is %d\n", on, ret );
-
- onlen = sizeof(on);
- ret= getsockopt( s, SOL_SOCKET, SO_RCVBUF, (void *)&on, &onlen );
- if( on < i*1024 ) break;
- Alarm( SESSION, "SP_connect: set rcvbuf %d, ret is %d\n", on, ret );
- }
- Alarm( SESSION, "SP_connect: set sndbuf/rcvbuf to %d\n", 1024*(i-5) );
+ set_large_socket_buffers(s);
unix_addr.sun_family = AF_UNIX;
sprintf( unix_addr.sun_path, "/tmp/%d", port );
- while( ((ret = connect( s, (struct sockaddr *)&unix_addr, sizeof(unix_addr) )) == -1)
- && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
- {
- fd_set rset, wset;
- sp_time tout;
- int err;
- sockopt_len_t elen;
-
- FD_ZERO(&rset);
- FD_SET(s, &rset);
- wset = rset;
- tout.sec = 30;
- tout.usec = 0;
- err = 0;
- if ( (err = select( FD_SETSIZE, &rset, &wset, NULL, (struct timeval *)&tout)) == 0 )
- {
- /* timeout */
- close(s);
- sock_set_errno( ETIMEDOUT );
- ret = -1;
- break;
- }
- if (err < 0)
- {
- Alarm( SESSION, "SP_connect: uniz connect interrupted and error in select wait: %s\n", sock_strerror(sock_errno));
- ret = -1;
- break;
- }
- if (FD_ISSET(s, &rset) || FD_ISSET( s, &wset))
- {
- elen = sizeof(err);
- if (getsockopt(s, SOL_SOCKET, SO_ERROR, (void *)&err, &elen) < 0)
- {
- ret = -1;
- break;
- }
- if (err)
- {
- sock_set_errno( err );
- ret = -1;
- } else {
- ret = 0;
- }
- break;
- } else {
- Alarm( SESSION, "SP_connect: unix connect interrupted--but socket not selected ret= %d: %s\n",
- err, sock_strerror(sock_errno));
- ret = -1;
- break;
- }
- } /* while error case for connect */
-#endif /* ARCH_PC_WIN95 */
-
+ ret = connect_nointr_timeout( s, (struct sockaddr *)&unix_addr, sizeof(unix_addr), &time_out);
+#endif /* !ARCH_PC_WIN95 */
}
+
if( ret < 0 )
{
Alarm( SESSION, "SP_connect: unable to connect mailbox %d: %s\n", s, sock_strerror(sock_errno));
@@ -745,10 +734,6 @@
l=0;
ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
-#if 0
- while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
- ;
-#endif
if( ret <= 0 )
{
Alarm( SESSION, "SP_connect: unable to read answer %d: %s\n", ret, sock_strerror(sock_errno));
@@ -762,10 +747,6 @@
return( l );
}
ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
-#if 0
- while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
- ;
-#endif
if( ret <= 0 )
{
Alarm( SESSION, "SP_connect: unable to read version %d: %s\n", ret, sock_strerror(sock_errno));
@@ -775,10 +756,6 @@
sp_v1 = l;
ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
-#if 0
- while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
- ;
-#endif
if( ret <= 0 )
{
Alarm( SESSION, "SP_connect: unable to read subversion %d: %s\n", ret, sock_strerror(sock_errno));
@@ -788,10 +765,6 @@
sp_v2 = l;
ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
-#if 0
- while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
- ;
-#endif
if( ret <= 0 )
{
Alarm( SESSION, "SP_connect: unable to read patch version %d: %s\n", ret, sock_strerror(sock_errno));
@@ -817,10 +790,6 @@
}
ret = recv_nointr_timeout( s, &l, 1, 0, &time_out);
-#if 0
- while(((ret = recv( s, &l, 1, 0)) == -1) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
- ;
-#endif
if( ret <= 0 )
{
Alarm( SESSION, "SP_connect: unable to read size of group %d: %s\n", ret, sock_strerror(sock_errno));
@@ -829,10 +798,6 @@
}
len = l;
ret = recv_nointr_timeout( s, private_group, len, 0, &time_out);
-#if 0
- while(((ret = recv( s, private_group, len, 0 )) == -1 ) && ((sock_errno == EINTR) || (sock_errno == EAGAIN) || (sock_errno == EWOULDBLOCK)) )
- ;
-#endif
if( ret <= 0 )
{
Alarm( SESSION, "SP_connect: unable to read private group %d: %s\n", ret, sock_strerror(sock_errno));
More information about the Spread-cvs
mailing list