[Spread-cvs] commit: r402 - in trunk: daemon docs

jonathan at spread.org jonathan at spread.org
Mon Oct 13 17:00:53 EDT 2008


Author: jonathan
Date: 2008-10-13 17:00:53 -0400 (Mon, 13 Oct 2008)
New Revision: 402

Modified:
   trunk/daemon/Changelog
   trunk/daemon/configuration.c
   trunk/daemon/configuration.h
   trunk/daemon/membership.c
   trunk/daemon/network.c
   trunk/daemon/prot_body.h
   trunk/daemon/protocol.c
   trunk/docs/DynamicConfiguration.txt
Log:
Fix for bug where Memb_token_loss() crashes with an assertion failure when a configuration
reload is triggered while another membership change is occurring. This bug was reported on
spread-users by at least Adam Grossman, Clotho Tsang, Adrian Revill and Mike Perik.

The fix involves delaying the execution of the configuration reload until after the current
membership change has completed and the daemon has moved back into OP state.



Modified: trunk/daemon/Changelog
===================================================================
--- trunk/daemon/Changelog	2008-10-13 20:19:18 UTC (rev 401)
+++ trunk/daemon/Changelog	2008-10-13 21:00:53 UTC (rev 402)
@@ -1,3 +1,25 @@
+Mon Oct 13 16:22:28 2008  Jonathan Stanton  <jonathan at spreadconcepts.com>
+
+	* protocol.c (Prot_initiate_conf_reload, Prot_need_conf_reload): 
+	Change Prot_handle_conf_reload() to only start the reload by calling 
+	the new function Prot_initiate_conf_reload if in OP membership state. 
+	Otherwise a flag is set (Prot_Need_Conf_Reload). The flag is accessed
+	through the Prot_need_conf_reload() function and cleard by the 
+	Prot_clear_need_conf_reload() function. 
+
+	This fixes bug where if a reload is triggerd by the spmonitor when
+	the daemon is already in a membership change it causes a crash because
+	of the asserts in Memb_token_loss(). Bug was reported by Adam Grossman,
+	Clotho Tsang, Adrian Revill, Mike Perik, and others. 
+
+Mon Oct 13 16:21:03 2008  Jonathan Stanton  <jonathan at spreadconcepts.com>
+
+	* membership.c (Shift_to_op): Add Shift_to_op function to capture
+	all cases when daemon membership algorithm changes state to OP.
+	This triggers a configuration reload if one has been queued up.
+
+	
+
 Mon Oct 13 16:16:45 2008  Jonathan Stanton  <jonathan at spreadconcepts.com>
 
 	* session.c, groups.c (G_shift_to_GOP): Add G_shift_to_GOP function 

Modified: trunk/daemon/configuration.c
===================================================================
--- trunk/daemon/configuration.c	2008-10-13 20:19:18 UTC (rev 401)
+++ trunk/daemon/configuration.c	2008-10-13 21:00:53 UTC (rev 402)
@@ -105,6 +105,7 @@
 static  bool    Conf_Debug_Initial_Sequence = FALSE;
 
 static  bool    Conf_Reload_State = FALSE;
+static  bool    Conf_Reload_Singleton_State = FALSE;
 static  configuration *Config_Previous;
 static  proc    *Config_Previous_Procs;
 static  char    Conf_FileName[80];
@@ -159,6 +160,23 @@
         Conf_load_conf_file( file_name, my_name);
 }
 
+bool    Conf_in_reload_singleton_state(void)
+{
+        return(Conf_Reload_Singleton_State);
+}
+
+void    Conf_reload_singleton_state_begin(void)
+{
+
+        Conf_Reload_Singleton_State = TRUE;
+}
+
+void    Conf_reload_singleton_state_end(void)
+{
+
+        Conf_Reload_Singleton_State = FALSE;
+}
+
 bool    Conf_in_reload_state(void)
 {
         return(Conf_Reload_State);

Modified: trunk/daemon/configuration.h
===================================================================
--- trunk/daemon/configuration.h	2008-10-13 20:19:18 UTC (rev 401)
+++ trunk/daemon/configuration.h	2008-10-13 21:00:53 UTC (rev 402)
@@ -109,9 +109,13 @@
 int	        Conf_num_procs_in_seg( configuration *config, int16 seg_index );
 void		Conf_id_to_str( int32u id, char *str );
 char 		Conf_print(configuration *config);
+
 bool            Conf_in_reload_state(void);
 void            Conf_reload_state_begin(void);
 void            Conf_reload_state_end(void);
+bool            Conf_in_reload_singleton_state(void);
+void            Conf_reload_singleton_state_begin(void);
+void            Conf_reload_singleton_state_end(void);
 bool            Conf_reload_initiate(void);
 
 void            Conf_set_debug_initial_sequence(void);

Modified: trunk/daemon/membership.c
===================================================================
--- trunk/daemon/membership.c	2008-10-13 20:19:18 UTC (rev 401)
+++ trunk/daemon/membership.c	2008-10-13 21:00:53 UTC (rev 402)
@@ -112,6 +112,7 @@
 static	void	Memb_handle_foreign( sys_scatter *scat );
 static	void	Memb_handle_form1  ( sys_scatter *scat );
 static	void	Memb_handle_form2  ( sys_scatter *scat );
+static	void	Shift_to_op();
 static	void	Shift_to_seg();
 static	void	Gather_or_represented();
 static	void	Shift_to_gather();
@@ -140,8 +141,7 @@
 	int32		current_subnet;
 	int		i, num_seg;
 
-	State = OP;
-	GlobalStatus.state = OP;
+        Shift_to_op();
 	GlobalStatus.membership_changes = 0;
 
 	My = Conf_my();
@@ -851,6 +851,16 @@
     Conf_append_id_to_seg(&Membership.segments[My.seg_index], My.id);
 }
 
+static	void	Shift_to_op()
+{
+       	State = OP;
+	GlobalStatus.state = OP;
+
+        if ( Prot_need_conf_reload() ) {
+            E_queue( Prot_initiate_conf_reload, 0, NULL, Zero_timeout );
+        }
+}
+
 static	void	Shift_to_seg()
 {
 	State = SEG;
@@ -919,8 +929,7 @@
 			/* clear everything and go back to op */
 			E_dequeue( Send_join, 0, NULL);
 			E_queue( Memb_lookup_new_members, 0, NULL, Lookup_timeout );
-			State = OP;
-			GlobalStatus.state = OP;
+                        Shift_to_op();
 		}else{
 			/* create and send form token */
 			Create_form1();
@@ -932,8 +941,7 @@
 			Alarm( MEMB, "Form_or_fail:failed, return to OP\n");
 			E_dequeue( Send_join, 0, NULL );
 			E_queue( Memb_lookup_new_members, 0, NULL, Lookup_timeout );
-			State = OP;
-			GlobalStatus.state = OP;
+                        Shift_to_op();
 		}else{
 			Alarm( MEMB, "Form_or_fail: failed to gather\n");
 			/* failed to gather again */
@@ -2276,9 +2284,9 @@
 
 	Alarm( MEMB, "Memb_regular\n");
 	Transitional = 0;
-	State = OP;
-	GlobalStatus.state = OP;
+
 	GlobalStatus.membership_changes++;
+
 	Membership = Future_membership;
 	Membership_id = Future_membership_id;
 	Reg_membership = Membership;
@@ -2300,6 +2308,8 @@
 		E_queue( Memb_lookup_new_members, 0, NULL, Lookup_timeout );
 	printf("Membership id is ( %d, %d)\n", Membership_id.proc_id, Membership_id.time );
 	printf("%c", Conf_print( &Membership ) );
+
+        Shift_to_op();
 }
 
 void	Flip_members( members_info *members_ptr )

Modified: trunk/daemon/network.c
===================================================================
--- trunk/daemon/network.c	2008-10-13 20:19:18 UTC (rev 401)
+++ trunk/daemon/network.c	2008-10-13 21:00:53 UTC (rev 402)
@@ -746,7 +746,7 @@
 {
         int     i;
 
-        if ( Conf_in_reload_state() ) {
+        if ( Conf_in_reload_singleton_state() ) {
                 Alarmp(SPLOG_DEBUG, NETWORK, "Net_set_partition: Can not change partition since daemon configuration change in progress\n");
                 return;
         }

Modified: trunk/daemon/prot_body.h
===================================================================
--- trunk/daemon/prot_body.h	2008-10-13 20:19:18 UTC (rev 401)
+++ trunk/daemon/prot_body.h	2008-10-13 21:00:53 UTC (rev 402)
@@ -99,5 +99,8 @@
 
 void	Prot_token_hurry();
 void	Discard_packets();
+void    Prot_initiate_conf_reload( void );
+bool    Prot_need_conf_reload( void );
+void    Prot_clear_need_conf_reload( void );
 
 #endif	/* INC_PROT_BODY */

Modified: trunk/daemon/protocol.c
===================================================================
--- trunk/daemon/protocol.c	2008-10-13 20:19:18 UTC (rev 401)
+++ trunk/daemon/protocol.c	2008-10-13 21:00:53 UTC (rev 402)
@@ -70,6 +70,9 @@
 static	sys_scatter	Hurry_pack;
 static  sp_time         Zero_timeout    = {  0, 0};
 
+/* Used to indicate a need to reload configuration at end of current membership */
+static  bool            Prot_Need_Conf_Reload  = FALSE;
+
 /* ### Pack: 1 line */
 static	packet_info	Buffered_packets[ARCH_SCATTER_SIZE];
 
@@ -698,18 +701,54 @@
 	GlobalStatus.token_rounds = Token_rounds;
 }
 
+/* Provide boolean result of whether the membership system needs to initiate a configuration reload
+ * because it was delayed by an ongoing membership change
+ */
+bool    Prot_need_conf_reload( void )
+{
+    return( Prot_Need_Conf_Reload );
+}
+
+void    Prot_clear_need_conf_reload( void )
+{
+    Prot_Need_Conf_Reload = FALSE;
+}
+
+/* If we are in OP state for the daemon membership, initiate conf reload to new configuration file and membership,
+ * If we are not, then a membership change is ongoing and we need to let that complete before starting a 
+ * new one to load in the new configuration.
+ */
+static  void    Prot_handle_conf_reload(sys_scatter *scat)
+{
+    if ( Memb_state() == OP ) {
+        Prot_initiate_conf_reload();
+    } else {
+        Prot_Need_Conf_Reload = TRUE;
+    }
+}
+
 /* Basic algorithm:
  * 1) have configuration code load new conf file and check for modifications to conf.
  * 2) If only add/sub of daemons, then initiate membership change with token_loss and return;
  * 3) else, then set Conf_reload_state, create singleton partition, and schedule token_loss.
  * 4) When membership completes in Discard_packets() cleanup partition and probe for new members.
  */
-static  void    Prot_handle_conf_reload(sys_scatter *scat)
+void    Prot_initiate_conf_reload( void )
 {
         bool    need_memb_partition;
         int16   singleton_partition[MAX_PROCS_RING];
         int     i;
 
+        if (Memb_state() != OP ) {
+            /* This is a race condition, that the Prot_initiate_conf_reload was scheduled when OP state, 
+             * but another membership occured before it was executed.
+             * The membership system will requeue this function when it reaches OP state again.
+             */
+            return;
+        }
+        /* Disable queueing of this function when OP state reached in membership */
+        Prot_clear_need_conf_reload();
+
         need_memb_partition = Conf_reload_initiate();
 
         /* Signal all subsystems to update Conf and My strucures */
@@ -729,7 +768,7 @@
                 }
                 Net_set_partition(singleton_partition);
 
-                Conf_reload_state_begin();
+                Conf_reload_singleton_state_begin();
         }
         E_queue( Memb_token_loss, 0, NULL, Zero_timeout );
 }
@@ -1233,14 +1272,14 @@
          * Remove partition
          * Initiate Memb_lookup() to find other daemons 
          */
-        if( Conf_in_reload_state() ) {
+        if( Conf_in_reload_singleton_state() ) {
                 /* GOP state equals value 1, but is private declaration in groups.c */
                 if ( (GlobalStatus.gstate != 1 ) || ( Conf_num_procs( &Reg_membership ) != 1 ) ) {
                         Alarmp( SPLOG_FATAL, MEMB, "Discard_packets: Failed to reload configuration - gstate: %d and num_procs in membership: %d\n", GlobalStatus.gstate, Conf_num_procs( &Reg_membership) );
                 }
                 Net_clear_partition();
                 E_queue( Memb_lookup_new_members, 0, NULL, Zero_timeout);
-                Conf_reload_state_end();
+                Conf_reload_singleton_state_end();
         }
 
 	/* set variables for next membership */

Modified: trunk/docs/DynamicConfiguration.txt
===================================================================
--- trunk/docs/DynamicConfiguration.txt	2008-10-13 20:19:18 UTC (rev 401)
+++ trunk/docs/DynamicConfiguration.txt	2008-10-13 21:00:53 UTC (rev 402)
@@ -131,9 +131,12 @@
 
 Daemon Algorithm Details:
 
-When a spmonitor command to reload configuration is received, the
-daemon executes the following steps. (See function
-Prot_handle_conf_reload() for the code).
+When a spmonitor command to reload configuration is received, if 
+the daemon is in OP state, the function Prot_initiate_conf_reload()
+executes the following steps to reload the configuration. If the
+daemon not in OP state (i.e it is already working on a membership), a
+flag is set and when the membership completes (by moving into OP state)
+the Prot_initiate_conf_reload() function is then executed.
 
 1) Call function in configuration code to load new configuration file
    and examine the type of changes to the configuration. If the




More information about the Spread-cvs mailing list