People sometimes think they have a use case for influencing which node will be the DC.
Sometimes it is latency (certain CLI commands work faster when done on the DC), sometimes they add a "mostly quorum" node which may not be quite up to the task of being DC. Prohibiting a node from becoming DC completely would mean it cannot even be cleanly shut down (with 1.0.x, no MCP), or act on its own resources for certain no-quorum policies. So here is a patch I have been asked to present for discussion, against Pacemaker 1.0, that introduces a "dc-prio" configuration parameter, which will add some skew to the election algorithm. Open questions: * Does it make sense at all? * Election algorithm compatibility, stability: will the election be correct if some nodes have this patch, and some don't? * How can it be improved so that a node with dc-prio=0 will "give up" its DC-role as soon as there is at least one other node with dc-prio > 0? Lars --- ./crmd/election.c.orig 2011-05-11 11:36:05.577329600 +0200 +++ ./crmd/election.c 2011-05-12 13:49:04.671484200 +0200 @@ -29,6 +29,7 @@ GHashTable *voted = NULL; uint highest_born_on = -1; static int current_election_id = 1; +static int our_dc_prio = -1; /* A_ELECTION_VOTE */ void @@ -55,6 +56,20 @@ break; } + if (our_dc_prio < 0) { + char * dc_prio_str = getenv("HA_dc_prio"); + + if (dc_prio_str == NULL) { + our_dc_prio = 1; + } else { + our_dc_prio = atoi(dc_prio_str); + } + } + + if (!our_dc_prio) { + not_voting = TRUE; + } + if(not_voting == FALSE) { if(is_set(fsa_input_register, R_STARTING)) { not_voting = TRUE; @@ -72,12 +87,13 @@ } vote = create_request( - CRM_OP_VOTE, NULL, NULL, + our_dc_prio?CRM_OP_VOTE:CRM_OP_NOVOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); current_election_id++; crm_xml_add(vote, F_CRM_ELECTION_OWNER, fsa_our_uuid); crm_xml_add_int(vote, F_CRM_ELECTION_ID, current_election_id); + crm_xml_add_int(vote, F_CRM_DC_PRIO, our_dc_prio); send_cluster_message(NULL, crm_msg_crmd, vote, TRUE); free_xml(vote); @@ -188,6 +204,7 @@ fsa_data_t *msg_data) { int election_id = -1; + int 
your_dc_prio = 1; int log_level = LOG_INFO; gboolean done = FALSE; gboolean we_loose = FALSE; @@ -216,6 +233,17 @@ your_version = crm_element_value(vote->msg, F_CRM_VERSION); election_owner = crm_element_value(vote->msg, F_CRM_ELECTION_OWNER); crm_element_value_int(vote->msg, F_CRM_ELECTION_ID, &election_id); + crm_element_value_int(vote->msg, F_CRM_DC_PRIO, &your_dc_prio); + + if (our_dc_prio < 0) { + char * dc_prio_str = getenv("HA_dc_prio"); + + if (dc_prio_str == NULL) { + our_dc_prio = 1; + } else { + our_dc_prio = atoi(dc_prio_str); + } + } CRM_CHECK(vote_from != NULL, vote_from = fsa_our_uname); @@ -269,6 +297,13 @@ reason = "Recorded"; done = TRUE; + } else if(our_dc_prio < your_dc_prio) { + reason = "DC Prio"; + we_loose = TRUE; + + } else if(our_dc_prio > your_dc_prio) { + reason = "DC Prio"; + } else if(compare_version(your_version, CRM_FEATURE_SET) < 0) { reason = "Version"; we_loose = TRUE; @@ -328,6 +363,7 @@ crm_xml_add(novote, F_CRM_ELECTION_OWNER, election_owner); crm_xml_add_int(novote, F_CRM_ELECTION_ID, election_id); + crm_xml_add_int(novote, F_CRM_DC_PRIO, our_dc_prio); send_cluster_message(vote_from, crm_msg_crmd, novote, TRUE); free_xml(novote); --- ./include/crm/msg_xml.h.orig 2011-05-11 18:22:08.061726000 +0200 +++ ./include/crm/msg_xml.h 2011-05-11 18:24:17.405132000 +0200 @@ -32,6 +32,7 @@ #define F_CRM_ORIGIN "origin" #define F_CRM_JOIN_ID "join_id" #define F_CRM_ELECTION_ID "election-id" +#define F_CRM_DC_PRIO "dc-prio" #define F_CRM_ELECTION_OWNER "election-owner" #define F_CRM_TGRAPH "crm-tgraph" #define F_CRM_TGRAPH_INPUT "crm-tgraph-in" --- ./lib/ais/plugin.c.orig 2011-05-11 11:29:38.496116000 +0200 +++ ./lib/ais/plugin.c 2011-05-11 17:28:32.385425300 +0200 @@ -421,6 +421,9 @@ get_config_opt(pcmk_api, local_handle, "use_logd", &value, "no"); pcmk_env.use_logd = value; + get_config_opt(pcmk_api, local_handle, "dc_prio", &value, "1"); + pcmk_env.dc_prio = value; + get_config_opt(pcmk_api, local_handle, "use_mgmtd", &value, "no"); 
if(ais_get_boolean(value) == FALSE) { int lpc = 0; @@ -584,6 +587,7 @@ pcmk_env.logfile = NULL; pcmk_env.use_logd = "false"; pcmk_env.syslog = "daemon"; + pcmk_env.dc_prio = "1"; if(cs_uid != root_uid) { ais_err("Corosync must be configured to start as 'root'," --- ./lib/ais/utils.c.orig 2011-05-11 11:27:08.460183200 +0200 +++ ./lib/ais/utils.c 2011-05-11 17:29:09.182064800 +0200 @@ -171,6 +171,7 @@ setenv("HA_logfacility", pcmk_env.syslog, 1); setenv("HA_LOGFACILITY", pcmk_env.syslog, 1); setenv("HA_use_logd", pcmk_env.use_logd, 1); + setenv("HA_dc_prio", pcmk_env.dc_prio, 1); if(pcmk_env.logfile) { setenv("HA_debugfile", pcmk_env.logfile, 1); } --- ./lib/ais/utils.h.orig 2011-05-11 11:26:12.757414700 +0200 +++ ./lib/ais/utils.h 2011-05-11 17:36:34.194841700 +0200 @@ -226,6 +226,7 @@ const char *syslog; const char *logfile; const char *use_logd; + const char *dc_prio; }; extern struct pcmk_env_s pcmk_env; -- : Lars Ellenberg : LINBIT | Your Way to High Availability : DRBD/HA support and consulting http://www.linbit.com DRBD® and LINBIT® are registered trademarks of LINBIT, Austria. _______________________________________________ Pacemaker mailing list: Pacemaker@oss.clusterlabs.org http://oss.clusterlabs.org/mailman/listinfo/pacemaker Project Home: http://www.clusterlabs.org Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf Bugs: http://bugs.clusterlabs.org