403 lines
10 KiB
C
403 lines
10 KiB
C
/*-
|
|
* See the file LICENSE for redistribution information.
|
|
*
|
|
* Copyright (c) 2005,2008 Oracle. All rights reserved.
|
|
*
|
|
* $Id: repmgr_elect.c 63573 2008-05-23 21:43:21Z trent.nelson $
|
|
*/
|
|
|
|
#include "db_config.h"
|
|
|
|
#define __INCLUDE_NETWORKING 1
|
|
#include "db_int.h"
|
|
|
|
static int __repmgr_is_ready __P((ENV *));
|
|
static int __repmgr_elect_main __P((ENV *));
|
|
static void *__repmgr_elect_thread __P((void *));
|
|
static int start_election_thread __P((ENV *));
|
|
|
|
/*
|
|
* Starts the election thread, or wakes up an existing one, starting off with
|
|
* the specified operation (an election, or a call to rep_start(CLIENT), or
|
|
* nothing). Avoid multiple concurrent elections.
|
|
*
|
|
* PUBLIC: int __repmgr_init_election __P((ENV *, int));
|
|
*
|
|
* !!!
|
|
* Caller must hold mutex.
|
|
*/
|
|
int
|
|
__repmgr_init_election(env, initial_operation)
|
|
ENV *env;
|
|
int initial_operation;
|
|
{
|
|
DB_REP *db_rep;
|
|
int ret;
|
|
|
|
db_rep = env->rep_handle;
|
|
if (db_rep->finished) {
|
|
RPRINT(env, DB_VERB_REPMGR_MISC, (env,
|
|
"ignoring elect thread request %d; repmgr is finished",
|
|
initial_operation));
|
|
return (0);
|
|
}
|
|
|
|
db_rep->operation_needed = initial_operation;
|
|
if (db_rep->elect_thread == NULL)
|
|
ret = start_election_thread(env);
|
|
else if (db_rep->elect_thread->finished) {
|
|
RPRINT(env, DB_VERB_REPMGR_MISC,
|
|
(env, "join dead elect thread"));
|
|
if ((ret = __repmgr_thread_join(db_rep->elect_thread)) != 0)
|
|
return (ret);
|
|
__os_free(env, db_rep->elect_thread);
|
|
db_rep->elect_thread = NULL;
|
|
ret = start_election_thread(env);
|
|
} else {
|
|
RPRINT(env, DB_VERB_REPMGR_MISC,
|
|
(env, "reusing existing elect thread"));
|
|
if ((ret = __repmgr_signal(&db_rep->check_election)) != 0)
|
|
__db_err(env, ret, "can't signal election thread");
|
|
}
|
|
return (ret);
|
|
}
|
|
|
|
/*
|
|
* !!!
|
|
* Caller holds mutex.
|
|
*/
|
|
static int
|
|
start_election_thread(env)
|
|
ENV *env;
|
|
{
|
|
DB_REP *db_rep;
|
|
REPMGR_RUNNABLE *elector;
|
|
int ret;
|
|
|
|
db_rep = env->rep_handle;
|
|
|
|
if ((ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &elector))
|
|
!= 0)
|
|
return (ret);
|
|
elector->env = env;
|
|
elector->run = __repmgr_elect_thread;
|
|
|
|
if ((ret = __repmgr_thread_start(env, elector)) == 0)
|
|
db_rep->elect_thread = elector;
|
|
else
|
|
__os_free(env, elector);
|
|
|
|
return (ret);
|
|
}
|
|
|
|
static void *
|
|
__repmgr_elect_thread(args)
|
|
void *args;
|
|
{
|
|
ENV *env = args;
|
|
int ret;
|
|
|
|
RPRINT(env, DB_VERB_REPMGR_MISC, (env, "starting election thread"));
|
|
|
|
if ((ret = __repmgr_elect_main(env)) != 0) {
|
|
__db_err(env, ret, "election thread failed");
|
|
__repmgr_thread_failure(env, ret);
|
|
}
|
|
|
|
RPRINT(env, DB_VERB_REPMGR_MISC, (env, "election thread is exiting"));
|
|
return (NULL);
|
|
}
|
|
|
|
static int
|
|
__repmgr_elect_main(env)
|
|
ENV *env;
|
|
{
|
|
DBT my_addr;
|
|
DB_ENV *dbenv;
|
|
DB_REP *db_rep;
|
|
#ifdef DB_WIN32
|
|
DWORD duration;
|
|
#else
|
|
struct timespec deadline;
|
|
#endif
|
|
u_int32_t nsites, nvotes;
|
|
int done, failure_recovery, last_op;
|
|
int need_success, ret, succeeded, to_do;
|
|
|
|
COMPQUIET(need_success, TRUE);
|
|
|
|
dbenv = env->dbenv;
|
|
db_rep = env->rep_handle;
|
|
last_op = 0;
|
|
failure_recovery = succeeded = FALSE;
|
|
|
|
/*
|
|
* db_rep->operation_needed is the mechanism by which the outside world
|
|
* (running in a different thread) tells us what it wants us to do. It
|
|
* is obviously relevant when we're just starting up. But it can also
|
|
* be set if a subsequent request for us to do something occurs while
|
|
* we're still looping.
|
|
*
|
|
* ELECT_FAILURE_ELECTION asks us to start by doing an election, but to
|
|
* do so in failure recovery mode. This failure recovery mode may
|
|
* persist through several loop iterations: as long as it takes us to
|
|
* succeed in finding a master, or until we get asked to perform a new
|
|
* request. Thus the time for mapping ELECT_FAILURE_ELECTION to the
|
|
* internal ELECT_ELECTION, as well as the setting of the failure
|
|
* recovery flag, is at the point we receive the new request from
|
|
* operation_needed (either here, or within the loop below).
|
|
*/
|
|
LOCK_MUTEX(db_rep->mutex);
|
|
if (db_rep->finished) {
|
|
db_rep->elect_thread->finished = TRUE;
|
|
UNLOCK_MUTEX(db_rep->mutex);
|
|
return (0);
|
|
}
|
|
to_do = db_rep->operation_needed;
|
|
db_rep->operation_needed = 0;
|
|
UNLOCK_MUTEX(db_rep->mutex);
|
|
|
|
/*
|
|
* The way we are invoked determines the criterion for completion (which
|
|
* is represented as "need_success"): if we've been asked to do an
|
|
* election, we're only "done" when an election has actually succeeded.
|
|
* If we're just here trying to find the master initially, then merely
|
|
* getting a valid master_eid suffices.
|
|
*/
|
|
switch (to_do) {
|
|
case ELECT_FAILURE_ELECTION:
|
|
failure_recovery = TRUE;
|
|
to_do = ELECT_ELECTION;
|
|
/* FALLTHROUGH */
|
|
case ELECT_ELECTION:
|
|
need_success = TRUE;
|
|
break;
|
|
case ELECT_SEEK_MASTER:
|
|
to_do = 0; /* Caller has already called rep_start. */
|
|
/* FALLTHROUGH */
|
|
case ELECT_REPSTART:
|
|
need_success = FALSE;
|
|
break;
|
|
default:
|
|
DB_ASSERT(env, FALSE);
|
|
}
|
|
/* Here, need_success has been initialized. */
|
|
|
|
for (;;) {
|
|
RPRINT(env, DB_VERB_REPMGR_MISC,
|
|
(env, "elect thread to do: %d", to_do));
|
|
switch (to_do) {
|
|
case ELECT_ELECTION:
|
|
nsites = __repmgr_get_nsites(db_rep);
|
|
/*
|
|
* With only 2 sites in the group, even a single failure
|
|
* could make it impossible to get a majority. So,
|
|
* fudge a little, unless the user really wants strict
|
|
* safety.
|
|
*/
|
|
if (nsites == 2 &&
|
|
!FLD_ISSET(db_rep->region->config,
|
|
REP_C_2SITE_STRICT))
|
|
nvotes = 1;
|
|
else
|
|
nvotes = ELECTION_MAJORITY(nsites);
|
|
|
|
/*
|
|
* If we're doing an election because we noticed that
|
|
* the master failed, it's reasonable to expect that the
|
|
* master won't participate. By not waiting for its
|
|
* vote, we can probably complete the election faster.
|
|
* But note that we shouldn't allow this to affect
|
|
* nvotes calculation.
|
|
*
|
|
* However, if we have 2 sites, and strict majority is
|
|
* turned on, now nvotes would be 2, and it doesn't make
|
|
* sense to rep_elect to see nsites of 1 in that case.
|
|
* So only decrement nsites if it currently exceeds
|
|
* nvotes.
|
|
*/
|
|
if (failure_recovery && nsites > nvotes)
|
|
nsites--;
|
|
|
|
switch (ret =
|
|
__rep_elect(dbenv, nsites, nvotes, 0)) {
|
|
case DB_REP_UNAVAIL:
|
|
break;
|
|
|
|
case 0:
|
|
succeeded = TRUE;
|
|
if (db_rep->takeover_pending) {
|
|
db_rep->takeover_pending = FALSE;
|
|
if ((ret =
|
|
__repmgr_become_master(env)) != 0)
|
|
return (ret);
|
|
}
|
|
break;
|
|
|
|
default:
|
|
__db_err(
|
|
env, ret, "unexpected election failure");
|
|
return (ret);
|
|
}
|
|
last_op = ELECT_ELECTION;
|
|
break;
|
|
case ELECT_REPSTART:
|
|
if ((ret =
|
|
__repmgr_prepare_my_addr(env, &my_addr)) != 0)
|
|
return (ret);
|
|
ret = __rep_start(dbenv, &my_addr, DB_REP_CLIENT);
|
|
__os_free(env, my_addr.data);
|
|
if (ret != 0) {
|
|
__db_err(env, ret, "rep_start");
|
|
return (ret);
|
|
}
|
|
last_op = ELECT_REPSTART;
|
|
break;
|
|
case 0:
|
|
/*
|
|
* Nothing to do: this can happen the first time
|
|
* through, on initialization.
|
|
*/
|
|
last_op = 0;
|
|
break;
|
|
default:
|
|
DB_ASSERT(env, FALSE);
|
|
}
|
|
|
|
/*
|
|
* Only the first election after a crashed master should be
|
|
* "fast". If that election fails and we have to retry, the
|
|
* crashed master may have rebooted in the interim.
|
|
*/
|
|
failure_recovery = FALSE;
|
|
|
|
LOCK_MUTEX(db_rep->mutex);
|
|
while (!succeeded && !__repmgr_is_ready(env)) {
|
|
#ifdef DB_WIN32
|
|
duration = db_rep->election_retry_wait / US_PER_MS;
|
|
ret = SignalObjectAndWait(db_rep->mutex,
|
|
db_rep->check_election, duration, FALSE);
|
|
LOCK_MUTEX(db_rep->mutex);
|
|
if (ret == WAIT_TIMEOUT)
|
|
break;
|
|
DB_ASSERT(env, ret == WAIT_OBJECT_0);
|
|
#else
|
|
__repmgr_compute_wait_deadline(env, &deadline,
|
|
db_rep->election_retry_wait);
|
|
if ((ret = pthread_cond_timedwait(
|
|
&db_rep->check_election, &db_rep->mutex, &deadline))
|
|
== ETIMEDOUT)
|
|
break;
|
|
DB_ASSERT(env, ret == 0);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Ways we can get here: election succeeded, sleep duration
|
|
* expired, "operation needed", or thread shut-down command.
|
|
*
|
|
* If we're not yet done, figure out what to do next (which may
|
|
* be trivially easy if we've been told explicitly, via the
|
|
* "operation needed" flag). We must first check if we've been
|
|
* told to do a specific operation, because that could make our
|
|
* completion criterion more stringent. Note that we never
|
|
* lessen our completion criterion (i.e., unlike the initial
|
|
* case, we may leave need_success untouched here).
|
|
*/
|
|
done = FALSE;
|
|
if ((to_do = db_rep->operation_needed) != 0) {
|
|
db_rep->operation_needed = 0;
|
|
switch (to_do) {
|
|
case ELECT_FAILURE_ELECTION:
|
|
failure_recovery = TRUE;
|
|
to_do = ELECT_ELECTION;
|
|
/* FALLTHROUGH */
|
|
case ELECT_ELECTION:
|
|
need_success = TRUE;
|
|
break;
|
|
case ELECT_SEEK_MASTER:
|
|
to_do = 0;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
} else if ((done = (succeeded ||
|
|
(!need_success && IS_VALID_EID(db_rep->master_eid)) ||
|
|
db_rep->finished)))
|
|
db_rep->elect_thread->finished = TRUE;
|
|
else {
|
|
if (last_op == ELECT_ELECTION)
|
|
to_do = ELECT_REPSTART;
|
|
else {
|
|
/*
|
|
* Generally, if what we previously did is a
|
|
* rep_start (or nothing, which really just
|
|
* means another thread did the rep_start before
|
|
* turning us on), then we next do an election.
|
|
* However, with the REP_CLIENT init policy we
|
|
* never do an initial election.
|
|
*/
|
|
to_do = ELECT_ELECTION;
|
|
if (db_rep->init_policy == DB_REP_CLIENT &&
|
|
!db_rep->found_master)
|
|
to_do = ELECT_REPSTART;
|
|
}
|
|
}
|
|
|
|
UNLOCK_MUTEX(db_rep->mutex);
|
|
if (done)
|
|
return (0);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Tests whether another thread has signalled for our attention.
|
|
*/
|
|
static int
|
|
__repmgr_is_ready(env)
|
|
ENV *env;
|
|
{
|
|
DB_REP *db_rep;
|
|
|
|
db_rep = env->rep_handle;
|
|
|
|
RPRINT(env, DB_VERB_REPMGR_MISC, (env,
|
|
"repmgr elect: opcode %d, finished %d, master %d",
|
|
db_rep->operation_needed, db_rep->finished, db_rep->master_eid));
|
|
|
|
return (db_rep->operation_needed || db_rep->finished);
|
|
}
|
|
|
|
/*
|
|
* PUBLIC: int __repmgr_become_master __P((ENV *));
|
|
*/
|
|
int
|
|
__repmgr_become_master(env)
|
|
ENV *env;
|
|
{
|
|
DBT my_addr;
|
|
DB_ENV *dbenv;
|
|
DB_REP *db_rep;
|
|
int ret;
|
|
|
|
dbenv = env->dbenv;
|
|
db_rep = env->rep_handle;
|
|
db_rep->master_eid = SELF_EID;
|
|
db_rep->found_master = TRUE;
|
|
|
|
/*
|
|
* At the moment, it's useless to pass my address to rep_start here,
|
|
* because rep_start ignores it in the case of MASTER. So we could
|
|
* avoid the trouble of allocating and freeing this memory. But might
|
|
* this conceivably change in the future?
|
|
*/
|
|
if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0)
|
|
return (ret);
|
|
ret = __rep_start(dbenv, &my_addr, DB_REP_MASTER);
|
|
__os_free(env, my_addr.data);
|
|
if (ret == 0)
|
|
__repmgr_stash_generation(env);
|
|
|
|
return (ret);
|
|
}
|