/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2005,2008 Oracle.  All rights reserved.
 *
 * $Id: repmgr_elect.c,v 1.41 2008/03/13 17:31:28 mbrey Exp $
 */

#include "db_config.h"

#define	__INCLUDE_NETWORKING	1
#include "db_int.h"

static int __repmgr_is_ready __P((ENV *));
static int __repmgr_elect_main __P((ENV *));
static void *__repmgr_elect_thread __P((void *));
static int start_election_thread __P((ENV *));

/*
 * __repmgr_init_election --
 *	Hand a request to the election thread.  The request (an election, a
 * call to rep_start(CLIENT), or nothing) is recorded in
 * db_rep->operation_needed; then either a new election thread is started, or
 * the one that is already running is woken up through the check_election
 * condition.  At most one election thread exists at a time, so concurrent
 * elections are avoided.
 *
 * Returns 0 if the request was ignored because repmgr is finished; otherwise
 * returns the result of starting, joining or signalling the thread.
 *
 * PUBLIC: int __repmgr_init_election __P((ENV *, int));
 *
 * !!!
 * Caller must hold mutex.
 */
int
__repmgr_init_election(env, initial_operation)
	ENV *env;
	int initial_operation;
{
	DB_REP *db_rep;
	int ret;

	db_rep = env->rep_handle;

	/* Requests arriving after shut-down are dropped, not failed. */
	if (db_rep->finished) {
		RPRINT(env, DB_VERB_REPMGR_MISC, (env,
		    "ignoring elect thread request %d; repmgr is finished",
		    initial_operation));
		return (0);
	}

	db_rep->operation_needed = initial_operation;

	/* A live thread only needs to be told to look at the new request. */
	if (db_rep->elect_thread != NULL && !db_rep->elect_thread->finished) {
		RPRINT(env, DB_VERB_REPMGR_MISC,
		    (env, "reusing existing elect thread"));
		ret = __repmgr_signal(&db_rep->check_election);
		if (ret != 0)
			__db_err(env, ret, "can't signal election thread");
		return (ret);
	}

	/* A thread that already ran to completion is reaped before reuse. */
	if (db_rep->elect_thread != NULL) {
		RPRINT(env, DB_VERB_REPMGR_MISC,
		    (env, "join dead elect thread"));
		ret = __repmgr_thread_join(db_rep->elect_thread);
		if (ret != 0)
			return (ret);
		__os_free(env, db_rep->elect_thread);
		db_rep->elect_thread = NULL;
	}

	return (start_election_thread(env));
}

/*
 * !!!
 * Caller holds mutex.
*/
/*
 * start_election_thread --
 *	Allocate a REPMGR_RUNNABLE for the election thread, point it at
 * __repmgr_elect_thread, and start it.  On success the runnable is recorded
 * in db_rep->elect_thread; on failure it is freed.  Returns 0 or the error
 * from __os_malloc / __repmgr_thread_start.
 */
static int
start_election_thread(env)
	ENV *env;
{
	DB_REP *db_rep;
	REPMGR_RUNNABLE *elector;
	int ret;

	db_rep = env->rep_handle;

	if ((ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &elector))
	    != 0)
		return (ret);
	elector->env = env;
	elector->run = __repmgr_elect_thread;

	/*
	 * NOTE(review): elector->finished is not set here; presumably
	 * __repmgr_thread_start initializes it -- confirm.
	 */
	if ((ret = __repmgr_thread_start(env, elector)) == 0)
		db_rep->elect_thread = elector;
	else
		__os_free(env, elector);

	return (ret);
}

/*
 * __repmgr_elect_thread --
 *	Entry point of the election thread.  "args" is the ENV handle.  Runs
 * __repmgr_elect_main; if that fails, the error is reported and passed to
 * __repmgr_thread_failure.  Always returns NULL.
 */
static void *
__repmgr_elect_thread(args)
	void *args;
{
	ENV *env = args;
	int ret;

	RPRINT(env, DB_VERB_REPMGR_MISC, (env, "starting election thread"));

	if ((ret = __repmgr_elect_main(env)) != 0) {
		__db_err(env, ret, "election thread failed");
		__repmgr_thread_failure(env, ret);
	}

	RPRINT(env, DB_VERB_REPMGR_MISC, (env, "election thread is exiting"));
	return (NULL);
}

/*
 * __repmgr_elect_main --
 *	Body of the election thread.  Picks up the requested operation from
 * db_rep->operation_needed, then loops: perform the operation (an election,
 * a rep_start as client, or nothing), wait for either a new request, a
 * shut-down, or the retry timeout, and decide what to do next.
 *
 * Returns 0 once the completion criterion is met or repmgr is finished (in
 * which case the thread's "finished" flag has been set under the mutex).
 * Returns a non-zero error from __repmgr_become_master, __rep_elect (other
 * than DB_REP_UNAVAIL), __repmgr_prepare_my_addr or __rep_start; on those
 * paths the "finished" flag is not set here.
 */
static int
__repmgr_elect_main(env)
	ENV *env;
{
	DBT my_addr;
	DB_ENV *dbenv;
	DB_REP *db_rep;
#ifdef DB_WIN32
	DWORD duration;
#else
	struct timespec deadline;
#endif
	u_int32_t nsites, nvotes;
	int done, failure_recovery, last_op;
	int need_success, ret, succeeded, to_do;

	COMPQUIET(need_success, TRUE);

	dbenv = env->dbenv;
	db_rep = env->rep_handle;
	last_op = 0;
	failure_recovery = succeeded = FALSE;

	/*
	 * db_rep->operation_needed is the mechanism by which the outside world
	 * (running in a different thread) tells us what it wants us to do.  It
	 * is obviously relevant when we're just starting up.  But it can also
	 * be set if a subsequent request for us to do something occurs while
	 * we're still looping.
	 *
	 * ELECT_FAILURE_ELECTION asks us to start by doing an election, but to
	 * do so in failure recovery mode.  This failure recovery mode may
	 * persist through several loop iterations: as long as it takes us to
	 * succeed in finding a master, or until we get asked to perform a new
	 * request.  Thus the time for mapping ELECT_FAILURE_ELECTION to the
	 * internal ELECT_ELECTION, as well as the setting of the failure
	 * recovery flag, is at the point we receive the new request from
	 * operation_needed (either here, or within the loop below).
	 */
	LOCK_MUTEX(db_rep->mutex);
	if (db_rep->finished) {
		/* Shut down before we even got started: nothing to do. */
		db_rep->elect_thread->finished = TRUE;
		UNLOCK_MUTEX(db_rep->mutex);
		return (0);
	}
	/* Consume the request so that a later one can be told apart. */
	to_do = db_rep->operation_needed;
	db_rep->operation_needed = 0;
	UNLOCK_MUTEX(db_rep->mutex);

	/*
	 * The way we are invoked determines the criterion for completion (which
	 * is represented as "need_success"): if we've been asked to do an
	 * election, we're only "done" when an election has actually succeeded.
	 * If we're just here trying to find the master initially, then merely
	 * getting a valid master_eid suffices.
	 */
	switch (to_do) {
	case ELECT_FAILURE_ELECTION:
		failure_recovery = TRUE;
		to_do = ELECT_ELECTION;
		/* FALLTHROUGH */
	case ELECT_ELECTION:
		need_success = TRUE;
		break;
	case ELECT_SEEK_MASTER:
		to_do = 0;	/* Caller has already called rep_start. */
		/* FALLTHROUGH */
	case ELECT_REPSTART:
		need_success = FALSE;
		break;
	default:
		DB_ASSERT(env, FALSE);
	}
	/* Here, need_success has been initialized. */

	for (;;) {
		RPRINT(env, DB_VERB_REPMGR_MISC,
		    (env, "elect thread to do: %d", to_do));
		switch (to_do) {
		case ELECT_ELECTION:
			nsites = __repmgr_get_nsites(db_rep);

			/*
			 * With only 2 sites in the group, even a single failure
			 * could make it impossible to get a majority.  So,
			 * fudge a little, unless the user really wants strict
			 * safety.
			 */
			if (nsites == 2 &&
			    !FLD_ISSET(db_rep->region->config,
			    REP_C_2SITE_STRICT))
				nvotes = 1;
			else
				nvotes = ELECTION_MAJORITY(nsites);

			/*
			 * If we're doing an election because we noticed that
			 * the master failed, it's reasonable to expect that the
			 * master won't participate.  By not waiting for its
			 * vote, we can probably complete the election faster.
			 * But note that we shouldn't allow this to affect
			 * nvotes calculation.
			 *
			 * However, if we have 2 sites, and strict majority is
			 * turned on, now nvotes would be 2, and it doesn't make
			 * sense for rep_elect to see nsites of 1 in that case.
			 * So only decrement nsites if it currently exceeds
			 * nvotes.
			 */
			if (failure_recovery && nsites > nvotes)
				nsites--;

			switch (ret = __rep_elect(dbenv, nsites, nvotes, 0)) {
			case DB_REP_UNAVAIL:
				/* No winner this time; retry logic below. */
				break;

			case 0:
				/*
				 * NOTE(review): "succeeded" is never reset once
				 * set, so a request picked up after this point
				 * is performed once and then treated as done
				 * whatever its outcome -- confirm intended.
				 */
				succeeded = TRUE;
				if (db_rep->takeover_pending) {
					db_rep->takeover_pending = FALSE;
					if ((ret =
					    __repmgr_become_master(env)) != 0)
						return (ret);
				}
				break;

			default:
				__db_err(
				    env, ret, "unexpected election failure");
				return (ret);
			}
			last_op = ELECT_ELECTION;
			break;
		case ELECT_REPSTART:
			if ((ret =
			    __repmgr_prepare_my_addr(env, &my_addr)) != 0)
				return (ret);
			ret = __rep_start(dbenv, &my_addr, DB_REP_CLIENT);
			/* The address buffer is released on both outcomes. */
			__os_free(env, my_addr.data);
			if (ret != 0) {
				__db_err(env, ret, "rep_start");
				return (ret);
			}
			last_op = ELECT_REPSTART;
			break;
		case 0:
			/*
			 * Nothing to do: this can happen the first time
			 * through, on initialization.
			 */
			last_op = 0;
			break;
		default:
			DB_ASSERT(env, FALSE);
		}

		/*
		 * Only the first election after a crashed master should be
		 * "fast".  If that election fails and we have to retry, the
		 * crashed master may have rebooted in the interim.
		 */
		failure_recovery = FALSE;

		/*
		 * Unless the election just succeeded, sleep on the
		 * check_election condition until a new request or shut-down
		 * arrives (see __repmgr_is_ready), or until
		 * election_retry_wait has elapsed.  The mutex is held again
		 * when the loop is left on either platform.
		 */
		LOCK_MUTEX(db_rep->mutex);
		while (!succeeded && !__repmgr_is_ready(env)) {
#ifdef DB_WIN32
			duration = db_rep->election_retry_wait / US_PER_MS;
			ret = SignalObjectAndWait(db_rep->mutex,
			    db_rep->check_election, duration, FALSE);
			/* The mutex must be re-acquired explicitly here. */
			LOCK_MUTEX(db_rep->mutex);
			if (ret == WAIT_TIMEOUT)
				break;
			DB_ASSERT(env, ret == WAIT_OBJECT_0);
#else
			__repmgr_compute_wait_deadline(env, &deadline,
			    db_rep->election_retry_wait);
			if ((ret = pthread_cond_timedwait(
			    &db_rep->check_election, &db_rep->mutex, &deadline))
			    == ETIMEDOUT)
				break;
			DB_ASSERT(env, ret == 0);
#endif
		}

		/*
		 * Ways we can get here: election succeeded, sleep duration
		 * expired, "operation needed", or thread shut-down command.
		 *
		 * If we're not yet done, figure out what to do next (which may
		 * be trivially easy if we've been told explicitly, via the
		 * "operation needed" flag).  We must first check if we've been
		 * told to do a specific operation, because that could make our
		 * completion criterion more stringent.  Note that we never
		 * lessen our completion criterion (i.e., unlike the initial
		 * case, we may leave need_success untouched here).
		 */
		done = FALSE;
		if ((to_do = db_rep->operation_needed) != 0) {
			db_rep->operation_needed = 0;
			switch (to_do) {
			case ELECT_FAILURE_ELECTION:
				failure_recovery = TRUE;
				to_do = ELECT_ELECTION;
				/* FALLTHROUGH */
			case ELECT_ELECTION:
				need_success = TRUE;
				break;
			case ELECT_SEEK_MASTER:
				to_do = 0;
				break;
			default:
				/* Any other request is performed as given. */
				break;
			}
		} else if ((done = (succeeded ||
		    (!need_success && IS_VALID_EID(db_rep->master_eid)) ||
		    db_rep->finished)))
			/*
			 * Mark ourselves finished while still holding the
			 * mutex, so that __repmgr_init_election can tell a
			 * completed thread from a live one.
			 */
			db_rep->elect_thread->finished = TRUE;
		else {
			/* Timed out with no request: alternate operations. */
			if (last_op == ELECT_ELECTION)
				to_do = ELECT_REPSTART;
			else {
				/*
				 * Generally, if what we previously did is a
				 * rep_start (or nothing, which really just
				 * means another thread did the rep_start before
				 * turning us on), then we next do an election.
				 * However, with the REP_CLIENT init policy we
				 * never do an initial election.
				 */
				to_do = ELECT_ELECTION;
				if (db_rep->init_policy == DB_REP_CLIENT &&
				    !db_rep->found_master)
					to_do = ELECT_REPSTART;
			}
		}

		UNLOCK_MUTEX(db_rep->mutex);
		if (done)
			return (0);
	}
}

/*
 * Tests whether another thread has signalled for our attention.
*/ static int __repmgr_is_ready(env) ENV *env; { DB_REP *db_rep; db_rep = env->rep_handle; RPRINT(env, DB_VERB_REPMGR_MISC, (env, "repmgr elect: opcode %d, finished %d, master %d", db_rep->operation_needed, db_rep->finished, db_rep->master_eid)); return (db_rep->operation_needed || db_rep->finished); } /* * PUBLIC: int __repmgr_become_master __P((ENV *)); */ int __repmgr_become_master(env) ENV *env; { DBT my_addr; DB_ENV *dbenv; DB_REP *db_rep; int ret; dbenv = env->dbenv; db_rep = env->rep_handle; db_rep->master_eid = SELF_EID; db_rep->found_master = TRUE; /* * At the moment, it's useless to pass my address to rep_start here, * because rep_start ignores it in the case of MASTER. So we could * avoid the trouble of allocating and freeing this memory. But might * this conceivably change in the future? */ if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0) return (ret); ret = __rep_start(dbenv, &my_addr, DB_REP_MASTER); __os_free(env, my_addr.data); if (ret == 0) __repmgr_stash_generation(env); return (ret); }