rep_lease.c   [plain text]


/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2007,2008 Oracle.  All rights reserved.
 *
 * $Id: rep_lease.c,v 12.23 2008/01/11 21:49:26 sue Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"

static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **));

/*
 * __rep_update_grant -
 *	Extend this client's lease grant for the current perm record and
 *	report the grant to the master.  The caller must hold the
 *	mtx_clientdb mutex.  The timespec passed in is in host local
 *	format; it is copied into the grant message sent to the master.
 *
 *	Returns 0 without sending anything if an election is in progress,
 *	a marshalling error if the message cannot be built, and 0
 *	otherwise (the result of the send itself is ignored).
 *
 * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *));
 */
int
__rep_update_grant(env, ts)
	ENV *env;
	db_timespec *ts;
{
	DBT grant_dbt;
	LOG *lp;
	REP *rep;
	__rep_grant_info_args gi;
	db_timespec expire;
	u_int8_t buf[__REP_GRANT_INFO_SIZE];
	int ret;
	size_t len;

	rep = env->rep_handle->region;
	lp = env->lg_handle->reginfo.primary;

	/*
	 * The candidate expiration time is the current time plus the
	 * (skewed) lease duration.
	 */
	timespecclear(&expire);
	__os_gettime(env, &expire, 1);
	timespecadd(&expire, &rep->lease_duration);

	/*
	 * The election check and the update of grant_expire must both
	 * happen under the region mutex.  No lease may be granted while
	 * an election is in progress.
	 */
	REP_SYSTEM_LOCK(env);
	if (IN_ELECTION(rep)) {
		REP_SYSTEM_UNLOCK(env);
		return (0);
	}
	/* Only ever move the recorded expiration forward. */
	if (timespeccmp(&expire, &rep->grant_expire, >))
		rep->grant_expire = expire;
	REP_SYSTEM_UNLOCK(env);

	/*
	 * The LEASE_GRANT message goes out whether or not the lease was
	 * actually extended above.  It carries the caller's timestamp.
	 */
	gi.msg_sec = (u_int32_t)ts->tv_sec;
	gi.msg_nsec = (u_int32_t)ts->tv_nsec;

	ret = __rep_grant_info_marshal(env,
	    &gi, buf, __REP_GRANT_INFO_SIZE, &len);
	if (ret != 0)
		return (ret);

	DB_INIT_DBT(grant_dbt, buf, len);
	(void)__rep_send_message(env, rep->master_id, REP_LEASE_GRANT,
	    &lp->max_perm_lsn, &grant_dbt, 0, 0);
	return (0);
}

/*
 * __rep_islease_granted -
 *	Report whether this client currently has a lease outstanding.
 *	Returns 1 if the current time has not yet passed the recorded
 *	grant expiration, 0 otherwise.
 *	Caller must hold the REP_SYSTEM (region) mutex.
 *
 * PUBLIC: int __rep_islease_granted __P((ENV *));
 */
int
__rep_islease_granted(env)
	ENV *env;
{
	REP *rep;
	db_timespec now;

	rep = env->rep_handle->region;

	/* Sample the clock and compare it against our granted lease. */
	timespecclear(&now);
	__os_gettime(env, &now, 1);

	if (timespeccmp(&now, &rep->grant_expire, <=))
		return (1);
	return (0);
}

/*
 * __rep_lease_table_alloc -
 *	Allocate the lease table on a master.  Called with rep mutex
 * held.  We need to acquire the env region mutex, so we need to
 * make sure we never acquire those mutexes in the opposite order.
 *
 *	On success any previously allocated table is freed, the new
 * table's offset is stored in rep->lease_off, and all nsites entries
 * are reset to the empty state.  On allocation failure the error is
 * returned and the existing table (if any) is left untouched.
 *
 * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t));
 */
int
__rep_lease_table_alloc(env, nsites)
	ENV *env;
	u_int32_t nsites;
{
	REGENV *renv;
	REGINFO *infop;
	REP *rep;
	REP_LEASE_ENTRY *le, *table;
	int *lease, ret;
	u_int32_t i;

	rep = env->rep_handle->region;

	infop = env->reginfo;
	renv = infop->primary;
	MUTEX_LOCK(env, renv->mtx_regenv);
	if ((ret = __env_alloc(infop, (size_t)nsites * sizeof(REP_LEASE_ENTRY),
	    &lease)) == 0) {
		/* Replace the old table only once the new one exists. */
		if (rep->lease_off != INVALID_ROFF)
			__env_alloc_free(infop,
			    R_ADDR(infop, rep->lease_off));
		rep->lease_off = R_OFFSET(infop, lease);
	}
	MUTEX_UNLOCK(env, renv->mtx_regenv);

	/*
	 * If the allocation failed, rep->lease_off is either invalid or
	 * refers to an older table that may hold fewer than nsites
	 * entries.  Initializing through it would write outside the
	 * allocation, so stop here.
	 */
	if (ret != 0)
		return (ret);

	/* Mark every slot in the new table as empty. */
	table = R_ADDR(infop, rep->lease_off);
	for (i = 0; i < nsites; i++) {
		le = &table[i];
		le->eid = DB_EID_INVALID;
		timespecclear(&le->start_time);
		timespecclear(&le->end_time);
		ZERO_LSN(le->lease_lsn);
	}
	return (0);
}

/*
 * __rep_lease_grant -
 *	Handle incoming REP_LEASE_GRANT message on a master.
 *
 *	rp  - control information of the message; rp->lsn is recorded as
 *	      the LSN the lease was granted for.
 *	rec - marshalled __rep_grant_info_args carrying the grant's
 *	      start time.
 *	eid - environment ID of the site the grant is recorded for.
 *
 *	Returns an unmarshalling error if the record cannot be decoded,
 *	otherwise 0.  A grant for which no table slot can be found is
 *	ignored.
 *
 * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int));
 */
int
__rep_lease_grant(env, rp, rec, eid)
	ENV *env;
	__rep_control_args *rp;
	DBT *rec;
	int eid;
{
	DB_REP *db_rep;
	REP *rep;
	__rep_grant_info_args gi;
	REP_LEASE_ENTRY *le;
	db_timespec msg_time;
	int ret;

	db_rep = env->rep_handle;
	rep = db_rep->region;
	if ((ret = __rep_grant_info_unmarshal(env,
	    &gi, rec->data, rec->size, NULL)) != 0)
		return (ret);
	timespecset(&msg_time, gi.msg_sec, gi.msg_nsec);
	le = NULL;

	/*
	 * Look up this site's slot in the lease table.  The table is
	 * read and updated under the region mutex.
	 */
	REP_SYSTEM_LOCK(env);
	__rep_find_entry(env, rep, eid, &le);
	/*
	 * We either get back this site's entry, or an empty entry
	 * that we need to initialize.
	 */
	DB_ASSERT(env, le != NULL);
	/*
	 * __rep_find_entry leaves le untouched when the table holds
	 * neither this site's entry nor an empty one.  Do not rely on the
	 * assertion alone to catch that: drop the grant rather than
	 * dereference a NULL entry.
	 */
	if (le == NULL) {
		REP_SYSTEM_UNLOCK(env);
		return (0);
	}
	/*
	 * Update the entry if it is an empty entry or if the new
	 * lease grant is a later start time than the current one.
	 */
	RPRINT(env, DB_VERB_REP_LEASE,
	    (env, "lease_grant: grant msg time %lu %lu",
	    (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec));
	if (le->eid == DB_EID_INVALID ||
	    timespeccmp(&msg_time, &le->start_time, >)) {
		le->eid = eid;
		le->start_time = msg_time;
		/* The lease runs for lease_duration from its start time. */
		le->end_time = le->start_time;
		timespecadd(&le->end_time, &rep->lease_duration);
		RPRINT(env, DB_VERB_REP_LEASE, (env,
    "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu",
    le->eid, (u_long)le->start_time.tv_sec, (u_long)le->start_time.tv_nsec,
    (u_long)le->end_time.tv_sec, (u_long)le->end_time.tv_nsec,
    (u_long)rep->lease_duration.tv_sec, (u_long)rep->lease_duration.tv_nsec));
		/*
		 * XXX Is this really true?  Could we have a lagging
		 * record that has a later start time, but smaller
		 * LSN than we have previously seen??
		 */
		DB_ASSERT(env, LOG_COMPARE(&rp->lsn, &le->lease_lsn) >= 0);
		le->lease_lsn = rp->lsn;
	}
	REP_SYSTEM_UNLOCK(env);
	return (0);
}

/*
 * __rep_find_entry -
 *	Scan the lease table from the front and hand back, through lep,
 *	the first entry that either belongs to the given EID or is still
 *	empty.  If no entry qualifies, *lep is left unmodified, so callers
 *	that need to detect that case must preset it.
 */
static void
__rep_find_entry(env, rep, eid, lep)
	ENV *env;
	REP *rep;
	int eid;
	REP_LEASE_ENTRY **lep;
{
	REP_LEASE_ENTRY *table;
	u_int32_t slot;

	table = R_ADDR(env->reginfo, rep->lease_off);

	for (slot = 0; slot < rep->nsites; slot++) {
		/* Skip slots owned by some other site. */
		if (table[slot].eid != eid &&
		    table[slot].eid != DB_EID_INVALID)
			continue;
		*lep = &table[slot];
		break;
	}
}

/*
 * __rep_lease_check -
 *      Return 0 if this master holds valid leases and can confirm
 *	its mastership.  If leases are expired, an attempt is made
 *	to refresh the leases.  If that fails, then return the
 *	DB_REP_LEASE_EXPIRED error to the user.  No mutexes held.
 *
 * PUBLIC: int __rep_lease_check __P((ENV *, int));
 */
int
__rep_lease_check(env, refresh)
	ENV *env;
	int refresh;
{
	DB_LOG *dblp;
	DB_LSN lease_lsn;
	DB_REP *db_rep;
	LOG *lp;
	REGINFO *infop;
	REP *rep;
	REP_LEASE_ENTRY *le, *table;
	db_timespec curtime;
	int ret, tries;
	u_int32_t i, min_leases, valid_leases;

	infop = env->reginfo;
	tries = 0;
retry:
	ret = 0;
	db_rep = env->rep_handle;
	rep = db_rep->region;
	dblp = env->lg_handle;
	lp = dblp->reginfo.primary;
	LOG_SYSTEM_LOCK(env);
	lease_lsn = lp->max_perm_lsn;
	LOG_SYSTEM_UNLOCK(env);
	REP_SYSTEM_LOCK(env);
	min_leases = rep->nsites / 2;

	__os_gettime(env, &curtime, 1);
	RPRINT(env, DB_VERB_REP_LEASE,
	    (env, "lease_check: min_leases %lu curtime %lu %lu",
	    (u_long)min_leases, (u_long)curtime.tv_sec,
	    (u_long)curtime.tv_nsec));
	table = R_ADDR(infop, rep->lease_off);
	for (i = 0, valid_leases = 0;
	    i < rep->nsites && valid_leases < min_leases; i++) {
		le = &table[i];
		/*
		 * Count this lease as valid if:
		 * - It is a valid entry (has an EID).
		 * - The lease has not expired.
		 * - The LSN is up to date.
		 */
		if (le->eid != DB_EID_INVALID) {
			RPRINT(env, DB_VERB_REP_LEASE, (env,
		    "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]",
			    (u_long)valid_leases, le->eid,
			    (u_long)le->lease_lsn.file,
			    (u_long)le->lease_lsn.offset));
			RPRINT(env, DB_VERB_REP_LEASE,
			    (env, "lease_check: endtime %lu %lu",
			    (u_long)le->end_time.tv_sec,
			    (u_long)le->end_time.tv_nsec));
		}
		if (le->eid != DB_EID_INVALID &&
		    timespeccmp(&le->end_time, &curtime, >=) &&
		    LOG_COMPARE(&le->lease_lsn, &lease_lsn) == 0)
			valid_leases++;
	}
	REP_SYSTEM_UNLOCK(env);

	/*
	 * Now see if we have enough.
	 */
	RPRINT(env, DB_VERB_REP_LEASE, (env, "valid %lu, min %lu",
	    (u_long)valid_leases, (u_long)min_leases));
	if (valid_leases < min_leases) {
		if (!refresh)
			ret = DB_REP_LEASE_EXPIRED;
		else {
			/*
			 * If we are successful, we need to recheck the leases
			 * because the lease grant messages may have raced with
			 * the PERM acknowledgement.  Give the grant messages
			 * a chance to arrive and be processed.
			 */
			if ((ret = __rep_lease_refresh(env)) == 0) {
				if (tries <= LEASE_REFRESH_TRIES) {
					/*
					 * If we were successful sending, but
					 * not in racing the message threads,
					 * then yield the processor so that
					 * the message threads get a chance
					 * to run.
					 */
					if (tries > 0)
						__os_yield(env, 1, 0);
					tries++;
					goto retry;
				} else
					ret = DB_REP_LEASE_EXPIRED;
			}
		}
	}

	return (ret);
}

/*
 * __rep_lease_refresh -
 *	Locate the last permanent log record and broadcast it again
 *	flagged as PERM, so that clients are forced to grant their
 *	leases.  If the broadcast fails, all leases are expired and
 *	DB_REP_LEASE_EXPIRED is returned; errors from the log cursor
 *	operations are returned as-is.
 *
 * PUBLIC: int __rep_lease_refresh __P((ENV *));
 */
int
__rep_lease_refresh(env)
	ENV *env;
{
	DBT rec;
	DB_LOGC *logc;
	DB_LSN lsn;
	REP *rep;
	int ret, t_ret;

	rep = env->rep_handle->region;

	if ((ret = __log_cursor(env, &logc)) != 0)
		return (ret);

	memset(&rec, 0, sizeof(rec));
	memset(&lsn, 0, sizeof(lsn));

	/* __rep_log_backup positions us on the last PERM record. */
	ret = __rep_log_backup(env, rep, logc, &lsn);
	if (ret == 0)
		ret = __logc_get(logc, &lsn, &rec, DB_CURRENT);
	if (ret == 0 && __rep_send_message(env,
	    DB_EID_BROADCAST, REP_LOG, &lsn, &rec, REPCTL_PERM, 0) != 0) {
		/*
		 * The send did not succeed, so no acknowledgement can be
		 * expected: expire the leases.
		 */
		(void)__rep_lease_expire(env, 0);
		ret = DB_REP_LEASE_EXPIRED;
	}

	/* The cursor is always closed; an earlier error takes precedence. */
	t_ret = __logc_close(logc);
	if (ret == 0 && t_ret != 0)
		ret = t_ret;
	return (ret);
}

/*
 * __rep_lease_expire -
 *	Proactively expire all leases granted to us.  If locked is zero
 *	the REP_SYSTEM (region) mutex is acquired and released here;
 *	otherwise no locking is done in this function.  Always returns 0.
 *
 * PUBLIC: int __rep_lease_expire __P((ENV *, int));
 */
int
__rep_lease_expire(env, locked)
	ENV *env;
	int locked;
{
	REGINFO *infop;
	REP *rep;
	REP_LEASE_ENTRY *table;
	u_int32_t slot;

	infop = env->reginfo;
	rep = env->rep_handle->region;

	if (!locked)
		REP_SYSTEM_LOCK(env);
	/* Nothing to do if no lease table was ever allocated. */
	if (rep->lease_off != INVALID_ROFF) {
		table = R_ADDR(infop, rep->lease_off);
		/*
		 * Force every lease to expire.  No lease has a start_time
		 * in the future, so collapsing end_time onto start_time
		 * puts each lease's end at or before the present.
		 */
		for (slot = 0; slot < rep->nsites; slot++)
			table[slot].end_time = table[slot].start_time;
	}
	if (!locked)
		REP_SYSTEM_UNLOCK(env);
	return (0);
}

/*
 * __rep_lease_waittime -
 *	Return the amount of time remaining on a granted lease.
 * Assume the caller holds the REP_SYSTEM (region) mutex.
 *
 * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *));
 */
db_timeout_t
__rep_lease_waittime(env)
	ENV *env;
{
	DB_REP *db_rep;
	REP *rep;
	db_timespec exptime, mytime;
	db_timeout_t to;

	db_rep = env->rep_handle;
	rep = db_rep->region;
	exptime = rep->grant_expire;
	to = 0;
	/*
	 * If the lease has never been granted, we must wait a full
	 * lease timeout because we could be freshly rebooted after
	 * a crash and a lease could be granted from a previous
	 * incarnation of this client.
	 */
	RPRINT(env, DB_VERB_REP_LEASE, (env,
    "wait_time: grant_expire %lu %lu lease_to %lu",
	    (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec,
	    (u_long)rep->lease_timeout));
	if (!timespecisset(&exptime))
		to = rep->lease_timeout;
	else {
		__os_gettime(env, &mytime, 1);
		RPRINT(env, DB_VERB_REP_LEASE, (env,
    "wait_time: mytime %lu %lu, grant_expire %lu %lu",
		    (u_long)mytime.tv_sec, (u_long)mytime.tv_nsec,
		    (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec));
		if (timespeccmp(&mytime, &exptime, <=)) {
			/*
			 * If the current time is before the grant expiration
			 * compute the difference and return remaining grant
			 * time.
			 */
			timespecsub(&exptime, &mytime);
			DB_TIMESPEC_TO_TIMEOUT(to, &exptime, 1);
		}
	}
	return (to);
}