/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2007 Oracle.  All rights reserved.
 *
 * $Id: mp_alloc.c,v 12.33 2007/06/01 18:32:44 bostic Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

static void __memp_bad_buffer __P((DB_ENV *, DB_MPOOL_HASH *));

/*
 * __memp_alloc --
 *	Allocate some space from a cache region.
 *
 * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
 * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
 */
int
__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
	DB_MPOOL *dbmp;
	REGINFO *infop;
	MPOOLFILE *mfp;
	size_t len;
	roff_t *offsetp;
	void *retp;
{
	BH *bhp, *oldest_bhp, *tbhp;
	BH_FROZEN_PAGE *frozen_bhp;
	DB_ENV *dbenv;
	DB_LSN vlsn;
	DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
	MPOOL *c_mp;
	MPOOLFILE *bh_mfp;
	size_t freed_space;
	u_int32_t buckets, buffers, high_priority, priority;
	u_int32_t put_counter, total_buckets;
	int aggressive, alloc_freeze, giveup, got_oldest, ret;
	u_int8_t *endp;
	void *p;

	dbenv = dbmp->dbenv;
	c_mp = infop->primary;
	dbht = R_ADDR(infop, c_mp->htab);
	hp_end = &dbht[c_mp->htab_buckets];

	buckets = buffers = put_counter = total_buckets = 0;
	aggressive = alloc_freeze = giveup = got_oldest = 0;
	hp_tmp = NULL;

	STAT(c_mp->stat.st_alloc++);

	/*
	 * If we're allocating a buffer and the one we're discarding is the
	 * same size, we don't want to waste the time it takes to re-integrate
	 * it into the shared memory free list.  If the MPOOLFILE argument
	 * isn't NULL, we'll compare the underlying page sizes of the two
	 * buffers before freeing and re-allocating.
	 */
	if (mfp != NULL) {
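		/* Allocate enough for a buffer header plus the page itself. */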
		len = SSZA(BH, buf) + mfp->stat.st_pagesize;
		/* Add space for alignment padding for MVCC diagnostics. */
		MVCC_BHSIZE(mfp, len);
	}

	MPOOL_REGION_LOCK(dbenv, infop);

	/*
	 * Anything newer than 1/10th of the buffer pool is ignored during
	 * allocation (unless allocation starts failing).
	 */
	high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;

	/*
	 * First we try to allocate from free memory.  If that fails, scan the
	 * buffer pool to find buffers with low priorities.  We consider small
	 * sets of hash buckets each time to limit the amount of work needing
	 * to be done.  This approximates LRU, but not very well.  We either
	 * find a buffer of the same size to use, or we will free 3 times what
	 * we need in the hopes it will coalesce into a contiguous chunk of the
	 * right size.  In the latter case we branch back here and try again.
	 */
alloc:	if ((ret = __env_alloc(infop, len, &p)) == 0) {
		if (mfp != NULL)
			c_mp->stat.st_pages++;
		MPOOL_REGION_UNLOCK(dbenv, infop);
		/*
		 * For MVCC diagnostics, align the pointer so that the buffer
		 * starts on a page boundary.
		 */
		MVCC_BHALIGN(mfp, p);

found:		if (offsetp != NULL)
			*offsetp = R_OFFSET(infop, p);
		*(void **)retp = p;

		/*
		 * Update the search statistics.
		 *
		 * We're not holding the region locked here, so these
		 * statistics can't be trusted.
		 */
#ifdef HAVE_STATISTICS
		total_buckets += buckets;
		if (total_buckets != 0) {
			if (total_buckets > c_mp->stat.st_alloc_max_buckets)
				c_mp->stat.st_alloc_max_buckets = total_buckets;
			c_mp->stat.st_alloc_buckets += total_buckets;
		}
		if (buffers != 0) {
			if (buffers > c_mp->stat.st_alloc_max_pages)
				c_mp->stat.st_alloc_max_pages = buffers;
			c_mp->stat.st_alloc_pages += buffers;
		}
#endif
		return (0);
	} else if (giveup || c_mp->stat.st_pages == 0) {
		MPOOL_REGION_UNLOCK(dbenv, infop);

		__db_errx(dbenv,
		    "unable to allocate space from the buffer cache");
		return (ret);
	}
	ret = 0;

	/*
	 * We re-attempt the allocation every time we've freed 3 times what
	 * we need.  Reset our free-space counter.
	 */
	freed_space = 0;
	total_buckets += buckets;
	buckets = 0;

	/*
	 * Walk the hash buckets and find the next two with potentially useful
	 * buffers.  Free the buffer with the lowest priority from the buckets'
	 * chains.
	 */
	for (;;) {
		/* All pages have been freed; make one last try. */
		if (c_mp->stat.st_pages == 0)
			goto alloc;

		/* Check for wrap around. */
		hp = &dbht[c_mp->last_checked++];
		if (hp >= hp_end) {
			c_mp->last_checked = 0;
			hp = &dbht[c_mp->last_checked++];
		}

		/*
		 * The failure mode is when there are too many buffers we can't
		 * write or there's not enough memory in the system to support
		 * the number of pinned buffers.
		 *
		 * Get aggressive if we've reviewed the entire cache without
		 * freeing 3 times the needed space.  (The code resets the
		 * counter when we free 3 times the needed space.)  Aggressive
		 * means:
		 *
		 * a: set a flag to attempt to flush high priority buffers as
		 *    well as other buffers.
		 * b: sync the mpool to force out queue extent pages.  While we
		 *    might not have enough space for what we want and flushing
		 *    is expensive, why not?
		 * c: look at a buffer in every hash bucket rather than choose
		 *    the more preferable of two.
		 * d: start to think about giving up.
		 *
		 * If we get here twice, sleep for a second, hopefully someone
		 * else will run and free up some memory.
		 *
		 * Always try to allocate memory too, in case some other thread
		 * returns its memory to the region.
		 *
		 * We have no way of knowing whether an allocation can ever
		 * succeed, so fail if no pages are returned to the cache
		 * after we've been trying for a relatively long time.
		 *
		 * !!!
		 * This test ignores pathological cases like no buffers in the
		 * system -- we check for that early on, so it isn't possible.
		 */
		if (buckets++ == c_mp->htab_buckets) {
			if (freed_space > 0)
				goto alloc;
			MPOOL_REGION_UNLOCK(dbenv, infop);

			switch (++aggressive) {
			case 1:
				break;
			case 2:
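				/*
				 * Remember the cache's put counter so we can
				 * tell later whether any buffers have been
				 * returned to the cache while we retried.
				 */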
				put_counter = c_mp->put_counter;
				/* FALLTHROUGH */
			case 3:
			case 4:
			case 5:
			case 6:
				(void)__memp_sync_int(
				    dbenv, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);

				__os_sleep(dbenv, 1, 0);
				break;
			default:
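				/*
				 * We've cycled through the sync-and-sleep
				 * cases.  If no buffers were returned to the
				 * cache in the meantime, give up.
				 */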
				aggressive = 1;
				if (put_counter == c_mp->put_counter)
					giveup = 1;
				break;
			}

			MPOOL_REGION_LOCK(dbenv, infop);
			goto alloc;
		}

		/*
		 * Skip empty buckets.
		 *
		 * We can check for empty buckets before locking as we
		 * only care if the pointer is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
			continue;
		/*
		 * Skip buckets that only have pinned pages.
		 *
		 * Again, we do this without locking.  If we misread the
		 * priority we might improperly skip a bucket, but that is
		 * not fatal.
		 */
		if (hp->hash_priority == UINT32_MAX)
			continue;

		if (!aggressive) {
			/* Adjust if the bucket has not been reset. */
			priority = hp->hash_priority;
			if (c_mp->lru_reset != 0 &&
			    c_mp->lru_reset <= hp - dbht)
				priority -= MPOOL_BASE_DECREMENT;
			/*
			 * Skip high priority buckets.
			 */
			if (priority > high_priority)
				continue;
			/*
			 * Find two buckets and select the one with the lowest
			 * priority.  Performance testing shows that looking
			 * at two improves the LRUness and looking at more only
			 * does a little better.
			 */
			if (hp_tmp == NULL) {
				hp_tmp = hp;
				continue;
			}
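			/*
			 * Apply the same lru_reset adjustment to the saved
			 * bucket before comparing priorities.
			 */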
			if (c_mp->lru_reset &&
			    c_mp->lru_reset <= hp_tmp - dbht) {
				if (priority > hp_tmp->hash_priority -
				    MPOOL_BASE_DECREMENT)
					hp = hp_tmp;
			} else if (priority > hp_tmp->hash_priority)
				hp = hp_tmp;
			hp_tmp = NULL;
		}

		/* Unlock the region and lock the hash bucket. */
		MPOOL_REGION_UNLOCK(dbenv, infop);
		MUTEX_LOCK(dbenv, hp->mtx_hash);

		/* Remember the priority of the buffer we're looking for. */
		priority = hp->hash_priority;

#ifdef DIAGNOSTIC
		__memp_check_order(dbenv, hp);
#endif
		/*
		 * The lowest priority page is first in the bucket, as they are
		 * maintained in sorted order.
		 *
		 * The buffer may have been freed or its priority changed while
		 * we switched from the region lock to the hash lock.  If so,
		 * we have to restart.  We will still take the first buffer on
		 * the bucket's list, though, if it has a low enough priority.
		 */
this_hb:	if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
			goto next_hb;

		buffers++;

		/* Find the associated MPOOLFILE. */
		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

		/* Select the lowest-priority, least-referenced buffer. */
		for (oldest_bhp = bhp, tbhp = SH_CHAIN_PREV(bhp, vc, __bh);
		    tbhp != NULL;
		    oldest_bhp = tbhp, tbhp = SH_CHAIN_PREV(tbhp, vc, __bh))
			if (tbhp->ref <= bhp->ref &&
			    tbhp->priority <= bhp->priority)
				bhp = tbhp;

		/*
		 * Prefer the last buffer in the chain.
		 *
		 * If the oldest buffer isn't obsolete with respect to the
		 * cached old reader LSN, recalculate the oldest reader LSN
		 * and retry.  There is a tradeoff here, because if we had the
		 * LSN earlier, we might have found pages to evict, but to get
		 * it, we need to lock the transaction region.
		 */
		if (oldest_bhp != bhp && oldest_bhp->ref == 0) {
			if (F_ISSET(bhp, BH_FROZEN) &&
			    !F_ISSET(oldest_bhp, BH_FROZEN))
				bhp = oldest_bhp;
			else if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn))
				bhp = oldest_bhp;
			else if (!got_oldest &&
			    __txn_oldest_reader(dbenv, &hp->old_reader) == 0) {
				got_oldest = 1;
				if (BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn))
					bhp = oldest_bhp;
			}
		}

		if (bhp->ref != 0 || (bhp != oldest_bhp &&
		    !aggressive && bhp->priority > priority))
			goto next_hb;

		/* If the page is dirty, pin it and write it. */
		ret = 0;
		if (F_ISSET(bhp, BH_DIRTY)) {
			++bhp->ref;
			ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
			--bhp->ref;
#ifdef HAVE_STATISTICS
			if (ret == 0)
				++c_mp->stat.st_rw_evict;
#endif
		}
#ifdef HAVE_STATISTICS
		else
			++c_mp->stat.st_ro_evict;
#endif

		/*
		 * Freeze this buffer, if necessary.  That is, if the buffer
		 * itself or the next version created could be read by the
		 * oldest reader in the system.
		 */
		if (ret == 0 && bh_mfp->multiversion) {
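			/*
			 * Before deciding, make sure we have an up-to-date
			 * old_reader LSN for this hash bucket.
			 */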
			if (!got_oldest && !SH_CHAIN_HASPREV(bhp, vc) &&
			    !BH_OBSOLETE(bhp, hp->old_reader, vlsn)) {
				(void)__txn_oldest_reader(dbenv,
				    &hp->old_reader);
				got_oldest = 1;
			}
			if (SH_CHAIN_HASPREV(bhp, vc) ||
			    !BH_OBSOLETE(bhp, hp->old_reader, vlsn)) {
				/*
				 * The buffer is still visible to some reader
				 * or has older versions, so it can't simply
				 * be discarded.  Freeze it instead, but only
				 * if we're allocating aggressively and it
				 * isn't already frozen or referenced.
				 */
				if (!aggressive ||
				    F_ISSET(bhp, BH_FROZEN) || bhp->ref != 0)
					goto next_hb;
				ret = __memp_bh_freeze(dbmp,
				    infop, hp, bhp, &alloc_freeze);
			}
		}

		/*
		 * If a write fails for any reason, we can't proceed.
		 *
		 * We released the hash bucket lock while doing I/O, so another
		 * thread may have acquired this buffer and incremented the ref
		 * count after we wrote it, in which case we can't have it.
		 *
		 * If there's a write error and we're having problems finding
		 * something to allocate, avoid selecting this buffer again
		 * by making it the bucket's least-desirable buffer.
		 */
		if (ret != 0 || bhp->ref != 0) {
			if (ret != 0 && aggressive)
				__memp_bad_buffer(dbenv, hp);
			goto next_hb;
		}

		/*
		 * If we need some empty buffer headers for freezing, turn the
		 * buffer we've found into frozen headers and put them on the
		 * free list.  Only reset alloc_freeze if we've actually
		 * allocated some frozen buffer headers.
		 *
		 * Check to see if the buffer is the size we're looking for.
		 * If so, we can simply reuse it.  Otherwise, free the buffer
		 * and its space and keep looking.
		 */
		if (F_ISSET(bhp, BH_FROZEN)) {
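			/*
			 * The buffer we chose has been frozen: pin it, thaw
			 * it, and rescan this hash bucket.
			 */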
			++bhp->ref;
			if ((ret = __memp_bh_thaw(dbmp, infop, hp,
			    bhp, NULL)) != 0) {
				MUTEX_UNLOCK(dbenv, hp->mtx_hash);
				return (ret);
			}
			alloc_freeze = 0;
			goto this_hb;
		} else if (alloc_freeze) {
			if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
				return (ret);
			MVCC_MPROTECT(bhp->buf, bh_mfp->stat.st_pagesize,
			    PROT_READ | PROT_WRITE | PROT_EXEC);

			MPOOL_REGION_LOCK(dbenv, infop);
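			/*
			 * Carve the buffer we just freed into frozen buffer
			 * headers and put them on the free list.
			 */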
			SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
			    (BH_FROZEN_ALLOC *)bhp, links);
			frozen_bhp = (BH_FROZEN_PAGE *)
			    ((BH_FROZEN_ALLOC *)bhp + 1);
			endp = (u_int8_t *)bhp->buf + bh_mfp->stat.st_pagesize;
			while ((u_int8_t *)(frozen_bhp + 1) < endp) {
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    (BH *)frozen_bhp, hq);
				frozen_bhp++;
			}
			alloc_freeze = 0;
			continue;
		} else if (mfp != NULL &&
		    mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
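			/*
			 * The buffer is exactly the size we need: reuse its
			 * memory rather than freeing and re-allocating it.
			 */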
			if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
				return (ret);
			p = bhp;
			goto found;
		} else {
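			/*
			 * Wrong size: return the buffer's memory to the
			 * region and note how much space we've freed.
			 */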
			freed_space += sizeof(*bhp) + bh_mfp->stat.st_pagesize;
			if ((ret = __memp_bhfree(dbmp, infop,
			    hp, bhp, BH_FREE_FREEMEM)) != 0)
				return (ret);
		}

		if (aggressive > 1)
			aggressive = 1;

		/*
		 * Unlock this hash bucket and re-acquire the region lock.  If
		 * we reach here as a result of calling __memp_bhfree, the
		 * hash bucket lock has already been discarded.
		 */
		if (0) {
next_hb:		MUTEX_UNLOCK(dbenv, hp->mtx_hash);
		}
		MPOOL_REGION_LOCK(dbenv, infop);

		/*
		 * Retry the allocation as soon as we've freed up sufficient
		 * space.  We're likely to have to coalesce chunks of memory
		 * to satisfy the request, so don't retry until it's likely
		 * (possible?) that we'll succeed.
		 */
		if (freed_space >= 3 * len)
			goto alloc;
	}
	/* NOTREACHED */
}

/*
 * __memp_free --
 *	Free some space from a cache region.
 *
 * PUBLIC: void __memp_free __P((REGINFO *, MPOOLFILE *, void *));
 */
void
__memp_free(infop, mfp, buf)
	REGINFO *infop;
	MPOOLFILE *mfp;
	void *buf;
{
	MVCC_BHUNALIGN(mfp, buf);
	COMPQUIET(mfp, NULL);
	__env_alloc_free(infop, buf);
}

/*
 * __memp_bad_buffer --
 *	Make the first buffer in a hash bucket the least desirable buffer.
 */
static void
__memp_bad_buffer(dbenv, hp)
	DB_ENV *dbenv;
	DB_MPOOL_HASH *hp;
{
	BH *bhp, *last_bhp;
	u_int32_t priority;

	/*
	 * Get the first buffer from the bucket.  If it is also the last buffer
	 * (in other words, it is the only buffer in the bucket), we're done.
	 */
	bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
	last_bhp = SH_TAILQ_LASTP(&hp->hash_bucket, hq, __bh);
	if (bhp == last_bhp)
		return;

	/* There are multiple buffers in the bucket, remove the first one. */
	SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);

	/*
	 * Find the highest priority buffer in the bucket.  Buffers are
	 * sorted by priority, so it's the last one in the bucket.
	 */
	priority = BH_PRIORITY(last_bhp);

	/*
	 * Append our buffer to the bucket and set the priority of it and all
	 * of its older versions to be just as bad.
	 */
	SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
	for (; bhp != NULL ; bhp = SH_CHAIN_PREV(bhp, vc, __bh))
		bhp->priority = priority;

	/* Reset the hash bucket's priority. */
	hp->hash_priority = BH_PRIORITY(
	    SH_TAILQ_FIRSTP(&hp->hash_bucket, __bh));

#ifdef DIAGNOSTIC
	__memp_check_order(dbenv, hp);
#else
	COMPQUIET(dbenv, NULL);
#endif
}

#ifdef DIAGNOSTIC
/*
 * __memp_check_order --
 *	Verify the priority ordering of a hash bucket chain.
 *
 * PUBLIC: #ifdef DIAGNOSTIC
 * PUBLIC: void __memp_check_order __P((DB_ENV *, DB_MPOOL_HASH *));
 * PUBLIC: #endif
 */
void
__memp_check_order(dbenv, hp)
	DB_ENV *dbenv;
	DB_MPOOL_HASH *hp;
{
	BH *bhp, *first_bhp, *tbhp;
	u_int32_t dirty, priority, last_priority;

	dirty = 0;

	/*
	 * Assumes the hash bucket is locked.
	 */
	last_priority = hp->hash_priority;
	for (bhp = first_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
		DB_ASSERT(dbenv, !SH_CHAIN_HASNEXT(bhp, vc));

		if (F_ISSET(bhp, BH_DIRTY))
			dirty++;
		priority = BH_PRIORITY(bhp);
		DB_ASSERT(dbenv, (bhp == first_bhp) ?
		    priority == last_priority : priority >= last_priority);
		last_priority = priority;

		/* Chains have referential integrity. */
		for (tbhp = SH_CHAIN_PREV(bhp, vc, __bh); tbhp != NULL;
		    tbhp = SH_CHAIN_PREV(tbhp, vc, __bh))
			DB_ASSERT(dbenv, tbhp == SH_CHAIN_PREV(
			    SH_CHAIN_NEXT(tbhp, vc, __bh), vc, __bh));

		/*
		 * No repeats.
		 * XXX This is O(N**2) where N is the number of buffers in the
		 * bucket, but we generally assume small buckets.
		 */
		for (tbhp = SH_TAILQ_NEXT(bhp, hq, __bh); tbhp != NULL;
		    tbhp = SH_TAILQ_NEXT(tbhp, hq, __bh))
			DB_ASSERT(dbenv, bhp->pgno != tbhp->pgno ||
			    bhp->mf_offset != tbhp->mf_offset);
	}
	DB_ASSERT(dbenv, dirty == hp->hash_page_dirty);
}
#endif