#include "db_config.h"
#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
int
__memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp)
DB_MPOOLFILE *dbmfp;
db_pgno_t *pgnoaddr;
DB_TXN *txnp;
u_int32_t flags;
void *addrp;
{
DB_ENV *dbenv;
DB_THREAD_INFO *ip;
int rep_check, ret;
dbenv = dbmfp->dbenv;
PANIC_CHECK(dbenv);
MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get");
#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \
DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW)
if (flags != 0) {
if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
return (ret);
switch (flags) {
case DB_MPOOL_DIRTY:
case DB_MPOOL_CREATE:
case DB_MPOOL_EDIT:
case DB_MPOOL_LAST:
case DB_MPOOL_NEW:
break;
default:
return (__db_ferr(dbenv, "memp_fget", 1));
}
}
ENV_ENTER(dbenv, ip);
rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0;
if (rep_check && (ret = __op_rep_enter(dbenv)) != 0)
goto err;
ret = __memp_fget(dbmfp, pgnoaddr, txnp, flags, addrp);
if (ret != 0 && rep_check)
(void)__op_rep_exit(dbenv);
err: if (ret != 0)
ENV_LEAVE(dbenv, ip);
return (ret);
}
int
__memp_fget(dbmfp, pgnoaddr, txn, flags, addrp)
DB_MPOOLFILE *dbmfp;
db_pgno_t *pgnoaddr;
DB_TXN *txn;
u_int32_t flags;
void *addrp;
{
enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
BH *alloc_bhp, *bhp, *frozen_bhp, *oldest_bhp;
DB_ENV *dbenv;
DB_LSN *read_lsnp, vlsn;
DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
MPOOL *c_mp;
MPOOLFILE *mfp;
REGINFO *infop, *t_infop;
TXN_DETAIL *td;
roff_t mf_offset;
u_int32_t st_hsearch;
int b_incr, b_locked, dirty, edit, extending, first;
int makecopy, mvcc, need_free, reorder, ret;
*(void **)addrp = NULL;
COMPQUIET(c_mp, NULL);
COMPQUIET(infop, NULL);
COMPQUIET(oldest_bhp, NULL);
dbenv = dbmfp->dbenv;
dbmp = dbenv->mp_handle;
mfp = dbmfp->mfp;
mvcc = mfp->multiversion;
mf_offset = R_OFFSET(dbmp->reginfo, mfp);
alloc_bhp = bhp = frozen_bhp = NULL;
read_lsnp = NULL;
td = NULL;
hp = NULL;
b_incr = b_locked = extending = makecopy = ret = 0;
if (LF_ISSET(DB_MPOOL_DIRTY)) {
if (F_ISSET(dbmfp, MP_READONLY)) {
__db_errx(dbenv,
"%s: dirty flag set for readonly file page",
__memp_fn(dbmfp));
return (EINVAL);
}
if ((ret = __db_fcchk(dbenv, "DB_MPOOLFILE->get",
flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0)
return (ret);
}
dirty = LF_ISSET(DB_MPOOL_DIRTY);
edit = LF_ISSET(DB_MPOOL_EDIT);
LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT);
if (mvcc && txn != NULL && txn->td != NULL) {
while (txn->parent != NULL)
txn = txn->parent;
td = (TXN_DETAIL *)txn->td;
if (F_ISSET(txn, TXN_SNAPSHOT)) {
read_lsnp = &td->read_lsn;
if (IS_MAX_LSN(*read_lsnp) &&
(ret = __log_current_lsn(dbenv, read_lsnp,
NULL, NULL)) != 0)
return (ret);
}
if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) &&
td->mvcc_mtx == MUTEX_INVALID && (ret =
__mutex_alloc(dbenv, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0)
return (ret);
}
switch (flags) {
case DB_MPOOL_LAST:
MUTEX_LOCK(dbenv, mfp->mutex);
*pgnoaddr = mfp->last_pgno;
MUTEX_UNLOCK(dbenv, mfp->mutex);
break;
case DB_MPOOL_NEW:
state = FIRST_MISS;
goto alloc;
case DB_MPOOL_CREATE:
default:
break;
}
if (dbmfp->addr != NULL &&
F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
*(void **)addrp = (u_int8_t *)dbmfp->addr +
(*pgnoaddr * mfp->stat.st_pagesize);
STAT(++mfp->stat.st_map);
return (0);
}
retry:
MP_GET_BUCKET(dbmfp, *pgnoaddr, &infop, hp, ret);
if (ret != 0)
return (ret);
c_mp = infop->primary;
st_hsearch = 0;
b_locked = 1;
SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
++st_hsearch;
if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
continue;
if (mvcc && !edit && read_lsnp != NULL) {
while (bhp != NULL &&
!BH_OWNED_BY(dbenv, bhp, txn) &&
!BH_VISIBLE(dbenv, bhp, read_lsnp, vlsn))
bhp = SH_CHAIN_PREV(bhp, vc, __bh);
DB_ASSERT(dbenv, bhp != NULL);
}
makecopy = mvcc && dirty && !BH_OWNED_BY(dbenv, bhp, txn);
if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) {
DB_ASSERT(dbenv, frozen_bhp == NULL);
frozen_bhp = bhp;
}
if (bhp->ref == UINT16_MAX) {
__db_errx(dbenv,
"%s: page %lu: reference count overflow",
__memp_fn(dbmfp), (u_long)bhp->pgno);
ret = __db_panic(dbenv, EINVAL);
goto err;
}
++bhp->ref;
b_incr = 1;
for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
!F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
if (!first && bhp->ref_sync != 0) {
--bhp->ref;
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
bhp = frozen_bhp = NULL;
b_incr = b_locked = 0;
__os_sleep(dbenv, 0, 1);
goto retry;
}
if (!F_ISSET(hp, IO_WAITER)) {
F_SET(hp, IO_WAITER);
MUTEX_LOCK(dbenv, hp->mtx_io);
}
STAT(++hp->hash_io_wait);
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
MUTEX_LOCK(dbenv, hp->mtx_io);
MUTEX_UNLOCK(dbenv, hp->mtx_io);
MUTEX_LOCK(dbenv, hp->mtx_hash);
}
if (frozen_bhp != NULL && F_ISSET(frozen_bhp, BH_THAWED)) {
thawed: need_free = (--frozen_bhp->ref == 0);
b_incr = 0;
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
MPOOL_REGION_LOCK(dbenv, infop);
if (alloc_bhp != NULL) {
__memp_free(infop, mfp, alloc_bhp);
alloc_bhp = NULL;
}
if (need_free)
SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
frozen_bhp, hq);
MPOOL_REGION_UNLOCK(dbenv, infop);
bhp = frozen_bhp = NULL;
goto retry;
}
if (SH_CHAIN_HASNEXT(bhp, vc) &&
SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off) {
--bhp->ref;
b_incr = 0;
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
bhp = frozen_bhp = NULL;
goto retry;
} else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
ret = DB_LOCK_DEADLOCK;
goto err;
}
#ifdef HAVE_STATISTICS
++mfp->stat.st_cache_hit;
#endif
break;
}
#ifdef HAVE_STATISTICS
++c_mp->stat.st_hash_searches;
if (st_hsearch > c_mp->stat.st_hash_longest)
c_mp->stat.st_hash_longest = st_hsearch;
c_mp->stat.st_hash_examined += st_hsearch;
#endif
state = bhp == NULL ?
(alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
(alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
switch (state) {
case FIRST_FOUND:
if (flags == DB_MPOOL_FREE) {
if (--bhp->ref == 0) {
if (F_ISSET(bhp, BH_DIRTY)) {
--hp->hash_page_dirty;
F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
}
if (mvcc && (F_ISSET(bhp, BH_FROZEN) ||
!SH_CHAIN_SINGLETON(bhp, vc) ||
bhp->td_off == INVALID_ROFF ||
!IS_MAX_LSN(*VISIBLE_LSN(dbenv, bhp)))) {
F_SET(bhp, BH_FREED);
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
return (0);
}
return (__memp_bhfree(
dbmp, infop, hp, bhp, BH_FREE_FREEMEM));
}
__db_errx(dbenv,
"File %s: freeing pinned buffer for page %lu",
__memp_fns(dbmp, mfp), (u_long)*pgnoaddr);
ret = __db_panic(dbenv, EINVAL);
goto err;
}
if (mvcc) {
if (flags == DB_MPOOL_CREATE &&
F_ISSET(bhp, BH_FREED)) {
extending = makecopy = 1;
MUTEX_LOCK(dbenv, mfp->mutex);
if (*pgnoaddr > mfp->last_pgno)
mfp->last_pgno = *pgnoaddr;
MUTEX_UNLOCK(dbenv, mfp->mutex);
}
reuse: if ((makecopy || frozen_bhp != NULL) && (oldest_bhp =
SH_CHAIN_PREV(bhp, vc, __bh)) != NULL) {
while (SH_CHAIN_HASPREV(oldest_bhp, vc))
oldest_bhp = SH_CHAIN_PREVP(oldest_bhp,
vc, __bh);
if (oldest_bhp->ref == 0 && !BH_OBSOLETE(
oldest_bhp, hp->old_reader, vlsn) &&
(ret = __txn_oldest_reader(dbenv,
&hp->old_reader)) != 0)
goto err;
if (BH_OBSOLETE(
oldest_bhp, hp->old_reader, vlsn) &&
oldest_bhp->ref == 0) {
if (F_ISSET(oldest_bhp, BH_FROZEN)) {
++oldest_bhp->ref;
if ((ret = __memp_bh_thaw(dbmp,
infop, hp, oldest_bhp,
NULL)) != 0)
goto err;
goto reuse;
} else if ((ret = __memp_bhfree(dbmp,
infop, hp, oldest_bhp,
BH_FREE_REUSE)) != 0)
goto err;
alloc_bhp = oldest_bhp;
}
DB_ASSERT(dbenv, alloc_bhp == NULL ||
!F_ISSET(alloc_bhp, BH_FROZEN));
}
}
if ((!makecopy && frozen_bhp == NULL) || alloc_bhp != NULL)
break;
case FIRST_MISS:
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
b_locked = 0;
if (flags == DB_MPOOL_FREE)
return (0);
alloc:
DB_ASSERT(dbenv, !b_locked);
MUTEX_LOCK(dbenv, mfp->mutex);
switch (flags) {
case DB_MPOOL_NEW:
extending = 1;
if (mfp->maxpgno != 0 &&
mfp->last_pgno >= mfp->maxpgno) {
__db_errx(
dbenv, "%s: file limited to %lu pages",
__memp_fn(dbmfp), (u_long)mfp->maxpgno);
ret = ENOSPC;
} else
*pgnoaddr = mfp->last_pgno + 1;
break;
case DB_MPOOL_CREATE:
if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
__db_errx(
dbenv, "%s: file limited to %lu pages",
__memp_fn(dbmfp), (u_long)mfp->maxpgno);
ret = ENOSPC;
} else if (!extending)
extending = *pgnoaddr > mfp->last_pgno;
break;
default:
ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
break;
}
MUTEX_UNLOCK(dbenv, mfp->mutex);
if (ret != 0)
goto err;
MP_GET_REGION(dbmfp, *pgnoaddr, &infop, ret);
if (ret != 0)
goto err;
c_mp = infop->primary;
if ((ret =
__memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
goto err;
#ifdef DIAGNOSTIC
if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
__db_errx(dbenv,
"DB_MPOOLFILE->get: buffer data is NOT size_t aligned");
ret = __db_panic(dbenv, EINVAL);
goto err;
}
#endif
if (extending)
MUTEX_LOCK(dbenv, mfp->mutex);
if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
*pgnoaddr = mfp->last_pgno + 1;
MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop,ret);
if (ret != 0)
goto err;
if (t_infop != infop) {
MUTEX_UNLOCK(dbenv, mfp->mutex);
MPOOL_REGION_LOCK(dbenv, infop);
__memp_free(infop, mfp, alloc_bhp);
c_mp->stat.st_pages--;
MPOOL_REGION_UNLOCK(dbenv, infop);
alloc_bhp = NULL;
goto alloc;
}
}
if (extending) {
if (*pgnoaddr > mfp->last_pgno)
mfp->last_pgno = *pgnoaddr;
MUTEX_UNLOCK(dbenv, mfp->mutex);
if (ret != 0)
goto err;
}
if (bhp != NULL) {
MUTEX_LOCK(dbenv, hp->mtx_hash);
b_locked = 1;
break;
}
DB_ASSERT(dbenv, frozen_bhp == NULL);
goto retry;
case SECOND_FOUND:
if (extending && F_ISSET(bhp, BH_FREED))
makecopy = 1;
if (makecopy || frozen_bhp != NULL)
break;
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
MPOOL_REGION_LOCK(dbenv, infop);
__memp_free(infop, mfp, alloc_bhp);
c_mp->stat.st_pages--;
MPOOL_REGION_UNLOCK(dbenv, infop);
alloc_bhp = NULL;
if (flags == DB_MPOOL_NEW) {
--bhp->ref;
b_incr = b_locked = 0;
bhp = NULL;
goto alloc;
}
MUTEX_LOCK(dbenv, hp->mtx_hash);
break;
case SECOND_MISS:
bhp = alloc_bhp;
alloc_bhp = NULL;
#ifdef DIAG_MVCC
memset(bhp, 0, SSZ(BH, align_off));
#else
memset(bhp, 0, sizeof(BH));
#endif
bhp->ref = 1;
b_incr = 1;
bhp->priority = UINT32_MAX;
bhp->pgno = *pgnoaddr;
bhp->mf_offset = mf_offset;
SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
SH_CHAIN_INIT(bhp, vc);
hp->hash_priority =
BH_PRIORITY(SH_TAILQ_FIRSTP(&hp->hash_bucket, __bh));
if (extending) {
++hp->hash_page_dirty;
F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
}
if (extending) {
MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
PROT_READ | PROT_WRITE);
if (mfp->clear_len == DB_CLEARLEN_NOTSET)
memset(bhp->buf, 0, mfp->stat.st_pagesize);
else {
memset(bhp->buf, 0, mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
mfp->stat.st_pagesize - mfp->clear_len);
#endif
}
if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
F_SET(bhp, BH_CALLPGIN);
STAT(++mfp->stat.st_page_create);
} else {
F_SET(bhp, BH_TRASH);
STAT(++mfp->stat.st_cache_miss);
}
MUTEX_LOCK(dbenv, mfp->mutex);
++mfp->block_cnt;
MUTEX_UNLOCK(dbenv, mfp->mutex);
}
DB_ASSERT(dbenv, bhp != NULL);
DB_ASSERT(dbenv, bhp->ref != 0);
if (frozen_bhp != NULL) {
DB_ASSERT(dbenv, alloc_bhp != NULL);
if (F_ISSET(frozen_bhp, BH_THAWED))
goto thawed;
else {
if ((ret = __memp_bh_thaw(dbmp, infop, hp,
frozen_bhp, alloc_bhp)) != 0)
goto err;
bhp = alloc_bhp;
}
frozen_bhp = alloc_bhp = NULL;
if (makecopy) {
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
b_locked = 0;
goto alloc;
}
}
if (F_ISSET(bhp, BH_TRASH) &&
(ret = __memp_pgread(dbmfp,
hp, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
goto err;
if (F_ISSET(bhp, BH_CALLPGIN)) {
MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
PROT_READ | PROT_WRITE);
if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
goto err;
F_CLR(bhp, BH_CALLPGIN);
}
if (makecopy && state != SECOND_MISS) {
DB_ASSERT(dbenv, !SH_CHAIN_HASNEXT(bhp, vc));
DB_ASSERT(dbenv, bhp != NULL);
DB_ASSERT(dbenv, alloc_bhp != NULL);
DB_ASSERT(dbenv, alloc_bhp != bhp);
if (bhp->ref == 1)
MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
PROT_READ);
alloc_bhp->ref = 1;
alloc_bhp->ref_sync = 0;
alloc_bhp->flags = F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
alloc_bhp->priority = bhp->priority;
alloc_bhp->pgno = bhp->pgno;
alloc_bhp->mf_offset = bhp->mf_offset;
alloc_bhp->td_off = INVALID_ROFF;
if (txn != NULL &&
(ret = __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0)
goto err;
if (extending) {
memset(alloc_bhp->buf, 0, mfp->stat.st_pagesize);
F_SET(alloc_bhp, BH_DIRTY_CREATE);
} else
memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize);
SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh);
SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
bhp, alloc_bhp, hq, __bh);
SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
if (--bhp->ref == 0) {
bhp->priority = c_mp->lru_count;
__memp_bucket_reorder(dbenv, hp, bhp);
MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);
}
bhp = alloc_bhp;
if (alloc_bhp != oldest_bhp) {
MUTEX_LOCK(dbenv, mfp->mutex);
++mfp->block_cnt;
MUTEX_UNLOCK(dbenv, mfp->mutex);
}
alloc_bhp = NULL;
} else if (mvcc && extending && txn != NULL &&
(ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0)
goto err;
if ((dirty || edit || extending) && !F_ISSET(bhp, BH_DIRTY)) {
DB_ASSERT(dbenv, !SH_CHAIN_HASNEXT(bhp, vc));
++hp->hash_page_dirty;
F_SET(bhp, BH_DIRTY);
}
if (state != SECOND_MISS && bhp->ref == 1) {
if (SH_CHAIN_SINGLETON(bhp, vc)) {
bhp->priority = UINT32_MAX;
if (bhp != SH_TAILQ_LAST(&hp->hash_bucket, hq, __bh)) {
SH_TAILQ_REMOVE(&hp->hash_bucket,
bhp, hq, __bh);
SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
}
hp->hash_priority = BH_PRIORITY(
SH_TAILQ_FIRSTP(&hp->hash_bucket, __bh));
} else {
reorder = (BH_PRIORITY(bhp) == bhp->priority);
bhp->priority = UINT32_MAX;
if (reorder)
__memp_bucket_reorder(dbenv, hp, bhp);
}
}
MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ |
(dirty || edit || extending || F_ISSET(bhp, BH_DIRTY) ?
PROT_WRITE : 0));
#ifdef DIAGNOSTIC
__memp_check_order(dbenv, hp);
{
BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh);
DB_ASSERT(dbenv, !mfp->multiversion ||
!F_ISSET(bhp, BH_DIRTY) || next_bhp == NULL);
DB_ASSERT(dbenv, !mvcc || edit || read_lsnp == NULL ||
bhp->td_off == INVALID_ROFF || BH_OWNED_BY(dbenv, bhp, txn) ||
(BH_VISIBLE(dbenv, bhp, read_lsnp, vlsn) &&
(next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) ||
(next_bhp->td_off != INVALID_ROFF &&
(BH_OWNER(dbenv, next_bhp)->status != TXN_COMMITTED ||
!BH_VISIBLE(dbenv, next_bhp, read_lsnp, vlsn))))));
}
#endif
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
#ifdef DIAGNOSTIC
MPOOL_SYSTEM_LOCK(dbenv);
++dbmfp->pinref;
MPOOL_SYSTEM_UNLOCK(dbenv);
if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
__os_yield(dbenv);
#endif
DB_ASSERT(dbenv, alloc_bhp == NULL);
*(void **)addrp = bhp->buf;
return (0);
err:
if (b_incr || frozen_bhp != NULL) {
if (!b_locked) {
MUTEX_LOCK(dbenv, hp->mtx_hash);
b_locked = 1;
}
if (frozen_bhp != NULL)
--frozen_bhp->ref;
if (b_incr && bhp != frozen_bhp)
--bhp->ref;
}
if (b_locked)
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
if (alloc_bhp != NULL) {
MPOOL_REGION_LOCK(dbenv, infop);
__memp_free(infop, mfp, alloc_bhp);
c_mp->stat.st_pages--;
MPOOL_REGION_UNLOCK(dbenv, infop);
}
return (ret);
}