#include "db_config.h"
#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
int
__memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp)
DB_MPOOLFILE *dbmfp;
db_pgno_t *pgnoaddr;
DB_TXN *txnp;
u_int32_t flags;
void *addrp;
{
DB_THREAD_INFO *ip;
ENV *env;
int rep_blocked, ret;
env = dbmfp->env;
MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get");
#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \
DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW)
if (flags != 0) {
if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0)
return (ret);
switch (flags) {
case DB_MPOOL_DIRTY:
case DB_MPOOL_CREATE:
case DB_MPOOL_EDIT:
case DB_MPOOL_LAST:
case DB_MPOOL_NEW:
break;
default:
return (__db_ferr(env, "memp_fget", 1));
}
}
ENV_ENTER(env, ip);
rep_blocked = 0;
if (txnp == NULL && IS_ENV_REPLICATED(env)) {
if ((ret = __op_rep_enter(env)) != 0)
goto err;
rep_blocked = 1;
}
ret = __memp_fget(dbmfp, pgnoaddr, ip, txnp, flags, addrp);
if (ret != 0 && rep_blocked)
(void)__op_rep_exit(env);
err: if (ret != 0)
ENV_LEAVE(env, ip);
return (ret);
}
/*
 * __memp_fget --
 *	Return a pinned pointer to a page of a memory-pool file, searching the
 *	cache first and allocating (and, if needed, reading or creating) a
 *	buffer on a miss.
 *
 * Parameters:
 *	dbmfp		file handle.
 *	pgnoaddr	in/out page number.  Overwritten with mfp->last_pgno
 *			for DB_MPOOL_LAST and with the newly assigned page
 *			number for DB_MPOOL_NEW.
 *	ip		thread info or NULL; when non-NULL the pinned buffer
 *			is recorded in the thread's pin list.
 *	txn		transaction or NULL; only consulted when the file is
 *			multiversion (mfp->multiversion).
 *	flags		DB_MPOOL_DIRTY / DB_MPOOL_EDIT modifiers plus at most
 *			one of DB_MPOOL_CREATE, DB_MPOOL_LAST, DB_MPOOL_NEW,
 *			or DB_MPOOL_FREE.
 *	addrp		really a "void **": set to NULL on entry and to the
 *			page buffer on success.
 *
 * Returns 0 on success.  Errors produced directly here are EINVAL (dirty
 * request on a read-only file), ENOSPC (file page limit reached),
 * DB_PAGE_NOTFOUND (page past the end of file without CREATE/NEW),
 * DB_LOCK_DEADLOCK (dirty request on a buffer that already has a newer
 * version) and the result of __env_panic; errors from called helpers are
 * passed through.
 *
 * Locking state is tracked in two locals so the error path can undo it:
 *	b_locked	non-zero while hp->mtx_hash is believed held.
 *	b_incr		non-zero while we hold a reference on bhp.
 */
int
__memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
	void *addrp;
{
	enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
	BH *alloc_bhp, *bhp, *frozen_bhp, *oldest_bhp;
	ENV *env;
	DB_LSN *read_lsnp, vlsn;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp;
	MPOOLFILE *mfp;
	PIN_LIST *list, *lp;
	REGINFO *infop, *t_infop, *reginfo;
	TXN_DETAIL *td;
	roff_t list_off, mf_offset;
	u_int32_t pinmax, st_hsearch;
	int b_incr, b_locked, dirty, edit, extending, first;
	int makecopy, mvcc, need_free, ret;

	*(void **)addrp = NULL;
	COMPQUIET(c_mp, NULL);
	COMPQUIET(infop, NULL);
	COMPQUIET(oldest_bhp, NULL);

	env = dbmfp->env;
	dbmp = env->mp_handle;

	mfp = dbmfp->mfp;
	mvcc = mfp->multiversion;
	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
	alloc_bhp = bhp = frozen_bhp = NULL;
	read_lsnp = NULL;
	td = NULL;
	hp = NULL;
	b_incr = b_locked = extending = makecopy = ret = 0;

	if (LF_ISSET(DB_MPOOL_DIRTY)) {
		/* A read-only file can never hand out a dirty page. */
		if (F_ISSET(dbmfp, MP_READONLY)) {
			__db_errx(env,
			    "%s: dirty flag set for readonly file page",
			    __memp_fn(dbmfp));
			return (EINVAL);
		}
		/* DB_MPOOL_DIRTY and DB_MPOOL_EDIT may not be combined. */
		if ((ret = __db_fcchk(env, "DB_MPOOLFILE->get",
		    flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0)
			return (ret);
	}

	/*
	 * Remember the modifiers and strip them, so that "flags" below holds
	 * only the page-selection flag and can be compared with ==.
	 */
	dirty = LF_ISSET(DB_MPOOL_DIRTY);
	edit = LF_ISSET(DB_MPOOL_EDIT);
	LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT);

	if (mvcc && txn != NULL && txn->td != NULL) {
		/* Only the outermost parent transaction is used. */
		while (txn->parent != NULL)
			txn = txn->parent;
		td = (TXN_DETAIL *)txn->td;
		if (F_ISSET(txn, TXN_SNAPSHOT)) {
			/*
			 * Snapshot transaction: fix its read LSN at the
			 * current log position if it has not been set yet.
			 */
			read_lsnp = &td->read_lsn;
			if (IS_MAX_LSN(*read_lsnp) &&
			    (ret = __log_current_lsn(env, read_lsnp,
			    NULL, NULL)) != 0)
				return (ret);
		}
		/* Allocate the transaction's MVCC mutex on first update. */
		if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) &&
		    td->mvcc_mtx == MUTEX_INVALID && (ret =
		    __mutex_alloc(env, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0)
			return (ret);
	}

	switch (flags) {
	case DB_MPOOL_LAST:
		/* Resolve "last page" to a concrete page number. */
		MUTEX_LOCK(env, mfp->mutex);
		*pgnoaddr = mfp->last_pgno;
		MUTEX_UNLOCK(env, mfp->mutex);
		break;
	case DB_MPOOL_NEW:
		/* A new page cannot be in the cache: skip the first search. */
		state = FIRST_MISS;
		goto alloc;
	case DB_MPOOL_CREATE:
	default:
		break;
	}

	/*
	 * If the file is mapped and the page lies within the file's original
	 * extent (orig_last_pgno), return a pointer straight into the
	 * mapping.  No buffer is pinned on this path.
	 */
	if (dbmfp->addr != NULL &&
	    F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
		*(void **)addrp = (u_int8_t *)dbmfp->addr +
		    (*pgnoaddr * mfp->stat.st_pagesize);
		STAT(++mfp->stat.st_map);
		return (0);
	}

retry:
	/*
	 * Find the cache region and hash bucket for the page.  This is redone
	 * on every pass because *pgnoaddr can change.
	 * NOTE(review): b_locked is set unconditionally below, so
	 * MP_GET_BUCKET presumably returns with hp->mtx_hash held -- confirm
	 * against the macro.
	 */
	MP_GET_BUCKET(env, mfp, *pgnoaddr, &infop, hp, ret);
	if (ret != 0)
		return (ret);
	c_mp = infop->primary;

	/* Search the hash chain for the page. */
	st_hsearch = 0;
	b_locked = 1;
	SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;

		/*
		 * Snapshot read: walk back along the version chain to the
		 * first version owned by this transaction or visible at its
		 * read LSN.
		 */
		if (mvcc && !edit && read_lsnp != NULL) {
			while (bhp != NULL &&
			    !BH_OWNED_BY(env, bhp, txn) &&
			    !BH_VISIBLE(env, bhp, read_lsnp, vlsn))
				bhp = SH_CHAIN_PREV(bhp, vc, __bh);

			DB_ASSERT(env, bhp != NULL);
		}

		/* A dirty MVCC get of someone else's version needs a copy. */
		makecopy = mvcc && dirty && !BH_OWNED_BY(env, bhp, txn);

		if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) {
			DB_ASSERT(env, frozen_bhp == NULL);
			frozen_bhp = bhp;
		}

		/*
		 * Take a reference so the buffer stays put while the bucket
		 * lock is dropped below.  The counter is checked against
		 * UINT16_MAX first; overflow panics the environment.
		 */
		if (bhp->ref == UINT16_MAX) {
			__db_errx(env,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = __env_panic(env, EINVAL);
			goto err;
		}
		++bhp->ref;
		b_incr = 1;

		/*
		 * While the buffer is BH_LOCKED (and locking is enabled),
		 * wait on the bucket's I/O mutex with the bucket lock
		 * released, then re-check.
		 */
		for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
		    !F_ISSET(env->dbenv, DB_ENV_NOLOCKING); first = 0) {
			/*
			 * After the first wait, if a sync is waiting on this
			 * buffer (ref_sync != 0), drop our reference, yield
			 * and start over.
			 */
			if (!first && bhp->ref_sync != 0) {
				--bhp->ref;
				MUTEX_UNLOCK(env, hp->mtx_hash);
				bhp = frozen_bhp = NULL;
				b_incr = b_locked = 0;
				__os_yield(env, 0, 1);
				goto retry;
			}

			/* First waiter sets the flag and takes the I/O mutex. */
			if (!F_ISSET(hp, IO_WAITER)) {
				F_SET(hp, IO_WAITER);
				MUTEX_LOCK(env, hp->mtx_io);
			}
			STAT(++hp->hash_io_wait);

			/* Release the bucket lock while we wait. */
			MUTEX_UNLOCK(env, hp->mtx_hash);

			/* Block until the I/O mutex is released, then let go. */
			MUTEX_LOCK(env, hp->mtx_io);
			MUTEX_UNLOCK(env, hp->mtx_io);

			/* Re-acquire the bucket lock and re-test BH_LOCKED. */
			MUTEX_LOCK(env, hp->mtx_hash);
		}

		/*
		 * The frozen buffer was thawed (by someone else) while we
		 * waited: drop our reference, queue it on free_frozen if we
		 * were the last holder, discard any buffer we had allocated
		 * and search again.  Also entered from below via "thawed".
		 */
		if (frozen_bhp != NULL && F_ISSET(frozen_bhp, BH_THAWED)) {
thawed:			need_free = (--frozen_bhp->ref == 0);
			b_incr = 0;
			MUTEX_UNLOCK(env, hp->mtx_hash);
			MPOOL_REGION_LOCK(env, infop);
			if (alloc_bhp != NULL) {
				__memp_free(infop, mfp, alloc_bhp);
				alloc_bhp = NULL;
			}
			if (need_free)
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    frozen_bhp, hq);
			MPOOL_REGION_UNLOCK(env, infop);
			bhp = frozen_bhp = NULL;
			goto retry;
		}

		/*
		 * If the next version in the chain has the same td_off as
		 * this one, drop our reference and search again.  Otherwise,
		 * a dirty request on a buffer that already has a newer
		 * version is reported as a deadlock.
		 */
		if (SH_CHAIN_HASNEXT(bhp, vc) &&
		    SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off) {
			--bhp->ref;
			b_incr = 0;
			MUTEX_UNLOCK(env, hp->mtx_hash);
			bhp = frozen_bhp = NULL;
			goto retry;
		} else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
			ret = DB_LOCK_DEADLOCK;
			goto err;
		}

#ifdef HAVE_STATISTICS
		++mfp->stat.st_cache_hit;
#endif
		break;
	}

#ifdef HAVE_STATISTICS
	/* Update the hash bucket search statistics. */
	++c_mp->stat.st_hash_searches;
	if (st_hsearch > c_mp->stat.st_hash_longest)
		c_mp->stat.st_hash_longest = st_hsearch;
	c_mp->stat.st_hash_examined += st_hsearch;
#endif

	/*
	 * Four cases: the page was found or missed, and this is the first
	 * search or a second one made after allocating a buffer (alloc_bhp).
	 */
	state = bhp == NULL ?
	    (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
	    (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);

	switch (state) {
	case FIRST_FOUND:
		/*
		 * DB_MPOOL_FREE: discard the buffer.  We must hold the only
		 * reference; anything else is a panic.
		 */
		if (flags == DB_MPOOL_FREE) {
			if (--bhp->ref == 0) {
				if (F_ISSET(bhp, BH_DIRTY)) {
					--hp->hash_page_dirty;
					F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
				}
				/*
				 * In a multiversion file, a buffer that is
				 * frozen, part of a version chain, has no
				 * owner or has a visible LSN set is only
				 * marked BH_FREED rather than released.
				 */
				if (mvcc && (F_ISSET(bhp, BH_FROZEN) ||
				    !SH_CHAIN_SINGLETON(bhp, vc) ||
				    bhp->td_off == INVALID_ROFF ||
				    !IS_MAX_LSN(*VISIBLE_LSN(env, bhp)))) {
					F_SET(bhp, BH_FREED);
					MUTEX_UNLOCK(env, hp->mtx_hash);
					return (0);
				}
				/*
				 * NOTE(review): returns without unlocking
				 * hp->mtx_hash here, so __memp_bhfree is
				 * presumably responsible for releasing it --
				 * confirm.
				 */
				return (__memp_bhfree(
				    dbmp, infop, hp, bhp, BH_FREE_FREEMEM));
			}
			__db_errx(env,
			    "File %s: freeing pinned buffer for page %lu",
			    __memp_fns(dbmp, mfp), (u_long)*pgnoaddr);
			ret = __env_panic(env, EINVAL);
			goto err;
		}

		if (mvcc) {
			/*
			 * Re-creating a page whose buffer was marked freed:
			 * treat it as extending the file and make a copy.
			 */
			if (flags == DB_MPOOL_CREATE &&
			    F_ISSET(bhp, BH_FREED)) {
				extending = makecopy = 1;
				MUTEX_LOCK(env, mfp->mutex);
				if (*pgnoaddr > mfp->last_pgno)
					mfp->last_pgno = *pgnoaddr;
				MUTEX_UNLOCK(env, mfp->mutex);
			}

			/*
			 * We need a second buffer (for a copy or a thaw).
			 * Before allocating, try to recycle the oldest
			 * version in this page's chain if it is unreferenced
			 * and obsolete.
			 */
reuse:			if ((makecopy || frozen_bhp != NULL) && (oldest_bhp =
			    SH_CHAIN_PREV(bhp, vc, __bh)) != NULL) {
				while (SH_CHAIN_HASPREV(oldest_bhp, vc))
					oldest_bhp = SH_CHAIN_PREVP(oldest_bhp,
					    vc, __bh);

				/*
				 * Refresh the bucket's cached oldest-reader
				 * LSN only if the cached value does not
				 * already show the buffer as obsolete.
				 */
				if (oldest_bhp->ref == 0 && !BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    (ret = __txn_oldest_reader(env,
				    &hp->old_reader)) != 0)
					goto err;

				if (BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    oldest_bhp->ref == 0) {
					if (F_ISSET(oldest_bhp, BH_FROZEN)) {
						/*
						 * Thaw the frozen obsolete
						 * version (no target buffer)
						 * and look again.
						 */
						++oldest_bhp->ref;
						if ((ret = __memp_bh_thaw(dbmp,
						    infop, hp, oldest_bhp,
						    NULL)) != 0)
							goto err;
						goto reuse;
					} else if ((ret = __memp_bhfree(dbmp,
					    infop, hp, oldest_bhp,
					    BH_FREE_REUSE)) != 0)
						goto err;
					alloc_bhp = oldest_bhp;
				}

				DB_ASSERT(env, alloc_bhp == NULL ||
				    !F_ISSET(alloc_bhp, BH_FROZEN));
			}
		}

		/* Done unless a second buffer is needed and we have none. */
		if ((!makecopy && frozen_bhp == NULL) || alloc_bhp != NULL)
			break;

		/* FALLTHROUGH: allocate the second buffer. */
	case FIRST_MISS:
		/*
		 * Not in the cache (or a second buffer is needed): release
		 * the bucket lock before allocating.
		 */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		b_locked = 0;

		/* Freeing a page that is not cached is a no-op. */
		if (flags == DB_MPOOL_FREE)
			return (0);

alloc:
		DB_ASSERT(env, !b_locked);

		/* Validate / assign the page number under the file mutex. */
		MUTEX_LOCK(env, mfp->mutex);
		switch (flags) {
		case DB_MPOOL_NEW:
			extending = 1;
			if (mfp->maxpgno != 0 &&
			    mfp->last_pgno >= mfp->maxpgno) {
				__db_errx(
				    env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else
				*pgnoaddr = mfp->last_pgno + 1;
			break;
		case DB_MPOOL_CREATE:
			if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
				__db_errx(
				    env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else if (!extending)
				extending = *pgnoaddr > mfp->last_pgno;
			break;
		default:
			/* Plain get: the page must already exist. */
			ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
			break;
		}
		MUTEX_UNLOCK(env, mfp->mutex);

		if (ret != 0)
			goto err;

		/*
		 * Allocate a buffer from the region the page maps to.  The
		 * file mutex is not held across the allocation.
		 */
		MP_GET_REGION(dbmfp, *pgnoaddr, &infop, ret);
		if (ret != 0)
			goto err;
		c_mp = infop->primary;
		if ((ret =
		    __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
			goto err;
#ifdef DIAGNOSTIC
		if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
			__db_errx(env,
		    "DB_MPOOLFILE->get: buffer data is NOT size_t aligned");
			ret = __env_panic(env, EINVAL);
			goto err;
		}
#endif

		/*
		 * When extending, re-take the file mutex and hold it until
		 * last_pgno has been updated below.
		 */
		if (extending)
			MUTEX_LOCK(env, mfp->mutex);

		/*
		 * For DB_MPOOL_NEW the file may have grown while we were
		 * allocating.  Recompute the page number; if the new number
		 * maps to a different region, free the buffer and allocate
		 * again from the right one.
		 */
		if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
			*pgnoaddr = mfp->last_pgno + 1;
			MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret);
			if (ret != 0) {
				/*
				 * extending is always set for DB_MPOOL_NEW,
				 * so the file mutex is held here and the
				 * error path does not release it: unlock
				 * before bailing out.
				 */
				MUTEX_UNLOCK(env, mfp->mutex);
				goto err;
			}
			if (t_infop != infop) {
				MUTEX_UNLOCK(env, mfp->mutex);

				MPOOL_REGION_LOCK(env, infop);
				__memp_free(infop, mfp, alloc_bhp);
				c_mp->stat.st_pages--;
				MPOOL_REGION_UNLOCK(env, infop);

				alloc_bhp = NULL;
				goto alloc;
			}
		}

		if (extending) {
			if (*pgnoaddr > mfp->last_pgno)
				mfp->last_pgno = *pgnoaddr;

			MUTEX_UNLOCK(env, mfp->mutex);
			if (ret != 0)
				goto err;
		}

		/*
		 * If we still hold a found buffer (copy/thaw case), re-lock
		 * the bucket and carry on with both buffers.
		 */
		if (bhp != NULL) {
			MUTEX_LOCK(env, hp->mtx_hash);
			b_locked = 1;
			break;
		}

		/* Otherwise search again, now with alloc_bhp in hand. */
		DB_ASSERT(env, frozen_bhp == NULL);
		goto retry;
	case SECOND_FOUND:
		/*
		 * The page showed up in the cache while we were allocating.
		 * Keep the new buffer only if it is needed for a copy or a
		 * thaw; otherwise give it back.
		 */
		if (extending && F_ISSET(bhp, BH_FREED))
			makecopy = 1;
		if (makecopy || frozen_bhp != NULL)
			break;

		/* The region lock is taken with the bucket lock released. */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		MPOOL_REGION_LOCK(env, infop);
		__memp_free(infop, mfp, alloc_bhp);
		c_mp->stat.st_pages--;
		MPOOL_REGION_UNLOCK(env, infop);
		alloc_bhp = NULL;

		/*
		 * A DB_MPOOL_NEW request must not return an existing page:
		 * drop it and pick another page number.
		 */
		if (flags == DB_MPOOL_NEW) {
			--bhp->ref;
			b_incr = b_locked = 0;
			bhp = NULL;
			goto alloc;
		}

		/* Use the buffer that was found. */
		MUTEX_LOCK(env, hp->mtx_hash);
		break;
	case SECOND_MISS:
		/*
		 * Still not in the cache: initialize the allocated buffer
		 * header and insert it into the bucket.
		 */
		bhp = alloc_bhp;
		alloc_bhp = NULL;

#ifdef DIAG_MVCC
		memset(bhp, 0, SSZ(BH, align_off));
#else
		memset(bhp, 0, sizeof(BH));
#endif
		bhp->ref = 1;
		b_incr = 1;
		bhp->priority = UINT32_MAX;
		bhp->pgno = *pgnoaddr;
		bhp->mf_offset = mf_offset;
		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
		SH_CHAIN_INIT(bhp, vc);

		/* A newly created page starts out dirty. */
		if (extending) {
			++hp->hash_page_dirty;
			F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		}

		if (extending) {
			/*
			 * New page: clear it.  With no clear_len configured
			 * the whole page is zeroed; otherwise only the first
			 * clear_len bytes are (and in DIAGNOSTIC/UMRW builds
			 * the remainder is filled with CLEAR_BYTE).
			 */
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ | PROT_WRITE);
			if (mfp->clear_len == DB_CLEARLEN_NOTSET)
				memset(bhp->buf, 0, mfp->stat.st_pagesize);
			else {
				memset(bhp->buf, 0, mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
				memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
				    mfp->stat.st_pagesize - mfp->clear_len);
#endif
			}

			/* Created pages of a typed file need pgin run. */
			if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
				F_SET(bhp, BH_CALLPGIN);

			STAT(++mfp->stat.st_page_create);
		} else {
			/* Existing page: contents must be read in below. */
			F_SET(bhp, BH_TRASH);
			STAT(++mfp->stat.st_cache_miss);
		}

		/* Count the new buffer against the file. */
		MUTEX_LOCK(env, mfp->mutex);
		++mfp->block_cnt;
		MUTEX_UNLOCK(env, mfp->mutex);
	}

	DB_ASSERT(env, bhp != NULL);
	DB_ASSERT(env, bhp->ref != 0);

	/* Thaw a frozen buffer into the buffer we allocated for it. */
	if (frozen_bhp != NULL) {
		DB_ASSERT(env, alloc_bhp != NULL);

		/* Someone else thawed it in the meantime: start over. */
		if (F_ISSET(frozen_bhp, BH_THAWED))
			goto thawed;
		else {
			if ((ret = __memp_bh_thaw(dbmp, infop, hp,
			    frozen_bhp, alloc_bhp)) != 0)
				goto err;
			bhp = alloc_bhp;
		}

		frozen_bhp = alloc_bhp = NULL;

		/*
		 * The thaw consumed the allocated buffer; if a copy is also
		 * required, go and allocate another one.
		 */
		if (makecopy) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			b_locked = 0;
			goto alloc;
		}
	}

	/* Read the page in if the buffer contents are not valid. */
	if (F_ISSET(bhp, BH_TRASH) &&
	    (ret = __memp_pgread(dbmfp,
	    hp, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
		goto err;

	/* Run the page-in conversion if it is still pending. */
	if (F_ISSET(bhp, BH_CALLPGIN)) {
		MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
		    PROT_READ | PROT_WRITE);
		if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
			goto err;
		F_CLR(bhp, BH_CALLPGIN);
	}

	if (makecopy && state != SECOND_MISS) {
		/*
		 * Create the new version in alloc_bhp: it takes over the
		 * dirty state and the hash-bucket position of the old
		 * version and is linked after it in the version chain.
		 */
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		DB_ASSERT(env, bhp != NULL);
		DB_ASSERT(env, alloc_bhp != NULL);
		DB_ASSERT(env, alloc_bhp != bhp);

		if (bhp->ref == 1)
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ);

		alloc_bhp->ref = 1;
		alloc_bhp->ref_sync = 0;
		alloc_bhp->flags = F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		alloc_bhp->priority = bhp->priority;
		alloc_bhp->pgno = bhp->pgno;
		alloc_bhp->mf_offset = bhp->mf_offset;
		alloc_bhp->td_off = INVALID_ROFF;
		if (txn != NULL &&
		    (ret = __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0)
			goto err;
		if (extending) {
			/* Re-created page: start from zeroes, not a copy. */
			memset(alloc_bhp->buf, 0, mfp->stat.st_pagesize);
			F_SET(alloc_bhp, BH_DIRTY_CREATE);
		} else
			memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize);

		SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh);
		SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
		    bhp, alloc_bhp, hq, __bh);
		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);

		/* Release our reference on the old version. */
		if (--bhp->ref == 0) {
			bhp->priority = c_mp->lru_count;
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);
		}

		bhp = alloc_bhp;

		/*
		 * A recycled oldest version was already counted in
		 * block_cnt; only count freshly allocated buffers.
		 */
		if (alloc_bhp != oldest_bhp) {
			MUTEX_LOCK(env, mfp->mutex);
			++mfp->block_cnt;
			MUTEX_UNLOCK(env, mfp->mutex);
		}

		alloc_bhp = NULL;
	} else if (mvcc && extending && txn != NULL &&
	    (ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0)
		goto err;

	/* Mark the buffer dirty if the caller intends to modify it. */
	if ((dirty || edit || extending) && !F_ISSET(bhp, BH_DIRTY)) {
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		++hp->hash_page_dirty;
		F_SET(bhp, BH_DIRTY);
	}

	/*
	 * If we are the only holder of a buffer that was already in the
	 * cache, give it top priority and, when it is the sole version,
	 * move it to the tail of the bucket.
	 */
	if (state != SECOND_MISS && bhp->ref == 1) {
		bhp->priority = UINT32_MAX;
		if (SH_CHAIN_SINGLETON(bhp, vc)) {
			if (bhp != SH_TAILQ_LAST(&hp->hash_bucket, hq, __bh)) {
				SH_TAILQ_REMOVE(&hp->hash_bucket,
				    bhp, hq, __bh);
				SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
			}
		}
	}

	/* Readable always; writable only if the page is or may become dirty. */
	MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ |
	    (dirty || edit || extending || F_ISSET(bhp, BH_DIRTY) ?
	    PROT_WRITE : 0));

#ifdef DIAGNOSTIC
	{
	/*
	 * Sanity checks: a dirty multiversion buffer must be the newest
	 * version, and a snapshot reader must have been given the right
	 * version.
	 */
	BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh);

	DB_ASSERT(env, !mfp->multiversion ||
	    !F_ISSET(bhp, BH_DIRTY) || next_bhp == NULL);

	DB_ASSERT(env, !mvcc || edit || read_lsnp == NULL ||
	    bhp->td_off == INVALID_ROFF || BH_OWNED_BY(env, bhp, txn) ||
	    (BH_VISIBLE(env, bhp, read_lsnp, vlsn) &&
	    (next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) ||
	    (next_bhp->td_off != INVALID_ROFF &&
	    (BH_OWNER(env, next_bhp)->status != TXN_COMMITTED ||
	    !BH_VISIBLE(env, next_bhp, read_lsnp, vlsn))))));
	}
#endif

	MUTEX_UNLOCK(env, hp->mtx_hash);
	/*
	 * Record that the bucket lock is gone.  The pin-list allocation
	 * below can still fail and jump to err, which must then re-acquire
	 * the lock before dropping our buffer reference instead of unlocking
	 * a mutex we no longer hold.
	 */
	b_locked = 0;

	/* Record the pinned buffer in the calling thread's pin list. */
	if (ip != NULL) {
		reginfo = env->reginfo;
		if (ip->dbth_pincount == ip->dbth_pinmax) {
			/*
			 * The list is full: allocate one twice the size,
			 * copy the old entries, zero the new half and free
			 * the old list unless it is the built-in array.
			 */
			pinmax = ip->dbth_pinmax;
			if ((ret = __env_alloc(reginfo,
			    2 * pinmax * sizeof(PIN_LIST), &list)) != 0)
				goto err;

			memcpy(list, R_ADDR(reginfo, ip->dbth_pinlist),
			    pinmax * sizeof(PIN_LIST));
			memset(&list[pinmax], 0, pinmax * sizeof(PIN_LIST));
			list_off = R_OFFSET(reginfo, list);
			list = R_ADDR(reginfo, ip->dbth_pinlist);
			ip->dbth_pinmax = 2 * pinmax;
			ip->dbth_pinlist = list_off;
			if (list != ip->dbth_pinarray)
				__env_alloc_free(reginfo, list);
		}

		/* Use the first free slot (b_ref == INVALID_ROFF). */
		list = R_ADDR(reginfo, ip->dbth_pinlist);
		for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
			if (lp->b_ref == INVALID_ROFF)
				break;

		ip->dbth_pincount++;
		lp->b_ref = R_OFFSET(infop, bhp);
		lp->region = (int)(infop - dbmp->reginfo);
	}

#ifdef DIAGNOSTIC
	/* Track the number of pages pinned through this handle. */
	MPOOL_SYSTEM_LOCK(env);
	++dbmfp->pinref;
	MPOOL_SYSTEM_UNLOCK(env);

	/* Optionally yield to shake out races. */
	if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU))
		__os_yield(env, 0, 0);
#endif

	DB_ASSERT(env, alloc_bhp == NULL);

	*(void **)addrp = bhp->buf;
	return (0);

err:
	/*
	 * Undo whatever state we hold: drop buffer references under the
	 * bucket lock, release the bucket lock, and free any buffer that was
	 * allocated but never inserted.
	 */
	if (b_incr || frozen_bhp != NULL) {
		if (!b_locked) {
			MUTEX_LOCK(env, hp->mtx_hash);
			b_locked = 1;
		}
		if (frozen_bhp != NULL)
			--frozen_bhp->ref;
		/* Don't decrement twice when bhp is the frozen buffer. */
		if (b_incr && bhp != frozen_bhp)
			--bhp->ref;
	}
	if (b_locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	if (alloc_bhp != NULL) {
		MPOOL_REGION_LOCK(env, infop);
		__memp_free(infop, mfp, alloc_bhp);
		c_mp->stat.st_pages--;
		MPOOL_REGION_UNLOCK(env, infop);
	}

	return (ret);
}