#include "db_config.h"
#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
/* Ordering callback handed to qsort() when sorting freezer page numbers. */
static int __pgno_cmp __P((const void *, const void *));
/*
 * __memp_bh_priority --
 *	Return the smallest priority value held by any buffer header on the
 *	version chain that bhp belongs to.
 */
u_int32_t
__memp_bh_priority(bhp)
	BH *bhp;
{
	BH *cur;
	u_int32_t lowest;

	/* Advance to the chain member that has no successor. */
	for (cur = bhp; SH_CHAIN_HASNEXT(cur, vc);)
		cur = SH_CHAIN_NEXT(cur, vc, __bh);

	/* Seed with that member, then visit every predecessor in turn. */
	lowest = cur->priority;
	for (cur = SH_CHAIN_PREV(cur, vc, __bh);
	    cur != NULL; cur = SH_CHAIN_PREV(cur, vc, __bh))
		if (cur->priority < lowest)
			lowest = cur->priority;

	return (lowest);
}
/*
 * __memp_bh_settxn --
 *	Record the transaction detail structure that owns a buffer header.
 *
 *	vtd is an untyped pointer to a TXN_DETAIL.  Returns EINVAL (after
 *	reporting an error) when it is NULL, 0 when the buffer already has an
 *	owner recorded, and otherwise the result of __txn_add_buffer().
 */
int
__memp_bh_settxn(dbmp, mfp, bhp, vtd)
	DB_MPOOL *dbmp;
	MPOOLFILE *mfp;
	BH *bhp;
	void *vtd;
{
	ENV *env;
	TXN_DETAIL *owner;

	env = dbmp->env;

	/* A multiversion file may only be updated inside a transaction. */
	if ((owner = (TXN_DETAIL *)vtd) == NULL) {
		__db_errx(env,
		    "%s: non-transactional update to a multiversion file",
		    __memp_fns(dbmp, mfp));
		return (EINVAL);
	}

	/* No owner yet: store its region offset and register the buffer. */
	if (bhp->td_off == INVALID_ROFF) {
		bhp->td_off = R_OFFSET(&env->tx_handle->reginfo, owner);
		return (__txn_add_buffer(env, owner));
	}

	/* Already owned: it must be by this same transaction. */
	DB_ASSERT(env, BH_OWNER(env, bhp) == owner);
	return (0);
}
int
__memp_skip_curadj(dbc, pgno)
DBC * dbc;
db_pgno_t pgno;
{
BH *bhp;
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
DB_MPOOL_HASH *hp;
DB_TXN *txn;
ENV *env;
MPOOLFILE *mfp;
REGINFO *infop;
roff_t mf_offset;
int ret, skip;
env = dbc->env;
dbmp = env->mp_handle;
dbmfp = dbc->dbp->mpf;
mfp = dbmfp->mfp;
mf_offset = R_OFFSET(dbmp->reginfo, mfp);
skip = 0;
for (txn = dbc->txn; txn->parent != NULL; txn = txn->parent)
;
MP_GET_BUCKET(env, mfp, pgno, &infop, hp, ret);
if (ret != 0) {
(void)__env_panic(env, ret);
return (0);
}
SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
if (bhp->pgno != pgno || bhp->mf_offset != mf_offset)
continue;
if (!BH_OWNED_BY(env, bhp, txn))
skip = 1;
break;
}
MUTEX_UNLOCK(env, hp->mtx_hash);
return (skip);
}
/*
 * Magic number written as the first u_int32_t of a newly created freezer
 * file and verified each time a freezer file is read.
 */
#define DB_FREEZER_MAGIC 0x06102002
/*
 * __memp_bh_freeze --
 *	Write a buffer's page image to a freezer file and put a frozen buffer
 *	header in its place on the version chain.
 *
 *	dbmp		memory pool handle
 *	infop		cache region holding the buffer
 *	hp		hash bucket holding the buffer
 *	bhp		buffer to freeze; asserted to be unreferenced and not
 *			dirty, frozen or locked
 *	need_frozenp	set non-zero when the cache's list of free frozen
 *			headers was empty, or became empty after one was taken
 *
 *	hp->mtx_hash is released and re-acquired inside, so the caller must
 *	hold it on entry; it is held again on return.
 *
 *	Freezer file layout as used here: a header of magic (u_int32_t), first
 *	free page number (db_pgno_t) and highest page number (db_pgno_t),
 *	followed by pages addressed by page number.  A free page stores the
 *	number of the next free page in its first db_pgno_t; 0 ends the list.
 *
 *	Returns 0 on success, ENOMEM when no frozen header could be obtained,
 *	EINVAL on a bad magic number, EIO when a transfer moved no bytes, or
 *	the error from the failing call.
 */
int
__memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
	DB_MPOOL *dbmp;
	REGINFO *infop;
	DB_MPOOL_HASH *hp;
	BH *bhp;
	int *need_frozenp;
{
	BH *frozen_bhp;
	BH_FROZEN_ALLOC *frozen_alloc;
	DB_FH *fhp;
	ENV *env;
	MPOOL *c_mp;
	MPOOLFILE *bh_mfp;
	db_pgno_t maxpgno, newpgno, nextfree;
	size_t nio;
	int ret, t_ret;
	u_int32_t magic, nbucket, ncache, pagesize;
	char filename[100], *real_name;

	env = dbmp->env;
	c_mp = infop->primary;
	ret = 0;
	/* Locate the MPOOLFILE the buffer belongs to. */
	bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
	pagesize = bh_mfp->stat.st_pagesize;
	real_name = NULL;
	fhp = NULL;

	DB_ASSERT(env, bhp->ref == 0);
	DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY | BH_FROZEN | BH_LOCKED));

	/* Pin and lock the buffer for the duration of the freeze. */
	++bhp->ref;
	F_SET(bhp, BH_LOCKED);
	MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE);

	/*
	 * Swap the hash bucket mutex for the region lock while taking a
	 * frozen header from the cache's free list, or allocating one.
	 */
	MUTEX_UNLOCK(env, hp->mtx_hash);

	MPOOL_REGION_LOCK(env, infop);
	frozen_bhp = SH_TAILQ_FIRST(&c_mp->free_frozen, __bh);
	if (frozen_bhp != NULL) {
		SH_TAILQ_REMOVE(&c_mp->free_frozen, frozen_bhp, hq, __bh);
		*need_frozenp = SH_TAILQ_EMPTY(&c_mp->free_frozen);
	} else {
		*need_frozenp = 1;

		/* The header sits immediately after its BH_FROZEN_ALLOC. */
		if (__env_alloc(infop,
		    sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE),
		    &frozen_alloc) == 0) {
			frozen_bhp = (BH *)(frozen_alloc + 1);
			SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
			    frozen_alloc, links);
		}
	}
	MPOOL_REGION_UNLOCK(env, infop);
	MUTEX_LOCK(env, hp->mtx_hash);

	/* Without a frozen header there is nothing more to do. */
	if (frozen_bhp == NULL) {
		ret = ENOMEM;
		goto err;
	}

	/*
	 * The freezer file name is built from the cache number, the hash
	 * bucket number and the page size in kilobytes.
	 */
	ncache = (u_int32_t)(infop - dbmp->reginfo);
	nbucket = (u_int32_t)(hp - (DB_MPOOL_HASH *)R_ADDR(infop, c_mp->htab));
	snprintf(filename, sizeof(filename), "__db.freezer.%lu.%lu.%luK",
	    (u_long)ncache, (u_long)nbucket, (u_long)pagesize / 1024);

	if ((ret = __db_appname(env, DB_APP_NONE, filename,
	    0, NULL, &real_name)) != 0)
		goto err;

	/*
	 * Try to create the file exclusively.  On success write an empty
	 * header and rewind; if the file already exists, open it instead.
	 *
	 * Results are tested against 0 rather than for a negative value:
	 * the error codes handled in this function (ENOMEM, EINVAL, EEXIST,
	 * EIO) are positive, so a "< 0" test would let a failed read or
	 * write go unnoticed.
	 */
	if ((ret = __os_open(env, real_name, pagesize,
	    DB_OSO_CREATE | DB_OSO_EXCL, env->db_mode, &fhp)) == 0) {
		magic = DB_FREEZER_MAGIC;
		maxpgno = newpgno = 0;
		if ((ret = __os_write(env, fhp, &magic, sizeof(u_int32_t),
		    &nio)) != 0 || nio == 0 ||
		    (ret = __os_write(env, fhp, &newpgno, sizeof(db_pgno_t),
		    &nio)) != 0 || nio == 0 ||
		    (ret = __os_write(env, fhp, &maxpgno, sizeof(db_pgno_t),
		    &nio)) != 0 || nio == 0 ||
		    (ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
			goto err;
	} else if (ret == EEXIST)
		ret = __os_open(
		    env, real_name, pagesize, 0, env->db_mode, &fhp);
	if (ret != 0)
		goto err;

	/* Read and validate the header. */
	if ((ret = __os_read(env, fhp, &magic, sizeof(u_int32_t),
	    &nio)) != 0 || nio == 0 ||
	    (ret = __os_read(env, fhp, &newpgno, sizeof(db_pgno_t),
	    &nio)) != 0 || nio == 0 ||
	    (ret = __os_read(env, fhp, &maxpgno, sizeof(db_pgno_t),
	    &nio)) != 0 || nio == 0)
		goto err;
	if (magic != DB_FREEZER_MAGIC) {
		ret = EINVAL;
		goto err;
	}

	if (newpgno == 0) {
		/* Empty free list: extend the file by one page. */
		newpgno = ++maxpgno;
		if ((ret = __os_seek(env,
		    fhp, 0, 0, sizeof(u_int32_t) + sizeof(db_pgno_t))) != 0 ||
		    (ret = __os_write(env, fhp, &maxpgno, sizeof(db_pgno_t),
		    &nio)) != 0 || nio == 0)
			goto err;
	} else {
		/*
		 * Reuse the first free page: fetch its successor and make
		 * that the new head of the free list in the header.
		 */
		if ((ret = __os_seek(env, fhp, newpgno, pagesize, 0)) != 0 ||
		    (ret = __os_read(env, fhp, &nextfree, sizeof(db_pgno_t),
		    &nio)) != 0 || nio == 0)
			goto err;
		if ((ret =
		    __os_seek(env, fhp, 0, 0, sizeof(u_int32_t))) != 0 ||
		    (ret = __os_write(env, fhp, &nextfree, sizeof(db_pgno_t),
		    &nio)) != 0 || nio == 0)
			goto err;
	}

	/* Write the page image into the chosen freezer page. */
	if ((ret = __os_io(env, DB_IO_WRITE, fhp, newpgno, pagesize, 0,
	    pagesize, bhp->buf, &nio)) != 0 || nio == 0)
		goto err;

	/*
	 * Build the frozen header from the original one: copy the leading
	 * fields, reset the reference counts, flag it frozen and unlocked,
	 * and record where the page went in the freezer file.
	 */
#ifdef DIAG_MVCC
	memcpy(frozen_bhp, bhp, SSZ(BH, align_off));
#else
	memcpy(frozen_bhp, bhp, SSZA(BH, buf));
#endif
	frozen_bhp->ref = frozen_bhp->ref_sync = 0;
	F_SET(frozen_bhp, BH_FROZEN);
	F_CLR(frozen_bhp, BH_LOCKED);
	frozen_bhp->priority = UINT32_MAX;
	((BH_FROZEN_PAGE *)frozen_bhp)->spgno = newpgno;

	/* The copy carries an owner, so register it with that transaction. */
	if (frozen_bhp->td_off != INVALID_ROFF &&
	    (ret = __txn_add_buffer(env, BH_OWNER(env, frozen_bhp))) != 0) {
		(void)__env_panic(env, ret);
		goto err;
	}

	/*
	 * Chain the frozen header after the original.  If it then has no
	 * successor on the version chain, it also takes the original's
	 * place in the hash bucket.
	 */
	SH_CHAIN_INSERT_AFTER(bhp, frozen_bhp, vc, __bh);
	if (!SH_CHAIN_HASNEXT(frozen_bhp, vc)) {
		SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
		    bhp, frozen_bhp, hq, __bh);
		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
	}

	/* Account for the additional header in the file's block count. */
	MUTEX_LOCK(env, bh_mfp->mutex);
	++bh_mfp->block_cnt;
	MUTEX_UNLOCK(env, bh_mfp->mutex);

	STAT(++hp->hash_frozen);

	if (0) {
		/* A transfer that moved no bytes leaves ret at 0: use EIO. */
err:		if (ret == 0)
			ret = EIO;
		/* Hand the unused frozen header back to the free list. */
		if (frozen_bhp != NULL) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			MPOOL_REGION_LOCK(env, infop);
			SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
			    frozen_bhp, hq);
			MPOOL_REGION_UNLOCK(env, infop);
			MUTEX_LOCK(env, hp->mtx_hash);
		}
	}
	if (real_name != NULL)
		__os_free(env, real_name);
	if (fhp != NULL &&
	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
		ret = t_ret;
	/* ENOMEM is returned to the caller without being logged. */
	if (ret != 0 && ret != ENOMEM)
		__db_err(env, ret, "__memp_bh_freeze");

	/* Undo the pin and lock taken at the top. */
	F_CLR(bhp, BH_LOCKED);
	--bhp->ref;

	/*
	 * If the bucket is flagged IO_WAITER, clear the flag and release the
	 * bucket's I/O mutex (presumably waking a thread blocked on it).
	 */
	if (F_ISSET(hp, IO_WAITER)) {
		F_CLR(hp, IO_WAITER);
		MUTEX_UNLOCK(env, hp->mtx_io);
	}
	return (ret);
}
/*
 * __pgno_cmp --
 *	qsort() comparison routine that orders page numbers ascending.
 *
 *	a and b each point at a db_pgno_t.  Returns a negative value, zero or
 *	a positive value when *a is less than, equal to or greater than *b.
 */
static int
__pgno_cmp(a, b)
	const void *a, *b;
{
	db_pgno_t lhs, rhs;

	lhs = *(const db_pgno_t *)a;
	rhs = *(const db_pgno_t *)b;

	/*
	 * Compare instead of subtracting: the difference of two page numbers
	 * narrowed to int does not carry the right sign once the gap is too
	 * large for an int, which would hand qsort() an inconsistent order.
	 */
	if (lhs < rhs)
		return (-1);
	return (lhs > rhs ? 1 : 0);
}
int
__memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
DB_MPOOL *dbmp;
REGINFO *infop;
DB_MPOOL_HASH *hp;
BH *frozen_bhp, *alloc_bhp;
{
BH *next_bhp;
DB_FH *fhp;
ENV *env;
#ifdef DIAGNOSTIC
DB_LSN vlsn;
#endif
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
db_pgno_t *freelist, *ppgno, freepgno, maxpgno, spgno;
size_t nio;
u_int32_t listsize, magic, nbucket, ncache, ntrunc, nfree, pagesize;
#ifdef HAVE_FTRUNCATE
int i;
#endif
int needfree, ret, t_ret;
char filename[100], *real_name;
env = dbmp->env;
fhp = NULL;
c_mp = infop->primary;
bh_mfp = R_ADDR(dbmp->reginfo, frozen_bhp->mf_offset);
freelist = NULL;
pagesize = bh_mfp->stat.st_pagesize;
ret = 0;
real_name = NULL;
DB_ASSERT(env, F_ISSET(frozen_bhp, BH_FROZEN));
DB_ASSERT(env, !F_ISSET(frozen_bhp, BH_LOCKED));
DB_ASSERT(env, alloc_bhp != NULL ||
BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn));
spgno = ((BH_FROZEN_PAGE *)frozen_bhp)->spgno;
if (alloc_bhp != NULL) {
#ifdef DIAG_MVCC
memcpy(alloc_bhp, frozen_bhp, SSZ(BH, align_off));
#else
memcpy(alloc_bhp, frozen_bhp, SSZA(BH, buf));
#endif
alloc_bhp->ref = 1;
alloc_bhp->ref_sync = 0;
F_CLR(alloc_bhp, BH_FROZEN);
}
F_SET(frozen_bhp, BH_LOCKED);
ncache = (u_int32_t)(infop - dbmp->reginfo);
nbucket = (u_int32_t)(hp - (DB_MPOOL_HASH *)R_ADDR(infop, c_mp->htab));
snprintf(filename, sizeof(filename), "__db.freezer.%lu.%lu.%luK",
(u_long)ncache, (u_long)nbucket, (u_long)pagesize / 1024);
if ((ret = __db_appname(
env, DB_APP_NONE, filename, 0, NULL, &real_name)) != 0)
goto err;
if ((ret = __os_open(
env, real_name, pagesize, 0, env->db_mode, &fhp)) != 0)
goto err;
if ((ret = __os_read(env, fhp, &magic, sizeof(u_int32_t),
&nio)) < 0 || nio == 0 ||
(ret = __os_read(env, fhp, &freepgno, sizeof(db_pgno_t),
&nio)) < 0 || nio == 0 ||
(ret = __os_read(env, fhp, &maxpgno, sizeof(db_pgno_t),
&nio)) < 0 || nio == 0)
goto err;
if (magic != DB_FREEZER_MAGIC) {
ret = EINVAL;
goto err;
}
if (alloc_bhp != NULL &&
((ret = __os_io(env, DB_IO_READ, fhp, spgno, pagesize,
0, pagesize, alloc_bhp->buf, &nio)) != 0 || nio == 0))
goto err;
needfree = 1;
if (spgno == maxpgno) {
listsize = 100;
if ((ret = __os_malloc(env,
listsize * sizeof(db_pgno_t), &freelist)) != 0)
goto err;
nfree = 0;
while (freepgno != 0) {
if (nfree == listsize - 1) {
listsize *= 2;
if ((ret = __os_realloc(env,
listsize * sizeof(db_pgno_t),
&freelist)) != 0)
goto err;
}
freelist[nfree++] = freepgno;
if ((ret = __os_seek(
env, fhp, freepgno, pagesize, 0)) != 0 ||
(ret = __os_read(env, fhp, &freepgno,
sizeof(db_pgno_t), &nio)) < 0 || nio == 0)
goto err;
}
freelist[nfree++] = spgno;
qsort(freelist, nfree, sizeof(db_pgno_t), __pgno_cmp);
for (ppgno = &freelist[nfree - 1]; ppgno > freelist; ppgno--)
if (*(ppgno - 1) != *ppgno - 1)
break;
ntrunc = (u_int32_t)(&freelist[nfree] - ppgno);
if (ntrunc == (u_int32_t)maxpgno) {
needfree = 0;
ret = __os_closehandle(env, fhp);
fhp = NULL;
if (ret != 0 ||
(ret = __os_unlink(env, real_name, 0)) != 0)
goto err;
}
#ifdef HAVE_FTRUNCATE
else {
maxpgno -= (db_pgno_t)ntrunc;
if ((ret = __os_truncate(env, fhp,
maxpgno + 1, pagesize)) != 0)
goto err;
freelist[nfree - ntrunc] = 0;
if ((ret = __os_seek(
env, fhp, 0, 0, sizeof(u_int32_t))) != 0 ||
(ret = __os_write(env, fhp, &freelist[0],
sizeof(db_pgno_t), &nio)) < 0 || nio == 0 ||
(ret = __os_write(env, fhp, &maxpgno,
sizeof(db_pgno_t), &nio)) < 0 || nio == 0)
goto err;
for (i = 0; i < (int)(nfree - ntrunc); i++)
if ((ret = __os_seek(env,
fhp, freelist[i], pagesize, 0)) != 0 ||
(ret = __os_write(env, fhp,
&freelist[i + 1], sizeof(db_pgno_t),
&nio)) < 0 || nio == 0)
goto err;
needfree = 0;
}
#endif
}
if (needfree &&
((ret = __os_seek(env, fhp, spgno, pagesize, 0)) != 0 ||
(ret = __os_write(env, fhp, &freepgno, sizeof(db_pgno_t),
&nio)) < 0 || nio == 0 ||
(ret = __os_seek(env, fhp, 0, 0, sizeof(u_int32_t))) != 0 ||
(ret = __os_write(env, fhp, &spgno, sizeof(db_pgno_t),
&nio)) < 0 || nio == 0))
goto err;
if (alloc_bhp != NULL) {
alloc_bhp->priority = c_mp->lru_count;
SH_CHAIN_INSERT_AFTER(frozen_bhp, alloc_bhp, vc, __bh);
if (!SH_CHAIN_HASNEXT(alloc_bhp, vc)) {
SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
frozen_bhp, alloc_bhp, hq, __bh);
SH_TAILQ_REMOVE(&hp->hash_bucket, frozen_bhp, hq, __bh);
}
}
if ((next_bhp = SH_CHAIN_NEXT(frozen_bhp, vc, __bh)) == NULL) {
if ((next_bhp = SH_CHAIN_PREV(frozen_bhp, vc, __bh)) != NULL)
SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, frozen_bhp,
next_bhp, hq, __bh);
SH_TAILQ_REMOVE(&hp->hash_bucket, frozen_bhp, hq, __bh);
}
SH_CHAIN_REMOVE(frozen_bhp, vc, __bh);
if (--frozen_bhp->ref == 0) {
MUTEX_UNLOCK(env, hp->mtx_hash);
if (alloc_bhp == NULL && frozen_bhp->td_off != INVALID_ROFF &&
(ret = __txn_remove_buffer(env,
BH_OWNER(env, frozen_bhp), MUTEX_INVALID)) != 0) {
(void)__env_panic(env, ret);
goto err;
}
MPOOL_REGION_LOCK(env, infop);
SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen, frozen_bhp, hq);
MPOOL_REGION_UNLOCK(env, infop);
MUTEX_LOCK(env, hp->mtx_hash);
} else {
F_SET(frozen_bhp, BH_THAWED);
F_CLR(frozen_bhp, BH_LOCKED);
}
#ifdef HAVE_STATISTICS
if (alloc_bhp != NULL)
++hp->hash_thawed;
else
++hp->hash_frozen_freed;
#endif
if (0) {
err: if (ret == 0)
ret = EIO;
}
if (real_name != NULL)
__os_free(env, real_name);
if (freelist != NULL)
__os_free(env, freelist);
if (fhp != NULL &&
(t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
ret = t_ret;
if (ret != 0)
__db_err(env, ret, "__memp_bh_thaw");
if (F_ISSET(hp, IO_WAITER)) {
F_CLR(hp, IO_WAITER);
MUTEX_UNLOCK(env, hp->mtx_io);
}
return (ret);
}