/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2003
 *	Sleepycat Software.  All rights reserved.
 */
#include "db_config.h"

#ifndef lint
static const char revid[] = "$Id: lock.c,v 1.2 2004/03/30 01:23:43 jtownsen Exp $";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <stdlib.h>
#include <string.h>
#endif

#include "db_int.h"
#include "dbinc/db_shash.h"
#include "dbinc/lock.h"
#include "dbinc/log.h"

static int  __lock_freelock __P((DB_LOCKTAB *,
		struct __db_lock *, u_int32_t, u_int32_t));
static void __lock_expires __P((DB_ENV *, db_timeval_t *, db_timeout_t));
static void __lock_freelocker
		__P((DB_LOCKTAB *, DB_LOCKREGION *, DB_LOCKER *, u_int32_t));
static int  __lock_get_internal __P((DB_LOCKTAB *, u_int32_t, u_int32_t,
		const DBT *, db_lockmode_t, db_timeout_t, DB_LOCK *));
static int  __lock_getobj
		__P((DB_LOCKTAB *, const DBT *, u_int32_t, int, DB_LOCKOBJ **));
static int  __lock_inherit_locks __P((DB_LOCKTAB *, u_int32_t, u_int32_t));
static int  __lock_is_parent __P((DB_LOCKTAB *, u_int32_t, DB_LOCKER *));
static int  __lock_put_internal __P((DB_LOCKTAB *,
		struct __db_lock *, u_int32_t, u_int32_t));
static int  __lock_put_nolock __P((DB_ENV *, DB_LOCK *, int *, u_int32_t));
static void __lock_remove_waiter __P((DB_LOCKTAB *,
		DB_LOCKOBJ *, struct __db_lock *, db_status_t));
static int  __lock_set_timeout_internal
		__P((DB_ENV *, u_int32_t, db_timeout_t, u_int32_t));
static int  __lock_trade __P((DB_ENV *, DB_LOCK *, u_int32_t));
static int  __lock_sort_cmp __P((const void *, const void *));
static int  __lock_fix_list __P((DB_ENV *, DBT *, u_int32_t));

static const char __db_lock_err[] = "Lock table is out of available %s";
static const char __db_lock_invalid[] = "%s: Lock is no longer valid";
static const char __db_locker_invalid[] = "Locker is not valid";

/*
 * __lock_id_pp --
 *	DB_ENV->lock_id pre/post processing.
 *
 * PUBLIC: int __lock_id_pp __P((DB_ENV *, u_int32_t *));
 */
int
__lock_id_pp(dbenv, idp)
	DB_ENV *dbenv;
	u_int32_t *idp;
{
	int rep_check, ret;

	PANIC_CHECK(dbenv);
	ENV_REQUIRES_CONFIG(dbenv,
	    dbenv->lk_handle, "DB_ENV->lock_id", DB_INIT_LOCK);

	rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0;
	if (rep_check)
		__env_rep_enter(dbenv);
	ret = __lock_id(dbenv, idp);
	if (rep_check)
		__env_rep_exit(dbenv);
	return (ret);
}

/*
 * __lock_id --
 *	DB_ENV->lock_id.
 *
 * PUBLIC: int __lock_id __P((DB_ENV *, u_int32_t *));
 */
int
__lock_id(dbenv, idp)
	DB_ENV *dbenv;
	u_int32_t *idp;
{
	DB_LOCKER *lk;
	DB_LOCKTAB *lt;
	DB_LOCKREGION *region;
	u_int32_t *ids, locker_ndx;
	int nids, ret;

	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;
	ret = 0;

	/*
	 * Allocate a new lock id.  If we wrap around then we
	 * find the minimum currently in use and make sure we
	 * can stay below that.  This code is similar to code
	 * in __txn_begin_int for recovering txn ids.
	 */
	LOCKREGION(dbenv, lt);
	/*
	 * Our current valid range can span the maximum valid value, so check
	 * for it and wrap manually.
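	 *
	 * Illustrative sketch (hypothetical numbers): if st_id has reached
	 * DB_LOCK_MAXID while only lockers 3, 4 and 90 are still allocated,
	 * the scan below collects {3, 4, 90} and __db_idspace resets
	 * st_id/st_cur_maxid to bracket an unused span of ids (here it
	 * might pick 5..89), so allocation can continue without ever
	 * colliding with a live locker.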
	 */
	if (region->stat.st_id == DB_LOCK_MAXID &&
	    region->stat.st_cur_maxid != DB_LOCK_MAXID)
		region->stat.st_id = DB_LOCK_INVALIDID;
	if (region->stat.st_id == region->stat.st_cur_maxid) {
		if ((ret = __os_malloc(dbenv,
		    sizeof(u_int32_t) * region->stat.st_nlockers, &ids)) != 0)
			goto err;
		nids = 0;
		for (lk = SH_TAILQ_FIRST(&region->lockers, __db_locker);
		    lk != NULL;
		    lk = SH_TAILQ_NEXT(lk, ulinks, __db_locker))
			ids[nids++] = lk->id;
		region->stat.st_id = DB_LOCK_INVALIDID;
		region->stat.st_cur_maxid = DB_LOCK_MAXID;
		if (nids != 0)
			__db_idspace(ids, nids,
			    &region->stat.st_id, &region->stat.st_cur_maxid);
		__os_free(dbenv, ids);
	}
	*idp = ++region->stat.st_id;

	/* Allocate a locker for this id. */
	LOCKER_LOCK(lt, region, *idp, locker_ndx);
	ret = __lock_getlocker(lt, *idp, locker_ndx, 1, &lk);

err:	UNLOCKREGION(dbenv, lt);
	return (ret);
}

/*
 * __lock_id_free_pp --
 *	DB_ENV->lock_id_free pre/post processing.
 *
 * PUBLIC: int __lock_id_free_pp __P((DB_ENV *, u_int32_t));
 */
int
__lock_id_free_pp(dbenv, id)
	DB_ENV *dbenv;
	u_int32_t id;
{
	int rep_check, ret;

	PANIC_CHECK(dbenv);
	ENV_REQUIRES_CONFIG(dbenv,
	    dbenv->lk_handle, "DB_ENV->lock_id_free", DB_INIT_LOCK);

	rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0;
	if (rep_check)
		__env_rep_enter(dbenv);
	ret = __lock_id_free(dbenv, id);
	if (rep_check)
		__env_rep_exit(dbenv);
	return (ret);
}

/*
 * __lock_id_free --
 *	Free a locker id.
 *
 * PUBLIC: int __lock_id_free __P((DB_ENV *, u_int32_t));
 */
int
__lock_id_free(dbenv, id)
	DB_ENV *dbenv;
	u_int32_t id;
{
	DB_LOCKER *sh_locker;
	DB_LOCKTAB *lt;
	DB_LOCKREGION *region;
	u_int32_t locker_ndx;
	int ret;

	PANIC_CHECK(dbenv);
	ENV_REQUIRES_CONFIG(dbenv,
	    dbenv->lk_handle, "DB_ENV->lock_id_free", DB_INIT_LOCK);

	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;

	LOCKREGION(dbenv, lt);
	LOCKER_LOCK(lt, region, id, locker_ndx);
	if ((ret =
	    __lock_getlocker(lt, id, locker_ndx, 0, &sh_locker)) != 0)
		goto err;
	if (sh_locker == NULL) {
		ret = EINVAL;
		goto err;
	}
	if (sh_locker->nlocks != 0) {
		__db_err(dbenv, "Locker still has locks");
		ret = EINVAL;
		goto err;
	}

	__lock_freelocker(lt, region, sh_locker, locker_ndx);

err:	UNLOCKREGION(dbenv, lt);
	return (ret);
}

/*
 * __lock_vec_pp --
 *	DB_ENV->lock_vec pre/post processing.
 *
 * PUBLIC: int __lock_vec_pp __P((DB_ENV *,
 * PUBLIC:     u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
 */
int
__lock_vec_pp(dbenv, locker, flags, list, nlist, elistp)
	DB_ENV *dbenv;
	u_int32_t locker, flags;
	int nlist;
	DB_LOCKREQ *list, **elistp;
{
	int rep_check, ret;

	PANIC_CHECK(dbenv);
	ENV_REQUIRES_CONFIG(dbenv,
	    dbenv->lk_handle, "DB_ENV->lock_vec", DB_INIT_LOCK);

	/* Validate arguments. */
	if ((ret = __db_fchk(dbenv,
	    "DB_ENV->lock_vec", flags, DB_LOCK_NOWAIT)) != 0)
		return (ret);

	rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0;
	if (rep_check)
		__env_rep_enter(dbenv);
	ret = __lock_vec(dbenv, locker, flags, list, nlist, elistp);
	if (rep_check)
		__env_rep_exit(dbenv);
	return (ret);
}

/*
 * __lock_vec --
 *	DB_ENV->lock_vec.
 *
 *	Vector lock routine.  This function takes a set of operations
 *	and performs them all at once.  In addition, lock_vec provides
 *	functionality for lock inheritance, releasing all locks for a
 *	given locker (used during transaction commit/abort), releasing
 *	all locks on a given object, and generating debugging information.
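 *
 *	As a rough usage sketch through the public method (the locker id,
 *	key contents and error handling are hypothetical), an application
 *	could acquire a lock and then release everything in one call:
 *
 *		DB_LOCKREQ req[2], *failed;
 *		DBT key;
 *		memset(req, 0, sizeof(req));
 *		memset(&key, 0, sizeof(key));
 *		key.data = "object-A";
 *		key.size = 8;
 *		req[0].op = DB_LOCK_GET;
 *		req[0].mode = DB_LOCK_READ;
 *		req[0].obj = &key;
 *		req[1].op = DB_LOCK_PUT_ALL;
 *		ret = dbenv->lock_vec(dbenv, locker, 0, req, 2, &failed);
 *
 *	On error, "failed" is pointed at the request that could not be
 *	carried out.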
 *
 * PUBLIC: int __lock_vec __P((DB_ENV *,
 * PUBLIC:     u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
 */
int
__lock_vec(dbenv, locker, flags, list, nlist, elistp)
	DB_ENV *dbenv;
	u_int32_t locker, flags;
	int nlist;
	DB_LOCKREQ *list, **elistp;
{
	struct __db_lock *lp, *next_lock;
	DB_LOCK lock;
	DB_LOCKER *sh_locker;
	DB_LOCKOBJ *sh_obj;
	DB_LOCKREGION *region;
	DB_LOCKTAB *lt;
	DBT *objlist, *np;
	u_int32_t lndx, ndx;
	int did_abort, i, ret, run_dd, upgrade, writes;

	/* Check if locks have been globally turned off. */
	if (F_ISSET(dbenv, DB_ENV_NOLOCKING))
		return (0);

	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;

	run_dd = 0;
	LOCKREGION(dbenv, (DB_LOCKTAB *)dbenv->lk_handle);
	for (i = 0, ret = 0; i < nlist && ret == 0; i++)
		switch (list[i].op) {
		case DB_LOCK_GET_TIMEOUT:
			LF_SET(DB_LOCK_SET_TIMEOUT);
			/* FALLTHROUGH */
		case DB_LOCK_GET:
			if (IS_RECOVERING(dbenv)) {
				LOCK_INIT(list[i].lock);
				break;
			}
			ret = __lock_get_internal(dbenv->lk_handle,
			    locker, flags, list[i].obj,
			    list[i].mode, list[i].timeout, &list[i].lock);
			break;
		case DB_LOCK_INHERIT:
			ret = __lock_inherit_locks(lt, locker, flags);
			break;
		case DB_LOCK_PUT:
			ret = __lock_put_nolock(dbenv,
			    &list[i].lock, &run_dd, flags);
			break;
		case DB_LOCK_PUT_ALL:
		case DB_LOCK_PUT_READ:
		case DB_LOCK_UPGRADE_WRITE:
			/*
			 * Get the locker and mark it as deleted.  This
			 * allows us to traverse the locker links without
			 * worrying that someone else is deleting locks out
			 * from under us.  Since the locker may hold no
			 * locks (i.e., you could call abort before you've
			 * done any work), it's perfectly reasonable for there
			 * to be no locker; this is not an error.
			 */
			LOCKER_LOCK(lt, region, locker, ndx);
			if ((ret = __lock_getlocker(lt,
			    locker, ndx, 0, &sh_locker)) != 0 ||
			    sh_locker == NULL ||
			    F_ISSET(sh_locker, DB_LOCKER_DELETED))
				/*
				 * If ret is set, then we'll generate an
				 * error.  If it's not set, we have nothing
				 * to do.
				 */
				break;
			upgrade = 0;
			writes = 1;
			if (list[i].op == DB_LOCK_PUT_READ)
				writes = 0;
			else if (list[i].op == DB_LOCK_UPGRADE_WRITE) {
				if (F_ISSET(sh_locker, DB_LOCKER_DIRTY))
					upgrade = 1;
				writes = 0;
			}
			objlist = list[i].obj;
			if (objlist != NULL) {
				/*
				 * We know these should be ilocks,
				 * but they could be something else,
				 * so allocate room for the size too.
				 */
				objlist->size =
				    sh_locker->nwrites * sizeof(DBT);
				if ((ret = __os_malloc(dbenv,
				    objlist->size, &objlist->data)) != 0)
					goto up_done;
				memset(objlist->data, 0, objlist->size);
				np = (DBT *)objlist->data;
			} else
				np = NULL;

			F_SET(sh_locker, DB_LOCKER_DELETED);

			/* Now traverse the locks, releasing each one. */
			for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
			    lp != NULL; lp = next_lock) {
				sh_obj = (DB_LOCKOBJ *)
				    ((u_int8_t *)lp + lp->obj);
				next_lock = SH_LIST_NEXT(lp,
				    locker_links, __db_lock);
				if (writes == 1 ||
				    lp->mode == DB_LOCK_READ ||
				    lp->mode == DB_LOCK_DIRTY) {
					SH_LIST_REMOVE(lp,
					    locker_links, __db_lock);
					sh_obj = (DB_LOCKOBJ *)
					    ((u_int8_t *)lp + lp->obj);
					SHOBJECT_LOCK(lt, region, sh_obj, lndx);
					/*
					 * We are not letting lock_put_internal
					 * unlink the lock, so we'll have to
					 * update counts here.
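					 * (We pass DB_LOCK_FREE without
					 * DB_LOCK_UNLINK below, so
					 * __lock_freelock will not adjust
					 * the locker's counts for us.)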
					 */
					sh_locker->nlocks--;
					if (IS_WRITELOCK(lp->mode))
						sh_locker->nwrites--;
					ret = __lock_put_internal(lt, lp, lndx,
					    DB_LOCK_FREE | DB_LOCK_DOALL);
					if (ret != 0)
						break;
					continue;
				}
				if (objlist != NULL) {
					DB_ASSERT((char *)np <
					    (char *)objlist->data +
					    objlist->size);
					np->data = SH_DBT_PTR(&sh_obj->lockobj);
					np->size = sh_obj->lockobj.size;
					np++;
				}
			}
			if (ret != 0)
				goto up_done;

			if (objlist != NULL)
				if ((ret = __lock_fix_list(dbenv,
				    objlist, sh_locker->nwrites)) != 0)
					goto up_done;
			switch (list[i].op) {
			case DB_LOCK_UPGRADE_WRITE:
				if (upgrade != 1)
					goto up_done;
				for (lp = SH_LIST_FIRST(
				    &sh_locker->heldby, __db_lock);
				    lp != NULL;
				    lp = SH_LIST_NEXT(lp,
					locker_links, __db_lock)) {
					if (lp->mode != DB_LOCK_WWRITE)
						continue;
					lock.off = R_OFFSET(&lt->reginfo, lp);
					lock.gen = lp->gen;
					F_SET(sh_locker, DB_LOCKER_INABORT);
					if ((ret = __lock_get_internal(lt,
					    locker, DB_LOCK_UPGRADE,
					    NULL, DB_LOCK_WRITE, 0,
					    &lock)) != 0)
						break;
				}
up_done:			/* FALL THROUGH */
			case DB_LOCK_PUT_READ:
			case DB_LOCK_PUT_ALL:
				F_CLR(sh_locker, DB_LOCKER_DELETED);
				break;
			default:
				break;
			}
			break;
		case DB_LOCK_PUT_OBJ:
			/* Remove all the locks associated with an object. */
			OBJECT_LOCK(lt, region, list[i].obj, ndx);
			if ((ret = __lock_getobj(lt, list[i].obj,
			    ndx, 0, &sh_obj)) != 0 || sh_obj == NULL) {
				if (ret == 0)
					ret = EINVAL;
				break;
			}

			/*
			 * Go through both waiters and holders.  Don't bother
			 * to run promotion, because everyone is getting
			 * released.  The processes waiting will still get
			 * awakened as their waiters are released.
			 */
			for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock);
			    ret == 0 && lp != NULL;
			    lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock))
				ret = __lock_put_internal(lt, lp, ndx,
				    DB_LOCK_UNLINK |
				    DB_LOCK_NOPROMOTE | DB_LOCK_DOALL);

			/*
			 * On the last time around, the object will get
			 * reclaimed by __lock_put_internal, so structure the
			 * loop carefully so we do not get bitten.
			 */
			for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
			    ret == 0 && lp != NULL;
			    lp = next_lock) {
				next_lock = SH_TAILQ_NEXT(lp, links, __db_lock);
				ret = __lock_put_internal(lt, lp, ndx,
				    DB_LOCK_UNLINK |
				    DB_LOCK_NOPROMOTE | DB_LOCK_DOALL);
			}
			break;

		case DB_LOCK_TIMEOUT:
			ret = __lock_set_timeout_internal(dbenv,
			    locker, 0, DB_SET_TXN_NOW);
			break;

		case DB_LOCK_TRADE:
			/*
			 * INTERNAL USE ONLY.
			 * Change the holder of the lock described in
			 * list[i].lock to the locker-id specified by
			 * the locker parameter.
			 */
			/*
			 * You had better know what you're doing here.
			 * We are trading locker-id's on a lock to
			 * facilitate file locking on open DB handles.
			 * We do not do any conflict checking on this,
			 * so heaven help you if you use this flag under
			 * any other circumstances.
			 */
			ret = __lock_trade(dbenv, &list[i].lock, locker);
			break;
#ifdef DEBUG
		case DB_LOCK_DUMP:
			/* Find the locker.
			 */
			LOCKER_LOCK(lt, region, locker, ndx);
			if ((ret = __lock_getlocker(lt,
			    locker, ndx, 0, &sh_locker)) != 0 ||
			    sh_locker == NULL ||
			    F_ISSET(sh_locker, DB_LOCKER_DELETED))
				break;

			for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
			    lp != NULL;
			    lp = SH_LIST_NEXT(lp, locker_links, __db_lock)) {
				__lock_printlock(lt, lp, 1, NULL);
			}
			break;
#endif
		default:
			__db_err(dbenv,
			    "Invalid lock operation: %d", list[i].op);
			ret = EINVAL;
			break;
		}

	if (ret == 0 && region->detect != DB_LOCK_NORUN &&
	    (region->need_dd || LOCK_TIME_ISVALID(&region->next_timeout)))
		run_dd = 1;
	UNLOCKREGION(dbenv, (DB_LOCKTAB *)dbenv->lk_handle);

	if (run_dd)
		(void)__lock_detect(dbenv, region->detect, &did_abort);

	if (ret != 0 && elistp != NULL)
		*elistp = &list[i - 1];

	return (ret);
}

/*
 * __lock_get_pp --
 *	DB_ENV->lock_get pre/post processing.
 *
 * PUBLIC: int __lock_get_pp __P((DB_ENV *,
 * PUBLIC:     u_int32_t, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
 */
int
__lock_get_pp(dbenv, locker, flags, obj, lock_mode, lock)
	DB_ENV *dbenv;
	u_int32_t locker, flags;
	const DBT *obj;
	db_lockmode_t lock_mode;
	DB_LOCK *lock;
{
	int rep_check, ret;

	PANIC_CHECK(dbenv);
	ENV_REQUIRES_CONFIG(dbenv,
	    dbenv->lk_handle, "DB_ENV->lock_get", DB_INIT_LOCK);

	/* Validate arguments. */
	if ((ret = __db_fchk(dbenv, "DB_ENV->lock_get", flags,
	    DB_LOCK_NOWAIT | DB_LOCK_UPGRADE | DB_LOCK_SWITCH)) != 0)
		return (ret);

	rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0;
	if (rep_check)
		__env_rep_enter(dbenv);
	ret = __lock_get(dbenv, locker, flags, obj, lock_mode, lock);
	if (rep_check)
		__env_rep_exit(dbenv);
	return (ret);
}

/*
 * __lock_get --
 *	DB_ENV->lock_get.
 *
 * PUBLIC: int __lock_get __P((DB_ENV *,
 * PUBLIC:     u_int32_t, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
 */
int
__lock_get(dbenv, locker, flags, obj, lock_mode, lock)
	DB_ENV *dbenv;
	u_int32_t locker, flags;
	const DBT *obj;
	db_lockmode_t lock_mode;
	DB_LOCK *lock;
{
	int ret;

	if (IS_RECOVERING(dbenv)) {
		LOCK_INIT(*lock);
		return (0);
	}

	LOCKREGION(dbenv, (DB_LOCKTAB *)dbenv->lk_handle);
	ret = __lock_get_internal(dbenv->lk_handle,
	    locker, flags, obj, lock_mode, 0, lock);
	UNLOCKREGION(dbenv, (DB_LOCKTAB *)dbenv->lk_handle);
	return (ret);
}
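/*
 * As a rough sketch of the public interface (the locker id, DBT contents
 * and error handling are hypothetical), a single-lock caller would do:
 *
 *	DB_LOCK lock;
 *	DBT obj;
 *	memset(&obj, 0, sizeof(obj));
 *	obj.data = "object-A";
 *	obj.size = 8;
 *	if ((ret = dbenv->lock_get(dbenv,
 *	    locker, DB_LOCK_NOWAIT, &obj, DB_LOCK_WRITE, &lock)) == 0)
 *		ret = dbenv->lock_put(dbenv, &lock);
 *
 * DB_LOCK_NOWAIT turns a would-block conflict into an immediate
 * DB_LOCK_NOTGRANTED return rather than queueing the request.
 */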
/*
 * __lock_get_internal --
 *
 * All the work for lock_get (and for the GET option of lock_vec) is done
 * inside of lock_get_internal.
 */
static int
__lock_get_internal(lt, locker, flags, obj, lock_mode, timeout, lock)
	DB_LOCKTAB *lt;
	u_int32_t locker, flags;
	const DBT *obj;
	db_lockmode_t lock_mode;
	db_timeout_t timeout;
	DB_LOCK *lock;
{
	struct __db_lock *newl, *lp, *wwrite;
	DB_ENV *dbenv;
	DB_LOCKER *sh_locker;
	DB_LOCKOBJ *sh_obj;
	DB_LOCKREGION *region;
	u_int32_t holder, locker_ndx, obj_ndx;
	int did_abort, ihold, grant_dirty, no_dd, ret, t_ret;

	/*
	 * We decide what action to take based on what
	 * locks are already held and what locks are
	 * in the wait queue.
	 */
	enum {
		GRANT,		/* Grant the lock. */
		UPGRADE,	/* Upgrade the lock. */
		HEAD,		/* Wait at head of wait queue. */
		SECOND,		/* Wait as the second waiter. */
		TAIL		/* Wait at tail of the wait queue. */
	} action;

	dbenv = lt->dbenv;
	region = lt->reginfo.primary;

	/* Check if locks have been globally turned off. */
	if (F_ISSET(dbenv, DB_ENV_NOLOCKING))
		return (0);

	no_dd = ret = 0;
	newl = NULL;

	/*
	 * If we are not going to reuse this lock, invalidate it
	 * so that if we fail it will not look like a valid lock.
	 */
	if (!LF_ISSET(DB_LOCK_UPGRADE | DB_LOCK_SWITCH))
		LOCK_INIT(*lock);

	/* Check that the lock mode is valid. */
	if ((u_int32_t)lock_mode >= region->stat.st_nmodes) {
		__db_err(dbenv,
		    "DB_ENV->lock_get: invalid lock mode %lu",
		    (u_long)lock_mode);
		return (EINVAL);
	}
	region->stat.st_nrequests++;

	if (obj == NULL) {
		DB_ASSERT(LOCK_ISSET(*lock));
		lp = (struct __db_lock *)R_ADDR(&lt->reginfo, lock->off);
		sh_obj = (DB_LOCKOBJ *)((u_int8_t *)lp + lp->obj);
	} else {
		/* Allocate a shared memory new object. */
		OBJECT_LOCK(lt, region, obj, lock->ndx);
		if ((ret = __lock_getobj(lt, obj, lock->ndx, 1, &sh_obj)) != 0)
			goto err;
	}

	/* Get the locker, we may need it to find our parent. */
	LOCKER_LOCK(lt, region, locker, locker_ndx);
	if ((ret = __lock_getlocker(lt, locker,
	    locker_ndx, locker > DB_LOCK_MAXID ? 1 : 0, &sh_locker)) != 0) {
		/*
		 * XXX We cannot tell if we created the object or not,
		 * so we don't know if we should free it or not.
		 */
		goto err;
	}
	if (sh_locker == NULL) {
		__db_err(dbenv, "Locker does not exist");
		ret = EINVAL;
		goto err;
	}

	/*
	 * Figure out if we can grant this lock or if it should wait.
	 * By default, we can grant the new lock if it does not conflict with
	 * anyone on the holders list OR anyone on the waiters list.
	 * The reason that we don't grant if there's a conflict is that
	 * this can lead to starvation (a writer waiting on a popularly
	 * read item will never be granted).  The downside of this is that
	 * a waiting reader can prevent an upgrade from reader to writer,
	 * which is not uncommon.
	 *
	 * There are two exceptions to the no-conflict rule.  First, if
	 * a lock is held by the requesting locker AND the new lock does
	 * not conflict with any other holders, then we grant the lock.
	 * The most common place this happens is when the holder has a
	 * WRITE lock and a READ lock request comes in for the same locker.
	 * If we do not grant the read lock, then we guarantee deadlock.
	 * Second, dirty readers are granted if at all possible while
	 * avoiding starvation; see below.
	 *
	 * In case of conflict, we put the new lock on the end of the waiters
	 * list, unless we are upgrading or this is a dirty reader, in which
	 * case the locker goes at or near the front of the list.
	 */
	ihold = 0;
	grant_dirty = 0;
	holder = 0;
	wwrite = NULL;

	/*
	 * SWITCH is a special case, used by the queue access method
	 * when we want to get an entry which is past the end of the queue.
	 * We have a DB_READ_LOCK and need to switch it to DB_LOCK_WAIT and
	 * join the waiters queue.  This must be done as a single operation
	 * so that another locker cannot get in and fail to wake us up.
	 */
	if (LF_ISSET(DB_LOCK_SWITCH))
		lp = NULL;
	else
		lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);

	for (; lp != NULL; lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
		if (locker == lp->holder) {
			if (lp->mode == lock_mode &&
			    lp->status == DB_LSTAT_HELD) {
				if (LF_ISSET(DB_LOCK_UPGRADE))
					goto upgrade;

				/*
				 * Lock is held, so we can increment the
				 * reference count and return this lock
				 * to the caller.  We do not count reference
				 * increments towards the locks held by
				 * the locker.
				 */
				lp->refcount++;
				lock->off = R_OFFSET(&lt->reginfo, lp);
				lock->gen = lp->gen;
				lock->mode = lp->mode;
				goto done;
			} else {
				ihold = 1;
				if (lock_mode == DB_LOCK_WRITE &&
				    lp->mode == DB_LOCK_WWRITE)
					wwrite = lp;
			}
		} else if (__lock_is_parent(lt, lp->holder, sh_locker))
			ihold = 1;
		else if (CONFLICTS(lt, region, lp->mode, lock_mode))
			break;
		else if (lp->mode == DB_LOCK_READ ||
		    lp->mode == DB_LOCK_WWRITE) {
			grant_dirty = 1;
			holder = lp->holder;
		}
	}

	/*
	 * If there are conflicting holders we will have to wait.
	 * An upgrade or dirty reader goes to the head
	 * of the queue, everyone else to the back.
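	 *
	 * For instance (hypothetical lockers): if T1 holds a READ lock and
	 * T2 is already waiting for a WRITE lock, a new READ request from
	 * T3 conflicts with the waiting WRITE and must queue at the TAIL so
	 * the writer is not starved; a DB_LOCK_DIRTY request from T3, by
	 * contrast, is handled by the DIRTY_READ rules described below.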
	 */
	if (lp != NULL) {
		if (LF_ISSET(DB_LOCK_UPGRADE) ||
		    wwrite != NULL || lock_mode == DB_LOCK_DIRTY)
			action = HEAD;
		else
			action = TAIL;
	} else {
		if (LF_ISSET(DB_LOCK_SWITCH))
			action = TAIL;
		else if (LF_ISSET(DB_LOCK_UPGRADE) || wwrite != NULL)
			action = UPGRADE;
		else if (ihold)
			action = GRANT;
		else {
			/*
			 * Look for conflicting waiters.
			 */
			for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock);
			    lp != NULL;
			    lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
				if (CONFLICTS(lt, region, lp->mode,
				    lock_mode) && locker != lp->holder)
					break;
			}
			/*
			 * If there are no conflicting holders or waiters,
			 * then we grant.  Normally when we wait, we
			 * wait at the end (TAIL).  However, the goal of
			 * DIRTY_READ locks is to allow forward progress in
			 * the face of updating transactions, so we try to
			 * allow all DIRTY_READ requests to proceed as
			 * rapidly as possible, so long as we can prevent
			 * starvation.
			 *
			 * When determining how to queue a DIRTY_READ
			 * request:
			 *
			 *	1. If there is a waiting upgrading writer,
			 *	   then we enqueue the dirty reader BEHIND it
			 *	   (second in the queue).
			 *	2. Else, if the current holders are either
			 *	   READ or WWRITE, we grant.
			 *	3. Else queue SECOND, i.e., behind the first
			 *	   waiter.
			 *
			 * The end result is that dirty_readers get to run
			 * so long as other lockers are blocked.  Once
			 * there is a locker which is only waiting on
			 * dirty readers then they queue up behind that
			 * locker so that it gets to run.  In general
			 * this locker will be a WRITE which will shortly
			 * get downgraded to a WWRITE, permitting the
			 * DIRTY locks to be granted.
			 */
			if (lp == NULL)
				action = GRANT;
			else if (lock_mode == DB_LOCK_DIRTY && grant_dirty) {
				/*
				 * An upgrade will be at the head of the
				 * queue.
				 */
				lp = SH_TAILQ_FIRST(
				    &sh_obj->waiters, __db_lock);
				if (lp->mode == DB_LOCK_WRITE &&
				    lp->holder == holder)
					action = SECOND;
				else
					action = GRANT;
			} else if (lock_mode == DB_LOCK_DIRTY)
				action = SECOND;
			else
				action = TAIL;
		}
	}

	switch (action) {
	case HEAD:
	case TAIL:
	case SECOND:
	case GRANT:
		/* Allocate a new lock. */
		if (++region->stat.st_nlocks > region->stat.st_maxnlocks)
			region->stat.st_maxnlocks = region->stat.st_nlocks;
		if ((newl =
		    SH_TAILQ_FIRST(&region->free_locks, __db_lock)) != NULL)
			SH_TAILQ_REMOVE(
			    &region->free_locks, newl, links, __db_lock);
		if (newl == NULL) {
			__db_err(dbenv, __db_lock_err, "locks");
			return (ENOMEM);
		}
		newl->holder = locker;
		newl->refcount = 1;
		newl->mode = lock_mode;
		newl->obj = SH_PTR_TO_OFF(newl, sh_obj);
		/*
		 * Now, insert the lock onto its locker's list.
		 * If the locker does not currently hold any locks,
		 * there's no reason to run a deadlock
		 * detector, save that information.
		 */
		no_dd = sh_locker->master_locker == INVALID_ROFF &&
		    SH_LIST_FIRST(
		    &sh_locker->child_locker, __db_locker) == NULL &&
		    SH_LIST_FIRST(&sh_locker->heldby, __db_lock) == NULL;

		SH_LIST_INSERT_HEAD(
		    &sh_locker->heldby, newl, locker_links, __db_lock);
		break;

	case UPGRADE:
upgrade:	if (wwrite != NULL) {
			lp = wwrite;
			lp->refcount++;
			lock->off = R_OFFSET(&lt->reginfo, lp);
			lock->gen = lp->gen;
			lock->mode = lock_mode;
		} else
			lp = (struct __db_lock *)R_ADDR(&lt->reginfo,
			    lock->off);
		if (IS_WRITELOCK(lock_mode) && !IS_WRITELOCK(lp->mode))
			sh_locker->nwrites++;
		lp->mode = lock_mode;
		goto done;
	}

	switch (action) {
	case UPGRADE:
		DB_ASSERT(0);
		break;
	case GRANT:
		newl->status = DB_LSTAT_HELD;
		SH_TAILQ_INSERT_TAIL(&sh_obj->holders, newl, links);
		break;
	case HEAD:
	case TAIL:
	case SECOND:
		if (LF_ISSET(DB_LOCK_NOWAIT)) {
			ret = DB_LOCK_NOTGRANTED;
			region->stat.st_nnowaits++;
			goto err;
		}
		if ((lp =
		    SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock)) == NULL)
			SH_TAILQ_INSERT_HEAD(&region->dd_objs,
			    sh_obj, dd_links, __db_lockobj);
		switch (action) {
		case HEAD:
			SH_TAILQ_INSERT_HEAD(
			    &sh_obj->waiters, newl, links, __db_lock);
			break;
		case SECOND:
			SH_TAILQ_INSERT_AFTER(
			    &sh_obj->waiters, lp, newl, links, __db_lock);
			break;
		case TAIL:
			SH_TAILQ_INSERT_TAIL(&sh_obj->waiters, newl, links);
			break;
		default:
			DB_ASSERT(0);
		}

		/* If we are switching, drop the lock we had. */
		if (LF_ISSET(DB_LOCK_SWITCH) &&
		    (ret = __lock_put_nolock(dbenv,
		    lock, &ihold, DB_LOCK_NOWAITERS)) != 0) {
			__lock_remove_waiter(lt, sh_obj, newl, DB_LSTAT_FREE);
			goto err;
		}

		/*
		 * This is really a blocker for the thread.  It should be
		 * initialized locked, so that when we try to acquire it, we
		 * block.
		 */
		newl->status = DB_LSTAT_WAITING;
		region->stat.st_nconflicts++;
		region->need_dd = 1;
		/*
		 * First check to see if this txn has expired.
		 * If not then see if the lock timeout is past
		 * the expiration of the txn, if it is, use
		 * the txn expiration time.  lk_expire is passed
		 * to avoid an extra call to get the time.
		 */
		if (__lock_expired(dbenv,
		    &sh_locker->lk_expire, &sh_locker->tx_expire)) {
			newl->status = DB_LSTAT_EXPIRED;
			sh_locker->lk_expire = sh_locker->tx_expire;

			/* We are done. */
			goto expired;
		}

		/*
		 * If a timeout was specified in this call then it
		 * takes priority.  If a lock timeout has been specified
		 * for this transaction then use that, otherwise use
		 * the global timeout value.
		 */
		if (!LF_ISSET(DB_LOCK_SET_TIMEOUT)) {
			if (F_ISSET(sh_locker, DB_LOCKER_TIMEOUT))
				timeout = sh_locker->lk_timeout;
			else
				timeout = region->lk_timeout;
		}
		if (timeout != 0)
			__lock_expires(dbenv, &sh_locker->lk_expire, timeout);
		else
			LOCK_SET_TIME_INVALID(&sh_locker->lk_expire);

		if (LOCK_TIME_ISVALID(&sh_locker->tx_expire) &&
		    (timeout == 0 || __lock_expired(dbenv,
		    &sh_locker->lk_expire, &sh_locker->tx_expire)))
			sh_locker->lk_expire = sh_locker->tx_expire;
		if (LOCK_TIME_ISVALID(&sh_locker->lk_expire) &&
		    (!LOCK_TIME_ISVALID(&region->next_timeout) ||
		    LOCK_TIME_GREATER(
		    &region->next_timeout, &sh_locker->lk_expire)))
			region->next_timeout = sh_locker->lk_expire;
		UNLOCKREGION(dbenv, (DB_LOCKTAB *)dbenv->lk_handle);

		/*
		 * We are about to wait; before waiting, see if the deadlock
		 * detector should be run.
		 */
		if (region->detect != DB_LOCK_NORUN && !no_dd)
			(void)__lock_detect(dbenv, region->detect, &did_abort);

		MUTEX_LOCK(dbenv, &newl->mutex);
		LOCKREGION(dbenv, (DB_LOCKTAB *)dbenv->lk_handle);

		/* Turn off lock timeout.
		 */
		if (newl->status != DB_LSTAT_EXPIRED)
			LOCK_SET_TIME_INVALID(&sh_locker->lk_expire);

		if (newl->status != DB_LSTAT_PENDING) {
			switch (newl->status) {
			case DB_LSTAT_ABORTED:
				ret = DB_LOCK_DEADLOCK;
				break;
			case DB_LSTAT_NOTEXIST:
				ret = DB_LOCK_NOTEXIST;
				break;
			case DB_LSTAT_EXPIRED:
expired:			SHOBJECT_LOCK(lt, region, sh_obj, obj_ndx);
				if ((ret = __lock_put_internal(
				    lt, newl, obj_ndx,
				    DB_LOCK_UNLINK | DB_LOCK_FREE)) != 0)
					goto err;
				if (LOCK_TIME_EQUAL(
				    &sh_locker->lk_expire,
				    &sh_locker->tx_expire)) {
					region->stat.st_ntxntimeouts++;
					return (DB_LOCK_NOTGRANTED);
				} else {
					region->stat.st_nlocktimeouts++;
					return (DB_LOCK_NOTGRANTED);
				}
			default:
				ret = EINVAL;
				break;
			}
			goto err;
		} else if (LF_ISSET(DB_LOCK_UPGRADE)) {
			/*
			 * The lock that was just granted got put on the
			 * holders list.  Since we're upgrading some other
			 * lock, we've got to remove it here.
			 */
			SH_TAILQ_REMOVE(
			    &sh_obj->holders, newl, links, __db_lock);
			/*
			 * Ensure that the object is not believed to be on
			 * the object's lists, if we're traversing by locker.
			 */
			newl->links.stqe_prev = -1;
			goto upgrade;
		} else
			newl->status = DB_LSTAT_HELD;
	}

	lock->off = R_OFFSET(&lt->reginfo, newl);
	lock->gen = newl->gen;
	lock->mode = newl->mode;
	sh_locker->nlocks++;
	if (IS_WRITELOCK(newl->mode))
		sh_locker->nwrites++;

	return (0);

done:	ret = 0;
err:	if (newl != NULL &&
	    (t_ret = __lock_freelock(lt, newl, locker,
	    DB_LOCK_FREE | DB_LOCK_UNLINK)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * __lock_put_pp --
 *	DB_ENV->lock_put pre/post processing.
 *
 * PUBLIC: int __lock_put_pp __P((DB_ENV *, DB_LOCK *));
 */
int
__lock_put_pp(dbenv, lock)
	DB_ENV *dbenv;
	DB_LOCK *lock;
{
	int rep_check, ret;

	PANIC_CHECK(dbenv);
	ENV_REQUIRES_CONFIG(dbenv,
	    dbenv->lk_handle, "DB_LOCK->lock_put", DB_INIT_LOCK);

	rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0;
	if (rep_check)
		__env_rep_enter(dbenv);
	ret = __lock_put(dbenv, lock);
	if (rep_check)
		__env_rep_exit(dbenv);
	return (ret);
}

/*
 * __lock_put --
 *	DB_ENV->lock_put.
 *
 * PUBLIC: int __lock_put __P((DB_ENV *, DB_LOCK *));
 */
int
__lock_put(dbenv, lock)
	DB_ENV *dbenv;
	DB_LOCK *lock;
{
	DB_LOCKTAB *lt;
	int ret, run_dd;

	if (IS_RECOVERING(dbenv))
		return (0);

	lt = dbenv->lk_handle;

	LOCKREGION(dbenv, lt);
	ret = __lock_put_nolock(dbenv, lock, &run_dd, 0);
	UNLOCKREGION(dbenv, lt);

	/*
	 * Only run the lock detector if put told us to AND we are running
	 * in auto-detect mode.  If we are not running in auto-detect, then
	 * a call to lock_detect here will 0 the need_dd bit, but will not
	 * actually abort anything.
	 */
	if (ret == 0 && run_dd)
		(void)__lock_detect(dbenv,
		    ((DB_LOCKREGION *)lt->reginfo.primary)->detect, NULL);
	return (ret);
}

static int
__lock_put_nolock(dbenv, lock, runp, flags)
	DB_ENV *dbenv;
	DB_LOCK *lock;
	int *runp;
	u_int32_t flags;
{
	struct __db_lock *lockp;
	DB_LOCKREGION *region;
	DB_LOCKTAB *lt;
	int ret;

	/* Check if locks have been globally turned off. */
	if (F_ISSET(dbenv, DB_ENV_NOLOCKING))
		return (0);

	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;

	lockp = (struct __db_lock *)R_ADDR(&lt->reginfo, lock->off);
	LOCK_INIT(*lock);
	if (lock->gen != lockp->gen) {
		__db_err(dbenv, __db_lock_invalid, "DB_LOCK->lock_put");
		return (EINVAL);
	}

	ret = __lock_put_internal(lt,
	    lockp, lock->ndx, flags | DB_LOCK_UNLINK | DB_LOCK_FREE);

	*runp = 0;
	if (ret == 0 && region->detect != DB_LOCK_NORUN &&
	    (region->need_dd || LOCK_TIME_ISVALID(&region->next_timeout)))
		*runp = 1;

	return (ret);
}
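/*
 * The detector run requested above is normally configured once per
 * environment; a minimal sketch through the public API (the policy
 * choice is up to the application):
 *
 *	(void)dbenv->set_lk_detect(dbenv, DB_LOCK_DEFAULT);
 *
 * after which blocked requests trigger detection automatically, or an
 * explicit sweep can be forced with
 *
 *	(void)dbenv->lock_detect(dbenv, 0, DB_LOCK_DEFAULT, &aborted);
 */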
/*
 * __lock_downgrade --
 *
 * Used to downgrade locks.  Currently this is used in two places: 1) by the
 * Concurrent Data Store product to downgrade write locks back to iwrite locks
 * and 2) to downgrade write-handle locks to read-handle locks at the end of
 * an open/create.
 *
 * PUBLIC: int __lock_downgrade __P((DB_ENV *,
 * PUBLIC:     DB_LOCK *, db_lockmode_t, u_int32_t));
 */
int
__lock_downgrade(dbenv, lock, new_mode, flags)
	DB_ENV *dbenv;
	DB_LOCK *lock;
	db_lockmode_t new_mode;
	u_int32_t flags;
{
	struct __db_lock *lockp;
	DB_LOCKER *sh_locker;
	DB_LOCKOBJ *obj;
	DB_LOCKREGION *region;
	DB_LOCKTAB *lt;
	u_int32_t indx;
	int ret;

	COMPQUIET(flags, 0);

	PANIC_CHECK(dbenv);
	ret = 0;

	/* Check if locks have been globally turned off. */
	if (F_ISSET(dbenv, DB_ENV_NOLOCKING))
		return (0);

	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;

	LOCKREGION(dbenv, lt);

	lockp = (struct __db_lock *)R_ADDR(&lt->reginfo, lock->off);
	if (lock->gen != lockp->gen) {
		__db_err(dbenv, __db_lock_invalid, "lock_downgrade");
		ret = EINVAL;
		goto out;
	}

	LOCKER_LOCK(lt, region, lockp->holder, indx);

	if ((ret = __lock_getlocker(lt, lockp->holder,
	    indx, 0, &sh_locker)) != 0 || sh_locker == NULL) {
		if (ret == 0)
			ret = EINVAL;
		__db_err(dbenv, __db_locker_invalid);
		goto out;
	}
	if (IS_WRITELOCK(lockp->mode) && !IS_WRITELOCK(new_mode))
		sh_locker->nwrites--;

	if (new_mode == DB_LOCK_WWRITE)
		F_SET(sh_locker, DB_LOCKER_DIRTY);

	lockp->mode = new_mode;

	/* Get the object associated with this lock. */
	obj = (DB_LOCKOBJ *)((u_int8_t *)lockp + lockp->obj);
	(void)__lock_promote(lt, obj, LF_ISSET(DB_LOCK_NOWAITERS));

out:	UNLOCKREGION(dbenv, lt);
	return (ret);
}

static int
__lock_put_internal(lt, lockp, obj_ndx, flags)
	DB_LOCKTAB *lt;
	struct __db_lock *lockp;
	u_int32_t obj_ndx, flags;
{
	DB_LOCKOBJ *sh_obj;
	DB_LOCKREGION *region;
	int ret, state_changed;

	region = lt->reginfo.primary;
	ret = state_changed = 0;

	if (!OBJ_LINKS_VALID(lockp)) {
		/*
		 * Someone removed this lock while we were doing a release
		 * by locker id.  We are trying to free this lock, but it's
		 * already been done; all we need to do is return it to the
		 * free list.
		 */
		(void)__lock_freelock(lt, lockp, 0, DB_LOCK_FREE);
		return (0);
	}

	if (LF_ISSET(DB_LOCK_DOALL))
		region->stat.st_nreleases += lockp->refcount;
	else
		region->stat.st_nreleases++;

	if (!LF_ISSET(DB_LOCK_DOALL) && lockp->refcount > 1) {
		lockp->refcount--;
		return (0);
	}

	/* Increment generation number. */
	lockp->gen++;

	/* Get the object associated with this lock. */
	sh_obj = (DB_LOCKOBJ *)((u_int8_t *)lockp + lockp->obj);

	/* Remove this lock from its holders/waitlist. */
	if (lockp->status != DB_LSTAT_HELD &&
	    lockp->status != DB_LSTAT_PENDING)
		__lock_remove_waiter(lt, sh_obj, lockp, DB_LSTAT_FREE);
	else {
		SH_TAILQ_REMOVE(&sh_obj->holders, lockp, links, __db_lock);
		lockp->links.stqe_prev = -1;
	}

	if (LF_ISSET(DB_LOCK_NOPROMOTE))
		state_changed = 0;
	else
		state_changed = __lock_promote(lt,
		    sh_obj, LF_ISSET(DB_LOCK_REMOVE | DB_LOCK_NOWAITERS));

	/* Check if object should be reclaimed. */
	if (SH_TAILQ_FIRST(&sh_obj->holders, __db_lock) == NULL &&
	    SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock) == NULL) {
		HASHREMOVE_EL(lt->obj_tab,
		    obj_ndx, __db_lockobj, links, sh_obj);
		if (sh_obj->lockobj.size > sizeof(sh_obj->objdata))
			__db_shalloc_free(lt->reginfo.addr,
			    SH_DBT_PTR(&sh_obj->lockobj));
		SH_TAILQ_INSERT_HEAD(
		    &region->free_objs, sh_obj, links, __db_lockobj);
		region->stat.st_nobjects--;
		state_changed = 1;
	}

	/* Free lock. */
	if (LF_ISSET(DB_LOCK_UNLINK | DB_LOCK_FREE))
		ret = __lock_freelock(lt, lockp, lockp->holder, flags);

	/*
	 * If we did not promote anyone, we need to run the deadlock
	 * detector again.
	 */
	if (state_changed == 0)
		region->need_dd = 1;

	return (ret);
}

/*
 * Utility functions; listed alphabetically.
 */

/*
 * __lock_freelock --
 *	Free a lock.  Unlink it from its locker if necessary.
 *
 */
static int
__lock_freelock(lt, lockp, locker, flags)
	DB_LOCKTAB *lt;
	struct __db_lock *lockp;
	u_int32_t locker, flags;
{
	DB_ENV *dbenv;
	DB_LOCKER *sh_locker;
	DB_LOCKREGION *region;
	u_int32_t indx;
	int ret;

	dbenv = lt->dbenv;
	region = lt->reginfo.primary;
	ret = 0;

	if (LF_ISSET(DB_LOCK_UNLINK)) {
		LOCKER_LOCK(lt, region, locker, indx);

		if ((ret = __lock_getlocker(lt,
		    locker, indx, 0, &sh_locker)) != 0 || sh_locker == NULL) {
			if (ret == 0)
				ret = EINVAL;
			__db_err(dbenv, __db_locker_invalid);
			return (ret);
		}

		SH_LIST_REMOVE(lockp, locker_links, __db_lock);
		if (lockp->status == DB_LSTAT_HELD) {
			sh_locker->nlocks--;
			if (IS_WRITELOCK(lockp->mode))
				sh_locker->nwrites--;
		}
	}

	if (LF_ISSET(DB_LOCK_FREE)) {
		lockp->status = DB_LSTAT_FREE;
		SH_TAILQ_INSERT_HEAD(
		    &region->free_locks, lockp, links, __db_lock);
		region->stat.st_nlocks--;
	}

	return (ret);
}

/*
 * __lock_addfamilylocker
 *	Put a locker entry in for a child transaction.
 *
 * PUBLIC: int __lock_addfamilylocker __P((DB_ENV *, u_int32_t, u_int32_t));
 */
int
__lock_addfamilylocker(dbenv, pid, id)
	DB_ENV *dbenv;
	u_int32_t pid, id;
{
	DB_LOCKER *lockerp, *mlockerp;
	DB_LOCKREGION *region;
	DB_LOCKTAB *lt;
	u_int32_t ndx;
	int ret;

	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;
	LOCKREGION(dbenv, lt);

	/* Get/create the parent locker info. */
	LOCKER_LOCK(lt, region, pid, ndx);
	if ((ret = __lock_getlocker(dbenv->lk_handle,
	    pid, ndx, 1, &mlockerp)) != 0)
		goto err;

	/*
	 * We assume that only one thread can manipulate
	 * a single transaction family.
	 * Therefore the master locker cannot go away while
	 * we manipulate it, nor can another child in the
	 * family be created at the same time.
	 */
	LOCKER_LOCK(lt, region, id, ndx);
	if ((ret = __lock_getlocker(dbenv->lk_handle,
	    id, ndx, 1, &lockerp)) != 0)
		goto err;

	/* Point to our parent. */
	lockerp->parent_locker = R_OFFSET(&lt->reginfo, mlockerp);

	/* See if this locker is the family master. */
	if (mlockerp->master_locker == INVALID_ROFF)
		lockerp->master_locker = R_OFFSET(&lt->reginfo, mlockerp);
	else {
		lockerp->master_locker = mlockerp->master_locker;
		mlockerp = R_ADDR(&lt->reginfo, mlockerp->master_locker);
	}

	/*
	 * Link the child at the head of the master's list.
	 * The guess is, when looking for deadlock, that
	 * the most recent child is the one that's blocked.
	 */
	SH_LIST_INSERT_HEAD(
	    &mlockerp->child_locker, lockerp, child_link, __db_locker);

err:	UNLOCKREGION(dbenv, lt);

	return (ret);
}

/*
 * __lock_freefamilylocker
 *	Remove a locker from the hash table and its family.
 *
 *	This must be called without the locker bucket locked.
 *
 * PUBLIC: int __lock_freefamilylocker __P((DB_LOCKTAB *, u_int32_t));
 */
int
__lock_freefamilylocker(lt, locker)
	DB_LOCKTAB *lt;
	u_int32_t locker;
{
	DB_ENV *dbenv;
	DB_LOCKER *sh_locker;
	DB_LOCKREGION *region;
	u_int32_t indx;
	int ret;

	dbenv = lt->dbenv;
	region = lt->reginfo.primary;

	LOCKREGION(dbenv, lt);
	LOCKER_LOCK(lt, region, locker, indx);

	if ((ret = __lock_getlocker(lt,
	    locker, indx, 0, &sh_locker)) != 0 || sh_locker == NULL)
		goto err;

	if (SH_LIST_FIRST(&sh_locker->heldby, __db_lock) != NULL) {
		ret = EINVAL;
		__db_err(dbenv, "Freeing locker with locks");
		goto err;
	}

	/* If this is part of a family, we must fix up its links.
	 */
	if (sh_locker->master_locker != INVALID_ROFF)
		SH_LIST_REMOVE(sh_locker, child_link, __db_locker);

	__lock_freelocker(lt, region, sh_locker, indx);

err:	UNLOCKREGION(dbenv, lt);
	return (ret);
}

/*
 * __lock_freelocker
 *	Common code for deleting a locker.
 *
 *	This must be called with the locker bucket locked.
 */
static void
__lock_freelocker(lt, region, sh_locker, indx)
	DB_LOCKTAB *lt;
	DB_LOCKREGION *region;
	DB_LOCKER *sh_locker;
	u_int32_t indx;
{
	HASHREMOVE_EL(
	    lt->locker_tab, indx, __db_locker, links, sh_locker);
	SH_TAILQ_INSERT_HEAD(
	    &region->free_lockers, sh_locker, links, __db_locker);
	SH_TAILQ_REMOVE(&region->lockers, sh_locker, ulinks, __db_locker);
	region->stat.st_nlockers--;
}

/*
 * __lock_set_timeout
 *		-- set timeout values in shared memory.
 * This is called from the transaction system.
 * We either set the time that this transaction expires or the
 * amount of time that a lock for this transaction is permitted
 * to wait.
 *
 * PUBLIC: int __lock_set_timeout __P((DB_ENV *,
 * PUBLIC:     u_int32_t, db_timeout_t, u_int32_t));
 */
int
__lock_set_timeout(dbenv, locker, timeout, op)
	DB_ENV *dbenv;
	u_int32_t locker;
	db_timeout_t timeout;
	u_int32_t op;
{
	DB_LOCKTAB *lt;
	int ret;

	lt = dbenv->lk_handle;

	LOCKREGION(dbenv, lt);
	ret = __lock_set_timeout_internal(dbenv, locker, timeout, op);
	UNLOCKREGION(dbenv, lt);
	return (ret);
}

/*
 * __lock_set_timeout_internal
 *		-- set timeout values in shared memory.
 * This is the internal version called from the lock system.
 * We either set the time that this transaction expires or the
 * amount of time that a lock for this transaction is permitted
 * to wait.
 */
static int
__lock_set_timeout_internal(dbenv, locker, timeout, op)
	DB_ENV *dbenv;
	u_int32_t locker;
	db_timeout_t timeout;
	u_int32_t op;
{
	DB_LOCKER *sh_locker;
	DB_LOCKREGION *region;
	DB_LOCKTAB *lt;
	u_int32_t locker_ndx;
	int ret;

	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;

	LOCKER_LOCK(lt, region, locker, locker_ndx);
	ret = __lock_getlocker(lt, locker, locker_ndx, 1, &sh_locker);
	if (ret != 0)
		return (ret);

	if (op == DB_SET_TXN_TIMEOUT) {
		if (timeout == 0)
			LOCK_SET_TIME_INVALID(&sh_locker->tx_expire);
		else
			__lock_expires(dbenv, &sh_locker->tx_expire, timeout);
	} else if (op == DB_SET_LOCK_TIMEOUT) {
		sh_locker->lk_timeout = timeout;
		F_SET(sh_locker, DB_LOCKER_TIMEOUT);
	} else if (op == DB_SET_TXN_NOW) {
		LOCK_SET_TIME_INVALID(&sh_locker->tx_expire);
		__lock_expires(dbenv, &sh_locker->tx_expire, 0);
		sh_locker->lk_expire = sh_locker->tx_expire;
		if (!LOCK_TIME_ISVALID(&region->next_timeout) ||
		    LOCK_TIME_GREATER(
		    &region->next_timeout, &sh_locker->lk_expire))
			region->next_timeout = sh_locker->lk_expire;
	} else
		return (EINVAL);

	return (0);
}
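/*
 * These values normally arrive through the public API; a minimal sketch,
 * using an arbitrary five-second (5000000 microsecond) value:
 *
 *	(void)dbenv->set_timeout(dbenv, 5000000, DB_SET_LOCK_TIMEOUT);
 *
 * after which a lock request that waits past its expiration fails with
 * DB_LOCK_NOTGRANTED.
 */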
/*
 * __lock_inherit_timeout
 *		-- inherit timeout values from parent locker.
 * This is called from the transaction system.  This will
 * return EINVAL if the parent does not exist or did not
 * have a current txn timeout set.
 *
 * PUBLIC: int __lock_inherit_timeout __P((DB_ENV *, u_int32_t, u_int32_t));
 */
int
__lock_inherit_timeout(dbenv, parent, locker)
	DB_ENV *dbenv;
	u_int32_t parent, locker;
{
	DB_LOCKER *parent_locker, *sh_locker;
	DB_LOCKREGION *region;
	DB_LOCKTAB *lt;
	u_int32_t locker_ndx;
	int ret;

	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;
	ret = 0;
	LOCKREGION(dbenv, lt);

	/* If the parent does not exist, we are done. */
	LOCKER_LOCK(lt, region, parent, locker_ndx);
	if ((ret = __lock_getlocker(lt,
	    parent, locker_ndx, 0, &parent_locker)) != 0)
		goto err;

	/*
	 * If the parent is not there yet, that's ok.  If it
	 * does not have any timeouts set, then avoid creating
	 * the child locker at this point.
	 */
	if (parent_locker == NULL ||
	    (!LOCK_TIME_ISVALID(&parent_locker->tx_expire) &&
	    !F_ISSET(parent_locker, DB_LOCKER_TIMEOUT))) {
		ret = EINVAL;
		goto done;
	}

	LOCKER_LOCK(lt, region, locker, locker_ndx);
	if ((ret = __lock_getlocker(lt,
	    locker, locker_ndx, 1, &sh_locker)) != 0)
		goto err;

	sh_locker->tx_expire = parent_locker->tx_expire;

	if (F_ISSET(parent_locker, DB_LOCKER_TIMEOUT)) {
		sh_locker->lk_timeout = parent_locker->lk_timeout;
		F_SET(sh_locker, DB_LOCKER_TIMEOUT);
		if (!LOCK_TIME_ISVALID(&parent_locker->tx_expire))
			ret = EINVAL;
	}

done:
err:	UNLOCKREGION(dbenv, lt);
	return (ret);
}

/*
 * __lock_getlocker --
 *	Get a locker in the locker hash table.  The create parameter
 * indicates if the locker should be created if it doesn't exist in
 * the table.
 *
 * This must be called with the locker bucket locked.
 *
 * PUBLIC: int __lock_getlocker __P((DB_LOCKTAB *,
 * PUBLIC:     u_int32_t, u_int32_t, int, DB_LOCKER **));
 */
int
__lock_getlocker(lt, locker, indx, create, retp)
	DB_LOCKTAB *lt;
	u_int32_t locker, indx;
	int create;
	DB_LOCKER **retp;
{
	DB_ENV *dbenv;
	DB_LOCKER *sh_locker;
	DB_LOCKREGION *region;

	dbenv = lt->dbenv;
	region = lt->reginfo.primary;

	HASHLOOKUP(lt->locker_tab,
	    indx, __db_locker, links, locker, sh_locker, __lock_locker_cmp);

	/*
	 * If we found the locker, then we can just return it.  If
	 * we didn't find the locker, then we need to create it.
	 */
	if (sh_locker == NULL && create) {
		/* Create new locker and then insert it into hash table. */
		if ((sh_locker = SH_TAILQ_FIRST(
		    &region->free_lockers, __db_locker)) == NULL) {
			__db_err(dbenv, __db_lock_err, "locker entries");
			return (ENOMEM);
		}
		SH_TAILQ_REMOVE(
		    &region->free_lockers, sh_locker, links, __db_locker);
		if (++region->stat.st_nlockers > region->stat.st_maxnlockers)
			region->stat.st_maxnlockers = region->stat.st_nlockers;

		sh_locker->id = locker;
		sh_locker->dd_id = 0;
		sh_locker->master_locker = INVALID_ROFF;
		sh_locker->parent_locker = INVALID_ROFF;
		SH_LIST_INIT(&sh_locker->child_locker);
		sh_locker->flags = 0;
		SH_LIST_INIT(&sh_locker->heldby);
		sh_locker->nlocks = 0;
		sh_locker->nwrites = 0;
		sh_locker->lk_timeout = 0;
		LOCK_SET_TIME_INVALID(&sh_locker->tx_expire);
		LOCK_SET_TIME_INVALID(&sh_locker->lk_expire);

		HASHINSERT(lt->locker_tab,
		    indx, __db_locker, links, sh_locker);
		SH_TAILQ_INSERT_HEAD(&region->lockers,
		    sh_locker, ulinks, __db_locker);
	}

	*retp = sh_locker;
	return (0);
}

/*
 * __lock_getobj --
 *	Get an object in the object hash table.  The create parameter
 * indicates if the object should be created if it doesn't exist in
 * the table.
 *
 * This must be called with the object bucket locked.
 */
static int
__lock_getobj(lt, obj, ndx, create, retp)
	DB_LOCKTAB *lt;
	const DBT *obj;
	u_int32_t ndx;
	int create;
	DB_LOCKOBJ **retp;
{
	DB_ENV *dbenv;
	DB_LOCKOBJ *sh_obj;
	DB_LOCKREGION *region;
	int ret;
	void *p;

	dbenv = lt->dbenv;
	region = lt->reginfo.primary;

	/* Look up the object in the hash table. */
	HASHLOOKUP(lt->obj_tab,
	    ndx, __db_lockobj, links, obj, sh_obj, __lock_cmp);

	/*
	 * If we found the object, then we can just return it.  If
	 * we didn't find the object, then we need to create it.
	 */
	if (sh_obj == NULL && create) {
		/* Create new object and then insert it into hash table. */
		if ((sh_obj =
		    SH_TAILQ_FIRST(&region->free_objs, __db_lockobj)) == NULL) {
			__db_err(lt->dbenv, __db_lock_err, "object entries");
			ret = ENOMEM;
			goto err;
		}

		/*
		 * If we can fit this object in the structure, do so instead
		 * of shalloc-ing space for it.
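		 * (In the common case the object is a DB_LOCK_ILOCK --
		 * fileid, page number and type -- which is small enough to
		 * fit in objdata, so page locks normally avoid the shared
		 * allocator entirely.)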
		 */
		if (obj->size <= sizeof(sh_obj->objdata))
			p = sh_obj->objdata;
		else if ((ret = __db_shalloc(
		    lt->reginfo.addr, obj->size, 0, &p)) != 0) {
			__db_err(dbenv, "No space for lock object storage");
			goto err;
		}

		memcpy(p, obj->data, obj->size);

		SH_TAILQ_REMOVE(
		    &region->free_objs, sh_obj, links, __db_lockobj);
		if (++region->stat.st_nobjects > region->stat.st_maxnobjects)
			region->stat.st_maxnobjects = region->stat.st_nobjects;

		SH_TAILQ_INIT(&sh_obj->waiters);
		SH_TAILQ_INIT(&sh_obj->holders);
		sh_obj->lockobj.size = obj->size;
		sh_obj->lockobj.off = SH_PTR_TO_OFF(&sh_obj->lockobj, p);

		HASHINSERT(lt->obj_tab, ndx, __db_lockobj, links, sh_obj);
	}

	*retp = sh_obj;
	return (0);

err:	return (ret);
}

/*
 * __lock_is_parent --
 *	Given a locker and a transaction, return 1 if the locker is
 * an ancestor of the designated transaction.  This is used to determine
 * if we should grant locks that appear to conflict, but don't because
 * the lock is already held by an ancestor.
 */
static int
__lock_is_parent(lt, locker, sh_locker)
	DB_LOCKTAB *lt;
	u_int32_t locker;
	DB_LOCKER *sh_locker;
{
	DB_LOCKER *parent;

	parent = sh_locker;
	while (parent->parent_locker != INVALID_ROFF) {
		parent = (DB_LOCKER *)
		    R_ADDR(&lt->reginfo, parent->parent_locker);
		if (parent->id == locker)
			return (1);
	}

	return (0);
}

/*
 * __lock_inherit_locks --
 *	Called on child commit to merge child's locks with parent's.
 */
static int
__lock_inherit_locks(lt, locker, flags)
	DB_LOCKTAB *lt;
	u_int32_t locker;
	u_int32_t flags;
{
	DB_ENV *dbenv;
	DB_LOCKER *sh_locker, *sh_parent;
	DB_LOCKOBJ *obj;
	DB_LOCKREGION *region;
	int ret;
	struct __db_lock *hlp, *lp;
	u_int32_t ndx;

	region = lt->reginfo.primary;
	dbenv = lt->dbenv;

	/*
	 * Get the committing locker and mark it as deleted.
	 * This allows us to traverse the locker links without
	 * worrying that someone else is deleting locks out
	 * from under us.  However, if the locker doesn't
	 * exist, that just means that the child holds no
	 * locks, so inheritance is easy!
	 */
	LOCKER_LOCK(lt, region, locker, ndx);
	if ((ret = __lock_getlocker(lt,
	    locker, ndx, 0, &sh_locker)) != 0 ||
	    sh_locker == NULL ||
	    F_ISSET(sh_locker, DB_LOCKER_DELETED)) {
		if (ret == 0 && sh_locker != NULL)
			ret = EINVAL;
		__db_err(dbenv, __db_locker_invalid);
		goto err;
	}

	/* Make sure we are a child transaction. */
	if (sh_locker->parent_locker == INVALID_ROFF) {
		__db_err(dbenv, "Not a child transaction");
		ret = EINVAL;
		goto err;
	}
	sh_parent = (DB_LOCKER *)
	    R_ADDR(&lt->reginfo, sh_locker->parent_locker);
	F_SET(sh_locker, DB_LOCKER_DELETED);

	/*
	 * Now, lock the parent locker; move locks from
	 * the committing list to the parent's list.
	 */
	LOCKER_LOCK(lt, region, locker, ndx);
	if (F_ISSET(sh_parent, DB_LOCKER_DELETED)) {
		if (ret == 0) {
			__db_err(dbenv, "Parent locker is not valid");
			ret = EINVAL;
		}
		goto err;
	}

	/*
	 * In order to make it possible for a parent to have
	 * many, many children who lock the same objects, and
	 * not require an inordinate number of locks, we try
	 * to merge the child's locks with its parent's.
	 */
	for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
	    lp != NULL;
	    lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) {
		SH_LIST_REMOVE(lp, locker_links, __db_lock);

		/* See if the parent already has a lock. */
		obj = (DB_LOCKOBJ *)((u_int8_t *)lp + lp->obj);
		for (hlp = SH_TAILQ_FIRST(&obj->holders, __db_lock);
		    hlp != NULL;
		    hlp = SH_TAILQ_NEXT(hlp, links, __db_lock))
			if (hlp->holder == sh_parent->id &&
			    lp->mode == hlp->mode)
				break;

		if (hlp != NULL) {
			/* Parent already holds lock. */
			hlp->refcount += lp->refcount;

			/* Remove lock from object list and free it.
			 */
			DB_ASSERT(lp->status == DB_LSTAT_HELD);
			SH_TAILQ_REMOVE(&obj->holders, lp, links, __db_lock);
			(void)__lock_freelock(lt, lp, locker, DB_LOCK_FREE);
		} else {
			/* Just move lock to parent chains. */
			SH_LIST_INSERT_HEAD(&sh_parent->heldby,
			    lp, locker_links, __db_lock);
			lp->holder = sh_parent->id;
		}

		/*
		 * We may need to promote regardless of whether we simply
		 * moved the lock to the parent or changed the parent's
		 * reference count, because there might be a sibling waiting,
		 * who will now be allowed to make forward progress.
		 */
		(void)__lock_promote(lt, obj, LF_ISSET(DB_LOCK_NOWAITERS));
	}

	/* Transfer child counts to parent. */
	sh_parent->nlocks += sh_locker->nlocks;
	sh_parent->nwrites += sh_locker->nwrites;

err:	return (ret);
}

/*
 * __lock_promote --
 *
 * Look through the waiters and holders lists and decide which (if any)
 * locks can be promoted.  Promote any that are eligible.
 *
 * PUBLIC: int __lock_promote __P((DB_LOCKTAB *, DB_LOCKOBJ *, u_int32_t));
 */
int
__lock_promote(lt, obj, flags)
	DB_LOCKTAB *lt;
	DB_LOCKOBJ *obj;
	u_int32_t flags;
{
	struct __db_lock *lp_w, *lp_h, *next_waiter;
	DB_LOCKER *sh_locker;
	DB_LOCKREGION *region;
	u_int32_t locker_ndx;
	int had_waiters, state_changed;

	region = lt->reginfo.primary;
	had_waiters = 0;

	/*
	 * We need to do lock promotion.  We also need to determine if we're
	 * going to need to run the deadlock detector again.  If we release
	 * locks, and there are waiters, but no one gets promoted, then we
	 * haven't fundamentally changed the lockmgr state, so we may still
	 * have a deadlock and we have to run again.  However, if there were
	 * no waiters, or we actually promoted someone, then we are OK and we
	 * don't have to run it immediately.
	 *
	 * During promotion, we look for state changes so we can return this
	 * information to the caller.
	 */
	for (lp_w = SH_TAILQ_FIRST(&obj->waiters, __db_lock),
	    state_changed = lp_w == NULL;
	    lp_w != NULL;
	    lp_w = next_waiter) {
		had_waiters = 1;
		next_waiter = SH_TAILQ_NEXT(lp_w, links, __db_lock);

		/* Waiter may have aborted or expired. */
		if (lp_w->status != DB_LSTAT_WAITING)
			continue;
		/* Are we switching locks? */
		if (LF_ISSET(DB_LOCK_NOWAITERS) && lp_w->mode == DB_LOCK_WAIT)
			continue;

		if (LF_ISSET(DB_LOCK_REMOVE)) {
			__lock_remove_waiter(lt, obj, lp_w, DB_LSTAT_NOTEXIST);
			continue;
		}
		for (lp_h = SH_TAILQ_FIRST(&obj->holders, __db_lock);
		    lp_h != NULL;
		    lp_h = SH_TAILQ_NEXT(lp_h, links, __db_lock)) {
			if (lp_h->holder != lp_w->holder &&
			    CONFLICTS(lt, region, lp_h->mode, lp_w->mode)) {
				LOCKER_LOCK(lt,
				    region, lp_w->holder, locker_ndx);
				if ((__lock_getlocker(lt, lp_w->holder,
				    locker_ndx, 0, &sh_locker)) != 0) {
					DB_ASSERT(0);
					break;
				}
				if (!__lock_is_parent(lt,
				    lp_h->holder, sh_locker))
					break;
			}
		}
		if (lp_h != NULL)	/* Found a conflict. */
			break;

		/* No conflict, promote the waiting lock. */
		SH_TAILQ_REMOVE(&obj->waiters, lp_w, links, __db_lock);
		lp_w->status = DB_LSTAT_PENDING;
		SH_TAILQ_INSERT_TAIL(&obj->holders, lp_w, links);

		/* Wake up waiter. */
		MUTEX_UNLOCK(lt->dbenv, &lp_w->mutex);
		state_changed = 1;
	}

	/*
	 * If this object had waiters and doesn't any more, then we need
	 * to remove it from the dd_obj list.
	 */
	if (had_waiters && SH_TAILQ_FIRST(&obj->waiters, __db_lock) == NULL)
		SH_TAILQ_REMOVE(&region->dd_objs, obj, dd_links, __db_lockobj);
	return (state_changed);
}
/*
 * __lock_remove_waiter --
 *	Any lock on the waitlist has a process waiting for it.  Therefore,
 * we can't return the lock to the freelist immediately.  Instead, we can
 * remove the lock from the list of waiters, set the status field of the
 * lock, and then let the process waking up return the lock to the
 * free list.
 *
 * This must be called with the Object bucket locked.
 */
static void
__lock_remove_waiter(lt, sh_obj, lockp, status)
	DB_LOCKTAB *lt;
	DB_LOCKOBJ *sh_obj;
	struct __db_lock *lockp;
	db_status_t status;
{
	DB_LOCKREGION *region;
	int do_wakeup;

	region = lt->reginfo.primary;

	do_wakeup = lockp->status == DB_LSTAT_WAITING;

	SH_TAILQ_REMOVE(&sh_obj->waiters, lockp, links, __db_lock);
	lockp->links.stqe_prev = -1;
	lockp->status = status;
	if (SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock) == NULL)
		SH_TAILQ_REMOVE(
		    &region->dd_objs, sh_obj, dd_links, __db_lockobj);

	/*
	 * Wake whoever is waiting on this lock.
	 */
	if (do_wakeup)
		MUTEX_UNLOCK(lt->dbenv, &lockp->mutex);
}

/*
 * __lock_expires -- set the expire time given the time to live.
 * We assume that if timevalp is set then it contains "now".
 * This avoids repeated system calls to get the time.
 */
static void
__lock_expires(dbenv, timevalp, timeout)
	DB_ENV *dbenv;
	db_timeval_t *timevalp;
	db_timeout_t timeout;
{
	if (!LOCK_TIME_ISVALID(timevalp))
		__os_clock(dbenv, &timevalp->tv_sec, &timevalp->tv_usec);
	if (timeout > 1000000) {
		timevalp->tv_sec += timeout / 1000000;
		timevalp->tv_usec += timeout % 1000000;
	} else
		timevalp->tv_usec += timeout;

	if (timevalp->tv_usec > 1000000) {
		timevalp->tv_sec++;
		timevalp->tv_usec -= 1000000;
	}
}

/*
 * __lock_expired -- determine if a lock has expired.
 *
 * PUBLIC: int __lock_expired __P((DB_ENV *, db_timeval_t *, db_timeval_t *));
 */
int
__lock_expired(dbenv, now, timevalp)
	DB_ENV *dbenv;
	db_timeval_t *now, *timevalp;
{
	if (!LOCK_TIME_ISVALID(timevalp))
		return (0);

	if (!LOCK_TIME_ISVALID(now))
		__os_clock(dbenv, &now->tv_sec, &now->tv_usec);

	return (now->tv_sec > timevalp->tv_sec ||
	    (now->tv_sec == timevalp->tv_sec &&
	    now->tv_usec >= timevalp->tv_usec));
}

/*
 * __lock_trade --
 *
 * Trade locker ids on a lock.  This is used to reassign file locks from
 * a transactional locker id to a long-lived locker id.  This should be
 * called with the region mutex held.
 */
static int
__lock_trade(dbenv, lock, new_locker)
	DB_ENV *dbenv;
	DB_LOCK *lock;
	u_int32_t new_locker;
{
	struct __db_lock *lp;
	DB_LOCKREGION *region;
	DB_LOCKTAB *lt;
	DB_LOCKER *sh_locker;
	int ret;
	u_int32_t locker_ndx;

	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;
	lp = (struct __db_lock *)R_ADDR(&lt->reginfo, lock->off);

	/* If the lock is already released, simply return. */
	if (lp->gen != lock->gen)
		return (DB_NOTFOUND);

	/* Make sure that we can get new locker and add this lock to it. */
	LOCKER_LOCK(lt, region, new_locker, locker_ndx);
	if ((ret =
	    __lock_getlocker(lt, new_locker, locker_ndx, 0, &sh_locker)) != 0)
		return (ret);

	if (sh_locker == NULL) {
		__db_err(dbenv, "Locker does not exist");
		return (EINVAL);
	}

	/* Remove the lock from its current locker. */
	if ((ret = __lock_freelock(lt, lp, lp->holder, DB_LOCK_UNLINK)) != 0)
		return (ret);

	/* Add lock to its new locker. */
	SH_LIST_INSERT_HEAD(&sh_locker->heldby, lp, locker_links, __db_lock);
	sh_locker->nlocks++;
	if (IS_WRITELOCK(lp->mode))
		sh_locker->nwrites++;
	lp->holder = new_locker;

	return (0);
}

/*
 * Lock list routines.
 *	The list is composed of a 32-bit count of locks followed by
 * each lock.  A lock is represented by a 16-bit page-count, a lock
 * object and a page list.  A lock object consists of a 16-bit size
 * and the object itself.
 *	In a pseudo BNF notation, you get:
 *
 *	LIST = COUNT32 LOCK*
 *	LOCK = COUNT16 LOCKOBJ PAGELIST
 *	LOCKOBJ = COUNT16 OBJ
 *	PAGELIST = COUNT32*
 *
 *	(Recall that X* means "0 or more X's")
 *
 * In most cases, the OBJ is a struct __db_ilock and the page list is
 * a series of (32-bit) page numbers that should get written into the
 * pgno field of the __db_ilock.  So, the actual number of pages locked
 * is the number of items in the PAGELIST plus 1.  If this is an
 * application-specific lock, then we cannot interpret obj and the
 * pagelist must be empty.
 *
 * Consider a lock list for: File A, pages 1&2, File B pages 3-5, Applock
 * This would be represented as:
 *	5 1 [fid=A;page=1] 2 2 [fid=B;page=3] 4 5 0 APPLOCK
 *	  ------------------ -------------------- ---------
 *	   LOCK for file A    LOCK for file B     application-specific lock
 */

#define	MAX_PGNOS	0xffff

/*
 * These macros are bigger than one might expect because the
 * Solaris compiler says that a cast does not return an lvalue,
 * so constructs like *(u_int32_t*)dp = count; generate warnings.
 */
#define	RET_SIZE(size, count)	((size) +				\
	sizeof(u_int32_t) + (count) * 2 * sizeof(u_int16_t))

#define	PUT_COUNT(dp, count)	do {	u_int32_t *ip = (u_int32_t *)dp;\
					*ip = count;			\
					dp = (u_int8_t *)dp +		\
					    sizeof(u_int32_t);		\
				} while (0)
#define	PUT_PCOUNT(dp, count)	do {	u_int16_t *ip = (u_int16_t *)dp;\
					*ip = count;			\
					dp = (u_int8_t *)dp +		\
					    sizeof(u_int16_t);		\
				} while (0)
#define	PUT_SIZE(dp, size)	do {	u_int16_t *ip = (u_int16_t *)dp;\
					*ip = size;			\
					dp = (u_int8_t *)dp +		\
					    sizeof(u_int16_t);		\
				} while (0)
#define	PUT_PGNO(dp, pgno)	do {	db_pgno_t *ip = (db_pgno_t *)dp;\
					*ip = pgno;			\
					dp = (u_int8_t *)dp +		\
					    sizeof(db_pgno_t);		\
				} while (0)
#define	COPY_OBJ(dp, obj)	do {					\
					memcpy(dp,			\
					    (obj)->data, (obj)->size);	\
					dp = (u_int8_t *)dp +		\
					    ALIGN((obj)->size,		\
					    sizeof(u_int32_t));		\
				} while (0)
#define	GET_COUNT(dp, count)	do {	(count) = *(u_int32_t *)dp;	\
					dp = (u_int8_t *)dp +		\
					    sizeof(u_int32_t);		\
				} while (0)
#define	GET_PCOUNT(dp, count)	do {	(count) = *(u_int16_t *)dp;	\
					dp = (u_int8_t *)dp +		\
					    sizeof(u_int16_t);		\
				} while (0)
#define	GET_SIZE(dp, size)	do {	(size) = *(u_int16_t *)dp;	\
					dp = (u_int8_t *)dp +		\
					    sizeof(u_int16_t);		\
				} while (0)
#define	GET_PGNO(dp, pgno)	do {	(pgno) = *(db_pgno_t *)dp;	\
					dp = (u_int8_t *)dp +		\
					    sizeof(db_pgno_t);		\
				} while (0)

static int
__lock_fix_list(dbenv, list_dbt, nlocks)
	DB_ENV *dbenv;
	DBT *list_dbt;
	u_int32_t nlocks;
{
	DBT *obj;
	DB_LOCK_ILOCK *lock, *plock;
	u_int32_t i, j, nfid, npgno, size;
	int ret;
	u_int8_t *data, *dp;

	size = list_dbt->size;
	if (size == 0)
		return (0);

	obj = (DBT *)list_dbt->data;

	/*
	 * If necessary, sort the list of locks so that locks on the
	 * same fileid are together.  We do not sort 1 or 2 locks because
	 * by definition if there are locks on the same fileid they will
	 * be together.  The sort will also move any locks that do not
	 * look like page locks to the end of the list so we can stop
	 * looking for locks we can combine when we hit one.
	 */
	switch (nlocks) {
	case 1:
		size = RET_SIZE(obj->size, 1);
		if ((ret = __os_malloc(dbenv, size, &data)) != 0)
			return (ret);

		dp = data;
		PUT_COUNT(dp, 1);
		PUT_PCOUNT(dp, 0);
		PUT_SIZE(dp, obj->size);
		COPY_OBJ(dp, obj);
		break;
	default:
		/* Sort so that all locks with same fileid are together.
		 */
		qsort(list_dbt->data, nlocks, sizeof(DBT), __lock_sort_cmp);
		/* FALL THROUGH */
	case 2:
		nfid = npgno = 0;
		i = 0;
		if (obj->size != sizeof(DB_LOCK_ILOCK))
			goto not_ilock;

		nfid = 1;
		plock = (DB_LOCK_ILOCK *)obj->data;

		/* We use ulen to keep track of the number of pages. */
		j = 0;
		obj[0].ulen = 0;
		for (i = 1; i < nlocks; i++) {
			if (obj[i].size != sizeof(DB_LOCK_ILOCK))
				break;
			lock = (DB_LOCK_ILOCK *)obj[i].data;
			if (obj[j].ulen < MAX_PGNOS &&
			    lock->type == plock->type &&
			    memcmp(lock->fileid,
			    plock->fileid, DB_FILE_ID_LEN) == 0) {
				obj[j].ulen++;
				npgno++;
			} else {
				nfid++;
				plock = lock;
				j = i;
				obj[j].ulen = 0;
			}
		}

not_ilock:	size = nfid * sizeof(DB_LOCK_ILOCK);
		size += npgno * sizeof(db_pgno_t);
		/* Add the number of nonstandard locks and get their size. */
		nfid += nlocks - i;
		for (; i < nlocks; i++) {
			size += obj[i].size;
			obj[i].ulen = 0;
		}

		size = RET_SIZE(size, nfid);
		if ((ret = __os_malloc(dbenv, size, &data)) != 0)
			return (ret);

		dp = data;
		PUT_COUNT(dp, nfid);

		for (i = 0; i < nlocks; i = j) {
			PUT_PCOUNT(dp, obj[i].ulen);
			PUT_SIZE(dp, obj[i].size);
			COPY_OBJ(dp, &obj[i]);
			lock = (DB_LOCK_ILOCK *)obj[i].data;
			for (j = i + 1; j <= i + obj[i].ulen; j++) {
				lock = (DB_LOCK_ILOCK *)obj[j].data;
				PUT_PGNO(dp, lock->pgno);
			}
		}
	}

	(void)__os_free(dbenv, list_dbt->data);

	list_dbt->data = data;
	list_dbt->size = size;

	return (0);
}

/*
 * PUBLIC: int __lock_get_list __P((DB_ENV *, u_int32_t, u_int32_t,
 * PUBLIC:     db_lockmode_t, DBT *));
 */
int
__lock_get_list(dbenv, locker, flags, lock_mode, list)
	DB_ENV *dbenv;
	u_int32_t locker, flags;
	db_lockmode_t lock_mode;
	DBT *list;
{
	DBT obj_dbt;
	DB_LOCK ret_lock;
	DB_LOCK_ILOCK *lock;
	DB_LOCKTAB *lt;
	DB_LOCKREGION *region;
	db_pgno_t save_pgno;
	u_int16_t npgno, size;
	u_int32_t i, nlocks;
	int ret;
	void *dp;

	if (list->size == 0)
		return (0);
	ret = 0;
	lt = dbenv->lk_handle;
	region = lt->reginfo.primary;
	dp = list->data;

	GET_COUNT(dp, nlocks);
	LOCKREGION(dbenv, (DB_LOCKTAB *)dbenv->lk_handle);

	for (i = 0; i < nlocks; i++) {
		GET_PCOUNT(dp, npgno);
		GET_SIZE(dp, size);
		lock = (DB_LOCK_ILOCK *)dp;
		save_pgno = lock->pgno;
		obj_dbt.data = dp;
		obj_dbt.size = size;
		dp = ((u_int8_t *)dp) + ALIGN(size, sizeof(u_int32_t));
		do {
			if ((ret = __lock_get_internal(lt, locker,
			    flags, &obj_dbt, lock_mode, 0, &ret_lock)) != 0) {
				lock->pgno = save_pgno;
				goto err;
			}
			if (npgno != 0)
				GET_PGNO(dp, lock->pgno);
		} while (npgno-- != 0);
		lock->pgno = save_pgno;
	}

err:	UNLOCKREGION(dbenv, (DB_LOCKTAB *)dbenv->lk_handle);
	return (ret);
}

static int
__lock_sort_cmp(a, b)
	const void *a, *b;
{
	const DBT *d1, *d2;
	DB_LOCK_ILOCK *l1, *l2;

	d1 = a;
	d2 = b;

	/* Force all non-standard locks to sort at end. */
	if (d1->size != sizeof(DB_LOCK_ILOCK)) {
		if (d2->size != sizeof(DB_LOCK_ILOCK))
			return (d1->size - d2->size);
		else
			return (1);
	} else if (d2->size != sizeof(DB_LOCK_ILOCK))
		return (-1);

	l1 = d1->data;
	l2 = d2->data;
	if (l1->type != l2->type)
		return (l1->type - l2->type);
	return (memcmp(l1->fileid, l2->fileid, DB_FILE_ID_LEN));
}