/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2000,2008 Oracle.  All rights reserved.
 *
 * $Id: db_cam.c,v 12.79 2008/05/07 12:27:32 bschmeck Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
#include "dbinc/hash.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"
#include "dbinc/qam.h"
#include "dbinc/txn.h"

static int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *));
static int __db_s_count __P((DB *));
static int __db_wrlock_err __P((ENV *));
static int __dbc_cleanup __P((DBC *, DBC *, int));
static int __dbc_del_foreign __P((DBC *));
static int __dbc_del_oldskey __P((DB *, DBC *, DBT *, DBT *, DBT *));
static int __dbc_del_secondary __P((DBC *));
static int __dbc_pget_recno __P((DBC *, DBT *, DBT *, u_int32_t));

/*
 * CDB_LOCKING_INIT --
 *	Prologue for cursor operations that write.  Does nothing unless
 *	CDB_LOCKING(env) is true.
 *
 *	Hidden dependencies: the expansion assigns to a variable named "ret"
 *	and contains "return" statements, so it may only be used inside an
 *	int-returning function that declares "ret".  The expansion is a bare
 *	"if" statement (not wrapped in do/while).
 */
#define	CDB_LOCKING_INIT(env, dbc)					\
	/*								\
	 * If we are running CDB, this had better be either a write	\
	 * cursor or an immediate writer.  If it's a regular writer,	\
	 * that means we have an IWRITE lock and we need to upgrade	\
	 * it to a write lock.						\
	 */								\
	if (CDB_LOCKING(env)) {						\
		if (!F_ISSET(dbc, DBC_WRITECURSOR | DBC_WRITER))	\
			return (__db_wrlock_err(env));			\
									\
		if (F_ISSET(dbc, DBC_WRITECURSOR) &&			\
		    (ret = __lock_get(env,				\
		    (dbc)->locker, DB_LOCK_UPGRADE, &(dbc)->lock_dbt,	\
		    DB_LOCK_WRITE, &(dbc)->mylock)) != 0)		\
			return (ret);					\
	}
/*
 * CDB_LOCKING_DONE --
 *	Epilogue matching CDB_LOCKING_INIT: for a DBC_WRITECURSOR cursor,
 *	downgrade mylock back to DB_LOCK_IWRITE.  The return value of
 *	__lock_downgrade is deliberately discarded.
 *
 *	The expansion is a bare "if" statement that already ends in a
 *	semicolon.
 */
#define	CDB_LOCKING_DONE(env, dbc)					\
	/* Release the upgraded lock. */				\
	if (F_ISSET(dbc, DBC_WRITECURSOR))				\
		(void)__lock_downgrade(					\
		    env, &(dbc)->mylock, DB_LOCK_IWRITE, 0);

/*
 * __dbc_close --
 *	DBC->close.
 *
 *	Takes the cursor (and its off-page duplicate cursor, if it has one)
 *	off the database handle's active queue, calls the access method's
 *	close routine, releases the CDB lock if CDB is in use, and puts the
 *	cursor(s) on the handle's free queue.  If the cursor's transaction is
 *	marked TXN_PRIVATE and this was its last cursor, the transaction is
 *	committed.
 *
 *	dbc:	the cursor to close.
 *
 *	Returns 0 on success, otherwise the first non-zero value returned by
 *	am_close, __LPUT or __txn_commit.  The later steps are still
 *	performed after an earlier step has failed.
 *
 * PUBLIC: int __dbc_close __P((DBC *));
 */
int
__dbc_close(dbc)
	DBC *dbc;
{
	DB *dbp;
	DBC *opd;
	DBC_INTERNAL *cp;
	DB_TXN *txn;
	ENV *env;
	int ret, t_ret;

	dbp = dbc->dbp;
	env = dbp->env;
	cp = dbc->internal;
	opd = cp->opd;
	ret = 0;

	/*
	 * Remove the cursor(s) from the active queue.  We may be closing two
	 * cursors at once here, a top-level one and a lower-level, off-page
	 * duplicate one.  The access-method specific cursor close routine must
	 * close both of them in a single call.
	 *
	 * !!!
	 * Cursors must be removed from the active queue before calling the
	 * access specific cursor close routine, btree depends on having that
	 * order of operations.
	 */
	MUTEX_LOCK(env, dbp->mutex);

	if (opd != NULL) {
		DB_ASSERT(env, F_ISSET(opd, DBC_ACTIVE));
		F_CLR(opd, DBC_ACTIVE);
		TAILQ_REMOVE(&dbp->active_queue, opd, links);
	}
	DB_ASSERT(env, F_ISSET(dbc, DBC_ACTIVE));
	F_CLR(dbc, DBC_ACTIVE);
	TAILQ_REMOVE(&dbp->active_queue, dbc, links);

	MUTEX_UNLOCK(env, dbp->mutex);

	/*
	 * Call the access specific cursor close routine.  From here on,
	 * errors are recorded in ret (first one wins) but do not stop the
	 * rest of the close from running.
	 */
	if ((t_ret =
	    dbc->am_close(dbc, PGNO_INVALID, NULL)) != 0 && ret == 0)
		ret = t_ret;

	/*
	 * Release the lock after calling the access method specific close
	 * routine, a Btree cursor may have had pending deletes.
	 */
	if (CDB_LOCKING(env)) {
		/*
		 * Also, be sure not to free anything if mylock.off is
		 * INVALID;  in some cases, such as idup'ed read cursors
		 * and secondary update cursors, a cursor in a CDB
		 * environment may not have a lock at all.
		 *
		 * NOTE(review): no such check is visible here, so it is
		 * presumably performed inside the __LPUT macro -- confirm.
		 */
		if ((t_ret = __LPUT(dbc, dbc->mylock)) != 0 && ret == 0)
			ret = t_ret;

		/* For safety's sake, since this is going on the free queue. */
		memset(&dbc->mylock, 0, sizeof(dbc->mylock));
		if (opd != NULL)
			memset(&opd->mylock, 0, sizeof(opd->mylock));
	}

	/* One fewer cursor open in the transaction (if there is one). */
	if ((txn = dbc->txn) != NULL)
		txn->cursors--;

	/* Move the cursor(s) to the free queue. */
	MUTEX_LOCK(env, dbp->mutex);
	if (opd != NULL) {
		/* The off-page duplicate cursor is counted separately. */
		if (txn != NULL)
			txn->cursors--;
		TAILQ_INSERT_TAIL(&dbp->free_queue, opd, links);
		opd = NULL;
	}
	TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links);
	MUTEX_UNLOCK(env, dbp->mutex);

	/*
	 * A TXN_PRIVATE transaction is committed here once its cursor count
	 * has dropped to zero.
	 */
	if (txn != NULL && F_ISSET(txn, TXN_PRIVATE) && txn->cursors == 0 &&
	    (t_ret = __txn_commit(txn, 0)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __dbc_destroy --
 *	Destroy the cursor, called after DBC->close.
* * PUBLIC: int __dbc_destroy __P((DBC *)); */ int __dbc_destroy(dbc) DBC *dbc; { DB *dbp; ENV *env; int ret, t_ret; dbp = dbc->dbp; env = dbp->env; /* Remove the cursor from the free queue. */ MUTEX_LOCK(env, dbp->mutex); TAILQ_REMOVE(&dbp->free_queue, dbc, links); MUTEX_UNLOCK(env, dbp->mutex); /* Free up allocated memory. */ if (dbc->my_rskey.data != NULL) __os_free(env, dbc->my_rskey.data); if (dbc->my_rkey.data != NULL) __os_free(env, dbc->my_rkey.data); if (dbc->my_rdata.data != NULL) __os_free(env, dbc->my_rdata.data); /* Call the access specific cursor destroy routine. */ ret = dbc->am_destroy == NULL ? 0 : dbc->am_destroy(dbc); /* * Release the lock id for this cursor. */ if (LOCKING_ON(env) && F_ISSET(dbc, DBC_OWN_LID) && (t_ret = __lock_id_free(env, dbc->lref)) != 0 && ret == 0) ret = t_ret; __os_free(env, dbc); return (ret); } /* * __dbc_count -- * Return a count of duplicate data items. * * PUBLIC: int __dbc_count __P((DBC *, db_recno_t *)); */ int __dbc_count(dbc, recnop) DBC *dbc; db_recno_t *recnop; { ENV *env; int ret; env = dbc->env; /* * Cursor Cleanup Note: * All of the cursors passed to the underlying access methods by this * routine are not duplicated and will not be cleaned up on return. * So, pages/locks that the cursor references must be resolved by the * underlying functions. */ switch (dbc->dbtype) { case DB_QUEUE: case DB_RECNO: *recnop = 1; break; case DB_HASH: if (dbc->internal->opd == NULL) { if ((ret = __hamc_count(dbc, recnop)) != 0) return (ret); break; } /* FALLTHROUGH */ case DB_BTREE: if ((ret = __bamc_count(dbc, recnop)) != 0) return (ret); break; case DB_UNKNOWN: default: return (__db_unknown_type(env, "__dbc_count", dbc->dbtype)); } return (0); } /* * __dbc_del -- * DBC->del. 
* * PUBLIC: int __dbc_del __P((DBC *, u_int32_t)); */ int __dbc_del(dbc, flags) DBC *dbc; u_int32_t flags; { DB *dbp; DBC *opd; ENV *env; int ret, t_ret; dbp = dbc->dbp; env = dbp->env; /* * Cursor Cleanup Note: * All of the cursors passed to the underlying access methods by this * routine are not duplicated and will not be cleaned up on return. * So, pages/locks that the cursor references must be resolved by the * underlying functions. */ CDB_LOCKING_INIT(env, dbc); /* * If we're a secondary index, and DB_UPDATE_SECONDARY isn't set * (which it only is if we're being called from a primary update), * then we need to call through to the primary and delete the item. * * Note that this will delete the current item; we don't need to * delete it ourselves as well, so we can just goto done. */ if (flags != DB_UPDATE_SECONDARY && F_ISSET(dbp, DB_AM_SECONDARY)) { ret = __dbc_del_secondary(dbc); goto done; } /* * If we are a foreign db, go through and check any foreign key * constraints first, which will make rolling back changes on an abort * simpler. */ if (LIST_FIRST(&dbp->f_primaries) != NULL && (ret = __dbc_del_foreign(dbc)) != 0) goto done; /* * If we are a primary and have secondary indices, go through * and delete any secondary keys that point at the current record. */ if (LIST_FIRST(&dbp->s_secondaries) != NULL && (ret = __dbc_del_primary(dbc)) != 0) goto done; /* * Off-page duplicate trees are locked in the primary tree, that is, * we acquire a write lock in the primary tree and no locks in the * off-page dup tree. If the del operation is done in an off-page * duplicate tree, call the primary cursor's upgrade routine first. */ opd = dbc->internal->opd; if (opd == NULL) ret = dbc->am_del(dbc); else if ((ret = dbc->am_writelock(dbc)) == 0) ret = opd->am_del(opd); /* * If this was an update that is supporting dirty reads * then we may have just swapped our read for a write lock * which is held by the surviving cursor. We need * to explicitly downgrade this lock. 
The closed cursor * may only have had a read lock. */ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) && dbc->internal->lock_mode == DB_LOCK_WRITE) { if ((t_ret = __TLPUT(dbc, dbc->internal->lock)) != 0 && ret == 0) ret = t_ret; if (t_ret == 0) dbc->internal->lock_mode = DB_LOCK_WWRITE; } done: CDB_LOCKING_DONE(env, dbc); return (ret); } /* * __dbc_dup -- * Duplicate a cursor * * PUBLIC: int __dbc_dup __P((DBC *, DBC **, u_int32_t)); */ int __dbc_dup(dbc_orig, dbcp, flags) DBC *dbc_orig; DBC **dbcp; u_int32_t flags; { DBC *dbc_n, *dbc_nopd; int ret; dbc_n = dbc_nopd = NULL; /* Allocate a new cursor and initialize it. */ if ((ret = __dbc_idup(dbc_orig, &dbc_n, flags)) != 0) goto err; *dbcp = dbc_n; /* * If the cursor references an off-page duplicate tree, allocate a * new cursor for that tree and initialize it. */ if (dbc_orig->internal->opd != NULL) { if ((ret = __dbc_idup(dbc_orig->internal->opd, &dbc_nopd, flags)) != 0) goto err; dbc_n->internal->opd = dbc_nopd; } return (0); err: if (dbc_n != NULL) (void)__dbc_close(dbc_n); if (dbc_nopd != NULL) (void)__dbc_close(dbc_nopd); return (ret); } /* * __dbc_idup -- * Internal version of __dbc_dup. * * PUBLIC: int __dbc_idup __P((DBC *, DBC **, u_int32_t)); */ int __dbc_idup(dbc_orig, dbcp, flags) DBC *dbc_orig, **dbcp; u_int32_t flags; { DB *dbp; DBC *dbc_n; DBC_INTERNAL *int_n, *int_orig; ENV *env; int ret; dbp = dbc_orig->dbp; dbc_n = *dbcp; env = dbp->env; if ((ret = __db_cursor_int(dbp, dbc_orig->thread_info, dbc_orig->txn, dbc_orig->dbtype, dbc_orig->internal->root, F_ISSET(dbc_orig, DBC_OPD) | DBC_DUPLICATE, dbc_orig->locker, &dbc_n)) != 0) return (ret); /* Position the cursor if requested, acquiring the necessary locks. 
*/ if (flags == DB_POSITION) { int_n = dbc_n->internal; int_orig = dbc_orig->internal; dbc_n->flags |= dbc_orig->flags & ~DBC_OWN_LID; int_n->indx = int_orig->indx; int_n->pgno = int_orig->pgno; int_n->root = int_orig->root; int_n->lock_mode = int_orig->lock_mode; switch (dbc_orig->dbtype) { case DB_QUEUE: if ((ret = __qamc_dup(dbc_orig, dbc_n)) != 0) goto err; break; case DB_BTREE: case DB_RECNO: if ((ret = __bamc_dup(dbc_orig, dbc_n)) != 0) goto err; break; case DB_HASH: if ((ret = __hamc_dup(dbc_orig, dbc_n)) != 0) goto err; break; case DB_UNKNOWN: default: ret = __db_unknown_type(env, "__dbc_idup", dbc_orig->dbtype); goto err; } } /* Copy the locking flags to the new cursor. */ F_SET(dbc_n, F_ISSET(dbc_orig, DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED | DBC_WRITECURSOR)); /* * If we're in CDB and this isn't an offpage dup cursor, then * we need to get a lock for the duplicated cursor. */ if (CDB_LOCKING(env) && !F_ISSET(dbc_n, DBC_OPD) && (ret = __lock_get(env, dbc_n->locker, 0, &dbc_n->lock_dbt, F_ISSET(dbc_orig, DBC_WRITECURSOR) ? DB_LOCK_IWRITE : DB_LOCK_READ, &dbc_n->mylock)) != 0) goto err; dbc_n->priority = dbc_orig->priority; *dbcp = dbc_n; return (0); err: (void)__dbc_close(dbc_n); return (ret); } /* * __dbc_newopd -- * Create a new off-page duplicate cursor. * * PUBLIC: int __dbc_newopd __P((DBC *, db_pgno_t, DBC *, DBC **)); */ int __dbc_newopd(dbc_parent, root, oldopd, dbcp) DBC *dbc_parent; db_pgno_t root; DBC *oldopd; DBC **dbcp; { DB *dbp; DBC *opd; DBTYPE dbtype; int ret; dbp = dbc_parent->dbp; dbtype = (dbp->dup_compare == NULL) ? DB_RECNO : DB_BTREE; /* * On failure, we want to default to returning the old off-page dup * cursor, if any; our caller can't be left with a dangling pointer * to a freed cursor. On error the only allowable behavior is to * close the cursor (and the old OPD cursor it in turn points to), so * this should be safe. 
*/ *dbcp = oldopd; if ((ret = __db_cursor_int(dbp, dbc_parent->thread_info, dbc_parent->txn, dbtype, root, DBC_OPD, dbc_parent->locker, &opd)) != 0) return (ret); opd->priority = dbc_parent->priority; *dbcp = opd; /* * Check to see if we already have an off-page dup cursor that we've * passed in. If we do, close it. It'd be nice to use it again * if it's a cursor belonging to the right tree, but if we're doing * a cursor-relative operation this might not be safe, so for now * we'll take the easy way out and always close and reopen. * * Note that under no circumstances do we want to close the old * cursor without returning a valid new one; we don't want to * leave the main cursor in our caller with a non-NULL pointer * to a freed off-page dup cursor. */ if (oldopd != NULL && (ret = __dbc_close(oldopd)) != 0) return (ret); return (0); } /* * __dbc_get -- * Get using a cursor. * * PUBLIC: int __dbc_get __P((DBC *, DBT *, DBT *, u_int32_t)); */ int __dbc_get(dbc_arg, key, data, flags) DBC *dbc_arg; DBT *key, *data; u_int32_t flags; { DB *dbp; DBC *dbc, *dbc_n, *opd; DBC_INTERNAL *cp, *cp_n; DB_MPOOLFILE *mpf; ENV *env; db_pgno_t pgno; db_indx_t indx_off; u_int32_t multi, orig_ulen, tmp_flags, tmp_read_uncommitted, tmp_rmw; u_int8_t type; int key_small, ret, t_ret; COMPQUIET(orig_ulen, 0); key_small = 0; /* * Cursor Cleanup Note: * All of the cursors passed to the underlying access methods by this * routine are duplicated cursors. On return, any referenced pages * will be discarded, and, if the cursor is not intended to be used * again, the close function will be called. So, pages/locks that * the cursor references do not need to be resolved by the underlying * functions. */ dbp = dbc_arg->dbp; env = dbp->env; mpf = dbp->mpf; dbc_n = NULL; opd = NULL; /* Clear OR'd in additional bits so we can check for flag equality. 
*/ tmp_rmw = LF_ISSET(DB_RMW); LF_CLR(DB_RMW); tmp_read_uncommitted = LF_ISSET(DB_READ_UNCOMMITTED) && !F_ISSET(dbc_arg, DBC_READ_UNCOMMITTED); LF_CLR(DB_READ_UNCOMMITTED); multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY); LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY); /* * Return a cursor's record number. It has nothing to do with the * cursor get code except that it was put into the interface. */ if (flags == DB_GET_RECNO) { if (tmp_rmw) F_SET(dbc_arg, DBC_RMW); if (tmp_read_uncommitted) F_SET(dbc_arg, DBC_READ_UNCOMMITTED); ret = __bamc_rget(dbc_arg, data); if (tmp_rmw) F_CLR(dbc_arg, DBC_RMW); if (tmp_read_uncommitted) F_CLR(dbc_arg, DBC_READ_UNCOMMITTED); return (ret); } if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT) CDB_LOCKING_INIT(env, dbc_arg); /* Don't return the key or data if it was passed to us. */ if (!DB_RETURNS_A_KEY(dbp, flags)) F_SET(key, DB_DBT_ISSET); if (flags == DB_GET_BOTH && (dbp->dup_compare == NULL || dbp->dup_compare == __bam_defcmp)) F_SET(data, DB_DBT_ISSET); /* * If we have an off-page duplicates cursor, and the operation applies * to it, perform the operation. Duplicate the cursor and call the * underlying function. * * Off-page duplicate trees are locked in the primary tree, that is, * we acquire a write lock in the primary tree and no locks in the * off-page dup tree. If the DB_RMW flag was specified and the get * operation is done in an off-page duplicate tree, call the primary * cursor's upgrade routine first. 
*/ cp = dbc_arg->internal; if (cp->opd != NULL && (flags == DB_CURRENT || flags == DB_GET_BOTHC || flags == DB_NEXT || flags == DB_NEXT_DUP || flags == DB_PREV || flags == DB_PREV_DUP)) { if (tmp_rmw && (ret = dbc_arg->am_writelock(dbc_arg)) != 0) goto err; if (F_ISSET(dbc_arg, DBC_TRANSIENT)) opd = cp->opd; else if ((ret = __dbc_idup(cp->opd, &opd, DB_POSITION)) != 0) goto err; switch (ret = opd->am_get(opd, key, data, flags, NULL)) { case 0: goto done; case DB_NOTFOUND: /* * Translate DB_NOTFOUND failures for the DB_NEXT and * DB_PREV operations into a subsequent operation on * the parent cursor. */ if (flags == DB_NEXT || flags == DB_PREV) { if ((ret = __dbc_close(opd)) != 0) goto err; opd = NULL; if (F_ISSET(dbc_arg, DBC_TRANSIENT)) cp->opd = NULL; break; } goto err; default: goto err; } } else if (cp->opd != NULL && F_ISSET(dbc_arg, DBC_TRANSIENT)) { if ((ret = __dbc_close(cp->opd)) != 0) goto err; cp->opd = NULL; } /* * Perform an operation on the main cursor. Duplicate the cursor, * upgrade the lock as required, and call the underlying function. */ switch (flags) { case DB_CURRENT: case DB_GET_BOTHC: case DB_NEXT: case DB_NEXT_DUP: case DB_NEXT_NODUP: case DB_PREV: case DB_PREV_DUP: case DB_PREV_NODUP: tmp_flags = DB_POSITION; break; default: tmp_flags = 0; break; } if (tmp_read_uncommitted) F_SET(dbc_arg, DBC_READ_UNCOMMITTED); /* * If this cursor is going to be closed immediately, we don't * need to take precautions to clean it up on error. 
*/ if (F_ISSET(dbc_arg, DBC_TRANSIENT)) dbc_n = dbc_arg; else { ret = __dbc_idup(dbc_arg, &dbc_n, tmp_flags); if (tmp_read_uncommitted) F_CLR(dbc_arg, DBC_READ_UNCOMMITTED); if (ret != 0) goto err; COPY_RET_MEM(dbc_arg, dbc_n); } if (tmp_rmw) F_SET(dbc_n, DBC_RMW); switch (multi) { case DB_MULTIPLE: F_SET(dbc_n, DBC_MULTIPLE); break; case DB_MULTIPLE_KEY: F_SET(dbc_n, DBC_MULTIPLE_KEY); break; case DB_MULTIPLE | DB_MULTIPLE_KEY: F_SET(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY); break; case 0: default: break; } retry: pgno = PGNO_INVALID; ret = dbc_n->am_get(dbc_n, key, data, flags, &pgno); if (tmp_rmw) F_CLR(dbc_n, DBC_RMW); if (tmp_read_uncommitted) F_CLR(dbc_arg, DBC_READ_UNCOMMITTED); F_CLR(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY); if (ret != 0) goto err; cp_n = dbc_n->internal; /* * We may be referencing a new off-page duplicates tree. Acquire * a new cursor and call the underlying function. */ if (pgno != PGNO_INVALID) { if ((ret = __dbc_newopd(dbc_arg, pgno, cp_n->opd, &cp_n->opd)) != 0) goto err; switch (flags) { case DB_FIRST: case DB_NEXT: case DB_NEXT_NODUP: case DB_SET: case DB_SET_RECNO: case DB_SET_RANGE: tmp_flags = DB_FIRST; break; case DB_LAST: case DB_PREV: case DB_PREV_NODUP: tmp_flags = DB_LAST; break; case DB_GET_BOTH: case DB_GET_BOTHC: case DB_GET_BOTH_RANGE: tmp_flags = flags; break; default: ret = __db_unknown_flag(env, "__dbc_get", flags); goto err; } ret = cp_n->opd->am_get(cp_n->opd, key, data, tmp_flags, NULL); /* * Another cursor may have deleted all of the off-page * duplicates, so for DB_NEXT and DB_PREV operations we need to * retry on the parent cursor. */ switch (ret) { case 0: break; case DB_NOTFOUND: /* * Translate DB_NOTFOUND failures for the DB_NEXT and * DB_PREV operations into a subsequent operation on * the parent cursor. */ if (flags == DB_NEXT || flags == DB_PREV) { if ((ret = __dbc_close(cp_n->opd)) != 0) goto err; cp_n->opd = NULL; goto retry; } goto err; default: goto err; } } done: /* * Return a key/data item. 
The only exception is that we don't return * a key if the user already gave us one, that is, if the DB_SET flag * was set. The DB_SET flag is necessary. In a Btree, the user's key * doesn't have to be the same as the key stored the tree, depending on * the magic performed by the comparison function. As we may not have * done any key-oriented operation here, the page reference may not be * valid. Fill it in as necessary. We don't have to worry about any * locks, the cursor must already be holding appropriate locks. * * XXX * If not a Btree and DB_SET_RANGE is set, we shouldn't return a key * either, should we? */ cp_n = dbc_n == NULL ? dbc_arg->internal : dbc_n->internal; if (!F_ISSET(key, DB_DBT_ISSET)) { if (cp_n->page == NULL && (ret = __memp_fget(mpf, &cp_n->pgno, dbc_arg->thread_info, dbc_arg->txn, 0, &cp_n->page)) != 0) goto err; if ((ret = __db_ret(dbp, dbc_arg->thread_info, dbc_arg->txn, cp_n->page, cp_n->indx, key, &dbc_arg->rkey->data, &dbc_arg->rkey->ulen)) != 0) { /* * If the key DBT is too small, we still want to return * the size of the data. Otherwise applications are * forced to check each one with a separate call. We * don't want to copy the data, so we set the ulen to * zero before calling __db_ret. */ if (ret == DB_BUFFER_SMALL && F_ISSET(data, DB_DBT_USERMEM)) { key_small = 1; orig_ulen = data->ulen; data->ulen = 0; } else goto err; } } if (multi != 0) { /* * Even if fetching from the OPD cursor we need a duplicate * primary cursor if we are going after multiple keys. */ if (dbc_n == NULL) { /* * Non-"_KEY" DB_MULTIPLE doesn't move the main cursor, * so it's safe to just use dbc_arg, unless dbc_arg * has an open OPD cursor whose state might need to * be preserved. 
*/ if ((!(multi & DB_MULTIPLE_KEY) && dbc_arg->internal->opd == NULL) || F_ISSET(dbc_arg, DBC_TRANSIENT)) dbc_n = dbc_arg; else { if ((ret = __dbc_idup(dbc_arg, &dbc_n, DB_POSITION)) != 0) goto err; if ((ret = dbc_n->am_get(dbc_n, key, data, DB_CURRENT, &pgno)) != 0) goto err; } cp_n = dbc_n->internal; } /* * If opd is set then we dupped the opd that we came in with. * When we return we may have a new opd if we went to another * key. */ if (opd != NULL) { DB_ASSERT(env, cp_n->opd == NULL); cp_n->opd = opd; opd = NULL; } /* * Bulk get doesn't use __db_retcopy, so data.size won't * get set up unless there is an error. Assume success * here. This is the only call to am_bulk, and it avoids * setting it exactly the same everywhere. If we have an * DB_BUFFER_SMALL error, it'll get overwritten with the * needed value. */ data->size = data->ulen; ret = dbc_n->am_bulk(dbc_n, data, flags | multi); } else if (!F_ISSET(data, DB_DBT_ISSET)) { dbc = opd != NULL ? opd : cp_n->opd != NULL ? cp_n->opd : dbc_n; cp = dbc->internal; if (cp->page == NULL && (ret = __memp_fget(mpf, &cp->pgno, dbc_arg->thread_info, dbc->txn, 0, &cp->page)) != 0) goto err; type = TYPE(cp->page); indx_off = ((type == P_LBTREE || type == P_HASH || type == P_HASH_UNSORTED) ? O_INDX : 0); ret = __db_ret(dbp, dbc->thread_info, dbc->txn, cp->page, cp->indx + indx_off, data, &dbc_arg->rdata->data, &dbc_arg->rdata->ulen); } err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */ F_CLR(key, DB_DBT_ISSET); F_CLR(data, DB_DBT_ISSET); /* Cleanup and cursor resolution. */ if (opd != NULL) { /* * To support dirty reads we must reget the write lock * if we have just stepped off a deleted record. * Since the OPD cursor does not know anything * about the referencing page or cursor we need * to peek at the OPD cursor and get the lock here. 
*/ if (F_ISSET(dbc_arg->dbp, DB_AM_READ_UNCOMMITTED) && F_ISSET((BTREE_CURSOR *) dbc_arg->internal->opd->internal, C_DELETED)) if ((t_ret = dbc_arg->am_writelock(dbc_arg)) != 0 && ret == 0) ret = t_ret; if ((t_ret = __dbc_cleanup( dbc_arg->internal->opd, opd, ret)) != 0 && ret == 0) ret = t_ret; } if (key_small) { data->ulen = orig_ulen; if (ret == 0) ret = DB_BUFFER_SMALL; } if ((t_ret = __dbc_cleanup(dbc_arg, dbc_n, ret)) != 0 && (ret == 0 || ret == DB_BUFFER_SMALL)) ret = t_ret; if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT) CDB_LOCKING_DONE(env, dbc_arg); return (ret); } /* * __dbc_put -- * Put using a cursor. * * PUBLIC: int __dbc_put __P((DBC *, DBT *, DBT *, u_int32_t)); */ int __dbc_put(dbc_arg, key, data, flags) DBC *dbc_arg; DBT *key, *data; u_int32_t flags; { DB *dbp, *sdbp; DBC *dbc_n, *fdbc, *oldopd, *opd, *sdbc, *pdbc; DBT *all_skeys, *skeyp, *tskeyp; DBT fdata, olddata, oldpkey, newdata, pkey, temppkey, tempskey; ENV *env; db_pgno_t pgno; int cmp, have_oldrec, ispartial, nodel, re_pad, ret, s_count, t_ret; u_int32_t re_len, nskey, rmw, size, tmp_flags; /* * Cursor Cleanup Note: * All of the cursors passed to the underlying access methods by this * routine are duplicated cursors. On return, any referenced pages * will be discarded, and, if the cursor is not intended to be used * again, the close function will be called. So, pages/locks that * the cursor references do not need to be resolved by the underlying * functions. */ dbp = dbc_arg->dbp; env = dbp->env; sdbp = NULL; fdbc = pdbc = dbc_n = NULL; all_skeys = NULL; memset(&newdata, 0, sizeof(DBT)); ret = s_count = 0; /* * We do multiple cursor operations in some cases and subsequently * access the data DBT information. Set DB_DBT_MALLOC so we don't risk * modification of the data between our uses of it. 
*/ memset(&olddata, 0, sizeof(DBT)); F_SET(&olddata, DB_DBT_MALLOC); /* * Putting to secondary indices is forbidden; when we need * to internally update one, we'll call this with a private * synonym for DB_KEYLAST, DB_UPDATE_SECONDARY, which does * the right thing but won't return an error from cputchk(). */ if (flags == DB_UPDATE_SECONDARY) flags = DB_KEYLAST; CDB_LOCKING_INIT(env, dbc_arg); /* * Check to see if we are a primary and have secondary indices. * If we are not, we save ourselves a good bit of trouble and * just skip to the "normal" put. */ if (LIST_FIRST(&dbp->s_secondaries) == NULL) goto skip_s_update; /* * We have at least one secondary which we may need to update. * * There is a rather vile locking issue here. Secondary gets * will always involve acquiring a read lock in the secondary, * then acquiring a read lock in the primary. Ideally, we * would likewise perform puts by updating all the secondaries * first, then doing the actual put in the primary, to avoid * deadlock (since having multiple threads doing secondary * gets and puts simultaneously is probably a common case). * * However, if this put is a put-overwrite--and we have no way to * tell in advance whether it will be--we may need to delete * an outdated secondary key. In order to find that old * secondary key, we need to get the record we're overwriting, * before we overwrite it. * * (XXX: It would be nice to avoid this extra get, and have the * underlying put routines somehow pass us the old record * since they need to traverse the tree anyway. I'm saving * this optimization for later, as it's a lot of work, and it * would be hard to fit into this locking paradigm anyway.) * * The simple thing to do would be to go get the old record before * we do anything else. 
Unfortunately, though, doing so would * violate our "secondary, then primary" lock acquisition * ordering--even in the common case where no old primary record * exists, we'll still acquire and keep a lock on the page where * we're about to do the primary insert. * * To get around this, we do the following gyrations, which * hopefully solve this problem in the common case: * * 1) If this is a c_put(DB_CURRENT), go ahead and get the * old record. We already hold the lock on this page in * the primary, so no harm done, and we'll need the primary * key (which we weren't passed in this case) to do any * secondary puts anyway. * * 2) If we're doing a partial put, we need to perform the * get on the primary key right away, since we don't have * the whole datum that the secondary key is based on. * We may also need to pad out the record if the primary * has a fixed record length. * * 3) Loop through the secondary indices, putting into each a * new secondary key that corresponds to the new record. * * 4) If we haven't done so in (1) or (2), get the old primary * key/data pair. If one does not exist--the common case--we're * done with secondary indices, and can go straight on to the * primary put. * * 5) If we do have an old primary key/data pair, however, we need * to loop through all the secondaries a second time and delete * the old secondary in each. */ memset(&pkey, 0, sizeof(DBT)); s_count = __db_s_count(dbp); if ((ret = __os_calloc( env, (u_int)s_count, sizeof(DBT), &all_skeys)) != 0) goto err; have_oldrec = nodel = 0; /* * Primary indices can't have duplicates, so only DB_CURRENT, * DB_KEYFIRST, and DB_KEYLAST make any sense. Other flags * should have been caught by the checking routine, but * add a sprinkling of paranoia. */ DB_ASSERT(env, flags == DB_CURRENT || flags == DB_KEYFIRST || flags == DB_KEYLAST || flags == DB_NOOVERWRITE); /* * We'll want to use DB_RMW in a few places, but it's only legal * when locking is on. */ rmw = STD_LOCKING(dbc_arg) ? 
DB_RMW : 0; if (flags == DB_CURRENT) { /* Step 1. */ /* * This is safe to do on the cursor we already have; * error or no, it won't move. * * We use DB_RMW for all of these gets because we'll be * writing soon enough in the "normal" put code. In * transactional databases we'll hold those write locks * even if we close the cursor we're reading with. * * The DB_KEYEMPTY return needs special handling -- if the * cursor is on a deleted key, we return DB_NOTFOUND. */ ret = __dbc_get(dbc_arg, &pkey, &olddata, rmw | DB_CURRENT); if (ret == DB_KEYEMPTY) ret = DB_NOTFOUND; if (ret != 0) goto err; have_oldrec = 1; /* We've looked for the old record. */ } else { /* Set pkey so we can use &pkey everywhere instead of key. */ pkey.data = key->data; pkey.size = key->size; } /* * Check for partial puts (step 2). */ if (F_ISSET(data, DB_DBT_PARTIAL)) { if (!have_oldrec && !nodel) { /* * We're going to have to search the tree for the * specified key. Dup a cursor (so we have the same * locking info) and do a c_get. */ if ((ret = __dbc_idup(dbc_arg, &pdbc, 0)) != 0) goto err; /* We should have gotten DB_CURRENT in step 1. */ DB_ASSERT(env, flags != DB_CURRENT); ret = __dbc_get(pdbc, &pkey, &olddata, rmw | DB_SET); if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) { nodel = 1; ret = 0; } if ((t_ret = __dbc_close(pdbc)) != 0) ret = t_ret; if (ret != 0) goto err; have_oldrec = 1; } /* * Now build the new datum from olddata and the partial data we * were given. It's okay to do this if no record was returned * above: a partial put on an empty record is allowed, if a * little strange. The data is zero-padded. */ if ((ret = __db_buildpartial(dbp, &olddata, data, &newdata)) != 0) goto err; ispartial = 1; } else ispartial = 0; /* * Handle fixed-length records. If the primary database has * fixed-length records, we need to pad out the datum before * we pass it into the callback function; we always index the * "real" record. 
*/ if ((dbp->type == DB_RECNO && F_ISSET(dbp, DB_AM_FIXEDLEN)) || (dbp->type == DB_QUEUE)) { if (dbp->type == DB_QUEUE) { re_len = ((QUEUE *)dbp->q_internal)->re_len; re_pad = ((QUEUE *)dbp->q_internal)->re_pad; } else { re_len = ((BTREE *)dbp->bt_internal)->re_len; re_pad = ((BTREE *)dbp->bt_internal)->re_pad; } size = ispartial ? newdata.size : data->size; if (size > re_len) { ret = __db_rec_toobig(env, size, re_len); goto err; } else if (size < re_len) { /* * If we're not doing a partial put, copy * data->data into newdata.data, then pad out * newdata.data. * * If we're doing a partial put, the data * we want are already in newdata.data; we * just need to pad. * * Either way, realloc is safe. */ if ((ret = __os_realloc(env, re_len, &newdata.data)) != 0) goto err; if (!ispartial) memcpy(newdata.data, data->data, size); memset((u_int8_t *)newdata.data + size, re_pad, re_len - size); newdata.size = re_len; ispartial = 1; } } /* * Loop through the secondaries. (Step 3.) * * Note that __db_s_first and __db_s_next will take care of * thread-locking and refcounting issues. */ for (ret = __db_s_first(dbp, &sdbp), skeyp = all_skeys; sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp, dbc_arg->txn), ++skeyp) { DB_ASSERT(env, skeyp - all_skeys < s_count); /* * Don't process this secondary if the key is immutable and we * know that the old record exists. This optimization can't be * used if we have not checked for the old record yet. */ if (have_oldrec && !nodel && FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY)) continue; /* * Call the callback for this secondary, to get the * appropriate secondary key. */ if ((ret = sdbp->s_callback(sdbp, &pkey, ispartial ? &newdata : data, skeyp)) != 0) { /* Not indexing is equivalent to an empty key set. 
*/ if (ret == DB_DONOTINDEX) { F_SET(skeyp, DB_DBT_MULTIPLE); skeyp->size = 0; ret = 0; } else goto err; } if (sdbp->s_foreign != NULL && (ret = __db_cursor_int(sdbp->s_foreign, dbc_arg->thread_info, dbc_arg->txn, sdbp->s_foreign->type, PGNO_INVALID, 0, dbc_arg->locker, &fdbc)) != 0) goto err; /* * Mark the secondary key DBT(s) as set -- that is, the * callback returned at least one secondary key. * * Also, if this secondary index is associated with a foreign * database, check that the foreign db contains the key(s) to * maintain referential integrity. Set flags in fdata to avoid * mem copying, we just need to know existence. We need to do * this check before setting DB_DBT_ISSET, otherwise __dbc_get * will overwrite the flag values. */ if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) { #ifdef DIAGNOSTIC __db_check_skeyset(sdbp, skeyp); #endif for (tskeyp = (DBT *)skeyp->data, nskey = skeyp->size; nskey > 0; nskey--, tskeyp++) { if (fdbc != NULL) { memset(&fdata, 0, sizeof(DBT)); F_SET(&fdata, DB_DBT_PARTIAL | DB_DBT_USERMEM); if ((ret = __dbc_get( fdbc, tskeyp, &fdata, DB_SET | rmw)) == DB_NOTFOUND || ret == DB_KEYEMPTY) { ret = DB_FOREIGN_CONFLICT; break; } } F_SET(tskeyp, DB_DBT_ISSET); } tskeyp = (DBT *)skeyp->data; nskey = skeyp->size; } else { if (fdbc != NULL) { memset(&fdata, 0, sizeof(DBT)); F_SET(&fdata, DB_DBT_PARTIAL | DB_DBT_USERMEM); if ((ret = __dbc_get(fdbc, skeyp, &fdata, DB_SET | rmw)) == DB_NOTFOUND || ret == DB_KEYEMPTY) ret = DB_FOREIGN_CONFLICT; } F_SET(skeyp, DB_DBT_ISSET); tskeyp = skeyp; nskey = 1; } if (fdbc != NULL && (t_ret = __dbc_close(fdbc)) != 0 && ret == 0) ret = t_ret; fdbc = NULL; if (ret != 0) goto err; /* * If we have the old record, we can generate and remove any * old secondary key(s) now. We can also skip the secondary put * if there is no change. 
*/ if (have_oldrec) { if ((ret = __dbc_del_oldskey(sdbp, dbc_arg, skeyp, &pkey, &olddata)) == DB_KEYEXIST) continue; else if (ret != 0) goto err; } if (nskey == 0) continue; /* * Open a cursor in this secondary. * * Use the same locker ID as our primary cursor, so that * we're guaranteed that the locks don't conflict (e.g. in CDB * or if we're subdatabases that share and want to lock a * metadata page). */ if ((ret = __db_cursor_int(sdbp, dbc_arg->thread_info, dbc_arg->txn, sdbp->type, PGNO_INVALID, 0, dbc_arg->locker, &sdbc)) != 0) goto err; /* * If we're in CDB, updates will fail since the new cursor * isn't a writer. However, we hold the WRITE lock in the * primary and will for as long as our new cursor lasts, * and the primary and secondary share a lock file ID, * so it's safe to consider this a WRITER. The close * routine won't try to put anything because we don't * really have a lock. */ if (CDB_LOCKING(env)) { DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID); F_SET(sdbc, DBC_WRITER); } /* * Swap the primary key to the byte order of this secondary, if * necessary. By doing this now, we can compare directly * against the data already in the secondary without having to * swap it after reading. */ SWAP_IF_NEEDED(sdbp, &pkey); for (; nskey > 0 && ret == 0; nskey--, tskeyp++) { /* Skip this key if it is already in the database. */ if (!F_ISSET(tskeyp, DB_DBT_ISSET)) continue; /* * There are three cases here-- * 1) The secondary supports sorted duplicates. * If we attempt to put a secondary/primary pair * that already exists, that's a duplicate * duplicate, and c_put will return DB_KEYEXIST * (see __db_duperr). This will leave us with * exactly one copy of the secondary/primary pair, * and this is just right--we'll avoid deleting it * later, as the old and new secondaries will * match (since the old secondary is the dup dup * that's already there). * 2) The secondary supports duplicates, but they're not * sorted. 
We need to avoid putting a duplicate * duplicate, because the matching old and new * secondaries will prevent us from deleting * anything and we'll wind up with two secondary * records that point to the same primary key. Do * a c_get(DB_GET_BOTH); only do the put if the * secondary doesn't exist. * 3) The secondary doesn't support duplicates at all. * In this case, secondary keys must be unique; * if another primary key already exists for this * secondary key, we have to either overwrite it * or not put this one, and in either case we've * corrupted the secondary index. Do a * c_get(DB_SET). If the secondary/primary pair * already exists, do nothing; if the secondary * exists with a different primary, return an * error; and if the secondary does not exist, * put it. */ if (!F_ISSET(sdbp, DB_AM_DUP)) { /* Case 3. */ memset(&oldpkey, 0, sizeof(DBT)); F_SET(&oldpkey, DB_DBT_MALLOC); ret = __dbc_get(sdbc, tskeyp, &oldpkey, rmw | DB_SET); if (ret == 0) { cmp = __bam_defcmp(sdbp, &oldpkey, &pkey); __os_ufree(env, oldpkey.data); if (cmp != 0) { __db_errx(env, "%s%s", "Put results in a non-unique secondary key in an ", "index not configured to support duplicates"); ret = EINVAL; } } if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY) break; } else if (!F_ISSET(sdbp, DB_AM_DUPSORT)) { /* Case 2. */ DB_INIT_DBT(tempskey, tskeyp->data, tskeyp->size); DB_INIT_DBT(temppkey, pkey.data, pkey.size); ret = __dbc_get(sdbc, &tempskey, &temppkey, rmw | DB_GET_BOTH); if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY) break; } ret = __dbc_put(sdbc, tskeyp, &pkey, DB_UPDATE_SECONDARY); /* * We don't know yet whether this was a put-overwrite * that in fact changed nothing. If it was, we may get * DB_KEYEXIST. This is not an error. */ if (ret == DB_KEYEXIST) ret = 0; } /* Make sure the primary key is back in native byte-order. 
*/ SWAP_IF_NEEDED(sdbp, &pkey); if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0) ret = t_ret; if (ret != 0) goto err; /* * Mark that we have a key for this secondary so we can check * it later before deleting the old one. We can't set it * earlier or it would be cleared in the calls above. */ F_SET(skeyp, DB_DBT_ISSET); } if (ret != 0) goto err; /* * If we've already got the old primary key/data pair, the secondary * updates are already done. */ if (have_oldrec) goto skip_s_update; /* * If still necessary, go get the old primary key/data. (Step 4.) * * See the comments in step 2. This is real familiar. */ if ((ret = __dbc_idup(dbc_arg, &pdbc, 0)) != 0) goto err; DB_ASSERT(env, flags != DB_CURRENT); pkey.data = key->data; pkey.size = key->size; ret = __dbc_get(pdbc, &pkey, &olddata, rmw | DB_SET); if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) { nodel = 1; ret = 0; } if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0) ret = t_ret; if (ret != 0) goto err; /* * Check whether we do in fact have an old record we may need to * delete. (Step 5). */ if (nodel) goto skip_s_update; for (ret = __db_s_first(dbp, &sdbp), skeyp = all_skeys; sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp, dbc_arg->txn), skeyp++) { DB_ASSERT(env, skeyp - all_skeys < s_count); /* * Don't process this secondary if the key is immutable. We * know that the old record exists, so this optimization can * always be used. */ if (FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY)) continue; if ((ret = __dbc_del_oldskey(sdbp, dbc_arg, skeyp, &pkey, &olddata)) != 0 && ret != DB_KEYEXIST) goto err; } if (ret != 0) goto err; /* Secondary index updates are now done. On to the "real" stuff. */ skip_s_update: /* * If we have an off-page duplicates cursor, and the operation applies * to it, perform the operation. Duplicate the cursor and call the * underlying function. 
* * Off-page duplicate trees are locked in the primary tree, that is, * we acquire a write lock in the primary tree and no locks in the * off-page dup tree. If the put operation is done in an off-page * duplicate tree, call the primary cursor's upgrade routine first. */ if (dbc_arg->internal->opd != NULL && (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)) { /* * A special case for hash off-page duplicates. Hash doesn't * support (and is documented not to support) put operations * relative to a cursor which references an already deleted * item. For consistency, apply the same criteria to off-page * duplicates as well. */ if (dbc_arg->dbtype == DB_HASH && F_ISSET( ((BTREE_CURSOR *)(dbc_arg->internal->opd->internal)), C_DELETED)) { ret = DB_NOTFOUND; goto err; } if ((ret = dbc_arg->am_writelock(dbc_arg)) != 0 || (ret = __dbc_dup(dbc_arg, &dbc_n, DB_POSITION)) != 0) goto err; opd = dbc_n->internal->opd; if ((ret = opd->am_put( opd, key, data, flags, NULL)) != 0) goto err; goto done; } /* * Perform an operation on the main cursor. Duplicate the cursor, * and call the underlying function. */ tmp_flags = flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT ? DB_POSITION : 0; /* * If this cursor is going to be closed immediately, we don't * need to take precautions to clean it up on error. */ if (F_ISSET(dbc_arg, DBC_TRANSIENT)) dbc_n = dbc_arg; else if ((ret = __dbc_idup(dbc_arg, &dbc_n, tmp_flags)) != 0) goto err; pgno = PGNO_INVALID; if ((ret = dbc_n->am_put(dbc_n, key, data, flags, &pgno)) != 0) goto err; /* * We may be referencing a new off-page duplicates tree. Acquire * a new cursor and call the underlying function. 
*/
	if (pgno != PGNO_INVALID) {
		oldopd = dbc_n->internal->opd;
		if ((ret = __dbc_newopd(dbc_arg, pgno, oldopd, &opd)) != 0) {
			dbc_n->internal->opd = opd;
			goto err;
		}

		dbc_n->internal->opd = opd;

		if (flags == DB_NOOVERWRITE)
			flags = DB_KEYLAST;
		if ((ret = opd->am_put(
		    opd, key, data, flags, NULL)) != 0)
			goto err;
	}

done:
err:	/* Cleanup and cursor resolution. */
	if ((t_ret = __dbc_cleanup(dbc_arg, dbc_n, ret)) != 0 && ret == 0)
		ret = t_ret;

	/* If newdata or olddata were used, free their buffers. */
	if (newdata.data != NULL)
		__os_free(env, newdata.data);
	if (olddata.data != NULL)
		__os_ufree(env, olddata.data);

	CDB_LOCKING_DONE(env, dbc_arg);

	if (sdbp != NULL &&
	    (t_ret = __db_s_done(sdbp, dbc_arg->txn)) != 0 && ret == 0)
		ret = t_ret;

	for (skeyp = all_skeys; skeyp - all_skeys < s_count; skeyp++) {
		if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
			for (nskey = skeyp->size, tskeyp = (DBT *)skeyp->data;
			    nskey > 0;
			    nskey--, tskeyp++)
				FREE_IF_NEEDED(env, tskeyp);
		}
		FREE_IF_NEEDED(env, skeyp);
	}
	if (all_skeys != NULL)
		__os_free(env, all_skeys);

	return (ret);
}

/*
 * __dbc_del_oldskey --
 *	Delete an old secondary key, if necessary.
 *	Returns DB_KEYEXIST if the new and old keys match.
 *
 *	sdbp:	 the secondary database; its s_callback is invoked on
 *		 (pkey, olddata) to regenerate the old secondary key(s).
 *	dbc_arg: the cursor the put is running on; its thread_info, txn and
 *		 locker are reused for the cursor opened in the secondary.
 *	skey:	 the new secondary key, or (DB_DBT_MULTIPLE) an array of
 *		 skey->size new keys.  DB_DBT_ISSET is cleared on every new
 *		 key that matches an old key.
 *	pkey:	 the primary key; swapped to the secondary's byte order only
 *		 for the duration of each lookup.
 *	olddata: the old primary data item.
 *
 *	Returns 0 when the callback reports DB_DONOTINDEX (nothing to
 *	delete), DB_KEYEXIST when no error occurred and the number of
 *	matched keys equals the number of new keys, and otherwise the first
 *	error encountered.
 */
static int
__dbc_del_oldskey(sdbp, dbc_arg, skey, pkey, olddata)
	DB *sdbp;
	DBC *dbc_arg;
	DBT *skey, *pkey, *olddata;
{
	DB *dbp;
	DBC *sdbc;
	DBT *toldskeyp, *tskeyp;
	DBT oldskey, temppkey, tempskey;
	ENV *env;
	int ret, t_ret;
	u_int32_t i, noldskey, nsame, nskey, rmw;

	sdbc = NULL;
	dbp = sdbp->s_primary;
	env = dbp->env;
	nsame = 0;
	rmw = STD_LOCKING(dbc_arg) ? DB_RMW : 0;

	/*
	 * Get the old secondary key.
	 *
	 * Note that the empty-key-set test below is only reached when the
	 * callback returned non-zero; any other non-zero return is passed
	 * straight back to the caller.
	 */
	memset(&oldskey, 0, sizeof(DBT));
	if ((ret = sdbp->s_callback(sdbp, pkey, olddata, &oldskey)) != 0) {
		if (ret == DB_DONOTINDEX ||
		    (F_ISSET(&oldskey, DB_DBT_MULTIPLE) && oldskey.size == 0))
			/* There's no old key to delete. */
			ret = 0;
		return (ret);
	}

	/* Normalize the old key(s) into an (array, count) pair. */
	if (F_ISSET(&oldskey, DB_DBT_MULTIPLE)) {
#ifdef DIAGNOSTIC
		__db_check_skeyset(sdbp, &oldskey);
#endif
		toldskeyp = (DBT *)oldskey.data;
		noldskey = oldskey.size;
	} else {
		toldskeyp = &oldskey;
		noldskey = 1;
	}

	/*
	 * Normalize the new key(s) the same way.  A single new key only
	 * counts if it has been marked DB_DBT_ISSET.
	 */
	if (F_ISSET(skey, DB_DBT_MULTIPLE)) {
		nskey = skey->size;
		skey = (DBT *)skey->data;
	} else
		nskey = F_ISSET(skey, DB_DBT_ISSET) ? 1 : 0;

	for (; noldskey > 0 && ret == 0; noldskey--, toldskeyp++) {
		/*
		 * Check whether this old secondary key is also a new key
		 * before we delete it.  Note that bt_compare is (and must be)
		 * set no matter what access method we're in.
		 */
		for (i = 0, tskeyp = skey; i < nskey; i++, tskeyp++)
			if (((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
			    toldskeyp, tskeyp) == 0) {
				nsame++;
				F_CLR(tskeyp, DB_DBT_ISSET);
				break;
			}

		/* The inner loop broke out early: the key is unchanged. */
		if (i < nskey) {
			FREE_IF_NEEDED(env, toldskeyp);
			continue;
		}

		/* Open the secondary cursor the first time it is needed. */
		if (sdbc == NULL) {
			if ((ret = __db_cursor_int(sdbp,
			    dbc_arg->thread_info, dbc_arg->txn, sdbp->type,
			    PGNO_INVALID, 0, dbc_arg->locker, &sdbc)) != 0)
				goto err;
			if (CDB_LOCKING(env)) {
				DB_ASSERT(env,
				    sdbc->mylock.off == LOCK_INVALID);
				F_SET(sdbc, DBC_WRITER);
			}
		}

		/*
		 * Don't let c_get(DB_GET_BOTH) stomp on our data.  Use
		 * temporary DBTs instead.
		 */
		SWAP_IF_NEEDED(sdbp, pkey);
		DB_INIT_DBT(temppkey, pkey->data, pkey->size);
		DB_INIT_DBT(tempskey, toldskeyp->data, toldskeyp->size);
		if ((ret = __dbc_get(sdbc,
		    &tempskey, &temppkey, rmw | DB_GET_BOTH)) == 0)
			ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY);
		else if (ret == DB_NOTFOUND)
			ret = __db_secondary_corrupt(dbp);
		SWAP_IF_NEEDED(sdbp, pkey);
		FREE_IF_NEEDED(env, toldskeyp);
	}

	/*
	 * Free whatever old keys the main loop did not reach (it stops on
	 * the first error), then the key set itself.
	 */
err:	for (; noldskey > 0; noldskey--, toldskeyp++)
		FREE_IF_NEEDED(env, toldskeyp);
	FREE_IF_NEEDED(env, &oldskey);
	if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
		ret = t_ret;
	if (ret == 0 && nsame == nskey)
		return (DB_KEYEXIST);
	return (ret);
}

/*
 * __db_duperr()
 *	Error message: we don't currently support sorted duplicate duplicates.
* PUBLIC: int __db_duperr __P((DB *, u_int32_t)); */ int __db_duperr(dbp, flags) DB *dbp; u_int32_t flags; { /* * If we run into this error while updating a secondary index, * don't yell--there's no clean way to pass DB_NODUPDATA in along * with DB_UPDATE_SECONDARY, but we may run into this problem * in a normal, non-error course of events. * * !!! * If and when we ever permit duplicate duplicates in sorted-dup * databases, we need to either change the secondary index code * to check for dup dups, or we need to maintain the implicit * "DB_NODUPDATA" behavior for databases with DB_AM_SECONDARY set. */ if (flags != DB_NODUPDATA && !F_ISSET(dbp, DB_AM_SECONDARY)) __db_errx(dbp->env, "Duplicate data items are not supported with sorted data"); return (DB_KEYEXIST); } /* * __dbc_cleanup -- * Clean up duplicate cursors. */ static int __dbc_cleanup(dbc, dbc_n, failed) DBC *dbc, *dbc_n; int failed; { DB *dbp; DBC *opd; DBC_INTERNAL *internal; DB_MPOOLFILE *mpf; int ret, t_ret; dbp = dbc->dbp; mpf = dbp->mpf; internal = dbc->internal; ret = 0; /* Discard any pages we're holding. */ if (internal->page != NULL) { if ((t_ret = __memp_fput(mpf, dbc->thread_info, internal->page, dbc->priority)) != 0 && ret == 0) ret = t_ret; internal->page = NULL; } opd = internal->opd; if (opd != NULL && opd->internal->page != NULL) { if ((t_ret = __memp_fput(mpf, dbc->thread_info, opd->internal->page, dbc->priority)) != 0 && ret == 0) ret = t_ret; opd->internal->page = NULL; } /* * If dbc_n is NULL, there's no internal cursor swapping to be done * and no dbc_n to close--we probably did the entire operation on an * offpage duplicate cursor. Just return. * * If dbc and dbc_n are the same, we're either inside a DB->{put/get} * operation, and as an optimization we performed the operation on * the main cursor rather than on a duplicated one, or we're in a * bulk get that can't have moved the cursor (DB_MULTIPLE with the * initial c_get operation on an off-page dup cursor). 
Just * return--either we know we didn't move the cursor, or we're going * to close it before we return to application code, so we're sure * not to visibly violate the "cursor stays put on error" rule. */ if (dbc_n == NULL || dbc == dbc_n) return (ret); if (dbc_n->internal->page != NULL) { if ((t_ret = __memp_fput(mpf, dbc->thread_info, dbc_n->internal->page, dbc->priority)) != 0 && ret == 0) ret = t_ret; dbc_n->internal->page = NULL; } opd = dbc_n->internal->opd; if (opd != NULL && opd->internal->page != NULL) { if ((t_ret = __memp_fput(mpf, dbc->thread_info, opd->internal->page, dbc->priority)) != 0 && ret == 0) ret = t_ret; opd->internal->page = NULL; } /* * If we didn't fail before entering this routine or just now when * freeing pages, swap the interesting contents of the old and new * cursors. */ if (!failed && ret == 0) { dbc->internal = dbc_n->internal; dbc_n->internal = internal; } /* * Close the cursor we don't care about anymore. The close can fail, * but we only expect DB_LOCK_DEADLOCK failures. This violates our * "the cursor is unchanged on error" semantics, but since all you can * do with a DB_LOCK_DEADLOCK failure is close the cursor, I believe * that's OK. * * XXX * There's no way to recover from failure to close the old cursor. * All we can do is move to the new position and return an error. * * XXX * We might want to consider adding a flag to the cursor, so that any * subsequent operations other than close just return an error? */ if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0) ret = t_ret; /* * If this was an update that is supporting dirty reads * then we may have just swapped our read for a write lock * which is held by the surviving cursor. We need * to explicitly downgrade this lock. The closed cursor * may only have had a read lock. 
*/ if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) && dbc->internal->lock_mode == DB_LOCK_WRITE) { if ((t_ret = __TLPUT(dbc, dbc->internal->lock)) != 0 && ret == 0) ret = t_ret; if (t_ret == 0) dbc->internal->lock_mode = DB_LOCK_WWRITE; } return (ret); } /* * __dbc_secondary_get_pp -- * This wrapper function for DBC->pget() is the DBC->get() function * for a secondary index cursor. * * PUBLIC: int __dbc_secondary_get_pp __P((DBC *, DBT *, DBT *, u_int32_t)); */ int __dbc_secondary_get_pp(dbc, skey, data, flags) DBC *dbc; DBT *skey, *data; u_int32_t flags; { DB_ASSERT(dbc->env, F_ISSET(dbc->dbp, DB_AM_SECONDARY)); return (__dbc_pget_pp(dbc, skey, NULL, data, flags)); } /* * __dbc_pget -- * Get a primary key/data pair through a secondary index. * * PUBLIC: int __dbc_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t)); */ int __dbc_pget(dbc, skey, pkey, data, flags) DBC *dbc; DBT *skey, *pkey, *data; u_int32_t flags; { DB *pdbp, *sdbp; DBC *dbc_n, *pdbc; DBT nullpkey; u_int32_t save_pkey_flags, tmp_flags, tmp_read_uncommitted, tmp_rmw; int pkeymalloc, ret, t_ret; sdbp = dbc->dbp; pdbp = sdbp->s_primary; dbc_n = NULL; pkeymalloc = t_ret = 0; /* * The challenging part of this function is getting the behavior * right for all the various permutations of DBT flags. The * next several blocks handle the various cases we need to * deal with specially. */ /* * We may be called with a NULL pkey argument, if we've been * wrapped by a 2-DBT get call. If so, we need to use our * own DBT. */ if (pkey == NULL) { memset(&nullpkey, 0, sizeof(DBT)); pkey = &nullpkey; } /* Clear OR'd in additional bits so we can check for flag equality. */ tmp_rmw = LF_ISSET(DB_RMW); LF_CLR(DB_RMW); tmp_read_uncommitted = LF_ISSET(DB_READ_UNCOMMITTED) && !F_ISSET(dbc, DBC_READ_UNCOMMITTED); LF_CLR(DB_READ_UNCOMMITTED); /* * DB_GET_RECNO is a special case, because we're interested not in * the primary key/data pair, but rather in the primary's record * number. 
*/ if (flags == DB_GET_RECNO) { if (tmp_rmw) F_SET(dbc, DBC_RMW); if (tmp_read_uncommitted) F_SET(dbc, DBC_READ_UNCOMMITTED); ret = __dbc_pget_recno(dbc, pkey, data, flags); if (tmp_rmw) F_CLR(dbc, DBC_RMW); if (tmp_read_uncommitted) F_CLR(dbc, DBC_READ_UNCOMMITTED); return (ret); } /* * If the DBTs we've been passed don't have any of the * user-specified memory management flags set, we want to make sure * we return values using the DBTs dbc->rskey, dbc->rkey, and * dbc->rdata, respectively. * * There are two tricky aspects to this: first, we need to pass * skey and pkey *in* to the initial c_get on the secondary key, * since either or both may be looked at by it (depending on the * get flag). Second, we must not use a normal DB->get call * on the secondary, even though that's what we want to accomplish, * because the DB handle may be free-threaded. Instead, * we open a cursor, then take steps to ensure that we actually use * the rkey/rdata from the *secondary* cursor. * * We accomplish all this by passing in the DBTs we started out * with to the c_get, but swapping the contents of rskey and rkey, * respectively, into rkey and rdata; __db_ret will treat them like * the normal key/data pair in a c_get call, and will realloc them as * need be (this is "step 1"). Then, for "step 2", we swap back * rskey/rkey/rdata to normal, and do a get on the primary with the * secondary dbc appointed as the owner of the returned-data memory. * * Note that in step 2, we copy the flags field in case we need to * pass down a DB_DBT_PARTIAL or other flag that is compatible with * letting DB do the memory management. */ /* * It is correct, though slightly sick, to attempt a partial get of a * primary key. However, if we do so here, we'll never find the * primary record; clear the DB_DBT_PARTIAL field of pkey just for the * duration of the next call. */ save_pkey_flags = pkey->flags; F_CLR(pkey, DB_DBT_PARTIAL); /* * Now we can go ahead with the meat of this call. 
First, get the * primary key from the secondary index. (What exactly we get depends * on the flags, but the underlying cursor get will take care of the * dirty work.) Duplicate the cursor, in case the later get on the * primary fails. */ switch (flags) { case DB_CURRENT: case DB_GET_BOTHC: case DB_NEXT: case DB_NEXT_DUP: case DB_NEXT_NODUP: case DB_PREV: case DB_PREV_DUP: case DB_PREV_NODUP: tmp_flags = DB_POSITION; break; default: tmp_flags = 0; break; } if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0) return (ret); F_SET(dbc_n, DBC_TRANSIENT); if (tmp_rmw) F_SET(dbc_n, DBC_RMW); if (tmp_read_uncommitted) F_SET(dbc_n, DBC_READ_UNCOMMITTED); /* * If we've been handed a primary key, it will be in native byte order, * so we need to swap it before reading from the secondary. */ if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) SWAP_IF_NEEDED(sdbp, pkey); retry: /* Step 1. */ dbc_n->rdata = dbc->rkey; dbc_n->rkey = dbc->rskey; ret = __dbc_get(dbc_n, skey, pkey, flags); /* Restore pkey's flags in case we stomped the PARTIAL flag. */ pkey->flags = save_pkey_flags; /* * We need to swap the primary key to native byte order if we read it * successfully, or if we swapped it on entry above. We can't return * with the application's data modified. */ if (ret == 0 || flags == DB_GET_BOTH || flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) SWAP_IF_NEEDED(sdbp, pkey); if (ret != 0) goto err; /* * Now we're ready for "step 2". If either or both of pkey and data do * not have memory management flags set--that is, if DB is managing * their memory--we need to swap around the rkey/rdata structures so * that we don't wind up trying to use memory managed by the primary * database cursor, which we'll close before we return. * * !!! * If you're carefully following the bouncing ball, you'll note that in * the DB-managed case, the buffer hanging off of pkey is the same as * dbc->rkey->data. 
This is just fine; we may well realloc and stomp * on it when we return, if we're doing a DB_GET_BOTH and need to * return a different partial or key (depending on the comparison * function), but this is safe. * * !!! * We need to use __db_cursor_int here rather than simply calling * pdbp->cursor, because otherwise, if we're in CDB, we'll allocate a * new locker ID and leave ourselves open to deadlocks. (Even though * we're only acquiring read locks, we'll still block if there are any * waiters.) */ if ((ret = __db_cursor_int(pdbp, dbc->thread_info, dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0) goto err; if (tmp_read_uncommitted || F_ISSET(dbc, DBC_READ_UNCOMMITTED)) F_SET(pdbc, DBC_READ_UNCOMMITTED); if (tmp_rmw || F_ISSET(dbc, DBC_RMW)) F_SET(pdbc, DBC_RMW); if (F_ISSET(dbc, DBC_READ_COMMITTED)) F_SET(pdbc, DBC_READ_COMMITTED); /* * We're about to use pkey a second time. If DB_DBT_MALLOC is set on * it, we'll leak the memory we allocated the first time. Thus, set * DB_DBT_REALLOC instead so that we reuse that memory instead of * leaking it. * * Alternatively, if the application is handling copying for pkey, we * need to take a copy now. The copy will be freed on exit from * __dbc_pget_pp (and we must be coming through there if DB_DBT_USERCOPY * is set). In the case of DB_GET_BOTH_RANGE, the pkey supplied by * the application has already been copied in but the value may have * changed in the search. In that case, free the original copy and get * a new one. * * !!! * This assumes that the user must always specify a compatible realloc * function if a malloc function is specified. I think this is a * reasonable requirement. */ if (F_ISSET(pkey, DB_DBT_MALLOC)) { F_CLR(pkey, DB_DBT_MALLOC); F_SET(pkey, DB_DBT_REALLOC); pkeymalloc = 1; } else if (F_ISSET(pkey, DB_DBT_USERCOPY)) { if (flags == DB_GET_BOTH_RANGE) __dbt_userfree(sdbp->env, NULL, pkey, NULL); if ((ret = __dbt_usercopy(sdbp->env, pkey)) != 0) goto err; } /* * Do the actual get. 
Set DBC_TRANSIENT since we don't care about * preserving the position on error, and it's faster. SET_RET_MEM so * that the secondary DBC owns any returned-data memory. */ F_SET(pdbc, DBC_TRANSIENT); SET_RET_MEM(pdbc, dbc); ret = __dbc_get(pdbc, pkey, data, DB_SET); /* * If the item wasn't found in the primary, this is a bug; our * secondary has somehow gotten corrupted, and contains elements that * don't correspond to anything in the primary. Complain. */ /* Now close the primary cursor. */ if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0) ret = t_ret; else if (ret == DB_NOTFOUND) { if (!F_ISSET(pdbc, DBC_READ_UNCOMMITTED)) ret = __db_secondary_corrupt(pdbp); else switch (flags) { case DB_GET_BOTHC: case DB_NEXT: case DB_NEXT_DUP: case DB_NEXT_NODUP: case DB_PREV: case DB_PREV_DUP: case DB_PREV_NODUP: goto retry; default: break; } } err: /* Cleanup and cursor resolution. */ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0) ret = t_ret; if (pkeymalloc) { /* * If pkey had a MALLOC flag, we need to restore it; otherwise, * if the user frees the buffer but reuses the DBT without * NULL'ing its data field or changing the flags, we may drop * core. */ F_CLR(pkey, DB_DBT_REALLOC); F_SET(pkey, DB_DBT_MALLOC); } return (ret); } /* * __dbc_pget_recno -- * Perform a DB_GET_RECNO c_pget on a secondary index. Returns * the secondary's record number in the pkey field and the primary's * in the data field. */ static int __dbc_pget_recno(sdbc, pkey, data, flags) DBC *sdbc; DBT *pkey, *data; u_int32_t flags; { DB *pdbp, *sdbp; DBC *pdbc; DBT discardme, primary_key; ENV *env; db_recno_t oob; u_int32_t rmw; int ret, t_ret; sdbp = sdbc->dbp; pdbp = sdbp->s_primary; env = sdbp->env; pdbc = NULL; ret = t_ret = 0; rmw = LF_ISSET(DB_RMW); memset(&discardme, 0, sizeof(DBT)); F_SET(&discardme, DB_DBT_USERMEM | DB_DBT_PARTIAL); oob = RECNO_OOB; /* * If the primary is an rbtree, we want its record number, whether * or not the secondary is one too. Fetch the recno into "data". 
* * If it's not an rbtree, return RECNO_OOB in "data". */ if (F_ISSET(pdbp, DB_AM_RECNUM)) { /* * Get the primary key, so we can find the record number * in the primary. (We're uninterested in the secondary key.) */ memset(&primary_key, 0, sizeof(DBT)); F_SET(&primary_key, DB_DBT_MALLOC); if ((ret = __dbc_get(sdbc, &discardme, &primary_key, rmw | DB_CURRENT)) != 0) return (ret); /* * Open a cursor on the primary, set it to the right record, * and fetch its recno into "data". * * (See __dbc_pget for comments on the use of __db_cursor_int.) * * SET_RET_MEM so that the secondary DBC owns any returned-data * memory. */ if ((ret = __db_cursor_int(pdbp, sdbc->thread_info, sdbc->txn, pdbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0) goto perr; SET_RET_MEM(pdbc, sdbc); if ((ret = __dbc_get(pdbc, &primary_key, &discardme, rmw | DB_SET)) != 0) goto perr; ret = __dbc_get(pdbc, &discardme, data, rmw | DB_GET_RECNO); perr: __os_ufree(env, primary_key.data); if (pdbc != NULL && (t_ret = __dbc_close(pdbc)) != 0 && ret == 0) ret = t_ret; if (ret != 0) return (ret); } else if ((ret = __db_retcopy(env, data, &oob, sizeof(oob), &sdbc->rkey->data, &sdbc->rkey->ulen)) != 0) return (ret); /* * If the secondary is an rbtree, we want its record number, whether * or not the primary is one too. Fetch the recno into "pkey". * * If it's not an rbtree, return RECNO_OOB in "pkey". */ if (F_ISSET(sdbp, DB_AM_RECNUM)) return (__dbc_get(sdbc, &discardme, pkey, flags)); else return (__db_retcopy(env, pkey, &oob, sizeof(oob), &sdbc->rdata->data, &sdbc->rdata->ulen)); } /* * __db_wrlock_err -- do not have a write lock. */ static int __db_wrlock_err(env) ENV *env; { __db_errx(env, "Write attempted on read-only cursor"); return (EPERM); } /* * __dbc_del_secondary -- * Perform a delete operation on a secondary index: call through * to the primary and delete the primary record that this record * points to. 
* * Note that deleting the primary record will call c_del on all * the secondaries, including this one; thus, it is not necessary * to execute both this function and an actual delete. */ static int __dbc_del_secondary(dbc) DBC *dbc; { DB *pdbp; DBC *pdbc; DBT skey, pkey; ENV *env; int ret, t_ret; u_int32_t rmw; pdbp = dbc->dbp->s_primary; env = pdbp->env; rmw = STD_LOCKING(dbc) ? DB_RMW : 0; /* * Get the current item that we're pointing at. * We don't actually care about the secondary key, just * the primary. */ memset(&skey, 0, sizeof(DBT)); memset(&pkey, 0, sizeof(DBT)); F_SET(&skey, DB_DBT_PARTIAL | DB_DBT_USERMEM); if ((ret = __dbc_get(dbc, &skey, &pkey, DB_CURRENT)) != 0) return (ret); SWAP_IF_NEEDED(dbc->dbp, &pkey); /* * Create a cursor on the primary with our locker ID, * so that when it calls back, we don't conflict. * * We create a cursor explicitly because there's no * way to specify the same locker ID if we're using * locking but not transactions if we use the DB->del * interface. This shouldn't be any less efficient * anyway. */ if ((ret = __db_cursor_int(pdbp, dbc->thread_info, dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0) return (ret); /* * See comment in __dbc_put--if we're in CDB, * we already hold the locks we need, and we need to flag * the cursor as a WRITER so we don't run into errors * when we try to delete. */ if (CDB_LOCKING(env)) { DB_ASSERT(env, pdbc->mylock.off == LOCK_INVALID); F_SET(pdbc, DBC_WRITER); } /* * Set the new cursor to the correct primary key. Then * delete it. We don't really care about the datum; * just reuse our skey DBT. * * If the primary get returns DB_NOTFOUND, something is amiss-- * every record in the secondary should correspond to some record * in the primary. 
*/ if ((ret = __dbc_get(pdbc, &pkey, &skey, DB_SET | rmw)) == 0) ret = __dbc_del(pdbc, 0); else if (ret == DB_NOTFOUND) ret = __db_secondary_corrupt(pdbp); if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0) ret = t_ret; return (ret); } /* * __dbc_del_primary -- * Perform a delete operation on a primary index. Loop through * all the secondary indices which correspond to this primary * database, and delete any secondary keys that point at the current * record. * * PUBLIC: int __dbc_del_primary __P((DBC *)); */ int __dbc_del_primary(dbc) DBC *dbc; { DB *dbp, *sdbp; DBC *sdbc; DBT *tskeyp; DBT data, pkey, skey, temppkey, tempskey; ENV *env; u_int32_t nskey, rmw; int ret, t_ret; dbp = dbc->dbp; env = dbp->env; rmw = STD_LOCKING(dbc) ? DB_RMW : 0; /* * If we're called at all, we have at least one secondary. * (Unfortunately, we can't assert this without grabbing the mutex.) * Get the current record so that we can construct appropriate * secondary keys as needed. */ memset(&pkey, 0, sizeof(DBT)); memset(&data, 0, sizeof(DBT)); if ((ret = __dbc_get(dbc, &pkey, &data, DB_CURRENT)) != 0) return (ret); memset(&skey, 0, sizeof(DBT)); for (ret = __db_s_first(dbp, &sdbp); sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp, dbc->txn)) { /* * Get the secondary key for this secondary and the current * item. */ if ((ret = sdbp->s_callback(sdbp, &pkey, &data, &skey)) != 0) { /* Not indexing is equivalent to an empty key set. */ if (ret == DB_DONOTINDEX) { F_SET(&skey, DB_DBT_MULTIPLE); skey.size = 0; } else /* We had a substantive error. Bail. */ goto err; } #ifdef DIAGNOSTIC if (F_ISSET(&skey, DB_DBT_MULTIPLE)) __db_check_skeyset(sdbp, &skey); #endif if (F_ISSET(&skey, DB_DBT_MULTIPLE)) { tskeyp = (DBT *)skey.data; nskey = skey.size; if (nskey == 0) continue; } else { tskeyp = &skey; nskey = 1; } /* Open a secondary cursor. 
*/ if ((ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn, sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) goto err; /* See comment above and in __dbc_put. */ if (CDB_LOCKING(env)) { DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID); F_SET(sdbc, DBC_WRITER); } for (; nskey > 0; nskey--, tskeyp++) { /* * Set the secondary cursor to the appropriate item. * Delete it. * * We want to use DB_RMW if locking is on; it's only * legal then, though. * * !!! * Don't stomp on any callback-allocated buffer in skey * when we do a c_get(DB_GET_BOTH); use a temp DBT * instead. Similarly, don't allow pkey to be * invalidated when the cursor is closed. */ DB_INIT_DBT(tempskey, tskeyp->data, tskeyp->size); SWAP_IF_NEEDED(sdbp, &pkey); DB_INIT_DBT(temppkey, pkey.data, pkey.size); if ((ret = __dbc_get(sdbc, &tempskey, &temppkey, DB_GET_BOTH | rmw)) == 0) ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY); else if (ret == DB_NOTFOUND) ret = __db_secondary_corrupt(dbp); SWAP_IF_NEEDED(sdbp, &pkey); FREE_IF_NEEDED(env, tskeyp); } if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0) ret = t_ret; if (ret != 0) goto err; /* * In the common case where there is a single secondary key, we * will have freed any application-allocated data in skey * already. In the multiple key case, we need to free it here. * It is safe to do this twice as the macro resets the data * field. */ FREE_IF_NEEDED(env, &skey); } err: if (sdbp != NULL && (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0) ret = t_ret; FREE_IF_NEEDED(env, &skey); return (ret); } /* * __dbc_del_foreign -- * Apply the foreign database constraints for a particular foreign * database when an item is being deleted (dbc points at item being deleted * in the foreign database.) * * Delete happens in dbp, check for occurrences of key in pdpb. * Terminology: * Foreign db = Where delete occurs (dbp). 
* Secondary db = Where references to dbp occur (sdbp, a secondary) * Primary db = sdbp's primary database, references to dbp are secondary * keys here * Foreign Key = Key being deleted in dbp (fkey) * Primary Key = Key of the corresponding entry in sdbp's primary (pkey). */ static int __dbc_del_foreign(dbc) DBC *dbc; { DB_FOREIGN_INFO *f_info; DB *dbp, *pdbp, *sdbp; DBC *pdbc, *sdbc; DBT data, fkey, pkey; ENV *env; u_int32_t flags, rmw; int changed, ret, t_ret; dbp = dbc->dbp; env = dbp->env; memset(&fkey, 0, sizeof(DBT)); memset(&data, 0, sizeof(DBT)); if ((ret = __dbc_get(dbc, &fkey, &data, DB_CURRENT)) != 0) return (ret); LIST_FOREACH(f_info, &(dbp->f_primaries), f_links) { sdbp = f_info->dbp; pdbp = sdbp->s_primary; flags = f_info->flags; rmw = (STD_LOCKING(dbc) && !LF_ISSET(DB_FOREIGN_ABORT)) ? DB_RMW : 0; /* * Handle CDB locking. Some of this is copied from * __dbc_del_primary, but a bit more acrobatics are required. * If we're not going to abort, then we need to get a write * cursor. If CDB_ALLDB is set, then only one write cursor is * allowed and we hold it, so we fudge things and promote the * cursor on the other DBs manually, it won't cause a problem. * If CDB_ALLDB is not set, then we go through the usual route * to make sure we block as necessary. If there are any open * read cursors on sdbp, the delete or put call later will * block. * * If NULLIFY is set, we'll need a cursor on the primary to * update it with the nullified data. Because primary and * secondary dbs share a lock file ID in CDB, we open a cursor * on the secondary and then get another writeable cursor on the * primary via __db_cursor_int to avoid deadlocking. 
*/ sdbc = pdbc = NULL; if (!LF_ISSET(DB_FOREIGN_ABORT) && CDB_LOCKING(env) && !F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) { ret = __db_cursor(sdbp, dbc->thread_info, dbc->txn, &sdbc, DB_WRITECURSOR); if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0) { ret = __db_cursor_int(pdbp, dbc->thread_info, dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc); F_SET(pdbc, DBC_WRITER); } } else { ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn, sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc); if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0) ret = __db_cursor_int(pdbp, dbc->thread_info, dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc); } if (ret != 0) { if (sdbc != NULL) (void)__dbc_close(sdbc); return (ret); } if (CDB_LOCKING(env) && F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)){ DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID); F_SET(sdbc, DBC_WRITER); if (LF_ISSET(DB_FOREIGN_NULLIFY) && pdbc != NULL) { DB_ASSERT(env, pdbc->mylock.off == LOCK_INVALID); F_SET(pdbc, DBC_WRITER); } } /* * There are three actions possible when a foreign database has * items corresponding to a deleted item: * DB_FOREIGN_ABORT - The delete operation should be aborted. * DB_FOREIGN_CASCADE - All corresponding foreign items should * be deleted. * DB_FOREIGN_NULLIFY - A callback needs to be made, allowing * the application to modify the data DBT from the * associated database. 
If the callback makes a * modification, the updated item needs to replace the * original item in the foreign db */ memset(&pkey, 0, sizeof(DBT)); memset(&data, 0, sizeof(DBT)); ret = __dbc_pget(sdbc, &fkey, &pkey, &data, DB_SET|rmw); if (ret == DB_NOTFOUND) { /* No entry means no constraint */ ret = __dbc_close(sdbc); if (LF_ISSET(DB_FOREIGN_NULLIFY) && (t_ret = __dbc_close(pdbc)) != 0) ret = t_ret; if (ret != 0) return (ret); continue; } else if (ret != 0) { /* Just return the error code from the pget */ (void)__dbc_close(sdbc); if (LF_ISSET(DB_FOREIGN_NULLIFY)) (void)__dbc_close(pdbc); return (ret); } else if (LF_ISSET(DB_FOREIGN_ABORT)) { /* If the record exists and ABORT is set, we're done */ if ((ret = __dbc_close(sdbc)) != 0) return (ret); return (DB_FOREIGN_CONFLICT); } /* * There were matching items in the primary DB, and the action * is either DB_FOREIGN_CASCADE or DB_FOREIGN_NULLIFY. */ while (ret == 0) { if (LF_ISSET(DB_FOREIGN_CASCADE)) { /* * Don't use the DB_UPDATE_SECONDARY flag, * since we want the delete to cascade into the * secondary's primary. */ if ((ret = __dbc_del(sdbc, 0)) != 0) { __db_err(env, ret, "Attempt to execute cascading delete in a foreign index failed"); break; } } else if (LF_ISSET(DB_FOREIGN_NULLIFY)) { changed = 0; if ((ret = f_info->callback(sdbp, &pkey, &data, &fkey, &changed)) != 0) { __db_err(env, ret, "Foreign database application callback"); break; } /* * If the user callback modified the DBT and * a put on the primary failed. */ if (changed && (ret = __dbc_put(pdbc, &pkey, &data, DB_KEYFIRST)) != 0) { __db_err(env, ret, "Attempt to overwrite item in foreign database with nullified value failed"); break; } } /* retrieve the next matching item from the prim. 
db */ memset(&pkey, 0, sizeof(DBT)); memset(&data, 0, sizeof(DBT)); ret = __dbc_pget(sdbc, &fkey, &pkey, &data, DB_NEXT_DUP|rmw); } if (ret == DB_NOTFOUND) ret = 0; if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0) ret = t_ret; if (LF_ISSET(DB_FOREIGN_NULLIFY) && (t_ret = __dbc_close(pdbc)) != 0 && ret == 0) ret = t_ret; if (ret != 0) return (ret); } return (ret); } /* * __db_s_first -- * Get the first secondary, if any are present, from the primary. * * PUBLIC: int __db_s_first __P((DB *, DB **)); */ int __db_s_first(pdbp, sdbpp) DB *pdbp, **sdbpp; { DB *sdbp; MUTEX_LOCK(pdbp->env, pdbp->mutex); sdbp = LIST_FIRST(&pdbp->s_secondaries); /* See __db_s_next. */ if (sdbp != NULL) sdbp->s_refcnt++; MUTEX_UNLOCK(pdbp->env, pdbp->mutex); *sdbpp = sdbp; return (0); } /* * __db_s_next -- * Get the next secondary in the list. * * PUBLIC: int __db_s_next __P((DB **, DB_TXN *)); */ int __db_s_next(sdbpp, txn) DB **sdbpp; DB_TXN *txn; { DB *sdbp, *pdbp, *closeme; ENV *env; int ret; /* * Secondary indices are kept in a linked list, s_secondaries, * off each primary DB handle. If a primary is free-threaded, * this list may only be traversed or modified while the primary's * thread mutex is held. * * The tricky part is that we don't want to hold the thread mutex * across the full set of secondary puts necessary for each primary * put, or we'll wind up essentially single-threading all the puts * to the handle; the secondary puts will each take about as * long as the primary does, and may require I/O. So we instead * hold the thread mutex only long enough to follow one link to the * next secondary, and then we release it before performing the * actual secondary put. * * The only danger here is that we might legitimately close a * secondary index in one thread while another thread is performing * a put and trying to update that same secondary index. To * prevent this from happening, we refcount the secondary handles. 
* If close is called on a secondary index handle while we're putting * to it, it won't really be closed--the refcount will simply drop, * and we'll be responsible for closing it here. */ sdbp = *sdbpp; pdbp = sdbp->s_primary; env = pdbp->env; closeme = NULL; MUTEX_LOCK(env, pdbp->mutex); DB_ASSERT(env, sdbp->s_refcnt != 0); if (--sdbp->s_refcnt == 0) { LIST_REMOVE(sdbp, s_links); closeme = sdbp; } sdbp = LIST_NEXT(sdbp, s_links); if (sdbp != NULL) sdbp->s_refcnt++; MUTEX_UNLOCK(env, pdbp->mutex); *sdbpp = sdbp; /* * closeme->close() is a wrapper; call __db_close explicitly. */ if (closeme == NULL) ret = 0; else if (txn == NULL) ret = __db_close(closeme, NULL, 0); else ret = __txn_closeevent(env, txn, closeme); return (ret); } /* * __db_s_done -- * Properly decrement the refcount on a secondary database handle we're * using, without calling __db_s_next. * * PUBLIC: int __db_s_done __P((DB *, DB_TXN *)); */ int __db_s_done(sdbp, txn) DB *sdbp; DB_TXN *txn; { DB *pdbp; ENV *env; int doclose, ret; pdbp = sdbp->s_primary; env = pdbp->env; doclose = 0; MUTEX_LOCK(env, pdbp->mutex); DB_ASSERT(env, sdbp->s_refcnt != 0); if (--sdbp->s_refcnt == 0) { LIST_REMOVE(sdbp, s_links); doclose = 1; } MUTEX_UNLOCK(env, pdbp->mutex); if (doclose == 0) ret = 0; else if (txn == NULL) ret = __db_close(sdbp, NULL, 0); else ret = __txn_closeevent(env, txn, sdbp); return (ret); } /* * __db_s_count -- * Count the number of secondaries associated with a given primary. */ static int __db_s_count(pdbp) DB *pdbp; { DB *sdbp; ENV *env; int count; env = pdbp->env; count = 0; MUTEX_LOCK(env, pdbp->mutex); for (sdbp = LIST_FIRST(&pdbp->s_secondaries); sdbp != NULL; sdbp = LIST_NEXT(sdbp, s_links)) ++count; MUTEX_UNLOCK(env, pdbp->mutex); return (count); } /* * __db_buildpartial -- * Build the record that will result after a partial put is applied to * an existing record. 
* * This should probably be merged with __bam_build, but that requires * a little trickery if we plan to keep the overflow-record optimization * in that function. */ static int __db_buildpartial(dbp, oldrec, partial, newrec) DB *dbp; DBT *oldrec, *partial, *newrec; { ENV *env; u_int32_t len, nbytes; u_int8_t *buf; int ret; env = dbp->env; DB_ASSERT(env, F_ISSET(partial, DB_DBT_PARTIAL)); memset(newrec, 0, sizeof(DBT)); nbytes = __db_partsize(oldrec->size, partial); newrec->size = nbytes; if ((ret = __os_malloc(env, nbytes, &buf)) != 0) return (ret); newrec->data = buf; /* Nul or pad out the buffer, for any part that isn't specified. */ memset(buf, F_ISSET(dbp, DB_AM_FIXEDLEN) ? ((BTREE *)dbp->bt_internal)->re_pad : 0, nbytes); /* Copy in any leading data from the original record. */ memcpy(buf, oldrec->data, partial->doff > oldrec->size ? oldrec->size : partial->doff); /* Copy the data from partial. */ memcpy(buf + partial->doff, partial->data, partial->size); /* Copy any trailing data from the original record. */ len = partial->doff + partial->dlen; if (oldrec->size > len) memcpy(buf + partial->doff + partial->size, (u_int8_t *)oldrec->data + len, oldrec->size - len); return (0); } /* * __db_partsize -- * Given the number of bytes in an existing record and a DBT that * is about to be partial-put, calculate the size of the record * after the put. * * This code is called from __bam_partsize. * * PUBLIC: u_int32_t __db_partsize __P((u_int32_t, DBT *)); */ u_int32_t __db_partsize(nbytes, data) u_int32_t nbytes; DBT *data; { /* * There are really two cases here: * * Case 1: We are replacing some bytes that do not exist (i.e., they * are past the end of the record). In this case the number of bytes * we are replacing is irrelevant and all we care about is how many * bytes we are going to add from offset. So, the new record length * is going to be the size of the new bytes (size) plus wherever those * new bytes begin (doff). 
* * Case 2: All the bytes we are replacing exist. Therefore, the new * size is the oldsize (nbytes) minus the bytes we are replacing (dlen) * plus the bytes we are adding (size). */ if (nbytes < data->doff + data->dlen) /* Case 1 */ return (data->doff + data->size); return (nbytes + data->size - data->dlen); /* Case 2 */ } #ifdef DIAGNOSTIC /* * __db_check_skeyset -- * Diagnostic check that the application's callback returns a set of * secondary keys without repeats. * * PUBLIC: #ifdef DIAGNOSTIC * PUBLIC: void __db_check_skeyset __P((DB *, DBT *)); * PUBLIC: #endif */ void __db_check_skeyset(sdbp, skeyp) DB *sdbp; DBT *skeyp; { DBT *firstkey, *lastkey, *key1, *key2; ENV *env; env = sdbp->env; firstkey = (DBT *)skeyp->data; lastkey = firstkey + skeyp->size; for (key1 = firstkey; key1 < lastkey; key1++) for (key2 = key1 + 1; key2 < lastkey; key2++) DB_ASSERT(env, ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp, key1, key2) != 0); } #endif