/*- * See the file LICENSE for redistribution information. * * Copyright (c) 2001,2008 Oracle. All rights reserved. * * $Id: fop_rec.c,v 12.27 2008/01/31 18:40:43 bostic Exp $ */ #include "db_config.h" #include "db_int.h" #include "dbinc/db_page.h" #include "dbinc/fop.h" #include "dbinc/db_am.h" #include "dbinc/mp.h" #include "dbinc/txn.h" static int __fop_rename_recover_int __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int)); /* * The transactional guarantees Berkeley DB provides for file * system level operations (database physical file create, delete, * rename) are based on our understanding of current file system * semantics; a system that does not provide these semantics and * guarantees could be in danger. * * First, as in standard database changes, fsync and fdatasync must * work: when applied to the log file, the records written into the * log must be transferred to stable storage. * * Second, it must not be possible for the log file to be removed * without previous file system level operations being flushed to * stable storage. Berkeley DB applications write log records * describing file system operations into the log, then perform the * file system operation, then commit the enclosing transaction * (which flushes the log file to stable storage). Subsequently, * a database environment checkpoint may make it possible for the * application to remove the log file containing the record of the * file system operation. DB's transactional guarantees for file * system operations require the log file removal not succeed until * all previous filesystem operations have been flushed to stable * storage. In other words, the flush of the log file, or the * removal of the log file, must block until all previous * filesystem operations have been flushed to stable storage. This * semantic is not, as far as we know, required by any existing * standards document, but we have never seen a filesystem where * it does not apply. */ /* * __fop_create_recover -- * Recovery function for create. * * PUBLIC: int __fop_create_recover * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); */ int __fop_create_recover(env, dbtp, lsnp, op, info) ENV *env; DBT *dbtp; DB_LSN *lsnp; db_recops op; void *info; { __fop_create_args *argp; DB_FH *fhp; DBMETA *meta; u_int8_t mbuf[DBMETASIZE]; int ret; char *real_name; COMPQUIET(info, NULL); real_name = NULL; REC_PRINT(__fop_create_print); REC_NOOP_INTRO(__fop_create_read); meta = (DBMETA *)mbuf; if ((ret = __db_appname(env, (APPNAME)argp->appname, (const char *)argp->name.data, 0, NULL, &real_name)) != 0) goto out; if (DB_UNDO(op)) { /* * If the file was opened in mpool, we must mark it as * dead via nameop which will also unlink the file. */ if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) { if (__fop_read_meta(env, real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 && __db_chk_meta(env, NULL, meta, 1) == 0) { if ((ret = __memp_nameop(env, meta->uid, NULL, real_name, NULL, 0)) != 0) goto out; } else goto do_unlink; (void)__os_closehandle(env, fhp); } else do_unlink: (void)__os_unlink(env, real_name, 0); } else if (DB_REDO(op)) { if ((ret = __os_open(env, real_name, 0, DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0) (void)__os_closehandle(env, fhp); else goto out; } *lsnp = argp->prev_lsn; out: if (real_name != NULL) __os_free(env, real_name); REC_NOOP_CLOSE; } /* * __fop_remove_recover -- * Recovery function for remove. * * PUBLIC: int __fop_remove_recover * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); */ int __fop_remove_recover(env, dbtp, lsnp, op, info) ENV *env; DBT *dbtp; DB_LSN *lsnp; db_recops op; void *info; { __fop_remove_args *argp; int ret; char *real_name; COMPQUIET(info, NULL); real_name = NULL; REC_PRINT(__fop_remove_print); REC_NOOP_INTRO(__fop_remove_read); if ((ret = __db_appname(env, (APPNAME)argp->appname, (const char *)argp->name.data, 0, NULL, &real_name)) != 0) goto out; /* Its ok if the file is not there. */ if (DB_REDO(op)) (void)__memp_nameop(env, (u_int8_t *)argp->fid.data, NULL, real_name, NULL, 0); *lsnp = argp->prev_lsn; out: if (real_name != NULL) __os_free(env, real_name); REC_NOOP_CLOSE; } /* * __fop_write_recover -- * Recovery function for writechunk. * * PUBLIC: int __fop_write_recover * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); */ int __fop_write_recover(env, dbtp, lsnp, op, info) ENV *env; DBT *dbtp; DB_LSN *lsnp; db_recops op; void *info; { __fop_write_args *argp; int ret; COMPQUIET(info, NULL); REC_PRINT(__fop_write_print); REC_NOOP_INTRO(__fop_write_read); ret = 0; if (DB_UNDO(op)) DB_ASSERT(env, argp->flag != 0); else if (DB_REDO(op)) ret = __fop_write(env, argp->txnp, argp->name.data, (APPNAME)argp->appname, NULL, argp->pgsize, argp->pageno, argp->offset, argp->page.data, argp->page.size, argp->flag, 0); if (ret == 0) *lsnp = argp->prev_lsn; REC_NOOP_CLOSE; } /* * __fop_rename_recover -- * Recovery functions for rename. There are two variants that * both use the same utility function. Had we known about this on day * one, we would have simply added a parameter. However, since we need * to retain old records for backward compatibility (online-upgrade) * wrapping the two seems like the right solution. * * PUBLIC: int __fop_rename_recover * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); * * PUBLIC: int __fop_rename_noundo_recover * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); */ int __fop_rename_recover(env, dbtp, lsnp, op, info) ENV *env; DBT *dbtp; DB_LSN *lsnp; db_recops op; void *info; { return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 1)); } int __fop_rename_noundo_recover(env, dbtp, lsnp, op, info) ENV *env; DBT *dbtp; DB_LSN *lsnp; db_recops op; void *info; { return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 0)); } static int __fop_rename_recover_int(env, dbtp, lsnp, op, info, undo) ENV *env; DBT *dbtp; DB_LSN *lsnp; db_recops op; void *info; int undo; { __fop_rename_args *argp; DB_FH *fhp; DBMETA *meta; u_int8_t *fileid, mbuf[DBMETASIZE]; int ret; char *real_new, *real_old, *src; COMPQUIET(info, NULL); fhp = NULL; meta = (DBMETA *)&mbuf[0]; ret = 0; real_new = real_old = NULL; REC_PRINT(__fop_rename_print); REC_NOOP_INTRO(__fop_rename_read); fileid = argp->fileid.data; if ((ret = __db_appname(env, (APPNAME)argp->appname, (const char *)argp->newname.data, 0, NULL, &real_new)) != 0) goto out; if ((ret = __db_appname(env, (APPNAME)argp->appname, (const char *)argp->oldname.data, 0, NULL, &real_old)) != 0) goto out; /* * Verify that we are manipulating the correct file. We should always * be OK on an ABORT or an APPLY, but during recovery, we have to * check. */ if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) { src = DB_UNDO(op) ? real_new : real_old; /* * Interpret any error as meaning that the file either doesn't * exist, doesn't have a meta-data page, or is in some other * way, shape or form, incorrect, so that we should not restore * it. */ if (__os_open(env, src, 0, 0, 0, &fhp) != 0) goto done; if (__fop_read_meta(env, src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0) goto done; if (__db_chk_meta(env, NULL, meta, 1) != 0) goto done; if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0) goto done; (void)__os_closehandle(env, fhp); fhp = NULL; if (DB_REDO(op)) { /* * Check to see if the target file exists. If it * does and it does not have the proper id then * it is a later version. We just remove the source * file since the state of the world is beyond this * point. */ if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 && __fop_read_meta(env, src, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 && __db_chk_meta(env, NULL, meta, 1) == 0 && memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0) { (void)__memp_nameop(env, fileid, NULL, real_old, NULL, 0); goto done; } } } if (undo && DB_UNDO(op)) (void)__memp_nameop(env, fileid, (const char *)argp->oldname.data, real_new, real_old, 0); if (DB_REDO(op)) (void)__memp_nameop(env, fileid, (const char *)argp->newname.data, real_old, real_new, 0); done: *lsnp = argp->prev_lsn; out: if (real_new != NULL) __os_free(env, real_new); if (real_old != NULL) __os_free(env, real_old); if (fhp != NULL) (void)__os_closehandle(env, fhp); REC_NOOP_CLOSE; } /* * __fop_file_remove_recover -- * Recovery function for file_remove. On the REDO pass, we need to * make sure no one recreated the file while we weren't looking. On an * undo pass must check if the file we are interested in is the one that * exists and then set the status of the child transaction depending on * what we find out. * * PUBLIC: int __fop_file_remove_recover * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); */ int __fop_file_remove_recover(env, dbtp, lsnp, op, info) ENV *env; DBT *dbtp; DB_LSN *lsnp; db_recops op; void *info; { __fop_file_remove_args *argp; DBMETA *meta; DB_FH *fhp; size_t len; u_int8_t mbuf[DBMETASIZE]; u_int32_t cstat, ret_stat; int is_real, is_tmp, ret; char *real_name; fhp = NULL; meta = (DBMETA *)&mbuf[0]; is_real = is_tmp = 0; real_name = NULL; REC_PRINT(__fop_file_remove_print); REC_NOOP_INTRO(__fop_file_remove_read); /* * This record is only interesting on the backward, forward, and * apply phases. */ if (op != DB_TXN_BACKWARD_ROLL && op != DB_TXN_FORWARD_ROLL && op != DB_TXN_APPLY) goto done; if ((ret = __db_appname(env, (APPNAME)argp->appname, argp->name.data, 0, NULL, &real_name)) != 0) goto out; /* Verify that we are manipulating the correct file. */ len = 0; if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0 || (ret = __fop_read_meta(env, real_name, mbuf, DBMETASIZE, fhp, 1, &len)) != 0) { /* * If len is non-zero, then the file exists and has something * in it, but that something isn't a full meta-data page, so * this is very bad. Bail out! */ if (len != 0) goto out; /* File does not exist. */ cstat = TXN_EXPECTED; } else { /* * We can ignore errors here since we'll simply fail the * checks below and assume this is the wrong file. */ (void)__db_chk_meta(env, NULL, meta, 1); is_real = memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0; is_tmp = memcmp(argp->tmp_fid.data, meta->uid, DB_FILE_ID_LEN) == 0; if (!is_real && !is_tmp) /* File exists, but isn't what we were removing. */ cstat = TXN_IGNORE; else /* File exists and is the one that we were removing. */ cstat = TXN_COMMIT; } if (fhp != NULL) { (void)__os_closehandle(env, fhp); fhp = NULL; } if (DB_UNDO(op)) { /* On the backward pass, we leave a note for the child txn. */ if ((ret = __db_txnlist_update(env, info, argp->child, cstat, NULL, &ret_stat, 1)) != 0) goto out; } else if (DB_REDO(op)) { /* * On the forward pass, check if someone recreated the * file while we weren't looking. */ if (cstat == TXN_COMMIT) (void)__memp_nameop(env, is_real ? argp->real_fid.data : argp->tmp_fid.data, NULL, real_name, NULL, 0); } done: *lsnp = argp->prev_lsn; ret = 0; out: if (real_name != NULL) __os_free(env, real_name); if (fhp != NULL) (void)__os_closehandle(env, fhp); REC_NOOP_CLOSE; }