os_rw.c   [plain text]


/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1997,2008 Oracle.  All rights reserved.
 *
 * $Id: os_rw.c,v 12.32 2008/02/12 16:08:51 bostic Exp $
 */

#include "db_config.h"

#include "db_int.h"

/*
 * __os_io --
 *	Do an I/O.
 *
 * PUBLIC: int __os_io __P((ENV *, int, DB_FH *, db_pgno_t,
 * PUBLIC:     u_int32_t, u_int32_t, u_int32_t, u_int8_t *, size_t *));
 */
int
__os_io(env, op, fhp, pgno, pgsize, relative, io_len, buf, niop)
	ENV *env;
	int op;
	DB_FH *fhp;
	db_pgno_t pgno;
	u_int32_t pgsize, relative, io_len;
	u_int8_t *buf;
	size_t *niop;
{
#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
	DB_ENV *dbenv;
	off_t offset;
	ssize_t nio;
#endif
	int ret;

	/*
	 * Check for illegal usage.
	 *
	 * This routine is used in one of two ways: reading bytes from an
	 * absolute offset and reading a specific database page.  All of
	 * our absolute offsets are known to fit into a u_int32_t, while
	 * our database pages might be at offsets larger than a u_int32_t.
	 * We don't want to specify an absolute offset in our caller as we
	 * aren't exactly sure what size an off_t might be.
	 */
	DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
	DB_ASSERT(env, (pgno == 0 && pgsize == 0) || relative == 0);

#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
	dbenv = env == NULL ? NULL : env->dbenv;

	if ((offset = relative) == 0)
		offset = (off_t)pgno * pgsize;
	switch (op) {
	case DB_IO_READ:
		if (DB_GLOBAL(j_read) != NULL)
			goto slow;
#if defined(HAVE_STATISTICS)
		++fhp->read_count;
#endif
		if (dbenv != NULL &&
		    FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
			__db_msg(env,
			    "fileops: read %s: %lu bytes at offset %lu",
			    fhp->name, (u_long)io_len, (u_long)offset);

		LAST_PANIC_CHECK_BEFORE_IO(env);
		nio = DB_GLOBAL(j_pread) != NULL ?
		    DB_GLOBAL(j_pread)(fhp->fd, buf, io_len, offset) :
		    pread(fhp->fd, buf, io_len, offset);
		break;
	case DB_IO_WRITE:
		if (DB_GLOBAL(j_write) != NULL)
			goto slow;
#ifdef HAVE_FILESYSTEM_NOTZERO
		if (__os_fs_notzero())
			goto slow;
#endif
#if defined(HAVE_STATISTICS)
		++fhp->write_count;
#endif
		if (dbenv != NULL &&
		    FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
			__db_msg(env,
			    "fileops: write %s: %lu bytes at offset %lu",
			    fhp->name, (u_long)io_len, (u_long)offset);

		LAST_PANIC_CHECK_BEFORE_IO(env);
		nio = DB_GLOBAL(j_pwrite) != NULL ?
		    DB_GLOBAL(j_pwrite)(fhp->fd, buf, io_len, offset) :
		    pwrite(fhp->fd, buf, io_len, offset);
		break;
	default:
		return (EINVAL);
	}
	if (nio == (ssize_t)io_len) {
		*niop = io_len;
		return (0);
	}
slow:
#endif
	MUTEX_LOCK(env, fhp->mtx_fh);

	if ((ret = __os_seek(env, fhp, pgno, pgsize, relative)) != 0)
		goto err;
	switch (op) {
	case DB_IO_READ:
		ret = __os_read(env, fhp, buf, io_len, niop);
		break;
	case DB_IO_WRITE:
		ret = __os_write(env, fhp, buf, io_len, niop);
		break;
	default:
		ret = EINVAL;
		break;
	}

err:	MUTEX_UNLOCK(env, fhp->mtx_fh);

	return (ret);

}

/*
 * __os_read --
 *	Read from a file handle.
 *
 * PUBLIC: int __os_read __P((ENV *, DB_FH *, void *, size_t, size_t *));
 */
int
__os_read(env, fhp, addr, len, nrp)
	ENV *env;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nrp;
{
	DB_ENV *dbenv;
	size_t offset;
	ssize_t nr;
	int ret;
	u_int8_t *taddr;

	dbenv = env == NULL ? NULL : env->dbenv;
	ret = 0;

	DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);

#if defined(HAVE_STATISTICS)
	++fhp->read_count;
#endif
	if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
		__db_msg(env,
		    "fileops: read %s: %lu bytes", fhp->name, (u_long)len);

	if (DB_GLOBAL(j_read) != NULL) {
		*nrp = len;
		LAST_PANIC_CHECK_BEFORE_IO(env);
		if (DB_GLOBAL(j_read)(fhp->fd, addr, len) != (ssize_t)len) {
			ret = __os_get_syserr();
			__db_syserr(env, ret, "read: %#lx, %lu",
			    P_TO_ULONG(addr), (u_long)len);
			ret = __os_posix_err(ret);
		}
		return (ret);
	}

	for (taddr = addr, offset = 0;
	    offset < len; taddr += nr, offset += (u_int32_t)nr) {
		LAST_PANIC_CHECK_BEFORE_IO(env);
		RETRY_CHK(((nr = read(fhp->fd,
		    CHAR_STAR_CAST taddr, len - offset)) < 0 ? 1 : 0), ret);
		if (nr == 0 || ret != 0)
			break;
	}
	*nrp = (size_t)(taddr - (u_int8_t *)addr);
	if (ret != 0) {
		__db_syserr(env, ret, "read: %#lx, %lu",
		    P_TO_ULONG(taddr), (u_long)len - offset);
		ret = __os_posix_err(ret);
	}
	return (ret);
}

/*
 * __os_write --
 *	Write to a file handle.
 *
 * PUBLIC: int __os_write __P((ENV *, DB_FH *, void *, size_t, size_t *));
 */
int
__os_write(env, fhp, addr, len, nwp)
	ENV *env;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nwp;
{
	DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);

#ifdef HAVE_FILESYSTEM_NOTZERO
	/* Zero-fill as necessary. */
	if (__os_fs_notzero()) {
		int ret;
		if ((ret = __db_zero_fill(env, fhp)) != 0)
			return (ret);
	}
#endif
	return (__os_physwrite(env, fhp, addr, len, nwp));
}

/*
 * __os_physwrite --
 *	Physical write to a file handle.
 *
 * PUBLIC: int __os_physwrite
 * PUBLIC:     __P((ENV *, DB_FH *, void *, size_t, size_t *));
 */
int
__os_physwrite(env, fhp, addr, len, nwp)
	ENV *env;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nwp;
{
	DB_ENV *dbenv;
	size_t offset;
	ssize_t nw;
	int ret;
	u_int8_t *taddr;

	dbenv = env == NULL ? NULL : env->dbenv;
	ret = 0;

#if defined(HAVE_STATISTICS)
	++fhp->write_count;
#endif
	if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
		__db_msg(env,
		    "fileops: write %s: %lu bytes", fhp->name, (u_long)len);

#if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC)
	if (__os_fs_notzero()) {
		struct stat sb;
		off_t cur_off;

		DB_ASSERT(env, fstat(fhp->fd, &sb) != -1 &&
		    (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 &&
		    cur_off <= sb.st_size);
	}
#endif
	if (DB_GLOBAL(j_write) != NULL) {
		*nwp = len;
		LAST_PANIC_CHECK_BEFORE_IO(env);
		if (DB_GLOBAL(j_write)(fhp->fd, addr, len) != (ssize_t)len) {
			ret = __os_get_syserr();
			__db_syserr(env, ret, "write: %#lx, %lu",
			    P_TO_ULONG(addr), (u_long)len);
			ret = __os_posix_err(ret);

			DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
		}
		return (ret);
	}

	for (taddr = addr, offset = 0;
	    offset < len; taddr += nw, offset += (u_int32_t)nw) {
		LAST_PANIC_CHECK_BEFORE_IO(env);
		RETRY_CHK(((nw = write(fhp->fd,
		    CHAR_STAR_CAST taddr, len - offset)) < 0 ? 1 : 0), ret);
		if (ret != 0)
			break;
	}
	*nwp = len;
	if (ret != 0) {
		__db_syserr(env, ret, "write: %#lx, %lu",
		    P_TO_ULONG(taddr), (u_long)len - offset);
		ret = __os_posix_err(ret);

		DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
	}
	return (ret);
}