vm_compressor_backing_file.c [plain text]

/*
 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <stdint.h>
#include <sys/fcntl.h>
#include <sys/vnode_internal.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/buf_internal.h>
#include <kern/debug.h>
#include <kern/kalloc.h>
#include <sys/cprotect.h>
#include <sys/disk.h>
#include <vm/vm_protos.h>
#include <vm/vm_pageout.h>
#include <sys/content_protection.h>

void vm_swapfile_open(const char *path, vnode_t *vp);
void vm_swapfile_close(uint64_t path, vnode_t vp);
int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin);
uint64_t vm_swapfile_get_blksize(vnode_t vp);
uint64_t vm_swapfile_get_transfer_size(vnode_t vp);
int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *);
int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size);

#if CONFIG_FREEZE
int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget);
#endif /* CONFIG_FREEZE */


void
vm_swapfile_open(const char *path, vnode_t *vp)
{
	int error = 0;
	vfs_context_t   ctx = vfs_context_kernel();

	if ((error = vnode_open(path, (O_CREAT | O_TRUNC | FREAD | FWRITE), S_IRUSR | S_IWUSR, 0, vp, ctx))) {
		printf("Failed to open swap file %d\n", error);
		*vp = NULL;
		return;
	}

	/*
	 * If MNT_IOFLAGS_NOSWAP is set, opening the swap file should fail.
	 * To avoid a race on the mount we only make this check after creating the
	 * vnode.
	 */
	if ((*vp)->v_mount->mnt_kern_flag & MNTK_NOSWAP) {
		vnode_put(*vp);
		vm_swapfile_close((uint64_t)path, *vp);
		*vp = NULL;
		return;
	}

	vnode_put(*vp);
}

uint64_t
vm_swapfile_get_blksize(vnode_t vp)
{
	return (uint64_t)vfs_devblocksize(vnode_mount(vp));
}

uint64_t
vm_swapfile_get_transfer_size(vnode_t vp)
{
	return (uint64_t)vp->v_mount->mnt_vfsstat.f_iosize;
}

int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);

void
vm_swapfile_close(uint64_t path_addr, vnode_t vp)
{
	vfs_context_t context = vfs_context_kernel();
	int error;

	vnode_getwithref(vp);
	vnode_close(vp, 0, context);

	error = unlink1(context, NULLVP, CAST_USER_ADDR_T(path_addr),
	    UIO_SYSSPACE, 0);

#if DEVELOPMENT || DEBUG
	if (error) {
		printf("%s : unlink of %s failed with error %d", __FUNCTION__,
		    (char *)path_addr, error);
	}
#endif
}

int
vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin)
{
	int             error = 0;
	uint64_t        file_size = 0;
	vfs_context_t   ctx = NULL;
#if CONFIG_FREEZE
	struct vnode_attr va;
#endif /* CONFIG_FREEZE */

	ctx = vfs_context_kernel();

	error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx);

	if (error) {
		printf("vnode_setsize for swap files failed: %d\n", error);
		goto done;
	}

	error = vnode_size(vp, (off_t*) &file_size, ctx);

	if (error) {
		printf("vnode_size (new file) for swap file failed: %d\n", error);
		goto done;
	}
	assert(file_size == *size);

	if (pin != NULL && *pin != FALSE) {
		error = VNOP_IOCTL(vp, FIOPINSWAP, NULL, 0, ctx);

		if (error) {
			printf("pin for swap files failed: %d,  file_size = %lld\n", error, file_size);
			/* this is not fatal, carry on with files wherever they landed */
			*pin = FALSE;
			error = 0;
		}
	}

	vnode_lock_spin(vp);
	SET(vp->v_flag, VSWAP);
	vnode_unlock(vp);

#if CONFIG_FREEZE
	VATTR_INIT(&va);
	VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C);
	error = VNOP_SETATTR(vp, &va, ctx);

	if (error) {
		printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n", error);
		goto done;
	}
#endif /* CONFIG_FREEZE */

done:
	return error;
}


int
vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size)
{
	int error = 0;
	vfs_context_t ctx;

	ctx = vfs_context_kernel();

	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, size, offset,
	    UIO_SYSSPACE, IO_NODELOCKED, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));

	return error;
}



int
vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone)
{
	int error = 0;
	uint64_t io_size = npages * PAGE_SIZE_64;
#if 1
	kern_return_t   kr = KERN_SUCCESS;
	upl_t           upl = NULL;
	unsigned int    count = 0;
	upl_control_flags_t upl_create_flags = 0;
	int             upl_control_flags = 0;
	upl_size_t      upl_size = 0;

	upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE;

	if (upl_iodone == NULL) {
		upl_control_flags = UPL_IOSYNC;
	}

#if ENCRYPTED_SWAP
	upl_control_flags |= UPL_PAGING_ENCRYPTED;
#endif

	if ((flags & SWAP_READ) == FALSE) {
		upl_create_flags |= UPL_COPYOUT_FROM;
	}

	upl_size = io_size;
	kr = vm_map_create_upl( kernel_map,
	    start,
	    &upl_size,
	    &upl,
	    NULL,
	    &count,
	    &upl_create_flags,
	    VM_KERN_MEMORY_OSFMK);

	if (kr != KERN_SUCCESS || (upl_size != io_size)) {
		panic("vm_map_create_upl failed with %d\n", kr);
	}

	if (flags & SWAP_READ) {
		vnode_pagein(vp,
		    upl,
		    0,
		    offset,
		    io_size,
		    upl_control_flags | UPL_IGNORE_VALID_PAGE_CHECK,
		    &error);
		if (error) {
#if DEBUG
			printf("vm_swapfile_io: vnode_pagein failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n", error, vp, offset, io_size);
#else /* DEBUG */
			printf("vm_swapfile_io: vnode_pagein failed with %d.\n", error);
#endif /* DEBUG */
		}
	} else {
		upl_set_iodone(upl, upl_iodone);

		vnode_pageout(vp,
		    upl,
		    0,
		    offset,
		    io_size,
		    upl_control_flags,
		    &error);
		if (error) {
#if DEBUG
			printf("vm_swapfile_io: vnode_pageout failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n", error, vp, offset, io_size);
#else /* DEBUG */
			printf("vm_swapfile_io: vnode_pageout failed with %d.\n", error);
#endif /* DEBUG */
		}
	}
	return error;

#else /* 1 */
	vfs_context_t ctx;
	ctx = vfs_context_kernel();

	error = vn_rdwr((flags & SWAP_READ) ? UIO_READ : UIO_WRITE, vp, (caddr_t)start, io_size, offset,
	    UIO_SYSSPACE, IO_SYNC | IO_NODELOCKED | IO_UNIT | IO_NOCACHE | IO_SWAP_DISPATCH, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));

	if (error) {
		printf("vn_rdwr: Swap I/O failed with %d\n", error);
	}
	return error;
#endif /* 1 */
}


#define MAX_BATCH_TO_TRIM       256

#define ROUTE_ONLY              0x10            /* if corestorage is present, tell it to just pass */
                                                /* the DKIOUNMAP command through w/o acting on it */
                                                /* this is used by the compressed swap system to reclaim empty space */


u_int32_t
vnode_trim_list(vnode_t vp, struct trim_list *tl, boolean_t route_only)
{
	int             error = 0;
	int             trim_index = 0;
	u_int32_t       blocksize = 0;
	struct vnode    *devvp;
	dk_extent_t     *extents;
	dk_unmap_t      unmap;
	_dk_cs_unmap_t  cs_unmap;

	if (!(vp->v_mount->mnt_ioflags & MNT_IOFLAGS_UNMAP_SUPPORTED)) {
		return ENOTSUP;
	}

	if (tl == NULL) {
		return 0;
	}

	/*
	 * Get the underlying device vnode and physical block size
	 */
	devvp = vp->v_mount->mnt_devvp;
	blocksize = vp->v_mount->mnt_devblocksize;

	extents = kalloc(sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);

	if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
		memset(&cs_unmap, 0, sizeof(_dk_cs_unmap_t));
		cs_unmap.extents = extents;

		if (route_only == TRUE) {
			cs_unmap.options = ROUTE_ONLY;
		}
	} else {
		memset(&unmap, 0, sizeof(dk_unmap_t));
		unmap.extents = extents;
	}

	while (tl) {
		daddr64_t       io_blockno;     /* Block number corresponding to the start of the extent */
		size_t          io_bytecount;   /* Number of bytes in current extent for the specified range */
		size_t          trimmed;
		size_t          remaining_length;
		off_t           current_offset;

		current_offset = tl->tl_offset;
		remaining_length = tl->tl_length;
		trimmed = 0;

		/*
		 * We may not get the entire range from tl_offset -> tl_offset+tl_length in a single
		 * extent from the blockmap call.  Keep looping/going until we are sure we've hit
		 * the whole range or if we encounter an error.
		 */
		while (trimmed < tl->tl_length) {
			/*
			 * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
			 * specified offset.  It returns blocks in contiguous chunks, so if the logical range is
			 * broken into multiple extents, it must be called multiple times, increasing the offset
			 * in each call to ensure that the entire range is covered.
			 */
			error = VNOP_BLOCKMAP(vp, current_offset, remaining_length,
			    &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL);

			if (error) {
				goto trim_exit;
			}
			if (io_blockno != -1) {
				extents[trim_index].offset = (uint64_t) io_blockno * (u_int64_t) blocksize;
				extents[trim_index].length = io_bytecount;

				trim_index++;
			}
			if (trim_index == MAX_BATCH_TO_TRIM) {
				if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
					cs_unmap.extentsCount = trim_index;
					error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
				} else {
					unmap.extentsCount = trim_index;
					error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
				}
				if (error) {
					goto trim_exit;
				}
				trim_index = 0;
			}
			trimmed += io_bytecount;
			current_offset += io_bytecount;
			remaining_length -= io_bytecount;
		}
		tl = tl->tl_next;
	}
	if (trim_index) {
		if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
			cs_unmap.extentsCount = trim_index;
			error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
		} else {
			unmap.extentsCount = trim_index;
			error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
		}
	}
trim_exit:
	kfree(extents, sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);

	return error;
}

#if CONFIG_FREEZE
int
vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget)
{
	vnode_t         devvp = NULL;
	vfs_context_t   ctx = vfs_context_kernel();
	errno_t         err = 0;

	devvp = vp->v_mount->mnt_devvp;

	err = VNOP_IOCTL(devvp, DKIOCGETMAXSWAPWRITE, (caddr_t)freeze_daily_budget, 0, ctx);

	return err;
}
#endif /* CONFIG_FREEZE */