shadow.c   [plain text]


/*
 * Copyright (c) 2001-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * shadow.c
 *
 * Implement copy-on-write shadow map to allow a disk image to be
 * mounted read-only, yet be writable by transferring writes to a
 * "shadow" file.  Subsequent reads from blocks that have been
 * written will then go the "shadow" file.
 *
 * The map has two parts:
 * 1) a bit map to track which blocks have been written
 * 2) a band map to map a "band" within the original file to a corresponding
 *    "band" in the shadow file.  Each band has the same size.
 *
 * The band map is used to ensure that blocks that are contiguous in the 
 * original file will remain contiguous in the shadow file.
 *
 * For debugging purposes, this file can be compiled standalone using:
 * cc -o shadow shadow.c -DTEST_SHADOW
 */

/*
 * Modification History
 *
 * December 21, 2001 	Dieter Siegmund (dieter@apple.com)
 * - initial revision
 */
#include <sys/param.h>
#include <sys/types.h>
#include <mach/boolean.h>

#include <string.h>

#ifdef TEST_SHADOW
#include <unistd.h>
#include <stdlib.h>
#define my_malloc(a)	malloc(a)
#define my_free(a)	free(a)
#else /* !TEST_SHADOW */
#include <sys/malloc.h>
#define my_malloc(a)	_MALLOC(a, M_TEMP, M_WAITOK)
#define my_free(a)	FREE(a, M_TEMP)
#include <libkern/libkern.h>
#endif /* TEST_SHADOW */

#include "shadow.h"

#define UINT32_ALL_ONES			((uint32_t)(-1))
#define USHORT_ALL_ONES			((u_short)(-1))
#define UCHAR_ALL_ONES			((u_char)(-1))

#define my_trunc(value, divisor)	((value) / (divisor) * (divisor))

/* a band size of 128K can represent a file up to 8GB */
#define BAND_SIZE_DEFAULT_POWER_2	17
#define BAND_SIZE_DEFAULT		(1 << BAND_SIZE_DEFAULT_POWER_2)

typedef u_short	band_number_t;
#define BAND_ZERO			((band_number_t)0)
#define BAND_MAX			((band_number_t)65535)

struct shadow_map {
    uint32_t		blocks_per_band;/* size in blocks */
    uint32_t		block_size;
    u_char *		block_bitmap;		/* 1 bit per block; 1=written */
    band_number_t *	bands;			/* band map array */
    uint32_t		file_size_blocks;	/* size of file in bands */
    uint32_t		shadow_size_bands;	/* size of shadow in bands */
    uint32_t 		next_band;		/* next free band */
    uint32_t		zeroth_band;		/* special-case 0th band */
};


typedef struct {
    uint32_t	byte;
    uint32_t	bit;
} bitmap_offset_t;

static __inline__ u_char
bit(int b)
{
    return ((u_char)(1 << b));
}

/* 
 * Function: bits_lower
 * Purpose:
 *   Return a byte value in which bits numbered lower than 'b' are set.
 */
static __inline__ u_char
bits_lower(int b)
{
    return ((u_char)(bit(b) - 1));
}

/*
 * Function: byte_set_bits
 * Purpose:
 *   Set the given range of bits within a byte.
 */
static __inline__ u_char
byte_set_bits(int start, int end)
{
    return ((u_char)((~bits_lower(start)) & (bits_lower(end) | bit(end))));
}

static __inline__ bitmap_offset_t
bitmap_offset(off_t where)
{
    bitmap_offset_t	b;

    b.byte = where / NBBY;
    b.bit = where % NBBY;
    return (b);
}

/* 
 * Function: bitmap_set
 *
 * Purpose:
 *   Set the given range of bits.
 *
 *   This algorithm tries to set the extents using the biggest
 *   units, using longs, then a short, then a byte, then bits.
 */
static void
bitmap_set(u_char * map, uint32_t start_bit, uint32_t bit_count)
{
    bitmap_offset_t 	start;
    bitmap_offset_t	end;

    start = bitmap_offset(start_bit);
    end = bitmap_offset(start_bit + bit_count);
    if (start.byte < end.byte) {
	uint32_t n_bytes;

	if (start.bit) {
	    map[start.byte] |= byte_set_bits(start.bit, NBBY - 1);
	    start.bit = 0;
	    start.byte++;
	    if (start.byte == end.byte)
		goto end;
	}
			
	n_bytes = end.byte - start.byte;
	
	while (n_bytes >= (sizeof(uint32_t))) {
	    *((uint32_t *)(map + start.byte)) = UINT32_ALL_ONES;
	    start.byte += sizeof(uint32_t);
	    n_bytes -= sizeof(uint32_t);
	}
	if (n_bytes >= sizeof(u_short)) {
	    *((u_short *)(map + start.byte)) = USHORT_ALL_ONES;
	    start.byte += sizeof(u_short);
	    n_bytes -= sizeof(u_short);
	}
	if (n_bytes == 1) {
	    map[start.byte] = UCHAR_ALL_ONES;
	    start.byte++;
	    n_bytes = 0;
	}
    }

 end:
    if (end.bit > start.bit) {
	map[start.byte] |= byte_set_bits(start.bit, end.bit - 1);
    }

    return;
}

/*
 * Function: bitmap_get
 *
 * Purpose:
 *   Return the number of bits in the range that are the same e.g.
 *   11101 returns 3 because the first 3 bits are the same (1's), whereas
 *   001100 returns 2 because the first 2 bits are the same.
 *   This algorithm tries to count things in as big a chunk as possible,
 *   first aligning to a byte offset, then trying to count longs, a short,
 *   a byte, then any remaining bits to find the bit that is different.
 */

static uint32_t
bitmap_get(u_char * map, uint32_t start_bit, uint32_t bit_count, 
	   boolean_t * ret_is_set)
{
    uint32_t		count;
    int			i;
    boolean_t		is_set;
    bitmap_offset_t 	start;
    bitmap_offset_t	end;

    start = bitmap_offset(start_bit);
    end = bitmap_offset(start_bit + bit_count);

    is_set = (map[start.byte] & bit(start.bit)) ? TRUE : FALSE;
    count = 0;

    if (start.byte < end.byte) {
	uint32_t n_bytes;

	if (start.bit) { /* try to align to a byte */
	    for (i = start.bit; i < NBBY; i++) {
		boolean_t	this_is_set;

		this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE;
		if (this_is_set != is_set) {
		    goto done; /* found bit that was different, we're done */
		}
		count++;
	    }
	    start.bit = 0; /* made it to the next byte */
	    start.byte++;
	    if (start.byte == end.byte)
		goto end; /* no more bytes, check for any leftover bits */
	}
	/* calculate how many bytes are left in the range */
	n_bytes = end.byte - start.byte;

	/* check for 4 bytes of the same bits */
	while (n_bytes >= sizeof(uint32_t)) {
	    uint32_t * valPtr = (uint32_t *)(map + start.byte);
	    if ((is_set && *valPtr == UINT32_ALL_ONES) 
		|| (!is_set && *valPtr == 0)) {
		count += sizeof(*valPtr) * NBBY;
		start.byte += sizeof(*valPtr);
		n_bytes -= sizeof(*valPtr);
	    }
	    else
		break; /* bits differ */

	}
	/* check for 2 bytes of the same bits */
	if (n_bytes >= sizeof(u_short)) {
	    u_short * valPtr = (u_short *)(map + start.byte);
			
	    if ((is_set && *valPtr == USHORT_ALL_ONES) 
		|| (!is_set && (*valPtr == 0))) {
		count += sizeof(*valPtr) * NBBY;
		start.byte += sizeof(*valPtr);
		n_bytes -= sizeof(*valPtr);
	    }
	}

	/* check for 1 byte of the same bits */
	if (n_bytes) { 
	    if ((is_set && map[start.byte] == UCHAR_ALL_ONES) 
		|| (!is_set && map[start.byte] == 0)) {
		count += NBBY;
		start.byte++;
		n_bytes--;
	    }
	    /* we found bits that were different, find the first one */
	    if (n_bytes) { 
		for (i = 0; i < NBBY; i++) {
		    boolean_t	this_is_set;

		    this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE;
		    if (this_is_set != is_set) {
			break;
		    }
		    count++;
		}
		goto done;
	    }
	}
    }

 end:
    for (i = start.bit; i < (int)end.bit; i++) {
	boolean_t this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE;
	
	if (this_is_set != is_set) {
	    break;
	}
	count++;
    }

 done:
    *ret_is_set = is_set;
    return (count);
}

static __inline__ band_number_t
shadow_map_block_to_band(shadow_map_t * map, uint32_t block)
{
    return (block / map->blocks_per_band);
}

/*
 * Function: shadow_map_mapped_band
 * Purpose:
 *   Return the mapped band for the given band.
 *   If map_it is FALSE, and the band is not mapped, return FALSE.
 *   If map_it is TRUE, then this function will always return TRUE.
 */
static boolean_t
shadow_map_mapped_band(shadow_map_t * map, band_number_t band,
		       boolean_t map_it, band_number_t * mapped_band)
{
    boolean_t		is_mapped = FALSE;

    if (band == map->zeroth_band) {
	*mapped_band = BAND_ZERO;
	is_mapped = TRUE;
    }
    else {
	*mapped_band = map->bands[band];
	if (*mapped_band == BAND_ZERO) {
	    if (map_it) {
		/* grow the file */
		if (map->next_band == 0) {
		    /* remember the zero'th band */
		    map->zeroth_band = band;
		}
		*mapped_band = map->bands[band] = map->next_band++;
		is_mapped = TRUE;
	    }
	}
	else {
	    is_mapped = TRUE;
	}
    }
    return (is_mapped);
}

/* 
 * Function: shadow_map_contiguous
 *
 * Purpose:
 *   Return the first offset within the range position..(position + count) 
 *   that is not a contiguous mapped band.
 *
 *   If called with is_write = TRUE, this function will map bands as it goes.
 */
static uint32_t
shadow_map_contiguous(shadow_map_t * map, uint32_t start_block,
		      uint32_t num_blocks, boolean_t is_write)
{
    band_number_t	band = shadow_map_block_to_band(map, start_block);
    uint32_t		end_block = start_block + num_blocks;
    boolean_t		is_mapped;
    band_number_t	mapped_band;
    uint32_t		ret_end_block = end_block;
    uint32_t		p;

    is_mapped = shadow_map_mapped_band(map, band, is_write, &mapped_band);
    if (is_write == FALSE && is_mapped == FALSE) {
	static int happened = 0;
	/* this can't happen */
	if (happened == 0) {
	    printf("shadow_map_contiguous: this can't happen!\n");
	    happened = 1;
	}
	return (start_block);
    }
    for (p = my_trunc(start_block + map->blocks_per_band, 
		      map->blocks_per_band);
	 p < end_block; p += map->blocks_per_band) {
	band_number_t 	next_mapped_band;
		
	band++;
	is_mapped = shadow_map_mapped_band(map, band, is_write,
					   &next_mapped_band);
	if (is_write == FALSE && is_mapped == FALSE) {
	    return (p);
	}
	if ((mapped_band + 1) != next_mapped_band) {
	    /* not contiguous */
	    ret_end_block = p;
	    break;
	}
	mapped_band = next_mapped_band;
    }
    return (ret_end_block);
}


/* 
 * Function: block_bitmap_size
 * Purpose:
 *   The number of bytes required in a block bitmap to represent a file of size 
 *   file_size.
 *
 *   The bytes required is the number of blocks in the file,
 *   divided by the number of bits per byte.
 * Note:
 *   An 8GB file requires (assuming 512 byte block):
 *   2^33 / 2^9 / 2^3 = 2^21 = 2MB
 *   of bitmap space.  This is a non-trival amount of memory,
 *   particularly since most of the bits will be zero.
 *   A sparse bitmap would really help in this case.
 */
static __inline__ uint32_t
block_bitmap_size(off_t file_size, uint32_t block_size)
{
    off_t blocks = howmany(file_size, block_size);
    return (howmany(blocks, NBBY));
}

/*
 * Function: shadow_map_read
 *
 * Purpose:
 *   Calculate the block offset within the shadow to read, and the number
 *   blocks to read.  The input values (block_offset, block_count) refer
 *   to the original file.
 *
 *   The output values (*incr_block_offset, *incr_block_count) refer to the
 *   shadow file if the return value is TRUE.  They refer to the original
 *   file if the return value is FALSE.

 *   Blocks within a band may or may not have been written, in addition,
 *   Bands are not necessarily contiguous, therefore:
 *   	*incr_block_count <= block_count
 *   The caller must be prepared to call this function interatively
 *   to complete the whole i/o.
 * Returns:
 *   TRUE if the shadow file should be read, FALSE if the original file
 *   should be read.
 */
boolean_t
shadow_map_read(shadow_map_t * map, uint32_t block_offset, uint32_t block_count,
		uint32_t * incr_block_offset, uint32_t * incr_block_count)
{
    boolean_t		written = FALSE;
    uint32_t		n_blocks;

    if (block_offset >= map->file_size_blocks
	|| (block_offset + block_count) > map->file_size_blocks) {
	printf("shadow_map_read: request (%d, %d) exceeds file size %d\n",
	       block_offset, block_count, map->file_size_blocks);
	*incr_block_count = 0;
    }
    n_blocks = bitmap_get(map->block_bitmap, block_offset, block_count,
			  &written);
    if (written == FALSE) {
	*incr_block_count = n_blocks;
	*incr_block_offset = block_offset;
    }
    else { /* start has been written, and therefore mapped */
	band_number_t	mapped_band;
	uint32_t		band_limit;
	
	mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)];
	*incr_block_offset = mapped_band * map->blocks_per_band
	    + (block_offset % map->blocks_per_band);
	band_limit 
	    = shadow_map_contiguous(map, block_offset, block_count, FALSE);
	*incr_block_count = band_limit - block_offset;
	if (*incr_block_count > n_blocks) {
	    *incr_block_count = n_blocks;
	}
    }
    return (written);
}

/*
 * Function: shadow_map_write
 *
 * Purpose:
 *   Calculate the block offset within the shadow to write, and the number
 *   blocks to write.  The input values (block_offset, block_count) refer
 *   to the original file.  The output values 
 *   (*incr_block_offset, *incr_block_count) refer to the shadow file.
 *
 *   Bands are not necessarily contiguous, therefore:
 *   	*incr_block_count <= block_count
 *   The caller must be prepared to call this function interatively
 *   to complete the whole i/o.
 * Returns:
 *   TRUE if the shadow file was grown, FALSE otherwise. 
 */
boolean_t
shadow_map_write(shadow_map_t * map, uint32_t block_offset, 
		 uint32_t block_count, uint32_t * incr_block_offset, 
		 uint32_t * incr_block_count)
{
    uint32_t		band_limit;
    band_number_t	mapped_band;
    boolean_t		shadow_grew = FALSE;

    if (block_offset >= map->file_size_blocks
	|| (block_offset + block_count) > map->file_size_blocks) {
	printf("shadow_map_write: request (%d, %d) exceeds file size %d\n",
	       block_offset, block_count, map->file_size_blocks);
	*incr_block_count = 0;
    }
    
    band_limit = shadow_map_contiguous(map, block_offset, block_count, TRUE);
    mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)];
    *incr_block_offset = mapped_band * map->blocks_per_band
	+ (block_offset % map->blocks_per_band);
    *incr_block_count = band_limit - block_offset;

    /* mark these blocks as written */
    bitmap_set(map->block_bitmap, block_offset, *incr_block_count);

    if (map->next_band > map->shadow_size_bands) {
	map->shadow_size_bands = map->next_band;
	shadow_grew = TRUE;
    }
    return (shadow_grew);
}

boolean_t
shadow_map_is_written(shadow_map_t * map, uint32_t block_offset)
{
    bitmap_offset_t 	b;

    b = bitmap_offset(block_offset);
    return ((map->block_bitmap[b.byte] & bit(b.bit)) ? TRUE : FALSE);
}

/*
 * Function: shadow_map_shadow_size
 *
 * Purpose:
 *   To return the size of the shadow file in blocks.
 */
uint32_t
shadow_map_shadow_size(shadow_map_t * map)
{
    return (map->shadow_size_bands * map->blocks_per_band);
}

/* 
 * Function: shadow_map_create
 *
 * Purpose:
 *   Allocate the dynamic data for keeping track of the shadow dirty blocks
 *   and the band mapping table.
 * Returns:
 *   NULL if an error occurred.
 */
shadow_map_t *
shadow_map_create(off_t file_size, off_t shadow_size, 
		  uint32_t band_size, uint32_t block_size)
{
    void *		block_bitmap = NULL;
    uint32_t		bitmap_size;
    band_number_t *	bands = NULL;
    shadow_map_t *	map;
    uint32_t		n_bands = 0;

    if (band_size == 0) {
	band_size = BAND_SIZE_DEFAULT;
    }

    n_bands = howmany(file_size, band_size);
    if (n_bands > (BAND_MAX + 1)) {
	printf("file is too big: %d > %d\n",
	       n_bands, BAND_MAX);
	goto failure;
    }

    /* create a block bitmap, one bit per block */
    bitmap_size = block_bitmap_size(file_size, block_size);
    block_bitmap = my_malloc(bitmap_size);
    if (block_bitmap == NULL) {
	printf("failed to allocate bitmap\n");
	goto failure;
    }
    bzero(block_bitmap, bitmap_size);

    /* get the band map */
    bands = (band_number_t *)my_malloc(n_bands * sizeof(band_number_t));
    if (bands == NULL) {
	printf("failed to allocate bands\n");
	goto failure;
    }
    bzero(bands, n_bands * sizeof(band_number_t));

    map = my_malloc(sizeof(*map));
    if (map == NULL) {
	printf("failed to allocate map\n");
	goto failure;
    }
    map->blocks_per_band = band_size / block_size;
    map->block_bitmap = block_bitmap;
    map->bands = bands;
    map->file_size_blocks = n_bands * map->blocks_per_band;
    map->next_band = 0;
    map->zeroth_band = -1;
    map->shadow_size_bands = howmany(shadow_size, band_size);
    map->block_size = block_size;
    return (map);
	
 failure:
    if (block_bitmap)
	my_free(block_bitmap);
    if (bands)
	my_free(bands);
    return (NULL);
}

/*
 * Function: shadow_map_free
 * Purpose:
 *   Frees the data structure to deal with the shadow map.
 */
void
shadow_map_free(shadow_map_t * map)
{	
    if (map->block_bitmap)
	my_free(map->block_bitmap);
    if (map->bands)
	my_free(map->bands);
    map->block_bitmap = NULL;
    map->bands = NULL;
    my_free(map);
    return;
}

#ifdef TEST_SHADOW
#define BAND_SIZE_BLOCKS	(BAND_SIZE_DEFAULT / 512)

enum {
    ReadRequest,
    WriteRequest,
};

typedef struct {
    int		type;
    uint32_t	offset;
    uint32_t	count;
} block_request_t;

int
main()
{
    shadow_map_t *	map;
    int 		i;
    block_request_t 	requests[] = {
	{ WriteRequest, BAND_SIZE_BLOCKS * 2, 1 },
	{ ReadRequest, BAND_SIZE_BLOCKS / 2, BAND_SIZE_BLOCKS * 2 - 2 },
	{ WriteRequest, BAND_SIZE_BLOCKS * 1, 5 * BAND_SIZE_BLOCKS + 3},
	{ ReadRequest, 0, BAND_SIZE_BLOCKS * 10 },
	{ WriteRequest, BAND_SIZE_BLOCKS * (BAND_MAX - 1),
	  BAND_SIZE_BLOCKS * 2},
	{ 0, 0 },
    };
    
    map = shadow_map_create(1024 * 1024 * 1024 * 8ULL, 0, 0, 512);
    if (map == NULL) {
	printf("shadow_map_create failed\n");
	exit(1);
    }
    for (i = 0; TRUE; i++) {
	uint32_t		offset;
	uint32_t		resid;
	boolean_t	shadow_grew;
	boolean_t	read_shadow;

    	if (requests[i].count == 0) {
	    break;
	}
	offset = requests[i].offset;
	resid = requests[i].count;
	printf("\n%s REQUEST (%ld, %ld)\n", 
	       requests[i].type == WriteRequest ? "WRITE" : "READ",
	       offset, resid);
	switch (requests[i].type) {
	case WriteRequest:
	    while (resid > 0) {
		uint32_t this_offset;
		uint32_t this_count;
		
		shadow_grew = shadow_map_write(map, offset,
					       resid,
					       &this_offset,
					       &this_count);
		printf("\t(%ld, %ld) => (%ld, %ld)",
		       offset, resid, this_offset, this_count);
		resid -= this_count;
		offset += this_count;
		if (shadow_grew) {
		    printf(" shadow grew to %ld", shadow_map_shadow_size(map));
		}
		printf("\n");
	    }
	    break;
	case ReadRequest:
	    while (resid > 0) {
		uint32_t this_offset;
		uint32_t this_count;
		
		read_shadow = shadow_map_read(map, offset,
					      resid,
					      &this_offset,
					      &this_count);
		printf("\t(%ld, %ld) => (%ld, %ld)%s\n",
		       offset, resid, this_offset, this_count,
		       read_shadow ? " from shadow" : "");
		if (this_count == 0) {
		    printf("this_count is 0, aborting\n");
		    break;
		}
		resid -= this_count;
		offset += this_count;
	    }
	    break;
	default:
	    break;
	}
    }
    if (map) {
	shadow_map_free(map);
    }
    exit(0);
    return (0);
}
#endif