hfs_hotfiles.c   [plain text]


/*
 * Copyright (c) 2003-2015 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <libkern/OSAtomic.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/ubc.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/vm.h>

#include "hfs.h"
#include "hfs_endian.h"
#include "hfs_format.h"
#include "hfs_mount.h"
#include "hfs_hotfiles.h"

#include "BTreeScanner.h"


#define HFC_DEBUG  0
#define HFC_VERBOSE 0


/*
 * Minimum post Tiger base time.
 * Thu Mar 31 17:00:00 2005
 */
#define HFC_MIN_BASE_TIME   0x424c8f00L

/*
 * Hot File List (runtime).
 */
/* One element of the hfl_hotfile[] array in hotfilelist_t. */
typedef struct hotfileinfo {
	u_int32_t  hf_fileid;       /* file ID of the hot file */
	u_int32_t  hf_temperature;  /* temperature recorded for the file */
	u_int32_t  hf_blocks;       /* block count for the file (presumably allocation blocks -- confirm) */
} hotfileinfo_t;

/*
 * Variable-length list of hot files built when a recording period ends.
 * It is allocated as sizeof(hotfilelist_t) plus one extra hotfileinfo_t
 * for every entry beyond the first (hfl_hotfile[1] already provides room
 * for one entry).
 */
typedef struct hotfilelist {
	size_t		  hfl_size;        /* allocation size in bytes; handed back to hfs_free() */
	u_int32_t     hfl_magic;
	u_int32_t     hfl_version;
	time_t        hfl_duration;    /* duration of sample period */
	int           hfl_count;       /* count of hot files recorded */
	int           hfl_next;        /* next file to move */
	int           hfl_totalblocks; /* total hot file blocks */
	int           hfl_reclaimblks; /* blocks to reclaim in HFV */
	u_int32_t     hfl_spare[2];
	hotfileinfo_t hfl_hotfile[1];  /* array of hot files */
} hotfilelist_t;


/*
 * Hot File Entry (runtime).
 */
/*
 * Node of the in-memory binary tree used while recording.  Unused
 * entries are chained through 'right' to form the free list (see the
 * free-list setup in hfs_recording_start()).
 */
typedef struct hotfile_entry {
	struct  hotfile_entry  *left;   /* left child in the tree */
	struct  hotfile_entry  *right;  /* right child in the tree, or next entry on the free list */
	u_int32_t  fileid;              /* file ID being tracked */
	u_int32_t  temperature;         /* temperature recorded for the file */
	u_int32_t  blocks;              /* block count for the file */
} hotfile_entry_t;


//
// We cap the max temperature for non-system files to "MAX_NORMAL_TEMP"
// so that they will always have a lower temperature than system (aka 
// "auto-cached") files.  System files have MAX_NORMAL_TEMP added to
// their temperature which produces two bands of files (all non-system
// files will have a temp less than MAX_NORMAL_TEMP and all system
// files will have a temp greater than MAX_NORMAL_TEMP).
//
// This puts non-system files on the left side of the hotfile btree 
// (and we start evicting from the left-side of the tree).  The idea is 
// that we will evict non-system files more aggressively since their
// working set changes much more dynamically than system files (which 
// are for the most part, static).
//
// NOTE: these values have to fit into a 32-bit int.  We use a
//       value of 1-billion which gives a pretty broad range
//       and yet should not run afoul of any sign issues.
//
#define MAX_NORMAL_TEMP    1000000000
#define HF_TEMP_RANGE      MAX_NORMAL_TEMP


//
// These used to be defines of the hard coded values.  But if
// we're on a cooperative fusion (CF) system we need to change
// the values (which happens in hfs_recording_init()).
// 
uint32_t hfc_default_file_count = 1000;
uint32_t hfc_default_duration   = (3600 * 60);
uint32_t hfc_max_file_count     = 5000;
uint64_t hfc_max_file_size      = (10 * 1024 * 1024);


/*
 * Hot File Recording Data (runtime).
 */
/*
 * Per-mount recording state.  Allocated as a single buffer whose
 * trailing entries[] array holds hfc_maxfiles hotfile_entry_t slots.
 */
typedef struct hotfile_data {
	size_t				size;         /* allocation size in bytes; handed back to hfs_free() */
	struct hfsmount	   *hfsmp;        /* mount this recording belongs to */
	long				refcount;
	u_int32_t			activefiles;  /* active number of hot files */
	u_int32_t			threshold;    /* minimum temperature (set to HFC_MINIMUM_TEMPERATURE) */
	u_int32_t			maxblocks;    /* maximum file size, in blocks (HFC_MAXIMUM_FILESIZE / blockSize) */
	hotfile_entry_t	   *rootentry;    /* root of the tree; NULL when no hot files were recorded */
	hotfile_entry_t	   *freelist;     /* head of the unused-entry list (linked via 'right') */
	hotfile_entry_t	   *coldest;      /* presumably a cached coldest entry -- confirm against hf_coldest() */
	hotfile_entry_t		entries[];    /* storage for all tree entries */
} hotfile_data_t;

static int  hfs_recording_start (struct hfsmount *);
static int  hfs_recording_stop (struct hfsmount *);

/* Hotfiles pinning routines */
static int hfs_getvnode_and_pin (struct hfsmount *hfsmp, uint32_t fileid, uint32_t *pinned);
static int hfs_pin_extent_record (struct hfsmount *hfsmp, HFSPlusExtentRecord extents, uint32_t *pinned);
static int hfs_pin_catalog_rec (struct hfsmount *hfsmp, HFSPlusCatalogFile *cfp, int rsrc);

/*
 * Hot File Data recording functions (in-memory binary tree).
 */
static int               hf_insert (hotfile_data_t *, hotfile_entry_t *);
static void              hf_delete (hotfile_data_t *, u_int32_t, u_int32_t);
static hotfile_entry_t * hf_coldest (hotfile_data_t *);
static hotfile_entry_t * hf_getnewentry (hotfile_data_t *);
static void              hf_getsortedlist (hotfile_data_t *, hotfilelist_t *);

#if HFC_DEBUG
static hotfile_entry_t * hf_lookup (hotfile_data_t *, u_int32_t, u_int32_t);
static void  hf_maxdepth(hotfile_entry_t *, int, int *);
static void  hf_printtree (hotfile_entry_t *);
#endif

/*
 * Hot File misc support functions.
 */
static int  hotfiles_collect (struct hfsmount *);
static int  hotfiles_age (struct hfsmount *);
static int  hotfiles_adopt (struct hfsmount *);
static int  hotfiles_evict (struct hfsmount *, vfs_context_t);
static int  hotfiles_refine (struct hfsmount *);
static int  hotextents(struct hfsmount *, HFSPlusExtentDescriptor *);
static int  hfs_addhotfile_internal(struct vnode *);
static int  hfs_hotfile_cur_freeblks(hfsmount_t *hfsmp);


/*
 * Hot File Cluster B-tree (on disk) functions.
 */
static int  hfc_btree_create (struct hfsmount *, unsigned int, unsigned int);
static int  hfc_btree_open (struct hfsmount *, struct vnode **);
static int  hfc_btree_open_ext(struct hfsmount *hfsmp, struct vnode **vpp, int ignore_btree_errs);
static int  hfc_btree_close (struct hfsmount *, struct vnode *);
static int  hfc_btree_delete_record(struct hfsmount *hfsmp, BTreeIterator *iterator, HotFileKey *key);
static int  hfc_btree_delete(struct hfsmount *hfsmp);
static int  hfc_comparekeys (HotFileKey *, HotFileKey *);


char hfc_tag[] = "CLUSTERED HOT FILES B-TREE     ";


/*
 *========================================================================
 *                       HOT FILE INTERFACE ROUTINES
 *========================================================================
 */

/*
 * Start recording the hottest files on a file system.
 *
 * Requires that the hfc_mutex be held.
 *
 * Any previous recording data and file list are freed, the sample
 * period (hfc_timebase / hfc_timeout) is established -- either resumed
 * from the user data stored in the on-disk hot file B-tree or started
 * fresh -- and a new hotfile_data_t with hfc_maxfiles entries is
 * allocated.
 *
 * Returns:
 *   EPERM   volume is read-only, has no journal, or has no metadata zone
 *   ENOSPC  fewer free blocks than twice hfs_hotfile_maxblks
 *   EBUSY   hfc_stage is not HFC_IDLE
 *   other   error returned by hfc_btree_create()
 *   0       success; hfc_stage is now HFC_RECORDING
 */
static int
hfs_recording_start(struct hfsmount *hfsmp)
{
	hotfile_data_t *hotdata;
	struct timeval tv;
	int maxentries;
	size_t size;
	int i;
	int error;

	if ((hfsmp->hfs_flags & HFS_READ_ONLY) ||
	    (hfsmp->jnl == NULL) ||
	    (hfsmp->hfs_flags & HFS_METADATA_ZONE) == 0) {
		return (EPERM);
	}
	if (HFSTOVCB(hfsmp)->freeBlocks < (2 * (u_int32_t)hfsmp->hfs_hotfile_maxblks)) {
		return (ENOSPC);
	}
	if (hfsmp->hfc_stage != HFC_IDLE) {
		return (EBUSY);
	}
	/* Mark the state machine busy while we set things up. */
	hfsmp->hfc_stage = HFC_BUSY;

	/* Discard anything left over from a previous recording period. */
	if (hfsmp->hfc_recdata) {
		hfs_free(hfsmp->hfc_recdata, hfsmp->hfc_recdata->size);
		hfsmp->hfc_recdata = NULL;
	}
	if (hfsmp->hfc_filelist) {
		hfs_free(hfsmp->hfc_filelist, hfsmp->hfc_filelist->hfl_size);
		hfsmp->hfc_filelist = NULL;
	}

	microtime(&tv);  /* Times are based on GMT time. */

	/*
	 * On first startup check for suspended recording.
	 */
	if (hfsmp->hfc_timebase == 0 &&
	    hfc_btree_open(hfsmp, &hfsmp->hfc_filevp) == 0) {
		HotFilesInfo hotfileinfo;

		/*
		 * The B-tree user data is stored big-endian.  Only resume if
		 * it carries the right magic and non-zero timeleft/timebase.
		 */
		if ((BTGetUserData(VTOF(hfsmp->hfc_filevp), &hotfileinfo,
		                   sizeof(hotfileinfo)) == 0) &&
		    (SWAP_BE32 (hotfileinfo.magic) == HFC_MAGIC) &&
		    (SWAP_BE32 (hotfileinfo.timeleft) > 0) &&
		    (SWAP_BE32 (hotfileinfo.timebase) > 0)) {
			if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
				/* Derive the free count from the saved usedblocks if it is not known yet. */
				if (hfsmp->hfs_hotfile_freeblks == 0) {
					hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks - SWAP_BE32 (hotfileinfo.usedblocks);
				}
				/* Clamped to HFC_DEFAULT_FILE_COUNT further below if it exceeds HFC_MAXIMUM_FILE_COUNT. */
				hfsmp->hfc_maxfiles = 0x7fffffff;
				printf("hfs: %s: %s: hotfile freeblocks: %d, max: %d\n", hfsmp->vcbVN, __FUNCTION__,
				       hfsmp->hfs_hotfile_freeblks, hfsmp->hfs_hotfile_maxblks);
			} else {
				hfsmp->hfc_maxfiles = SWAP_BE32 (hotfileinfo.maxfilecnt);
			}
			hfsmp->hfc_timebase = SWAP_BE32 (hotfileinfo.timebase);
			int timeleft = (int)SWAP_BE32(hotfileinfo.timeleft);
			if (timeleft < 0 || timeleft > (int)(HFC_DEFAULT_DURATION*2)) {
				// in case this field got botched, don't let it screw things up
				// printf("hfs: hotfiles: bogus looking timeleft: %d\n", timeleft);
				timeleft = HFC_DEFAULT_DURATION;
			}
			hfsmp->hfc_timeout = timeleft + tv.tv_sec ;
			/* Fix up any bogus timebase values. */
			if (hfsmp->hfc_timebase < HFC_MIN_BASE_TIME) {
				hfsmp->hfc_timebase = hfsmp->hfc_timeout - HFC_DEFAULT_DURATION;
			}
#if HFC_VERBOSE
			printf("hfs: Resume recording hot files on %s (%d secs left (%d); timeout %ld)\n",
			       hfsmp->vcbVN, SWAP_BE32 (hotfileinfo.timeleft), timeleft, hfsmp->hfc_timeout - tv.tv_sec);
#endif
		} else {
			/* No usable saved state: start a brand new sample period. */
			hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT;
			hfsmp->hfc_timebase = tv.tv_sec + 1;
			hfsmp->hfc_timeout = hfsmp->hfc_timebase + HFC_DEFAULT_DURATION;
		}
		/* The B-tree was only opened to read its user data. */
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		hfsmp->hfc_filevp = NULL;
	} else {
		struct cat_attr cattr;
		u_int32_t cnid;

		/*
		 * Make sure a btree file exists.
		 *
		 * NOTE(review): the create is attempted only when cnid == 0
		 * AND cattr.ca_mode is not a regular file.  Whether cattr is
		 * filled in when GetFileInfo() returns 0 cannot be told from
		 * here -- confirm this condition is what was intended.
		 */
		cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, HFC_FILENAME, &cattr, NULL);
		if ((cnid == 0) &&
		    !S_ISREG(cattr.ca_mode) &&
		    (error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT))) {
			/* Creation failed: go back to idle and wake anyone waiting on the stage. */
			hfsmp->hfc_stage = HFC_IDLE;
			wakeup((caddr_t)&hfsmp->hfc_stage);
			return (error);
		}
#if HFC_VERBOSE
		/* Note: hfc_maxfiles is printed here before it is assigned just below. */
		printf("hfs: begin recording hot files on %s (hotfile start/end block: %d - %d; max/free: %d/%d; maxfiles: %d)\n",
		       hfsmp->vcbVN,
		       hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end,
		       hfsmp->hfs_hotfile_maxblks, hfsmp->hfs_hotfile_freeblks, hfsmp->hfc_maxfiles);
#endif
		hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT;
		hfsmp->hfc_timeout = tv.tv_sec + HFC_DEFAULT_DURATION;

		/* Reset time base.  */
		if (hfsmp->hfc_timebase == 0) {
			hfsmp->hfc_timebase = tv.tv_sec + 1;
		} else {
			time_t cumulativebase;

			/* Never let the base fall more than HFC_CUMULATIVE_CYCLES periods behind the timeout. */
			cumulativebase = hfsmp->hfc_timeout - (HFC_CUMULATIVE_CYCLES * HFC_DEFAULT_DURATION);
			hfsmp->hfc_timebase = MAX(hfsmp->hfc_timebase, cumulativebase);
		}
	}

	/* Sanitize the file count before using it to size the allocation. */
	if ((hfsmp->hfc_maxfiles == 0) ||
	    (hfsmp->hfc_maxfiles > HFC_MAXIMUM_FILE_COUNT)) {
		hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT;
	}
	maxentries = hfsmp->hfc_maxfiles;

	size = sizeof(hotfile_data_t) + maxentries * sizeof(hotfile_entry_t);
	hotdata = hfs_mallocz(size);
	hotdata->size = size;

	/* Chain every entry to its successor through 'right' to build the free list. */
	for (i = 1; i < maxentries ; i++)
		hotdata->entries[i-1].right = &hotdata->entries[i];
	
	hotdata->freelist = &hotdata->entries[0];
	/* 
	 * Establish minimum temperature and maximum file size.
	 */
	hotdata->threshold = HFC_MINIMUM_TEMPERATURE;
	hotdata->maxblocks = HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize;
	hotdata->hfsmp = hfsmp;
	
	/* Publish the recording state and wake anyone waiting on the stage. */
	hfsmp->hfc_recdata = hotdata;
	hfsmp->hfc_stage = HFC_RECORDING;
	wakeup((caddr_t)&hfsmp->hfc_stage);
	return (0);
}

/*
 * Stop recording the hottest files on a file system.
 *
 * Requires that the hfc_mutex be held.
 *
 * Collects the final samples, converts the in-memory tree into a sorted
 * hot file list (hfsmp->hfc_filelist), ages the previously clustered
 * files and decides which stage comes next:
 *
 *   HFC_EVICTION  space must be reclaimed before adopting
 *   HFC_ADOPTION  there is room for the new hot files
 *   HFC_IDLE      nothing was recorded, nothing to adopt, or an error occurred
 *
 * The recording data (hfc_recdata) is always freed.  On HFC_EVICTION and
 * HFC_ADOPTION the hot file B-tree is left open in hfsmp->hfc_filevp.
 *
 * Returns EPERM if not currently recording, 0 on success, or the error
 * from hfc_btree_open() / hotfiles_age() / hotfiles_refine().
 */
static int
hfs_recording_stop(struct hfsmount *hfsmp)
{
	hotfile_data_t *hotdata;
	hotfilelist_t  *listp;
	struct timeval tv;
	size_t  size;
	enum hfc_stage newstage = HFC_IDLE;
	int  error = 0;

	if (hfsmp->hfc_stage != HFC_RECORDING)
		return (EPERM);

	hfsmp->hfc_stage = HFC_BUSY;

	hotfiles_collect(hfsmp);


	/*
	 * Convert hot file data into a simple file id list....
	 *
	 * then dump the sample data
	 */
#if HFC_VERBOSE
	printf("hfs: end of hot file recording on %s\n", hfsmp->vcbVN);
#endif
	hotdata = hfsmp->hfc_recdata;
	if (hotdata == NULL) {
		/*
		 * Nothing to evaluate.  Don't leave the stage stuck at
		 * HFC_BUSY: that would make every later start fail with
		 * EBUSY and never wake threads sleeping on hfc_stage.
		 */
		hfsmp->hfc_stage = newstage;
		wakeup((caddr_t)&hfsmp->hfc_stage);
		return (0);
	}
	hfsmp->hfc_recdata = NULL;
	hfsmp->hfc_stage = HFC_EVALUATION;
	wakeup((caddr_t)&hfsmp->hfc_stage);

#if HFC_VERBOSE
	printf("hfs:   curentries: %d\n", hotdata->activefiles);
#endif
	/*
	 * If no hot files recorded then we're done.
	 */
	if (hotdata->rootentry == NULL) {
		error = 0;
		goto out;
	}

	/* Open the B-tree file for writing... */
	if (hfsmp->hfc_filevp)
		panic("hfs_recording_stop: hfc_filevp exists (vp = %p)", hfsmp->hfc_filevp);

	error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp);
	if (error) {
		goto out;
	}

	/*
	 * Age the previous set of clustered hot files.
	 */
	error = hotfiles_age(hfsmp);
	if (error) {
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		hfsmp->hfc_filevp = NULL;
		goto out;
	}

	/*
	 * Create a sorted list of hottest files.  hotfilelist_t already
	 * has room for one entry, hence the "- 1".
	 */
	size = sizeof(hotfilelist_t);
	size += sizeof(hotfileinfo_t) * (hotdata->activefiles - 1);
	listp = hfs_mallocz(size);
	listp->hfl_size = size;

	hf_getsortedlist(hotdata, listp);	/* NOTE: destroys hot file tree! */
	microtime(&tv);
	listp->hfl_duration = tv.tv_sec - hfsmp->hfc_timebase;
	hfs_assert(!hfsmp->hfc_filelist);
	hfsmp->hfc_filelist = listp;

	/*
	 * Account for duplicates.
	 */
	error = hotfiles_refine(hfsmp);
	if (error) {
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		hfsmp->hfc_filevp = NULL;
		goto out;
	}

	/*
	 * Compute the amount of space to reclaim...
	 *
	 * NOTE(review): the comparison uses hfs_hotfile_cur_freeblks()
	 * while the subtraction uses the raw hfs_hotfile_freeblks field;
	 * confirm that the difference is intentional.
	 */
	if (listp->hfl_totalblocks > hfs_hotfile_cur_freeblks(hfsmp)) {
		listp->hfl_reclaimblks =
			MIN(listp->hfl_totalblocks, hfsmp->hfs_hotfile_maxblks) -
			hfsmp->hfs_hotfile_freeblks;
#if HFC_VERBOSE
		printf("hfs_recording_stop: need to reclaim %d blocks\n", listp->hfl_reclaimblks);
#endif
		if (listp->hfl_reclaimblks)
			newstage = HFC_EVICTION;
		else
			newstage = HFC_ADOPTION;
	} else {
		newstage = HFC_ADOPTION;
	}
	
	/* Nothing to adopt: close the B-tree and go straight back to idle. */
	if (newstage == HFC_ADOPTION && listp->hfl_totalblocks == 0) {
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		hfsmp->hfc_filevp = NULL;
		newstage = HFC_IDLE;
	}
out:
#if HFC_VERBOSE
	if (newstage == HFC_EVICTION)
		printf("hfs: evicting coldest files\n");
	else if (newstage == HFC_ADOPTION)
		printf("hfs: adopting hotest files\n");
#endif
	hfs_free(hotdata, hotdata->size);

	hfsmp->hfc_stage = newstage;
	wakeup((caddr_t)&hfsmp->hfc_stage);
	return (error);
}

/*
 * Write the current recording parameters into the user-data area of the
 * open hot file B-tree (hfsmp->hfc_filevp).  All 32-bit fields are
 * stored big-endian.
 *
 * The caller is expected to have the B-tree open; the only caller in
 * view also holds the B-tree cnode lock inside a transaction.
 */
static void
save_btree_user_info(struct hfsmount *hfsmp)
{
	HotFilesInfo hotfileinfo;
	struct timeval tv;

	/*
	 * Only one of usedblocks / maxfilecnt is assigned below, so start
	 * from a zeroed record rather than writing whatever happened to be
	 * on the kernel stack out to disk.
	 */
	bzero(&hotfileinfo, sizeof(hotfileinfo));

	microtime(&tv);
	hotfileinfo.magic       = SWAP_BE32 (HFC_MAGIC);
	hotfileinfo.version     = SWAP_BE32 (HFC_VERSION);
	hotfileinfo.duration    = SWAP_BE32 (HFC_DEFAULT_DURATION);
	hotfileinfo.timebase    = SWAP_BE32 (hfsmp->hfc_timebase);
	/* Seconds remaining in the current sample period. */
	hotfileinfo.timeleft    = SWAP_BE32 (hfsmp->hfc_timeout - tv.tv_sec);
	hotfileinfo.threshold   = SWAP_BE32 (HFC_MINIMUM_TEMPERATURE);
	hotfileinfo.maxfileblks = SWAP_BE32 (HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize);
	if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
		/* Pinning mode tracks block usage instead of a file count. */
		hotfileinfo.usedblocks = SWAP_BE32 (hfsmp->hfs_hotfile_maxblks - hfs_hotfile_cur_freeblks(hfsmp));
#if HFC_VERBOSE
		printf("hfs: %s: saving usedblocks = %d (timeleft: %d; timeout %ld)\n", hfsmp->vcbVN, (hfsmp->hfs_hotfile_maxblks - hfsmp->hfs_hotfile_freeblks),
		       SWAP_BE32(hotfileinfo.timeleft), hfsmp->hfc_timeout);
#endif
	} else {
		hotfileinfo.maxfilecnt  = SWAP_BE32 (HFC_DEFAULT_FILE_COUNT);
	}
	strlcpy((char *)hotfileinfo.tag, hfc_tag, sizeof hotfileinfo.tag);
	(void) BTSetUserData(VTOF(hfsmp->hfc_filevp), &hotfileinfo, sizeof(hotfileinfo));
}

/*
 * Suspend recording the hottest files on a file system.
 *
 * If a recording is in progress, the recording parameters are saved in
 * the hot file B-tree's user data (inside a transaction, with the
 * B-tree cnode locked) so that recording can resume later.  In every
 * case that gets past the initial HFC_DISABLED check, any open B-tree
 * is closed, the recording data is freed and the stage becomes
 * HFC_DISABLED.
 *
 * Returns 0, or the error from hfc_btree_open().
 */
int
hfs_recording_suspend(struct hfsmount *hfsmp)
{
	hotfile_data_t *recdata;
	int  error = 0;

	if (hfsmp->hfc_stage == HFC_DISABLED)
		return (0);

	lck_mtx_lock(&hfsmp->hfc_mutex);

	/*
	 * XXX NOTE
	 * A suspend can occur during eval/evict/adopt stage.
	 * In that case we would need to write out info and
	 * flush our HFBT vnode. Currently we just bail.
	 */
	recdata = hfsmp->hfc_recdata;

	if (recdata != NULL && hfsmp->hfc_stage == HFC_RECORDING) {
		hfsmp->hfc_stage = HFC_BUSY;

#if HFC_VERBOSE
		printf("hfs: suspend hot file recording on %s\n", hfsmp->vcbVN);
#endif
		error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp);
		if (error) {
			printf("hfs_recording_suspend: err %d opening btree\n", error);
		} else if (hfs_start_transaction(hfsmp) == 0) {
			/* Saving is best effort: a failed lock just skips the write. */
			if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) == 0) {
				save_btree_user_info(hfsmp);
				hfs_unlock(VTOC(hfsmp->hfc_filevp));
			}
			hfs_end_transaction(hfsmp);
		}
	}

	/* Common teardown, reached on every path once the mutex is held. */
	if (hfsmp->hfc_filevp) {
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		hfsmp->hfc_filevp = NULL;
	}
	if (recdata) {
		hfs_free(recdata, recdata->size);
		hfsmp->hfc_recdata = NULL;
	}
	hfsmp->hfc_stage = HFC_DISABLED;
	wakeup((caddr_t)&hfsmp->hfc_stage);

	lck_mtx_unlock(&hfsmp->hfc_mutex);
	return (error);
}


static void
reset_file_ids(struct hfsmount *hfsmp, uint32_t *fileid_table, int num_ids)
{
	int i, error;

	for(i=0; i < num_ids; i++) {
		struct vnode *vp;

		error = hfs_vget(hfsmp, fileid_table[i], &vp, 0, 0);
		if (error) {
			if (error == ENOENT) {
				error = 0;
				continue;  /* stale entry, go to next */
			}
			continue;
		}

		// hfs_vget returns a locked cnode so no need to lock here

		if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask)) {
			error = hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, NULL);
		}

		/*
		 * The updates to the catalog must be journaled
		 */
		hfs_start_transaction(hfsmp);

		//
		// turn off _all_ the hotfile related bits since we're resetting state
		//
		if (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevCandidateMask) {
			vnode_clearfastdevicecandidate(vp);
		}

		VTOC(vp)->c_attr.ca_recflags &= ~(kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask|kHFSFastDevCandidateMask|kHFSAutoCandidateMask);
		VTOC(vp)->c_flag |= C_MODIFIED;

		hfs_update(vp, 0);

		hfs_end_transaction(hfsmp);
		
		hfs_unlock(VTOC(vp));
		vnode_put(vp);
	}
}

static int
flag_hotfile(struct hfsmount *hfsmp, const char *filename)
{
	struct vnode *dvp = NULL, *fvp = NULL;
	vfs_context_t ctx = vfs_context_kernel();
	int  error=0;
	size_t fname_len;
	const char *orig_fname = filename;
	
	if (filename == NULL) {
		return EINVAL;
	}

	fname_len = strlen(filename);    // do NOT include the trailing '\0' so that we break out of the loop below
	
	error = hfs_vfs_root(HFSTOVFS(hfsmp), &dvp, ctx);
	if (error) {
		return (error);
	}

	/* At this point, 'dvp' must be considered iocounted */
	const char *ptr;
	ptr = filename;

	while (ptr < (orig_fname + fname_len - 1)) {
		for(; ptr < (orig_fname + fname_len) && *ptr && *ptr != '/'; ptr++) {
			/* just keep advancing till we reach the end of the string or a slash */
		}

		struct componentname cname = {
			.cn_nameiop = LOOKUP,
			.cn_flags 	= ISLASTCN,
			.cn_pnbuf 	= __DECONST(char *, orig_fname),
			.cn_nameptr = __DECONST(char *, filename),
			.cn_pnlen 	= fname_len,
			.cn_namelen = ptr - filename
		};

        struct vnop_lookup_args ap = {
            .a_dvp      = dvp,
            .a_vpp      = &fvp,
            .a_cnp      = &cname,
            .a_context  = ctx
        };

        error = hfs_vnop_lookup(&ap);
		if (error) {
			/*
			 * If 'dvp' is non-NULL, then it has an iocount.  Make sure to release it
			 * before bailing out.  VNOP_LOOKUP could legitimately return ENOENT
			 * if the item didn't exist or if we raced with a delete.
			 */
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}
			return error;
		}

		if (ptr < orig_fname + fname_len - 1) {
			//
			// we've got a multi-part pathname so drop the ref on the dir,
			// make dvp become what we just looked up, and advance over
			// the slash character in the pathname to get to the next part
			// of the component
			//
			vnode_put(dvp);
			dvp = fvp;
			fvp = NULL;

			filename = ++ptr;   // skip the slash character
		}
	}
	
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	struct cnode *cp = VTOC(fvp);
	if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) != 0) {
		goto out;
	}

	hfs_start_transaction(hfsmp);
	
	cp->c_attr.ca_recflags |= (kHFSFastDevCandidateMask|kHFSAutoCandidateMask);
	cp->c_flag |= C_MODIFIED;

	hfs_update(fvp, 0);

	hfs_end_transaction(hfsmp);

	hfs_unlock(cp);
	//printf("hfs: flagged /%s with the fast-dev-candidate|auto-candidate flags\n", filename);


out:
	if (fvp) {
		vnode_put(fvp);
		fvp = NULL;
	}

	if (dvp) {
		vnode_put(dvp);
		dvp = NULL;
	}

	return error;
}


/*
 * Flag a fixed set of root-relative paths as hot file candidates by
 * calling flag_hotfile() on each one.  Failures for individual paths
 * are ignored.
 */
static void
hfs_setup_default_cf_hotfiles(struct hfsmount *hfsmp)
{
	const char *default_paths[] = {
		"usr",
		"System",
		"Applications",
		"private/var/db/dyld"
	};
	const int path_count = (int)(sizeof(default_paths) / sizeof(default_paths[0]));
	int idx = 0;

	while (idx < path_count) {
		flag_hotfile(hfsmp, default_paths[idx]);
		idx++;
	}
}


#define NUM_FILE_RESET_IDS   4096    // so we allocate 16k to hold file-ids

static void
hfs_hotfile_reset(struct hfsmount *hfsmp)
{
	CatalogKey * keyp;
	CatalogRecord * datap;
	u_int32_t  dataSize;
	BTScanState scanstate;
	BTreeIterator * iterator = NULL;
	FSBufferDescriptor  record;
	u_int32_t  data;
	u_int32_t  cnid;
	int error = 0;
	uint32_t *fileids=NULL;
	int cur_id_index = 0;

	int cleared = 0;  /* debug variables */
	int filecount = 0;
	int dircount = 0;

#if HFC_VERBOSE
	printf("hfs: %s: %s\n", hfsmp->vcbVN, __FUNCTION__);
#endif

	iterator = hfs_mallocz(sizeof(*iterator));

	fileids = hfs_malloc(NUM_FILE_RESET_IDS * sizeof(uint32_t));

	record.bufferAddress = &data;
	record.itemSize = sizeof(u_int32_t);
	record.itemCount = 1;

	/*
	 * Get ready to scan the Catalog file.
	 */
	error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0,
	                       kCatSearchBufferSize, &scanstate);
	if (error) {
		printf("hfs_hotfile_reset: err %d BTScanInit\n", error);
		goto out;
	}

	/*
	 * Visit all the catalog btree leaf records, clearing any that have the
	 * HotFileCached bit set.
	 */
	for (;;) {
		error = BTScanNextRecord(&scanstate, 0, (void **)&keyp, (void **)&datap, &dataSize);
		if (error) {
			if (error == btNotFound)
				error = 0;
			else
				printf("hfs_hotfile_reset: err %d BTScanNext\n", error);
			break;
		}

		if (datap->recordType == kHFSPlusFolderRecord && (dataSize == sizeof(HFSPlusCatalogFolder))) {
			HFSPlusCatalogFolder *dirp = (HFSPlusCatalogFolder *)datap;

			dircount++;
		
			if ((dirp->flags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask|kHFSFastDevCandidateMask|kHFSAutoCandidateMask)) == 0) {
				continue;
			}

			cnid = dirp->folderID;
		} else if ((datap->recordType == kHFSPlusFileRecord) && (dataSize == sizeof(HFSPlusCatalogFile))) {
			HFSPlusCatalogFile *filep = (HFSPlusCatalogFile *)datap;   

			filecount++;

			/*
			 * If the file doesn't have any of the HotFileCached bits set, ignore it.
			 */
			if ((filep->flags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask|kHFSFastDevCandidateMask|kHFSAutoCandidateMask)) == 0) {
				continue;
			}

			cnid = filep->fileID;
		} else {
			continue;
		}

		/* Skip over journal files. */
		if (cnid == hfsmp->hfs_jnlfileid || cnid == hfsmp->hfs_jnlinfoblkid) {
			continue;
		}

		//
		// Just record the cnid of the file for now.  We will modify it separately
		// because we can't modify the catalog while we're scanning it.
		//
		fileids[cur_id_index++] = cnid;
		if (cur_id_index >= NUM_FILE_RESET_IDS) {
			//
			// We're over the limit of file-ids so we have to terminate this
			// scan, go modify all the catalog records, then restart the scan.
			// This is required because it's not permissible to modify the
			// catalog while scanning it.
			//
			(void) BTScanTerminate(&scanstate, &data, &data, &data);

			reset_file_ids(hfsmp, fileids, cur_id_index);
			cleared += cur_id_index;
			cur_id_index = 0;

			// restart the scan
			error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0,
						 kCatSearchBufferSize, &scanstate);
			if (error) {
				printf("hfs_hotfile_reset: err %d BTScanInit\n", error);
				goto out;
			}
			continue;
		}
	}

	if (cur_id_index) {
		reset_file_ids(hfsmp, fileids, cur_id_index);
		cleared += cur_id_index;
		cur_id_index = 0;
	}

	printf("hfs: cleared HotFileCache related bits on %d files out of %d (dircount %d)\n", cleared, filecount, dircount);

	(void) BTScanTerminate(&scanstate, &data, &data, &data);

out:	
	hfs_free(fileids, NUM_FILE_RESET_IDS * sizeof(uint32_t));
	hfs_free(iterator, sizeof(*iterator));

	//
	// If the hotfile btree exists, delete it.  We need to open
	// it to be able to delete it because we need the hfc_filevp
	// for deletion.
	//
	error = hfc_btree_open_ext(hfsmp, &hfsmp->hfc_filevp, 1);
	if (!error) {
		printf("hfs: hotfile_reset: deleting existing hotfile btree\n");
		hfc_btree_delete(hfsmp);
	}
	
	if (hfsmp->hfc_filevp) {
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		hfsmp->hfc_filevp = NULL;
	}

	hfsmp->hfs_hotfile_blk_adjust = 0;
	hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks;
}


//
// This should ONLY be called by hfs_recording_init() and the special fsctl.
//
// We assume that the hotfile btree is already opened.
//
// Walks the hot file B-tree from its first record and re-pins the blocks
// of every file it names, stopping at the first thread record
// (temperature == HFC_LOOKUPTAG).  Entries whose file can no longer be
// found in the catalog are deleted from the hot file B-tree.  When the
// walk finishes, hfs_hotfile_freeblks is recomputed from the number of
// blocks that were actually pinned.
//
// hfc_stage is set to HFC_BUSY for the duration of the walk and restored
// (with a wakeup) before returning.
//
// Returns 0 when the volume does not pin hot files or the B-tree is not
// open, EPERM if the B-tree cnode cannot be locked, EFTYPE if a record
// with an unexpected key length is encountered, and 0 otherwise.
//
static int
hfs_hotfile_repin_files(struct hfsmount *hfsmp)
{
	BTreeIterator * iterator = NULL;
	HotFileKey * key;
	filefork_t * filefork;
	int  error = 0;
	int  bt_op;
	enum hfc_stage stage;
	uint32_t pinned_blocks;
	uint32_t num_files=0, nrsrc=0;
	uint32_t total_pinned=0;

	if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) || !hfsmp->hfc_filevp) {
		//
		// this is only meaningful if we're pinning hotfiles
		// (as opposed to the regular form of hotfiles that
		// get relocated to the hotfile zone)
		//
		return 0;
	}

#if HFC_VERBOSE
	printf("hfs: %s: %s\n", hfsmp->vcbVN, __FUNCTION__);
#endif
	
	if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) {
		return (EPERM);
	}


	iterator = hfs_mallocz(sizeof(*iterator));

	/* Remember the current stage so it can be restored on the way out. */
	stage = hfsmp->hfc_stage;
	hfsmp->hfc_stage = HFC_BUSY;

	bt_op = kBTreeFirstRecord;

	/* 'key' aliases the iterator's key, so it tracks the current record. */
	key = (HotFileKey*) &iterator->key;

	filefork = VTOF(hfsmp->hfc_filevp);
	int lockflags;

	while (1) {

		lockflags = 0;
		/*
		 * Obtain the first record (ie the coldest one) on the first
		 * pass, and the next record on every later pass.
		 */
		if (BTIterateRecord(filefork, bt_op, iterator, NULL, NULL) != 0) {
			// no more records
			error = 0;
			break;
		}
		if (key->keyLength != HFC_KEYLENGTH) {
			// printf("hfs: hotfiles_repin_files: invalid key length %d\n", key->keyLength);
			error = EFTYPE;
			break;
		}		
		if (key->temperature == HFC_LOOKUPTAG) {
			// ran into thread records in the hotfile btree
			error = 0;
			break;
		}

        //
		// Just lookup the records in the catalog and pin the direct
		// mapped extents.  Faster than instantiating full vnodes
		// (and thereby thrashing the system vnode cache).
		//
		struct cat_desc fdesc;
		struct cat_attr attr;
		struct cat_fork fork;
        uint8_t forktype = 0;   /* changed to 0xff below when the resource fork is used */

		lockflags = hfs_systemfile_lock(hfsmp, (SFL_CATALOG | SFL_EXTENTS), HFS_SHARED_LOCK);
        /*
         * Snoop the cnode hash to find out if the item we want is in-core already.
         *
         * We largely expect this function to fail (the items we want are probably not in the hash).
         * we use the special variant which bails out as soon as it finds a vnode (even if it is
         * marked as open-unlinked or actually removed on-disk.  If we find a vnode, then we
         * release the systemfile locks and go through the pin-vnode path instead.
         */
        if (hfs_chash_snoop (hfsmp, key->fileID, 1, NULL, NULL) == 0) {
            pinned_blocks = 0;

            /* unlock immediately and go through the in-core path */
            hfs_systemfile_unlock(hfsmp, lockflags);
			lockflags = 0;

            error = hfs_getvnode_and_pin (hfsmp, key->fileID, &pinned_blocks);
            if (error) {
                /* if ENOENT, then it was deleted in the catalog. Remove from our hotfiles tracking */
                if (error == ENOENT) {
                    hfc_btree_delete_record(hfsmp, iterator, key);
                }
                /* other errors, just ignore and move on with life */
            }
            else { //!error
                total_pinned += pinned_blocks;
                num_files++;
            }

            goto next;
        }

        /* If we get here, we're still holding the systemfile locks */
		error = cat_idlookup(hfsmp, key->fileID, 1, 0, &fdesc, &attr, &fork);
		if (error) {
			//
			// this file system could have been mounted while booted from a
			// different partition and thus the hotfile btree would not have
			// been maintained.  thus a file that was hotfile cached could
			// have been deleted while booted from a different partition which
			// means we need to delete it from the hotfile btree.
			//
			// block accounting is taken care of at the end: we re-assign
			// hfsmp->hfs_hotfile_freeblks based on how many blocks we actually
			// pinned.
			//
			hfc_btree_delete_record(hfsmp, iterator, key);

			goto next;
		}

		if (fork.cf_size == 0) {
			// hmmm, the data is probably in the resource fork (aka a compressed file)
			// (presumably the fourth argument selects the resource fork -- confirm)
			error = cat_idlookup(hfsmp, key->fileID, 1, 1, &fdesc, &attr, &fork);
			if (error) {
				hfc_btree_delete_record(hfsmp, iterator, key);
				goto next;
			}
            forktype = 0xff;
			nrsrc++;
		}

		pinned_blocks = 0;

        /* Can't release the catalog /extents lock yet, we may need to go find the overflow blocks */
        error = hfs_pin_extent_record (hfsmp, fork.cf_extents, &pinned_blocks);
        if (error) {
            goto next;  //skip to next
        }
		/* add in the blocks from the inline 8 */
        total_pinned += pinned_blocks;
        pinned_blocks = 0;

        /* Could this file have overflow extents? (last inline extent slot is in use) */
        if (fork.cf_extents[kHFSPlusExtentDensity-1].startBlock) {
            /* better pin them, too */
            error = hfs_pin_overflow_extents (hfsmp, key->fileID, forktype, &pinned_blocks);
            if (error) {
				/* If we fail to pin all of the overflow extents, then just skip to the next file */
				/* (the inline blocks were already added to total_pinned; the file is not counted) */
                goto next;
            }
        }

		num_files++;
        if (pinned_blocks) {
            /* now add in any overflow also */
            total_pinned += pinned_blocks;
        }

	next:
		/* Drop the system file locks if this pass still holds them. */
		if (lockflags) {
			hfs_systemfile_unlock(hfsmp, lockflags);
			lockflags = 0;
		}
		bt_op = kBTreeNextRecord;

	} /* end while */

#if HFC_VERBOSE
	printf("hfs: hotfiles_repin_files: re-pinned %d files (nrsrc %d, total pinned %d blks; freeblock %d, maxblocks %d, calculated free: %d)\n",
	       num_files, nrsrc, total_pinned, hfsmp->hfs_hotfile_freeblks, hfsmp->hfs_hotfile_maxblks,
	      hfsmp->hfs_hotfile_maxblks - total_pinned);
#endif
	//
	// make sure this is accurate based on how many blocks we actually pinned
	//
	hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks - total_pinned;

	hfs_unlock(VTOC(hfsmp->hfc_filevp));

	hfs_free(iterator, sizeof(*iterator));	
	/* Restore the stage that was in effect on entry and wake any waiters. */
	hfsmp->hfc_stage = stage;
	wakeup((caddr_t)&hfsmp->hfc_stage);
	return (error);
}

/*
 * Re-pin the hot file B-tree itself and then every file recorded in it.
 *
 * Takes hfc_mutex for the duration.  If the B-tree is not already open
 * it is opened here and closed again before returning; if it cannot be
 * opened a message is logged and nothing is pinned.
 */
void
hfs_repin_hotfiles(struct hfsmount *hfsmp)
{
	int opened_here = 0;

	lck_mtx_lock(&hfsmp->hfc_mutex);

	if (hfsmp->hfc_filevp == NULL) {
		int open_err = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp);

		if (open_err) {
			printf("hfs: failed to open the btree err=%d.  Unable to re-pin hotfiles.\n", open_err);
			lck_mtx_unlock(&hfsmp->hfc_mutex);
			return;
		}
		/* We opened it, so we are responsible for closing it. */
		opened_here = 1;
	}

	hfs_pin_vnode(hfsmp, hfsmp->hfc_filevp, HFS_PIN_IT, NULL);

	hfs_hotfile_repin_files(hfsmp);

	if (opened_here) {
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		hfsmp->hfc_filevp = NULL;
	}

	lck_mtx_unlock(&hfsmp->hfc_mutex);
}

/*
 * For a given file ID, find and pin all of its overflow extents to the underlying CS
 * device.  Assumes that the extents overflow b-tree is locked for the duration of this call.
 *
 * Emit the number of blocks pinned in output argument 'pinned'
 *
 * Return success or failure (errno) in return value.
 *
 */
int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid,
                                     uint8_t forktype, uint32_t *pinned) {

    struct BTreeIterator *ext_iter = NULL;
    ExtentKey *ext_key_ptr = NULL;
    ExtentRecord ext_data;
    FSBufferDescriptor btRecord;
    uint16_t btRecordSize;
    int error = 0;

    uint32_t pinned_blocks = 0;


    ext_iter = hfs_mallocz(sizeof (*ext_iter));

    BTInvalidateHint (ext_iter);
    ext_key_ptr = (ExtentKey*)&ext_iter->key;
    btRecord.bufferAddress = &ext_data;
    btRecord.itemCount = 1;

    /*
     * This is like when you delete a file; we don't actually need most of the search machinery because
     * we are going to need all of the extent records that belong to this file (for a given fork type),
     * so we might as well use a straight-up iterator.
     *
     * Position the B-Tree iterator at the first record with this file ID
     */
    btRecord.itemSize = sizeof (HFSPlusExtentRecord);
    ext_key_ptr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength;
    ext_key_ptr->hfsPlus.forkType = forktype;
    ext_key_ptr->hfsPlus.pad = 0;
    ext_key_ptr->hfsPlus.fileID = fileid;
    ext_key_ptr->hfsPlus.startBlock = 0;

    error = BTSearchRecord (VTOF(hfsmp->hfs_extents_vp), ext_iter, &btRecord, &btRecordSize, ext_iter);
    if (error ==  btNotFound) {
        /* empty b-tree, so that's ok. we'll fall out during error check below. */
        error = 0;
    }

    while (1) {
        uint32_t found_fileid;
        uint32_t pblocks;

        error = BTIterateRecord (VTOF(hfsmp->hfs_extents_vp), kBTreeNextRecord, ext_iter, &btRecord, &btRecordSize);
        if (error) {
            /* swallow it if it's btNotFound, otherwise just bail out */
            if (error == btNotFound)
                error = 0;
            break;
        }

        found_fileid = ext_key_ptr->hfsPlus.fileID;
        /*
         * We only do one fork type at a time. So if either the fork-type doesn't
         * match what we are looking for (resource or data), OR the file id doesn't match
         * which indicates that there's nothing more with this file ID as the key, then bail out
         */
        if ((found_fileid != fileid) || (ext_key_ptr->hfsPlus.forkType != forktype))  {
            error = 0;
            break;
        }

        /* Otherwise, we now have an extent record. Process and pin all of the file extents. */
        pblocks = 0;
        error = hfs_pin_extent_record (hfsmp, ext_data.hfsPlus, &pblocks);

        if (error) {
            break;
        }
        pinned_blocks += pblocks;

        /* if 8th extent is empty, then bail out */
        if (ext_data.hfsPlus[kHFSPlusExtentDensity-1].startBlock == 0) {
            error = 0;
            break;
        }

    } // end extent-getting loop

    /* dump the iterator */
    hfs_free(ext_iter, sizeof(*ext_iter));

    if (error == 0) {
        /*
         * In the event that the file has no overflow extents, pinned_blocks
         * will never be updated, so we'll properly export 0 pinned blocks to caller
         */
        *pinned = pinned_blocks;
    }

    return error;

}


/*
 * Look up the vnode for 'fileid' and pin it with hfs_pin_vnode().
 *
 * *pinned is zeroed on entry and receives the pinned block count only
 * when the pin succeeds.
 *
 * Returns the hfs_vget() error if the vnode cannot be obtained, EPERM if
 * the vnode is not a regular file, EINVAL if the cnode does not carry
 * kHFSFastDevPinnedMask, and otherwise the result of hfs_pin_vnode().
 */
static int
hfs_getvnode_and_pin (struct hfsmount *hfsmp, uint32_t fileid, uint32_t *pinned) {
    struct vnode *vp;
    uint32_t blocks_pinned;
    int status;

    *pinned = 0;

    /*
     * Acquire the vnode for this file; on success the cnode comes back
     * locked.  An open-unlinked file may fail here with ENOENT.
     */
    status = hfs_vget(hfsmp, fileid, &vp, 0, 0);
    if (status != 0) {
        return status;
    }

    if (!vnode_isreg(vp)) {
        /*
         * Symlinks placed in the hotfile zone by a previous OS are stuck
         * there; we do not want to move them.
         */
        status = EPERM;
    } else if ((VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask) == 0) {
        status = EINVAL;
    } else {
        status = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT, &blocks_pinned);
        if (status == 0) {
            *pinned = blocks_pinned;
        }
    }

    /* Every path past a successful hfs_vget() drops the lock and the iocount. */
    hfs_unlock(VTOC(vp));
    vnode_put(vp);

    return status;
}

/*
 * Pin every extent of one HFS Plus extent record via hfs_pin_block_range().
 * The caller is expected to hold the Catalog & Extents overflow B-tree
 * locks, as needed.
 *
 * Extents are processed in order until an entry with startBlock == 0 is
 * reached, all kHFSPlusExtentDensity entries have been handled, or a pin
 * request fails.
 *
 * 'pinned' receives the sum of the block counts of the extents that were
 * pinned successfully (also on failure).
 *
 * Returns EINVAL if 'pinned' is NULL, otherwise 0 or the first error
 * returned by hfs_pin_block_range().
 */
static int hfs_pin_extent_record (struct hfsmount *hfsmp, HFSPlusExtentRecord extents, uint32_t *pinned) {
    uint32_t total = 0;
    int status = 0;
    int idx = 0;

    if (pinned == NULL) {
        return EINVAL;
    }
    *pinned = 0;

    while (idx < kHFSPlusExtentDensity && extents[idx].startBlock != 0) {
        status = hfs_pin_block_range (hfsmp, HFS_PIN_IT, extents[idx].startBlock,
                extents[idx].blockCount);
        if (status) {
            break;
        }

        total += extents[idx].blockCount;
        idx++;
    }

    *pinned = total;

    return status;
}

/*
 * Pin the blocks described by an on-disk HFS Plus catalog file record.
 *
 * NOTE: 'cfp' is the raw on-disk HFSPlusCatalogFile as stored in the
 * Catalog B-Tree, not one of the run-time structures normally used.
 *
 * 'rsrc' selects the fork: non-zero pins the resource fork (fork type
 * 0xff), zero pins the data fork (fork type 0).  The fork's inline
 * extents are pinned first; if the last inline extent is in use, the
 * overflow extents are pinned as well.
 *
 * On return (except when the inline extents fail to pin) the number of
 * blocks pinned is subtracted from hfsmp->hfs_hotfile_freeblks.
 *
 * Assumes the catalog and extents-overflow btrees are locked, at least
 * in shared mode.
 */
static int hfs_pin_catalog_rec (struct hfsmount *hfsmp, HFSPlusCatalogFile *cfp, int rsrc) {
	HFSPlusForkData *fork = rsrc ? &cfp->resourceFork : &cfp->dataFork;
	uint8_t fork_kind = rsrc ? 0xff : 0;
	uint32_t inline_blocks = 0;
	uint32_t overflow_blocks = 0;
	int status;

	/* the extents stored directly in the catalog record */
	status = hfs_pin_extent_record (hfsmp, fork->extents, &inline_blocks);
	if (status) {
		return status;
	}

	/* a used final slot means there may be more in the overflow b-tree */
	if (fork->extents[kHFSPlusExtentDensity-1].startBlock != 0) {
		status = hfs_pin_overflow_extents (hfsmp, cfp->fileID, fork_kind, &overflow_blocks);
	}

	hfsmp->hfs_hotfile_freeblks -= (inline_blocks + overflow_blocks);

	return status;
}


/*
 * Set up hot file recording for a volume.
 *
 * Hot files are disabled (hfc_stage = HFC_DISABLED, EPERM returned) when
 * the volume is not the root file system, is mounted MNT_NOATIME, or has
 * a ".hotfile-suspend" entry in its root directory.  A ".hotfile-reset"
 * entry in the root directory triggers hfs_hotfile_reset() first.
 *
 * If the hot file B-tree file (HFC_FILENAME) already exists:
 *   - on HFS_CS_HOTFILE_PIN volumes with a non-empty B-tree, the B-tree
 *     is opened and pinned, hfs_hotfile_repin_files() is called, and the
 *     function returns 0; if the open fails, or the B-tree has no blocks,
 *     the B-tree is deleted and recreated below;
 *   - on other volumes the function simply returns 0.
 *
 * Otherwise (or when recreating) a new B-tree is created inside a
 * transaction, the journal is flushed, and the catalog is scanned so that
 * eligible file records get a hot file entry plus its thread record.
 *
 * hfc_mutex is taken after the root-volume check and released on every
 * return path.  Returns 0 on success, EPERM/EINVAL for the early exits,
 * or the error of the step that failed.
 */
int
hfs_recording_init(struct hfsmount *hfsmp)
{
	CatalogKey * keyp;
	CatalogRecord * datap;
	u_int32_t  dataSize;
	HFSPlusCatalogFile *filep;
	BTScanState scanstate;
	BTreeIterator * iterator = NULL;
	FSBufferDescriptor  record;
	HotFileKey * key;
	filefork_t * filefork;
	u_int32_t  data;
	struct cat_attr cattr;
	u_int32_t  cnid;
	int error = 0;
	long starting_temp;

	int started_tr = 0;
	int started_scan = 0;

	int inserted = 0;  /* debug variables */
	int filecount = 0;
	int uncacheable = 0;

	/*
	 * For now, only the boot volume is supported.
	 */
	if ((vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) == 0) {
		hfsmp->hfc_stage = HFC_DISABLED;
		return (EPERM);
	}

	/* We grab the HFC mutex even though we're not fully mounted yet, just for orderliness */
	lck_mtx_lock (&hfsmp->hfc_mutex);

	/*
	 * Tracking of hot files requires up-to-date access times.
	 * So if access time updates are disabled, then we disable
	 * hot files, too.
	 */
	if (vfs_flags(HFSTOVFS(hfsmp)) & MNT_NOATIME) {
		hfsmp->hfc_stage = HFC_DISABLED;
		lck_mtx_unlock (&hfsmp->hfc_mutex);
		return EPERM;
	}
	
	//
	// Check if we've been asked to suspend operation
	//
	cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, ".hotfile-suspend", &cattr, NULL);
	if (cnid != 0) {
		// the name in the message must match the name looked up above
		printf("hfs: %s: %s: hotfiles explicitly disabled!  remove /.hotfile-suspend to re-enable\n", hfsmp->vcbVN, __FUNCTION__);
		hfsmp->hfc_stage = HFC_DISABLED;
		lck_mtx_unlock (&hfsmp->hfc_mutex);
		return EPERM;
	}

	//
	// Check if we've been asked to reset our state.
	//
	cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, ".hotfile-reset", &cattr, NULL);
	if (cnid != 0) {
		hfs_hotfile_reset(hfsmp);
	}

	if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
		//
		// Cooperative Fusion (CF) systems use different constants 
		// than traditional hotfile systems.  These were picked after a bit of
		// experimentation - we can cache many more files on the
		// ssd in an CF system and we can do so more rapidly
		// so bump the limits considerably (and turn down the
		// duration so that it doesn't take weeks to adopt all
		// the files).
		//
		hfc_default_file_count = 20000;
		hfc_default_duration   = 300;    // 5min
		hfc_max_file_count     = 50000;
		hfc_max_file_size      = (512ULL * 1024ULL * 1024ULL);
	}

	/*
	 * If the Hot File btree exists then metadata zone is ready.
	 */
	cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, HFC_FILENAME, &cattr, NULL);
	if (cnid != 0 && S_ISREG(cattr.ca_mode)) {
		int recreate = 0;
		
		if (hfsmp->hfc_stage == HFC_DISABLED)
			hfsmp->hfc_stage = HFC_IDLE;
		hfsmp->hfs_hotfile_freeblks = 0;

		if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && cattr.ca_blocks > 0) {
			//
			// make sure the hotfile btree is pinned
			//
			error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp);
			if (!error) {
				/* XXX: must fix hfs_pin_vnode too */
				hfs_pin_vnode(hfsmp, hfsmp->hfc_filevp, HFS_PIN_IT, NULL);
				
			} else {
				printf("hfs: failed to open the btree err=%d.  Recreating hotfile btree.\n", error);
				recreate = 1;
			}
			
			hfs_hotfile_repin_files(hfsmp);

			if (hfsmp->hfc_filevp) {
				(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
				hfsmp->hfc_filevp = NULL;
			}

		} else if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
			// hmmm, the hotfile btree is zero bytes long?  how odd.  let's recreate it.
			printf("hfs: hotfile btree is zero bytes long?!  recreating it.\n");
			recreate = 1;
		}

		if (!recreate) {
			/* don't forget to unlock the mutex */
			lck_mtx_unlock (&hfsmp->hfc_mutex);
			return (0);
		} else {
			//
			// open the hotfile btree file ignoring errors because
			// we need the vnode pointer for hfc_btree_delete() to
			// be able to do its work
			//
			error = hfc_btree_open_ext(hfsmp, &hfsmp->hfc_filevp, 1);
			if (!error) {
				// and delete it!
				error = hfc_btree_delete(hfsmp);
				(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
				hfsmp->hfc_filevp = NULL;
			}
		}
	}

	printf("hfs: %s: %s: creating the hotfile btree\n", hfsmp->vcbVN, __FUNCTION__);
	if (hfs_start_transaction(hfsmp) != 0) {
		lck_mtx_unlock (&hfsmp->hfc_mutex);
		return EINVAL;
	}

	/* B-tree creation must be journaled */
	started_tr = 1;

	error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT);
	if (error) {
#if HFC_VERBOSE
		printf("hfs: Error %d creating hot file b-tree on %s \n", error, hfsmp->vcbVN);
#endif
		goto recording_init_out;
	}

	hfs_end_transaction (hfsmp);
	started_tr = 0;
	/*
	 * Do a journal flush + flush track cache. We have to ensure that the async I/Os have been issued to the media
	 * before proceeding.
	 */
	hfs_flush (hfsmp, HFS_FLUSH_FULL);

	/* now re-start a new transaction */
	if (hfs_start_transaction (hfsmp) != 0) {
		lck_mtx_unlock (&hfsmp->hfc_mutex);
		return EINVAL;
	}
	started_tr = 1;

	/*
	 * Open the Hot File B-tree file for writing.
	 */
	if (hfsmp->hfc_filevp)
		panic("hfs_recording_init: hfc_filevp exists (vp = %p)", hfsmp->hfc_filevp);

	error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp);
	if (error) {
#if HFC_VERBOSE
		printf("hfs: Error %d opening hot file b-tree on %s \n", error, hfsmp->vcbVN);
#endif
		goto recording_init_out;
	}

	/*
	 * This function performs work similar to namei; we must NOT hold the catalog lock while
	 * calling it. This will decorate catalog records as being pinning candidates. (no hotfiles work)
	 */
	hfs_setup_default_cf_hotfiles(hfsmp);

	/*
	 * now grab the hotfiles b-tree vnode/cnode lock first, as it is not classified as a systemfile.
	 */
	if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) {
		error = EPERM;
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		/* zero it out to avoid pinning later on */
		hfsmp->hfc_filevp = NULL;
		goto recording_init_out;
	}

	iterator = hfs_mallocz(sizeof(*iterator));

	key = (HotFileKey*) &iterator->key;
	key->keyLength = HFC_KEYLENGTH;

	record.bufferAddress = &data;
	record.itemSize = sizeof(u_int32_t);
	record.itemCount = 1;

#if HFC_VERBOSE
	printf("hfs: Evaluating space for \"%s\" metadata zone... (freeblks %d)\n", HFSTOVCB(hfsmp)->vcbVN,
	       hfsmp->hfs_hotfile_freeblks);
#endif

	/*
	 * Get ready to scan the Catalog file. We explicitly do NOT grab the catalog lock because
	 * we're fully single-threaded at the moment (by virtue of being called during mount()),
	 * and if we have to grow the hotfile btree, then we would need to grab the catalog lock
	 * and if we take a shared lock here, it would deadlock (see <rdar://problem/21486585>)
	 *
	 * We already started a transaction so we should already be holding the journal lock at this point.
	 * Note that we have to hold the journal lock / start a txn BEFORE the systemfile locks.
	 */

	error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0,
	                       kCatSearchBufferSize, &scanstate);
	if (error) {
		printf("hfs_recording_init: err %d BTScanInit\n", error);

		/* drop the systemfile locks */
		hfs_unlock(VTOC(hfsmp->hfc_filevp));

		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);

		/* zero it out to avoid pinning */
		hfsmp->hfc_filevp = NULL;
		goto recording_init_out;
	}

	started_scan = 1;

	filefork = VTOF(hfsmp->hfc_filevp);

	starting_temp = random() % HF_TEMP_RANGE;

	/*
	 * Visit all the catalog btree leaf records. We have to hold the catalog lock to do this.
	 *
	 * NOTE: The B-Tree scanner reads from the media itself. Under normal circumstances it would be
	 * fine to simply use b-tree routines to read blocks that correspond to b-tree nodes, because the
	 * block cache is going to ensure you always get the cached copy of a block (even if a journal
	 * txn has modified one of those blocks).  That is NOT true when
	 * using the scanner.  In particular, it will always read whatever is on-disk. So we have to ensure
	 * that the journal has flushed and that the async I/Os to the metadata files have been issued.
	 */
	for (;;) {
		error = BTScanNextRecord(&scanstate, 0, (void **)&keyp, (void **)&datap, &dataSize);
		if (error) {
			if (error == btNotFound)
				error = 0;
			else
				printf("hfs_recording_init: err %d BTScanNext\n", error);
			break;
		}
		if ((datap->recordType != kHFSPlusFileRecord) ||
		    (dataSize != sizeof(HFSPlusCatalogFile))) {
			continue;
		}
		filep = (HFSPlusCatalogFile *)datap;
		filecount++;

		if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
			if (filep->flags & kHFSDoNotFastDevPinMask) {
				uncacheable++;
			}

			//
			// If the file does not have the FastDevPinnedMask set, we
			// can ignore it and just go to the next record.
			//
			if ((filep->flags & kHFSFastDevPinnedMask) == 0) {
				continue;
			}
		} else if (filep->dataFork.totalBlocks == 0) {
			continue;
		}

		/*
		 * On a regular hdd, any file that has blocks inside
		 * the hot file space is recorded for later eviction.
		 *
		 * For now, resource forks are ignored.
		 *
		 * We don't do this on CF systems as there is no real
		 * hotfile area - we just pin/unpin blocks belonging to
		 * interesting files.
		 */
		if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && !hotextents(hfsmp, &filep->dataFork.extents[0])) {
			continue;
		}
		cnid = filep->fileID;

		/* Skip over journal files and the hotfiles B-Tree file. */
		if (cnid == hfsmp->hfs_jnlfileid
			|| cnid == hfsmp->hfs_jnlinfoblkid
			|| cnid == VTOC(hfsmp->hfc_filevp)->c_fileid) {
			continue;
		}
		/*
		 * XXX - need to skip quota files as well.
		 */

		uint32_t temp;

		if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
			int rsrc = 0;

			temp = (uint32_t)starting_temp++;
			if (filep->flags & kHFSAutoCandidateMask) {
				temp += MAX_NORMAL_TEMP;
			}

			/* use the data fork by default */
			if (filep->dataFork.totalBlocks == 0) {
				/*
				 * but if empty, switch to rsrc as its likely
				 * a compressed file
				 */
				rsrc = 1;
			}

			error =  hfs_pin_catalog_rec (hfsmp, filep, rsrc);
			if (error)
				break;

		} else {
			temp = HFC_MINIMUM_TEMPERATURE;
		}

		/* Insert a hot file entry. */
		key->keyLength   = HFC_KEYLENGTH;
		key->temperature = temp;
		key->fileID      = cnid;
		key->forkType    = 0;
		data = 0x3f3f3f3f;
		error = BTInsertRecord(filefork, iterator, &record, record.itemSize);
		if (error) {
			printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID);
			error = MacToVFSError(error);
			break;
		}

		/* Insert the corresponding thread record. */
		key->keyLength = HFC_KEYLENGTH;
		key->temperature = HFC_LOOKUPTAG;
		key->fileID = cnid;
		key->forkType = 0;
		data = temp;
		error = BTInsertRecord(filefork, iterator, &record, record.itemSize);
		if (error) {
			printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID);
			error = MacToVFSError(error);
			break;
		}
		inserted++;
	} // end catalog iteration loop

	save_btree_user_info(hfsmp);
	(void) BTFlushPath(filefork);

recording_init_out:

	/* Unlock first, then pin after releasing everything else */
	if (hfsmp->hfc_filevp) {
		hfs_unlock (VTOC(hfsmp->hfc_filevp));
	}

	if (started_scan) {
		(void) BTScanTerminate (&scanstate, &data, &data, &data);
	}

	if (started_tr) {
		hfs_end_transaction(hfsmp);
	}

#if HFC_VERBOSE
	printf("hfs: %d files identified out of %d (freeblocks is now: %d)\n", inserted, filecount, hfsmp->hfs_hotfile_freeblks);
	if (uncacheable) {
		printf("hfs: %d files were marked as uncacheable\n", uncacheable);
	}
#endif
	
	if (iterator)
		hfs_free(iterator, sizeof(*iterator));

	if (hfsmp->hfc_filevp) {
		if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
			hfs_pin_vnode(hfsmp, hfsmp->hfc_filevp, HFS_PIN_IT, NULL);
		}
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		hfsmp->hfc_filevp = NULL;
	}

	if (error == 0)
		hfsmp->hfc_stage = HFC_IDLE;

	/* Finally, unlock the HFC mutex */
	lck_mtx_unlock (&hfsmp->hfc_mutex);

	return (error);
}

/*
 * Use sync to perform occasional background work.
 *
 * When the hot file stage is non-zero, hfc_mutex is taken and one step
 * is run according to the current stage:
 *   HFC_IDLE      - hfs_recording_start()
 *   HFC_RECORDING - hfs_recording_stop(), once the current time has
 *                   passed hfc_timeout
 *   HFC_EVICTION  - hotfiles_evict()
 *   HFC_ADOPTION  - hotfiles_adopt()
 * Any other stage does nothing.  The results of those calls are ignored
 * and the function always returns 0.
 */
int
hfs_hotfilesync(struct hfsmount *hfsmp, vfs_context_t ctx)
{
	enum hfc_stage current;
	struct timeval now;

	/* Unlocked peek: a zero stage means there is nothing to do. */
	if (!hfsmp->hfc_stage) {
		return (0);
	}

	lck_mtx_lock(&hfsmp->hfc_mutex);

	/* Dispatch on a single read of the stage, taken under the mutex. */
	current = hfsmp->hfc_stage;

	if (current == HFC_IDLE) {
		(void) hfs_recording_start(hfsmp);
	} else if (current == HFC_RECORDING) {
		microtime(&now);
		if (now.tv_sec > hfsmp->hfc_timeout) {
			(void) hfs_recording_stop(hfsmp);
		}
	} else if (current == HFC_EVICTION) {
		(void) hotfiles_evict(hfsmp, ctx);
	} else if (current == HFC_ADOPTION) {
		(void) hotfiles_adopt(hfsmp);
	}

	lck_mtx_unlock(&hfsmp->hfc_mutex);

	return (0);
}

/*
 * Add a hot file to the recording list.
 *
 * This can happen when a hot file gets reclaimed or at the
 * end of the recording period for any active hot file.
 *
 * NOTE: Since both the data and resource fork can be hot,
 * there can be two entries for the same file id.
 *
 * Nothing is done (and 0 is returned) unless the volume is in the
 * HFC_RECORDING stage; otherwise hfs_addhotfile_internal() is run
 * under hfc_mutex and its result is returned.
 *
 * Note: the cnode is locked on entry.
 */
int
hfs_addhotfile(struct vnode *vp)
{
	hfsmount_t *mount = VTOHFS(vp);
	int result = 0;

	if (mount->hfc_stage == HFC_RECORDING) {
		lck_mtx_lock(&mount->hfc_mutex);
		result = hfs_addhotfile_internal(vp);
		lck_mtx_unlock(&mount->hfc_mutex);
	}

	return (result);
}

/*
 * Report whether a process name is on the list of names to ignore.
 *
 * 'pname' is compared against each listed name with strncmp(), looking
 * at no more than 'maxlen' characters.  Returns 1 on the first match and
 * 0 if no name matches.
 */
static int
hf_ignore_process(const char *pname, size_t maxlen)
{
	static const char * const ignored_names[] = {
		"mds",
		"mdworker",
		"mds_stores",
		"makewhatis",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(ignored_names) / sizeof(ignored_names[0]); idx++) {
		if (strncmp(pname, ignored_names[idx], maxlen) == 0) {
			return 1;
		}
	}

	return 0;
}

static int
hfs_addhotfile_internal(struct vnode *vp)
{
	hotfile_data_t *hotdata;
	hotfile_entry_t *entry;
	hfsmount_t *hfsmp;
	cnode_t *cp;
	filefork_t *ffp;
	u_int32_t temperature;

	hfsmp = VTOHFS(vp);
	if (hfsmp->hfc_stage != HFC_RECORDING)
		return (0);

	/* 
	 * Only regular files are eligible for hotfiles addition. 
	 * 
	 * Symlinks were previously added to the list and may exist in 
	 * extant hotfiles regions, but no new ones will be added, and no
	 * symlinks will now be relocated/evicted from the hotfiles region.
	 */
	if (!vnode_isreg(vp) || vnode_issystem(vp)) {
		return (0);
	}

	/* Skip resource forks for now. */
	if (VNODE_IS_RSRC(vp)) {
		return (0);
	}
	if ((hotdata = hfsmp->hfc_recdata) == NULL) {
		return (0);
	}
	ffp = VTOF(vp);
	cp = VTOC(vp);

	if (cp->c_attr.ca_recflags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask)) {
		// it's already a hotfile or can't be a hotfile...
		return 0;
	}

	if (vnode_isdir(vp) || vnode_issystem(vp) || (cp->c_flag & (C_DELETED | C_NOEXISTS))) {
		return 0;
	}

	if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && vnode_isfastdevicecandidate(vp)) {
		//
		// On cooperative fusion (CF) systems we have different criteria for whether something
		// can be pinned to the ssd.
		//
		if (cp->c_flag & (C_DELETED|C_NOEXISTS)) {
			//
			// dead files are definitely not worth caching
			//
			return 0;
		} else if (ffp->ff_blocks == 0 && !(cp->c_bsdflags & UF_COMPRESSED) && !(cp->c_attr.ca_recflags & kHFSFastDevCandidateMask)) {
			//
			// empty files aren't worth caching but compressed ones might be, as are 
			// newly created files that live in WorthCaching directories... 
			//
			return 0;
		}

		char pname[256];
		pname[0] = '\0';
		proc_selfname(pname, sizeof(pname));
		if (hf_ignore_process(pname, sizeof(pname))) {
			// ignore i/o's from certain system daemons 
			return 0;
		}

		temperature = cp->c_fileid;        // in memory we just keep it sorted by file-id
	} else {
		// the normal hard drive based hotfile checks
		if ((ffp->ff_bytesread == 0) ||
		    (ffp->ff_blocks == 0) ||
		    (ffp->ff_size == 0) ||
		    (ffp->ff_blocks > hotdata->maxblocks) ||
		    (cp->c_bsdflags & (UF_NODUMP | UF_COMPRESSED)) ||
		    (cp->c_atime < hfsmp->hfc_timebase)) {
			return (0);
		}

		temperature = ffp->ff_bytesread / ffp->ff_size;
		if (temperature < hotdata->threshold) {
			return (0);
		}
	}

	/*
	 * If there is room or this file is hotter than
	 * the coldest one then add it to the list.
	 *
	 */
	if ((hotdata->activefiles < hfsmp->hfc_maxfiles) ||
	    (hotdata->coldest == NULL) ||
	    (temperature >= hotdata->coldest->temperature)) {
		++hotdata->refcount;
		entry = hf_getnewentry(hotdata);
		entry->temperature = temperature;
		entry->fileid = cp->c_fileid;
		//
		// if ffp->ff_blocks is zero, it might be compressed so make sure we record
		// that there's at least one block.
		//
		entry->blocks = ffp->ff_blocks ? ffp->ff_blocks : 1;   
		if (hf_insert(hotdata, entry) == EEXIST) {
			// entry is already present, don't need to add it again
			entry->right = hotdata->freelist;
			hotdata->freelist = entry;
		}
		--hotdata->refcount;
	}

	return (0);
}

/*
 * Remove a hot file from the recording list.
 *
 * This can happen when a hot file becomes
 * an active vnode (active hot files are
 * not kept in the recording list until the
 * end of the recording period).
 *
 * The entry is deleted only if the file's temperature (bytes read
 * divided by file size) is at least the recording threshold and at
 * least the temperature of the coldest recorded entry.
 *
 * Always returns 0.
 *
 * Note: the cnode is locked on entry.
 */
int
hfs_removehotfile(struct vnode *vp)
{
	hfsmount_t *mount = VTOHFS(vp);
	hotfile_data_t *recdata;
	filefork_t *fork;
	cnode_t *cnode;
	u_int32_t temp;

	if (mount->hfc_stage != HFC_RECORDING) {
		return (0);
	}
	if (!vnode_isreg(vp) || vnode_issystem(vp)) {
		return (0);
	}

	fork = VTOF(vp);
	cnode = VTOC(vp);

	/* Files that were never read, are empty, or predate the recording period were never eligible. */
	if (fork->ff_bytesread == 0 || fork->ff_blocks == 0 ||
	    fork->ff_size == 0 || cnode->c_atime < mount->hfc_timebase) {
		return (0);
	}

	lck_mtx_lock(&mount->hfc_mutex);

	/* Re-check the stage now that the mutex is held. */
	if (mount->hfc_stage == HFC_RECORDING &&
	    (recdata = mount->hfc_recdata) != NULL) {
		temp = fork->ff_bytesread / fork->ff_size;

		if (temp >= recdata->threshold &&
		    recdata->coldest != NULL &&
		    temp >= recdata->coldest->temperature) {
			++recdata->refcount;
			hf_delete(recdata, VTOC(vp)->c_fileid, temp);
			--recdata->refcount;
		}
	}

	lck_mtx_unlock(&mount->hfc_mutex);
	return (0);
}

/*
 * Hook for a file that is being deleted.
 *
 * As compiled (the "#if 1" branch) this does nothing and returns 0; the
 * vnode argument is unused.  The "#else" branch holds a disabled
 * implementation that would remove the file's records from the hot file
 * B-tree and from the in-memory recording list; the comment at the top of
 * that branch explains why it is not used.
 */
int
hfs_hotfile_deleted(__unused struct vnode *vp)
{
#if 1
	return 0;
#else	
	//
	// XXXdbg - this code, while it would work, would introduce a huge inefficiency
	//          to deleting files as the way it's written would require us to open
	//          the hotfile btree on every open, delete two records in it and then
	//          close the hotfile btree (which involves more writes).
	//
	//          We actually can be lazy about deleting hotfile records for files
	//          that get deleted.  When it's time to evict things, if we encounter
	//          a record that references a dead file (i.e. a fileid which no
	//          longer exists), the eviction code will remove the records.  Likewise
	//          the code that scans the HotFile B-Tree at boot time to re-pin files
	//          will remove dead records.
	//

	hotfile_data_t *hotdata;
	hfsmount_t *hfsmp;
	cnode_t *cp;
	filefork_t *filefork;
	u_int32_t temperature;
	BTreeIterator * iterator = NULL;
	FSBufferDescriptor record;
	HotFileKey *key;
	u_int32_t data;
	int error=0;

	// only files currently marked as pinned have records to remove
	cp = VTOC(vp);
	if (cp == NULL || !(cp->c_attr.ca_recflags & kHFSFastDevPinnedMask)) {
		return 0;
	}

	hfsmp = VTOHFS(vp);
	if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
		return 0;
	}
	
	if (hfc_btree_open(hfsmp, &hfsmp->hfc_filevp) != 0 || hfsmp->hfc_filevp == NULL) {
		// either there is no hotfile info or it's damaged
		return EINVAL;
	}
	
	// NOTE(review): this early return leaves the btree opened above un-closed
	filefork = VTOF(hfsmp->hfc_filevp);
	if (filefork == NULL) {
		return 0;
	}

	iterator = hfs_mallocz(sizeof(*iterator));

	key = (HotFileKey*) &iterator->key;

	record.bufferAddress = &data;
	record.itemSize = sizeof(u_int32_t);
	record.itemCount = 1;

	// look the file up by its thread record (temperature == HFC_LOOKUPTAG)
	key->keyLength = HFC_KEYLENGTH;
	key->temperature = HFC_LOOKUPTAG;
	key->fileID = cp->c_fileid;
	key->forkType = 0;

	lck_mtx_lock(&hfsmp->hfc_mutex);
	(void) BTInvalidateHint(iterator);
	if (BTSearchRecord(filefork, iterator, &record, NULL, iterator) == 0) {
		temperature = key->temperature;
		hfc_btree_delete_record(hfsmp, iterator, key);
	} else {
		//printf("hfs: hotfile_deleted: did not find fileid %d\n", cp->c_fileid);
		error = ENOENT;
	}

	if ((hotdata = hfsmp->hfc_recdata) != NULL) {
		// just in case, also make sure it's removed from the in-memory list as well
		++hotdata->refcount;
		hf_delete(hotdata, cp->c_fileid, cp->c_fileid);
		--hotdata->refcount;
	}

	lck_mtx_unlock(&hfsmp->hfc_mutex);
	hfs_free(iterator, sizeof(*iterator));

	hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
	
	return error;
#endif
}

/*
 * Record a change in the number of blocks used by a pinned hot file.
 *
 * 'num_blocks' is atomically added to hfsmp->hfs_hotfile_blk_adjust when
 * all of the following hold: 'vp' is non-NULL, the volume has
 * HFS_CS_HOTFILE_PIN set, 'num_blocks' is non-zero, and the cnode is
 * marked kHFSFastDevPinnedMask without kHFSDoNotFastDevPinMask.
 * Otherwise nothing happens.  Always returns 0.
 */
int
hfs_hotfile_adjust_blocks(struct vnode *vp, int64_t num_blocks)
{
	hfsmount_t *mount;
	int is_pinned;
	int is_excluded;

	if (vp == NULL) {
		return 0;
	}

	mount = VTOHFS(vp);

	if ((mount->hfs_flags & HFS_CS_HOTFILE_PIN) == 0 || num_blocks == 0) {
		return 0;
	}

	//
	// only files that are HotFileCached and do not have the
	// CanNotHotFile bit set are worth tracking
	//
	is_pinned = (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask) != 0;
	is_excluded = (VTOC(vp)->c_attr.ca_recflags & kHFSDoNotFastDevPinMask) != 0;

	if (is_pinned && !is_excluded) {
		OSAddAtomic(num_blocks, &mount->hfs_hotfile_blk_adjust);
	}

	return (0);
}

//
// Return the current hot file free block count, first folding any
// pending adjustment (hfs_hotfile_blk_adjust) into hfs_hotfile_freeblks.
// Returns 0 when the stage is below HFC_IDLE.
//
// Assumes hfsmp->hfc_mutex is LOCKED
//
static int
hfs_hotfile_cur_freeblks(hfsmount_t *hfsmp)
{
	int pending;

	if (hfsmp->hfc_stage < HFC_IDLE) {
		return 0;
	}

	// take one snapshot; subtract exactly that amount from the pending
	// counter and apply the same amount to the free block count
	pending = hfsmp->hfs_hotfile_blk_adjust;
	if (pending != 0) {
		OSAddAtomic(-pending, &hfsmp->hfs_hotfile_blk_adjust);
		hfsmp->hfs_hotfile_freeblks += pending;
	}

	return hfsmp->hfs_hotfile_freeblks;
}


/*
 *========================================================================
 *                     HOT FILE MAINTENANCE ROUTINES
 *========================================================================
 */

/*
 * vnode_iterate() callback used by hotfiles_collect(): offers each
 * regular, non-system vnode to hfs_addhotfile_internal() and always
 * returns VNODE_RETURNED.
 */
static int
hotfiles_collect_callback(struct vnode *vp, __unused void *cargs)
{
	int eligible = vnode_isreg(vp) && !vnode_issystem(vp);

	if (eligible) {
		(void) hfs_addhotfile_internal(vp);
	}

	return (VNODE_RETURNED);
}

/*
 * Add all active hot files to the recording list.
 *
 * If the mount cannot be marked busy without waiting, nothing is done.
 * Always returns 0.
 */
static int
hotfiles_collect(struct hfsmount *hfsmp)
{
	struct mount *mount = HFSTOVFS(hfsmp);

	if (vfs_busy(mount, LK_NOWAIT) == 0) {
		/*
		 * hotfiles_collect_callback will be called for each vnode
		 * hung off of this mount point; the vnode will be properly
		 * referenced and unreferenced around the callback
		 */
		vnode_iterate(mount, 0, hotfiles_collect_callback, (void *)NULL);

		vfs_unbusy(mount);
	}

	return (0);
}


/*
 * Update the data of a btree record
 * This is called from within BTUpdateRecord.
 *
 * Only thread records (temperature == HFC_LOOKUPTAG) are modified: their
 * data word is overwritten with *state.  Always returns 0.
 */
static int
update_callback(const HotFileKey *key, u_int32_t *data, u_int32_t *state)
{
	if (key->temperature != HFC_LOOKUPTAG) {
		return (0);
	}

	*data = *state;
	return (0);
}

/*
 * Identify files already in hot area.
 *
 * For every entry in hfsmp->hfc_filelist that already has a thread record
 * in the hot file B-tree, the thread record is updated with the entry's
 * new temperature and the hot file record is re-keyed (delete + insert)
 * under that temperature.  The list entry is then invalidated by zeroing
 * its temperature, and its blocks are subtracted from hfl_totalblocks.
 *
 * Does nothing and returns 0 when there is no file list or when the
 * volume has HFS_CS_HOTFILE_PIN set.  Otherwise runs inside a transaction
 * with the hot file B-tree cnode locked exclusively.
 *
 * Returns 0, EINVAL (transaction could not be started), EPERM (lock
 * failed), or the converted error of the B-tree operation that stopped
 * the loop.
 */
static int
hotfiles_refine(struct hfsmount *hfsmp)
{
	BTreeIterator * iterator = NULL;
	filefork_t * filefork;
	hotfilelist_t  *listp;
	FSBufferDescriptor  record;
	HotFileKey * key;
	u_int32_t  data;
	int  i;
	int  error = 0;

	if ((listp = hfsmp->hfc_filelist) == NULL)
		return (0);	

	if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
		// on ssd's we don't refine the temperature since the
		// replacement algorithm is simply random
		return 0;
	}

	iterator = hfs_mallocz(sizeof(*iterator));

	key = (HotFileKey*) &iterator->key;

	record.bufferAddress = &data;
	record.itemSize = sizeof(u_int32_t);
	record.itemCount = 1;

	if (hfs_start_transaction(hfsmp) != 0) {
	    error = EINVAL;
	    goto out;
	} 
	if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) {
		error = EPERM;
		goto out1;
	}
	filefork = VTOF(hfsmp->hfc_filevp);

	for (i = 0; i < listp->hfl_count; ++i) {
		/*
		 * Check if entry (thread) is already in hot area.
		 */
		key->keyLength = HFC_KEYLENGTH;
		key->temperature = HFC_LOOKUPTAG;
		key->fileID = listp->hfl_hotfile[i].hf_fileid;
		key->forkType = 0;
		(void) BTInvalidateHint(iterator);
		if (BTSearchRecord(filefork, iterator, &record, NULL, iterator) != 0) {
			continue;  /* not in hot area, so skip */
		}

		/*
		 * Update thread entry with latest temperature.
		 *
		 * A failure here is logged but deliberately does not stop the
		 * loop; the re-keying below is still attempted.
		 */
		error = BTUpdateRecord(filefork, iterator,
				       (IterateCallBackProcPtr)update_callback,
				      &listp->hfl_hotfile[i].hf_temperature);
		if (error) {
			printf("hfs: hotfiles_refine: BTUpdateRecord failed %d (file %d)\n", error, key->fileID);
			error = MacToVFSError(error);
		}
		/*
		 * Re-key entry with latest temperature.  'data' still holds
		 * what the thread-record search above read, which is used as
		 * the temperature of the existing hot file record.
		 */
		key->keyLength = HFC_KEYLENGTH;
		key->temperature = data;
		key->fileID = listp->hfl_hotfile[i].hf_fileid;
		key->forkType = 0;
		/* Pick up record data. */
		(void) BTInvalidateHint(iterator);
		(void) BTSearchRecord(filefork, iterator, &record, NULL, iterator);
		error = BTDeleteRecord(filefork, iterator);
		if (error) {
			printf("hfs: hotfiles_refine: BTDeleteRecord failed %d (file %d)\n", error, key->fileID);
			error = MacToVFSError(error);
			break;
		}
		key->keyLength = HFC_KEYLENGTH;
		key->temperature = listp->hfl_hotfile[i].hf_temperature;
		key->fileID = listp->hfl_hotfile[i].hf_fileid;
		key->forkType = 0;
		error = BTInsertRecord(filefork, iterator, &record, record.itemSize);
		if (error) {
			printf("hfs: hotfiles_refine: BTInsertRecord failed %d (file %d)\n", error, key->fileID);
			error = MacToVFSError(error);
			break;
		}
		/*
		 * Invalidate this entry in the list.
		 */
		listp->hfl_hotfile[i].hf_temperature = 0;
		listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks;
		
	} /* end for */

	(void) BTFlushPath(filefork);
	hfs_unlock(VTOC(hfsmp->hfc_filevp));

out1:
	hfs_end_transaction(hfsmp);
out:
	if (iterator)
		hfs_free(iterator, sizeof(*iterator));	
	return (error);
}

/*
 * Move new hot files into hot area.
 *
 * Walks hfsmp->hfc_filelist starting at hfl_next.  Each eligible entry is
 * either pinned (volumes with HFS_CS_HOTFILE_PIN set) or relocated to
 * hfs_hotfile_start, and then gets a temperature record and a thread
 * record inserted into the hot file b-tree.  One call looks at no more
 * than HFC_FILESPERSYNC list entries (the window grows by one for each
 * file whose pin/relocate fails) and stops once HFC_BLKSPERSYNC blocks
 * have been moved.
 *
 * hfc_stage is HFC_BUSY while the routine runs.  On exit it is put back
 * to the entry stage (HFC_ADOPTION), or set to HFC_IDLE -- in which case
 * the b-tree is also closed -- when the list is exhausted, when
 * hfs_hotfile_freeblks is <= 0, or when a b-tree insert failed.  Anyone
 * sleeping on hfc_stage is woken.
 *
 * Returns 0 when there is no file list, EBUSY when the stage is not
 * HFC_ADOPTION, EPERM when the b-tree cnode cannot be locked, and
 * otherwise whatever value the loop last left in 'error' (0 on success).
 *
 * Requires that the hfc_mutex be held.
 */
static int
hotfiles_adopt(struct hfsmount *hfsmp)
{
	BTreeIterator * iterator = NULL;
	struct vnode *vp;
	filefork_t * filefork;
	hotfilelist_t  *listp;
	FSBufferDescriptor  record;
	HotFileKey * key;
	u_int32_t  data;		/* payload of every record inserted below */
	enum hfc_stage stage;		/* stage to publish on exit */
	int  fileblocks;
	int  blksmoved;			/* blocks adopted during this call */
	int  i;
	int  last;			/* one past the last list index to visit */
	int  error = 0;
	int  startedtrans = 0;		/* non-zero while a transaction is open */
	//
	// all files in a given adoption phase have a temperature
	// that starts at a random value and then increases linearly.
	// the idea is that during eviction, files that were adopted
	// together will be evicted together
	//
	long starting_temp = random() % HF_TEMP_RANGE;
	long temp_adjust = 0;

	if ((listp = hfsmp->hfc_filelist) == NULL)
		return (0);	

	if (hfsmp->hfc_stage != HFC_ADOPTION) {
		return (EBUSY);
	}
	if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) {
		return (EPERM);
	}

	iterator = hfs_mallocz(sizeof(*iterator));

#if HFC_VERBOSE
		printf("hfs:%s: hotfiles_adopt: (hfl_next: %d, hotfile start/end block: %d - %d; max/free: %d/%d; maxfiles: %d)\n",
		       hfsmp->vcbVN,
		       listp->hfl_next,
		       hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end,
		       hfsmp->hfs_hotfile_maxblks, hfsmp->hfs_hotfile_freeblks, hfsmp->hfc_maxfiles);
#endif

	/* Remember the entry stage and flag the hot file state as busy. */
	stage = hfsmp->hfc_stage;
	hfsmp->hfc_stage = HFC_BUSY;

	/* Limit this pass to HFC_FILESPERSYNC entries, clamped to the list size. */
	blksmoved = 0;
	last = listp->hfl_next + HFC_FILESPERSYNC;
	if (last > listp->hfl_count)
		last = listp->hfl_count;

	key = (HotFileKey*) &iterator->key;
	key->keyLength = HFC_KEYLENGTH;

	/* Both record kinds inserted below carry the single 32-bit 'data' value. */
	record.bufferAddress = &data;
	record.itemSize = sizeof(u_int32_t);
	record.itemCount = 1;

	filefork = VTOF(hfsmp->hfc_filevp);

	for (i = listp->hfl_next; (i < last) && (blksmoved < HFC_BLKSPERSYNC); ++i) {
		/*
		 * Skip entries that aren't going to work.
		 */
		if (listp->hfl_hotfile[i].hf_temperature == 0) {
			//printf("hfs: zero temp on file-id %d\n", listp->hfl_hotfile[i].hf_fileid);
			listp->hfl_next++;
			continue;
		}
		if (listp->hfl_hotfile[i].hf_fileid == VTOC(hfsmp->hfc_filevp)->c_fileid) {
			//printf("hfs: cannot adopt the hotfile b-tree itself! (file-id %d)\n", listp->hfl_hotfile[i].hf_fileid);
			listp->hfl_next++;
			continue;
		}
		if (listp->hfl_hotfile[i].hf_fileid < kHFSFirstUserCatalogNodeID) {
			//printf("hfs: cannot adopt system files (file-id %d)\n", listp->hfl_hotfile[i].hf_fileid);
			listp->hfl_next++;
			continue;
		}

		/*
		 * Acquire a vnode for this file.
		 *
		 * NOTE(review): every path below that is done with vp calls
		 * hfs_unlock(VTOC(vp)) followed by vnode_put(vp), so hfs_vget
		 * presumably hands the cnode back locked with an iocount held
		 * -- confirm against hfs_vget.
		 */
		error = hfs_vget(hfsmp, listp->hfl_hotfile[i].hf_fileid, &vp, 0, 0);
		if (error) {
			//printf("failed to get fileid %d (err %d)\n", listp->hfl_hotfile[i].hf_fileid, error);
			if (error == ENOENT) {
				error = 0;
				listp->hfl_next++;
				continue;  /* stale entry, go to next */
			}
			break;
		}

		//printf("hfs: examining hotfile entry w/fileid %d, temp %d, blocks %d (HotFileCached: %s)\n",
		//       listp->hfl_hotfile[i].hf_fileid, listp->hfl_hotfile[i].hf_temperature,
		//       listp->hfl_hotfile[i].hf_blocks,
		//       (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask) ? "YES" : "NO");

		if (!vnode_isreg(vp)) {
			/* Symlinks are ineligible for adoption into the hotfile zone.  */
			//printf("hfs: hotfiles_adopt: huh, not a file %d (%d)\n", listp->hfl_hotfile[i].hf_fileid, VTOC(vp)->c_cnid);
			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			listp->hfl_hotfile[i].hf_temperature = 0;
			listp->hfl_next++;
			continue;  /* stale entry, go to next */
		}
		/*
		 * Drop the entry if the cnode is deleted/nonexistent, if (on
		 * non-pinning volumes) its first extents are already in the hot
		 * band, or if it is already pinned or flagged do-not-pin.
		 */
		if (   (VTOC(vp)->c_flag & (C_DELETED | C_NOEXISTS))
		    || (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && hotextents(hfsmp, &VTOF(vp)->ff_extents[0]))
		    || (VTOC(vp)->c_attr.ca_recflags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask))) {
			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			listp->hfl_hotfile[i].hf_temperature = 0;
			listp->hfl_next++;
			listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks;
			continue;  /* stale entry, go to next */
		}

		fileblocks = VTOF(vp)->ff_blocks;

		//
		// for CF, if the file is empty (and not compressed) or it is too large,
		// do not try to pin it.  (note: if fileblocks == 0 but the file is marked
		// as compressed, we may still be able to cache it).
		//
		if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) &&
		    ((fileblocks == 0 && !(VTOC(vp)->c_bsdflags & UF_COMPRESSED)) ||
		     (unsigned int)fileblocks > (HFC_MAXIMUM_FILESIZE / (uint64_t)HFSTOVCB(hfsmp)->blockSize))) {
			// don't try to cache something too large or that's zero-bytes

			vnode_clearfastdevicecandidate(vp);    // turn off the fast-dev-candidate flag so we don't keep trying to cache it.

			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			listp->hfl_hotfile[i].hf_temperature = 0;
			listp->hfl_next++;
			listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks;
			continue;  /* entry is too big, just carry on with the next guy */
		}

		//
		// If a file is not an autocandidate (i.e. it's a user-tagged file desirous of
		// being hotfile cached) but it is already bigger than 4 megs, don't bother
		// hotfile caching it.  Note that if a user tagged file starts small, gets
		// adopted and then grows over time we will allow it to grow bigger than 4 megs
		// which is intentional for things like the Mail or Photos database files which
		// grow slowly over time and benefit from being on the FastDevice.
		//
		if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) &&
		    !(VTOC(vp)->c_attr.ca_recflags & kHFSAutoCandidateMask) && 
		    (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevCandidateMask) && 
		    (unsigned int)fileblocks > ((4*1024*1024) / (uint64_t)HFSTOVCB(hfsmp)->blockSize)) {

			vnode_clearfastdevicecandidate(vp);    // turn off the fast-dev-candidate flag so we don't keep trying to cache it.

			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			listp->hfl_hotfile[i].hf_temperature = 0;
			listp->hfl_next++;
			listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks;
			continue;  /* entry is too big, just carry on with the next guy */
		}

		if (fileblocks > hfs_hotfile_cur_freeblks(hfsmp)) {
			//
			// No room for this file.  Although eviction should have made space
			// it's best that we check here as well since writes to existing
			// hotfiles may have eaten up space since we performed eviction
			//
			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			listp->hfl_next++;
			listp->hfl_totalblocks -= fileblocks;
			continue;  /* entry too big, go to next */
		}
		
		if ((blksmoved > 0) &&
		    (blksmoved + fileblocks) > HFC_BLKSPERSYNC) {
			//
			// we've done enough work, let's be nice to the system and
			// stop until the next iteration
			//
			// (hfl_next is deliberately not advanced here)
			//
			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			break;  /* adopt this entry the next time around */
		}

		//
		// The size of data for a hot file record is 4 bytes. The data
		// stored in hot file record is not really meaningful. However
		// to aid debugging, we store first four bytes of the file name
		// or the ASCII text "????"
		//
		// NOTE(review): when the name is shorter than four bytes only
		// cd_namelen bytes of 'data' are written; the remaining bytes
		// keep whatever 'data' held before.
		//
		if (VTOC(vp)->c_desc.cd_nameptr && (VTOC(vp)->c_desc.cd_namelen > 0)) {
			size_t max_len;

			max_len = sizeof(u_int32_t);
			if (max_len > (unsigned)VTOC(vp)->c_desc.cd_namelen)
				max_len = VTOC(vp)->c_desc.cd_namelen;

			memcpy(&data, VTOC(vp)->c_desc.cd_nameptr, max_len);
		} else
			data = 0x3f3f3f3f;


		if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
			//
			// For CF we pin the blocks belonging to the file
			// to the "fast" (aka ssd) media
			//
			uint32_t pinned_blocks;

			if (vnode_isautocandidate(vp)) {
				VTOC(vp)->c_attr.ca_recflags |= kHFSAutoCandidateMask;
			}
			if (VTOC(vp)->c_attr.ca_recflags & kHFSAutoCandidateMask) {
				//
				// this moves auto-cached files to the higher tier 
				// of "temperatures" which means they are less likely
				// to get evicted (user selected hotfiles will get
				// evicted first in the theory that they change more
				// frequently compared to system files)
				//
				temp_adjust = MAX_NORMAL_TEMP;
			} else {
				temp_adjust = 0;
			}

			hfs_unlock(VTOC(vp));  // don't need an exclusive lock for this
			hfs_lock(VTOC(vp), HFS_SHARED_LOCK, HFS_LOCK_ALLOW_NOEXISTS);

			error = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT, &pinned_blocks);

			// From here on account with the block count reported by
			// hfs_pin_vnode.  On failure the value is not used: the
			// error path below continues to the next entry.
			fileblocks = pinned_blocks;

			// go back to an exclusive lock since we're going to modify the cnode again
			hfs_unlock(VTOC(vp));
			hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
		} else {
			//
			// Old style hotfiles moves the data to the center (aka "hot")
			// region of the disk
			//
			error = hfs_relocate(vp, hfsmp->hfs_hotfile_start, kauth_cred_get(), current_proc());
		}

		if (!error) {
			VTOC(vp)->c_attr.ca_recflags |= kHFSFastDevPinnedMask;
			VTOC(vp)->c_flag |= C_MODIFIED;
		} else if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && error == EALREADY) {
			//
			// If hfs_pin_vnode() returned EALREADY then this file is not
			// ever able to be hotfile cached the normal way.  This can
			// happen with compressed files which have their data stored
			// in an extended attribute.  We flag them so that we won't
			// bother to try and hotfile cache them again the next time
			// they're read.
			//
			VTOC(vp)->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask;
			VTOC(vp)->c_flag |= C_MODIFIED;
		}

		hfs_unlock(VTOC(vp));
		vnode_put(vp);
		if (error) {
#if HFC_VERBOSE
			if (error != EALREADY) {
				printf("hfs: hotfiles_adopt: could not relocate file %d (err %d)\n", listp->hfl_hotfile[i].hf_fileid, error);
			}
#endif

			/*
			 * A failed file does not count against this pass: widen
			 * the window by one entry when the list has more.
			 * 'error' stays set unless a later call overwrites it.
			 */
			if (last < listp->hfl_count) {
				last++;
			}
			/* Move on to next item. */
			listp->hfl_next++;
			continue;
		}
		/* Keep hot file free space current. */
		hfsmp->hfs_hotfile_freeblks -= fileblocks;
		listp->hfl_totalblocks -= fileblocks;
		
		/* Insert hot file entry */
		key->keyLength   = HFC_KEYLENGTH;

		if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
			//
			// The "temperature" for a CF hotfile is simply a random
			// number that we sequentially increment for each file in
			// the set of files we're currently adopting.  This has the
			// nice property that all of the files we pin to the ssd
			// in the current phase will sort together in the hotfile
			// btree.  When eviction time comes we will evict them
			// together as well.  This gives the eviction phase temporal
			// locality - things written together get evicted together
			// which is what ssd's like.
			//
			listp->hfl_hotfile[i].hf_temperature = (uint32_t)temp_adjust + starting_temp++;
		}

		key->temperature = listp->hfl_hotfile[i].hf_temperature;
		key->fileID      = listp->hfl_hotfile[i].hf_fileid;
		key->forkType    = 0;

		/* Start a new transaction before calling BTree code. */
		if (hfs_start_transaction(hfsmp) != 0) {
		    error = EINVAL;
		    break;
		}
		startedtrans = 1;

		error = BTInsertRecord(filefork, iterator, &record, record.itemSize);
		if (error) {
			int orig_error = error;
			error = MacToVFSError(error);
			printf("hfs: hotfiles_adopt:1: BTInsertRecord failed %d/%d (fileid %d)\n", error, orig_error, key->fileID);
			stage = HFC_IDLE;
			break;
		}

		/* Insert thread record (keyed by HFC_LOOKUPTAG, payload = temperature) */
		key->keyLength = HFC_KEYLENGTH;
		key->temperature = HFC_LOOKUPTAG;
		key->fileID = listp->hfl_hotfile[i].hf_fileid;
		key->forkType = 0;
		data = listp->hfl_hotfile[i].hf_temperature;
		error = BTInsertRecord(filefork, iterator, &record, record.itemSize);
		if (error) {
			int orig_error = error;
			error = MacToVFSError(error);
			printf("hfs: hotfiles_adopt:2: BTInsertRecord failed %d/%d (fileid %d)\n", error, orig_error, key->fileID);
			stage = HFC_IDLE;
			break;
		} else {
			(void) BTFlushPath(filefork);
			blksmoved += fileblocks;
		}

		/*
		 * Leaving here keeps the transaction open; it is finished by
		 * the code that follows the loop.
		 */
		listp->hfl_next++;
		if (listp->hfl_next >= listp->hfl_count) {
			break;
		}

		/* Transaction complete. */
		if (startedtrans) {
		    hfs_end_transaction(hfsmp);
		    startedtrans = 0;
		}

		if (hfs_hotfile_cur_freeblks(hfsmp) <= 0) {
#if HFC_VERBOSE
			printf("hfs: hotfiles_adopt: free space exhausted (%d)\n", hfsmp->hfs_hotfile_freeblks);
#endif
			break;
		}
	} /* end for */

#if HFC_VERBOSE
	printf("hfs: hotfiles_adopt: [%d] adopted %d blocks (%d files left)\n", listp->hfl_next, blksmoved, listp->hfl_count - i);
#endif
	if (!startedtrans) {
		// start a txn so we'll save the btree summary info
		if (hfs_start_transaction(hfsmp) == 0) {
			startedtrans = 1;
		}
	}		

	/* Finish any outstanding transactions. */
	if (startedtrans) {
		save_btree_user_info(hfsmp);

		(void) BTFlushPath(filefork);
		hfs_end_transaction(hfsmp);
		startedtrans = 0;
	}
	hfs_unlock(VTOC(hfsmp->hfc_filevp));

	/* Adoption is finished once the list is consumed or the band is full. */
	if ((listp->hfl_next >= listp->hfl_count) || (hfsmp->hfs_hotfile_freeblks <= 0)) {
#if HFC_VERBOSE
		printf("hfs: hotfiles_adopt: all done relocating %d files\n", listp->hfl_count);
		printf("hfs: hotfiles_adopt: %d blocks free in hot file band\n", hfsmp->hfs_hotfile_freeblks);
#endif
		stage = HFC_IDLE;
	}
	hfs_free(iterator, sizeof(*iterator));

	/* Leaving the adoption stage: the b-tree is no longer needed, close it. */
	if (stage != HFC_ADOPTION && hfsmp->hfc_filevp) {
		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
		hfsmp->hfc_filevp = NULL;
	}
	/* Publish the new stage and wake anyone sleeping on it. */
	hfsmp->hfc_stage = stage;
	wakeup((caddr_t)&hfsmp->hfc_stage);
	return (error);
}

/*
 * Reclaim space by evicting the coldest files.
 *
 * Repeatedly fetches a record from the hot file b-tree (the first record,
 * or the next one after a file could not be moved), un-pins the file it
 * names (volumes with HFS_CS_HOTFILE_PIN) or relocates it towards
 * nextAllocation, and then deletes both the temperature record and the
 * matching thread record.  Records that name the b-tree file itself, the
 * journal files, system files, vanished files, non-regular files or
 * files that are not hot have their records deleted without moving data.
 *
 * The loop runs while hfl_reclaimblks > 0 and fewer than HFC_BLKSPERSYNC
 * blocks and HFC_FILESPERSYNC files have been moved in this call.
 *
 * hfc_stage is HFC_BUSY while the routine runs.  On exit it is put back
 * to the entry stage (HFC_EVICTION), or advanced to HFC_ADOPTION when
 * the b-tree has no more temperature records or hfl_reclaimblks is <= 0.
 * Anyone sleeping on hfc_stage is woken.
 *
 * 'ctx' supplies the credential and process passed to hfs_relocate().
 *
 * Returns EBUSY when the stage is not HFC_EVICTION, 0 when there is no
 * file list, EPERM when the b-tree cnode cannot be locked, and otherwise
 * whatever value the loop last left in 'error'.
 *
 * Requires that the hfc_mutex be held.
 */
static int
hotfiles_evict(struct hfsmount *hfsmp, vfs_context_t ctx)
{
	BTreeIterator * iterator = NULL;
	struct vnode *vp;
	HotFileKey * key;
	filefork_t * filefork;
	hotfilelist_t  *listp;
	enum hfc_stage stage;		/* stage to publish on exit */
	u_int32_t savedtemp;
	int  blksmoved;
	int  filesmoved;
	int  fileblocks;
	int  error = 0;
	int  startedtrans = 0;		/* non-zero while a transaction is open */
	int  bt_op;			/* iteration op for the next BTIterateRecord */

	if (hfsmp->hfc_stage != HFC_EVICTION) {
		return (EBUSY);
	}

	if ((listp = hfsmp->hfc_filelist) == NULL)
		return (0);	

	if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) {
		return (EPERM);
	}

#if HFC_VERBOSE
		printf("hfs:%s: hotfiles_evict (hotfile start/end block: %d - %d; max/free: %d/%d; maxfiles: %d)\n",
		       hfsmp->vcbVN,
		       hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end,
		       hfsmp->hfs_hotfile_maxblks, hfsmp->hfs_hotfile_freeblks, hfsmp->hfc_maxfiles);
#endif

	iterator = hfs_mallocz(sizeof(*iterator));

	/* Remember the entry stage and flag the hot file state as busy. */
	stage = hfsmp->hfc_stage;
	hfsmp->hfc_stage = HFC_BUSY;

	filesmoved = blksmoved = 0;
	bt_op = kBTreeFirstRecord;

	key = (HotFileKey*) &iterator->key;

	filefork = VTOF(hfsmp->hfc_filevp);

#if HFC_VERBOSE
	printf("hfs: hotfiles_evict: reclaim blks %d\n", listp->hfl_reclaimblks);
#endif
	
	while (listp->hfl_reclaimblks > 0 &&
	       blksmoved < HFC_BLKSPERSYNC &&
	       filesmoved < HFC_FILESPERSYNC) {

		/*
		 * Obtain the first record (ie the coldest one).
		 * (bt_op becomes kBTreeNextRecord once a file could not be
		 * moved, so that record is stepped over instead of retried.)
		 */
		if (BTIterateRecord(filefork, bt_op, iterator, NULL, NULL) != 0) {
#if HFC_VERBOSE
			printf("hfs: hotfiles_evict: no more records\n");
#endif
			error = 0;
			stage = HFC_ADOPTION;
			break;
		}
		if (key->keyLength != HFC_KEYLENGTH) {
			printf("hfs: hotfiles_evict: invalid key length %d\n", key->keyLength);
			error = EFTYPE;
			break;
		}		
		/* Reaching a thread record means the temperature records are used up. */
		if (key->temperature == HFC_LOOKUPTAG) {
#if HFC_VERBOSE
			printf("hfs: hotfiles_evict: ran into thread records\n");
#endif
			error = 0;
			stage = HFC_ADOPTION;
			break;
		}

		// Jump straight to delete for some files...
		if (key->fileID == VTOC(hfsmp->hfc_filevp)->c_fileid
			|| key->fileID == hfsmp->hfs_jnlfileid
			|| key->fileID == hfsmp->hfs_jnlinfoblkid
			|| key->fileID < kHFSFirstUserCatalogNodeID) {
			goto delete;
		}

		/*
		 * Acquire the vnode for this file.
		 */
		error = hfs_vget(hfsmp, key->fileID, &vp, 0, 0);
		if (error) {
			if (error == ENOENT) {
				goto delete;  /* stale entry, go to next */
			} else {
				printf("hfs: hotfiles_evict: err %d getting file %d\n",
				       error, key->fileID);
			}
			break;
		}

		/* 
		 * Symlinks that may have been inserted into the hotfile zone during a previous OS are now stuck 
		 * here.  We do not want to move them. 
		 */
		if (!vnode_isreg(vp)) {
			//printf("hfs: hotfiles_evict: huh, not a file %d\n", key->fileID);
			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			goto delete;  /* invalid entry, go to next */
		}

		/* Stop once moving this file would exceed the per-call block budget. */
		fileblocks = VTOF(vp)->ff_blocks;
		if ((blksmoved > 0) &&
		    (blksmoved + fileblocks) > HFC_BLKSPERSYNC) {
			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			break;
		}
		/*
		 * Make sure file is in the hot area.
		 */
		if (!hotextents(hfsmp, &VTOF(vp)->ff_extents[0]) && !(VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask)) {
#if HFC_VERBOSE
			printf("hfs: hotfiles_evict: file %d isn't hot!\n", key->fileID);
#endif
			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			goto delete;  /* stale entry, go to next */
		}
		
		/*
		 * Relocate file out of hot area.  On cooperative fusion (CF) that just 
		 * means un-pinning the data from the ssd.  For traditional hotfiles that means moving
		 * the file data out of the hot region of the disk.
		 */
		if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
			uint32_t pinned_blocks;
			
			hfs_unlock(VTOC(vp));  // don't need an exclusive lock for this
			hfs_lock(VTOC(vp), HFS_SHARED_LOCK, HFS_LOCK_ALLOW_NOEXISTS);

			error = hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &pinned_blocks);
			// Account with the block count reported by hfs_pin_vnode;
			// on failure the value is not used (see 'goto next' below).
			fileblocks = pinned_blocks;

			if (!error) {
				// go back to an exclusive lock since we're going to modify the cnode again
				hfs_unlock(VTOC(vp));
				hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
			}
		} else {
			error = hfs_relocate(vp, HFSTOVCB(hfsmp)->nextAllocation, vfs_context_ucred(ctx), vfs_context_proc(ctx));
		}
		if (error) {
#if HFC_VERBOSE
			printf("hfs: hotfiles_evict: err %d relocating file %d\n", error, key->fileID);
#endif
			/*
			 * Leave this file's records in place and step past them.
			 * 'error' stays set unless a later call overwrites it.
			 */
			hfs_unlock(VTOC(vp));
			vnode_put(vp);
			bt_op = kBTreeNextRecord;
			goto next;  /* go to next */
		} else {
			VTOC(vp)->c_attr.ca_recflags &= ~kHFSFastDevPinnedMask;
			VTOC(vp)->c_flag |= C_MODIFIED;
		}

		//
		// We do not believe that this call to hfs_fsync() is
		// necessary and it causes a journal transaction
		// deadlock so we are removing it.
		//
		// (void) hfs_fsync(vp, MNT_WAIT, 0, p);

		hfs_unlock(VTOC(vp));
		vnode_put(vp);

		/* Credit the freed blocks and charge them against the reclaim target. */
		hfsmp->hfs_hotfile_freeblks += fileblocks;
		listp->hfl_reclaimblks -= fileblocks;
		if (listp->hfl_reclaimblks < 0)
			listp->hfl_reclaimblks = 0;
		blksmoved += fileblocks;
		filesmoved++;
		/*
		 * 'delete' is reached by falling through after a successful
		 * eviction and by goto for entries that only need their
		 * records removed.
		 */
delete:
		/* Start a new transaction before calling BTree code. */
		if (hfs_start_transaction(hfsmp) != 0) {
		    error = EINVAL;
		    break;
		}
		startedtrans = 1;

		/* Remove the temperature record the iterator is positioned on... */
		error = BTDeleteRecord(filefork, iterator);
		if (error) {
			error = MacToVFSError(error);
			break;
		}
		/* ...then re-key to HFC_LOOKUPTAG to remove the matching thread record. */
		savedtemp = key->temperature;
		key->temperature = HFC_LOOKUPTAG;
		error = BTDeleteRecord(filefork, iterator);
		if (error) {
			error = MacToVFSError(error);
			break;
		}
		key->temperature = savedtemp;
		/*
		 * 'next' is also entered by goto when the file could not be
		 * moved; on that path no transaction has been started in this
		 * iteration, so only the flush below runs.
		 */
next:
		(void) BTFlushPath(filefork);

		/* Transaction complete. */
		if (startedtrans) {
			hfs_end_transaction(hfsmp);
			startedtrans = 0;
		}

	} /* end while */

#if HFC_VERBOSE
	printf("hfs: hotfiles_evict: moved %d files (%d blks, %d to go)\n", filesmoved, blksmoved, listp->hfl_reclaimblks);
#endif
	/* Finish any outstanding transactions. */
	if (startedtrans) {
		save_btree_user_info(hfsmp);

		(void) BTFlushPath(filefork);
		hfs_end_transaction(hfsmp);
		startedtrans = 0;
	}
	hfs_unlock(VTOC(hfsmp->hfc_filevp));

	/*
	 * Move to next stage when finished.
	 */
	if (listp->hfl_reclaimblks <= 0) {
		stage = HFC_ADOPTION;
#if HFC_VERBOSE
		printf("hfs: hotfiles_evict: %d blocks free in hot file band\n", hfsmp->hfs_hotfile_freeblks);
#endif
	}
	hfs_free(iterator, sizeof(*iterator));	
	/* Publish the new stage and wake anyone sleeping on it. */
	hfsmp->hfc_stage = stage;
	wakeup((caddr_t)&hfsmp->hfc_stage);
	return (error);
}

/*
 * Age the existing records in the hot files b-tree.
 *
 * Starting from the first leaf record, visits up to
 * (numRecords / 2) - 1 records.  Each visited record is deleted and
 * re-inserted with its temperature halved (never below 4), and the
 * matching thread record (keyed by HFC_LOOKUPTAG) is updated with the
 * new temperature through update_callback.  The walk stops early when
 * the record about to be aged is itself a thread record.
 *
 * Two iterators are used: 'iterator' reads ahead to the next record
 * before 'prev_iterator' (the record being aged) is deleted.
 *
 * Does nothing and returns 0 on volumes with HFS_CS_HOTFILE_PIN set.
 * Otherwise returns EINVAL if a transaction cannot be started, EPERM if
 * the b-tree cnode cannot be locked, or the last value left in 'error'
 * by the b-tree calls (0 on success).
 */
static int
hotfiles_age(struct hfsmount *hfsmp)
{
	BTreeInfoRec  btinfo;
	BTreeIterator * iterator = NULL;
	BTreeIterator * prev_iterator;
	FSBufferDescriptor  record;
	FSBufferDescriptor  prev_record;
	HotFileKey * key;
	HotFileKey * prev_key;
	filefork_t * filefork;
	u_int32_t  data;
	u_int32_t  prev_data;
	u_int32_t  newtemp;
	int  error;
	int  i;
	int  numrecs;
	int  aged = 0;
	u_int16_t  reclen;


	if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
		//
		// hotfiles don't age on CF
		//
		return 0;
	}

	/* One allocation holds both iterators: [0] = current, [1] = previous. */
	iterator = hfs_mallocz(2 * sizeof(*iterator));

	key = (HotFileKey*) &iterator->key;

	prev_iterator = &iterator[1];
	prev_key = (HotFileKey*) &prev_iterator->key;

	record.bufferAddress = &data;
	record.itemSize = sizeof(data);
	record.itemCount = 1;
	prev_record.bufferAddress = &prev_data;
	prev_record.itemSize = sizeof(prev_data);
	prev_record.itemCount = 1;

	/*
	 * Capture b-tree changes inside a transaction
	 */
	if (hfs_start_transaction(hfsmp) != 0) {
	    error = EINVAL;
	    goto out2;
	} 
	if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) {
		error = EPERM;
		goto out1;
	}
	filefork = VTOF(hfsmp->hfc_filevp);

	error = BTGetInformation(filefork, 0, &btinfo);
	if (error) {
		error = MacToVFSError(error);
		goto out;
	}
	/* Nothing to age with fewer than two records. */
	if (btinfo.numRecords < 2) {
		error = 0;
		goto out;
	}
	
	/* Only want 1st half of leaf records (note: btinfo.numRecords is halved in place) */
	numrecs = (btinfo.numRecords /= 2) - 1;

	error = BTIterateRecord(filefork, kBTreeFirstRecord, iterator, &record, &reclen);
	if (error) {
		printf("hfs_agehotfiles: BTIterateRecord: %d\n", error);
		error = MacToVFSError(error);
		goto out;
	}
	/* The first record becomes the initial "previous" record. */
	bcopy(iterator, prev_iterator, sizeof(BTreeIterator));
	prev_data = data;

	for (i = 0; i < numrecs; ++i) {
		/* Read ahead to the next record before the previous one is re-keyed. */
		error = BTIterateRecord(filefork, kBTreeNextRecord, iterator, &record, &reclen);
		if (error == 0) {
			if (key->temperature < prev_key->temperature) {
				printf("hfs_agehotfiles: out of order keys!\n");
				error = EFTYPE;
				break;
			}
			if (reclen != sizeof(data)) {
				printf("hfs_agehotfiles: invalid record length %d\n", reclen);
				error = EFTYPE;
				break;
			}
			if (key->keyLength != HFC_KEYLENGTH) {
				printf("hfs_agehotfiles: invalid key length %d\n", key->keyLength);
				error = EFTYPE;
				break;
			}
		} else if ((error == fsBTEndOfIterationErr || error == fsBTRecordNotFoundErr) &&
		    (i == (numrecs - 1))) {
			/* Running off the end is tolerated on the final pass only. */
			error = 0;
		} else if (error) {
			printf("hfs_agehotfiles: %d of %d BTIterateRecord: %d\n", i, numrecs, error);
			error = MacToVFSError(error);
			break;
		}
		if (prev_key->temperature == HFC_LOOKUPTAG) {
#if HFC_VERBOSE	
			printf("hfs_agehotfiles: ran into thread record\n");
#endif
			error = 0;
			break;
		}
		/* Re-key the previous record: delete it, lower the temperature, re-insert. */
		error = BTDeleteRecord(filefork, prev_iterator);
		if (error) {
			printf("hfs_agehotfiles: BTDeleteRecord failed %d (file %d)\n", error, prev_key->fileID);
			error = MacToVFSError(error);
			break;
		}
		
		/* Age by halving the temperature (floor = 4) */
		newtemp = MAX(prev_key->temperature >> 1, 4);
		prev_key->temperature = newtemp;
	
		error = BTInsertRecord(filefork, prev_iterator, &prev_record, prev_record.itemSize);
		if (error) {
			printf("hfs_agehotfiles: BTInsertRecord failed %d (file %d)\n", error, prev_key->fileID);
			error = MacToVFSError(error);
			break;
		}
		++aged;
		/*
		 * Update thread entry with latest temperature.
		 *
		 * A failure here is logged but does not stop the walk (the
		 * break is commented out); 'error' is overwritten by the next
		 * pass, or returned if this was the last one.
		 */
		prev_key->temperature = HFC_LOOKUPTAG;
		error = BTUpdateRecord(filefork, prev_iterator,
				(IterateCallBackProcPtr)update_callback,
				&newtemp);
		if (error) {
			printf("hfs_agehotfiles: %d of %d BTUpdateRecord failed %d (file %d, %d)\n",
				i, numrecs, error, prev_key->fileID, newtemp);
			error = MacToVFSError(error);
		//	break;
		}

		/* The record read ahead above becomes the next one to age. */
		bcopy(iterator, prev_iterator, sizeof(BTreeIterator));
		prev_data = data;

	} /* end for */

#if HFC_VERBOSE	
	if (error == 0)
		printf("hfs_agehotfiles: aged %d records out of %d\n", aged, btinfo.numRecords);
#endif
	(void) BTFlushPath(filefork);
	/* Cleanup labels unwind in reverse order of acquisition. */
out:
	hfs_unlock(VTOC(hfsmp->hfc_filevp));
out1:
	hfs_end_transaction(hfsmp);
out2:
	if (iterator)
		hfs_free(iterator, 2 * sizeof(*iterator));
	return (error);
}

/*
 * Report whether any of the given extents touches the hot file region.
 *
 * Examines up to kHFSPlusExtentDensity descriptors, stopping at the
 * first one whose startBlock is zero.  An extent counts as hot when it
 * lies entirely between hfs_hotfile_start and hfs_hotfile_end, or when
 * it begins before hfs_hotfile_end and extends beyond it.
 *
 * Returns 1 as soon as such an extent is found, 0 otherwise.
 */
static int
hotextents(struct hfsmount *hfsmp, HFSPlusExtentDescriptor * extents)
{
	int  slot;

	for (slot = 0; slot < kHFSPlusExtentDensity; ++slot) {
		u_int32_t  first = extents[slot].startBlock;
		u_int32_t  final;

		/* Stop at the first descriptor with a zero start block. */
		if (first == 0)
			return (0);

		final = first + extents[slot].blockCount - 1;

		/* Extent sits wholly inside the hot file band. */
		if (first >= hfsmp->hfs_hotfile_start &&
		    final <= hfsmp->hfs_hotfile_end)
			return (1);

		/* Extent begins before the end of the band and runs past it. */
		if (first < hfsmp->hfs_hotfile_end &&
		    final > hfsmp->hfs_hotfile_end)
			return (1);
	}
	return (0);
}


/*
 *========================================================================
 *                       HOT FILE B-TREE ROUTINES
 *========================================================================
 */

/*
 * Open the hot files b-tree for writing.
 *
 * Convenience form of hfc_btree_open_ext() that does not ignore b-tree
 * open errors.
 *
 * On successful exit the vnode has a reference but not an iocount.
 */
static int
hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp)
{
	const int  ignore_btree_errs = 0;	/* report BTOpenPath failures */

	return (hfc_btree_open_ext(hfsmp, vpp, ignore_btree_errs));
}

static int
hfc_btree_open_ext(struct hfsmount *hfsmp, struct vnode **vpp, int ignore_btree_errs)
{
	proc_t p;
	struct vnode *vp;
	struct cat_desc  cdesc;
	struct cat_attr  cattr;
	struct cat_fork  cfork;
	static char filename[] = HFC_FILENAME;
	int  error;
	int  retry = 0;
	int lockflags;
	int newvnode_flags = 0;

	*vpp = NULL;
	p = current_proc();

	bzero(&cdesc, sizeof(cdesc));
	cdesc.cd_parentcnid = kRootDirID;
	cdesc.cd_nameptr = (const u_int8_t *)filename;
	cdesc.cd_namelen = strlen(filename);

	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);

	error = cat_lookup(hfsmp, &cdesc, 0, 0, &cdesc, &cattr, &cfork, NULL);

	hfs_systemfile_unlock(hfsmp, lockflags);

	if (error) {
		printf("hfs: hfc_btree_open: cat_lookup error %d\n", error);
		return (error);
	}
again:
	cdesc.cd_flags |= CD_ISMETA;
	error = hfs_getnewvnode(hfsmp, NULL, NULL, &cdesc, 0, &cattr, 
							&cfork, &vp, &newvnode_flags);
	if (error) {
		printf("hfs: hfc_btree_open: hfs_getnewvnode error %d\n", error);
		cat_releasedesc(&cdesc);
		return (error);
	}
	if (!vnode_issystem(vp)) {
#if HFC_VERBOSE
		printf("hfs: hfc_btree_open: file has UBC, try again\n");
#endif
		hfs_unlock(VTOC(vp));
		vnode_recycle(vp);
		vnode_put(vp);
		if (retry++ == 0)
			goto again;
		else
			return (EBUSY);
	}

	/* Open the B-tree file for writing... */
	error = BTOpenPath(VTOF(vp), (KeyCompareProcPtr) hfc_comparekeys);	
	if (error) {
		if (!ignore_btree_errs) {
			printf("hfs: hfc_btree_open: BTOpenPath error %d; filesize %lld\n", error, VTOF(vp)->ff_size);
			error = MacToVFSError(error);
		} else {
			error = 0;
		}
	}

	hfs_unlock(VTOC(vp));
	if (error == 0) {
		*vpp = vp;
		vnode_ref(vp);  /* keep a reference while its open */
	}
	vnode_put(vp);

	if (!vnode_issystem(vp))
		panic("hfs: hfc_btree_open: not a system file (vp = %p)", vp);

	HotFilesInfo hotfileinfo;

	if (error == 0 && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
		if ((BTGetUserData(VTOF(vp), &hotfileinfo, sizeof(hotfileinfo)) == 0) && (SWAP_BE32 (hotfileinfo.magic) == HFC_MAGIC)) {
			if (hfsmp->hfs_hotfile_freeblks == 0) {
				hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks - SWAP_BE32 (hotfileinfo.usedblocks);
			}

			hfs_hotfile_cur_freeblks(hfsmp);        // factors in any adjustments that happened at run-time
		}
	}
	
	return (error);
}

/*
 * Close the hot files b-tree.
 *
 * Flushes the journal when the volume has one.  Then, provided an
 * iocount can be taken on vp, the file is synced and its b-tree path
 * closed under the exclusive cnode lock, the reference is released and
 * the vnode is recycled.
 *
 * On entry the vnode has a reference.
 *
 * Returns 0, or the error from hfs_lock() or BTClosePath().  Also
 * returns 0, without touching the vnode, when vnode_get() fails.
 */
static int
hfc_btree_close(struct hfsmount *hfsmp, struct vnode *vp)
{
	proc_t caller = current_proc();
	int  result = 0;

	if (hfsmp->jnl) {
		hfs_flush(hfsmp, HFS_FLUSH_JOURNAL);
	}

	/* Without an iocount there is nothing more we can do. */
	if (vnode_get(vp) != 0) {
		return (result);
	}

	result = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
	if (result == 0) {
		(void) hfs_fsync(vp, MNT_WAIT, 0, caller);
		result = BTClosePath(VTOF(vp));
		hfs_unlock(VTOC(vp));
	}

	/* Release the reference, then let go of the vnode. */
	vnode_rele(vp);
	vnode_recycle(vp);
	vnode_put(vp);

	return (result);
}

//
// Delete a file's temperature record and its thread record from the
// hotfile btree, inside a single transaction.
//
// 'iterator' must be positioned on the temperature record and 'key'
// must be the key stored inside that iterator.  The key's temperature
// is swapped to HFC_LOOKUPTAG for the second delete and then restored.
//
// Assumes that hfsmp->hfc_filevp points to the hotfile btree vnode
// (i.e. you called hfc_btree_open() ahead of time)
//
// Returns EINVAL if the transaction cannot be started, otherwise 0 or
// the translated error from the failing BTDeleteRecord().
//
static int
hfc_btree_delete_record(struct hfsmount *hfsmp, BTreeIterator *iterator, HotFileKey *key)
{
	filefork_t *btfork = VTOF(hfsmp->hfc_filevp);
	int hot_temp;
	int error;

	/* Start a new transaction before calling BTree code. */
	if (hfs_start_transaction(hfsmp) != 0) {
		return EINVAL;
	}

	error = BTDeleteRecord(btfork, iterator);
	if (error == 0) {
		/* Re-key to the thread record, delete it, then restore the key. */
		hot_temp = key->temperature;
		key->temperature = HFC_LOOKUPTAG;
		error = BTDeleteRecord(btfork, iterator);
		if (error) {
			error = MacToVFSError(error);
			printf("hfs:2: failed to delete record for file-id %d : err %d\n", key->fileID, error);
		}
		key->temperature = hot_temp;

		(void) BTFlushPath(btfork);
	} else {
		/* The first delete failed: no flush, just close the transaction. */
		error = MacToVFSError(error);
		printf("hfs: failed to delete record for file-id %d : err %d\n", key->fileID, error);
	}

	/* Transaction complete. */
	hfs_end_transaction(hfsmp);

	return error;
}

//
// You have to have already opened the hotfile btree so
// that hfsmp->hfc_filevp is filled in.
//
static int
hfc_btree_delete(struct hfsmount *hfsmp)
{
	struct vnode *dvp = NULL;
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	static char filename[] = HFC_FILENAME;
	int  error;

	error = hfs_vfs_root(HFSTOVFS(hfsmp), &dvp, ctx);
	if (error) {
		return (error);
	}

	struct componentname cname = {
		.cn_nameiop = DELETE,
		.cn_flags = ISLASTCN,
		.cn_pnbuf = filename,
		.cn_pnlen = sizeof(filename),
		.cn_nameptr = filename,
		.cn_namelen = strlen(filename),
	};

	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VREG);
	VATTR_SET(&va, va_mode, S_IFREG | S_IRUSR | S_IWUSR);
	VATTR_SET(&va, va_uid, 0);
	VATTR_SET(&va, va_gid, 0);

	if (hfs_start_transaction(hfsmp) != 0) {
	    error = EINVAL;
	    goto out;
	} 

    struct vnop_remove_args ap = {
        .a_dvp = dvp,
        .a_vp  = hfsmp->hfc_filevp,
        .a_cnp = &cname,
    };

    error = hfs_vnop_remove(&ap);
	if (error) {
		printf("hfs: error %d removing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN);
	}

	hfs_end_transaction(hfsmp);

out:
	if (dvp) {
		vnode_put(dvp);
		dvp = NULL;
	}

	return 0;
}




/*
 *  Create a hot files btree file.
 *
 */
static int
hfc_btree_create(struct hfsmount *hfsmp, unsigned int nodesize, unsigned int entries)
{
	struct vnode *dvp = NULL;
	struct vnode *vp = NULL;
	struct cnode *cp = NULL;
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	static char filename[] = HFC_FILENAME;
	int  error;

	if (hfsmp->hfc_filevp)
		panic("hfs: hfc_btree_create: hfc_filevp exists (vp = %p)", hfsmp->hfc_filevp);

	error = hfs_vfs_root(HFSTOVFS(hfsmp), &dvp, ctx);
	if (error) {
		return (error);
	}

	struct componentname cname = {
		.cn_nameiop = CREATE,
		.cn_flags = ISLASTCN,
		.cn_pnbuf = filename,
		.cn_pnlen = sizeof(filename),
		.cn_nameptr = filename,
		.cn_namelen = strlen(filename)
	};

	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VREG);
	VATTR_SET(&va, va_mode, S_IFREG | S_IRUSR | S_IWUSR);
	VATTR_SET(&va, va_uid, 0);
	VATTR_SET(&va, va_gid, 0);

	if (hfs_start_transaction(hfsmp) != 0) {
		vnode_put(dvp);
		return EINVAL;
	}

	/* call ourselves directly, ignore the higher-level VFS file creation code */

    struct vnop_create_args ap = {
        .a_dvp = dvp,
        .a_vpp = &vp,
        .a_cnp = &cname,
        .a_vap = &va
    };

    error = hfs_vnop_create(&ap);
	if (error) {
		printf("hfs: error %d creating HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN);
		goto out;
	}
	if (dvp) {
		vnode_put(dvp);
		dvp = NULL;
	}
	if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
		goto out;
	}
	cp = VTOC(vp);

	/* Don't use non-regular files or files with links. */
	if (!vnode_isreg(vp) || cp->c_linkcount != 1) {
		error = EFTYPE;
		goto out;
	}

	printf("hfs: created HFBT on %s\n", HFSTOVCB(hfsmp)->vcbVN);

	if (VTOF(vp)->ff_size < nodesize) {
		caddr_t  buffer;
		u_int16_t *index;
		u_int16_t  offset;
		BTNodeDescriptor  *ndp;
		BTHeaderRec  *bthp;
		HotFilesInfo *hotfileinfo;
		int  nodecnt;
		int  filesize;
		int  entirespernode;

		/*
		 * Mark it invisible (truncate will pull these changes).
		 */
		((FndrFileInfo *)&cp->c_finderinfo[0])->fdFlags |=
			SWAP_BE16 (kIsInvisible + kNameLocked);

		buffer = hfs_mallocz(nodesize);
		index = (u_int16_t *)buffer;
	
		entirespernode = (nodesize - sizeof(BTNodeDescriptor) - 2) /
				 (sizeof(HotFileKey) + 6);
		nodecnt = 2 + howmany(entries * 2, entirespernode);
		nodecnt = roundup(nodecnt, 8);
		filesize = nodecnt * nodesize;
	
		/* FILL IN THE NODE DESCRIPTOR:  */
		ndp = (BTNodeDescriptor *)buffer;
		ndp->kind = kBTHeaderNode;
		ndp->numRecords = SWAP_BE16 (3);
		offset = sizeof(BTNodeDescriptor);
		index[(nodesize / 2) - 1] = SWAP_BE16 (offset);
	
		/* FILL IN THE HEADER RECORD:  */
		bthp = (BTHeaderRec *)((u_int8_t *)buffer + offset);
		bthp->nodeSize     = SWAP_BE16 (nodesize);
		bthp->totalNodes   = SWAP_BE32 (filesize / nodesize);
		bthp->freeNodes    = SWAP_BE32 (nodecnt - 1);
		bthp->clumpSize    = SWAP_BE32 (filesize);
		bthp->btreeType    = kUserBTreeType; /* non-metadata */
		bthp->attributes  |= SWAP_BE32 (kBTBigKeysMask);
		bthp->maxKeyLength = SWAP_BE16 (HFC_KEYLENGTH);
		offset += sizeof(BTHeaderRec);
		index[(nodesize / 2) - 2] = SWAP_BE16 (offset);
	
		/* FILL IN THE USER RECORD:  */
		hotfileinfo = (HotFilesInfo *)((u_int8_t *)buffer + offset);
		hotfileinfo->magic       = SWAP_BE32 (HFC_MAGIC);
		hotfileinfo->version     = SWAP_BE32 (HFC_VERSION);
		hotfileinfo->duration    = SWAP_BE32 (HFC_DEFAULT_DURATION);
		hotfileinfo->timebase    = 0;
		hotfileinfo->timeleft    = 0;
		hotfileinfo->threshold   = SWAP_BE32 (HFC_MINIMUM_TEMPERATURE);
		hotfileinfo->maxfileblks = SWAP_BE32 (HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize);
		if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) {
			if (hfsmp->hfs_hotfile_freeblks == 0) {
				hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks;
			}
			hotfileinfo->usedblocks = SWAP_BE32 (hfsmp->hfs_hotfile_maxblks - hfsmp->hfs_hotfile_freeblks);
		} else {
			hotfileinfo->maxfilecnt  = SWAP_BE32 (HFC_DEFAULT_FILE_COUNT);
		}
		strlcpy((char *)hotfileinfo->tag, hfc_tag,
			sizeof hotfileinfo->tag);
		offset += kBTreeHeaderUserBytes;
		index[(nodesize / 2) - 3] = SWAP_BE16 (offset);
	
		/* FILL IN THE MAP RECORD (only one node in use). */
		*((u_int8_t *)buffer + offset) = 0x80;
		offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec)
				   - kBTreeHeaderUserBytes - (4 * sizeof(int16_t));
		index[(nodesize / 2) - 4] = SWAP_BE16 (offset);

		vnode_setnoflush(vp);
		error = hfs_truncate(vp, (off_t)filesize, IO_NDELAY, 0, ctx);
		if (error) {
			printf("hfs: error %d growing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN);
			goto out;
		}
		cp->c_flag |= C_ZFWANTSYNC;
		cp->c_zftimeout = 1;
		
		if (error == 0) {
			struct vnop_write_args args;
			uio_t auio;

			auio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
			uio_addiov(auio, (uintptr_t)buffer, nodesize);

			args.a_desc = &vnop_write_desc;
			args.a_vp = vp;
			args.a_uio = auio;
			args.a_ioflag = 0;
			args.a_context = ctx;

			hfs_unlock(cp);
			cp = NULL;

			error = hfs_vnop_write(&args);
			if (error)
				printf("hfs: error %d writing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN);

			uio_free(auio);
		}
		hfs_free(buffer, nodesize);
	}
out:
	hfs_end_transaction(hfsmp);
	if (dvp) {
		vnode_put(dvp);
	}
	if (vp) {
		if (cp)
			hfs_unlock(cp);
		vnode_recycle(vp);
		vnode_put(vp);
	}
	return (error);
}

/*
 * Compare two hot file b-tree keys.
 *
 * Keys are ordered by temperature, then by file id, then by fork type;
 * the first field that differs decides the result.
 *
 * Result:   +1  search key > trial key
 *            0  search key = trial key
 *           -1  search key < trial key
 */
static int
hfc_comparekeys(HotFileKey *searchKey, HotFileKey *trialKey)
{
	/* Primary sort field: temperature. */
	if (searchKey->temperature != trialKey->temperature)
		return (searchKey->temperature > trialKey->temperature) ? 1 : -1;

	/* Same temperature: fall back to the file id. */
	if (searchKey->fileID != trialKey->fileID)
		return (searchKey->fileID > trialKey->fileID) ? 1 : -1;

	/* Same file: fall back to the fork type. */
	if (searchKey->forkType != trialKey->forkType)
		return (searchKey->forkType > trialKey->forkType) ? 1 : -1;

	return (0);
}


/*
 *========================================================================
 *               HOT FILE DATA COLLECTING ROUTINES
 *========================================================================
 */

/*
 * Lookup a hot file entry in the tree.
 *
 * The tree is ordered by temperature and then by file id.  Returns the
 * entry whose temperature AND file id both match, or NULL if the
 * search runs off the tree.
 */
#if HFC_DEBUG
static hotfile_entry_t *
hf_lookup(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature)
{
	hotfile_entry_t *entry = hotdata->rootentry;

	/*
	 * Keep descending while either field differs; stopping on a
	 * partial match would return an unrelated entry.
	 */
	while (entry &&
	       (entry->temperature != temperature ||
	        entry->fileid != fileid)) {

		if (temperature > entry->temperature)
			entry = entry->right;
		else if (temperature < entry->temperature)
			entry = entry->left;
		else if (fileid > entry->fileid)
			entry = entry->right;
		else
			entry = entry->left;
	}
	return (entry);
}
#endif

/*
 * Insert a hot file entry into the tree.
 *
 * The tree is ordered by temperature and then by file id: larger keys
 * go to the right, smaller keys to the left.
 *
 * Returns 0 after linking newentry into the tree (activefiles is
 * incremented), or EEXIST if an entry with the same temperature and
 * file id is already present; in that case the tree is left untouched
 * and the caller still owns newentry.
 */
static int
hf_insert(hotfile_data_t *hotdata, hotfile_entry_t *newentry) 
{
	hotfile_entry_t *entry = hotdata->rootentry;
	hotfile_entry_t **link;
	u_int32_t fileid = newentry->fileid;
	u_int32_t temperature = newentry->temperature;

	/* Empty tree: the new entry is both the root and the coldest. */
	if (entry == NULL) {
		hotdata->rootentry = newentry;
		hotdata->coldest = newentry;
		hotdata->activefiles++;
		return 0;
	}

	for (;;) {
		/*
		 * Check for a duplicate at every node on the way down, not
		 * only at nodes that lack a left child.
		 */
		if (temperature == entry->temperature &&
		    fileid == entry->fileid) {
			return EEXIST;
		}

		if (temperature > entry->temperature ||
		    (temperature == entry->temperature &&
		     fileid > entry->fileid)) {
			link = &entry->right;
		} else {
			link = &entry->left;
		}

		if (*link == NULL) {
			*link = newentry;
			break;
		}
		entry = *link;
	}

	hotdata->activefiles++;
	return 0;
}

/*
 * Find the coldest entry in the tree.
 *
 * This is the leftmost node reachable from the root; NULL is returned
 * when the tree is empty.
 */
static hotfile_entry_t *
hf_coldest(hotfile_data_t *hotdata)
{
	hotfile_entry_t *node;

	for (node = hotdata->rootentry;
	     node != NULL && node->left != NULL;
	     node = node->left) {
		continue;
	}
	return (node);
}

/*
 * Find the hottest entry in the tree.
 *
 * This is the rightmost node reachable from the root; NULL is returned
 * when the tree is empty.
 */
static hotfile_entry_t *
hf_hottest(hotfile_data_t *hotdata)
{
	hotfile_entry_t *node;

	for (node = hotdata->rootentry;
	     node != NULL && node->right != NULL;
	     node = node->right) {
		continue;
	}
	return (node);
}

/*
 * Delete a hot file entry from the tree.
 *
 * Finds the entry whose temperature AND file id both match, unlinks it
 * from the tree, clears it and pushes it onto hotdata->freelist
 * (chained through ->right).  activefiles is decremented and
 * hotdata->coldest is recomputed when it pointed at the removed entry
 * or was NULL.  Nothing happens if no matching entry exists.
 */
static void
hf_delete(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature)
{
	hotfile_entry_t *entry, *parent, *next;

	parent = NULL;
	entry = hotdata->rootentry;

	/*
	 * Keep descending while either field differs.  Stopping on a
	 * partial match would unlink an unrelated entry that merely
	 * shares the temperature or the file id.
	 */
	while (entry &&
	       (entry->temperature != temperature ||
	        entry->fileid != fileid)) {

		parent = entry;
		if (temperature > entry->temperature)
			entry = entry->right;
		else if (temperature < entry->temperature)
			entry = entry->left;
		else if (fileid > entry->fileid)
			entry = entry->right;
		else
			entry = entry->left;
	}

	if (entry == NULL)
		return;

	/*
	 * Reorganize the sub-trees spanning from our entry.
	 */
	if ((next = entry->right)) {
		hotfile_entry_t *pnextl, *psub;
		/*
		 * Tree pruning: take the left branch of the
		 * current entry and place it at the lowest
		 * left branch of the current right branch 
		 */
		psub = next;

		/* Walk the Right/Left sub tree from current entry */
		while ((pnextl = psub->left))
			psub = pnextl;

		/* Plug the old left tree to the new ->Right leftmost entry */
		psub->left = entry->left;

	} else /* only left sub-tree, simple case */ {
		next = entry->left;
	}

	/* 
	 * Now, plug the current entry sub tree to
	 * the good pointer of our parent entry.
	 */
	if (parent == NULL)
		hotdata->rootentry = next;
	else if (parent->left == entry)
		parent->left = next;
	else
		parent->right = next;

	/* Place entry back on the free-list */
	entry->left = 0;
	entry->fileid = 0;
	entry->temperature = 0;

	entry->right = hotdata->freelist;
	hotdata->freelist = entry;
	hotdata->activefiles--;

	if (hotdata->coldest == entry || hotdata->coldest == NULL) {
		hotdata->coldest = hf_coldest(hotdata);
	}
}

/*
 * Get a free hot file entry.
 *
 * Pops the head of hotdata->freelist.  If the free list is empty, the
 * coldest tree entry is deleted first so that the free list has
 * something to hand out.
 * NOTE(review): assumes the tree is non-empty whenever the free list is
 * empty; hf_coldest() would return NULL otherwise -- confirm with callers.
 */
static hotfile_entry_t *
hf_getnewentry(hotfile_data_t *hotdata)
{
	hotfile_entry_t *fresh;

	if (hotdata->freelist == NULL) {
		/* Nothing spare: evict the coldest entry from the tree. */
		hotfile_entry_t *victim = hf_coldest(hotdata);

		hf_delete(hotdata, victim->fileid, victim->temperature);
	}

	/* Unchain the first free entry. */
	fresh = hotdata->freelist;
	hotdata->freelist = fresh->right;
	fresh->right = 0;

	return (fresh);
}


/*
 * Generate a sorted list of hot files (hottest to coldest).
 *
 * Repeatedly takes the hottest remaining entry, copies its file id,
 * temperature and block count into the next slot of sortedlist, adds
 * its blocks to hfl_totalblocks and then deletes it from the tree.
 * hfl_count is set to the number of slots filled.
 *
 * As a side effect, the hot file tree is emptied: hf_delete() moves
 * the removed nodes to the free list.
 */
static void
hf_getsortedlist(hotfile_data_t * hotdata, hotfilelist_t *sortedlist)
{
	hotfile_entry_t *hottest;
	int count;

	for (count = 0; (hottest = hf_hottest(hotdata)) != NULL; count++) {
		sortedlist->hfl_hotfile[count].hf_fileid = hottest->fileid;
		sortedlist->hfl_hotfile[count].hf_temperature = hottest->temperature;
		sortedlist->hfl_hotfile[count].hf_blocks = hottest->blocks;
		sortedlist->hfl_totalblocks += hottest->blocks;

		hf_delete(hotdata, hottest->fileid, hottest->temperature);
	}

	sortedlist->hfl_count = count;

#if HFC_VERBOSE
	printf("hfs: hf_getsortedlist returning %d entries w/%d total blocks\n", count, sortedlist->hfl_totalblocks);
#endif
}


#if HFC_DEBUG
/*
 * Record the depth of the deepest node below root in *maxdepth.
 * depth is the depth of root's parent; *maxdepth is only ever raised.
 */
static void
hf_maxdepth(hotfile_entry_t * root, int depth, int *maxdepth)
{
	/* Recurse into left children, iterate down the right spine. */
	while (root != NULL) {
		++depth;
		if (*maxdepth < depth)
			*maxdepth = depth;
		hf_maxdepth(root->left, depth, maxdepth);
		root = root->right;
	}
}

/*
 * Print every entry of the tree in order (left subtree, node, right
 * subtree), one line per entry.
 */
static void
hf_printtree(hotfile_entry_t * root)
{
	/* Recurse into left children, iterate down the right spine. */
	while (root != NULL) {
		hf_printtree(root->left);
		printf("hfs: temperature: % 8d, fileid %d\n", root->temperature, root->fileid);
		root = root->right;
	}
}
#endif