/* psort_b-fbsd.c */


/****************************************************************************/
/*-
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* SCCS identification string; compiled in only when LIBC_SCCS is defined and lint is not. */
#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid[] = "@(#)qsort.c	8.1 (Berkeley) 6/4/93";
#endif /* LIBC_SCCS and not lint */
#include <sys/cdefs.h>
/* Version identification: the string names FreeBSD's qsort.c, revision 1.15. */
__FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $");

#include <stdlib.h>
#include <pthread.h>
#include <dispatch/dispatch.h>
#include <stddef.h>
#include <string.h>
#include <libkern/OSAtomic.h>
#include <sys/mman.h>
#include <errno.h>
#define __APPLE_API_PRIVATE
#include <machine/cpu_capabilities.h>

/*
 * Comparison callback type.  In the psort_r build (I_AM_PSORT_R) the callback
 * receives the caller's thunk pointer as its first argument; in the other
 * builds it receives only the two elements being compared.
 */
#ifdef I_AM_PSORT_R
typedef int		 cmp_t(void *, const void *, const void *);
#else
typedef int		 cmp_t(const void *, const void *);
#endif
/*
 * Forward declarations of the always-inlined helpers.  The psort_b build
 * (I_AM_PSORT_B) takes the comparator as a block (cmp_t ^); the other builds
 * take a function pointer (cmp_t *).
 */
#ifdef I_AM_PSORT_B
static inline char	*med3(char *, char *, char *, cmp_t ^, void *) __attribute__((always_inline));
#else
static inline char	*med3(char *, char *, char *, cmp_t *, void *) __attribute__((always_inline));
#endif
static inline void	 swapfunc(char *, char *, int, int) __attribute__((always_inline));

/*
 * Smaller of a and b.  Every argument and the whole expansion are
 * parenthesized so the macro composes correctly inside larger expressions.
 * Each argument may still be evaluated twice, so pass only expressions
 * without side effects.
 */
#define min(a, b)	((a) < (b) ? (a) : (b))

/* Number of union args records that fit in one page after the struct page header. */
#define NARGS			((PAGESIZE - offsetof(struct page, args)) / sizeof(union args))
/* Size in bytes of each mapping that getargs() carves into records. */
#define PAGESIZE		4096
/* Inputs with fewer elements than this are handed straight to the serial qsort variant. */
#define PARALLEL_MIN_SIZE	2000	/* determined heuristically */

struct shared; /* forward reference */
/*
 * One unit of work for _psort_parallel().  While a record sits on the free
 * list only `next` is meaningful; once handed out by getargs(), the anonymous
 * struct describes the sub-array to sort.
 */
union args {
    union args *next;		/* next record on the free list */
    struct {
	struct shared *shared;	/* state common to the whole sort */
	void *a;		/* first element of the sub-array */
	size_t n;		/* number of elements in the sub-array */
	int depth_limit;	/* passes _psort may still make before using heapsort */
    } /* anonymous */;
};

/*
 * Header of one PAGESIZE mapping of work records.  Mappings are chained
 * through `next` so that all of them can be unmapped when the sort finishes;
 * the records themselves follow the header (NARGS of them per mapping).
 */
struct page {
    struct page *next;
    union args args[];	/* C99 flexible array member instead of the GNU [0] extension */
};

/*
 * State shared by every task of one parallel sort.  A single instance is
 * set up by the public entry point and reached by the tasks through
 * args->shared.
 */
struct shared {
    char *who;			/* name of the entry point, used in the abort message */
    union args *freelist;	/* unused work records (guarded by sharedlock) */
    struct page *pagelist;	/* every mapping made by getargs(), unmapped at the end */
#ifdef I_AM_PSORT_R
    void *thunk;		/* passed to cmp as its first argument */
#endif
#ifdef I_AM_PSORT_B
    cmp_t ^cmp;			/* comparison block */
#else
    cmp_t *cmp;			/* comparison function */
#endif
    size_t es;			/* size of one element in bytes */
    size_t turnoff;		/* left partitions with more elements than this are dispatched */
    dispatch_queue_t queue;	/* queue that receives dispatched sub-sorts */
    pthread_cond_t cond;	/* signalled when count drops to zero */
    pthread_mutex_t mutex;	/* pairs with cond */
    OSSpinLock sharedlock;	/* guards freelist and pagelist */
    int count;			/* _psort_parallel invocations not yet finished */
};

/*
 * Take one work record off shared's free list, first growing the list by
 * one freshly mapped page when it is empty.
 *
 * Returns the record, or NULL with errno set by mmap() when no page could be
 * mapped.  The spin lock is released on every path.
 */
static union args *
getargs(struct shared *shared)
{
    union args *args;

    OSSpinLockLock(&shared->sharedlock);
    if(!shared->freelist) {
	struct page *page;
	union args *prev;
	int i;

	/* mmap() reports failure with MAP_FAILED, never with NULL. */
	page = mmap(NULL, PAGESIZE, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
	if(page == MAP_FAILED) {
	    int saved_errno = errno;

	    /* Do not leave the lock held across the caller's error path. */
	    OSSpinLockUnlock(&shared->sharedlock);
	    errno = saved_errno;	/* callers report strerror(errno) */
	    return NULL;
	}
	/* Remember the mapping so it can be unmapped when the sort is done. */
	page->next = shared->pagelist;
	shared->pagelist = page;
	/* Thread all NARGS records of the new page onto a list. */
	prev = NULL;
	for(args = page->args, i = NARGS; i > 0; args++, i--) {
	    args->next = prev;
	    prev = args;
	}
	shared->freelist = prev;
    }
    /* Pop the head of the free list. */
    args = shared->freelist;
    shared->freelist = args->next;
    OSSpinLockUnlock(&shared->sharedlock);
    return args;
}

static void
returnargs(struct shared *shared, union args *args)
{
    OSSpinLockLock(&shared->sharedlock);
    args->next = shared->freelist;
    shared->freelist = args;
    OSSpinLockUnlock(&shared->sharedlock);
}

/*
 * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
 */
/*
 * Exchange n bytes between parmi and parmj, moving one TYPE at a time.
 * The do/while body runs before the count is tested, so at least one TYPE
 * is always exchanged.  The expansion is a bare { } block: swapfunc() invokes
 * it without a trailing semicolon because an `else` follows.
 */
#define swapcode(TYPE, parmi, parmj, n) { 		\
	long i = (n) / sizeof (TYPE); 			\
	TYPE *pi = (TYPE *) (parmi); 		\
	TYPE *pj = (TYPE *) (parmj); 		\
	do { 						\
		TYPE	t = *pi;		\
		*pi++ = *pj;				\
		*pj++ = t;				\
        } while (--i > 0);				\
}

/*
 * Choose how elements are exchanged and store the choice in the caller's
 * `swaptype` variable:
 *   0 - a is long-aligned and es == sizeof(long): swap a single long
 *   1 - a is long-aligned and es is another multiple of sizeof(long):
 *       swap long by long
 *   2 - otherwise: swap byte by byte
 * Both arguments are parenthesized so that expressions such as `x + y` are
 * classified correctly, and the expansion has no trailing semicolon, so
 * `SWAPINIT(a, es);` is exactly one statement.
 */
#define SWAPINIT(a, es) swaptype = (((char *)(a) - (char *)0) % sizeof(long) || \
	(es) % sizeof(long)) ? 2 : ((es) == sizeof(long) ? 0 : 1)

/*
 * Exchange the n bytes at a with the n bytes at b.
 *
 * swaptype <= 1: both regions are long-aligned and n is a multiple of
 *                sizeof(long) (as established by SWAPINIT), so the data is
 *                moved one long at a time.
 * otherwise:     the data is moved one byte at a time.
 *
 * Each loop exchanges one unit before testing its count, so n must be at
 * least one unit; vecswap() guards the n == 0 case for its callers.
 *
 * Defined with a prototype rather than the obsolescent K&R parameter list.
 */
static inline void
swapfunc(char *a, char *b, int n, int swaptype)
{
	if (swaptype <= 1) {
		long words = n / sizeof(long);
		long *wa = (long *)a;
		long *wb = (long *)b;

		do {
			long tmp = *wa;
			*wa++ = *wb;
			*wb++ = tmp;
		} while (--words > 0);
	} else {
		long bytes = n / sizeof(char);
		char *ca = a;
		char *cb = b;

		do {
			char tmp = *ca;
			*ca++ = *cb;
			*cb++ = tmp;
		} while (--bytes > 0);
	}
}

/*
 * Exchange the elements at a and b.  Uses the enclosing function's
 * `swaptype` and `es` variables: swaptype 0 means each element is exactly
 * one aligned long and is exchanged directly; anything else goes through
 * swapfunc().  Wrapped in do { } while (0) so the macro is a single
 * statement wherever it is used.
 */
#define swap(a, b)					\
	do {						\
		if (swaptype == 0) {			\
			long t = *(long *)(a);		\
			*(long *)(a) = *(long *)(b);	\
			*(long *)(b) = t;		\
		} else					\
			swapfunc(a, b, es, swaptype);	\
	} while (0)

/*
 * Exchange n bytes between a and b; does nothing when n is zero.  Wrapped in
 * do { } while (0) so that an `else` written after the macro cannot attach to
 * the macro's own `if`.
 */
#define vecswap(a, b, n)				\
	do {						\
		if ((n) > 0)				\
			swapfunc(a, b, n, swaptype);	\
	} while (0)

/*
 * Invoke the enclosing function's `cmp` on x and y.  Only the psort_r build
 * forwards t (the thunk) as the first argument; the other builds drop it
 * without evaluating it.
 */
#ifdef I_AM_PSORT_R
#define	CMP(t, x, y) (cmp((t), (x), (y)))
#else
#define	CMP(t, x, y) (cmp((x), (y)))
#endif

/*
 * Return whichever of the pointers a, b and c refers to the median of the
 * three elements, as ordered by cmp.  Two or three comparisons are made.
 * thunk is forwarded to cmp only in the psort_r build.
 */
static inline char *
med3(char *a, char *b, char *c,
#ifdef I_AM_PSORT_B
cmp_t ^cmp,
#else
cmp_t *cmp,
#endif
void *thunk
#ifndef I_AM_PSORT_R
__unused
#endif
)
{
	if (CMP(thunk, a, b) < 0) {
		/* a sorts before b: the median is b unless c sorts at or below it. */
		if (CMP(thunk, b, c) < 0)
			return b;
		return CMP(thunk, a, c) < 0 ? c : a;
	}
	/* b sorts at or before a: the median is b unless c sorts at or above it. */
	if (CMP(thunk, b, c) > 0)
		return b;
	return CMP(thunk, a, c) < 0 ? a : c;
}

/*
 * Initial depth limit for a sort of x elements: 2 * (fls(x) - 1), where
 * fls()/flsl() give the 1-based position of the highest set bit, i.e. roughly
 * 2 * log2(x).  The long variant is used on LP64 so that a 64-bit count is
 * not truncated.
 */
#ifdef __LP64__
#define DEPTH(x)	(2 * (flsl((long)(x)) - 1))
#else /* !__LP64__ */
#define DEPTH(x)	(2 * (fls((int)(x)) - 1))
#endif /* __LP64__ */

/* Heapsort taking a thunk; called by _psort in the psort_r build once the depth limit is used up. */
#ifdef I_AM_PSORT_R
int __heapsort_r(void *, size_t, size_t, void *, int (*)(void *, const void *, const void *));
#endif

/* Declared ahead of _psort, which passes it to dispatch_async_f() as the task function. */
static void _psort_parallel(void *x);

/*
 * Quicksort the n elements of size es that start at a.
 *
 *   thunk        (psort_r build only) forwarded to cmp as its first argument;
 *                in the other builds `thunk` is defined as NULL from here on.
 *   cmp          comparison block or function.
 *   depth_limit  number of partitioning passes still allowed; when it is
 *                used up the remaining range is handed to heapsort.
 *   shared       parallel-sort state, or NULL.  When non-NULL, a left
 *                partition with more than shared->turnoff elements is
 *                dispatched to shared->queue instead of being sorted here.
 *
 * The left partition is dispatched or sorted by a recursive call (which
 * always passes shared == NULL); the right partition is handled by jumping
 * back to `loop`.
 */
static void
_psort(void *a, size_t n, size_t es,
#ifdef I_AM_PSORT_R
void *thunk,
#else
#define thunk	NULL
#endif
#ifdef I_AM_PSORT_B
cmp_t ^cmp,
#else
cmp_t *cmp,
#endif
int depth_limit, struct shared *shared)
{
	/*
	 * pa/pb walk up from the left and pc/pd walk down from the right
	 * during partitioning; pl/pm/pn are pivot candidates (low, middle,
	 * high) and are reused as cursors by the insertion sorts.
	 */
	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
	size_t d, r;
	int cmp_result;
	int swaptype, swap_cnt;

loop:
	/* Depth budget used up: finish this range with heapsort. */
	if (depth_limit-- <= 0) {
#ifdef I_AM_PSORT_B
		heapsort_b(a, n, es, cmp);
#elif defined(I_AM_PSORT_R)
		__heapsort_r(a, n, es, thunk, cmp);
#else
		heapsort(a, n, es, cmp);
#endif
		return;
	}
	SWAPINIT(a, es);
	swap_cnt = 0;
	/* Fewer than 7 elements: plain insertion sort. */
	if (n < 7) {
		for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
			for (pl = pm; 
			     pl > (char *)a && CMP(thunk, pl - es, pl) > 0;
			     pl -= es)
				swap(pl, pl - es);
		return;
	}
	/*
	 * Pivot selection: the middle element when n == 7, the median of
	 * first/middle/last when n > 7, and the median of three such medians
	 * when n > 40.
	 */
	pm = (char *)a + (n / 2) * es;
	if (n > 7) {
		pl = a;
		pn = (char *)a + (n - 1) * es;
		if (n > 40) {
			d = (n / 8) * es;
			pl = med3(pl, pl + d, pl + 2 * d, cmp, thunk);
			pm = med3(pm - d, pm, pm + d, cmp, thunk);
			pn = med3(pn - 2 * d, pn - d, pn, cmp, thunk);
		}
		pm = med3(pl, pm, pn, cmp, thunk);
	}
	/* Park the pivot in the first slot; everything is compared against a. */
	swap(a, pm);
	pa = pb = (char *)a + es;

	pc = pd = (char *)a + (n - 1) * es;
	/*
	 * Partition.  Elements equal to the pivot are collected at the two
	 * ends (below pa on the left, above pd on the right) while pb and pc
	 * close in on each other.
	 */
	for (;;) {
		while (pb <= pc && (cmp_result = CMP(thunk, pb, a)) <= 0) {
			if (cmp_result == 0) {
				swap_cnt = 1;
				swap(pa, pb);
				pa += es;
			}
			pb += es;
		}
		while (pb <= pc && (cmp_result = CMP(thunk, pc, a)) >= 0) {
			if (cmp_result == 0) {
				swap_cnt = 1;
				swap(pc, pd);
				pd -= es;
			}
			pc -= es;
		}
		if (pb > pc)
			break;
		swap(pb, pc);
		swap_cnt = 1;
		pb += es;
		pc -= es;
	}

	/* Move the pivot-equal runs from both ends into the middle. */
	pn = (char *)a + n * es;
	r = min(pa - (char *)a, pb - pa);
	vecswap(a, pb - r, r);
	r = min(pd - pc, pn - pd - es);
	vecswap(pb, pn - r, r);

	/*
	 * Partitioning moved nothing, so try an insertion sort, but give up
	 * and continue with quicksort once it has needed more than r swaps.
	 */
	if (swap_cnt == 0) {  /* Switch to insertion sort */
		r = 1 + n / 4; /* n >= 7, so r >= 2 */
		for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
			for (pl = pm; 
			     pl > (char *)a && CMP(thunk, pl - es, pl) > 0;
			     pl -= es) {
				swap(pl, pl - es);
				if (++swap_cnt > r) goto nevermind;
			}
		return;
	}

nevermind:
	/* Left partition (elements less than the pivot), if it has more than one element. */
	if ((r = pb - pa) > es) {
		r /= es;
		if (shared && r > shared->turnoff) {
			/* Big enough to be worth a task: hand it to the dispatch queue. */
			union args *args = getargs(shared);

			if (args == NULL)
				LIBC_ABORT("%s: getargs: %s", shared->who, strerror(errno));
			args->shared = shared;
			args->a = a;
			args->n = r;
			args->depth_limit = depth_limit;
			/* Count the task before it is queued; _psort_parallel decrements when it finishes. */
			OSAtomicIncrement32(&shared->count);
			dispatch_async_f(shared->queue, args, _psort_parallel);
		} else {
			/* Sort it on this thread; shared is NULL, so nothing below it is dispatched. */
#ifdef I_AM_PSORT_R
			_psort(a, r, es, thunk, cmp, depth_limit, NULL);
#else
			_psort(a, r, es, cmp, depth_limit, NULL);
#endif
		}
	}
	/* Right partition (elements greater than the pivot), if it has more than one element. */
	if ((r = pd - pc) > es) {
		/* Iterate rather than recurse to save stack space */
		a = pn - r;
		n = r / es;
		goto loop;
	}
	/* The jump to `loop` above stands in for a recursive call on the right partition. */
}

/*
 * Task function: sort the sub-array described by the work record x, recycle
 * the record, then report completion.  Called directly by the public entry
 * point for the first record and via dispatch_async_f() for the others.
 *
 * shared->count is decremented while shared->mutex is held.  The waiter in
 * the entry point tests the count under the same mutex and, once it sees
 * zero, destroys the mutex and condition variable and returns, ending the
 * lifetime of *shared (it lives on the waiter's stack).  If the decrement
 * happened outside the mutex, the waiter could see zero and tear everything
 * down while this thread was still about to lock and signal.
 */
static void
_psort_parallel(void *x)
{
	union args *args = (union args *)x;
	struct shared *shared = args->shared;

	_psort(args->a, args->n, shared->es,
#ifdef I_AM_PSORT_R
		shared->thunk,
#endif
		shared->cmp, args->depth_limit, shared);
	returnargs(shared, args);
	pthread_mutex_lock(&shared->mutex);
	if(OSAtomicDecrement32(&shared->count) <= 0)
		pthread_cond_signal(&shared->cond);
	/* *shared must not be touched after this unlock. */
	pthread_mutex_unlock(&shared->mutex);
}

/* fast, approximate integer square root */
static size_t
isqrt(size_t x)
{
    size_t s = 1L << (flsl(x) / 2);
    return (s + x / s) / 2;
}

/*
 * Public entry point (psort, psort_r or psort_b depending on the build).
 * Sorts the n elements of size es starting at a, ordered by cmp.
 *
 * When n is at least PARALLEL_MIN_SIZE, more than one CPU is reported and
 * the first work record can be obtained, the sort runs in parallel on the
 * default-priority global dispatch queue and this call returns only after
 * every dispatched task has finished.  In all other cases the matching
 * serial routine (qsort, qsort_r or qsort_b) does the work.
 */
void
#ifdef I_AM_PSORT_R
psort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
#elif defined(I_AM_PSORT_B)
psort_b(void *a, size_t n, size_t es, cmp_t ^cmp)
#else
psort(void *a, size_t n, size_t es, cmp_t *cmp)
#endif
{
	struct shared shared;
	union args *first;
	struct page *cur, *following;

	/* Too small, or only one CPU: sort serially. */
	if (n < PARALLEL_MIN_SIZE || _NumCPUs() <= 1)
		goto serial;

	bzero(&shared, sizeof(shared));
	shared.sharedlock = OS_SPINLOCK_INIT;
	first = getargs(&shared);
	if (first == NULL)
		goto serial;	/* no memory for work records */

#ifdef I_AM_PSORT_R
	shared.who = "psort_r";
	shared.thunk = thunk;
#elif defined(I_AM_PSORT_B)
	shared.who = "psort_b";
#else
	shared.who = "psort";
#endif
	shared.cmp = cmp;
	shared.es = es;
	shared.queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
	shared.cond = (pthread_cond_t)PTHREAD_COND_INITIALIZER;
	shared.mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;

	/* The first work record covers the whole array. */
	first->a = a;
	first->n = n;
	first->depth_limit = DEPTH(n);
	first->shared = &shared;

	/*
	 * turnoff is the partition size at or below which work is no longer
	 * dispatched and is sorted on the current thread instead.  sqrt(n)
	 * was chosen heuristically.  There is a smaller dependence on how
	 * slow the comparison function is, and possibly one on the number
	 * of processors, but no formula for either has been worked out.
	 * Because the result is not very sensitive to the exact value, a
	 * fast, approximate integer square root is good enough.
	 */
	shared.turnoff = isqrt(n);

	/* Run the first task on this thread; it dispatches the others. */
	OSAtomicIncrement32(&shared.count);
	_psort_parallel(first);

	/* Wait for the queue to drain. */
	pthread_mutex_lock(&shared.mutex);
	while (shared.count > 0)
		pthread_cond_wait(&shared.cond, &shared.mutex);
	pthread_mutex_unlock(&shared.mutex);

	pthread_mutex_destroy(&shared.mutex);
	pthread_cond_destroy(&shared.cond);

	/* Unmap every page of work records that was mapped along the way. */
	cur = shared.pagelist;
	while (cur) {
		following = cur->next;
		munmap(cur, PAGESIZE);
		cur = following;
	}
	return;

serial:
	/* Just call qsort */
#ifdef I_AM_PSORT_R
	qsort_r(a, n, es, thunk, cmp);
#elif defined(I_AM_PSORT_B)
	qsort_b(a, n, es, cmp);
#else
	qsort(a, n, es, cmp);
#endif
}