--- psort.c.orig	2008-11-24 17:01:07.000000000 -0800
+++ psort.c	2008-11-24 22:02:57.000000000 -0800
@@ -1,3 +1,4 @@
+/****************************************************************************/
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -34,14 +35,22 @@ static char sccsid[] = "@(#)qsort.c	8.1
 __FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $");
 
 #include <stdlib.h>
+#include <pthread.h>
+#include <dispatch/dispatch.h>
+#include <stddef.h>
 #include <string.h>
+#include <libkern/OSAtomic.h>
+#include <sys/mman.h>
+#include <errno.h>
+#define __APPLE_API_PRIVATE
+#include <machine/cpu_capabilities.h>
 
-#ifdef I_AM_QSORT_R
+#ifdef I_AM_PSORT_R
 typedef int		 cmp_t(void *, const void *, const void *);
 #else
 typedef int		 cmp_t(const void *, const void *);
 #endif
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 static inline char	*med3(char *, char *, char *, cmp_t ^, void *) __attribute__((always_inline));
 #else
 static inline char	*med3(char *, char *, char *, cmp_t *, void *) __attribute__((always_inline));
@@ -50,6 +59,93 @@ static inline void	swapfunc(char *, cha
 
 #define min(a, b)	(a) < (b) ? a : b
 
+#define	NARGS			((PAGESIZE - offsetof(struct page, args)) / sizeof(union args))
+#define	PAGESIZE		4096
+#define	PARALLEL_MIN_SIZE	2000	/* determine heuristically */
+
+struct shared; /* forward reference */
+union args {
+	union args *next;
+	struct {
+		struct shared *shared;
+		void *a;
+		size_t n;
+		int depth_limit;
+	} /* anonymous */;
+};
+
+struct page {
+	struct page *next;
+	union args args[0];
+};
+
+struct shared {
+	char *who;
+	union args *freelist;
+	struct page *pagelist;
+#ifdef I_AM_PSORT_R
+	void *thunk;
+#endif
+#ifdef I_AM_PSORT_B
+	cmp_t ^cmp;
+#else
+	cmp_t *cmp;
+#endif
+	size_t es;
+	size_t turnoff;
+	dispatch_queue_t queue;
+	pthread_cond_t cond;
+	pthread_mutex_t mutex;
+	OSSpinLock sharedlock;
+	int count;
+};
+
+/*
+ * Take an argument record from the shared freelist, mapping and carving up
+ * a new page of records when the list is empty.  Returns NULL (with the
+ * spinlock released) if the page cannot be mapped.
+ */
+static union args *
+getargs(struct shared *shared)
+{
+	union args *args;
+
+	OSSpinLockLock(&shared->sharedlock);
+	if(!shared->freelist) {
+		struct page *page;
+		union args *prev;
+		int i;
+		page = (struct page *)mmap(NULL, PAGESIZE, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
+		if(page == MAP_FAILED) {
+			/* mmap() fails with MAP_FAILED, not NULL; don't leave the lock held */
+			OSSpinLockUnlock(&shared->sharedlock);
+			return NULL;
+		}
+		page->next = shared->pagelist;
+		shared->pagelist = page;
+		prev = NULL;
+		for(args = page->args, i = NARGS; i > 0; args++, i--) {
+			args->next = prev;
+			prev = args;
+		}
+		shared->freelist = prev;
+	}
+	args = shared->freelist;
+	shared->freelist = args->next;
+	OSSpinLockUnlock(&shared->sharedlock);
+	return args;
+}
+
+/* Push an argument record back onto the shared freelist for reuse. */
+static void
+returnargs(struct shared *shared, union args *args)
+{
+	OSSpinLockLock(&shared->sharedlock);
+	args->next = shared->freelist;
+	shared->freelist = args;
+	OSSpinLockUnlock(&shared->sharedlock);
+}
+
 /*
  * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
  */
@@ -88,7 +184,7 @@ swapfunc(a, b, n, swaptype)
 
 #define vecswap(a, b, n) 	if ((n) > 0) swapfunc(a, b, n, swaptype)
 
-#ifdef I_AM_QSORT_R
+#ifdef I_AM_PSORT_R
 #define	CMP(t, x, y) (cmp((t), (x), (y)))
 #else
 #define	CMP(t, x, y) (cmp((x), (y)))
@@ -96,13 +192,13 @@ swapfunc(a, b, n, swaptype)
 
 static inline char *
 med3(char *a, char *b, char *c,
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 cmp_t ^cmp,
 #else
 cmp_t *cmp,
 #endif
 void *thunk
-#ifndef I_AM_QSORT_R
+#ifndef I_AM_PSORT_R
 __unused
 #endif
 )
@@ -118,23 +214,25 @@ __unused
 #define DEPTH(x)	(2 * (fls((int)(x)) - 1))
 #endif /* __LP64__ */
 
-#ifdef I_AM_QSORT_R
+#ifdef I_AM_PSORT_R
 int __heapsort_r(void *, size_t, size_t, void *, int (*)(void *, const void *, const void *));
 #endif
 
+static void _psort_parallel(void *x);
+
 static void
-_qsort(void *a, size_t n, size_t es,
-#ifdef I_AM_QSORT_R
+_psort(void *a, size_t n, size_t es,
+#ifdef I_AM_PSORT_R
 void *thunk,
 #else
 #define thunk	NULL
 #endif
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 cmp_t ^cmp,
 #else
 cmp_t *cmp,
 #endif
-int depth_limit)
+int depth_limit, struct shared *shared)
 {
 	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
 	size_t d, r;
@@ -143,9 +241,9 @@ int depth_limit)
 
 loop:
 	if (depth_limit-- <= 0) {
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 		heapsort_b(a, n, es, cmp);
-#elif defined(I_AM_QSORT_R)
+#elif defined(I_AM_PSORT_R)
 		__heapsort_r(a, n, es, thunk, cmp);
 #else
 		heapsort(a, n, es, cmp);
@@ -222,33 +320,150 @@ loop:
 	}
 
 nevermind:
-	if ((r = pb - pa) > es)
-#ifdef I_AM_QSORT_R
-		_qsort(a, r / es, es, thunk, cmp, depth_limit);
+	if ((r = pb - pa) > es) {
+		r /= es;
+		if (shared && r > shared->turnoff) {
+			union args *args = getargs(shared);
+
+			if (args == NULL)
+				LIBC_ABORT("%s: getargs: %s", shared->who, strerror(errno));
+			args->shared = shared;
+			args->a = a;
+			args->n = r;
+			args->depth_limit = depth_limit;
+			OSAtomicIncrement32(&shared->count);
+			dispatch_async_f(shared->queue, args, _psort_parallel);
+		} else {
+#ifdef I_AM_PSORT_R
+			_psort(a, r, es, thunk, cmp, depth_limit, NULL);
 #else
-		_qsort(a, r / es, es, cmp, depth_limit);
+			_psort(a, r, es, cmp, depth_limit, NULL);
 #endif
+		}
+	}
 	if ((r = pd - pc) > es) {
 		/* Iterate rather than recurse to save stack space */
 		a = pn - r;
 		n = r / es;
 		goto loop;
 	}
-/*		qsort(pn - r, r / es, es, cmp);*/
+/*		psort(pn - r, r / es, es, cmp);*/
+}
+
+/*
+ * Work function handed to dispatch_async_f() (and called directly once by
+ * psort()): sorts the partition described by the argument record, recycles
+ * the record, and signals the waiter when the task count drops to zero.
+ */
+static void
+_psort_parallel(void *x)
+{
+	union args *args = (union args *)x;
+	struct shared *shared = args->shared;
+
+	_psort(args->a, args->n, shared->es,
+#ifdef I_AM_PSORT_R
+		shared->thunk,
+#endif
+		shared->cmp, args->depth_limit, shared);
+	returnargs(shared, args);
+	/*
+	 * Drop our count with the mutex held.  psort() re-checks the count
+	 * under this mutex, so it cannot see zero and tear down *shared
+	 * (which lives on its stack) while we have yet to signal; the old
+	 * decrement-then-lock order left exactly that window open.
+	 */
+	pthread_mutex_lock(&shared->mutex);
+	if(OSAtomicDecrement32(&shared->count) <= 0)
+		pthread_cond_signal(&shared->cond);
+	pthread_mutex_unlock(&shared->mutex);
+}
+
+/* fast, approximate integer square root */
+static size_t
+isqrt(size_t x)
+{
+	size_t s = 1L << (flsl(x) / 2);
+	return (s + x / s) / 2;
 }
 
+/*
+ * Entry point.  Inputs of at least PARALLEL_MIN_SIZE elements are sorted in
+ * parallel on the default-priority global queue when _NumCPUs() reports more
+ * than one CPU; otherwise, or if setup fails, the serial qsort variant runs.
+ */
 void
-#ifdef I_AM_QSORT_R
-qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
-#elif defined(I_AM_QSORT_B)
-qsort_b(void *a, size_t n, size_t es, cmp_t ^cmp)
+#ifdef I_AM_PSORT_R
+psort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
+#elif defined(I_AM_PSORT_B)
+psort_b(void *a, size_t n, size_t es, cmp_t ^cmp)
 #else
-qsort(void *a, size_t n, size_t es, cmp_t *cmp)
+psort(void *a, size_t n, size_t es, cmp_t *cmp)
 #endif
 {
-	_qsort(a, n, es,
-#ifdef I_AM_QSORT_R
-		thunk,
+	if (n >= PARALLEL_MIN_SIZE && _NumCPUs() > 1) {
+		struct shared shared;
+		union args *args;
+
+		bzero(&shared, sizeof(shared));
+		shared.sharedlock = OS_SPINLOCK_INIT;
+		if ((args = getargs(&shared)) != NULL) {
+			struct page *p, *pp;
+#ifdef I_AM_PSORT_R
+			shared.who = "psort_r";
+			shared.thunk = thunk;
+#elif defined(I_AM_PSORT_B)
+			shared.who = "psort_b";
+#else
+			shared.who = "psort";
+#endif
+			shared.cmp = cmp;
+			shared.es = es;
+			shared.queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
+			shared.cond = (pthread_cond_t)PTHREAD_COND_INITIALIZER;
+			shared.mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
+			args->a = a;
+			args->n = n;
+			args->depth_limit = DEPTH(n);
+			args->shared = &shared;
+			/*
+			 * The turnoff value is the partition size below
+			 * which we stop sorting in parallel and just sort
+			 * in the current thread.  The value of sqrt(n) was
+			 * determined heuristically.  There is a smaller
+			 * dependence on the slowness of the comparison
+			 * function, and there might be a dependence on the
+			 * number of processors, but the algorithm has not been
+			 * determined.  Because the sensitivity to the turnoff
+			 * value is relatively low, we use a fast, approximate
+			 * integer square root routine that is good enough for
+			 * this purpose.
+			 */
+			shared.turnoff = isqrt(n);
+			OSAtomicIncrement32(&shared.count);
+			_psort_parallel(args);
+
+			/* wait for queue to drain */
+			pthread_mutex_lock(&shared.mutex);
+			while(shared.count > 0)
+				pthread_cond_wait(&shared.cond, &shared.mutex);
+
+			pthread_mutex_unlock(&shared.mutex);
+			pthread_mutex_destroy(&shared.mutex);
+			pthread_cond_destroy(&shared.cond);
+			for(p = shared.pagelist; p; p = pp) {
+				pp = p->next;
+				munmap(p, PAGESIZE);
+			}
+			return;
+		}
+	}
+	/* Just call qsort */
+#ifdef I_AM_PSORT_R
+	qsort_r(a, n, es, thunk, cmp);
+#elif defined(I_AM_PSORT_B)
+	qsort_b(a, n, es, cmp);
+#else
+	qsort(a, n, es, cmp);
 #endif
-		cmp, DEPTH(n));
 }