psort.c.patch   [plain text]


--- psort.c.orig	2008-11-24 17:01:07.000000000 -0800
+++ psort.c	2008-11-24 22:02:57.000000000 -0800
@@ -1,3 +1,4 @@
+/****************************************************************************/
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -34,14 +35,22 @@ static char sccsid[] = "@(#)qsort.c	8.1 
 __FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $");
 
 #include <stdlib.h>
+#include <pthread.h>
+#include <dispatch/dispatch.h>
+#include <stddef.h>
 #include <string.h>
+#include <libkern/OSAtomic.h>
+#include <sys/mman.h>
+#include <errno.h>
+#define __APPLE_API_PRIVATE
+#include <machine/cpu_capabilities.h>
 
-#ifdef I_AM_QSORT_R
+#ifdef I_AM_PSORT_R
 typedef int		 cmp_t(void *, const void *, const void *);
 #else
 typedef int		 cmp_t(const void *, const void *);
 #endif
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 static inline char	*med3(char *, char *, char *, cmp_t ^, void *) __attribute__((always_inline));
 #else
 static inline char	*med3(char *, char *, char *, cmp_t *, void *) __attribute__((always_inline));
@@ -50,6 +59,83 @@ static inline void	 swapfunc(char *, cha
 
 #define min(a, b)	(a) < (b) ? a : b
 
+#define NARGS			((PAGESIZE - offsetof(struct page, args)) / sizeof(union args))
+#define PAGESIZE		4096
+#define PARALLEL_MIN_SIZE	2000	/* determine heuristically */
+
+struct shared; /* forward reference */
+union args {
+    union args *next;	/* freelist linkage while the record is unused */
+    struct {		/* live payload: one pending partition to sort */
+	struct shared *shared;
+	void *a;	/* base of the partition */
+	size_t n;	/* element count of the partition */
+	int depth_limit;	/* remaining introsort depth before heapsort fallback */
+    } /* anonymous */;
+};
+
+struct page {
+    struct page *next;	/* pagelist linkage, for munmap at the end */
+    union args args[0];	/* NARGS records fill the rest of the page */
+};
+
+struct shared {	/* per-sort state shared by all worker invocations */
+    char *who;	/* entry-point name, used in LIBC_ABORT messages */
+    union args *freelist;	/* available args records; guarded by sharedlock */
+    struct page *pagelist;	/* all mmap'd pages, for cleanup; guarded by sharedlock */
+#ifdef I_AM_PSORT_R
+    void *thunk;
+#endif
+#ifdef I_AM_PSORT_B
+    cmp_t ^cmp;
+#else
+    cmp_t *cmp;
+#endif
+    size_t es;	/* element size */
+    size_t turnoff;	/* partitions smaller than this sort in-thread */
+    dispatch_queue_t queue;	/* global concurrent queue for parallel partitions */
+    pthread_cond_t cond;	/* signaled when count drops to zero */
+    pthread_mutex_t mutex;	/* protects the count>0 wait in psort() */
+    OSSpinLock sharedlock;	/* guards freelist/pagelist only */
+    int count;	/* outstanding partitions, maintained with OSAtomic ops */
+};
+
+static union args *
+getargs(struct shared *shared)	/* pop one args record; grows the freelist a page at a time */
+{
+    union args *args;
+
+    OSSpinLockLock(&shared->sharedlock);
+    if(!shared->freelist) {
+	struct page *page;
+	union args *prev;
+	int i;
+	if((page = (struct page *)mmap(NULL, PAGESIZE, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0)) == MAP_FAILED) {
+	    OSSpinLockUnlock(&shared->sharedlock); return NULL; }	/* mmap fails with MAP_FAILED, not NULL; must drop the lock (errno left for caller) */
+	page->next = shared->pagelist;
+	shared->pagelist = page;
+	prev = NULL;
+	for(args = page->args, i = NARGS; i > 0; args++, i--) {
+	    args->next = prev;
+	    prev = args;
+	}
+	shared->freelist = prev;
+    }
+    args = shared->freelist;
+    shared->freelist = args->next;
+    OSSpinLockUnlock(&shared->sharedlock);
+    return args;
+}
+
+static void
+returnargs(struct shared *shared, union args *args)	/* push a finished record back on the freelist */
+{
+    OSSpinLockLock(&shared->sharedlock);
+    args->next = shared->freelist;
+    shared->freelist = args;
+    OSSpinLockUnlock(&shared->sharedlock);
+}
+
 /*
  * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
  */
@@ -88,7 +174,7 @@ swapfunc(a, b, n, swaptype)
 
 #define vecswap(a, b, n) 	if ((n) > 0) swapfunc(a, b, n, swaptype)
 
-#ifdef I_AM_QSORT_R
+#ifdef I_AM_PSORT_R
 #define	CMP(t, x, y) (cmp((t), (x), (y)))
 #else
 #define	CMP(t, x, y) (cmp((x), (y)))
@@ -96,13 +182,13 @@ swapfunc(a, b, n, swaptype)
 
 static inline char *
 med3(char *a, char *b, char *c,
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 cmp_t ^cmp,
 #else
 cmp_t *cmp,
 #endif
 void *thunk
-#ifndef I_AM_QSORT_R
+#ifndef I_AM_PSORT_R
 __unused
 #endif
 )
@@ -118,23 +204,25 @@ __unused
 #define DEPTH(x)	(2 * (fls((int)(x)) - 1))
 #endif /* __LP64__ */
 
-#ifdef I_AM_QSORT_R
+#ifdef I_AM_PSORT_R
 int __heapsort_r(void *, size_t, size_t, void *, int (*)(void *, const void *, const void *));
 #endif
 
+static void _psort_parallel(void *x);
+
 static void
-_qsort(void *a, size_t n, size_t es,
-#ifdef I_AM_QSORT_R
+_psort(void *a, size_t n, size_t es,
+#ifdef I_AM_PSORT_R
 void *thunk,
 #else
 #define thunk	NULL
 #endif
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 cmp_t ^cmp,
 #else
 cmp_t *cmp,
 #endif
-int depth_limit)
+int depth_limit, struct shared *shared)
 {
 	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
 	size_t d, r;
@@ -143,9 +231,9 @@ int depth_limit)
 
 loop:
 	if (depth_limit-- <= 0) {
-#ifdef I_AM_QSORT_B
+#ifdef I_AM_PSORT_B
 		heapsort_b(a, n, es, cmp);
-#elif defined(I_AM_QSORT_R)
+#elif defined(I_AM_PSORT_R)
 		__heapsort_r(a, n, es, thunk, cmp);
 #else
 		heapsort(a, n, es, cmp);
@@ -222,33 +310,135 @@ loop:
 	}
 
 nevermind:
-	if ((r = pb - pa) > es)
-#ifdef I_AM_QSORT_R
-		_qsort(a, r / es, es, thunk, cmp, depth_limit);
+	if ((r = pb - pa) > es) {
+		r /= es;
+		if (shared && r > shared->turnoff) {
+			union args *args = getargs(shared);
+
+			if (args == NULL)
+				LIBC_ABORT("%s: getargs: %s", shared->who, strerror(errno));
+			args->shared = shared;
+			args->a = a;
+			args->n = r;
+			args->depth_limit = depth_limit;
+			OSAtomicIncrement32(&shared->count);
+			dispatch_async_f(shared->queue, args, _psort_parallel);
+		} else {
+#ifdef I_AM_PSORT_R
+			_psort(a, r, es, thunk, cmp, depth_limit, NULL);
 #else
-		_qsort(a, r / es, es, cmp, depth_limit);
+			_psort(a, r, es, cmp, depth_limit, NULL);
 #endif
+		}
+	}
 	if ((r = pd - pc) > es) {
 		/* Iterate rather than recurse to save stack space */
 		a = pn - r;
 		n = r / es;
 		goto loop;
 	}
-/*		qsort(pn - r, r / es, es, cmp);*/
+/*		psort(pn - r, r / es, es, cmp);*/
+}
+
+static void
+_psort_parallel(void *x)	/* dispatch worker: sort one partition described by x */
+{
+	union args *args = (union args *)x;
+	struct shared *shared = args->shared;
+
+	_psort(args->a, args->n, shared->es,
+#ifdef I_AM_PSORT_R
+		shared->thunk,
+#endif
+		shared->cmp, args->depth_limit, shared);	/* passing shared keeps sub-partitions eligible for further fan-out */
+	returnargs(shared, args);
+	if(OSAtomicDecrement32(&shared->count) <= 0) {	/* last outstanding partition wakes the waiter in psort() */
+		pthread_mutex_lock(&shared->mutex);
+		pthread_cond_signal(&shared->cond);
+		pthread_mutex_unlock(&shared->mutex);
+	}
+}
+
+/* fast, approximate integer square root */
+static size_t
+isqrt(size_t x)
+{
+    size_t s = 1L << (flsl(x) / 2);	/* s ~= 2^(floor(log2 x)/2), a coarse first guess */
+    return (s + x / s) / 2;	/* one Newton-Raphson step; good enough for a heuristic cutoff */
 }
 
 void
-#ifdef I_AM_QSORT_R
-qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
-#elif defined(I_AM_QSORT_B)
-qsort_b(void *a, size_t n, size_t es, cmp_t ^cmp)
+#ifdef I_AM_PSORT_R
+psort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
+#elif defined(I_AM_PSORT_B)
+psort_b(void *a, size_t n, size_t es, cmp_t ^cmp)
 #else
-qsort(void *a, size_t n, size_t es, cmp_t *cmp)
+psort(void *a, size_t n, size_t es, cmp_t *cmp)
 #endif
 {
-	_qsort(a, n, es,
-#ifdef I_AM_QSORT_R
-		thunk,
+	if (n >= PARALLEL_MIN_SIZE && _NumCPUs() > 1) {	/* parallelism only pays off for large arrays on MP */
+		struct shared shared;
+		union args *args;
+
+		bzero(&shared, sizeof(shared));
+		shared.sharedlock = OS_SPINLOCK_INIT;
+		if ((args = getargs(&shared)) != NULL) {	/* NULL => could not mmap; fall through to plain qsort */
+			struct page *p, *pp;
+#ifdef I_AM_PSORT_R
+			shared.who = "psort_r";
+			shared.thunk = thunk;
+#elif defined(I_AM_PSORT_B)
+			shared.who = "psort_b";
+#else
+			shared.who = "psort";
+#endif
+			shared.cmp = cmp;
+			shared.es = es;
+			shared.queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
+			shared.cond = (pthread_cond_t)PTHREAD_COND_INITIALIZER;	/* struct-assign of the static initializers */
+			shared.mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
+			args->a = a;
+			args->n = n;
+			args->depth_limit = DEPTH(n);	/* introsort depth budget, 2*floor(log2 n) */
+			args->shared = &shared;
+			/*
+			 * The turnoff value is the size of a partition that,
+			 * below which, we stop doing in parallel, and just do
+			 * in the current thread.  The value of sqrt(n) was
+			 * determined heuristically.  There is a smaller
+			 * dependence on the slowness of the comparison
+			 * function, and there might be a dependence on the
+			 * number of processors, but the algorithm has not been
+			 * determined.  Because the sensitivity to the turnoff
+			 * value is relatively low, we use a fast, approximate
+			 * integer square root routine that is good enough for
+			 * this purpose.
+			 */
+			shared.turnoff = isqrt(n);
+			OSAtomicIncrement32(&shared.count);	/* account for the initial, synchronous partition */
+			_psort_parallel(args);
+
+			/* wait for queue to drain */
+			pthread_mutex_lock(&shared.mutex);
+			while(shared.count > 0)	/* predicate loop guards against spurious wakeups */
+				pthread_cond_wait(&shared.cond, &shared.mutex);
+
+			pthread_mutex_unlock(&shared.mutex);
+			pthread_mutex_destroy(&shared.mutex);
+			pthread_cond_destroy(&shared.cond);
+			for(p = shared.pagelist; p; p = pp) {	/* release every args page mapped by getargs() */
+				pp = p->next;
+				munmap(p, PAGESIZE);
+			}
+			return;
+		}
+	}
+	/* Just call qsort */
+#ifdef I_AM_PSORT_R
+	qsort_r(a, n, es, thunk, cmp);
+#elif defined(I_AM_PSORT_B)
+	qsort_b(a, n, es, cmp);
+#else
+	qsort(a, n, es, cmp);
 #endif
-		cmp, DEPTH(n));
 }