#include <Foundation/Foundation.h>
#include <libkern/OSAtomic.h>
#include <sys/sysctl.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <math.h>
#ifdef __BLOCKS__
#include <Block.h>
#endif
#include <dispatch/dispatch.h>
#include <dispatch/private.h>

extern "C" {
__private_extern__ void func(void);
#ifdef __BLOCKS__
__private_extern__ void (^block)(void);
#endif
static void backflip(void *ctxt);
static void backflip_done(void);
}

@interface BasicObject : NSObject
{
}
- (void) method;
@end

@implementation BasicObject
- (void) method
{
}
@end

class BasicClass {
public:
	virtual void virtfunc(void) {
	};
};

static void *
force_a_thread(void *arg)
{
	pause();
	abort();
	return arg;
}

static volatile int32_t global;

static const size_t cnt = 10000000;
static const size_t cnt2 = 100000;

static uint64_t bfs;
static long double loop_cost;
static long double cycles_per_nanosecond;
static mach_timebase_info_data_t tbi;

//static void func2(void *, dispatch_item_t di);

static void __attribute__((noinline))
print_result(uint64_t s, const char *str)
{
	uint64_t d, e = mach_absolute_time();
	long double dd;

	d = e - s;

	if (tbi.numer != tbi.denom) {
		d *= tbi.numer;
		d /= tbi.denom;
	}

	dd = (typeof(dd))d / (typeof(dd))cnt;

	dd -= loop_cost;

	if (loop_cost == 0.0) {
		loop_cost = dd;
	}

	dd *= cycles_per_nanosecond;

	printf("%-45s%15.3Lf cycles\n", str, dd);
}

static void __attribute__((noinline))
print_result2(uint64_t s, const char *str)
{
	uint64_t d, e = mach_absolute_time();
	long double dd;

	d = e - s;

	if (tbi.numer != tbi.denom) {
		d *= tbi.numer;
		d /= tbi.denom;
	}

	dd = (typeof(dd))d / (typeof(dd))cnt2;

	dd -= loop_cost;
	dd *= cycles_per_nanosecond;

	printf("%-45s%15.3Lf cycles\n", str, dd);
}

#if defined(__i386__) || defined(__x86_64__)
static inline uint64_t
rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));

	return (uint64_t)hi << 32 | lo;
}
#endif

static struct fml {
	struct fml *fml_next;
} *fixed_malloc_lifo_head;

struct fml *fixed_malloc_lifo(void);// __attribute__((noinline));
void fixed_free_lifo(struct fml *fml);// __attribute__((noinline));

struct fml *
fixed_malloc_lifo(void)
{
	struct fml *fml_r = fixed_malloc_lifo_head;

	if (fml_r) {
		fixed_malloc_lifo_head = fml_r->fml_next;
		return fml_r;
	} else {
		return (struct fml *)malloc(32);
	}
}

void
fixed_free_lifo(struct fml *fml)
{
	fml->fml_next = fixed_malloc_lifo_head;
	fixed_malloc_lifo_head = fml;
}

int
main(void)
{
	NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
	pthread_mutex_t plock = PTHREAD_MUTEX_INITIALIZER;
	OSSpinLock slock = OS_SPINLOCK_INIT;
	BasicObject *bo;
	BasicClass *bc;
	pthread_t pthr_pause;
	dispatch_queue_t q, mq;
	kern_return_t kr;
	semaphore_t sem;
	uint64_t freq;
	uint64_t s;
	size_t freq_len = sizeof(freq);
	size_t bf_cnt = cnt;
	unsigned i;
	int r;

	r = sysctlbyname("hw.cpufrequency", &freq, &freq_len, NULL, 0);
	assert(r != -1);
	assert(freq_len == sizeof(freq));

	cycles_per_nanosecond = (long double)freq / (long double)NSEC_PER_SEC;

	assert(pool);

	/* Malloc has different logic for threaded apps. */
	r = pthread_create(&pthr_pause, NULL, force_a_thread, NULL);
	assert(r == 0);

	kr = mach_timebase_info(&tbi);
	assert(kr == 0);
#if defined(__i386__) || defined(__x86_64__)
	assert(tbi.numer == tbi.denom); /* This will fail on PowerPC. */
#endif

	bo = [[BasicObject alloc] init];
	assert(bo);

	bc = new BasicClass();
	assert(bc);

	q = dispatch_queue_create("com.apple.bench-dispatch", NULL);
	assert(q);

	mq = dispatch_get_main_queue();
	assert(mq);

	printf("%-45s%15Lf\n\n", "Cycles per nanosecond:", cycles_per_nanosecond);

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm volatile("");
	}
	print_result(s, "Empty loop:");

	printf("\nLoop cost subtracted from the following:\n\n");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		mach_absolute_time();
	}
	print_result(s, "mach_absolute_time():");

#if defined(__i386__) || defined(__x86_64__)
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		rdtsc();
	}
	print_result(s, "rdtsc():");
#endif

	s = mach_absolute_time();
	for (i = cnt2; i; i--) {
		pthread_t pthr;
		void *pr;

		r = pthread_create(&pthr, NULL, (void *(*)(void *))func, NULL);
		assert(r == 0);
		r = pthread_join(pthr, &pr);
		assert(r == 0);
	}
	print_result2(s, "pthread create+join:");

	s = mach_absolute_time();
	for (i = cnt2; i; i--) {
		kr = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
		assert(kr == 0);
		kr = semaphore_destroy(mach_task_self(), sem);
		assert(kr == 0);
	}
	print_result2(s, "Mach semaphore create/destroy:");

	kr = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
	assert(kr == 0);
	s = mach_absolute_time();
	for (i = cnt2; i; i--) {
		kr = semaphore_signal(sem);
		assert(kr == 0);
	}
	print_result2(s, "Mach semaphore signal:");
	kr = semaphore_destroy(mach_task_self(), sem);
	assert(kr == 0);

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		pthread_yield_np();
	}
	print_result(s, "pthread_yield_np():");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		free(malloc(32));
	}
	print_result(s, "free(malloc(32)):");

	s = mach_absolute_time();
	for (i = cnt / 2; i; i--) {
		void *m1 = malloc(32);
		void *m2 = malloc(32);
		free(m1);
		free(m2);
	}
	print_result(s, "Avoiding the MRU cache of free(malloc(32)):");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		fixed_free_lifo(fixed_malloc_lifo());
	}
	print_result(s, "per-thread/fixed free(malloc(32)):");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		assert(strtoull("18446744073709551615", NULL, 0) == ~0ull);
	}
	print_result(s, "strtoull(\"18446744073709551615\") == ~0ull:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		func();
	}
	print_result(s, "Empty function call:");

#ifdef __BLOCKS__
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		block();
	}
	print_result(s, "Empty block call:");
#endif

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		[bo method];
	}
	print_result(s, "Empty ObjC call:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		bc->virtfunc();
	}
	print_result(s, "Empty C++ virtual call:");

	s = mach_absolute_time();
	for (i = cnt2; i; i--) {
		[bo description];
	}
	print_result2(s, "\"description\" ObjC call:");

	[pool release];

	pool = NULL;

#if defined(__i386__) || defined(__x86_64__)
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("nop");
	}
	print_result(s, "raw 'nop':");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("pause");
	}
	print_result(s, "raw 'pause':");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("mfence");
	}
	print_result(s, "Atomic mfence:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("lfence");
	}
	print_result(s, "Atomic lfence:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		asm("sfence");
	}
	print_result(s, "Atomic sfence:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		uint64_t sidt_rval;
		asm("sidt %0" : "=m" (sidt_rval));
	}
	print_result(s, "'sidt' instruction:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		int prev;
		asm volatile("cmpxchg %1,%2" : "=a" (prev) : "r" (0l), "m" (global), "0" (1l));
	}
	print_result(s, "'cmpxchg' without the 'lock' prefix:");
#endif

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		__sync_lock_test_and_set(&global, 0);
	}
	print_result(s, "Atomic xchg:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		__sync_val_compare_and_swap(&global, 1, 0);
	}
	print_result(s, "Atomic cmpxchg:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		__sync_fetch_and_add(&global, 1);
	}
	print_result(s, "Atomic increment:");

	global = 0;

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		OSAtomicIncrement32Barrier(&global);
	}
	print_result(s, "OSAtomic increment:");

	global = 0;

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		while (!__sync_bool_compare_and_swap(&global, 0, 1)) {
			do {
#if defined(__i386__) || defined(__x86_64__)
				asm("pause");
#endif
			} while (global);
		}
		global = 0;
	}
	print_result(s, "Inlined spin lock/unlock:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		OSSpinLockLock(&slock);
		OSSpinLockUnlock(&slock);
	}
	print_result(s, "OS spin lock/unlock:");

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		r = pthread_mutex_lock(&plock);
		assert(r == 0);
		r = pthread_mutex_unlock(&plock);
		assert(r == 0);
	}
	print_result(s, "pthread lock/unlock:");

#ifdef __BLOCKS__
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		dispatch_sync(q, ^{ });
	}
	print_result(s, "dispatch_sync:");
#endif

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		dispatch_sync_f(q, NULL, (void (*)(void *))func);
	}
	print_result(s, "dispatch_sync_f:");

#ifdef __BLOCKS__
	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		dispatch_barrier_sync(q, ^{ });
	}
	print_result(s, "dispatch_barrier_sync:");
#endif

	s = mach_absolute_time();
	for (i = cnt; i; i--) {
		dispatch_barrier_sync_f(q, NULL, (void (*)(void *))func);
	}
	print_result(s, "dispatch_barrier_sync_f:");

	s = mach_absolute_time();
	dispatch_apply_f(cnt, dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), NULL, (void (*)(void *, size_t))func);
	s += loop_cost; /* cancel out the implicit subtraction done by the next line */
	print_result(s, "dispatch_apply_f():");

	// we do a "double backflip" to hit the fast-path of the enqueue/dequeue logic
	bfs = mach_absolute_time();
	dispatch_async_f(dispatch_get_main_queue(), &bf_cnt, backflip);
	dispatch_async_f(dispatch_get_main_queue(), &bf_cnt, backflip);

	dispatch_main();
}

__attribute__((noinline))
void
backflip_done(void)
{
	print_result(bfs, "dispatch_async_f():");
	exit(EXIT_SUCCESS);
}

void
backflip(void *ctxt)
{
	size_t *bf_cnt = (size_t *)ctxt;
	if (--(*bf_cnt)) {
		return dispatch_async_f(dispatch_get_main_queue(), ctxt, backflip);
	}
	backflip_done();
}