exp2f.s   [plain text]


/*
 *  exp2f.s
 *  LibmV5
 *
 *  Created by Ian Ollmann on 10/4/05.
 *  exp2 polynomial approximation by Ali Sazegari
 *  Copyright 2005 Apple Computer Inc. All rights reserved.
 *
 */

#include <../../i386.subproj/machine/asm.h>
#include "abi.h"

#if defined( __LP64__ )
	#error not 64-bit ready
#endif

#if 0		//this is too error prone currently. Its probably a bug, because it worked in standalone tests.

ENTRY( exp2f )
	movss   4(%esp),	%xmm0
	call	_exp2f$3SSE
	subl	$4,			%esp
	movss	%xmm0,		(%esp)
	flds	(%esp)
	addl	$4,			%esp
	ret

ENTRY(exp2f$3SSE)    
	pushl	$0x4b000000				//0x1.0p23
	pushl	$0x3f800000				//1.0
	pushl	$0x43400000				//192.0
	pushl	$0x3f317214				//0.693147
	pushl	$0x3e75feff				//0.240231
	pushl	$0x3d633f3a				//0.0554802
	pushl	$0x3c1eadbc				//0.00968498
	pushl	$0x3aa252d4				//0.00123843
	pushl	$0x39658674				//0.000218892
	pushl	$0x7e800000				//0x1.0p126
	pushl	$0x00800000				//0x1.0p-126
	
	//calculate e = floor(x)
	//we can cheat a bit here, since values outside of +-150 or so return either inf or zero, so don't have to be correct
	movss	40(%esp),	%xmm1			//load 0x1.0p23
	movss	36(%esp),	%xmm2			//load 1.0
	movaps	%xmm1,		%xmm3			//0x1.0p23
	pslld   $7,			%xmm1			//0x80000000
	ucomiss	%xmm1,		%xmm0			//special case for NaN or 0.0
	movaps  %xmm1,		%xmm7			//0x80000000
	andps	%xmm0,		%xmm1			// sign bit of x
	andnps	%xmm0,		%xmm7			// |x|
	je		__exp2f_nan0
	ucomiss	32(%esp),	%xmm7			// |x| > 192.0 
	movaps	%xmm0,		%xmm5			// copy of x
	orps	%xmm1,		%xmm3			//+-0x1.0p23
	movaps	%xmm0,		%xmm4			// another copy of x

	//if x is infinite, calculating the fractional part will give us a NaN, so weed out large values now
	jnbe		__exp2f_large

	//otherwise find floor(x)
	addss	%xmm3,		%xmm0			// x +- 0x1.0p23
	subss	%xmm3,		%xmm0			// rint(x)
	cmpss	$1,	%xmm0,	%xmm5			// x < rint(x) 
	andps	%xmm2,		%xmm5			// x < rint(x) ? 1.0 : 0.0
	subss	%xmm5,		%xmm0			// e


	//move e to the integer registers
	cvttss2si %xmm0,	%eax			// (int) e
	
	//Calculate the fractional part:  f = x - e
	//and calculate exp2(f).
	subss	%xmm0,		%xmm4			// f
	movaps	%xmm4,		%xmm0			// copy f
	mulss	(%esp),		%xmm0			// f * 0.000218892
	addss	12(%esp),	%xmm0			// f * 0.000218892 + 0.00123843
	mulss	%xmm4,		%xmm0			// (f * 0.000218892 + 0.00123843) * f
	addss	16(%esp),	%xmm0			// 0.00968498 + (f * 0.000218892 + 0.00123843) * f
	mulss	%xmm4,		%xmm0			// (0.00968498 + (f * 0.000218892 + 0.00123843) * f) * f
	addss	20(%esp),	%xmm0			// 0.0554802 + (0.00968498 + (f * 0.000218892 + 0.00123843) * f) * f
	mulss	%xmm4,		%xmm0			// (0.0554802 + (0.00968498 + (f * 0.000218892 + 0.00123843) * f) * f) * f
	addss	24(%esp),	%xmm0			// 0.240231 + (0.0554802 + (0.00968498 + (f * 0.000218892 + 0.00123843) * f) * f) * f
	mulss	%xmm4,		%xmm0			// (0.240231 + (0.0554802 + (0.00968498 + (f * 0.000218892 + 0.00123843) * f) * f) * f) * f
	addss	28(%esp),	%xmm0			// 0.693147 + (0.240231 + (0.0554802 + (0.00968498 + (f * 0.000218892 + 0.00123843) * f) * f) * f) * f
	mulss	%xmm4,		%xmm0			// (0.693147 + (0.240231 + (0.0554802 + (0.00968498 + (f * 0.000218892 + 0.00123843) * f) * f) * f) * f) * f
	addss	%xmm2,		%xmm0			//	exp2(f) = 1.0 + (0.693147 + (0.240231 + (0.0554802 + (0.00968498 + (f * 0.000218892 + 0.00123843) * f) * f) * f) * f) * f
	
	//result = scalbnf( exp2(f), (int) e )
	//start off by figuring out some information about e
	movl	%eax,		%edx			// copy e to edx too
	movl	%eax,		%ecx			// copy e to ecx too
	sarl	$31,		%eax			// e < 0 ? -1 : 0
	xorl	%eax,		%ecx
	subl	%eax,		%ecx			// abs(e)
	movss	4(%esp,%eax,4), %xmm3		// e < 0 ? 0x1.0p-126 : 0x1.0p126
	cmp		$126,		%ecx			// |e| < 126
	jbe		__exp2f_exit
	mulss	%xmm3,		%xmm0			// f *= e < 0 ? 0x1.0p-126 : 0x1.0p126
	movl	%eax,		%ecx
	xorl	$126,		%ecx
	subl	%eax,		%ecx			//+-126
	subl	%ecx,		%edx			// e +- 126

__exp2f_exit:
	addl	$127,		%edx			//bias the exponent. We know this can't overflow/underflow because these cases were sent to __exp2f_large
	shll	$23,		%edx
	movd	%edx,		%xmm5
	mulss	%xmm5,		%xmm0
	addl	$44,		%esp
	ret

__exp2f_large:	//handle large magnitude inputs. Result is either 0 or inf, with overflow or underflow set
	cmpss	$0, %xmm0,	%xmm0			//always true. No NaNs here.
	pslld	$23,		%xmm0			//-inf
	cmpss	$0,	%xmm4,	%xmm0			// x == -inf
	cmpss	$1,	%xmm4,	%xmm2			// 1.0 < e ? -1L : 0
	movss	(%esp),		%xmm3			// 0x1.0p-126
	maxss	%xmm3,		%xmm4
	andnps	%xmm4,		%xmm0
	movaps	%xmm2,		%xmm5			// copy 1.0 < e ? -1L : 0
	pslld	$25,		%xmm2			// 1.0 < e ? 0xfe000000 : 0
	psrld	$1,			%xmm2			// 1.0 < e ? 0x7f000000 : 0
	andnps	%xmm3,		%xmm5			// 1.0 < e ? 0: 0x1.0p-126
	orps	%xmm5,		%xmm2			// 1.0 < e ? 0x7f000000 : 0x00800000
	mulss	%xmm2,		%xmm0
	addl	$44,		%esp
	ret
	
//handle special cases for NaN and 1.0
__exp2f_nan0:
	addss	%xmm2,	%xmm0				//silence NaNs, set xmm0 to 1.0 otherwise
	addl	$44,	%esp
	ret
	
#endif