atanf.s [plain text]

/*	atanf.s -- atanf for standard math library.

	Written by Eric Postpischil, July 2007.
*/


	.literal8

// Define the points where our algorithm changes.
nPoint0:	.double	-1
pPoint0:	.double	+1
nPoint1:	.double	-16777216
pPoint1:	.double	+16777216

// Define miscellaneous constants.
nHalfPi:	.double	-1.5707963267948966192313217
One:		.double	 1
pHalfPi:	.double	+1.5707963267948966192313217

// Define a coefficient for center polynomial (used for x in [-1, +1]).
C2:			.double	 0.0029352921857004596570518


	.const
	.align	4

/*	Define some coefficients for center polynomial (used for x in [-1, +1]).
	These are stored in pairs at aligned addresses for use in SIMD
	instructions.
*/
C01:		.double	 2.2971562298314633761676433,  0.0207432003007420961489920
C00:		.double	 2.4449692297316409651126041,  3.7888879287802702842997915
C11:		.double	-2.9466967515109826289085300, -4.9721072376211623038916292
C10:		.double	 5.4728447324456990092824269,  6.7197076223592378022736307

// Define coefficients for the tail polynomial (used for x outside [-1, +1]).
T01:	.double	 0.9193672354545696501477531, -0.0052035944094405570566862
T00:	.double	 0.3992772987220534996563340,  0.2521268658714555740707959
T11:	.double	-0.5273186542866779494437308, -0.7201738803584184183894208
T10:	.double	 0.1730466268612773143731748,  0.1408679409162453515360961


// Rename the general registers (just to make it easier to keep track of them).
#if defined __i386__
	#define	r0	%eax
	#define	r1	%ecx
	#define	r2	%edx
	#define	r3	%ebx
	#define	r4	%esp
	#define	r5	%ebp
	#define	r6	%esi
	#define	r7	%edi
#elif defined __x86_64__
	#define	r0	%rax
	#define	r1	%rcx
	#define	r2	%rdx
	#define	r3	%rbx
	#define	r4	%rsp
	#define	r5	%rbp
	#define	r6	%rsi
	#define	r7	%rdi
#else
	#error "Unknown architecture."
#endif


	.text


// Define various symbols.

#define	BaseP		r0		// Base address for position-independent addressing.

#define	p0			%xmm0	// Must be in %xmm0 for return on x86_64.
#define	x			%xmm1
#define	p1			%xmm2
#define	x1			%xmm3
#define	v0			%xmm4
#define	t0			%xmm5

#if defined __i386__

	// Define location of argument x.
	#define	Argx			4(%esp)

	// Define how to address data.  BaseP must contain the address of label 0.
	#define	Address(label)	label-0b(BaseP)

#elif defined __x86_64__

	// Define location of argument x.
	#define	Argx			%xmm0

	// Define how to address data.
	#define	Address(label)	label(%rip)

#endif


/*	float atanf(float x).

	Notes:

		Citations in parentheses below indicate the source of a requirement.

		"C" stands for ISO/IEC 9899:TC2.

		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
		requirements since it defers to C and requires errno behavior only if
		we choose to support it by arranging for "math_errhandling &
		MATH_ERRNO" to be non-zero, which we do not.

	Return value:

		For arctangent of +/- zero, return zero with same sign (C F.9 12 and
		F.9.1.3).

		For arctangent of +/- infinity, return +/- pi/2 (C F.9.1.3).

		For a NaN, return the same NaN (C F.9 11 and 13).  (If the NaN is a
		signalling NaN, we return the "same" NaN quieted.)

		Otherwise:

			If the rounding mode is round-to-nearest, return arctangent(x)
			faithfully rounded.

			Return a value in [-pi/2, +pi/2] (C 7.12.4.3 3).  Note that this
			prohibits returning correctly rounded values for atanf of large
			positive or negative arguments, since pi/2 rounded to a float lies
			outside that interval.
		
			Not implemented:  In other rounding modes, return arctangent(x)
			possibly with slightly worse error, not necessarily honoring the
			rounding mode (Ali Sazegari narrowing C F.9 10).

	Exceptions:

		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
		smallest normal, underflow may or may not be raised.  This is stricter
		than the older 754 standard.

		May or may not raise inexact, even if the result is exact (C F.9 8).

		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
		of C 4.2.1)  but not if the input is a quiet NaN (C F.9 11).

		May not raise exceptions otherwise (C F.9 9).

	Properties:

		Monotonic.

		Exhaustive testing proved this routine returns faithfully rounded
		results.
*/
	.align	5
#if !defined DevelopmentInstrumentation
	// This is the regular name used in the deployed implementation.
		.globl _atanf
	_atanf:
#else
	// This is the name used for a special test version of the routine.
		.globl _atanfInstrumented
	_atanfInstrumented:
#endif

	cvtss2sd	Argx, x				// Convert x to double precision.

	#if defined __i386__

		// Get address of 0 in BaseP.
			call	0f				// Push program counter onto stack.
		0:
			pop		BaseP			// Get program counter.

	#endif

/*	We use different algorithms for different parts of the domain.  There
	is a "negative tail" from -infinity to nPoint0, a center from nPoint0 to
	pPoint0, and a "positive tail" from pPoint0 to +infinity.  Here, we compare
	and branch to the appropriate code.

	NaNs are weeded out in PositiveTail or NegativeTail.
*/

	ucomisd		Address(pPoint0), x
	ja			PositiveTail

	ucomisd		Address(nPoint0), x
	jb			NegativeTail


/*	Here we have nPoint0 <= x <= pPoint0.  This is handled with a simple
	evaluation of a polynomial that approximates arctangent.

	The polynomial has been arranged into the form:

		x * c2
			* ((x**4 + c01 * x**2 + c00))
			* ((x**4 + c11 * x**2 + c10))
			* ((x**4 + c21 * x**2 + c20))
			* ((x**4 + c31 * x**2 + c30))

	The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20
	at C00, c11 and c31 at C11, and c10 and c30 at C10.  c2 is at C2.

	The quartic factors are evaluated in SIMD registers.  For brevity, some
	comments below describe only one element of a register.  The other is
	analagous.
*/
	movlhps		x, x1				// Save a copy of x for later.
	mulsd		x, x				// Form x**2.
#define	x2	x	// Define name describing current register contents.
	movlhps		x2, x2				// Duplicate x**2.
	movapd		Address(C11), p1
	addpd		x2, p1				// Form x**2 + c11.
	mulpd		x2, p1				// Form x**4 + c11 * x**2.
	addpd		Address(C10), p1	// Form x**4 + c11 * x**2 + c10.
	movapd		Address(C01), p0	// Get first coefficients.
	movlpd		Address(C2), x1		// Put c2 in one element, with x in other.
	addpd		x2, p0				// Form x**2 + c01.
	mulpd		x2, p0				// Form x**4 + c01 * x**2.
	addpd		Address(C00), p0	// Form x**4 + c01 * x**2 + c00.
	mulpd		p1, p0				// Combine factors.
	mulpd		x1, p0				// Multiply by x and c2.
	movhlps		p0, p1				// Get high element.
	mulsd		p1, p0				// Finish combining factors.
#undef x2

// Return the double-precision number currently in p0.
ReturnDoubleInp0:
	cvtsd2ss	p0, p0				// Convert result to single precision.

// Return the single-precision number currently in p0.
ReturnSingleInp0:

	#if defined __i386__
		movss		p0, Argx		// Shuttle result through memory.
			// This uses the argument area for scratch space, which is allowed.
		flds		Argx			// Return input on floating-point stack.
	#else
		// On x86_64, the return value is now in p0, which is %xmm0.
	#endif

	ret


// Handle pPoint0 < x.
PositiveTail:

	ucomisd		Address(pPoint1), x		// Compare x to algorithm change point.
	jae			InputIsPositiveSpecial

	movsd		Address(pHalfPi), p0	// Prepare +pi/2.

CommonTail:

/*	Here we have pPoint0 < x < pPoint1.  The algorithm here is inspired by the
	identity (for positive x) arctangent(x) = +pi/2 - arctangent(1/x).  This
	is approximated by evaluating:

		+pi/2 -
			* ((x**4 + t01 * x**2 + t00))
			* ((x**4 + t11 * x**2 + t10))
			* ((x**4 + t21 * x**2 + t20))
			* ((x**4 + t31 * x**2 + t30))
			* (1/x) * (1/x)**16.

	The quartic factors are evaluated in SIMD registers.  For brevity, some
	comments below describe only one element of a register.  The other is
	analagous.
*/

	movsd		Address(One), v0	// Get constant.
	divsd		x, v0				// Form 1/x, which we will call r.
		/*	I tried rcpss, but using it requires a precision conversion, two
			adds, and three multiplies.  divsd performance is not bad on Core2, 
			and I was not able to get the rcpss version to run as fast.
		*/
	mulsd		x, x				// Form x**2.
#define	x2	x	// Define name describing current register contents.
	movlhps		v0, v0				// Copy finished reciprocal to high element.
	movlhps		x2, x2				// Duplicate x**2.
	mulsd		v0, v0				// Form r**2.
	mulsd		v0, v0				// Form r**4.
	mulsd		v0, v0				// Form r**8.
	movapd		Address(T11), p1	// Get first coefficients.
	mulsd		v0, v0				// Form r**16.

	movapd		Address(T01), t0	// Get first coefficients.
	addpd		x2, p1				// Form x**2 + t11.
	mulpd		x2, p1				// Form x**4 + t11 * x**2.
	addpd		Address(T10), p1	// Form x**4 + t11 * x**2 + t10.
	addpd		x2, t0				// Form x**2 + t01.
	mulpd		x2, t0				// Form x**4 + t01 * x**2.
	addpd		Address(T00), t0	// Form x**4 + t01 * x**2 + t00.
	mulpd		p1, t0				// Combine factors.
	mulpd		v0, t0				// Multiply by r**16 and r.
	movhlps		t0, p1				// Get high element.
	mulsd		p1, t0				// Finish combining factors.
	subsd		t0, p0				// Subtract from pi/2.
#undef x2

	jmp			ReturnDoubleInp0


// Handle x < nPoint0.
NegativeTail:

	ucomisd		Address(nPoint1), x		// Compare x to algorithm change point.
	jbe			InputIsNegativeSpecial

	movsd		Address(nHalfPi), p0	// Prepare -pi/2.

	jmp			CommonTail


	.literal8
/*	Define values near +/- pi/2 that yield +/- pi/2 rounded toward zero when
	converted to single precision.  This allows us to generate inexact and
	return the desired values for atanf of big positive and negative numbers.
*/
AlmostpHalfPi:	.double	+1.5707962
AlmostnHalfPi:	.double	-1.5707962


	.text

/*	Here we handle inputs greater or equal to pPoint1, including infinity,
	but not including NaNs.
*/
InputIsPositiveSpecial:

	/*	The input is a big positive number.  Return +pi/2 rounded toward zero
		and generate an inexact exception.
	*/
	cvtsd2ss	Address(AlmostpHalfPi), p0
	jmp			ReturnSingleInp0


/*	Here we handle inputs less than or equal to -1, including -infinity and
	NaNs.  The condition code must be set as if by using ucomisd to compare the
	input in the "source operand" to nPoint1 in the "destination operand".
*/
InputIsNegativeSpecial:

	jp			InputIsNaN

	/*	The input is a big negative number.  Return -pi/2 rounded toward zero
		and generate an inexact exception.
	*/
	cvtsd2ss	Address(AlmostnHalfPi), p0
	jmp			ReturnSingleInp0


InputIsNaN:

	#if defined __i386__
		flds		Argx			// Return result on floating-point stack.
	#else
		cvtsd2ss	x, %xmm0		// Return in %xmm0.
			/*	We cannot just return the input, because we must quiet a
				signalling NaN.
			*/
	#endif

	ret