# atanf.s   [plain text]

```/*	atanf.s -- atanf for standard math library.

Written by Eric Postpischil, July 2007.
*/

.literal8

// Define the points where our algorithm changes.
nPoint0:	.double	-1
pPoint0:	.double	+1
nPoint1:	.double	-16777216
pPoint1:	.double	+16777216

// Define miscellaneous constants.
nHalfPi:	.double	-1.5707963267948966192313217
One:		.double	 1
pHalfPi:	.double	+1.5707963267948966192313217

// Define a coefficient for center polynomial (used for x in [-1, +1]).
C2:			.double	 0.0029352921857004596570518

.const
.align	4

/*	Define some coefficients for center polynomial (used for x in [-1, +1]).
These are stored in pairs at aligned addresses for use in SIMD
instructions.
*/
C01:		.double	 2.2971562298314633761676433,  0.0207432003007420961489920
C00:		.double	 2.4449692297316409651126041,  3.7888879287802702842997915
C11:		.double	-2.9466967515109826289085300, -4.9721072376211623038916292
C10:		.double	 5.4728447324456990092824269,  6.7197076223592378022736307

// Define coefficients for the tail polynomial (used for x outside [-1, +1]).
T01:	.double	 0.9193672354545696501477531, -0.0052035944094405570566862
T00:	.double	 0.3992772987220534996563340,  0.2521268658714555740707959
T11:	.double	-0.5273186542866779494437308, -0.7201738803584184183894208
T10:	.double	 0.1730466268612773143731748,  0.1408679409162453515360961

// Rename the general registers (just to make it easier to keep track of them).
#if defined __i386__
#define	r0	%eax
#define	r1	%ecx
#define	r2	%edx
#define	r3	%ebx
#define	r4	%esp
#define	r5	%ebp
#define	r6	%esi
#define	r7	%edi
#elif defined __x86_64__
#define	r0	%rax
#define	r1	%rcx
#define	r2	%rdx
#define	r3	%rbx
#define	r4	%rsp
#define	r5	%rbp
#define	r6	%rsi
#define	r7	%rdi
#else
#error "Unknown architecture."
#endif

.text

// Define various symbols.

#define	p0			%xmm0	// Must be in %xmm0 for return on x86_64.
#define	x			%xmm1
#define	p1			%xmm2
#define	x1			%xmm3
#define	v0			%xmm4
#define	t0			%xmm5

#if defined __i386__

// Define location of argument x.
#define	Argx			4(%esp)

// Define how to address data.  BaseP must contain the address of label 0.

#elif defined __x86_64__

// Define location of argument x.
#define	Argx			%xmm0

// Define how to address data.

#endif

/*	float atanf(float x).

Notes:

Citations in parentheses below indicate the source of a requirement.

"C" stands for ISO/IEC 9899:TC2.

The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
requirements since it defers to C and requires errno behavior only if
we choose to support it by arranging for "math_errhandling &
MATH_ERRNO" to be non-zero, which we do not.

Return value:

For arctangent of +/- zero, return zero with same sign (C F.9 12 and
F.9.1.3).

For arctangent of +/- infinity, return +/- pi/2 (C F.9.1.3).

For a NaN, return the same NaN (C F.9 11 and 13).  (If the NaN is a
signalling NaN, we return the "same" NaN quieted.)

Otherwise:

If the rounding mode is round-to-nearest, return arctangent(x)
faithfully rounded.

Return a value in [-pi/2, +pi/2] (C 7.12.4.3 3).  Note that this
prohibits returning correctly rounded values for atanf of large
positive or negative arguments, since pi/2 rounded to a float lies
outside that interval.

Not implemented:  In other rounding modes, return arctangent(x)
possibly with slightly worse error, not necessarily honoring the
rounding mode (Ali Sazegari narrowing C F.9 10).

Exceptions:

Raise underflow for a denormal result (C F.9 7 and Draft Standard for
Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
smallest normal, underflow may or may not be raised.  This is stricter
than the older 754 standard.

May or may not raise inexact, even if the result is exact (C F.9 8).

Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
of C 4.2.1)  but not if the input is a quiet NaN (C F.9 11).

May not raise exceptions otherwise (C F.9 9).

Properties:

Monotonic.

Exhaustive testing proved this routine returns faithfully rounded
results.
*/
.align	5
#if !defined DevelopmentInstrumentation
// This is the regular name used in the deployed implementation.
.globl _atanf
_atanf:
#else
// This is the name used for a special test version of the routine.
.globl _atanfInstrumented
_atanfInstrumented:
#endif

cvtss2sd	Argx, x				// Convert x to double precision.

#if defined __i386__

// Get address of 0 in BaseP.
call	0f				// Push program counter onto stack.
0:
pop		BaseP			// Get program counter.

#endif

/*	We use different algorithms for different parts of the domain.  There
is a "negative tail" from -infinity to nPoint0, a center from nPoint0 to
pPoint0, and a "positive tail" from pPoint0 to +infinity.  Here, we compare
and branch to the appropriate code.

NaNs are weeded out in PositiveTail or NegativeTail.
*/

ja			PositiveTail

jb			NegativeTail

/*	Here we have nPoint0 <= x <= pPoint0.  This is handled with a simple
evaluation of a polynomial that approximates arctangent.

The polynomial has been arranged into the form:

x * c2
* ((x**4 + c01 * x**2 + c00))
* ((x**4 + c11 * x**2 + c10))
* ((x**4 + c21 * x**2 + c20))
* ((x**4 + c31 * x**2 + c30))

The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20
at C00, c11 and c31 at C11, and c10 and c30 at C10.  c2 is at C2.

The quartic factors are evaluated in SIMD registers.  For brevity, some
comments below describe only one element of a register.  The other is
analagous.
*/
movlhps		x, x1				// Save a copy of x for later.
mulsd		x, x				// Form x**2.
#define	x2	x	// Define name describing current register contents.
movlhps		x2, x2				// Duplicate x**2.
addpd		x2, p1				// Form x**2 + c11.
mulpd		x2, p1				// Form x**4 + c11 * x**2.
addpd		Address(C10), p1	// Form x**4 + c11 * x**2 + c10.
movapd		Address(C01), p0	// Get first coefficients.
movlpd		Address(C2), x1		// Put c2 in one element, with x in other.
addpd		x2, p0				// Form x**2 + c01.
mulpd		x2, p0				// Form x**4 + c01 * x**2.
addpd		Address(C00), p0	// Form x**4 + c01 * x**2 + c00.
mulpd		p1, p0				// Combine factors.
mulpd		x1, p0				// Multiply by x and c2.
movhlps		p0, p1				// Get high element.
mulsd		p1, p0				// Finish combining factors.
#undef x2

// Return the double-precision number currently in p0.
ReturnDoubleInp0:
cvtsd2ss	p0, p0				// Convert result to single precision.

// Return the single-precision number currently in p0.
ReturnSingleInp0:

#if defined __i386__
movss		p0, Argx		// Shuttle result through memory.
// This uses the argument area for scratch space, which is allowed.
flds		Argx			// Return input on floating-point stack.
#else
// On x86_64, the return value is now in p0, which is %xmm0.
#endif

ret

// Handle pPoint0 < x.
PositiveTail:

ucomisd		Address(pPoint1), x		// Compare x to algorithm change point.
jae			InputIsPositiveSpecial

movsd		Address(pHalfPi), p0	// Prepare +pi/2.

CommonTail:

/*	Here we have pPoint0 < x < pPoint1.  The algorithm here is inspired by the
identity (for positive x) arctangent(x) = +pi/2 - arctangent(1/x).  This
is approximated by evaluating:

+pi/2 -
* ((x**4 + t01 * x**2 + t00))
* ((x**4 + t11 * x**2 + t10))
* ((x**4 + t21 * x**2 + t20))
* ((x**4 + t31 * x**2 + t30))
* (1/x) * (1/x)**16.

The quartic factors are evaluated in SIMD registers.  For brevity, some
comments below describe only one element of a register.  The other is
analagous.
*/

movsd		Address(One), v0	// Get constant.
divsd		x, v0				// Form 1/x, which we will call r.
/*	I tried rcpss, but using it requires a precision conversion, two
and I was not able to get the rcpss version to run as fast.
*/
mulsd		x, x				// Form x**2.
#define	x2	x	// Define name describing current register contents.
movlhps		v0, v0				// Copy finished reciprocal to high element.
movlhps		x2, x2				// Duplicate x**2.
mulsd		v0, v0				// Form r**2.
mulsd		v0, v0				// Form r**4.
mulsd		v0, v0				// Form r**8.
movapd		Address(T11), p1	// Get first coefficients.
mulsd		v0, v0				// Form r**16.

movapd		Address(T01), t0	// Get first coefficients.
addpd		x2, p1				// Form x**2 + t11.
mulpd		x2, p1				// Form x**4 + t11 * x**2.
addpd		Address(T10), p1	// Form x**4 + t11 * x**2 + t10.
addpd		x2, t0				// Form x**2 + t01.
mulpd		x2, t0				// Form x**4 + t01 * x**2.
addpd		Address(T00), t0	// Form x**4 + t01 * x**2 + t00.
mulpd		p1, t0				// Combine factors.
mulpd		v0, t0				// Multiply by r**16 and r.
movhlps		t0, p1				// Get high element.
mulsd		p1, t0				// Finish combining factors.
subsd		t0, p0				// Subtract from pi/2.
#undef x2

jmp			ReturnDoubleInp0

// Handle x < nPoint0.
NegativeTail:

ucomisd		Address(nPoint1), x		// Compare x to algorithm change point.
jbe			InputIsNegativeSpecial

movsd		Address(nHalfPi), p0	// Prepare -pi/2.

jmp			CommonTail

.literal8
/*	Define values near +/- pi/2 that yield +/- pi/2 rounded toward zero when
converted to single precision.  This allows us to generate inexact and
return the desired values for atanf of big positive and negative numbers.
*/
AlmostpHalfPi:	.double	+1.5707962
AlmostnHalfPi:	.double	-1.5707962

.text

/*	Here we handle inputs greater or equal to pPoint1, including infinity,
but not including NaNs.
*/
InputIsPositiveSpecial:

/*	The input is a big positive number.  Return +pi/2 rounded toward zero
and generate an inexact exception.
*/
jmp			ReturnSingleInp0

/*	Here we handle inputs less than or equal to -1, including -infinity and
NaNs.  The condition code must be set as if by using ucomisd to compare the
input in the "source operand" to nPoint1 in the "destination operand".
*/
InputIsNegativeSpecial:

jp			InputIsNaN

/*	The input is a big negative number.  Return -pi/2 rounded toward zero
and generate an inexact exception.
*/
jmp			ReturnSingleInp0

InputIsNaN:

#if defined __i386__
flds		Argx			// Return result on floating-point stack.
#else
cvtsd2ss	x, %xmm0		// Return in %xmm0.
/*	We cannot just return the input, because we must quiet a
signalling NaN.
*/
#endif

ret
```