# atan2f.s   [plain text]

```/*	atan2f.s -- atan2f for standard math library.

Written by Eric Postpischil, July 2007.
*/

.literal8

// Define miscellaneous constants.

Threshold:	.double	2.384185791015625e-07		// 2**-22.
nZero:		.double	-0

pPi1v4:		.double	+.7853981633974483096156608	// 1/4 pi.
nPi1v4:		.double	-.7853981633974483096156608
pPi1v2:		.double	+1.570796326794896619231322	// 1/2 pi.
nPi1v2:		.double	-1.570796326794896619231322
pPi3v4:		.double	+2.356194490192344928846982	// 3/4 pi.
nPi3v4:		.double	-2.356194490192344928846982

/*	Define values near +/- pi that yield +/- pi rounded toward zero when
converted to single precision.  This allows us to generate inexact and
return the desired values for atan2f on and near the negative side of the x
axis.
*/
AlmostpPi:	.double	+3.1415924

// Define a coefficient for center polynomial (used for x in [-1, +1]).
C2:			.double	 0.0029352921857004596570518

.const
.align	4

/*	Define some coefficients for center polynomial (used for x in [-1, +1]).
These are stored in pairs at aligned addresses for use in SIMD
instructions.
*/
C01:		.double	 2.2971562298314633761676433,  0.0207432003007420961489920
C00:		.double	 2.4449692297316409651126041,  3.7888879287802702842997915
C11:		.double	-2.9466967515109826289085300, -4.9721072376211623038916292
C10:		.double	 5.4728447324456990092824269,  6.7197076223592378022736307

// This needs to be 16-byte aligned because it is used in an orpd instruction.
.align	4
pPi:		.double	+3.141592653589793238462643	// pi.

// Rename the general registers (just to make it easier to keep track of them).
#if defined __i386__
#define	r0	%eax
#define	r1	%ecx
#define	r2	%edx
#define	r3	%ebx
#define	r4	%esp
#define	r5	%ebp
#define	r6	%esi
#define	r7	%edi
#elif defined __x86_64__
#define	r0	%rax
#define	r1	%rcx
#define	r2	%rdx
#define	r3	%rbx
#define	r4	%rsp
#define	r5	%rbp
#define	r6	%rsi
#define	r7	%rdi
#else
#error "Unknown architecture."
#endif

.text

// Define various symbols.

#define	y			%xmm0	// Must be in %xmm0 for return on x86_64.
#define	x			%xmm1
#define	p0			%xmm2
#define	x1			%xmm3
#define	t0			%xmm4
#define	Base		%xmm5
#define	p1			%xmm6

#if defined __i386__

// Define locations of arguments.
#define	Argy			4(%esp)
#define	Argx			8(%esp)

// Define how to address data.  BaseP must contain the address of label 0.

#elif defined __x86_64__

// Define locations of arguments.
#define	Argx			%xmm1
#define	Argy			%xmm0

// Define how to address data.

#endif

/*	float atan2f(float x).

Notes:

This routine has not been proven to be correct.  See the notes in the
accompanying C version regarding potential proof.  The polynomial it
uses was proven to provide faithfully rounded results in atanf.  atan2f
points used in the domain of the polynomial.

Citations in parentheses below indicate the source of a requirement.

"C" stands for ISO/IEC 9899:TC2.

The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
requirements since it defers to C and requires errno behavior only if
we choose to support it by arranging for "math_errhandling &
MATH_ERRNO" to be non-zero, which we do not.

Return value for atan2f(y, x) (C F.9.1 12 and C F.9.1.4):

y			x			atan2f(y, x)

-infinity	-infinity	-3*pi/4.
< 0			-2*pi/4.
-0			-2*pi/4.
+0			-2*pi/4.
> 0			-2*pi/4.
+infinity	-1*pi/4.

< 0			-infinity	-4*pi/4.
< 0			arctangent(y/x) in [-4*pi/4, -2*pi/4].
-0			-2*pi/4.
+0			-2*pi/4.
> 0			arctangent(y/x) in [-2*pi/4, -0*pi/4].
+infinity	-0*pi/4.

-0			-infinity	-4*pi/4.
< 0			-4*pi/4.
-0			-4*pi/4.
+0			-0*pi/4.
> 0			-0*pi/4.
+infinity	-0*pi/4.

+0			-infinity	+4*pi/4.
< 0			+4*pi/4.
-0			+4*pi/4.
+0			+0*pi/4.
> 0			+0*pi/4.
+infinity	+0*pi/4.

> 0			-infinity	+4*pi/4.
< 0			arctangent(y/x) in [+2*pi/4, +4*pi/4].
-0			+2*pi/4.
+0			+2*pi/4.
> 0			arctangent(y/x) in [+0*pi/4, +2*pi/4].
+infinity	+0*pi/4.

+infinity	-infinity	+3*pi/4.
< 0			+2*pi/4.
-0			+2*pi/4.
+0			+2*pi/4.
> 0			+2*pi/4.
+infinity	+1*pi/4.

If either input is a NaN, return one of the NaNs in the input.  (C F.9
11 and 13).  (If the NaN is a signalling NaN, we return the "same" NaN
quieted.)

Otherwise:

If the rounding mode is round-to-nearest, return arctangent(x)
faithfully rounded.

Return a value in [-pi, +pi] (C 7.12.4.4 3).  Note that this
prohibits returning correctly rounded values for -pi and +pi, since
pi rounded to a float lies outside that interval.

Not implemented:  In other rounding modes, return arctangent(x)
possibly with slightly worse error, not necessarily honoring the
rounding mode (Ali Sazegari narrowing C F.9 10).

Exceptions:

Raise underflow for a denormal result (C F.9 7 and Draft Standard for
Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
smallest normal, underflow may or may not be raised.  This is stricter
than the older 754 standard.

May or may not raise inexact, even if the result is exact (C F.9 8).

Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
of C 4.2.1)  but not if the input is a quiet NaN (C F.9 11).

May not raise exceptions otherwise (C F.9 9).

Properties:

We desire this routine to be monotonic, but that has not
been proven.  (For atan2f, monotonicity would mean, if (x0, y0) and
(x1, y1) are in the same quadrant, then y0/x0 <= y1/x1 implies
atan2f(y0, x0) <= atan2f(y1, x1).)
*/
.align	5
.globl _atan2f
_atan2f:

cvtss2sd	Argy, y					// Convert to double precision.
cvtss2sd	Argx, x

#if defined __i386__

// Get address of 0 in BaseP.
call	0f					// Push program counter onto stack.
0:
pop		BaseP				// Get program counter.

#endif

#define	nx	t0
xorpd		x, nx					// Negate x.

ucomisd		x, y
jae			yGEx					// If we jump, y >= x.
je			Unordered				// If we jump, an operand is a NaN.
// If we fall through, y < x.

ucomisd		y, nx
jae			yLTx_and_nxGEy			// If we jump, y < x and -x >= y.
// If we fall through, -x < y < x.

// Return atand(y/x).
divsd		x, y					// Form y/x.
movsd		y, x					// Move to register used by Polynomial.
movsd		Address(nZero), Base	// Set Base to -0.
// This makes the return value -0 if y is -0 and x > 0.
jmp			Polynomial				// Return 0 + arctangent(y/x).

yLTx_and_nxGEy:							// Here y < x && y <= -x.
je			yLTx_and_nxEQy			// If we jump, y < x && -x == y.
// If we fall through, y < x < -y.

// Return -pi/2 - atand(x/y).
movsd		Address(nZero), t0		// Get -0 for sign bit.
divsd		y, x					// Form x/y.
xorpd		t0, x					// Form -x/y.
movsd		Address(nPi1v2), Base	// Set Base to -pi/2.
jmp			Polynomial				// Return -pi/2 + arctangent(-x/y).

yLTx_and_nxEQy: 						// Here -x == y < x.
// Return -1/4*pi with inexact exception.
jmp			ReturnSingle

yGEx:									// Here y >= x.
je			yEQx					// If we jump, y == x.
// If we fall through, y > x.

ucomisd		y, nx
#undef	nx
jae			yGTx_and_nxGEy			// If we jump, y > x && -x >= y.
// If we fall through, -y < x < y.

// Return +pi/2 - atand(x/y).
movsd		Address(nZero), t0		// Get -0 for sign bit.
divsd		y, x					// Form x/y.
movsd		Address(pPi1v2), Base	// Set Base to +pi/2.
xorpd		t0, x					// Form -x/y.
jmp			Polynomial				// Return +pi/2 + arctangent(-x/y).

yGTx_and_nxGEy:							// Here y > x && -x >= y.
je			yGTx_and_nxEQy			// If we jump, y > x && -x == y.
// If we fall through, x < y < -x.

movapd		Base, t0				// Copy mask.
andpd		y, Base					// Extract sign bit of y.
divsd		x, y					// Form y/x.
andnpd		y, t0					// Take absolute value of quotient.
comisd		Address(Threshold), t0	// Is quotient small?
jbe			NearNegativeXAxis
orpd		Address(pPi), Base		// Set Base to pi with y's sign.
movsd		y, x					// Move to register used by Polynomial.
jmp			Polynomial				// Return +/- pi + arctangent(y/x).

NearNegativeXAxis:
// Return -pi or +pi, matching the sign of y, rounded toward zero.
xorpd		Base, p0
jmp			ReturnDouble

yGTx_and_nxEQy:							// Here x < y == -x.
// Return +3/4*pi with inexact exception.
jmp			ReturnSingle

yEQx:									// Here y == x.

jae			yEQx_and_yGE0			// If we jump, y == x && y >= 0.
// If we fall through, x == y < -x.

// Return -3/4*pi with inexact exception.
jmp			ReturnSingle

yEQx_and_yGE0:							// Here y == x && y >= 0.
je			yEQx_and_yEQ0			// If we jump, y == x && y == 0.
// If we fall through, -x < y == x.

// Return +1/4*pi with inexact exception.
jmp			ReturnSingle

yEQx_and_yEQ0:							// Here y == x == 0.

/*	Return:
x	y	atan2f(y, x)
-0	-0	-pi, with inexact exception.
-0	+0	+pi, with inexact exception.
+0	-0	-0.
+0	+0	+0.
*/

/*	We want to know if x is -0 or +0, but there is no direct test for that
that puts the results in a vector register.  We do an arithmetic right
shift to fill up the exponent bits with copies of the sign bit.  This
produces a NaN or +0.  Then we test for "unordered", which yields all
one bits if x was -0 and all zero bits if x was +0.
*/
psraw		\$12, x
cmpunordsd	x, x

// Form (almost) pi if x was -0 and 0 if x was +0.
andpd		x, p0

orpd		y, p0					// Apply the sign bit from y.
jmp			ReturnDouble

Unordered:								// Here x or y is a NaN.
addsd		x, y					// Return one of the NaNs, quieted.
cvtsd2ss	y, y
jmp			ReturnSingle

/*	This is the principal arctangent evaluation.  Previous code has prepared
the Base and y registers, and we need to calculate Base + arctangent(y).
The result is then converted to a single-precision number and returned.

-1 <= y <= +1.  (Actually, -1 < y < +1.  The equalities were sidetracked
during all the branching above, and division of two different
single-precision numbers converted to double-precision never rounds to
one.)

There are some slight inefficiencies here, in that special cases could omit
a few instructions -- sometimes the base is zero or y had to be negated to
fit this common code.  So, if speed is all important, this routine might be
speeded up a little by replicating this code.
*/
Polynomial:

/*	The polynomial that approximates arctangent has been arranged into the
form:

x * c2
* ((x**4 + c01 * x**2 + c00))
* ((x**4 + c11 * x**2 + c10))
* ((x**4 + c21 * x**2 + c20))
* ((x**4 + c31 * x**2 + c30))

The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20
at C00, c11 and c31 at C11, and c10 and c30 at C10.  c2 is at C2.

The quartic factors are evaluated in SIMD registers.  For brevity, some
comments below describe only one element of a register.  The other is
analagous.
*/
movsd		x, x1				// Save a copy of x for later.
mulsd		x, x				// Form x**2.
#define	x2	x	// Define name describing current register contents.
movlhps		x2, x2				// Duplicate x**2.
addpd		x2, p1				// Form x**2 + c11.
mulpd		x2, p1				// Form x**4 + c11 * x**2.
addpd		Address(C10), p1	// Form x**4 + c11 * x**2 + c10.
movapd		Address(C01), p0	// Get first coefficients.
addpd		x2, p0				// Form x**2 + c01.
mulpd		x2, p0				// Form x**4 + c01 * x**2.
movhpd		Address(C2), x1		// Put c2 in one element, with x in other.
addpd		Address(C00), p0	// Form x**4 + c01 * x**2 + c00.
mulpd		p1, p0				// Combine factors.
mulpd		x1, p0				// Multiply by x and c2.
movhlps		p0, p1				// Get high element.
mulsd		p1, p0				// Finish combining factors.
#undef x2

// Return the double-precision number currently in p0.
ReturnDouble:
cvtsd2ss	p0, y				// Convert result to single precision.

// Return the single-precision number currently in p0.
ReturnSingle:

#if defined __i386__
movss		y, Argx			// Shuttle result through memory.
// This uses the argument area for scratch space, which is allowed.
flds		Argx			// Return input on floating-point stack.
#else
// On x86_64, the return value is now in y, which is %xmm0.
#endif

ret
```