asinhf.s   [plain text]


/*
 * asinhf.s
 *
 *      by Stephen Canon
 *
 * Copyright (c) 2007, Apple Inc.  All Rights Reserved.
 *
 * This file implements the C99 asinhf function for the MacOS X __i386__ and __x86_64__ ABIs.
 */

/*
 This code divides the floating-point numbers into three ranges, and uses a different approximation
 to asinhf on each range.
 
  For the "small" case, |x| < 1/4, a simpler approximation by a 7th-order polynomial suffices:
 
    asinhf(x) = copysign(x) p(x)
		
	p(x) has no constant term, and is computed via the factorization:
	
		p(x) = cx(x + a)(x + b)(x^2 + cx + d)(x^2 + ex + f)
		
	the two linear terms and the two quadratic terms are computed side-by-side in packed arithmetic.
	Some care must be taken to avoid setting inexact in the product (x+b)(x^2 + ex + f) for the case
	when x = 0, as b and f are full-width double-precision numbers.
	

  The "middle" case is 1/4 < |x| < 4.  Here a straightforward rational polynomial approximation is used:
                               p(x)
		asinhf(x) = copysign(x) ----------
                               q(x)
	p(x) and q(x) are both order 6 polynomials, so they are computed side-by-side in packed arithmetic.
	

	The "large" case is the most complicated, owing to the fact that asinh(|x|) = log(2|x|) + O(1/x^2)
	for large x. Because of this, we take a divide and compute a logarithm (via polynomial approximation).
	The logarithm code is copied from the file logf.s (Ians code), though some registers have been renamed,
	so take care in making edits.  A sketch of the algorithm for the log is provided where it is used in
	this code, but more detailed comments may be found in logf.s.
	
	After the log, a minimax polynomial in 1/x is used as a correction.
	
	The "large" codepath is probably most suceptable to improvement - it is possible that a rational
	approximation would prove faster (in terms of latency) for this case.  The "small" and "middle"
	code paths are believed to be more nearly optimal.

	Globally, the error of this code is < .51 ulps
	
	- stc (14 june, 2007)
 */

#include <machine/asm.h>
#include "abi.h"

.const

// Coefficients for 7th order polynomial approximation on [0, 1/4]
// The polynomail is computed in packed factored form as follows:
// 
//     hi double:        (x + ahi)  * (x(x + b1hi) + b0hi)
//     lo double:  (cx * (x + alo)) * (x(x + b1lo) + b0lo)
//
// The high and low parts are then unpacked and multiplied.
.align 4
asinhf_low: .quad 0x4008183efcaf7119, 0x4007eba0a6c21cf1 // b0hi, b0lo
						.quad 0x3ffe9547f4507ace, 0xbffd2c173c2ad586 // ahi, alo
						.quad 0x40021aa6afb79159, 0xc0015e2ed556dde5 // b1hi, b1lo
						.quad 0xbfa050433953b0f9                  // c

// Coefficients for rational approximation on [1/4, 4]
// p(x) and q(x) are computed side-by-side in packed arithmetic, then unpacked and divided.
.align 4
asinhf_mid:	.quad 0x3e4328ccef61bd30,	0x3f80f6f9cf323b3c	// p[0], q[0]
						.quad 0x3f80f6e561f06785,	0x3f85b29a3e277523	// p[1], q[1]
						.quad 0x3f85b32a2f11b40a, 0x3f8e26416e925090	// p[2], q[2]
						.quad 0x3f8b5062b37338e9,	0x3f8417abb8190857	// p[3], q[3]
						.quad 0x3f807d5f61f237fc, 0x3f74ccecc3a9e525	// p[4], q[4]
						.quad 0x3f6a9bd617bd81f1,	0x3f5135659e34c549	// p[5], q[5]
						.quad 0x3f26c17c7b263d18,	0x3f001e11059bddca	// p[6], q[6]
						

.align 4
// Polynomial coefficients for the correction to log(2x) for the "large" case.
// The polynomial is computed in factored form as follows:
//
//          cx(x + a)(x^2 + b1hi x + b0hi)(x^2 + b1lo x + b0lo)
//
// The two quadratic terms are computed with packed arithmetic.
                 .quad 0x4005413ddf1d2ea5, 0x4003fbe28981b6b8 // b0hi, b0lo
                 .quad 0x4006dd1e92567458, 0xc0057cccb680cfdf // b1hi, b1lo
                 .quad 0x3e9564f782290c65, 0x3fa3493848c3ecc6 // a, c
// Table used to compute log(2x) for the "large" case.  Copied from logf.s
asinhf_hi_table: .quad	0x0000000000000000,	0x3ff0000000000000	// log(1.00000), 1/1.00000
                 .quad	0x3f6ff00aa2b10bc0,	0x3fefe01fe01fe020	// log(1.00391), 1/1.00391
                 .quad	0x3f7fe02a6b106789,	0x3fefc07f01fc07f0	// log(1.00781), 1/1.00781
                 .quad	0x3f87dc475f810a77,	0x3fefa11caa01fa12	// log(1.01172), 1/1.01172
                 .quad	0x3f8fc0a8b0fc03e4,	0x3fef81f81f81f820	// log(1.01562), 1/1.01562
                 .quad	0x3f93cea44346a575,	0x3fef6310aca0dbb5	// log(1.01953), 1/1.01953
                 .quad	0x3f97b91b07d5b11b,	0x3fef44659e4a4271	// log(1.02344), 1/1.02344
                 .quad	0x3f9b9fc027af9198,	0x3fef25f644230ab5	// log(1.02734), 1/1.02734
                 .quad	0x3f9f829b0e783300,	0x3fef07c1f07c1f08	// log(1.03125), 1/1.03125
                 .quad	0x3fa1b0d98923d980,	0x3feee9c7f8458e02	// log(1.03516), 1/1.03516
                 .quad	0x3fa39e87b9febd60,	0x3feecc07b301ecc0	// log(1.03906), 1/1.03906
                 .quad	0x3fa58a5bafc8e4d5,	0x3feeae807aba01eb	// log(1.04297), 1/1.04297
                 .quad	0x3fa77458f632dcfc,	0x3fee9131abf0b767	// log(1.04688), 1/1.04688
                 .quad	0x3fa95c830ec8e3eb,	0x3fee741aa59750e4	// log(1.05078), 1/1.05078
                 .quad	0x3fab42dd711971bf,	0x3fee573ac901e574	// log(1.05469), 1/1.05469
                 .quad	0x3fad276b8adb0b52,	0x3fee3a9179dc1a73	// log(1.05859), 1/1.05859
                 .quad	0x3faf0a30c01162a6,	0x3fee1e1e1e1e1e1e	// log(1.06250), 1/1.06250
                 .quad	0x3fb075983598e471,	0x3fee01e01e01e01e	// log(1.06641), 1/1.06641
                 .quad	0x3fb16536eea37ae1,	0x3fede5d6e3f8868a	// log(1.07031), 1/1.07031
                 .quad	0x3fb253f62f0a1417,	0x3fedca01dca01dca	// log(1.07422), 1/1.07422
                 .quad	0x3fb341d7961bd1d1,	0x3fedae6076b981db	// log(1.07812), 1/1.07812
                 .quad	0x3fb42edcbea646f0,	0x3fed92f2231e7f8a	// log(1.08203), 1/1.08203
                 .quad	0x3fb51b073f06183f,	0x3fed77b654b82c34	// log(1.08594), 1/1.08594
                 .quad	0x3fb60658a93750c4,	0x3fed5cac807572b2	// log(1.08984), 1/1.08984
                 .quad	0x3fb6f0d28ae56b4c,	0x3fed41d41d41d41d	// log(1.09375), 1/1.09375
                 .quad	0x3fb7da766d7b12cd,	0x3fed272ca3fc5b1a	// log(1.09766), 1/1.09766
                 .quad	0x3fb8c345d6319b21,	0x3fed0cb58f6ec074	// log(1.10156), 1/1.10156
                 .quad	0x3fb9ab42462033ad,	0x3fecf26e5c44bfc6	// log(1.10547), 1/1.10547
                 .quad	0x3fba926d3a4ad563,	0x3fecd85689039b0b	// log(1.10938), 1/1.10938
                 .quad	0x3fbb78c82bb0eda1,	0x3fecbe6d9601cbe7	// log(1.11328), 1/1.11328
                 .quad	0x3fbc5e548f5bc743,	0x3feca4b3055ee191	// log(1.11719), 1/1.11719
                 .quad	0x3fbd4313d66cb35d,	0x3fec8b265afb8a42	// log(1.12109), 1/1.12109
                 .quad	0x3fbe27076e2af2e6,	0x3fec71c71c71c71c	// log(1.12500), 1/1.12500
                 .quad	0x3fbf0a30c01162a6,	0x3fec5894d10d4986	// log(1.12891), 1/1.12891
                 .quad	0x3fbfec9131dbeabb,	0x3fec3f8f01c3f8f0	// log(1.13281), 1/1.13281
                 .quad	0x3fc0671512ca596e,	0x3fec26b5392ea01c	// log(1.13672), 1/1.13672
                 .quad	0x3fc0d77e7cd08e59,	0x3fec0e070381c0e0	// log(1.14062), 1/1.14062
                 .quad	0x3fc14785846742ac,	0x3febf583ee868d8b	// log(1.14453), 1/1.14453
                 .quad	0x3fc1b72ad52f67a0,	0x3febdd2b899406f7	// log(1.14844), 1/1.14844
                 .quad	0x3fc2266f190a5acb,	0x3febc4fd65883e7b	// log(1.15234), 1/1.15234
                 .quad	0x3fc29552f81ff523,	0x3febacf914c1bad0	// log(1.15625), 1/1.15625
                 .quad	0x3fc303d718e47fd3,	0x3feb951e2b18ff23	// log(1.16016), 1/1.16016
                 .quad	0x3fc371fc201e8f74,	0x3feb7d6c3dda338b	// log(1.16406), 1/1.16406
                 .quad	0x3fc3dfc2b0ecc62a,	0x3feb65e2e3beee05	// log(1.16797), 1/1.16797
                 .quad	0x3fc44d2b6ccb7d1e,	0x3feb4e81b4e81b4f	// log(1.17188), 1/1.17188
                 .quad	0x3fc4ba36f39a55e5,	0x3feb37484ad806ce	// log(1.17578), 1/1.17578
                 .quad	0x3fc526e5e3a1b438,	0x3feb2036406c80d9	// log(1.17969), 1/1.17969
                 .quad	0x3fc59338d9982086,	0x3feb094b31d922a4	// log(1.18359), 1/1.18359
                 .quad	0x3fc5ff3070a793d4,	0x3feaf286bca1af28	// log(1.18750), 1/1.18750
                 .quad	0x3fc66acd4272ad51,	0x3feadbe87f94905e	// log(1.19141), 1/1.19141
                 .quad	0x3fc6d60fe719d21d,	0x3feac5701ac5701b	// log(1.19531), 1/1.19531
                 .quad	0x3fc740f8f54037a5,	0x3feaaf1d2f87ebfd	// log(1.19922), 1/1.19922
                 .quad	0x3fc7ab890210d909,	0x3fea98ef606a63be	// log(1.20312), 1/1.20312
                 .quad	0x3fc815c0a14357eb,	0x3fea82e65130e159	// log(1.20703), 1/1.20703
                 .quad	0x3fc87fa06520c911,	0x3fea6d01a6d01a6d	// log(1.21094), 1/1.21094
                 .quad	0x3fc8e928de886d41,	0x3fea574107688a4a	// log(1.21484), 1/1.21484
                 .quad	0x3fc9525a9cf456b4,	0x3fea41a41a41a41a	// log(1.21875), 1/1.21875
                 .quad	0x3fc9bb362e7dfb83,	0x3fea2c2a87c51ca0	// log(1.22266), 1/1.22266
                 .quad	0x3fca23bc1fe2b563,	0x3fea16d3f97a4b02	// log(1.22656), 1/1.22656
                 .quad	0x3fca8becfc882f19,	0x3fea01a01a01a01a	// log(1.23047), 1/1.23047
                 .quad	0x3fcaf3c94e80bff3,	0x3fe9ec8e951033d9	// log(1.23438), 1/1.23438
                 .quad	0x3fcb5b519e8fb5a4,	0x3fe9d79f176b682d	// log(1.23828), 1/1.23828
                 .quad	0x3fcbc286742d8cd6,	0x3fe9c2d14ee4a102	// log(1.24219), 1/1.24219
                 .quad	0x3fcc2968558c18c1,	0x3fe9ae24ea5510da	// log(1.24609), 1/1.24609
                 .quad	0x3fcc8ff7c79a9a22,	0x3fe999999999999a	// log(1.25000), 1/1.25000
                 .quad	0x3fccf6354e09c5dc,	0x3fe9852f0d8ec0ff	// log(1.25391), 1/1.25391
                 .quad	0x3fcd5c216b4fbb91,	0x3fe970e4f80cb872	// log(1.25781), 1/1.25781
                 .quad	0x3fcdc1bca0abec7d,	0x3fe95cbb0be377ae	// log(1.26172), 1/1.26172
                 .quad	0x3fce27076e2af2e6,	0x3fe948b0fcd6e9e0	// log(1.26562), 1/1.26562
                 .quad	0x3fce8c0252aa5a60,	0x3fe934c67f9b2ce6	// log(1.26953), 1/1.26953
                 .quad	0x3fcef0adcbdc5936,	0x3fe920fb49d0e229	// log(1.27344), 1/1.27344
                 .quad	0x3fcf550a564b7b37,	0x3fe90d4f120190d5	// log(1.27734), 1/1.27734
                 .quad	0x3fcfb9186d5e3e2b,	0x3fe8f9c18f9c18fa	// log(1.28125), 1/1.28125
                 .quad	0x3fd00e6c45ad501d,	0x3fe8e6527af1373f	// log(1.28516), 1/1.28516
                 .quad	0x3fd0402594b4d041,	0x3fe8d3018d3018d3	// log(1.28906), 1/1.28906
                 .quad	0x3fd071b85fcd590d,	0x3fe8bfce8062ff3a	// log(1.29297), 1/1.29297
                 .quad	0x3fd0a324e27390e3,	0x3fe8acb90f6bf3aa	// log(1.29688), 1/1.29688
                 .quad	0x3fd0d46b579ab74b,	0x3fe899c0f601899c	// log(1.30078), 1/1.30078
                 .quad	0x3fd1058bf9ae4ad5,	0x3fe886e5f0abb04a	// log(1.30469), 1/1.30469
                 .quad	0x3fd136870293a8b0,	0x3fe87427bcc092b9	// log(1.30859), 1/1.30859
                 .quad	0x3fd1675cababa60e,	0x3fe8618618618618	// log(1.31250), 1/1.31250
                 .quad	0x3fd1980d2dd4236f,	0x3fe84f00c2780614	// log(1.31641), 1/1.31641
                 .quad	0x3fd1c898c16999fb,	0x3fe83c977ab2bedd	// log(1.32031), 1/1.32031
                 .quad	0x3fd1f8ff9e48a2f3,	0x3fe82a4a0182a4a0	// log(1.32422), 1/1.32422
                 .quad	0x3fd22941fbcf7966,	0x3fe8181818181818	// log(1.32812), 1/1.32812
                 .quad	0x3fd2596010df763a,	0x3fe8060180601806	// log(1.33203), 1/1.33203
                 .quad	0x3fd2895a13de86a3,	0x3fe7f405fd017f40	// log(1.33594), 1/1.33594
                 .quad	0x3fd2b9303ab89d25,	0x3fe7e225515a4f1d	// log(1.33984), 1/1.33984
                 .quad	0x3fd2e8e2bae11d31,	0x3fe7d05f417d05f4	// log(1.34375), 1/1.34375
                 .quad	0x3fd31871c9544185,	0x3fe7beb3922e017c	// log(1.34766), 1/1.34766
                 .quad	0x3fd347dd9a987d55,	0x3fe7ad2208e0ecc3	// log(1.35156), 1/1.35156
                 .quad	0x3fd3772662bfd85b,	0x3fe79baa6bb6398b	// log(1.35547), 1/1.35547
                 .quad	0x3fd3a64c556945ea,	0x3fe78a4c8178a4c8	// log(1.35938), 1/1.35938
                 .quad	0x3fd3d54fa5c1f710,	0x3fe77908119ac60d	// log(1.36328), 1/1.36328
                 .quad	0x3fd404308686a7e4,	0x3fe767dce434a9b1	// log(1.36719), 1/1.36719
                 .quad	0x3fd432ef2a04e814,	0x3fe756cac201756d	// log(1.37109), 1/1.37109
                 .quad	0x3fd4618bc21c5ec2,	0x3fe745d1745d1746	// log(1.37500), 1/1.37500
                 .quad	0x3fd49006804009d1,	0x3fe734f0c541fe8d	// log(1.37891), 1/1.37891
                 .quad	0x3fd4be5f957778a1,	0x3fe724287f46debc	// log(1.38281), 1/1.38281
                 .quad	0x3fd4ec973260026a,	0x3fe713786d9c7c09	// log(1.38672), 1/1.38672
                 .quad	0x3fd51aad872df82d,	0x3fe702e05c0b8170	// log(1.39062), 1/1.39062
                 .quad	0x3fd548a2c3add263,	0x3fe6f26016f26017	// log(1.39453), 1/1.39453
                 .quad	0x3fd5767717455a6c,	0x3fe6e1f76b4337c7	// log(1.39844), 1/1.39844
                 .quad	0x3fd5a42ab0f4cfe2,	0x3fe6d1a62681c861	// log(1.40234), 1/1.40234
                 .quad	0x3fd5d1bdbf5809ca,	0x3fe6c16c16c16c17	// log(1.40625), 1/1.40625
                 .quad	0x3fd5ff3070a793d4,	0x3fe6b1490aa31a3d	// log(1.41016), 1/1.41016
                 .quad	0x3fd62c82f2b9c795,	0x3fe6a13cd1537290	// log(1.41406), 1/1.41406
                 .quad	0x3fd659b57303e1f3,	0x3fe691473a88d0c0	// log(1.41797), 1/1.41797
                 .quad	0x3fd686c81e9b14af,	0x3fe6816816816817	// log(1.42188), 1/1.42188
                 .quad	0x3fd6b3bb2235943e,	0x3fe6719f3601671a	// log(1.42578), 1/1.42578
                 .quad	0x3fd6e08eaa2ba1e4,	0x3fe661ec6a5122f9	// log(1.42969), 1/1.42969
                 .quad	0x3fd70d42e2789236,	0x3fe6524f853b4aa3	// log(1.43359), 1/1.43359
                 .quad	0x3fd739d7f6bbd007,	0x3fe642c8590b2164	// log(1.43750), 1/1.43750
                 .quad	0x3fd7664e1239dbcf,	0x3fe63356b88ac0de	// log(1.44141), 1/1.44141
                 .quad	0x3fd792a55fdd47a2,	0x3fe623fa77016240	// log(1.44531), 1/1.44531
                 .quad	0x3fd7bede0a37afc0,	0x3fe614b36831ae94	// log(1.44922), 1/1.44922
                 .quad	0x3fd7eaf83b82afc3,	0x3fe6058160581606	// log(1.45312), 1/1.45312
                 .quad	0x3fd816f41da0d496,	0x3fe5f66434292dfc	// log(1.45703), 1/1.45703
                 .quad	0x3fd842d1da1e8b17,	0x3fe5e75bb8d015e7	// log(1.46094), 1/1.46094
                 .quad	0x3fd86e919a330ba0,	0x3fe5d867c3ece2a5	// log(1.46484), 1/1.46484
                 .quad	0x3fd89a3386c1425b,	0x3fe5c9882b931057	// log(1.46875), 1/1.46875
                 .quad	0x3fd8c5b7c858b48b,	0x3fe5babcc647fa91	// log(1.47266), 1/1.47266
                 .quad	0x3fd8f11e873662c7,	0x3fe5ac056b015ac0	// log(1.47656), 1/1.47656
                 .quad	0x3fd91c67eb45a83e,	0x3fe59d61f123ccaa	// log(1.48047), 1/1.48047
                 .quad	0x3fd947941c2116fb,	0x3fe58ed2308158ed	// log(1.48438), 1/1.48438
                 .quad	0x3fd972a341135158,	0x3fe5805601580560	// log(1.48828), 1/1.48828
                 .quad	0x3fd99d958117e08b,	0x3fe571ed3c506b3a	// log(1.49219), 1/1.49219
                 .quad	0x3fd9c86b02dc0863,	0x3fe56397ba7c52e2	// log(1.49609), 1/1.49609
                 .quad	0x3fd9f323ecbf984c,	0x3fe5555555555555	// log(1.50000), 1/1.50000
                 .quad	0x3fda1dc064d5b995,	0x3fe54725e6bb82fe	// log(1.50391), 1/1.50391
                 .quad	0x3fda484090e5bb0a,	0x3fe5390948f40feb	// log(1.50781), 1/1.50781
                 .quad	0x3fda72a4966bd9ea,	0x3fe52aff56a8054b	// log(1.51172), 1/1.51172
                 .quad	0x3fda9cec9a9a084a,	0x3fe51d07eae2f815	// log(1.51562), 1/1.51562
                 .quad	0x3fdac718c258b0e4,	0x3fe50f22e111c4c5	// log(1.51953), 1/1.51953
                 .quad	0x3fdaf1293247786b,	0x3fe5015015015015	// log(1.52344), 1/1.52344
                 .quad	0x3fdb1b1e0ebdfc5b,	0x3fe4f38f62dd4c9b	// log(1.52734), 1/1.52734
                 .quad	0x3fdb44f77bcc8f63,	0x3fe4e5e0a72f0539	// log(1.53125), 1/1.53125
                 .quad	0x3fdb6eb59d3cf35e,	0x3fe4d843bedc2c4c	// log(1.53516), 1/1.53516
                 .quad	0x3fdb9858969310fb,	0x3fe4cab88725af6e	// log(1.53906), 1/1.53906
                 .quad	0x3fdbc1e08b0dad0a,	0x3fe4bd3edda68fe1	// log(1.54297), 1/1.54297
                 .quad	0x3fdbeb4d9da71b7c,	0x3fe4afd6a052bf5b	// log(1.54688), 1/1.54688
                 .quad	0x3fdc149ff115f027,	0x3fe4a27fad76014a	// log(1.55078), 1/1.55078
                 .quad	0x3fdc3dd7a7cdad4d,	0x3fe49539e3b2d067	// log(1.55469), 1/1.55469
                 .quad	0x3fdc66f4e3ff6ff8,	0x3fe4880522014880	// log(1.55859), 1/1.55859
                 .quad	0x3fdc8ff7c79a9a22,	0x3fe47ae147ae147b	// log(1.56250), 1/1.56250
                 .quad	0x3fdcb8e0744d7aca,	0x3fe46dce34596066	// log(1.56641), 1/1.56641
                 .quad	0x3fdce1af0b85f3eb,	0x3fe460cbc7f5cf9a	// log(1.57031), 1/1.57031
                 .quad	0x3fdd0a63ae721e64,	0x3fe453d9e2c776ca	// log(1.57422), 1/1.57422
                 .quad	0x3fdd32fe7e00ebd5,	0x3fe446f86562d9fb	// log(1.57812), 1/1.57812
                 .quad	0x3fdd5b7f9ae2c684,	0x3fe43a2730abee4d	// log(1.58203), 1/1.58203
                 .quad	0x3fdd83e7258a2f3e,	0x3fe42d6625d51f87	// log(1.58594), 1/1.58594
                 .quad	0x3fddac353e2c5954,	0x3fe420b5265e5951	// log(1.58984), 1/1.58984
                 .quad	0x3fddd46a04c1c4a1,	0x3fe4141414141414	// log(1.59375), 1/1.59375
                 .quad	0x3fddfc859906d5b5,	0x3fe40782d10e6566	// log(1.59766), 1/1.59766
                 .quad	0x3fde24881a7c6c26,	0x3fe3fb013fb013fb	// log(1.60156), 1/1.60156
                 .quad	0x3fde4c71a8687704,	0x3fe3ee8f42a5af07	// log(1.60547), 1/1.60547
                 .quad	0x3fde744261d68788,	0x3fe3e22cbce4a902	// log(1.60938), 1/1.60938
                 .quad	0x3fde9bfa659861f5,	0x3fe3d5d991aa75c6	// log(1.61328), 1/1.61328
                 .quad	0x3fdec399d2468cc0,	0x3fe3c995a47babe7	// log(1.61719), 1/1.61719
                 .quad	0x3fdeeb20c640ddf4,	0x3fe3bd60d9232955	// log(1.62109), 1/1.62109
                 .quad	0x3fdf128f5faf06ed,	0x3fe3b13b13b13b14	// log(1.62500), 1/1.62500
                 .quad	0x3fdf39e5bc811e5c,	0x3fe3a524387ac822	// log(1.62891), 1/1.62891
                 .quad	0x3fdf6123fa7028ac,	0x3fe3991c2c187f63	// log(1.63281), 1/1.63281
                 .quad	0x3fdf884a36fe9ec2,	0x3fe38d22d366088e	// log(1.63672), 1/1.63672
                 .quad	0x3fdfaf588f78f31f,	0x3fe3813813813814	// log(1.64062), 1/1.64062
                 .quad	0x3fdfd64f20f61572,	0x3fe3755bd1c945ee	// log(1.64453), 1/1.64453
                 .quad	0x3fdffd2e0857f498,	0x3fe3698df3de0748	// log(1.64844), 1/1.64844
                 .quad	0x3fe011fab125ff8a,	0x3fe35dce5f9f2af8	// log(1.65234), 1/1.65234
                 .quad	0x3fe02552a5a5d0ff,	0x3fe3521cfb2b78c1	// log(1.65625), 1/1.65625
                 .quad	0x3fe0389eefce633b,	0x3fe34679ace01346	// log(1.66016), 1/1.66016
                 .quad	0x3fe04bdf9da926d2,	0x3fe33ae45b57bcb2	// log(1.66406), 1/1.66406
                 .quad	0x3fe05f14bd26459c,	0x3fe32f5ced6a1dfa	// log(1.66797), 1/1.66797
                 .quad	0x3fe0723e5c1cdf40,	0x3fe323e34a2b10bf	// log(1.67188), 1/1.67188
                 .quad	0x3fe0855c884b450e,	0x3fe3187758e9ebb6	// log(1.67578), 1/1.67578
                 .quad	0x3fe0986f4f573521,	0x3fe30d190130d190	// log(1.67969), 1/1.67969
                 .quad	0x3fe0ab76bece14d2,	0x3fe301c82ac40260	// log(1.68359), 1/1.68359
                 .quad	0x3fe0be72e4252a83,	0x3fe2f684bda12f68	// log(1.68750), 1/1.68750
                 .quad	0x3fe0d163ccb9d6b8,	0x3fe2eb4ea1fed14b	// log(1.69141), 1/1.69141
                 .quad	0x3fe0e44985d1cc8c,	0x3fe2e025c04b8097	// log(1.69531), 1/1.69531
                 .quad	0x3fe0f7241c9b497d,	0x3fe2d50a012d50a0	// log(1.69922), 1/1.69922
                 .quad	0x3fe109f39e2d4c97,	0x3fe2c9fb4d812ca0	// log(1.70312), 1/1.70312
                 .quad	0x3fe11cb81787ccf8,	0x3fe2bef98e5a3711	// log(1.70703), 1/1.70703
                 .quad	0x3fe12f719593efbc,	0x3fe2b404ad012b40	// log(1.71094), 1/1.71094
                 .quad	0x3fe1422025243d45,	0x3fe2a91c92f3c105	// log(1.71484), 1/1.71484
                 .quad	0x3fe154c3d2f4d5ea,	0x3fe29e4129e4129e	// log(1.71875), 1/1.71875
                 .quad	0x3fe1675cababa60e,	0x3fe293725bb804a5	// log(1.72266), 1/1.72266
                 .quad	0x3fe179eabbd899a1,	0x3fe288b01288b013	// log(1.72656), 1/1.72656
                 .quad	0x3fe18c6e0ff5cf06,	0x3fe27dfa38a1ce4d	// log(1.73047), 1/1.73047
                 .quad	0x3fe19ee6b467c96f,	0x3fe27350b8812735	// log(1.73438), 1/1.73438
                 .quad	0x3fe1b154b57da29f,	0x3fe268b37cd60127	// log(1.73828), 1/1.73828
                 .quad	0x3fe1c3b81f713c25,	0x3fe25e22708092f1	// log(1.74219), 1/1.74219
                 .quad	0x3fe1d610fe677003,	0x3fe2539d7e9177b2	// log(1.74609), 1/1.74609
                 .quad	0x3fe1e85f5e7040d0,	0x3fe2492492492492	// log(1.75000), 1/1.75000
                 .quad	0x3fe1faa34b87094c,	0x3fe23eb79717605b	// log(1.75391), 1/1.75391
                 .quad	0x3fe20cdcd192ab6e,	0x3fe23456789abcdf	// log(1.75781), 1/1.75781
                 .quad	0x3fe21f0bfc65beec,	0x3fe22a0122a0122a	// log(1.76172), 1/1.76172
                 .quad	0x3fe23130d7bebf43,	0x3fe21fb78121fb78	// log(1.76562), 1/1.76562
                 .quad	0x3fe2434b6f483934,	0x3fe21579804855e6	// log(1.76953), 1/1.76953
                 .quad	0x3fe2555bce98f7cb,	0x3fe20b470c67c0d9	// log(1.77344), 1/1.77344
                 .quad	0x3fe26762013430e0,	0x3fe2012012012012	// log(1.77734), 1/1.77734
                 .quad	0x3fe2795e1289b11b,	0x3fe1f7047dc11f70	// log(1.78125), 1/1.78125
                 .quad	0x3fe28b500df60783,	0x3fe1ecf43c7fb84c	// log(1.78516), 1/1.78516
                 .quad	0x3fe29d37fec2b08b,	0x3fe1e2ef3b3fb874	// log(1.78906), 1/1.78906
                 .quad	0x3fe2af15f02640ad,	0x3fe1d8f5672e4abd	// log(1.79297), 1/1.79297
                 .quad	0x3fe2c0e9ed448e8c,	0x3fe1cf06ada2811d	// log(1.79688), 1/1.79688
                 .quad	0x3fe2d2b4012edc9e,	0x3fe1c522fc1ce059	// log(1.80078), 1/1.80078
                 .quad	0x3fe2e47436e40268,	0x3fe1bb4a4046ed29	// log(1.80469), 1/1.80469
                 .quad	0x3fe2f62a99509546,	0x3fe1b17c67f2bae3	// log(1.80859), 1/1.80859
                 .quad	0x3fe307d7334f10be,	0x3fe1a7b9611a7b96	// log(1.81250), 1/1.81250
                 .quad	0x3fe3197a0fa7fe6a,	0x3fe19e0119e0119e	// log(1.81641), 1/1.81641
                 .quad	0x3fe32b1339121d71,	0x3fe19453808ca29c	// log(1.82031), 1/1.82031
                 .quad	0x3fe33ca2ba328995,	0x3fe18ab083902bdb	// log(1.82422), 1/1.82422
                 .quad	0x3fe34e289d9ce1d3,	0x3fe1811811811812	// log(1.82812), 1/1.82812
                 .quad	0x3fe35fa4edd36ea0,	0x3fe1778a191bd684	// log(1.83203), 1/1.83203
                 .quad	0x3fe37117b54747b6,	0x3fe16e0689427379	// log(1.83594), 1/1.83594
                 .quad	0x3fe38280fe58797f,	0x3fe1648d50fc3201	// log(1.83984), 1/1.83984
                 .quad	0x3fe393e0d3562a1a,	0x3fe15b1e5f75270d	// log(1.84375), 1/1.84375
                 .quad	0x3fe3a5373e7ebdfa,	0x3fe151b9a3fdd5c9	// log(1.84766), 1/1.84766
                 .quad	0x3fe3b68449fffc23,	0x3fe1485f0e0acd3b	// log(1.85156), 1/1.85156
                 .quad	0x3fe3c7c7fff73206,	0x3fe13f0e8d344724	// log(1.85547), 1/1.85547
                 .quad	0x3fe3d9026a7156fb,	0x3fe135c81135c811	// log(1.85938), 1/1.85938
                 .quad	0x3fe3ea33936b2f5c,	0x3fe12c8b89edc0ac	// log(1.86328), 1/1.86328
                 .quad	0x3fe3fb5b84d16f42,	0x3fe12358e75d3033	// log(1.86719), 1/1.86719
                 .quad	0x3fe40c7a4880dce9,	0x3fe11a3019a74826	// log(1.87109), 1/1.87109
                 .quad	0x3fe41d8fe84672ae,	0x3fe1111111111111	// log(1.87500), 1/1.87500
                 .quad	0x3fe42e9c6ddf80bf,	0x3fe107fbbe011080	// log(1.87891), 1/1.87891
                 .quad	0x3fe43f9fe2f9ce67,	0x3fe0fef010fef011	// log(1.88281), 1/1.88281
                 .quad	0x3fe4509a5133bb0a,	0x3fe0f5edfab325a2	// log(1.88672), 1/1.88672
                 .quad	0x3fe4618bc21c5ec2,	0x3fe0ecf56be69c90	// log(1.89062), 1/1.89062
                 .quad	0x3fe472743f33aaad,	0x3fe0e40655826011	// log(1.89453), 1/1.89453
                 .quad	0x3fe48353d1ea88df,	0x3fe0db20a88f4696	// log(1.89844), 1/1.89844
                 .quad	0x3fe4942a83a2fc07,	0x3fe0d24456359e3a	// log(1.90234), 1/1.90234
                 .quad	0x3fe4a4f85db03ebb,	0x3fe0c9714fbcda3b	// log(1.90625), 1/1.90625
                 .quad	0x3fe4b5bd6956e274,	0x3fe0c0a7868b4171	// log(1.91016), 1/1.91016
                 .quad	0x3fe4c679afccee3a,	0x3fe0b7e6ec259dc8	// log(1.91406), 1/1.91406
                 .quad	0x3fe4d72d3a39fd00,	0x3fe0af2f722eecb5	// log(1.91797), 1/1.91797
                 .quad	0x3fe4e7d811b75bb1,	0x3fe0a6810a6810a7	// log(1.92188), 1/1.92188
                 .quad	0x3fe4f87a3f5026e9,	0x3fe09ddba6af8360	// log(1.92578), 1/1.92578
                 .quad	0x3fe50913cc01686b,	0x3fe0953f39010954	// log(1.92969), 1/1.92969
                 .quad	0x3fe519a4c0ba3446,	0x3fe08cabb37565e2	// log(1.93359), 1/1.93359
                 .quad	0x3fe52a2d265bc5ab,	0x3fe0842108421084	// log(1.93750), 1/1.93750
                 .quad	0x3fe53aad05b99b7d,	0x3fe07b9f29b8eae2	// log(1.94141), 1/1.94141
                 .quad	0x3fe54b2467999498,	0x3fe073260a47f7c6	// log(1.94531), 1/1.94531
                 .quad	0x3fe55b9354b40bcd,	0x3fe06ab59c7912fb	// log(1.94922), 1/1.94922
                 .quad	0x3fe56bf9d5b3f399,	0x3fe0624dd2f1a9fc	// log(1.95312), 1/1.95312
                 .quad	0x3fe57c57f336f191,	0x3fe059eea0727586	// log(1.95703), 1/1.95703
                 .quad	0x3fe58cadb5cd7989,	0x3fe05197f7d73404	// log(1.96094), 1/1.96094
                 .quad	0x3fe59cfb25fae87e,	0x3fe04949cc1664c5	// log(1.96484), 1/1.96484
                 .quad	0x3fe5ad404c359f2d,	0x3fe0410410410410	// log(1.96875), 1/1.96875
                 .quad	0x3fe5bd7d30e71c73,	0x3fe038c6b78247fc	// log(1.97266), 1/1.97266
                 .quad	0x3fe5cdb1dc6c1765,	0x3fe03091b51f5e1a	// log(1.97656), 1/1.97656
                 .quad	0x3fe5ddde57149923,	0x3fe02864fc7729e9	// log(1.98047), 1/1.98047
                 .quad	0x3fe5ee02a9241675,	0x3fe0204081020408	// log(1.98438), 1/1.98438
                 .quad	0x3fe5fe1edad18919,	0x3fe0182436517a37	// log(1.98828), 1/1.98828
                 .quad	0x3fe60e32f44788d9,	0x3fe0101010101010	// log(1.99219), 1/1.99219
                 .quad	0x3fe61e3efda46467,	0x3fe0080402010080	// log(1.99609), 1/1.99609

.literal8
.align 3
one:            .quad				0x3ff0000000000000
onehalf:        .quad				0x3fe0000000000000
onethird:       .quad       0x3fd5555555555555
log2:           .quad       0x3fe62e42fefa39ef   // ln(2)

.text
#if defined( __x86_64__ )
	#define RELATIVE_ADDR( _a )			(_a)( %rip )
#else
	#define RELATIVE_ADDR( _a )			(_a)-asinhf_body( %ecx )

.align 4
asinhf_pic:
	movl			(%esp),								%ecx				// Copy address of this instruction to %ecx
	ret
#endif

ENTRY(asinhf)
#if defined(__i386__)
	movl			FRAME_SIZE(STACKP),		%eax
	movss			FRAME_SIZE(STACKP),		%xmm0
	calll			asinhf_pic
asinhf_body:
#else
	movd			%xmm0,								%eax
#endif
	andnpd		%xmm5,								%xmm5				// zero out xmm5
	
	andl			$0x7fffffff,					%eax				// eax <-- |x|
	
	movd			%eax,									%xmm1				// xmm1 <-- |x|
	cvtss2sd	%xmm1,								%xmm2				// xmm2 <-- (double)|x|
	andnps		%xmm0,								%xmm1				// xmm1 <-- signbit(x)
	
	subl			$0x3e800000,					%eax				// eax <-- |x| - 1/4 as integers
	cmpl			$0x02000000,					%eax				// if ( |x| > 4 or |x| < 1/4 or isnan(x) )
	ja				2f																//      goto 2
	
	// 1/4 < |x| < 4
	lea		RELATIVE_ADDR(asinhf_mid), DX_P
#if defined( __SSE3__ )
	movddup		%xmm2,								%xmm0				// xmm0 <-- [x, x]
#else
	movapd    %xmm2,                %xmm0
	unpcklpd  %xmm0,                %xmm0
#endif

	mulsd			%xmm2,								%xmm2				// xmm2 <-- x*x
	movapd		%xmm0,								%xmm3				// xmm3 <-- [x, x]
	movapd		%xmm0,								%xmm4				// xmm4 <-- [x, x]
	mulpd			48(DX_P),							%xmm3				// xmm3 <-- [p3x, q3x]
	mulpd			80(DX_P),							%xmm4				// xmm4 <-- [p5x, q5x]
	mulpd			16(DX_P),							%xmm0				// xmm0 <-- [p1x, q1x]
	movapd		96(DX_P),							%xmm6				// xmm6 <-- [p6, q6]
	
#if defined( __SSE3__ )
	movddup		%xmm2,								%xmm5				// xmm5 <-- [x2, x2]
#else
	movapd    %xmm2,                %xmm5
	unpcklpd  %xmm5,                %xmm5
#endif
	
	mulsd			%xmm2,								%xmm2				// xmm2 <-- x4
	mulpd			%xmm5,								%xmm6				// xmm6 <-- [p6x2, q6x2]
	addpd			32(DX_P),							%xmm3				// xmm3 <-- [p3x + p2, q3x + q2]
	addpd			64(DX_P),							%xmm4				// xmm4 <-- [p5x + p4, q5x + q4]
	addpd			(DX_P),								%xmm0				// xmm0 <-- [p1x + p0, q1x + q0]
	
	mulpd			%xmm3,								%xmm5				// xmm5 <-- [p3x3 + p2x2, q3x3 + q2x2]
	addpd			%xmm4,								%xmm6				// xmm6 <-- [p6x2 + p5x + p4, q6x2 + q5x + q4]
	unpcklpd	%xmm2,								%xmm2				// xmm2 <-- [x4, x4]
	mulpd			%xmm2,								%xmm6				// xmm6 <-- [p6x6 + p5x5 + p4x4, q6x6 + q5x5 + q4x4]
	addpd			%xmm0,								%xmm5				// xmm5 <-- [p3x3 + p2x2 + p1x + p0, q3x3 + q2x2 + q1x + q0]
	addpd			%xmm6,								%xmm5				// xmm5 <-- [p(x), q(x)]
	
	movhlps		%xmm5,								%xmm6				// xmm6 <-- q(x)
	divsd			%xmm6,								%xmm5				// xmm5 <-- p(x)/q(x)
	
	cvtsd2ss	%xmm5,								%xmm0
	orps			%xmm1,								%xmm0
#if defined(__i386__)
	movss			%xmm0,								FRAME_SIZE( STACKP )
	flds			FRAME_SIZE( STACKP )
#endif
	ret

2:
	jge				3f																// if ( |x| > 4 or isnan(x) ) goto 3
	
// Polynomial approximation to asinhf(x) for |x| < 1/4.  Details of the computation are at top with the constants.

	lea		RELATIVE_ADDR(asinhf_low), DX_P
	
#if defined( __SSE3__ )
	movddup		%xmm2,								%xmm0				// xmm0 <-- [x, x]
#else
	movapd    %xmm2,                %xmm0
	unpcklpd  %xmm0,                %xmm0
#endif

	cmppd			$0,				%xmm0,			%xmm5				// detect |x| = 0
	
	mulsd			48(DX_P),							%xmm2				// xmm2 <-- [..., cx]
	movapd		%xmm0,								%xmm3				// xmm3 <-- [x, x]
	addpd			32(DX_P),							%xmm0				// xmm0 <-- [x, x] + [b1hi, b1lo]
	movapd		%xmm3,								%xmm4				// xmm4 <-- [x, x]
	addpd			16(DX_P),							%xmm3				// xmm3 <-- [x, x] + [ahi, alo]
	mulpd			%xmm4,								%xmm0				// xmm0 <-- [x, x] * [x + b1hi, x + b1lo]
	mulsd			%xmm2,								%xmm3				// xmm3 <-- [x + ahi, cx * (x + alo)]
	addpd			(DX_P),								%xmm0				// xmm0 <-- [x^2 + b1hi x, x^2 + b1lo x] + [b0hi, b0lo]
	
	andnpd		%xmm3,								%xmm5				// if |x| = 0, set xmm3 = 0 to suppress inexact from the next multiply.
	
	mulpd			%xmm5,								%xmm0				// xmm0 <-- [(x + ahi)(x^2 + b1hi x + b0hi), cx(x + alo)(x^2 + b1lo x + b0lo)]
	movhlps		%xmm0,								%xmm2				// xmm2 <-- [..., (x + ahi)(x^2 + b1hi x + b0hi)]
	mulsd			%xmm2,								%xmm0				// xmm0 <-- cx(x + alo)(x + ahi)(x^2 + b1lo x + b0lo)(x^2 + b1hi x + b0hi)
	
	cvtsd2ss	%xmm0,								%xmm0
	orps			%xmm1,								%xmm0
#if defined(__i386__)
	movss			%xmm0,								FRAME_SIZE( STACKP )
	flds			FRAME_SIZE( STACKP )
#endif
	ret

3:
	addl			$0x3e800000,					%eax				// %eax <-- |x|
	cmpl			$0x7f800000,					%eax
	ja				4f																// if isnan(x)  goto 4
	je				5f																// if (|x| = ) goto 5
	
// Go off and compute the reciprocal of |x|.  This will take a little while, but we have other stuff to do
// while this is up in the air.	
	
		movsd		RELATIVE_ADDR( one ),	%xmm0
		divsd		%xmm2,								%xmm0				// xmm0 <-- 1/|x|
	
// We need to take a logarithm for this case.  This code is shamelessly taken from our log2f.s implementation.
// Consult that file for more details of operation.

// begin by factoring x as 2^n*(mantissa)

	movl			%eax,									%edx
	shrl			$23,									%edx				// right-align exponent bits
	subl			$126,									%edx				// subtract exponent bias, but add one to get log(2x) instead of log(x)
																							//    i.e. we subtract 126 instead of 127 (the actual bias).
	cvtsi2sd	%edx,									%xmm2				// xmm2 <-- trunc(lg |x|) = n
	mulsd		RELATIVE_ADDR(log2),		%xmm2				// xmm2 <-- log(2^n)
	
// now, let mantissa = 1 + a + b = (1 + a)(1 + r), if r = b/(1+a), and
//
//      log(mantissa) = log(1+a) + log(1+r).
//
// we look up log(1+a) and 1/(1+a) in a table, keyed from a, and we compute log(1+r) via taylor series.
	
	movl			%eax,									%edx
	andl			$0x00007fff,					%eax				// eax <-- b
	andl			$0x007f8000,					%edx				// edx <-- a << 15
	orl				$0x3f800000,					%eax				// eax <-- 1 + b
	shrl			$(15-4),							%edx				// edx <-- a << 4, lookup key for log(1+a)
	movd			%eax,									%xmm3				// xmm3 <-- 1 + b
	cvtss2sd	%xmm3,								%xmm3
	subsd		RELATIVE_ADDR( one ),		%xmm3				// xmm3 <-- b
	lea	RELATIVE_ADDR(asinhf_hi_table), AX_P
	mulsd			8(AX_P, DX_P, 1),			%xmm3				// xmm3 <-- r = b * 1/(1+a)
	movapd		%xmm3,								%xmm4
	mulsd	RELATIVE_ADDR(onethird),	%xmm3				// xmm3 <-- 1/3 r
	movapd		%xmm4,								%xmm5				// xmm5 <-- r
	mulsd			%xmm4,								%xmm4				// xmm4 <-- r^2
	subsd	RELATIVE_ADDR(onehalf),		%xmm3				// xmm3 <-- 1/3 r - 1/2
	mulsd			%xmm4,								%xmm3				// xmm3 <-- 1/3 r^3 - 1/2 r^2
	addsd			%xmm5,								%xmm3				// xmm3 <-- 1/3 r^3 - 1/2 r^2 + r ~ log(1+r)
	addsd			(AX_P, DX_P, 1),			%xmm3				// xmm3 <-- log(a) + log(1+r)
	addsd			%xmm2,								%xmm3				// xmm3 <-- log(2x) with error < .5002 ulps
	
	
// Now we compute a correction from a polynomial in 1/|x|, which conveniently has landed in %xmm0 about now.

	movsd			-8(AX_P),							%xmm6				// xmm6 <-- [ ... , c ]
	movapd		-32(AX_P),						%xmm7				// xmm7 <-- [ b1hi, b1lo ]
#if defined( __SSE3__ )
	movddup   %xmm0,								%xmm2       // xmm2 <-- [ 1/x , 1/x ]
#else
	movapd    %xmm0,								%xmm2
	unpcklpd  %xmm2,                %xmm2
#endif 
	mulsd			%xmm0,								%xmm6				// xmm6 <-- [ ... , c/x ]
	addpd			%xmm2,								%xmm7				// xmm7 <-- [ 1/x + b1hi, 1/x + b1lo ]
	addsd			-16(AX_P),						%xmm0				// xmm0 <-- [ ... , 1/x + a ]
	mulpd			%xmm7,								%xmm2				// xmm2 <-- [ 1/x^2 + b1hi/x, 1/x^2 + b1lo/x ]
	mulsd			%xmm6,								%xmm0       // xmm0 <-- [ ... , c/x(1/x + a) ]
	addpd			-48(AX_P),						%xmm2				// xmm2 <-- [ 1/x^2 + b1hi/x + b0hi, 1/x^2 + b1lo/x + b0lo ]
	movhlps		%xmm2,								%xmm4				// xmm4 <-- [ ... , 1/x^2 + b1hi/x + b0hi ]
	mulsd			%xmm2,								%xmm0				// xmm0 <-- [ ... , c/x(1/x + a)(1/x^2 + b1lo/x + b0lo) ]
	mulsd			%xmm4,								%xmm0				// xmm0 <-- [ ... , c/x(1/x + a)(1/x^2 + b1lo/x + b0lo)(1/x^2 + b1hi/x + b0hi) ]
	
// Add log(2x) to the correction to get the final result	
	
	addsd			%xmm3,								%xmm0
	cvtsd2ss	%xmm0,								%xmm0
	orps			%xmm1,								%xmm0
#if defined(__i386__)
	movss			%xmm0,								FRAME_SIZE( STACKP )
	flds			FRAME_SIZE( STACKP )
#endif
	ret

4:
	addss			%xmm0,								%xmm0				// quiet the NaN
5:
	#if defined(__i386__)
	movss			%xmm0,								FRAME_SIZE( STACKP )
	flds			FRAME_SIZE( STACKP )
#endif
	ret