# log2f.s   [plain text]

```/*
*  log2f.s
*
*      by Ian Ollmann
*
*/

#include <machine/asm.h>
#include "abi.h"

.const
.align 4

//  256 entry of Lookup table of values used for log2 calculation, generated as:
//
//  [ollmia:/tmp] iano% cat main.c
//  #include <stdio.h>
//  #include <stdint.h>
//  #include <math.h>
//
//   int main (void)
//    {
//
//      int i;
//
//      for( i = 0; i < 256; i++ )
//      {
//          double d = (double) i / 256.0 + 1.0;
//  	union
//  	{
//  		double 	d;
//  		uint64_t	u;
//  	}u, u2;
//
//  	u.d = log2l( (long double) d );
//  	u2.d = 1.0 / d;
//
//          printf( "0x%llx, 0x%llx,\t//log2(%7.5f), 1/%7.5f\n", u.u, u2.u, d, d );
//      }
//
//      return 0;
//   }

log2f_table:    .quad	               0x0,	0x3ff0000000000000	//log2(1.00000), 1/1.00000

.literal8
.align 3
one:            .double     1.0
onehalf:        .double     0.5
onequarter:     .double     0.25
onefifth:       .double     0.2
recip_log2:     .quad       0x3ff71547652b82fe      // 1.0 / ln(2)

.text

#if defined( __x86_64__ )
#define RELATIVE_ADDR( _a)								(_a)( %rip )
#define RELATIVE_ADDR_B( _a)							(_a)( %rip )
#define RELATIVE_ADDR2( _a, _i, _step)                  ( %r8, _i, _step )
#elif defined( __i386__ )

//a short routine to get the local address
.align 4
ret
#else
#error arch not supported
#endif

//
//  log2f -- overall approach
//
//      We break up log2f(x) as follows:
//
//          i = ilogb(x)                //integer part of the result
//          m = scalbn( x, -i )         //mantissa has range 1.0 <= m < 2.0
//
//      log2f(x) = i + log2f(m)
//
//      We further break down m as :
//
//          m = (1+a/256.0)(1+r)              a = high 8 explicit bits of mantissa(m)
//          log2f(m) = log2(1+a/256.0) + log2(1+r)
//
//      We use the high 8 bits of the mantissa to look up log2(1+a/256.0) in log2f_table above
//      We calculate 1+r as:
//
//          1+r = m * (1 /(1+a/256.0))
//
//      We can lookup (from the same table) the value of 1/(1+a/256.0) based on a too.
//
//      So the whole calculation is:
//
//          log2f(x) = i + log2(1+a/256.0) + log2(1+r) = i + log2f_table[a][0] + log2f( m * log2f_table[a][1] )
//
//      The third term is calculated using the Taylor series:
//
//          log(x+1) = x - x**2/2 + x**3/3
//
//      Adding one more term produces excellent results, but this is good enough to get within 0.519028 ulps
//      for the entire single precision number range and saves 7 cycles.
//
//      The edge case code is done in integer to avoid setting flags, and because it is faster that way.
//

ENTRY( log2f )
#if defined( __i386__ )
movl    FRAME_SIZE( STACKP ),   %eax
#else
movd    %xmm0,                  %eax
#endif

//Get exceptional cases out of the way
//push NaNs around to negative
movl    %eax,                   %ecx                // set aside x
addl    \$0x00800000,            %eax                // push Infs, NaNs negative
xorps   %xmm1,                  %xmm1               // 0.0f
cmpl    \$0x00800000,            %eax                // if( x <= 0 or Inf, NaN )
jle     2f                                          //      goto 2

//Check for narrow range near 1 where 3rd order Taylor series is not enough
subl    \$0x3ff80000,            %eax                // subtract low value in our range (+0x00800000)
cmpl    \$0x000d8000,            %eax                // check to see if we are in the range
jb      7f

movl    \$-127,                  %edx                // bias of float
cmpl    \$0x00800000,            %ecx                // if isnormal( x )
jae     1f                                          //      goto 1

//deal with denormal
subl    \$126,                   %edx                // set aside -126 as a bias adjustment (was 0 )
orl     \$0x3f800000,            %ecx                // multiply low bits by 2**127 by oring in 1.0
movl    \$0x3f800000,            %eax                // prepare 1.0f
movd    %ecx,                   %xmm3               // copy to vr
movd    %eax,                   %xmm4               // copy to vr
subss   %xmm4,                  %xmm3               // (x | 2**127) - 1.0f
movd    %xmm3,                  %ecx                // copy normalized scaled number back to %ecx

1:  //normal operand
movl    %ecx,                   %eax                // x
andl    \$0x7f800000,            %ecx                // exponent of x
cmpl    %ecx,                   %eax                // is this a power of 2?
je      6f                                          //  early out for power of 2, saves inexact flag for exact cases

//non-power of two
xorl    %ecx,                   %eax                // mask off exponent
shrl    \$23,                    %ecx                // push exponent to unit precision
addl    %edx,                   %ecx                // remove bias from exponent part
movl    %eax,                   %edx                // set aside mantissa bits
cvtsi2sd  %ecx,                 %xmm0               // move integer part of the result to xmm0.
andl    \$0x00007fff,            %eax                // isolate low bits of mantissa
andl    \$0x007f8000,            %edx                // isolate high bits of mantissa
orl     \$0x3f800000,            %eax                // set exponent of the mantissa to 0 + bias
shrl    \$(15-4),                %edx
movd    %eax,                   %xmm1               // move reduced mantissa to xmm0 (1+r)
cvtss2sd    %xmm1,              %xmm1

//do PIC
#if defined( __i386__ )
calll   log2f_pic                                   // set %ecx to point to local_addr
#else
andq    \$0xff0,                 %rdx
#endif

subsd   RELATIVE_ADDR( one ),   %xmm1               // r = (1+r) - 1.0

mulsd   8( AX_P, DX_P, 1),      %xmm1               // r *= 1 / (1+a)

//Do a relatively small/cheap Taylor series:    log2(r+1) = (r - 0.5rr + (1/3)rrr)/ln(2)
movsd   RELATIVE_ADDR( onethird), %xmm3             // 1/3
movapd  %xmm1,                  %xmm2               // r
mulsd   %xmm1,                  %xmm3               // (1/3)r
mulsd   %xmm2,                  %xmm2               // rr
subsd   RELATIVE_ADDR( onehalf),%xmm3               // -0.5 + (1/3)r
mulsd   %xmm2,                  %xmm3               // -0.5rr + (1/3)rrr
addsd   %xmm1,                  %xmm3               // r - 0.5rr + (1/3)rrr
mulsd   RELATIVE_ADDR( recip_log2 ), %xmm3          // (r - 0.5rr + (1/3)rrr)/ln(2)
addsd   ( AX_P, DX_P, 1),       %xmm3               // log2(1+a) + (r - 0.5rr + (1/3)rrr)/ln(2)
cvtsd2ss %xmm0,                 %xmm0               // round to float

#if defined( __i386__ )
movss   %xmm0,                  FRAME_SIZE(STACKP)
flds    FRAME_SIZE(STACKP)
#endif
ret

2:  //special case code for negative, NaN or 0
#if defined( __i386__ )
movss   FRAME_SIZE( STACKP ),   %xmm0
#endif
ucomiss %xmm1,                  %xmm0               //test for x == 0
jb      4f                                          //NaN or negative

//We imagine that 0.0, Inf are the most common cases, so those falls through
ja      3f                                          //Infinity, just return inf

//We need to return -Inf for zero and set div/0 flag
pcmpeqb %xmm0,                  %xmm0               // -1U
cvtdq2ps %xmm0,                 %xmm0               // -1.0f
divss   %xmm1,                  %xmm0               // -1.0f / 0 = -Inf + div/0 flag
#if defined( __i386__ )
movss   %xmm0,                  FRAME_SIZE(STACKP)
#endif
3:
#if defined( __i386__ )
flds    FRAME_SIZE(STACKP)
#endif
ret

4:  jp      5f                                          // handle NaN elsewhere

//negative number, return NaN and set invalid
pcmpeqb %xmm0,                  %xmm0               // -1U
pslld   \$23,                    %xmm0               // 0xff800000, -inf
mulss   %xmm1,                  %xmm0               // 0 * -inf = NaN, set invalid
#if defined( __i386__ )
movss   %xmm0,                  FRAME_SIZE(STACKP)
flds    FRAME_SIZE(STACKP)
#endif
ret

//Its a NaN
5:
#if defined( __i386__ )
flds    FRAME_SIZE(STACKP )                         //load the NaN
#else
#endif
ret

6:  //early out for power of 2
shrl    \$23,                    %eax
#if defined( __i386__ )
movl    %eax,                   FRAME_SIZE( STACKP )
fildl    FRAME_SIZE(STACKP)
#else
cvtsi2ss %eax,                  %xmm0
#endif
ret

7:
#if defined( __i386__ )
cvtss2sd FRAME_SIZE(STACKP),    %xmm2               // convert to double
xorps   %xmm0,                  %xmm0               // zero xmm0

//do PIC
calll   log2f_pic                                   // set %ecx to point to local_addr
#else
cvtss2sd    %xmm0,              %xmm2               // convert to double
#endif

subsd   RELATIVE_ADDR_B( one ), %xmm2               // r = x - 1.0
movapd  %xmm2,                  %xmm1               // r again
mulsd   %xmm2,                  %xmm2

//Do a more expensive Taylor series:    log2(r+1) = (r - 0.5rr + (1/3)rrr - 0.25rrrr + 0.2rrrrr - 1/6rrrrrr)/ln(2)
movsd   RELATIVE_ADDR_B( onesixth ), %xmm4         // 1/6
movsd   RELATIVE_ADDR_B( onefifth ), %xmm3         // 0.2
mulsd   %xmm2,                  %xmm4               // 1/6rr
mulsd   %xmm2,                  %xmm3               // 0.2rr
mulsd   %xmm2,                  %xmm4               // 0.25rr + 1/6rrrr
mulsd   %xmm2,                  %xmm3