roundl.s [plain text]

/*
 *  roundl.s
 *
 *		by Ian Ollmann
 *
 *  Copyright (c) 2007 Apple Inc. All rights reserved.
 *
 *	Implementation for C99 round, lround and llround functions for __i386__ and __x86_64__.
 */

#include "machine/asm.h"

#define LOCAL_STACK_SIZE	12
#include "abi.h"

.literal8
zero:           .long   0,          0x80000000      // { 0.0f, -0.0f }
one:            .long   0x3f800000, 0xbf800000      // { 1.0f, -1.0f }
large:          .long   0x5f000000, 0xdf000000      // { 0x1.0p63, -0x1.0p63 }

.literal16
explicitBit:    .quad   0x8000000000000000,     0
roundMask62:    .quad   0xFFFFFFFFFFFFFFFE,     0

.text
#if defined( __x86_64__ )
ENTRY( roundl )
    movzwq  8+FRAME_SIZE( STACKP ),     %rdx    // sign + biased exponent
    movq    FRAME_SIZE( STACKP ),       %rax    // mantissa
    fldt    FRAME_SIZE( STACKP )                // {x}
    movq    %rdx,                       %r8     // sign + biased exponent
    andq    $0x7fff,                    %rdx    // exponent + bias
    shrq    $15,                        %r8     // x < 0 ? 1 : 0
    subq    $0x3ffe,                    %rdx    // push |x| < 0.5 negative
    cmp     $(63),                      %rdx    // if( |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x) )
    jae     1f                                  //      goto 1
  
//  |x| >= 0.5 and conversion does not overflow.
    movq    $63,                        %rcx    // 63
    subq    %rdx,                       %rcx    // 63-(exponent+1)
    leaq    large(%rip),                %r9     // address of large array
    fadds   (%r9, %r8, 4)                       // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) }       set inexact as necessary
    shrq    %cl,                        %rax    // shift units bit into 2's position
    fstp    %st(0)                              // { }
    addq    $1,                         %rax    // round away from zero
    shrq    $1,                         %rax    // shift units bit to 1's position

    // find new exponent
    bsrq    %rax,                       %r9     // position of leading set bit. rax is never zero.
    movq    $0x3fff,                    %rdx    // bias
    movq    $63,                        %rcx    // 63
    addq    %r9,                        %rdx    // biased exponent
    subq    %r9,                        %rcx    // 63 - position of leading set bit
    movw    %dx,        8+FRAME_SIZE( STACKP )  // write out new exponent
    
    // shift significand into position
    shlq    %cl,                        %rax    // shift leading bit to higest position
    movq    %rax,       FRAME_SIZE( STACKP )    // write mantissa

    // get sign
    fldt    FRAME_SIZE( STACKP )                // { |result| }
    leaq    one( %rip ),                %rax    // address of one array
    fmuls   (%rax, %r8, 4 )                     // { result }       multiply by +1 or -1 according to sign of original result
    ret
    
//  |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)
1:  je      3f
    jg      2f

//  |x| < 0.5
    fistpl  FRAME_SIZE( STACKP )                // { } set inexact if x != 0
    leaq    zero( %rip),                %rax    // address of zero array
    flds    (%rax, %r8, 4 )                     // load result
2:  ret
  
//  0x1.0p62 <= |x| < 0x1.0p63
3:  leaq    large(%rip),                %r9     // address of large array
    fadds   (%r9, %r8, 4)                       // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) }       set inexact as necessary
    fstp    %st(0)                              // { }
    addq    $1,                         %rax    // add 0.5 to significand
    jz      4f                                  // handle overflow
    
    andq    roundMask62(%rip),          %rax    // prune fractional bits
    movq    %rax,                       FRAME_SIZE( STACKP )    // write to mantissa
    fldt    FRAME_SIZE( STACKP )                // load result
    ret

// result is +- 0x1.0p63
4:  flds    (%r9, %r8, 4)                       // load result
    ret
    
    

#else
ENTRY( roundl )
    movzwl  8+FRAME_SIZE( STACKP ),     %edx
    movq    FRAME_SIZE( STACKP ),       %xmm0
    fldt    FRAME_SIZE( STACKP )
    calll   0f
0:  popl    %ecx
    movl    %edx,                       %eax    // sign + biased exponent
    andl    $0x7fff,                    %edx    // biased exponent
    shrl    $15,                        %eax    // signof( x )
    subl    $0x3ffe,                    %edx    // push |x| < 0.5 negative
    cmp     $63,                        %edx    // if( |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x) )
    jae     1f                                  //      goto 1

//  |x| >= 0.5 and conversion does not overflow.
    subl    $63,                        %edx    // (exponent+1) - 63
    fadds   (large-0b)(%ecx, %eax, 4)           // set inexact if necessary
    negl    %edx                                // 63 - (exponent+1)
    fstp    %st(0)                              // {}
    movd    %edx,                       %xmm1   // 63 - (exponent+1)
    psrlq   %xmm1,                      %xmm0   // move 0.5 bit to units position
    pcmpeqb %xmm1,                      %xmm1   // -1
    psubq   %xmm1,                      %xmm0   // add 1
    psrlq   $1,                         %xmm0   // move 1's bit to units position
    movq    %xmm0,                      FRAME_SIZE( STACKP )    // write out
    
    fildll  FRAME_SIZE( STACKP )                // { |result| }
    fmuls   (one-0b)(%ecx, %eax, 4)             // { result }
    ret


//  |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)
1:  je      3f
    jg      2f

//  |x| < 0.5
    fistpl  FRAME_SIZE( STACKP )                // { } set inexact if x != 0
    flds    (zero-0b)(%ecx, %eax, 4 )                     // load result
2:  ret

//  0x1.0p62 <= |x| < 0x1.0p63
3:  fadds   (large-0b)(%ecx, %eax, 4)           // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) }       set inexact as necessary
    fstp    %st(0)                              // { }
    movdqa  %xmm0,                      %xmm2   // significand
    pcmpeqb %xmm1,                      %xmm1   // -1LL
    psubq   %xmm1,                      %xmm0   // add 0.5 to significand
    pxor    %xmm0,                      %xmm2   // set leading bit if leading bit changed (overflow)
    movmskpd    %xmm2,                  %edx
    test    $1,                         %edx
    jnz     4f
    
    pand    (roundMask62-0b)(%ecx),     %xmm0    // prune fractional bits
    movq    %xmm0,                      FRAME_SIZE( STACKP )    // write to mantissa
    fldt    FRAME_SIZE( STACKP )                // load result
    ret

// result is +- 0x1.0p63
4:  flds    (large-0b)(%ecx, %eax, 4)           // load result
    ret


#endif

	
#if defined ( __i386__ )

    ENTRY( round )
        movsd       FRAME_SIZE( STACKP ),   %xmm0
        fldl        FRAME_SIZE( STACKP )
        xorps       %xmm1,                  %xmm1
        ucomisd     %xmm0,                  %xmm1
        psrlq       $32,                    %xmm0
        je          1f
        movd        %xmm0,                  %ecx
        movl        %ecx,                   %eax
        andl        $0x7fffffff,            %ecx
        cmpl        $0x43300000,            %ecx
        jae         1f

        //truncate
        fld         %st(0)                      // { x, x }
    #if defined( __SSE3__ )    
        fisttpll    FRAME_SIZE( STACKP )
        andl        $0x80000000,            %eax
        orl         $0x3f800000,            %eax
        fildll      FRAME_SIZE( STACKP )
    #else
        fnstcw      FRAME_SIZE( STACKP )
        andl        $0x80000000,            %eax
        orl         $0x3f800000,            %eax
        movw        FRAME_SIZE( STACKP ),       %cx
        movw        %cx,                        %dx
        orw         $0xc00,                     %cx
        movw        %cx,                        FRAME_SIZE( STACKP )
        fldcw       FRAME_SIZE( STACKP )
        frndint
        movw        %dx,                        FRAME_SIZE( STACKP )
        fldcw       FRAME_SIZE( STACKP )    
    #endif
        
        fsubr       %st(0), %st(1)              // { trunc(x), fract }
        fxch                                    // { fract, trunc(x) }
        fstpl       FRAME_SIZE(STACKP)          // { trunc(x) }
        movl        4+FRAME_SIZE(STACKP), %ecx
        andl        $0x3fe00000,        %ecx
        cmpl        $0x3fe00000,        %ecx
        jne         1f
        movl        %eax,   FRAME_SIZE( STACKP )
        flds        FRAME_SIZE(STACKP)          // { copysign( 1.0, x ), trunc(x) }
        faddp                                   // { round(x) }
        
    1:  ret

#else //x86_64

    ENTRY( round )
        movd    %xmm0,                      %rcx
        movq    $-1,                        %rdx            // -1ULL
        movq    $0x43300000,                %rax            
        shrq    $1,                         %rdx            // 0x7fffffffffffffff
        shlq    $32,                        %rax            // 0x1.0p52
        andq    %rdx,                       %rcx            // |x|
        subq    $1,                         %rcx            // push 0 negative
        cmpq    %rax,                       %rcx            // |x| - 1 ulp >= 0x1.0p52?
        jae     1f                                          // if x == 0.0f, x >= 2**23 or x is NaN, skip to the end

        //find trunc(x)
        cvttsd2si   %xmm0,                  %rax
        movl        $0x3ff00000,            %edx
        movl        $0x80000000,            %ecx
		movsd		%xmm0,					%xmm1
        cvtsi2sd    %rax,                   %xmm0           // trunc(x)
        subsd       %xmm0,                  %xmm1           // fraction
        movd        %xmm1,                  %rax
        shrq        $32,                    %rax			// top 32-bits of the fraction
        andl        %eax,                   %ecx			// set aside sign bit
        andl        %edx,                   %eax			// and with 0x3ff00000 to remove sign (mantissa unimportant here)
        cmpl        $0x3fe00000,            %eax			// if the fraction was less than 0.5
        jb          1f										//  then exit

        orl         %ecx,                   %edx			// copysign( 1.0, x ) >> 32
        shlq        $32,                    %rdx			// copysign( 1.0, x )
        movd        %rdx,                   %xmm1			// move to vr
        addsd       %xmm1,                  %xmm0			// round trunc(x) away from zero
        
    1:  ret

#endif


ENTRY( roundf )
#if defined( __i386__ )
    movl    FRAME_SIZE( STACKP ),       %ecx
    movss   FRAME_SIZE( STACKP ),       %xmm0
#else
    movd    %xmm0,                      %ecx
#endif
    movl    %ecx,                       %edx
    andl    $0x7fffffff,                %ecx
    subl    $1,                         %ecx            // push 0 negative
    cmpl    $0x4affffff,                %ecx            
    jae     1f                                          // if x == 0.0f, x >= 2**23 or x is NaN, skip to the end

    cvttps2dq   %xmm0,                  %xmm1           // Find trunc(x) as integer
    andl    $0x80000000,                %edx            // signof(x)
    movl    $0x3f000000,                %ecx            // 0.5f
    cvtdq2ps    %xmm1,                  %xmm1           // trunc(x) as float
    orl     $0x3f800000,                %edx            // copysign( 1.0f, x )
    movd    %ecx,                       %xmm3           // 0.5f

    subss   %xmm1,                      %xmm0           // fract = x - trunc(x):  calculate the fractional part
    movd    %edx,                       %xmm2           // copysign( 1.0f, x )
    andps   %xmm3,                      %xmm0           // |fract| >= 0.5f ? 0.5f : something else 
    pcmpeqd %xmm3,                      %xmm0           // |fract| >= 0.5f ? -1U : 0
    andps   %xmm2,                      %xmm0           // |fract| >= 0.5f ? copysign( 1.0f, x ) : 0
    addss   %xmm1,                      %xmm0           // roundf(x)

#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE( STACKP )
#endif

1:
#if defined( __i386__ )
    flds        FRAME_SIZE( STACKP )
#endif
    ret