// roundl.s [plain text]

/*
 *  roundl.s
 *
 *		by Ian Ollmann
 *
 *  Copyright (c) 2007 Apple Inc. All rights reserved.
 *
 *	Implementation of the C99 roundl function (long double, halfway cases rounded away from zero) for __i386__ and __x86_64__.
 */

#include "machine/asm.h"

#define LOCAL_STACK_SIZE	12
#include "abi.h"

.literal8
zero:           .long   0,          0x80000000      // { 0.0f, -0.0f }
one:            .long   0x3f800000, 0xbf800000      // { 1.0f, -1.0f }
large:          .long   0x5f000000, 0xdf000000      // { 0x1.0p63, -0x1.0p63 }

.literal16
explicitBit:    .quad   0x8000000000000000,     0
roundMask62:    .quad   0xFFFFFFFFFFFFFFFE,     0

.text
#if defined( __x86_64__ )
ENTRY( roundl )
    movzwq  8+FRAME_SIZE( STACKP ),     %rdx    // sign + biased exponent
    movq    FRAME_SIZE( STACKP ),       %rax    // mantissa
    fldt    FRAME_SIZE( STACKP )                // {x}
    movq    %rdx,                       %r8     // sign + biased exponent
    andq    $0x7fff,                    %rdx    // exponent + bias
    shrq    $15,                        %r8     // x < 0 ? 1 : 0
    subq    $0x3ffe,                    %rdx    // push |x| < 0.5 negative
    cmp     $(63),                      %rdx    // if( |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x) )
    jae     1f                                  //      goto 1
  
//  |x| >= 0.5 and conversion does not overflow.
    movq    $63,                        %rcx    // 63
    subq    %rdx,                       %rcx    // 63-(exponent+1)
    leaq    large(%rip),                %r9     // address of large array
    fadds   (%r9, %r8, 4)                       // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) }       set inexact as necessary
    shrq    %cl,                        %rax    // shift units bit into 2's position
    fstp    %st(0)                              // { }
    addq    $1,                         %rax    // round away from zero
    shrq    $1,                         %rax    // shift units bit to 1's position

    // find new exponent
    bsrq    %rax,                       %r9     // position of leading set bit. rax is never zero.
    movq    $0x3fff,                    %rdx    // bias
    movq    $63,                        %rcx    // 63
    addq    %r9,                        %rdx    // biased exponent
    subq    %r9,                        %rcx    // 63 - position of leading set bit
    movw    %dx,        8+FRAME_SIZE( STACKP )  // write out new exponent
    
    // shift significand into position
    shlq    %cl,                        %rax    // shift leading bit to higest position
    movq    %rax,       FRAME_SIZE( STACKP )    // write mantissa

    // get sign
    fldt    FRAME_SIZE( STACKP )                // { |result| }
    leaq    one( %rip ),                %rax    // address of one array
    fmuls   (%rax, %r8, 4 )                     // { result }       multiply by +1 or -1 according to sign of original result
    ret
    
//  |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)
1:  je      3f
    jg      2f

//  |x| < 0.5
    fistpl  FRAME_SIZE( STACKP )                // { } set inexact if x != 0
    leaq    zero( %rip),                %rax    // address of zero array
    flds    (%rax, %r8, 4 )                     // load result
2:  ret
  
//  0x1.0p62 <= |x| < 0x1.0p63
3:  leaq    large(%rip),                %r9     // address of large array
    fadds   (%r9, %r8, 4)                       // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) }       set inexact as necessary
    fstp    %st(0)                              // { }
    addq    $1,                         %rax    // add 0.5 to significand
    jz      4f                                  // handle overflow
    
    andq    roundMask62(%rip),          %rax    // prune fractional bits
    movq    %rax,                       FRAME_SIZE( STACKP )    // write to mantissa
    fldt    FRAME_SIZE( STACKP )                // load result
    ret

// result is +- 0x1.0p63
4:  flds    (%r9, %r8, 4)                       // load result
    ret
    
    

#else
ENTRY( roundl )
    movzwl  8+FRAME_SIZE( STACKP ),     %edx
    movq    FRAME_SIZE( STACKP ),       %xmm0
    fldt    FRAME_SIZE( STACKP )
    calll   0f
0:  popl    %ecx
    movl    %edx,                       %eax    // sign + biased exponent
    andl    $0x7fff,                    %edx    // biased exponent
    shrl    $15,                        %eax    // signof( x )
    subl    $0x3ffe,                    %edx    // push |x| < 0.5 negative
    cmp     $63,                        %edx    // if( |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x) )
    jae     1f                                  //      goto 1

//  |x| >= 0.5 and conversion does not overflow.
    subl    $63,                        %edx    // (exponent+1) - 63
    fadds   (large-0b)(%ecx, %eax, 4)           // set inexact if necessary
    negl    %edx                                // 63 - (exponent+1)
    fstp    %st(0)                              // {}
    movd    %edx,                       %xmm1   // 63 - (exponent+1)
    psrlq   %xmm1,                      %xmm0   // move 0.5 bit to units position
    pcmpeqb %xmm1,                      %xmm1   // -1
    psubq   %xmm1,                      %xmm0   // add 1
    psrlq   $1,                         %xmm0   // move 1's bit to units position
    movq    %xmm0,                      FRAME_SIZE( STACKP )    // write out
    
    fildll  FRAME_SIZE( STACKP )                // { |result| }
    fmuls   (one-0b)(%ecx, %eax, 4)             // { result }
    ret


//  |x| >= 0x1.0p62 || |x| < 0.5 || isnan(x)
1:  je      3f
    jg      2f

//  |x| < 0.5
    fistpl  FRAME_SIZE( STACKP )                // { } set inexact if x != 0
    flds    (zero-0b)(%ecx, %eax, 4 )                     // load result
2:  ret

//  0x1.0p62 <= |x| < 0x1.0p63
3:  fadds   (large-0b)(%ecx, %eax, 4)           // { x + (x < 0 ? -0x1.0p63 : 0x1.0p63) }       set inexact as necessary
    fstp    %st(0)                              // { }
    movdqa  %xmm0,                      %xmm2   // significand
    pcmpeqb %xmm1,                      %xmm1   // -1LL
    psubq   %xmm1,                      %xmm0   // add 0.5 to significand
    pxor    %xmm0,                      %xmm2   // set leading bit if leading bit changed (overflow)
    movmskpd    %xmm2,                  %edx
    test    $1,                         %edx
    jnz     4f
    
    pand    (roundMask62-0b)(%ecx),     %xmm0    // prune fractional bits
    movq    %xmm0,                      FRAME_SIZE( STACKP )    // write to mantissa
    fldt    FRAME_SIZE( STACKP )                // load result
    ret

// result is +- 0x1.0p63
4:  flds    (large-0b)(%ecx, %eax, 4)           // load result
    ret


#endif