/* Single-precision floorf, reimplemented using integer operations
 * for improved performance, especially on in-order machines.
 *
 * Steve Canon, March 2009.
 */
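
/* For reference, the no-SSE4.1 integer path below is roughly equivalent
 * to this C sketch (illustrative only; bits() and from_bits() are
 * hypothetical helpers moving a float in and out of a uint32_t, and the
 * sketch omits the inexact flag that the assembly raises via cvttps2dq):
 *
 *	float floorf_sketch(float x) {
 *		uint32_t ix = bits(x);
 *		int e = (int)((ix >> 23) & 0xff) - 0x7f;	// unbiased exponent
 *		if (e < 0)					// |x| < 1.0
 *			return (ix > 0x80000000u) ? -1.0f : copysignf(0.0f, x);
 *		if (e > 22)					// integral, NaN, or Inf
 *			return x;
 *		uint32_t m = 0xffffffffu << (23 - e);		// mask for integral bits
 *		if (ix >> 31)					// x < 0: bias by 1.0 - ulp(x)
 *			ix += ~m;				// so truncation rounds down
 *		return from_bits(ix & m);			// clear the fraction bits
 *	}
 */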

#if defined __i386__
 
#include <System/i386/cpu_capabilities.h>
.set cpubits, _COMM_PAGE_CPU_CAPABILITIES
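// cpubits is the address of the commpage CPU-capability word; the kernel
// sets the kHasSSE4_1 bit there when the CPU supports SSE 4.1.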

.text
.align 4
.globl _floorf
_floorf:
	movss	  4(%esp),			%xmm0	// load argument
	testl	  $(kHasSSE4_1),	cpubits	// does this CPU have SSE 4.1?
	jz			0f						// if not, use the integer path
	
	// fast path using SSE 4.1
	roundss		$0x1,	%xmm0,	%xmm0	// round toward -inf
	movss		%xmm0,		  4(%esp)
	flds	  4(%esp)
	ret

.align 4
0:	// no SSE 4.1
	mov		  4(%esp),			%eax	// x (bits)
	mov			$23,			%cl
	mov			%eax,			%edx	// x (bits)
	shr			%cl,			%eax	// sign and biased exponent of x
	dec			%edx					// x - 1 (bits)
	sub			$0x7f,			%al		// unbiased exponent of x
	jb			2f						// if |x| < 1.0, goto 2
	
	sub			%al,			%cl		// 23 - exponent = number of fraction bits
	mov			$0xffffffff,	%eax
	jbe			1f						// if exponent >= 23, x is integral, NaN, or Inf; return x

	shl			%cl,			%eax	// m = mask for integral bits of x
	mov			%edx,			%ecx	// x - 1 (bits)
	not			%edx
	sar			$31,			%edx	// (x > 0) ? -1 : 0
	or			%eax,			%edx	// (x > 0) ? -1 : m
	sub			%edx,			%ecx	// (x < 0) ? x - (1.0 - ulp(x)) : x
	and			%ecx,			%eax	// floor(x)
	mov			%eax,		  4(%esp)
	cvttps2dq	%xmm0,			%xmm0	// set inexact
1:	flds	  4(%esp)
	ret
.align 4
2:	cvttps2dq	%xmm0,			%xmm0	// set inexact
	cmp			$0xffffffff,	%edx	// if x < 0.0, goto 3
	jl			3f
	andl		$0x80000000,  4(%esp)	// copysign(0.0, x)
	flds	  4(%esp)
	ret
.align 4
3:	movl		$0xbf800000,  4(%esp)	// return -1.0
	flds	  4(%esp)
	ret

#elif defined __x86_64__

.const
.align 4
mone:	.long	0xbf800000				// -1.0f
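
/* The branchless path below computes floor(x) as trunc(x) plus a
 * correction, roughly equivalent to this C sketch (illustrative only):
 *
 *	float t = (float)(int32_t)x;		// cvttps2dq + cvtdq2ps
 *	float adj = (x < t) ? -1.0f : 0.0f;	// cmpltss + andps with mone
 *	return t + adj;				// addss
 *
 * x < trunc(x) holds exactly when x is negative and non-integral,
 * the one case where floor(x) = trunc(x) - 1.
 */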

.text
.align 4
.globl _floorf
_floorf:
    movd		%xmm0,			%ecx
    andl		$0x7fffffff,	%ecx	// |x|
    subl		$1,				%ecx	// |x| - 1; wraps +-0 to 0xffffffff so the compare below catches it
    cmpl		$0x4afffffe,	%ecx	// |x| >= 0x4b000000 (2^23) means x is integral, NaN, or Inf
    ja			1f						// unsigned compare also catches +-0 via the wrap above

    cvttps2dq   %xmm0,			%xmm2	// trunc(x) as int32; sets inexact if x is fractional
    cvtdq2ps    %xmm2,			%xmm2	// trunc(x)
    cmpltss		%xmm2,			%xmm0	// (x < trunc(x)) ? all ones : 0
    andps		mone(%rip),		%xmm0	// (x < trunc(x)) ? -1.0f : 0.0f
    addss		%xmm2,			%xmm0	// floor(x)
1:	ret

#else
	#error unknown arch
#endif