asm-block-18.c   [plain text]


/* APPLE LOCAL file CW asm blocks */
/* { dg-do assemble { target i?86*-*-darwin* } } */
/* { dg-require-effective-target ilp32 } */
/* { dg-options { -fasm-blocks -msse3 -O2 } } */
/* Radar 4248228 */

int packedw0x80;
typedef int DWORD;
typedef unsigned char unsigned8;
typedef int int32;
#define M_m0		0
#define M_m8		8
#define M_m16		16
#define M_m24		24

extern void e1(const unsigned8 *, unsigned8 *, int32, int32, int32, int32);

typedef struct
{
  DWORD	m0[2];
  DWORD	m8[2];
  DWORD	m16[2];
  DWORD	m24[2];
} M_2;

void
e2(const unsigned8 *srcPtr, unsigned8 *dstPtr, int32 rows, int32 cols,
   int32 sRowBytes, int32 dRowBytes)
{
  int32	sRowB, dRowB, MMXColCnt, r0sum, r0sq, extras;
  M_2	qArray, *pqArray;
  if (rows <= 0  ||  cols <= 0)
    return;
  if (rows <= 1  ||  cols <= 7)
    {
      e1(srcPtr, dstPtr, rows, cols, sRowBytes, dRowBytes);
      return;
    }
  asm {
	mov		ebx, cols
	sub		rows, 1
	mov		ecx, ebx
	and		ecx, 3
	sar		ebx, 2
	mov		extras, ecx
	mov		MMXColCnt, ebx
	mov		eax, sRowBytes
	mov		ebx, cols
   	mov		ecx, dRowBytes
	and		ebx, 0fffffffCh
	mov		esi, eax
	mov		sRowB, eax
	mov		dRowB, ecx
	mov		eax, srcPtr
	lea		edx, qArray
	sub		eax, esi
	add		edx, 7
	and		edx, 0fffffff8h
	mov		srcPtr, eax
	mov		pqArray, edx
	mov		edi, dstPtr
Row:
	movd		mm1, [-1][eax][esi]
	pxor		mm3, mm3
	movd		mm0, [-1][eax]
	pslld		mm1, 24
	movd		mm2, [-1][eax][esi*2]
	punpcklbw	mm1, mm3
	movq		mm4, mm1
	pslld		mm0, 24
	pslld		mm2, 24
	pmullw		mm1, mm1
	punpcklbw	mm0, mm3
	paddw		mm4, mm0
	punpcklbw	mm2, mm3
	pmullw		mm0, mm0
	paddw		mm4, mm2
	pmullw		mm2, mm2
	punpckhwd	mm1, mm3
	movd		mm6, [eax][esi]
	psrlq		mm4, 48
	movd		mm7, [eax]
	punpckhwd	mm0, mm3
	movd		r0sum, mm4
	paddd		mm0, mm1
	movd		mm5, [eax][esi*2]
	punpckhwd	mm2, mm3
	punpcklbw	mm6, mm3
	paddd		mm0, mm2
	psrlq		mm0, 32
	movq		mm1, mm6
	punpcklbw	mm7, mm3
	pmullw		mm1, mm1
	movd		r0sq, mm0
	punpcklbw	mm5, mm3
	paddw		mm6, mm5
	pmullw		mm5, mm5
	paddw		mm6, mm7
	pmullw		mm7, mm7
	movq		mm4, mm1
	punpcklwd	mm1, mm3
	movq		mm0, mm5
	punpckhwd	mm4, mm3
	movq		mm2, mm7
	punpcklwd	mm0, mm3
	paddd		mm1, mm0
	punpcklwd	mm2, mm3

	punpckhwd	mm5, mm3
	paddd		mm1, mm2
	punpckhwd	mm7, mm3
	paddd		mm4, mm5
	paddd		mm4, mm7
	movq		mm0, mm1
	movd		mm2, r0sq
	movq		mm7, mm4
	movq		mm5, mm4
	psrlq		mm1, 32
	paddd		mm2, mm0
	psllq		mm5, 32
	paddd		mm2, mm1
	paddd		mm7, mm1
	paddd		mm2, mm5
	psllq		mm0, 32
	paddd		mm7, mm5
	psrlq		mm4, 32
	paddd		mm2, mm0
	paddd		mm7, mm4
	movq		mm0, mm2
	pslld		mm2, 3
	movd		r0sq, mm4
	paddd		mm2, mm0

	movd		mm1, r0sum
	movq		mm5, mm6
	paddw		mm5, mm1
	movq		mm1, mm6
	psrlq		mm1, 16
	movq		mm4, mm6
	psllq		mm4, 16
	paddw		mm5, mm1
	psrlq		mm6, 48
	paddw		mm5, mm4
	movq		mm0, mm5
	punpcklwd	mm5, mm3
	mov		ecx, MMXColCnt
	pmaddwd		mm5, mm5
	mov		ebx, pqArray
	movq		mm4, mm7
	add		eax, 4
	movd		r0sum, mm6
	psubd		mm2, mm5
Col:
	movd		mm6, [eax][esi]
	movd		mm7, [eax]
	punpcklbw	mm6, mm3
	movd		mm5, [eax][esi*2]
	movq		mm1, mm6
	punpcklbw	mm7, mm3
	pmullw		mm1, mm1
	punpcklbw	mm5, mm3
	paddw		mm6, mm5
	pmullw		mm5, mm5
	paddw		mm6, mm7
	pmullw		mm7, mm7
	movq		[ebx][M_m0], mm6
	psllq		mm6, 48

	movq		[ebx][M_m8], mm4
	paddw		mm6, mm0
	movq		[ebx][M_m16], mm2
	punpckhwd	mm6, mm3

	pmaddwd		mm6, mm6
	movq		mm4, mm1
	punpcklwd	mm1, mm3
	movq		mm0, mm5
	punpckhwd	mm4, mm3
	movq		mm2, mm7
	punpcklwd	mm0, mm3
	paddd		mm1, mm0
	punpcklwd	mm2, mm3

	punpckhwd	mm5, mm3
	paddd		mm1, mm2
	punpckhwd	mm7, mm3
	paddd		mm4, mm5
	paddd		mm4, mm7
	movq		mm0, mm1
	movq		mm2, [ebx][M_m8]
	psllq		mm0, 32
	movq		mm5, [ebx][M_m16]
	paddd		mm0, mm2
	movq		mm2, mm0
	pslld		mm0, 3
	movq		mm7, mm5
	paddd		mm0, mm2
	psubd		mm0, mm6
	movq		mm6, mm5
	movq		mm2, mm0
	pslld		mm6, 10
	movq		mm3, mm0
	pslld		mm7, 4
	pslld		mm2, 10
	paddd		mm6, mm7
	pslld		mm3, 4
	movq		mm7, mm6
	paddd		mm2, mm3
	paddd		mm6, mm6
	movq		mm3, mm2
	paddd		mm6, mm7
	paddd		mm2, mm2
	movq		mm7, mm5
	pslld		mm7, 1
	paddd		mm5, mm7
	pslld		mm7, 1
	paddd		mm5, mm7
	pslld		mm7, 5
	paddd		mm5, mm7
	pslld		mm7, 1
	paddd		mm5, mm7
	psrld		mm5, 9
	paddd		mm2, mm3
	movq		mm7, packedw0x80
	paddd		mm5, mm6
	psrld		mm5, 16
	movq		mm3, mm0
	pslld		mm3, 1
	paddd		mm0, mm3
	pslld		mm3, 1
	paddd		mm0, mm3
	pslld		mm3, 5
	paddd		mm0, mm3
	pslld		mm3, 1
	paddd		mm0, mm3
	psrld		mm0, 9
	movq		mm3, mm5
	push		ecx
	paddd		mm0, mm2
	mov		ecx, 8
	psrld		mm0, 16
	punpckhdq	mm3, mm0
	pxor		mm2, mm2
	punpckldq	mm5, mm0
	pxor		mm0, mm0
	psllq		mm3, 16
	por		mm5, mm3
sqroot:
	por		mm2, mm7
	movq		mm6, mm5
	movq		mm3, mm2
	pmullw		mm2, mm2
	psubusw		mm6, mm2
	psubusw		mm2, mm5
	pcmpeqw		mm2, mm6
	pcmpeqw		 mm6, mm0
	pxor		mm2, mm6
	pand		mm2, mm7
	psrlw		mm7, 1
	pxor		mm2, mm3
	dec		ecx
	jnz		sqroot
	pop		ecx
	packuswb	mm2, mm2
	movq		mm6, [ebx][M_m0]
	pxor		mm3, mm3

	movd		[edi], mm2
	movq		mm0, mm1
	movd		mm2, r0sq
	movq		mm7, mm4
	paddd		mm2, mm0
	psrlq		mm1, 32
	movq		mm5, mm4
	paddd		mm2, mm1
	psllq		mm5, 32
	paddd		mm7, mm1
	paddd		mm2, mm5
	paddd		mm7, mm5
	psllq		mm0, 32
	paddd		mm2, mm0
	psrlq		mm4, 32
	paddd		mm7, mm4
	movq		mm0, mm2
	pslld		mm2, 3
	movd		mm1, r0sum
	paddd		mm2, mm0

	movd		r0sq, mm4
	movq		mm5, mm6
	paddw		mm5, mm1
	movq		mm1, mm6
	psrlq		mm1, 16
	movq		mm4, mm6
	psllq		mm4, 16
	paddw		mm5, mm1
	psrlq		mm6, 48
	paddw		mm5, mm4
	movq		mm0, mm5
	punpcklwd	mm5, mm3
	movd		r0sum, mm6
	pmaddwd		mm5, mm5
	add		eax, 4
	add		edi, 4
	psubd		mm2, mm5
	movq		mm4, mm7
	dec		ecx
   	jnz		Col
	mov		ecx, extras
	cmp		ecx, 0
	je		EndRow
	movd		mm6, [eax][esi]
	movd		mm7, [eax]
	punpcklbw	mm6, mm3
	movd		mm5, [eax][esi*2]
	movq		mm1, mm6
	punpcklbw	mm7, mm3
	pmullw		mm1, mm1
	punpcklbw	mm5, mm3
	paddw		mm6, mm5
	pmullw		mm5, mm5
	paddw		mm6, mm7
	pmullw		mm7, mm7
	movq		[ebx][M_m0], mm6
	psllq		mm6, 48

	movq		[ebx][M_m8], mm4
	paddw		mm6, mm0
	movq		[ebx][M_m16], mm2
	punpckhwd	mm6, mm3

	pmaddwd		mm6, mm6
	movq		mm4, mm1
	punpcklwd	mm1, mm3
	movq		mm0, mm5
	punpckhwd	mm4, mm3
	movq		mm2, mm7
	punpcklwd	mm0, mm3
	paddd		mm1, mm0
	punpcklwd	mm2, mm3

	punpckhwd	mm5, mm3
	paddd		mm1, mm2
	punpckhwd	mm7, mm3
	paddd		mm4, mm5
	paddd		mm4, mm7
	movq		mm0, mm1
	movq		mm2, [ebx][M_m8]
	psllq		mm0, 32
	movq		mm5, [ebx][M_m16]
	paddd		mm0, mm2
	movq		mm2, mm0
	pslld		mm0, 3
	movq		mm7, mm5
	paddd		mm0, mm2
	psubd		mm0, mm6
	movq		mm6, mm5
	movq		mm2, mm0
	pslld		mm6, 10
	movq		mm3, mm0
	pslld		mm7, 4
	pslld		mm2, 10
	paddd		mm6, mm7
	pslld		mm3, 4
	movq		mm7, mm6
	paddd		mm2, mm3
	paddd		mm6, mm6
	movq		mm3, mm2
	paddd		mm6, mm7
	paddd		mm2, mm2
	movq		mm7, mm5
	pslld		mm7, 1
	paddd		mm5, mm7
	pslld		mm7, 1
	paddd		mm5, mm7
	pslld		mm7, 5
	paddd		mm5, mm7
	pslld		mm7, 1
	paddd		mm5, mm7
	psrld		mm5, 9
	paddd		mm2, mm3
	movq		mm7, packedw0x80
	paddd		mm5, mm6
	psrld		mm5, 16
	movq		mm3, mm0
	pslld		mm3, 1
	paddd		mm0, mm3
	pslld		mm3, 1
	paddd		mm0, mm3
	pslld		mm3, 5
	paddd		mm0, mm3
	pslld		mm3, 1
	paddd		mm0, mm3
	psrld		mm0, 9
	movq		mm3, mm5
	push		ecx
	paddd		mm0, mm2
	mov		ecx, 8
	psrld		mm0, 16
	punpckhdq	mm3, mm0
	pxor		mm2, mm2
	punpckldq	mm5, mm0
	pxor		mm0, mm0
	psllq		mm3, 16
	por		mm5, mm3
sqrootExtras:
	por		mm2, mm7
	movq		mm6, mm5
	movq		mm3, mm2
	pmullw		mm2, mm2
	psubusw		mm6, mm2
	psubusw		mm2, mm5
	pcmpeqw		mm2, mm6
	pcmpeqw		mm6, mm0
	pxor		mm2, mm6
	pand		mm2, mm7
	psrlw		mm7, 1
	pxor		mm2, mm3
	dec		ecx
	jnz		sqrootExtras
	pop		ecx
	packuswb	mm2, mm2
	movq		mm6, [ebx][M_m0]
	pxor		mm3, mm3

	movd		ebx, mm2
	mov		ecx, extras
StoreExtras:
	mov		[edi], bl
	inc		edi
	shr		ebx, 8
	dec		ecx
	jg		StoreExtras
EndRow:
	mov		eax, srcPtr
	mov		edi, dstPtr
   	mov		edx, dRowB
	add		eax, esi
	mov		ebx, rows
	mov		srcPtr, eax
	add		edi, edx
	dec		ebx
	mov		dstPtr, edi
	mov		rows, ebx
	jnz		Row
	mov		rows, 1
	add		eax, esi
	mov		srcPtr, eax
	mov		dstPtr, edi
	emms
  }
  e1(srcPtr, dstPtr, rows, cols, sRowBytes, dRowBytes);
}