mips3.s   [plain text]


.rdata
.asciiz	"mips3.s, Version 1.1"
.asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
 * ====================================================================
 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contributon to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * The module is designed to work with either of the "new" MIPS ABI(5),
 * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
 * IRIX 5.x not only because it doesn't support new ABIs but also
 * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
 * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
 * cause illegal instruction exception:-(
 *
 * In addition the code depends on preprocessor flags set up by MIPSpro
 * compiler driver (either as or cc) and therefore (probably?) can't be
 * compiled by the GNU assembler. GNU C driver manages fine though...
 * I mean as long as -mmips-as is specified or is the default option,
 * because then it simply invokes /usr/bin/as which in turn takes
 * perfect care of the preprocessor definitions. Another neat feature
 * offered by the MIPSpro assembler is an optimization pass. This gave
 * me the opportunity to have the code looking more regular as all those
 * architecture dependent instruction rescheduling details were left to
 * the assembler. Cool, huh?
 *
 * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
 * goes way over 3 times faster!
 *
 *					<appro@fy.chalmers.se>
 */
#include <asm.h>
#include <regdef.h>

#if _MIPS_ISA>=4
#define	MOVNZ(cond,dst,src)	\
	movn	dst,src,cond
#else
#define	MOVNZ(cond,dst,src)	\
	.set	noreorder;	\
	bnezl	cond,.+8;	\
	move	dst,src;	\
	.set	reorder
#endif

.text

.set	noat
.set	reorder

#define	MINUS4	v1

.align	5
LEAF(bn_mul_add_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_add_words_proceed
	ld	t0,0(a1)
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_mul_add_words_proceed:
	li	MINUS4,-4
	and	ta0,a2,MINUS4
	move	v0,zero
	beqz	ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	dmultu	t0,a3
	ld	t1,0(a0)
	ld	t2,8(a1)
	ld	t3,8(a0)
	ld	ta0,16(a1)
	ld	ta1,16(a0)
	daddu	t1,v0
	sltu	v0,t1,v0	/* All manuals say it "compares 32-bit
				 * values", but it seems to work fine
				 * even on 64-bit registers. */
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,0(a0)
	daddu	v0,AT

	dmultu	t2,a3
	ld	ta2,24(a1)
	ld	ta3,24(a0)
	daddu	t3,v0
	sltu	v0,t3,v0
	mflo	AT
	mfhi	t2
	daddu	t3,AT
	daddu	v0,t2
	sltu	AT,t3,AT
	sd	t3,8(a0)
	daddu	v0,AT

	dmultu	ta0,a3
	subu	a2,4
	PTR_ADD	a0,32
	PTR_ADD	a1,32
	daddu	ta1,v0
	sltu	v0,ta1,v0
	mflo	AT
	mfhi	ta0
	daddu	ta1,AT
	daddu	v0,ta0
	sltu	AT,ta1,AT
	sd	ta1,-16(a0)
	daddu	v0,AT


	dmultu	ta2,a3
	and	ta0,a2,MINUS4
	daddu	ta3,v0
	sltu	v0,ta3,v0
	mflo	AT
	mfhi	ta2
	daddu	ta3,AT
	daddu	v0,ta2
	sltu	AT,ta3,AT
	sd	ta3,-8(a0)
	daddu	v0,AT
	.set	noreorder
	bgtzl	ta0,.L_bn_mul_add_words_loop
	ld	t0,0(a1)

	bnezl	a2,.L_bn_mul_add_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_mul_add_words_return:
	jr	ra

.L_bn_mul_add_words_tail:
	dmultu	t0,a3
	ld	t1,0(a0)
	subu	a2,1
	daddu	t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,0(a0)
	daddu	v0,AT
	beqz	a2,.L_bn_mul_add_words_return

	ld	t0,8(a1)
	dmultu	t0,a3
	ld	t1,8(a0)
	subu	a2,1
	daddu	t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,8(a0)
	daddu	v0,AT
	beqz	a2,.L_bn_mul_add_words_return

	ld	t0,16(a1)
	dmultu	t0,a3
	ld	t1,16(a0)
	daddu	t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,16(a0)
	daddu	v0,AT
	jr	ra
END(bn_mul_add_words)

.align	5
LEAF(bn_mul_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_words_proceed
	ld	t0,0(a1)
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_mul_words_proceed:
	li	MINUS4,-4
	and	ta0,a2,MINUS4
	move	v0,zero
	beqz	ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	dmultu	t0,a3
	ld	t2,8(a1)
	ld	ta0,16(a1)
	ld	ta2,24(a1)
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,0(a0)
	daddu	v0,t1,t0

	dmultu	t2,a3
	subu	a2,4
	PTR_ADD	a0,32
	PTR_ADD	a1,32
	mflo	AT
	mfhi	t2
	daddu	v0,AT
	sltu	t3,v0,AT
	sd	v0,-24(a0)
	daddu	v0,t3,t2

	dmultu	ta0,a3
	mflo	AT
	mfhi	ta0
	daddu	v0,AT
	sltu	ta1,v0,AT
	sd	v0,-16(a0)
	daddu	v0,ta1,ta0


	dmultu	ta2,a3
	and	ta0,a2,MINUS4
	mflo	AT
	mfhi	ta2
	daddu	v0,AT
	sltu	ta3,v0,AT
	sd	v0,-8(a0)
	daddu	v0,ta3,ta2
	.set	noreorder
	bgtzl	ta0,.L_bn_mul_words_loop
	ld	t0,0(a1)

	bnezl	a2,.L_bn_mul_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_mul_words_return:
	jr	ra

.L_bn_mul_words_tail:
	dmultu	t0,a3
	subu	a2,1
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,0(a0)
	daddu	v0,t1,t0
	beqz	a2,.L_bn_mul_words_return

	ld	t0,8(a1)
	dmultu	t0,a3
	subu	a2,1
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,8(a0)
	daddu	v0,t1,t0
	beqz	a2,.L_bn_mul_words_return

	ld	t0,16(a1)
	dmultu	t0,a3
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,16(a0)
	daddu	v0,t1,t0
	jr	ra
END(bn_mul_words)

.align	5
LEAF(bn_sqr_words)
	.set	noreorder
	bgtzl	a2,.L_bn_sqr_words_proceed
	ld	t0,0(a1)
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_sqr_words_proceed:
	li	MINUS4,-4
	and	ta0,a2,MINUS4
	move	v0,zero
	beqz	ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	dmultu	t0,t0
	ld	t2,8(a1)
	ld	ta0,16(a1)
	ld	ta2,24(a1)
	mflo	t1
	mfhi	t0
	sd	t1,0(a0)
	sd	t0,8(a0)

	dmultu	t2,t2
	subu	a2,4
	PTR_ADD	a0,64
	PTR_ADD	a1,32
	mflo	t3
	mfhi	t2
	sd	t3,-48(a0)
	sd	t2,-40(a0)

	dmultu	ta0,ta0
	mflo	ta1
	mfhi	ta0
	sd	ta1,-32(a0)
	sd	ta0,-24(a0)


	dmultu	ta2,ta2
	and	ta0,a2,MINUS4
	mflo	ta3
	mfhi	ta2
	sd	ta3,-16(a0)
	sd	ta2,-8(a0)

	.set	noreorder
	bgtzl	ta0,.L_bn_sqr_words_loop
	ld	t0,0(a1)

	bnezl	a2,.L_bn_sqr_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_sqr_words_return:
	move	v0,zero
	jr	ra

.L_bn_sqr_words_tail:
	dmultu	t0,t0
	subu	a2,1
	mflo	t1
	mfhi	t0
	sd	t1,0(a0)
	sd	t0,8(a0)
	beqz	a2,.L_bn_sqr_words_return

	ld	t0,8(a1)
	dmultu	t0,t0
	subu	a2,1
	mflo	t1
	mfhi	t0
	sd	t1,16(a0)
	sd	t0,24(a0)
	beqz	a2,.L_bn_sqr_words_return

	ld	t0,16(a1)
	dmultu	t0,t0
	mflo	t1
	mfhi	t0
	sd	t1,32(a0)
	sd	t0,40(a0)
	jr	ra
END(bn_sqr_words)

.align	5
LEAF(bn_add_words)
	.set	noreorder
	bgtzl	a3,.L_bn_add_words_proceed
	ld	t0,0(a1)
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_add_words_proceed:
	li	MINUS4,-4
	and	AT,a3,MINUS4
	move	v0,zero
	beqz	AT,.L_bn_add_words_tail

.L_bn_add_words_loop:
	ld	ta0,0(a2)
	subu	a3,4
	ld	t1,8(a1)
	and	AT,a3,MINUS4
	ld	t2,16(a1)
	PTR_ADD	a2,32
	ld	t3,24(a1)
	PTR_ADD	a0,32
	ld	ta1,-24(a2)
	PTR_ADD	a1,32
	ld	ta2,-16(a2)
	ld	ta3,-8(a2)
	daddu	ta0,t0
	sltu	t8,ta0,t0
	daddu	t0,ta0,v0
	sltu	v0,t0,ta0
	sd	t0,-32(a0)
	daddu	v0,t8

	daddu	ta1,t1
	sltu	t9,ta1,t1
	daddu	t1,ta1,v0
	sltu	v0,t1,ta1
	sd	t1,-24(a0)
	daddu	v0,t9

	daddu	ta2,t2
	sltu	t8,ta2,t2
	daddu	t2,ta2,v0
	sltu	v0,t2,ta2
	sd	t2,-16(a0)
	daddu	v0,t8
	
	daddu	ta3,t3
	sltu	t9,ta3,t3
	daddu	t3,ta3,v0
	sltu	v0,t3,ta3
	sd	t3,-8(a0)
	daddu	v0,t9
	
	.set	noreorder
	bgtzl	AT,.L_bn_add_words_loop
	ld	t0,0(a1)

	bnezl	a3,.L_bn_add_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_add_words_return:
	jr	ra

.L_bn_add_words_tail:
	ld	ta0,0(a2)
	daddu	ta0,t0
	subu	a3,1
	sltu	t8,ta0,t0
	daddu	t0,ta0,v0
	sltu	v0,t0,ta0
	sd	t0,0(a0)
	daddu	v0,t8
	beqz	a3,.L_bn_add_words_return

	ld	t1,8(a1)
	ld	ta1,8(a2)
	daddu	ta1,t1
	subu	a3,1
	sltu	t9,ta1,t1
	daddu	t1,ta1,v0
	sltu	v0,t1,ta1
	sd	t1,8(a0)
	daddu	v0,t9
	beqz	a3,.L_bn_add_words_return

	ld	t2,16(a1)
	ld	ta2,16(a2)
	daddu	ta2,t2
	sltu	t8,ta2,t2
	daddu	t2,ta2,v0
	sltu	v0,t2,ta2
	sd	t2,16(a0)
	daddu	v0,t8
	jr	ra
END(bn_add_words)

.align	5
LEAF(bn_sub_words)
	.set	noreorder
	bgtzl	a3,.L_bn_sub_words_proceed
	ld	t0,0(a1)
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_sub_words_proceed:
	li	MINUS4,-4
	and	AT,a3,MINUS4
	move	v0,zero
	beqz	AT,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	ld	ta0,0(a2)
	subu	a3,4
	ld	t1,8(a1)
	and	AT,a3,MINUS4
	ld	t2,16(a1)
	PTR_ADD	a2,32
	ld	t3,24(a1)
	PTR_ADD	a0,32
	ld	ta1,-24(a2)
	PTR_ADD	a1,32
	ld	ta2,-16(a2)
	ld	ta3,-8(a2)
	sltu	t8,t0,ta0
	dsubu	t0,ta0
	dsubu	ta0,t0,v0
	sd	ta0,-32(a0)
	MOVNZ	(t0,v0,t8)

	sltu	t9,t1,ta1
	dsubu	t1,ta1
	dsubu	ta1,t1,v0
	sd	ta1,-24(a0)
	MOVNZ	(t1,v0,t9)


	sltu	t8,t2,ta2
	dsubu	t2,ta2
	dsubu	ta2,t2,v0
	sd	ta2,-16(a0)
	MOVNZ	(t2,v0,t8)

	sltu	t9,t3,ta3
	dsubu	t3,ta3
	dsubu	ta3,t3,v0
	sd	ta3,-8(a0)
	MOVNZ	(t3,v0,t9)

	.set	noreorder
	bgtzl	AT,.L_bn_sub_words_loop
	ld	t0,0(a1)

	bnezl	a3,.L_bn_sub_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_sub_words_return:
	jr	ra

.L_bn_sub_words_tail:
	ld	ta0,0(a2)
	subu	a3,1
	sltu	t8,t0,ta0
	dsubu	t0,ta0
	dsubu	ta0,t0,v0
	MOVNZ	(t0,v0,t8)
	sd	ta0,0(a0)
	beqz	a3,.L_bn_sub_words_return

	ld	t1,8(a1)
	subu	a3,1
	ld	ta1,8(a2)
	sltu	t9,t1,ta1
	dsubu	t1,ta1
	dsubu	ta1,t1,v0
	MOVNZ	(t1,v0,t9)
	sd	ta1,8(a0)
	beqz	a3,.L_bn_sub_words_return

	ld	t2,16(a1)
	ld	ta2,16(a2)
	sltu	t8,t2,ta2
	dsubu	t2,ta2
	dsubu	ta2,t2,v0
	MOVNZ	(t2,v0,t8)
	sd	ta2,16(a0)
	jr	ra
END(bn_sub_words)

#undef	MINUS4

.align 5
LEAF(bn_div_3_words)
	.set	reorder
	move	a3,a0		/* we know that bn_div_words doesn't
				 * touch a3, ta2, ta3 and preserves a2
				 * so that we can save two arguments
				 * and return address in registers
				 * instead of stack:-)
				 */
	ld	a0,(a3)
	move	ta2,a1
	ld	a1,-8(a3)
	bne	a0,a2,.L_bn_div_3_words_proceed
	li	v0,-1
	jr	ra
.L_bn_div_3_words_proceed:
	move	ta3,ra
	bal	bn_div_words
	move	ra,ta3
	dmultu	ta2,v0
	ld	t2,-16(a3)
	move	ta0,zero
	mfhi	t1
	mflo	t0
	sltu	t8,t1,v1
.L_bn_div_3_words_inner_loop:
	bnez	t8,.L_bn_div_3_words_inner_loop_done
	sgeu	AT,t2,t0
	seq	t9,t1,v1
	and	AT,t9
	sltu	t3,t0,ta2
	daddu	v1,a2
	dsubu	t1,t3
	dsubu	t0,ta2
	sltu	t8,t1,v1
	sltu	ta0,v1,a2
	or	t8,ta0
	.set	noreorder
	beqzl	AT,.L_bn_div_3_words_inner_loop
	dsubu	v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	jr	ra
END(bn_div_3_words)

.align	5
LEAF(bn_div_words)
	.set	noreorder
	bnezl	a2,.L_bn_div_words_proceed
	move	v1,zero
	jr	ra
	li	v0,-1		/* I'd rather signal div-by-zero
				 * which can be done with 'break 7' */

.L_bn_div_words_proceed:
	bltz	a2,.L_bn_div_words_body
	move	t9,v1
	dsll	a2,1
	bgtz	a2,.-4
	addu	t9,1

	.set	reorder
	negu	t1,t9
	li	t2,-1
	dsll	t2,t1
	and	t2,a0
	dsrl	AT,a1,t1
	.set	noreorder
	bnezl	t2,.+8
	break	6		/* signal overflow */
	.set	reorder
	dsll	a0,t9
	dsll	a1,t9
	or	a0,AT

#define	QT	ta0
#define	HH	ta1
#define	DH	v1
.L_bn_div_words_body:
	dsrl	DH,a2,32
	sgeu	AT,a0,a2
	.set	noreorder
	bnezl	AT,.+8
	dsubu	a0,a2
	.set	reorder

	li	QT,-1
	dsrl	HH,a0,32
	dsrl	QT,32	/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_skip_div1
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_skip_div1:
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,AT
	mflo	t0
	mfhi	t1
.L_bn_div_words_inner_loop1:
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t8
	sltu	v0,t0,a2
	or	AT,t2
	.set	noreorder
	beqz	AT,.L_bn_div_words_inner_loop1_done
	dsubu	t1,v0
	dsubu	t0,a2
	b	.L_bn_div_words_inner_loop1
	dsubu	QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	dsll	a1,32
	dsubu	a0,t3,t0
	dsll	v0,QT,32

	li	QT,-1
	dsrl	HH,a0,32
	dsrl	QT,32	/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_skip_div2
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_skip_div2:
#undef	DH
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,AT
	mflo	t0
	mfhi	t1
.L_bn_div_words_inner_loop2:
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t8
	sltu	v1,t0,a2
	or	AT,t2
	.set	noreorder
	beqz	AT,.L_bn_div_words_inner_loop2_done
	dsubu	t1,v1
	dsubu	t0,a2
	b	.L_bn_div_words_inner_loop2
	dsubu	QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:	
#undef	HH

	dsubu	a0,t3,t0
	or	v0,QT
	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */
	dsrl	a2,t9		/* restore a2 */
	jr	ra
#undef	QT
END(bn_div_words)

#define	a_0	t0
#define	a_1	t1
#define	a_2	t2
#define	a_3	t3
#define	b_0	ta0
#define	b_1	ta1
#define	b_2	ta2
#define	b_3	ta3

#define	a_4	s0
#define	a_5	s2
#define	a_6	s4
#define	a_7	a1	/* once we load a[7] we don't need a anymore */
#define	b_4	s1
#define	b_5	s3
#define	b_6	s5
#define	b_7	a2	/* once we load b[7] we don't need b anymore */

#define	t_1	t8
#define	t_2	t9

#define	c_1	v0
#define	c_2	v1
#define	c_3	a3

#define	FRAME_SIZE	48

.align	5
LEAF(bn_mul_comba8)
	.set	noreorder
	PTR_SUB	sp,FRAME_SIZE
	.frame	sp,64,ra
	.set	reorder
	ld	a_0,0(a1)	/* If compiled with -mips3 option on
				 * R5000 box assembler barks on this
				 * line with "shouldn't have mult/div
				 * as last instruction in bb (R10K
				 * bug)" warning. If anybody out there
				 * has a clue about how to circumvent
				 * this do send me a note.
				 *		<appro@fy.chalmers.se>
				 */
	ld	b_0,0(a2)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)
	ld	b_1,8(a2)
	ld	b_2,16(a2)
	ld	b_3,24(a2)
	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	sd	s0,0(sp)
	sd	s1,8(sp)
	sd	s2,16(sp)
	sd	s3,24(sp)
	sd	s4,32(sp)
	sd	s5,40(sp)
	mflo	c_1
	mfhi	c_2

	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
	ld	a_4,32(a1)
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)
	ld	b_4,32(a2)
	ld	b_5,40(a2)
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
	ld	b_6,48(a2)
	ld	b_7,56(a2)
	sd	c_1,0(a0)	/* r[0]=c1; */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	sd	c_2,8(a0)	/* r[1]=c2; */

	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2]=c3; */

	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)	/* r[3]=c1; */

	dmultu	a_4,b_0		/* mul_add_c(a[4],b[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_0,b_4		/* mul_add_c(a[0],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4]=c2; */

	dmultu	a_0,b_5		/* mul_add_c(a[0],b[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_1,b_4		/* mul_add_c(a[1],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_4,b_1		/* mul_add_c(a[4],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,b_0		/* mul_add_c(a[5],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)	/* r[5]=c3; */

	dmultu	a_6,b_0		/* mul_add_c(a[6],b[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_5,b_1		/* mul_add_c(a[5],b[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,b_2		/* mul_add_c(a[4],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_2,b_4		/* mul_add_c(a[2],b[4],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,b_5		/* mul_add_c(a[1],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_0,b_6		/* mul_add_c(a[0],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,48(a0)	/* r[6]=c1; */

	dmultu	a_0,b_7		/* mul_add_c(a[0],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_1,b_6		/* mul_add_c(a[1],b[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,b_5		/* mul_add_c(a[2],b[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,b_4		/* mul_add_c(a[3],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_4,b_3		/* mul_add_c(a[4],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_5,b_2		/* mul_add_c(a[5],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_6,b_1		/* mul_add_c(a[6],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_7,b_0		/* mul_add_c(a[7],b[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,56(a0)	/* r[7]=c2; */

	dmultu	a_7,b_1		/* mul_add_c(a[7],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_6,b_2		/* mul_add_c(a[6],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,b_3		/* mul_add_c(a[5],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_4,b_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_3,b_5		/* mul_add_c(a[3],b[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_2,b_6		/* mul_add_c(a[2],b[6],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,b_7		/* mul_add_c(a[1],b[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,64(a0)	/* r[8]=c3; */

	dmultu	a_2,b_7		/* mul_add_c(a[2],b[7],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_3,b_6		/* mul_add_c(a[3],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,b_5		/* mul_add_c(a[4],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_5,b_4		/* mul_add_c(a[5],b[4],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_6,b_3		/* mul_add_c(a[6],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_7,b_2		/* mul_add_c(a[7],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,72(a0)	/* r[9]=c1; */

	dmultu	a_7,b_3		/* mul_add_c(a[7],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_6,b_4		/* mul_add_c(a[6],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_5,b_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_4,b_6		/* mul_add_c(a[4],b[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,b_7		/* mul_add_c(a[3],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,80(a0)	/* r[10]=c2; */

	dmultu	a_4,b_7		/* mul_add_c(a[4],b[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_5,b_6		/* mul_add_c(a[5],b[6],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_6,b_5		/* mul_add_c(a[6],b[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_7,b_4		/* mul_add_c(a[7],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,88(a0)	/* r[11]=c3; */

	dmultu	a_7,b_5		/* mul_add_c(a[7],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_6,b_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_5,b_7		/* mul_add_c(a[5],b[7],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,96(a0)	/* r[12]=c1; */

	dmultu	a_6,b_7		/* mul_add_c(a[6],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_7,b_6		/* mul_add_c(a[7],b[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,104(a0)	/* r[13]=c2; */

	dmultu	a_7,b_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
	ld	s0,0(sp)
	ld	s1,8(sp)
	ld	s2,16(sp)
	ld	s3,24(sp)
	ld	s4,32(sp)
	ld	s5,40(sp)
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)	/* r[14]=c3; */
	sd	c_1,120(a0)	/* r[15]=c1; */

	PTR_ADD	sp,FRAME_SIZE

	jr	ra
END(bn_mul_comba8)

.align	5
LEAF(bn_mul_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	b_0,0(a2)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	ld	a_3,24(a1)
	ld	b_1,8(a2)
	ld	b_2,16(a2)
	ld	b_3,24(a2)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)

	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	sd	c_2,8(a0)

	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)

	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)

	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)

	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)

	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sd	c_1,48(a0)
	sd	c_2,56(a0)

	jr	ra
END(bn_mul_comba4)

#undef	a_4
#undef	a_5
#undef	a_6
#undef	a_7
#define	a_4	b_0
#define	a_5	b_1
#define	a_6	b_2
#define	a_7	b_3

.align	5
LEAF(bn_sqr_comba8)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)

	dmultu	a_0,a_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	ld	a_4,32(a1)
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)

	dmultu	a_0,a_1		/* mul_add_c2(a[0],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	sd	c_2,8(a0)

	dmultu	a_2,a_0		/* mul_add_c2(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)

	dmultu	a_0,a_3		/* mul_add_c2(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* mul_add_c2(a[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)

	dmultu	a_4,a_0		/* mul_add_c2(a[4],b[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_1		/* mul_add_c2(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)

	dmultu	a_0,a_5		/* mul_add_c2(a[0],b[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_4		/* mul_add_c2(a[1],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_2,a_3		/* mul_add_c2(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)

	dmultu	a_6,a_0		/* mul_add_c2(a[6],b[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_5,a_1		/* mul_add_c2(a[5],b[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_2		/* mul_add_c2(a[4],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,48(a0)

	dmultu	a_0,a_7		/* mul_add_c2(a[0],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_1,a_6		/* mul_add_c2(a[1],b[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_5		/* mul_add_c2(a[2],b[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_4		/* mul_add_c2(a[3],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,56(a0)

	dmultu	a_7,a_1		/* mul_add_c2(a[7],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_6,a_2		/* mul_add_c2(a[6],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_3		/* mul_add_c2(a[5],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_4,a_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,64(a0)

	dmultu	a_2,a_7		/* mul_add_c2(a[2],b[7],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_6		/* mul_add_c2(a[3],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_5		/* mul_add_c2(a[4],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,72(a0)

	dmultu	a_7,a_3		/* mul_add_c2(a[7],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_6,a_4		/* mul_add_c2(a[6],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_5,a_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,80(a0)

	dmultu	a_4,a_7		/* mul_add_c2(a[4],b[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_6		/* mul_add_c2(a[5],b[6],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,88(a0)

	dmultu	a_7,a_5		/* mul_add_c2(a[7],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_6,a_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,96(a0)

	dmultu	a_6,a_7		/* mul_add_c2(a[6],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,104(a0)

	dmultu	a_7,a_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)
	sd	c_1,120(a0)

	jr	ra
END(bn_sqr_comba8)

.align	5
LEAF(bn_sqr_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)
	dmultu	a_0,a_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)

	dmultu	a_0,a_1		/* mul_add_c2(a[0],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	sd	c_2,8(a0)

	dmultu	a_2,a_0		/* mul_add_c2(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)

	dmultu	a_0,a_3		/* mul_add_c2(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* mul_add_c(a2[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)

	dmultu	a_3,a_1		/* mul_add_c2(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)

	dmultu	a_2,a_3		/* mul_add_c2(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)

	dmultu	a_3,a_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sd	c_1,48(a0)
	sd	c_2,56(a0)

	jr	ra
END(bn_sqr_comba4)