# Symbolic register names used throughout the generated ARMv4 code.
# bn_mul_mont(rp,ap,bp,np,n0,num): r0-r3 carry the first four arguments
# on entry; several registers are deliberately reused once their original
# value is spent.
$num = "r0";    # num on entry; later repointed at &tp[num-1]
$ap  = "r1";    # ap argument
$bp  = "r2";    # bp argument
$bi  = $bp;     # bp[i] — shares r2 with $bp
$rp  = $bp;     # result pointer — shares r2, loaded only after bp is done
$np  = "r3";    # np argument
$tp  = "r4";    # tp walker
$aj  = "r5";    # ap[j]
$nj  = "r6";    # np[j]
$tj  = "r7";    # tp[j] / scratch
$n0  = "r8";    # n0 (Montgomery factor)
$alo = "r10";   # low word of ap[j]*bp[i] accumulator
$ahi = "r11";   # high word of ap[j]*bp[i] accumulator
$nlo = "r12";   # low word of np[j]*n0 accumulator
$nhi = "r14";   # high word of np[j]*n0 accumulator

# Stack slots, addressed relative to $num, where arguments are stashed.
$_rp    = "$num,#12*4";
$_bp    = "$num,#13*4";
$_n0    = "$num,#14*4";
$_num   = "$num,#15*4";
$_bpend = $_num;        # same slot doubles as the &bp[num] sentinel
# Assembly template for bn_mul_mont. NOTE(review): the previous version of
# this heredoc had every "#imm" operand stripped and the truncated lines
# fused together (e.g. "ldr $num,[sp," with no "#3*4]"), so the emitted
# code could not assemble. Each instruction below is restored to one line
# with its immediate operand, preserving the original instruction order.
$code=<<___;
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	$num,[sp,#3*4]		@ load num
	cmp	$num,#2			@ reject num<2
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	adc	$nhi,$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	str	$nlo,[$num]		@ tp[num-1]=
	ldr	$n0,[$_n0]		@ restore n0
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adc	$ahi,$ahi,#0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	adc	$nhi,$nhi,#0
	adds	$nlo,$nlo,$tj
	adc	$nhi,$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	str	$nlo,[$num]		@ tp[num-1]=
	ldr	$n0,[$_n0]		@ restore n0
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi		@ conditional select without branching
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___
# Encode "bx lr" as a raw word so the output also assembles with
# assemblers/targets that reject the BX mnemonic (plain ARMv4).
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;
print $code;
close STDOUT;