# Destination for the generated assembly: the first command-line argument.
$output=shift;
# Redirect STDOUT to that file.  The check uses low-precedence `or` so that
# it applies to open() itself; with `||` the die would attach to the string
# operand and a failed open would go unnoticed.  The three-argument form
# treats the name literally (no mode characters are parsed out of it).
# When no file name is given, STDOUT is left as inherited.
if (defined $output) {
    open STDOUT,">",$output or die "can't open $output: $!";
}
# Register names interpolated into the assembly template.
# %rdi, %rsi, %rdx and %rcx are the first four integer-argument registers
# of the System V AMD64 calling convention, so the generated routine takes
# its arguments in the order ($dat, $len, $inp, $out).
($dat,$len,$inp,$out)=("%rdi","%rsi","%rdx","%rcx");

# Two-element register lists; the generator rotates them between unrolled
# steps, so element 0 is always the "current" register of each pair.
@XX=qw(%r8 %r10);
@TX=qw(%r9 %r11);

# Single scratch registers.  Both are callee-saved under the same calling
# convention, which is why the template pushes/pops %r12 and %r13.
($YY,$TY)=("%r12","%r13");
# Build the assembly text for the RC4 routine in $code.
#
# Placeholders used inside the template:
#   reg#b / reg#d  - operand-size markers; the first substitution at the
#                    end of this section deletes the '#', so "%r8#b"
#                    becomes "%r8b" and "%r12#d" becomes "%r12d".
#   repret         - expanded to the two bytes 0xF3,0xC3 (REP-prefixed RET).
#
# NOTE(review): the size markers, the operands following them, and the
# first substitution were reconstructed from the routine's data flow
# (instructions had been fused onto single lines).  Diff the generated
# output against a known-good build before relying on it.
#
# Prologue: return at once when $len is zero.  Otherwise save %r12/%r13,
# advance $dat by 2 so the two index bytes are at -2($dat)/-1($dat) and the
# byte table starts at 0($dat), load both indices, pre-increment the first
# one and preload its table entry.  Lengths below 8 go straight to the
# byte-at-a-time loop; otherwise %rbx is saved and each pass of .Lcloop8
# starts by loading two 32-bit input words into %eax and %ebx.
# (presumably the index bytes and table mirror the C-side key structure -
# confirm against its declaration)
$code=<<___;
.text
.globl	RC4
.type	RC4,\@function
.align	16
RC4:	or	$len,$len
	jne	.Lentry
	repret
.Lentry:
	push	%r12
	push	%r13

	add	\$2,$dat
	movzb	-2($dat),$XX[0]#d
	movzb	-1($dat),$YY#d

	add	\$1,$XX[0]#b
	movzb	($dat,$XX[0]),$TX[0]#d
	test	\$-8,$len
	jz	.Lcloop1
	push	%rbx
.align	16
.Lcloop8:
	mov	($inp),%eax
	mov	4($inp),%ebx
___

# Eight unrolled steps per pass of .Lcloop8.  With x=$XX[0], tx=$TX[0],
# y=$YY, ty=$TY each step does:
#   y += tx (byte add); ty = table[y]; table[y] = tx; table[x] = ty;
#   accumulator low byte ^= table[(tx + ty) & 0xff]; rotate accumulator by 8.
# Steps 0-3 work on %eax, steps 4-7 on %ebx, so after four rotations each
# register is back in its original byte order.
# The next index ($XX[1] = x+1, wrapped to a byte) and its table entry
# ($TX[1]) are preloaded before the two stores.  If x+1 equals y, that
# preloaded entry is stale (table[y] has just been overwritten with tx),
# so the fall-through `mov` replaces it; otherwise the branch skips it.
for ($i=0;$i<8;$i++) {
my ($acc8,$acc32) = $i<4 ? ("%al","%eax") : ("%bl","%ebx");
$code.=<<___;
	add	$TX[0]#b,$YY#b
	lea	1($XX[0]),$XX[1]
	movzb	($dat,$YY),$TY#d
	movzb	$XX[1]#b,$XX[1]#d
	movzb	($dat,$XX[1]),$TX[1]#d
	movb	$TX[0]#b,($dat,$YY)
	cmp	$XX[1],$YY
	movb	$TY#b,($dat,$XX[0])
	jne	.Lcmov$i
	mov	$TX[0],$TX[1]
.Lcmov$i:
	add	$TX[0]#b,$TY#b
	xor	($dat,$TY),$acc8
	ror	\$8,$acc32
___
# Rotate the register pairs: the preloaded registers become the current
# ones for the next step.  Eight rotations of a two-element list restore
# the starting order, so the code after the loop sees the original names.
push(@TX,shift(@TX)); push(@XX,shift(@XX));
}

# Tail of the 8-byte loop: store the eight output bytes, advance the
# pointers, and repeat while at least 8 bytes remain.  Then restore %rbx
# and either finish or hand the remaining 1..7 bytes to .Lcloop1.
# .Lexit undoes the index pre-increment, writes both indices back to
# -2($dat)/-1($dat) and restores %r13/%r12.
# .Lcloop1 performs the same step as above for a single byte per pass,
# reading from ($inp) and writing to ($out).
$code.=<<___;
	lea	-8($len),$len
	mov	%eax,($out)
	lea	8($inp),$inp
	mov	%ebx,4($out)
	lea	8($out),$out

	test	\$-8,$len
	jnz	.Lcloop8
	pop	%rbx
	cmp	\$0,$len
	jne	.Lcloop1
.Lexit:
	sub	\$1,$XX[0]#b
	movb	$XX[0]#b,-2($dat)
	movb	$YY#b,-1($dat)

	pop	%r13
	pop	%r12
	repret
.align	16
.Lcloop1:
	add	$TX[0]#b,$YY#b
	movzb	($dat,$YY),$TY#d
	movb	$TX[0]#b,($dat,$YY)
	movb	$TY#b,($dat,$XX[0])
	add	$TX[0]#b,$TY#b
	add	\$1,$XX[0]#b
	movzb	($dat,$TY),$TY#d
	movzb	($dat,$XX[0]),$TX[0]#d
	xorb	($inp),$TY#b
	lea	1($inp),$inp
	movb	$TY#b,($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lcloop1
	jmp	.Lexit
.size	RC4,.-RC4
___

# Resolve the placeholders: glue the size letter onto the register name,
# then expand "repret" into its raw byte encoding.
$code =~ s/#([bwd])/$1/gm;
$code =~ s/repret/.byte\t0xF3,0xC3/gm;

print $code;