/*
This file "aesxts.s" provides x86_64 / i386 optimization of the following functions
0. xts_mult_x_on_xmm7 : a code macro that is used throughout all other functions
1. void xts_mult_x(uint8_t *I)
2. int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx)
3. int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim)
4. int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx)
5. int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim)
This file should be compiled together with xtsClearC.c
functions 1,2,4 are supposed to replace the C functions in xtsClearC.c for x86_64/i386 architectures
functions 3,5 are given only here (no C code is available); they are called in xts_encrypt/xts_decrypt (xtsClearC.c)
- we can possibly add C code for functions 3 and 5 for future porting to other architectures
cclee 4-29-10
*/
#ifdef KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif
#define CRYPT_OK 0 // cannot include "crypt.h", in which CRYPT_OK comes from an enum
/*
The following macro is used throughout the functions in this file.
It is the core function within the function xts_mult_x defined in (xtsClearC.c)
upon entry, %xmm7 = the input tweak (128-bit),
on return, %xmm7 = the updated tweak (128-bit)
the macro uses %xmm1/%xmm2/%ecx in the computation
the operation can be described as follows :
0. let x = %xmm7 (a 128-bit value)
1. x = rotate_left(x,1)
2. if (x&1) x ^= 0x0000...0086
3. return x in %xmm7
It's a pity that SSE does not support shifting of the whole 128-bit xmm registers.
The workaround is
1. using parallel dual quad (8-byte) shifting, 1 for the 2 bottom 63-bits, 1 for the 2 leading bits
2. manipulating the shifted quad words to form the 128-bit shifted result.
Input : %xmm7
Output : %xmm7
Used : %xmm1/%xmm2/%ecx
The macro is good for both x86_64 and i386.
*/
.macro xts_mult_x_on_xmm7 // input : x = %xmm7, MS = most significant, LS = least significant
movaps %xmm7, %xmm1 // %xmm1 = a copy of x
movaps %xmm7, %xmm2 // %xmm2 = a copy of x
psllq $$1, %xmm7 // 1-bit left shift of 2 quad words (x1<<1, x0<<1), zero-filled
psrlq $$63, %xmm1 // 2 leading bits, each in the least significant bit of a quad word
psrad $$31, %xmm2 // the MS 32-bit will be either 0 or -1, depending on the MS bit of x
pshufd $$0xc6, %xmm1, %xmm1 // switch the positions of the 2 leading bits
pshufd $$0x03, %xmm2, %xmm2 // the LS 32-bit will be either 0 or -1, depending on the MS bit of x
por %xmm1, %xmm7 // we finally have %xmm7 = rotate_left(x,1)
mov $$0x86, %ecx // the byte to conditionally xor into the bottom byte
movd %ecx, %xmm1 // copy it to %xmm1, the upper bits are 0
pand %xmm2, %xmm1 // %xmm1 = 0 or 0x86, depending on the MS bit of x
pxor %xmm1, %xmm7 // rotate_left(x,1) ^= 0 or 0x86 depending on the MS bit of x
.endm
/*
function : void xts_mult_x(uint8_t *I)
1. load (__m128*) (I) into xmm7
2. macro xts_mult_x_on_xmm7 (i/o @ xmm7, used xmm1/xmm2/ecx)
3. save output (%xmm7) to memory pointed by I
input : 16-byte memory pointed by I
output : same 16-byte memory pointed by I
if kernel code, xmm1/xmm2/xmm7 saved and restored
other used registers : eax/ecx
*/
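/*
 A C sketch of the same tweak-update operation, for reference and future
 porting. This is illustrative only -- the real C version lives in
 xtsClearC.c. It assumes <stdint.h> types, and that byte 0 of I is the
 least significant byte (as in the movd/pxor usage of the macro above).

	static void xts_mult_x_sketch(uint8_t *I)
	{
		int     i;
		uint8_t carry = I[15] >> 7;                    // MS bit of the 128-bit input
		for (i = 15; i > 0; i--)                       // 1-bit left shift across all 16 bytes
			I[i] = (uint8_t)((I[i] << 1) | (I[i-1] >> 7));
		I[0] = (uint8_t)((I[0] << 1) | carry);         // rotate the MS bit into the LS bit
		if (carry)                                     // if the MS bit of x was set,
			I[0] ^= 0x86;                          // note : rotate_left(x,1)^0x86 == (x<<1)^0x87
	}
*/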
.text
.align 4,0x90
.globl _xts_mult_x
_xts_mult_x:
#if defined __x86_64__
#define I %rdi // 1st argument at %rdi for x86_64
#define sp %rsp
#else
mov 4(%esp), %eax // 1st argument at stack, offset 4 for ret_addr for i386
#define I %eax
#define sp %esp
#endif
// if KERNEL code, allocate memory and save xmm1/xmm2/xmm7
#ifdef KERNEL
#if defined __x86_64__
sub $0x38, sp // 8-bytes alignment + 3 * 16 bytes
#else
sub $0x3c, sp // 12-bytes alignment + 3 * 16 bytes
#endif
movaps %xmm1, (sp)
movaps %xmm2, 16(sp)
movaps %xmm7, 32(sp)
#endif
// load, compute, and save
movups (I), %xmm7 // load input tweak 128-bit into %xmm7
xts_mult_x_on_xmm7 // the macro (also used elsewhere) will update %xmm7 as the output
movups %xmm7, (I) // save the xts_mult_x output
// if KERNEL code, restore xmm1/xmm2/xmm7 and deallocate stack memory
#ifdef KERNEL
movaps (sp), %xmm1
movaps 16(sp), %xmm2
movaps 32(sp), %xmm7
#if defined __x86_64__
add $0x38, sp // 8-bytes alignment + 3 * 16 bytes
#else
add $0x3c, sp // 12-bytes alignment + 3 * 16 bytes
#endif
#endif
ret // return
#undef I
#undef sp
/*
The following is x86_64/i386 assembly implementation of
int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx)
Its C code implementation is given in xtsClearC.c
all pointers P/C/T point to a block of 16 bytes. In the following description, P/C/T represent 128-bit data.
The operation of tweak_crypt :
1. C = P ^ T
2. err = aes_encrypt(C, C, ctx)
3. C = C ^ T
4. xts_mult_x(T)
5. return CRYPT_OK
The following is the assembly implementation flow :
1. save used xmm registers (xmm1/xmm7) if kernel code
2. load xmm1 = P, xmm7 = T
3. xmm1 = C = P ^ T
4. write xmm1 to C
5. call aes_encrypt(C,C,ctx)
6. check err (in eax); if err != CRYPT_OK, branch to exit
7. xmm1 = C = C^T = xmm1 ^ xmm7
8. write xmm1 to C
9. update T (in xmm7) via xts_mult_x macro
a. restore xmm registers (xmm1/xmm7) if kernel code
b. return CRYPT_OK (in eax)
Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro
*/
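/*
 A C sketch of tweak_crypt, for reference. The real implementation is in
 xtsClearC.c; the aesedp_encrypt_ctx type and the aes_encrypt(C,C,ctx)
 prototype are assumed from the surrounding aes sources.

	int tweak_crypt_sketch(const uint8_t *P, uint8_t *C, uint8_t *T,
			       aesedp_encrypt_ctx *ctx)
	{
		int err, i;
		for (i = 0; i < 16; i++)
			C[i] = P[i] ^ T[i];                      // 1. C = P ^ T
		if ((err = aes_encrypt(C, C, ctx)) != CRYPT_OK)  // 2. C = AES-enc(C)
			return err;
		for (i = 0; i < 16; i++)
			C[i] ^= T[i];                            // 3. C ^= T
		xts_mult_x(T);                                   // 4. advance the tweak
		return CRYPT_OK;                                 // 5.
	}
*/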
.text
.align 4,0x90
.globl _tweak_crypt
_tweak_crypt:
#if defined __i386__
// push into stack for local use
push %ebp
mov %esp, %ebp
push %ebx
push %edi
push %esi
// allocate stack memory for local use
sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
// load with called arguments
mov 8(%ebp), %eax // P, we need this only briefly, so eax is fine
mov 12(%ebp), %edi // C
mov 16(%ebp), %ebx // T
mov 20(%ebp), %esi // ctx
#define P %eax
#define C %edi
#define T %ebx
#define ctx %esi
#define sp %esp
#else
// x86_64 calling argument order : rdi/rsi/rdx/rcx/r8
// push into stack for local use
push %rbp
mov %rsp, %rbp
push %r12
push %r13
push %r14
push %r15
// allocate stack memory for local use; if kernel code, need to save/restore xmm registers
#ifdef KERNEL
sub $4*16, %rsp // only need 3*16; add 16 extra to make xmm save/restore common with i386
#endif
// load with called arguments, release rdi/rsi/rdx/rcx/r8, as need to call aes_encrypt
mov %rsi, %r13
mov %rdx, %r14
mov %rcx, %r15
#define P %rdi
#define C %r13
#define T %r14
#define ctx %r15
#define sp %rsp
#endif
// if kernel, save used xmm registers
#ifdef KERNEL
movaps %xmm1, 16(sp)
movaps %xmm2, 32(sp)
movaps %xmm7, 48(sp)
#endif
movups (P), %xmm1 // P
movups (T), %xmm7 // T
// set up calling arguments for aes_encrypt
#if defined __i386__
mov C, (%esp) // C
mov C, 4(%esp) // C
mov ctx, 8(%esp) // ctx
#else
mov C, %rdi // C
mov C, %rsi // C
mov ctx, %rdx // ctx
#endif
pxor %xmm7, %xmm1 // C = P ^ T
movups %xmm1, (C) // save C into memory
call _aes_encrypt // err = aes_encrypt(C,C,ctx)
cmp $CRYPT_OK, %eax // check err == CRYPT_OK
jne 9f // if err != CRYPT_OK, exit
movups (C), %xmm1 // load xmm1 = C
pxor %xmm7, %xmm1 // C ^= T
movups %xmm1, (C) // write C with xmm1, xmm1 is freed now, will be changed in the following macro
xts_mult_x_on_xmm7 // update T (on xmm7)
movups %xmm7, (T) // write xmm7 to T
9:
// restore used xmm registers if this is for kernel
#ifdef KERNEL
movaps 16(sp), %xmm1
movaps 32(sp), %xmm2
movaps 48(sp), %xmm7
#endif
// free stack memory and restore callee registers
#if defined __i386__
add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
pop %esi
pop %edi
pop %ebx
#else
#ifdef KERNEL
add $4*16, %rsp // only need 3*16; add 16 extra to make xmm save/restore common with i386
#endif
pop %r15
pop %r14
pop %r13
pop %r12
#endif
// return, eax/rax already has the return val
leave
ret
#undef P
#undef C
#undef T
#undef ctx
#undef sp
/*
The following is x86_64/i386 assembly implementation of
int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim)
TODO : its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs)
This function is a grouped version of the above function tweak_crypt(), so the xmm register save/restore only needs
to happen once for all grouped blocks.
The implementation here probes __cpu_capabilities to detect whether aesni (hw-aes instructions) is available.
If aesni is available, the code branches to optimized code that uses aesni.
The optimized aesni code operates as follows:
while (more than 4 consecutive blocks available) {
do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned)
perform 4 C = P ^ T
perform 4 aes_encrypt (all aes_encrypt instructions interleaved to achieve better throughput)
perform 4 C = C ^ T // T is on 16-byte aligned stack
}
The code then falls through to the scalar code, which sequentially performs what tweak_crypt does :
1. C = P ^ T
2. err = aes_encrypt(C, C, ctx)
3. C = C ^ T
4. xts_mult_x(T)
Note: used xmm registers :
xmm0-xmm5, xmm7 if aesni is available
xmm0-xmm4, xmm7 if aesni is not available.
*/
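/*
 A possible C implementation for the TODO above. Since the grouped function
 is semantically lim back-to-back tweak_crypt calls, a portable sketch
 (not validated against this assembly) is :

	int tweak_crypt_group_sketch(const uint8_t *P, uint8_t *C, uint8_t *T,
				     aesedp_encrypt_ctx *ctx, uint32_t lim)
	{
		int err;
		while (lim--) {
			if ((err = tweak_crypt(P, C, T, ctx)) != CRYPT_OK)
				return err;                      // propagate aes_encrypt errors
			P += 16;                                 // next plaintext block
			C += 16;                                 // next ciphertext block
		}
		return CRYPT_OK;
	}
*/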
.text
.align 4,0x90
.globl _tweak_crypt_group
_tweak_crypt_group:
#if defined __i386__
// push callee-saved registers for local use
push %ebp
mov %esp, %ebp
push %ebx
push %edi
push %esi
// allocate stack memory for local use and/or xmm register save for kernel code
sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni
// 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_encrypt) no aesni
// transfer calling arguments
mov 20(%ebp), %eax // ctx
mov 12(%ebp), %edi // C
mov 16(%ebp), %ebx // T
mov 8(%ebp), %esi // P
mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_encrypt
#define P %esi
#define C %edi
#define T %ebx
#define lim 24(%ebp)
#define sp %esp
#else
// push callee-saved registers for local use
push %rbp
mov %rsp, %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// allocate stack memory for local use and/or xmm register save for kernel code
sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386)
// rdi/rsi/rdx/rcx/r8
// transfer calling arguments
mov %rdi, %r12
mov %rsi, %r13
mov %rdx, %r14
mov %rcx, %r15
mov %r8, %rbx
#define P %r12
#define C %r13
#define T %r14
#define ctx %r15
#define lim %ebx
#define sp %rsp
#endif
#ifdef KERNEL
movaps %xmm0, 0x50(sp)
movaps %xmm1, 0x60(sp)
movaps %xmm2, 0x70(sp)
movaps %xmm3, 0x80(sp)
movaps %xmm4, 0x90(sp)
movaps %xmm7, 0xa0(sp)
#endif
// probe __cpu_capabilities to detect aesni
#if defined __x86_64__
movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
mov (%rax), %eax // %eax = __cpu_capabilities
#else // i386
#if defined KERNEL
leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
mov (%eax), %eax // %eax = __cpu_capabilities
#else
movl _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
test $(kHasAES), %eax
je L_crypt_group_sw // if aesni not available, jump to sw-based implementation
// aesni-based implementation
sub $4, lim // pre-decrement lim by 4
jl 9f // if lim < 4, skip the following code
movups (T), %xmm7 // xmm7 is the tweak before encrypting every 4 blocks
#ifdef KERNEL
movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5
#endif
0:
// derive 4 tweaks using xts_mult_x macro, and save on aligned stack space
// xmm7 will be the tweak for next 4-blocks iteration
#define tweak1 16(sp)
#define tweak2 32(sp)
#define tweak3 48(sp)
#define tweak4 64(sp)
movaps %xmm7, tweak1 // save 1st tweak on stack
xts_mult_x_on_xmm7 // compute 2nd tweak
movaps %xmm7, tweak2 // save 2nd tweak on stack
xts_mult_x_on_xmm7 // compute 3rd tweak
movaps %xmm7, tweak3 // save 3rd tweak on stack
xts_mult_x_on_xmm7 // compute 4th tweak
movaps %xmm7, tweak4 // save 4th tweak on stack
xts_mult_x_on_xmm7 // compute 1st tweak for next iteration
// read 4 Ps
movups (P), %xmm0
movups 16(P), %xmm1
movups 32(P), %xmm2
movups 48(P), %xmm3
// 4 C = P ^ T
pxor tweak1, %xmm0
pxor tweak2, %xmm1
pxor tweak3, %xmm2
pxor tweak4, %xmm3
// 4 interleaved aes_encrypt
#if defined __i386__
mov 8(sp), %ecx // ctx
#undef ctx
#define ctx %ecx
#endif
mov 240(ctx), %eax // aes length
cmp $160, %eax // AES-128 ?
je 160f
cmp $192, %eax // AES-192 ?
je 192f
cmp $224, %eax // AES-256 ?
je 224f
mov $-1, %eax // error : non-supported aes length
#ifdef KERNEL
movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
#endif
jmp L_error_crypt
// definitions, macros, and constructs for the 4-block hw-aes-encrypt
// the following key definitions will also be used in tweak_uncrypt_group
#define key0 0(ctx)
#define key1 16(ctx)
#define key2 32(ctx)
#define key3 48(ctx)
#define key4 64(ctx)
#define key5 80(ctx)
#define key6 96(ctx)
#define key7 112(ctx)
#define key8 128(ctx)
#define key9 144(ctx)
#define keyA 160(ctx)
#define keyB 176(ctx)
#define keyC 192(ctx)
#define keyD 208(ctx)
#define keyE 224(ctx)
#define aes aesenc
#define aeslast aesenclast
// all aes encrypt operations start with the following sequence
.macro aes_common_part
movups key0, %xmm4
movups key1, %xmm5
pxor %xmm4, %xmm0
pxor %xmm4, %xmm1
pxor %xmm4, %xmm2
pxor %xmm4, %xmm3
movups key2, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key3, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups key4, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key5, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups key6, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key7, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups key8, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key9, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups keyA, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
.endm
// all aes encrypt operations end with the following 4 instructions
.macro aes_last
aeslast %xmm4, %xmm0
aeslast %xmm4, %xmm1
aeslast %xmm4, %xmm2
aeslast %xmm4, %xmm3
.endm
.macro aes_128
aes_common_part // encrypt common part
aes_last // encrypt ending part
.endm
.macro aes_192
aes_common_part // encrypt common part
// 10 extra instructions in between common and ending
movups keyB, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups keyC, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
aes_last // encrypt ending part
.endm
.macro aes_256
aes_common_part // encrypt common part
// 20 extra instructions in between common and ending
movups keyB, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups keyC, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups keyD, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups keyE, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
aes_last // encrypt ending part
.endm
160: // AES-128 encrypt
aes_128
jmp 8f
192: // AES-192 encrypt
aes_192
jmp 8f
224: // AES-256 encrypt
aes_256
8:
// 4 C = C ^ T
pxor tweak1, %xmm0
pxor tweak2, %xmm1
pxor tweak3, %xmm2
pxor tweak4, %xmm3
// write 4 Cs
movups %xmm0, (C)
movups %xmm1, 16(C)
movups %xmm2, 32(C)
movups %xmm3, 48(C)
add $64, P
add $64, C
sub $4, lim
jge 0b
#ifdef KERNEL
movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
#endif
movups %xmm7, (T)
9:
xor %eax, %eax // to return CRYPT_OK
add $4, lim // post-increment lim by 4
je 9f // if lim==0, branch to prepare to return
L_crypt_group_sw:
movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop
sub $1, lim // pre-decrement lim by 1
jl 1f // if lim < 1, branch to prepare to return
0:
movups (P), %xmm0 // P
// prepare for calling aes_encrypt
#if defined __i386__
mov C, (%esp) // C
mov C, 4(%esp) // C
// ctx was prepared previously in preamble
#else
mov C, %rdi // C
mov C, %rsi // C
mov ctx, %rdx // ctx
#endif
pxor %xmm7, %xmm0 // C = P ^ T
movups %xmm0, (C) // save C into memory
call _aes_encrypt_xmm_no_save // err = aes_encrypt(C,C,ctx)
cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
jne 9f // if err != CRYPT_OK, branch to exit with error
movups (C), %xmm0 // load xmm0 with C
pxor %xmm7, %xmm0 // C ^= T
movups %xmm0, (C) // save output C
xts_mult_x_on_xmm7
add $16, C // next C
add $16, P // next P
sub $1, lim // lim--
jge 0b // if (lim >= 0) repeat the scalar loop
1: movups %xmm7, (T) // save final tweak
L_error_crypt:
9:
// if kernel, restore used xmm registers
#ifdef KERNEL
movaps 0x50(sp), %xmm0
movaps 0x60(sp), %xmm1
movaps 0x70(sp), %xmm2
movaps 0x80(sp), %xmm3
movaps 0x90(sp), %xmm4
movaps 0xa0(sp), %xmm7
#endif
#if defined __i386__
add $(12+16*8+16*4), %esp
pop %esi
pop %edi
pop %ebx
#else
add $(8+16*8+16*5), %rsp
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
#endif
leave
ret
#undef P
#undef C
#undef T
#undef ctx
#undef sp
/*
The following is x86_64/i386 assembly implementation of
int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx)
Its C code implementation is given in xtsClearC.c
all pointers C/P/T point to a block of 16 bytes. In the following description, C/P/T represent 128-bit data.
The operation of tweak_uncrypt :
1. P = C ^ T
2. err = aes_decrypt(P, P, ctx)
3. P = P ^ T
4. xts_mult_x(T)
5. return CRYPT_OK
The following is the assembly implementation flow :
1. save used xmm registers (xmm1/xmm7) if kernel code
2. load xmm1 = C, xmm7 = T
3. xmm1 = P = C ^ T
4. write xmm1 to P
5. call aes_decrypt(P,P,ctx)
6. check err (in eax); if err != CRYPT_OK, branch to exit
7. xmm1 = P = P^T = xmm1 ^ xmm7
8. write xmm1 to P
9. update T (in xmm7) via xts_mult_x macro
a. restore xmm registers (xmm1/xmm7) if kernel code
b. return CRYPT_OK (in eax)
Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro
*/
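/*
 A C sketch of tweak_uncrypt, for reference. The real implementation is in
 xtsClearC.c; the aesedp_decrypt_ctx type and the aes_decrypt(P,P,ctx)
 prototype are assumed from the surrounding aes sources.

	int tweak_uncrypt_sketch(const uint8_t *C, uint8_t *P, uint8_t *T,
				 aesedp_decrypt_ctx *ctx)
	{
		int err, i;
		for (i = 0; i < 16; i++)
			P[i] = C[i] ^ T[i];                      // 1. P = C ^ T
		if ((err = aes_decrypt(P, P, ctx)) != CRYPT_OK)  // 2. P = AES-dec(P)
			return err;
		for (i = 0; i < 16; i++)
			P[i] ^= T[i];                            // 3. P ^= T
		xts_mult_x(T);                                   // 4. advance the tweak
		return CRYPT_OK;                                 // 5.
	}
*/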
.text
.align 4,0x90
.globl _tweak_uncrypt
_tweak_uncrypt:
#if defined __i386__
// push into stack for local use
push %ebp
mov %esp, %ebp
push %ebx
push %edi
push %esi
// allocate stack memory for local use
sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
// load with called arguments
mov 8(%ebp), %eax // C, we need this only briefly, so eax is fine
mov 12(%ebp), %edi // P
mov 16(%ebp), %ebx // T
mov 20(%ebp), %esi // ctx
#define C %eax
#define P %edi
#define T %ebx
#define ctx %esi
#define sp %esp
#else
// x86_64 calling argument order : rdi/rsi/rdx/rcx/r8
// push into stack for local use
push %rbp
mov %rsp, %rbp
push %r12
push %r13
push %r14
push %r15
// allocate stack memory for local use; if kernel code, need to save/restore xmm registers
#ifdef KERNEL
sub $4*16, %rsp // only need 3*16; add 16 extra to make xmm save/restore common with i386
#endif
// load with called arguments, release rdi/rsi/rdx/rcx/r8, as need to call aes_decrypt
mov %rsi, %r13
mov %rdx, %r14
mov %rcx, %r15
#define C %rdi
#define P %r13
#define T %r14
#define ctx %r15
#define sp %rsp
#endif
// if kernel, save used xmm registers
#ifdef KERNEL
movaps %xmm1, 16(sp)
movaps %xmm2, 32(sp)
movaps %xmm7, 48(sp)
#endif
movups (C), %xmm1 // C
movups (T), %xmm7 // T
// set up calling arguments for aes_decrypt
#if defined __i386__
mov P, (%esp) // P
mov P, 4(%esp) // P
mov ctx, 8(%esp) // ctx
#else
mov P, %rdi // P
mov P, %rsi // P
mov ctx, %rdx // ctx
#endif
pxor %xmm7, %xmm1 // P = C ^ T
movups %xmm1, (P) // save P into memory
call _aes_decrypt // err = aes_decrypt(P,P,ctx)
cmp $CRYPT_OK, %eax // check err == CRYPT_OK
jne 9f // if err != CRYPT_OK, exit
movups (P), %xmm1 // load xmm1 = P
pxor %xmm7, %xmm1 // P ^= T
movups %xmm1, (P) // write P with xmm1, xmm1 is freed now, will be changed in the following macro
xts_mult_x_on_xmm7 // update T (on xmm7)
movups %xmm7, (T) // write xmm7 to T
9:
// restore used xmm registers if this is for kernel
#ifdef KERNEL
movaps 16(sp), %xmm1
movaps 32(sp), %xmm2
movaps 48(sp), %xmm7
#endif
// free stack memory and restore callee registers
#if defined __i386__
add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
pop %esi
pop %edi
pop %ebx
#else
#ifdef KERNEL
add $4*16, %rsp // only need 3*16; add 16 extra to make xmm save/restore common with i386
#endif
pop %r15
pop %r14
pop %r13
pop %r12
#endif
// return, eax/rax already has the return val
leave
ret
#undef P
#undef C
#undef T
#undef ctx
#undef sp
/*
The following is x86_64/i386 assembly implementation of
int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim)
TODO : its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs)
This function is a grouped version of the above function tweak_uncrypt(), so the xmm register save/restore only needs
to happen once for all grouped blocks.
The implementation here probes __cpu_capabilities to detect whether aesni (hw-aes instructions) is available.
If aesni is available, the code branches to optimized code that uses aesni.
The optimized aesni code operates as follows:
while (more than 4 consecutive blocks available) {
do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned)
perform 4 P = C ^ T
perform 4 aes_decrypt (all aes_decrypt instructions interleaved to achieve better throughput)
perform 4 P = P ^ T // T is on 16-byte aligned stack
}
The code then falls through to the scalar code, which sequentially performs what tweak_uncrypt does :
1. P = C ^ T
2. err = aes_decrypt(P, P, ctx)
3. P = P ^ T
4. xts_mult_x(T)
Note: used xmm registers :
xmm0-xmm5, xmm7 if aesni is available
xmm0-xmm4, xmm7 if aesni is not available.
*/
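/*
 A possible C implementation for the TODO above, mirroring the
 tweak_crypt_group sketch earlier in this file (sketch only, not validated
 against this assembly) :

	int tweak_uncrypt_group_sketch(const uint8_t *C, uint8_t *P, uint8_t *T,
				       aesedp_decrypt_ctx *ctx, uint32_t lim)
	{
		int err;
		while (lim--) {
			if ((err = tweak_uncrypt(C, P, T, ctx)) != CRYPT_OK)
				return err;                      // propagate aes_decrypt errors
			C += 16;                                 // next ciphertext block
			P += 16;                                 // next plaintext block
		}
		return CRYPT_OK;
	}
*/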
.text
.align 4,0x90
.globl _tweak_uncrypt_group
_tweak_uncrypt_group:
#if defined __i386__
// push callee-saved registers for local use
push %ebp
mov %esp, %ebp
push %ebx
push %edi
push %esi
// allocate stack memory for local use and/or xmm register save for kernel code
sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni
// 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_decrypt) no aesni
// transfer calling arguments
mov 20(%ebp), %eax // ctx
mov 12(%ebp), %edi // P
mov 16(%ebp), %ebx // T
mov 8(%ebp), %esi // C
mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_decrypt
#define C %esi
#define P %edi
#define T %ebx
#define lim 24(%ebp)
#define sp %esp
#else
// push callee-saved registers for local use
push %rbp
mov %rsp, %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// allocate stack memory for local use and/or xmm register save for kernel code
sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386)
// rdi/rsi/rdx/rcx/r8
// transfer calling arguments
mov %rdi, %r12
mov %rsi, %r13
mov %rdx, %r14
mov %rcx, %r15
mov %r8, %rbx
#define C %r12
#define P %r13
#define T %r14
#define ctx %r15
#define lim %ebx
#define sp %rsp
#endif
#ifdef KERNEL
movaps %xmm0, 0x50(sp)
movaps %xmm1, 0x60(sp)
movaps %xmm2, 0x70(sp)
movaps %xmm3, 0x80(sp)
movaps %xmm4, 0x90(sp)
movaps %xmm7, 0xa0(sp)
#endif
// probe __cpu_capabilities to detect aesni
#if defined __x86_64__
movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
mov (%rax), %eax // %eax = __cpu_capabilities
#else // i386
#if defined KERNEL
leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
mov (%eax), %eax // %eax = __cpu_capabilities
#else
movl _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
test $(kHasAES), %eax
je L_uncrypt_group_sw // if aesni not available, jump to sw-based implementation
// aesni-based implementation
sub $4, lim // pre-decrement lim by 4
jl 9f // if lim < 4, skip the following code
movups (T), %xmm7 // xmm7 is the tweak before decrypting every 4 blocks
#ifdef KERNEL
movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5
#endif
0:
// derive 4 tweaks using xts_mult_x macro, and save on aligned stack space
// xmm7 will be the tweak for next 4-blocks iteration
#define tweak1 16(sp)
#define tweak2 32(sp)
#define tweak3 48(sp)
#define tweak4 64(sp)
movaps %xmm7, tweak1 // save 1st tweak on stack
xts_mult_x_on_xmm7 // compute 2nd tweak
movaps %xmm7, tweak2 // save 2nd tweak on stack
xts_mult_x_on_xmm7 // compute 3rd tweak
movaps %xmm7, tweak3 // save 3rd tweak on stack
xts_mult_x_on_xmm7 // compute 4th tweak
movaps %xmm7, tweak4 // save 4th tweak on stack
xts_mult_x_on_xmm7 // compute 1st tweak for next iteration
// read 4 Cs
movups (C), %xmm0
movups 16(C), %xmm1
movups 32(C), %xmm2
movups 48(C), %xmm3
// 4 P = C ^ T
pxor tweak1, %xmm0
pxor tweak2, %xmm1
pxor tweak3, %xmm2
pxor tweak4, %xmm3
// 4 interleaved aes_decrypt
#if defined __i386__
mov 8(sp), %ecx // ctx
#undef ctx
#define ctx %ecx
#endif
mov 240(ctx), %eax // aes length
cmp $160, %eax // AES-128 ?
je 160f
cmp $192, %eax // AES-192 ?
je 192f
cmp $224, %eax // AES-256 ?
je 224f
mov $-1, %eax // error : non-supported aes length
#ifdef KERNEL
movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
#endif
jmp L_error_uncrypt
// definitions and macros to construct the hw-aes-decrypt
// will reuse previously defined key0 = (ctx), key1 = 16(ctx), ....
#undef aes
#undef aeslast
#define aes aesdec
#define aeslast aesdeclast
.macro aes_decrypt_common
movups key8, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key7, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups key6, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key5, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups key4, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key3, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups key2, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key1, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups key0, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
aeslast %xmm4, %xmm0
aeslast %xmm4, %xmm1
aeslast %xmm4, %xmm2
aeslast %xmm4, %xmm3
.endm
.macro aes_dec_128
movups keyA, %xmm4
movups key9, %xmm5
pxor %xmm4, %xmm0
pxor %xmm4, %xmm1
pxor %xmm4, %xmm2
pxor %xmm4, %xmm3
aes_decrypt_common
.endm
.macro aes_dec_192
movups keyC, %xmm4
movups keyB, %xmm5
pxor %xmm4, %xmm0
pxor %xmm4, %xmm1
pxor %xmm4, %xmm2
pxor %xmm4, %xmm3
movups keyA, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key9, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
aes_decrypt_common
.endm
.macro aes_dec_256
movups keyE, %xmm4
movups keyD, %xmm5
pxor %xmm4, %xmm0
pxor %xmm4, %xmm1
pxor %xmm4, %xmm2
pxor %xmm4, %xmm3
movups keyC, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups keyB, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
movups keyA, %xmm4
aes %xmm5, %xmm0
aes %xmm5, %xmm1
aes %xmm5, %xmm2
aes %xmm5, %xmm3
movups key9, %xmm5
aes %xmm4, %xmm0
aes %xmm4, %xmm1
aes %xmm4, %xmm2
aes %xmm4, %xmm3
aes_decrypt_common
.endm
160: // AES-128 decrypt
aes_dec_128
jmp 8f
192: // AES-192 decrypt
aes_dec_192
jmp 8f
224: // AES-256 decrypt
aes_dec_256
8:
// 4 P = P ^ T
pxor tweak1, %xmm0
pxor tweak2, %xmm1
pxor tweak3, %xmm2
pxor tweak4, %xmm3
// write 4 Ps
movups %xmm0, (P)
movups %xmm1, 16(P)
movups %xmm2, 32(P)
movups %xmm3, 48(P)
add $64, C
add $64, P
sub $4, lim
jge 0b
#ifdef KERNEL
movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
#endif
movups %xmm7, (T)
9:
xor %eax, %eax // to return CRYPT_OK
add $4, lim // post-increment lim by 4
je 9f // if lim==0, branch to prepare to return
L_uncrypt_group_sw:
movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop
sub $1, lim // pre-decrement lim by 1
jl 1f // if lim < 1, branch to prepare to return
0:
movups (C), %xmm0 // C
// prepare for calling aes_decrypt
#if defined __i386__
mov P, (%esp) // P
mov P, 4(%esp) // P
// ctx was prepared previously in preamble
#else
mov P, %rdi // P
mov P, %rsi // P
mov ctx, %rdx // ctx
#endif
pxor %xmm7, %xmm0 // P = C ^ T
movups %xmm0, (P) // save P into memory
call _aes_decrypt_xmm_no_save // err = aes_decrypt(P,P,ctx)
cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
jne 9f // if err != CRYPT_OK, branch to exit with error
movups (P), %xmm0 // load xmm0 with P
pxor %xmm7, %xmm0 // P ^= T
movups %xmm0, (P) // save output P
xts_mult_x_on_xmm7
add $16, C // next C
add $16, P // next P
sub $1, lim // lim--
jge 0b // if (lim >= 0) repeat the scalar loop
1: movups %xmm7, (T) // save final tweak
L_error_uncrypt:
9:
// if kernel, restore used xmm registers
#ifdef KERNEL
movaps 0x50(sp), %xmm0
movaps 0x60(sp), %xmm1
movaps 0x70(sp), %xmm2
movaps 0x80(sp), %xmm3
movaps 0x90(sp), %xmm4
movaps 0xa0(sp), %xmm7
#endif
#if defined __i386__
add $(12+16*8+16*4), %esp
pop %esi
pop %edi
pop %ebx
#else
add $(8+16*8+16*5), %rsp
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
#endif
leave
ret