

/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 These subroutines implement multiple block AES modes for ECB, CBC, CFB,
 OFB and CTR encryption.  The code provides support for the VIA Advanced
 Cryptography Engine (ACE).

 NOTE: In the following subroutines, the AES contexts (ctx) must be
 16 byte aligned if VIA ACE is being used
*/
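
/*
 For reference, the CBC recurrences that the two routines in this file implement
 (informal notation; C_0 is the caller-supplied iv and K the expanded key):

	encrypt:  C_i = AES_encrypt_K( P_i ^ C_{i-1} )
	decrypt:  P_i = AES_decrypt_K( C_i ) ^ C_{i-1}
*/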

/* modified 3/5/10 cclee */
/* Cleaned up the VIA ACE related code and hand optimized aes_cbc_encrypt and aes_cbc_decrypt. */
/* Moved the xmm register save/restore from inside the callee functions into these 2 caller functions. */

/* added code comments/description and HW AES detection with an execution branch, cclee 3-13-10 */

#ifdef KERNEL
#include <i386/cpu_capabilities.h> 	// to use __cpu_capabilities&kHasAES to detect Intel Westmere AES HW
#else
#include <System/i386/cpu_capabilities.h> 	// to use __cpu_capabilities&kHasAES to detect Intel Westmere AES HW
#endif
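
/*
 A rough sketch (not compiled; helper names are hypothetical) of the dispatch performed
 at the top of _aes_encrypt_cbc and _aes_decrypt_cbc below: read __cpu_capabilities and,
 when kHasAES is set, branch to the AES-NI code in aes_modes_hw.s; otherwise run the
 software path in this file.  read_capabilities() stands in for the kernel / commpage
 access shown in the assembly, and aes_encrypt_cbc_sw() for the fall-through software loop.
*/
#if 0
aes_rval aes_encrypt_cbc(const unsigned char *ibuf, const unsigned char *in_iv, unsigned int num_blk,
					 unsigned char *obuf, const aes_encrypt_ctx ctx[1])
{
		if (read_capabilities() & kHasAES)								// hypothetical accessor
			return aes_encrypt_cbc_hw(ibuf, in_iv, num_blk, obuf, ctx);	// AES-NI path in aes_modes_hw.s
		return aes_encrypt_cbc_sw(ibuf, in_iv, num_blk, obuf, ctx);		// hypothetical name for the software loop below
}
#endif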

#if 0

// TODO:
// aes_ecb_encrypt and aes_ecb_decrypt are not present in gen/aescrypt.c;
// they are currently compiled from aes_modes.c.
// An assembly implementation can be added here if needed.

aes_rval aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf,
                    int len, const aes_encrypt_ctx ctx[1])
{   int nb = len >> 4;

    if(len & (AES_BLOCK_SIZE - 1)) return 1;
    while(nb--) {
        aes_encrypt(ibuf, obuf, ctx);
        ibuf += AES_BLOCK_SIZE;
        obuf += AES_BLOCK_SIZE;
    }
    return 0;
}

aes_rval aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf,
                    int len, const aes_decrypt_ctx ctx[1])
{   int nb = len >> 4;

    if(len & (AES_BLOCK_SIZE - 1)) return 1;
    while(nb--) {
        aes_decrypt(ibuf, obuf, ctx);
        ibuf += AES_BLOCK_SIZE;
        obuf += AES_BLOCK_SIZE;
    }
    return 0;
}
#endif

#if 0
aes_rval aes_encrypt_cbc(const unsigned char *ibuf, const unsigned char *in_iv, unsigned int num_blk,
					 unsigned char *obuf, const aes_encrypt_ctx ctx[1])
{
		unsigned char iv[16];
		int i;
		
		for (i = 0; i < 16; i++) iv[i] = *(in_iv + i);

		while (num_blk--) {
			for (i = 0; i < 16; i++) iv[i] ^= ibuf[i];		// iv ^= ibuf, 128-bit
            aes_encrypt(iv, iv, ctx);
            memcpy(obuf, iv, AES_BLOCK_SIZE);
            ibuf += AES_BLOCK_SIZE;
            obuf += AES_BLOCK_SIZE;
			
		}		

		return 0;
}
#endif

	.text
	.align	4,0x90
	.globl	_aes_encrypt_cbc
_aes_encrypt_cbc:

	// detect AES HW
	// if AES HW detected, branch to AES-HW-specific function _aes_encrypt_cbc_hw (aes_modes_hw.s)
	// o.w., fall through to the original AES-SW function

#if defined	__x86_64__
	movq	__cpu_capabilities@GOTPCREL(%rip), %rax			// %rax -> __cpu_capabilities
	mov		(%rax), %eax									// %eax = __cpu_capabilities
#else
#ifdef KERNEL
	leal	__cpu_capabilities, %eax						// %eax -> __cpu_capabilities
	mov		(%eax), %eax									// %eax = __cpu_capabilities
#else
	mov    _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
	test	$(kHasAES), %eax								// kHasAES & __cpu_capabilities
	jne		_aes_encrypt_cbc_hw								// if AES HW detected, branch to HW-specific code

	// save registers and allocate stack memory for xmm registers and calling arguments (i386 only)
#if	defined	__i386__
	push	%ebp
	mov		%esp, %ebp
	push	%ebx					// to be used as ibuf
	push	%edi					// to be used as obuf
	sub		$(16+16+7*16), %esp		// 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm)
	mov		%esi, 12(%esp)			// save %esi in the unused 4 bytes; %esi will be used as num_blk

	#define	sp	%esp
#else	// __x86_64__
	push	%rbp
	mov		%rsp, %rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub		$(8+16+5*16+16), %rsp	// 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency)	

	#define	sp	%rsp
#endif

	// save xmm registers (KERNEL build only)
	// xmm6-xmm7 are used locally
	// xmm0-xmm2 (x86_64) or xmm0-xmm4 (i386) are used inside _aes_encrypt_xmm_no_save and are not restored there
	// the 16-byte slot at 48(sp) is skipped by the xmm saves; it holds the iv passed to aes_encrypt
	// the first 16 bytes at (sp) hold the calling arguments (and the saved %esi) on i386,
	// and are left unused on x86_64 so that both architectures share the same frame layout
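	//
	// For reference, the resulting frame layout (byte offsets from sp, derived from the
	// loads/stores in this routine):
	//   i386:   0..11  aes_encrypt call arguments,  12..15  saved %esi,  16..47  saved xmm7/xmm6,
	//           48..63  iv[16],  64..143  saved xmm0-xmm4
	//   x86_64: 0..15  unused,  16..47  saved xmm7/xmm6,  48..63  iv[16],
	//           64..111  saved xmm0-xmm2,  112..119  alignment padding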

#ifdef	KERNEL
	movaps	%xmm7, 16(sp)
	movaps	%xmm6, 32(sp)
	movaps	%xmm0, 64(sp)
	movaps	%xmm1, 80(sp)
	movaps	%xmm2, 96(sp)
#if defined	__i386__
	movaps	%xmm3, 112(sp)
	movaps	%xmm4, 128(sp)
#endif
#endif

	// set up registers from calling arguments

#if defined	__i386__

	mov		12(%ebp), %eax			// in_iv
	mov		24(%ebp), %edx			// ctx
	movups	(%eax), %xmm7			// in_iv	
	lea		48(%esp), %eax			// &iv[0]
	mov		%eax, (%esp)			// 1st iv for aes_encrypt
	mov		%eax, 4(%esp)			// 2nd iv for aes_encrypt
	mov		%edx, 8(%esp)			// ctx for aes_encrypt
	mov		8(%ebp), %ebx			// ibuf
	mov		16(%ebp), %esi			// num_blk
	mov		20(%ebp), %edi			// obuf

	#define	ibuf	%ebx
	#define	obuf	%edi
	#define num_blk	%esi	

#else	//	__x86_64__, calling arguments order : rdi/rsi/rdx/rcx/r8

	mov		%rdi, %rbx				// ibuf
	lea		48(sp), %r12			// &iv
	movups	(%rsi), %xmm7			// in_iv
	mov		%rdx, %r13				// num_blk
	mov		%rcx, %r14				// obuf
	mov		%r8, %r15				// ctx	

	#define	ibuf	%rbx
	#define	iv		%r12
	#define	num_blk	%r13d
	#define	obuf	%r14	
	#define	ctx		%r15

#endif

	cmp		$1, num_blk				// num_blk vs 1
	jl		9f						// if num_blk < 1, branch to bypass the main loop
0:
	movups	(ibuf), %xmm6			// ibuf
#if defined	__i386__
	lea		48(sp), %eax			// &iv[0]
	pxor	%xmm6, %xmm7			// iv ^= ibuf
	movups	%xmm7, (%eax)			// save iv
#else
	pxor	%xmm6, %xmm7			// iv ^= ibuf
	movups	%xmm7, (iv)				// save iv
	mov		iv, %rdi				// 1st calling argument for aes_encrypt
	mov		iv, %rsi				// 2nd calling argument for aes_encrypt
	mov		ctx, %rdx				// 3rd calling argument for aes_encrypt
#endif
	call	_aes_encrypt_xmm_no_save	// aes_encrypt(iv, iv, ctx)
#if defined __i386__
	leal	48(%esp), %eax			// &iv[0]
	movups	(%eax), %xmm7			// read iv
#else
	movups	(iv), %xmm7				// read iv
#endif
	movups	%xmm7, (obuf)			// memcpy(obuf, iv, AES_BLOCK_SIZE);
	add		$16, ibuf				// ibuf += AES_BLOCK_SIZE; 
	add		$16, obuf				// obuf += AES_BLOCK_SIZE;	
	sub		$1, num_blk				// num_blk --
	jg		0b						// if num_blk > 0, repeat the loop
9:	

L_crypt_cbc_done:

	// restore xmm registers due to kernel use
#ifdef	KERNEL
	movaps	16(sp), %xmm7
	movaps	32(sp), %xmm6
	movaps	64(sp), %xmm0
	movaps	80(sp), %xmm1
	movaps	96(sp), %xmm2
#if defined	__i386__
	movaps	112(sp), %xmm3
	movaps	128(sp), %xmm4
#endif
#endif

	xor		%eax, %eax				// to return 0 for SUCCESS

#if	defined	__i386__
	mov		12(%esp), %esi			// restore %esi
	add		$(16+16+7*16), %esp		// 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm)
	pop		%edi
	pop		%ebx
#else
	add		$(8+16+5*16+16), %rsp	// 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency)	
	pop		%r15
	pop		%r14
	pop		%r13
	pop		%r12
	pop		%rbx
#endif
	leave
	ret

#if 0
aes_rval aes_decrypt_cbc(const unsigned char *ibuf, const unsigned char *in_iv, unsigned int num_blk,
					 unsigned char *obuf, const aes_decrypt_ctx ctx[1])
{
		unsigned char iv[16], tmp[16];
		int i;
		
		for (i = 0; i < 16; i++) iv[i] = *(in_iv + i);

		while (num_blk--) {

            memcpy(tmp, ibuf, AES_BLOCK_SIZE);
            aes_decrypt(ibuf, obuf, ctx);
			for (i = 0; i < 16; i++) obuf[i] ^= iv[i];		// obuf ^= iv, 128-bit
            memcpy(iv, tmp, AES_BLOCK_SIZE);
            ibuf += AES_BLOCK_SIZE;
            obuf += AES_BLOCK_SIZE;
		}

		return 0;
}
#endif

	.text
	.align	4,0x90
	.globl	_aes_decrypt_cbc
_aes_decrypt_cbc:

	// detect AES HW
	// if AES HW detected, branch to AES-HW-specific function _aes_decrypt_cbc_hw (aes_modes_hw.s)
	// o.w., fall through to the original AES-SW function

#if defined	__x86_64__
	movq	__cpu_capabilities@GOTPCREL(%rip), %rax			// %rax -> __cpu_capabilities
	mov		(%rax), %eax									// %eax = __cpu_capabilities
#else
#ifdef KERNEL
	leal	__cpu_capabilities, %eax						// %eax -> __cpu_capabilities
	mov		(%eax), %eax									// %eax = __cpu_capabilities
#else
	mov    _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
	test	$(kHasAES), %eax								// kHasAES & __cpu_capabilities
	jne		_aes_decrypt_cbc_hw

	// save registers and allocate stack memory for xmm registers and calling arguments (i386 only)
#if	defined	__i386__
	push	%ebp
	mov		%esp, %ebp
	push	%ebx					// to be used as ibuf
	push	%edi					// to be used as obuf
	sub		$(16+16+7*16), %esp		// 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm)
	mov		%esi, 12(%esp)			// save %esi in the unused 4 bytes; %esi will be used as num_blk

	#define	sp	%esp
#else	// __x86_64__
	push	%rbp
	mov		%rsp, %rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub		$(8+16+5*16+16), %rsp	// 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency)	

	#define	sp	%rsp
#endif

	// save xmm registers (KERNEL build only)
	// xmm6-xmm7 are used locally
	// xmm0-xmm2 (x86_64) or xmm0-xmm4 (i386) are used inside _aes_decrypt_xmm_no_save and are not restored there
	// the 16-byte slot at 48(sp) is skipped by the xmm saves; it is left unused here (the running
	// iv stays in xmm7) and is kept only so the frame layout matches _aes_encrypt_cbc
	// the first 16 bytes at (sp) hold the calling arguments (and the saved %esi) on i386,
	// and are left unused on x86_64
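	//
	// For reference, the resulting frame layout (byte offsets from sp):
	//   i386:   0..11  aes_decrypt call arguments,  12..15  saved %esi,  16..47  saved xmm7/xmm6,
	//           48..63  unused,  64..143  saved xmm0-xmm4
	//   x86_64: 0..15  unused,  16..47  saved xmm7/xmm6,  48..63  unused,
	//           64..111  saved xmm0-xmm2,  112..119  alignment padding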

#ifdef	KERNEL
	movaps	%xmm7, 16(sp)
	movaps	%xmm6, 32(sp)
	movaps	%xmm0, 64(sp)
	movaps	%xmm1, 80(sp)
	movaps	%xmm2, 96(sp)
#if defined	__i386__
	movaps	%xmm3, 112(sp)
	movaps	%xmm4, 128(sp)
#endif
#endif

	// set up registers from calling arguments

#if defined	__i386__
	mov		12(%ebp), %eax			// in_iv
	mov		24(%ebp), %edx			// ctx
	movups	(%eax), %xmm7			// in_iv	
	mov		%edx, 8(%esp)			// ctx for aes_encrypt
	mov		8(%ebp), %ebx			// ibuf
	mov		16(%ebp), %esi			// num_blk
	mov		20(%ebp), %edi			// obuf

	#define	ibuf	%ebx
	#define	obuf	%edi
	#define num_blk	%esi	
#else	//	__x86_64__, rdi/rsi/rdx/rcx/r8
	mov		%rdi, %rbx				// ibuf
	movups	(%rsi), %xmm7			// in_iv
	mov		%rdx, %r13				// num_blk
	mov		%rcx, %r14				// obuf 
	mov		%r8, %r15				// ctx	

	#define	ibuf	%rbx
	#define	num_blk	%r13d
	#define	obuf	%r14	
	#define	ctx		%r15

#endif
	//	memcpy(tmp, ibuf, AES_BLOCK_SIZE);
	//	aes_decrypt(ibuf, obuf, ctx);
	//	obuf ^= iv;
	//	memcpy(iv, tmp, AES_BLOCK_SIZE);
	//	ibuf += AES_BLOCK_SIZE;
	//	obuf += AES_BLOCK_SIZE;

	cmp		$1, num_blk					// num_blk vs 1
	jl		L_crypt_cbc_done			// if num_blk < 1, bypass the main loop, jump to finishing code
0:
	movups	(ibuf), %xmm6				// tmp
#if defined	__i386__
	mov		ibuf, (sp)					// ibuf
	mov		obuf, 4(sp)					// obuf
#else
	mov		ibuf, %rdi					// ibuf 
	mov		obuf, %rsi					// obuf
	mov		ctx, %rdx					// ctx
#endif
	call	_aes_decrypt_xmm_no_save	// aes_decrypt(ibuf, obuf, ctx)
	movups	(obuf), %xmm0				// obuf
	pxor	%xmm7, %xmm0				// obuf ^= iv;
	movaps	%xmm6, %xmm7				// memcpy(iv, tmp, AES_BLOCK_SIZE);
	movups	%xmm0, (obuf)				// update obuf
	add		$16, ibuf					// ibuf += AES_BLOCK_SIZE; 
	add		$16, obuf					// obuf += AES_BLOCK_SIZE;	
	sub		$1, num_blk					// num_blk --
	jg		0b							// if num_blk > 0, repeat the loop
9:	

	// we are done here, the finishing code is identical to that in aes_encrypt_cbc, so just jump to there
	jmp		L_crypt_cbc_done