ExpandKeyForEncryption.s   [plain text]


/*	This file defines _aes_encrypt_key, _aes_encrypt_key128,
	_aes_encrypt_key192, and _aes_encrypt_key256.  It is designed to be
	included in another assembly file with the preprocessor #include directive,
	to benefit from some assembly-time calculations.

	Written by Eric Postpischil, January 2008.

	The comments here do not say much about the algorithm; the code just
	follows the FIPS-197 specification.  I recommend reading the specification
	before working with this code or examining the C code in the parent
	directory that illustrates key expansion.
*/


/*	Routines:

		_aes_encrypt_key.

		_aes_encrypt_key128, _aes_encrypt_key192, and _aes_encrypt_key256.

	Function:

		Expand the user's cipher key into the key schedule, as defined in
		Federal Information Processing Standards Publication 197 (FIPS-197),
		November 26, 2001.

	Input:

		Constant data:

			The following names must be locally defined so the assembler
			can calculate certain offsets.

			static const Word _AESSubBytesWordTable[4][256].

				_AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
				SubBytes is defined in FIPS-197.  _AESSubBytesWordTable
				differs from _AESEncryptTable in that it does not include
				the MixColumn operation.  It is used in performing the last
				round, which differs fromm the previous rounds in that it
				does not include the MixColumn operation.

			static const Byte _AESRcon[].

				Round constants, beginning with AESRcon[1] for the first round
				(AESRcon[0] is padding.)
	
		Arguments:

			const uint8_t *Key

				Address of user's cipher key.

			int Length

				Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in
				user's cipher key.

				This argument is used with _aes_encrypt_key.  It is not
				present for the other routines.  In those routines, Context
				is the second argument.

			aes_encrypt_ctx *Context

				Structure to contain the expanded key beginning at offset
				ContextKey and a four-byte "key length" beginning at offset
				ContextKeyLength.  The "key length" is the number of bytes from
				the start of the first round key to the start of the last round
				key.  That is 16 less than the number of bytes in the entire
				key.

	Output:

		The expanded key and the "key length" are written to *Context.

	Return:

		aes_rval	// -1 if "key length" is invalid.  0 otherwise.
*/

/* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */
#ifdef KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

	.text
	.globl _aes_encrypt_key
//	.private_extern	_aes_encrypt_key
_aes_encrypt_key:

    // detect AES HW, cclee-3-13-10
#if defined __x86_64__
    movq    __cpu_capabilities@GOTPCREL(%rip), %rax				// %rax -> __cpu_capabilities
    mov     (%rax), %eax										// %eax  = __cpu_capabilities
#else
#if defined KERNEL
    leal    __cpu_capabilities, %eax							// %eax -> __cpu_capabilities
    mov     (%eax), %eax										// %eax  = __cpu_capabilities
#else
	mov    _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
    test    $(kHasAES), %eax									// __cpu_capabilities & kHasAES
    jne     _aes_encrypt_key_hw									// if AES HW detected, branch to _aes_encrypt_key_hw

#define	dr		r0d				// Dissection register.
#define	drl		r0l				// Low 8 bits of dissection register.
#define	drh		r0h				// Second-lowest 8 bits of dissection register.

#define	t0		r1
#define	t0d		r1d				// Low 32 bits of t0.

#define	offset	Arch(r5, r11)	// Address offset and loop sentinel.

#define	R		r7				// Address of round constant.
#define	K		r7				// User key pointer.
	// R and K overlap.

#define	E		r6				// Expanded key pointer.

#define	ve0		%xmm0
#define	ve1		%xmm1
#define	ve2		%xmm2
#define	ve3		%xmm3
#define	vt3		%xmm4
#define	vt2		%xmm5
#define	vt1		%xmm6
#define	vt0		%xmm7

#if defined __i386__
	#define	LookupS(table, index)	\
		_AESSubBytesWordTable+(table)*TableSize(, index, 4)
#elif defined __x86_64__
	#define	LookupS(table, index)	(table)*TableSize(STable, index, 4)
#endif

	/*	Save registers and set SaveSize to the number of bytes pushed onto the
		stack so far, including the caller's return address.
	*/
	push	r3
	#if defined __i386__
		push	r5
		push	r6
		push	r7
		#define	SaveSize	(5*4)
	#else
		#define	SaveSize	(2*8)
	#endif

	/*	Number of bytes used for local variables:

			8 16-byte spaces to save XMM registers.
	*/
	#define	LocalsSize	(8*16)

	#if 0 < LocalsSize
		// Padding to position stack pointer at a multiple of 16 bytes.
		#define	Padding	(15 & -(SaveSize + LocalsSize))
		sub		$Padding + LocalsSize, r4	// Allocate space on stack.
	#else
		#define	Padding	0
	#endif

	/*	StackFrame is the number of bytes in our stack frame, from caller's
		stack pointer to ours (so it includes the return address).
	*/
	#define	StackFrame	(SaveSize + Padding + LocalsSize)

	// Save xmm registers.
	movaps	%xmm0, 0*16(r4)
	movaps	%xmm1, 1*16(r4)
	movaps	%xmm2, 2*16(r4)
	movaps	%xmm3, 3*16(r4)
	movaps	%xmm4, 4*16(r4)
	movaps	%xmm5, 5*16(r4)
	movaps	%xmm6, 6*16(r4)
	movaps	%xmm7, 7*16(r4)

#if defined __i386__

	// Define location of argument i.
	#define	Argument(i)	StackFrame+4*(i)(r4)

	#define	Nk		t0d

	// Load arguments.
	mov		Argument(2), E
	mov		Argument(1), Nk
	mov		Argument(0), K

#elif defined __x86_64__

	#define	Nk		r9d			// Number of words in key.
	mov		r6d, Nk				// Move Nk argument out of way.
	mov		r2, E				// Move E argument to common register.

#endif

	// Dispatch on key length.
	cmp		$128, Nk
	jge		2f
	shl		$3, Nk				// Convert from bytes to bits.
	cmp		$128, Nk
2:
	je		EKeyHas4Words
	cmp		$192, Nk
	je		EKeyHas6Words
	cmp		$256, Nk
	je		EKeyHas8Words
	mov		$-1, r0				// Return error.
	jmp		9f

// Stop using Nk.
#undef	Nk

	.globl _aes_encrypt_key128
//	.private_extern	_aes_encrypt_key128
_aes_encrypt_key128:

	/*	Save registers and set SaveSize to the number of bytes pushed onto the
		stack so far, including the caller's return address.
	*/
	push	r3
	#if defined __i386__
		push	r5
		push	r6
		push	r7
		#define	SaveSize	(5*4)
	#else
		#define	SaveSize	(2*8)
	#endif

	/*	Number of bytes used for local variables:

			8 16-byte spaces to save XMM registers.
	*/
	#define	LocalsSize	(8*16)

	#if 0 < LocalsSize
		// Padding to position stack pointer at a multiple of 16 bytes.
		#define	Padding	(15 & -(SaveSize + LocalsSize))
		sub		$Padding + LocalsSize, r4	// Allocate space on stack.
	#else
		#define	Padding	0
	#endif

	/*	StackFrame is the number of bytes in our stack frame, from caller's
		stack pointer to ours (so it includes the return address).
	*/
	#define	StackFrame	(SaveSize + Padding + LocalsSize)

	// Save xmm registers.
	movaps	%xmm0, 0*16(r4)
	movaps	%xmm1, 1*16(r4)
	movaps	%xmm2, 2*16(r4)
	movaps	%xmm3, 3*16(r4)
	movaps	%xmm4, 4*16(r4)
	movaps	%xmm5, 5*16(r4)
	movaps	%xmm6, 6*16(r4)
	movaps	%xmm7, 7*16(r4)

	#if defined __i386__

		// Load arguments.
		#define	Argument(i)	StackFrame+4*(i)(r4)
		mov		Argument(1), E
		mov		Argument(0), K

	#endif

// Merge point for _aes_encrypt_key and _aes_encrypt_key128.
EKeyHas4Words:

#define	e0	r2d
#define	e1	r3d
#define	e2	Arch(r5d, r11d)
#define	e3	r7d

	// First words of expanded key are copied from user key.
	mov		0*4(K), e0
	mov		1*4(K), e1
	mov		2*4(K), e2
	mov		3*4(K), e3

	movl	$10*16, ContextKeyLength(E)	// Set "key length."

	#if 0 != ContextKey
		add		$ContextKey, E
	#endif

	// K cannot be used after we write to R, since they use the same register.

	// Cache round constants in output buffer.  The last is a sentinel.
	movb	$0x01,  1*16(E)
	movb	$0x02,  2*16(E)
	movb	$0x04,  3*16(E)
	movb	$0x08,  4*16(E)
	movb	$0x10,  5*16(E)
	movb	$0x20,  6*16(E)
	movb	$0x40,  7*16(E)
	movb	$0x80,  8*16(E)
	movb	$0x1b,  9*16(E)
	movb	$0x36, 10*16(E)

	#if defined __x86_64__

		#define	STable	r8
		lea		_AESSubBytesWordTable(%rip), STable

	#endif

	// Store initial words of expanded key, which are copies of user's key.
	mov		e0, 0*4(E)
	mov		e1, 1*4(E)
	mov		e2, 2*4(E)
	mov		e3, 3*4(E)

1:
	mov		e3, dr				// Put previous word into dissection register.

	// Perform SubWord(RotWord(dr)).
	movzx	drl, t0
	xor		LookupS(3, t0), e0		// Look up byte 0 in table 3.
	movzx	drh, t0d
	xor		LookupS(0, t0), e0		// Look up byte 1 in table 0.
	shr		$16, dr
	movzx	drl, t0d
	xor		LookupS(1, t0), e0		// Look up byte 2 in table 1.
	movzx	drh, t0d
	xor		LookupS(2, t0), e0		// Look up byte 3 in table 2.

	add		$4*4, E

	movzx	(E), t0d				// Get cached round constant.
	xor		t0d, e0					// XOR with word from four words back.

	// Chain to successive words.
	mov		e0, 0*4(E)
	xor		e0, e1
	mov		e1, 1*4(E)
	xor		e1, e2
	mov		e2, 2*4(E)
	xor		e2, e3
	mov		e3, 3*4(E)

	cmp		$0x36, t0d				// Was this the last round constant?

	jne		1b

	xor		r0, r0		// Return success.

9:
	// Pop stack and restore registers.
	movaps	7*16(r4), %xmm7
	movaps	6*16(r4), %xmm6
	movaps	5*16(r4), %xmm5
	movaps	4*16(r4), %xmm4
	movaps	3*16(r4), %xmm3
	movaps	2*16(r4), %xmm2
	movaps	1*16(r4), %xmm1
	movaps	0*16(r4), %xmm0
	#if 0 < LocalsSize
		add		$Padding + LocalsSize, r4
	#endif
	#if defined __i386__
		pop		r7
		pop		r6
		pop		r5
	#endif
	pop		r3

	ret


// Reset definitions for next case.
#undef	e0
#undef	e1
#undef	e2
#undef	e3

#undef	vt3
#undef	vt2
#define	ve4	%xmm4
#define	ve5	%xmm5


	.globl _aes_encrypt_key192
//	.private_extern	_aes_encrypt_key192
_aes_encrypt_key192:

	/*	Save registers and set SaveSize to the number of bytes pushed onto the
		stack so far, including the caller's return address.
	*/
	push	r3
	#if defined __i386__
		push	r5
		push	r6
		push	r7
		#define	SaveSize	(5*4)
	#else
		#define	SaveSize	(2*8)
	#endif

	/*	Number of bytes used for local variables:

			8 16-byte spaces to save XMM registers.
	*/
	#define	LocalsSize	(8*16)

	#if 0 < LocalsSize
		// Padding to position stack pointer at a multiple of 16 bytes.
		#define	Padding	(15 & -(SaveSize + LocalsSize))
		sub		$Padding + LocalsSize, r4	// Allocate space on stack.
	#else
		#define	Padding	0
	#endif

	/*	StackFrame is the number of bytes in our stack frame, from caller's
		stack pointer to ours (so it includes the return address).
	*/
	#define	StackFrame	(SaveSize + Padding + LocalsSize)

	// Save xmm registers.
	movaps	%xmm0, 0*16(r4)
	movaps	%xmm1, 1*16(r4)
	movaps	%xmm2, 2*16(r4)
	movaps	%xmm3, 3*16(r4)
	movaps	%xmm4, 4*16(r4)
	movaps	%xmm5, 5*16(r4)
	movaps	%xmm6, 6*16(r4)
	movaps	%xmm7, 7*16(r4)

	#if defined __i386__

		// Load arguments.
		#define	Argument(i)	StackFrame+4*(i)(r4)
		mov		Argument(1), E
		mov		Argument(0), K

	#endif

// Merge point for _aes_encrypt_key and _aes_encrypt_key192.
EKeyHas6Words:

	// First words of expanded key are copied from user key.
	movd	0*4(K), ve0
	movd	1*4(K), ve1
	movd	2*4(K), ve2
	movd	3*4(K), ve3

	movl	$12*16, ContextKeyLength(E)	// Set "key length."

	#if 0 != ContextKey
		add		$ContextKey, E
	#endif

	movd	4*4(K), ve4
	movd	5*4(K), ve5

	// K cannot be used after we write to R, since they use the same register.

	#if defined __i386__

		lea		_AESRcon, R

	#elif defined __x86_64__

		lea		_AESRcon(%rip), R
		lea		_AESSubBytesWordTable(%rip), STable

	#endif

	/*	With a six-word key, there are twelve rounds (thirteen 16-byte key
		blocks).
	*/
	mov		$-12*4*4, offset
	sub		offset, E

	// Store initial words of expanded key, which are copies of user's key.
	movd	ve0, 0*4(E, offset)
	movd	ve1, 1*4(E, offset)
	movd	ve2, 2*4(E, offset)
	movd	ve3, 3*4(E, offset)
	movd	ve4, 4*4(E, offset)
	movd	ve5, 5*4(E, offset)

/*	Jump into loop body.  The key expansion processes six four-byte words per
	iteration.  52 are needed in the key.  So only four are needed in the last
	iteration.
*/
	jmp		2f		
1:
	// Continue chaining to successive words.
	pxor	ve3, ve4
	movd	ve4, 4*4(E, offset)
	pxor	ve4, ve5
	movd	ve5, 5*4(E, offset)
2:
	add		$1, R				// Advance pointer.
	movd	ve5, dr				// Put previous word into dissection register.
	movzx	(R), t0				// Get round constant.
	movd	t0d, vt1
	pxor	vt1, ve0			// XOR with word from six words back.

	// Perform SubWord(RotWord(dr)).
	movzx	drl, t0d
	movd	LookupS(3, t0), vt0		// Look up byte 0 in table 3.
	movzx	drh, t0d
	movd	LookupS(0, t0), vt1		// Look up byte 1 in table 0.
	shr		$16, dr
	movzx	drl, t0d
	pxor	vt1, vt0
	pxor	vt0, ve0
	movd	LookupS(1, t0), vt0		// Look up byte 2 in table 1.
	movzx	drh, t0d
	movd	LookupS(2, t0), vt1		// Look up byte 3 in table 2.
	pxor	vt1, vt0
	pxor	vt0, ve0

	add		$6*4, offset

	// Chain to successive words.
	movd	ve0, 0*4(E, offset)
	pxor	ve0, ve1
	movd	ve1, 1*4(E, offset)
	pxor	ve1, ve2
	movd	ve2, 2*4(E, offset)
	pxor	ve2, ve3
	movd	ve3, 3*4(E, offset)

	jne		1b

	xor		r0, r0		// Return success.

	// Pop stack and restore registers.
	movaps	7*16(r4), %xmm7
	movaps	6*16(r4), %xmm6
	movaps	5*16(r4), %xmm5
	movaps	4*16(r4), %xmm4
	movaps	3*16(r4), %xmm3
	movaps	2*16(r4), %xmm2
	movaps	1*16(r4), %xmm1
	movaps	0*16(r4), %xmm0
	#if 0 < LocalsSize
		add		$Padding + LocalsSize, r4
	#endif
	#if defined __i386__
		pop		r7
		pop		r6
		pop		r5
	#endif
	pop		r3

	ret


// Reset definitions for next case.
#undef	ve4
#undef	ve5
#define	vt3	%xmm4
#define	vt2	%xmm5


	.globl _aes_encrypt_key256
//	.private_extern	_aes_encrypt_key256
_aes_encrypt_key256:

	/*	Save registers and set SaveSize to the number of bytes pushed onto the
		stack so far, including the caller's return address.
	*/
	push	r3
	#if defined __i386__
		push	r5
		push	r6
		push	r7
		#define	SaveSize	(5*4)
	#else
		#define	SaveSize	(2*8)
	#endif

	/*	Number of bytes used for local variables:

			8 16-byte spaces to save XMM registers.
	*/
	#define	LocalsSize	(8*16)

	#if 0 < LocalsSize
		// Padding to position stack pointer at a multiple of 16 bytes.
		#define	Padding	(15 & -(SaveSize + LocalsSize))
		sub		$Padding + LocalsSize, r4	// Allocate space on stack.
	#else
		#define	Padding	0
	#endif

	/*	StackFrame is the number of bytes in our stack frame, from caller's
		stack pointer to ours (so it includes the return address).
	*/
	#define	StackFrame	(SaveSize + Padding + LocalsSize)

	// Save xmm registers.
	movaps	%xmm0, 0*16(r4)
	movaps	%xmm1, 1*16(r4)
	movaps	%xmm2, 2*16(r4)
	movaps	%xmm3, 3*16(r4)
	movaps	%xmm4, 4*16(r4)
	movaps	%xmm5, 5*16(r4)
	movaps	%xmm6, 6*16(r4)
	movaps	%xmm7, 7*16(r4)

	#if defined __i386__

		// Load arguments.
		#define	Argument(i)	StackFrame+4*(i)(r4)
		mov		Argument(1), E
		mov		Argument(0), K

	#endif

// Merge point for _aes_encrypt_key and _aes_encrypt_key256.
EKeyHas8Words:

	// First words of expanded key are copied from user key.
	movd	0*4(K), ve0
	movd	1*4(K), ve1
	movd	2*4(K), ve2
	movd	3*4(K), ve3

	movl	$14*16, ContextKeyLength(E)	// Set "key length."

	#if 0 != ContextKey
		add		$ContextKey, E
	#endif

	// Store initial words of expanded key, which are copies of user's key.
	movd	ve0, 0*4(E)
	movd	ve1, 1*4(E)
	movd	ve2, 2*4(E)
	movd	ve3, 3*4(E)
	movd	4*4(K), ve0
	movd	5*4(K), ve1
	movd	6*4(K), ve2
	movd	7*4(K), ve3

	// K cannot be used after we write to R, since they use the same register.

	#if defined __i386__

		lea		_AESRcon, R

	#elif defined __x86_64__

		lea		_AESRcon(%rip), R
		lea		_AESSubBytesWordTable(%rip), STable

	#endif

	/*	With an eight-word key, there are fourteen rounds (fifteen 16-byte key
	 	blocks).
	*/
	mov		$-14*4*4, offset
	sub		offset, E

	// Store initial words of expanded key, which are copies of user's key.
	movd	ve0, 4*4(E, offset)
	movd	ve1, 5*4(E, offset)
	movd	ve2, 6*4(E, offset)
	movd	ve3, 7*4(E, offset)

/*	Jump into loop body.  The key expansion processes eight four-byte words per
	iteration.  60 are needed in the key.  So only four are needed in the last
	iteration.
*/
	jmp		2f		
1:
	movd	ve3, dr				// Put previous word into dissection register.

	/*	Get word from eight words back (it is four words back from where E
	 	currently points, and we use it to prepare the value to be stored
		four words beyond where E currently points).
	*/
	movd	-4*4(E, offset), ve0

	// Perform SubWord(dr).
	movzx	drl, t0
	movd	LookupS(0, t0), vt0		// Look up byte 0 in table 0.
	movzx	drh, t0d
	movd	LookupS(1, t0), vt1		// Look up byte 1 in table 1.
	shr		$16, dr
	movzx	drl, t0d
	movd	LookupS(2, t0), vt2		// Look up byte 2 in table 2.
	movzx	drh, t0d
	movd	LookupS(3, t0), vt3		// Look up byte 3 in table 3.
	pxor	vt1, vt0
	pxor	vt3, vt2
	pxor	vt0, ve0
	pxor	vt2, ve0

	movd	-3*4(E, offset), ve1	// Get words from eight words back.
	movd	-2*4(E, offset), ve2
	movd	-1*4(E, offset), ve3

	// Chain to successive words.
	movd	ve0, 4*4(E, offset)
	pxor	ve0, ve1
	movd	ve1, 5*4(E, offset)
	pxor	ve1, ve2
	movd	ve2, 6*4(E, offset)
	pxor	ve2, ve3
	movd	ve3, 7*4(E, offset)

2:
	add		$1, R				// Advance pointer.
	movd	ve3, dr				// Put previous word into dissection register.
	movzx	(R), t0d			// Get round constant.
	movd	t0d, vt1
	movd	0*4(E, offset), ve0	// Get word from eight words back.
	pxor	vt1, ve0

	// Perform SubWord(RotWord(dr)).
	movzx	drl, t0
	movd	LookupS(3, t0), vt0		// Look up byte 0 in table 3.
	movzx	drh, t0d
	movd	LookupS(0, t0), vt1		// Look up byte 1 in table 0.
	shr		$16, dr
	movzx	drl, t0d
	movd	LookupS(1, t0), vt2		// Look up byte 2 in table 1.
	movzx	drh, t0d
	movd	LookupS(2, t0), vt3		// Look up byte 3 in table 2.
	pxor	vt1, vt0
	pxor	vt3, vt2
	pxor	vt0, ve0
	pxor	vt2, ve0

	movd	1*4(E, offset), ve1
	movd	2*4(E, offset), ve2
	movd	3*4(E, offset), ve3

	add		$8*4, offset

	// Chain to successive words.
	movd	ve0, 0*4(E, offset)
	pxor	ve0, ve1
	movd	ve1, 1*4(E, offset)
	pxor	ve1, ve2
	movd	ve2, 2*4(E, offset)
	pxor	ve2, ve3
	movd	ve3, 3*4(E, offset)

	jne		1b

	xor		r0, r0		// Return success.

	// Pop stack and restore registers.
	movaps	7*16(r4), %xmm7
	movaps	6*16(r4), %xmm6
	movaps	5*16(r4), %xmm5
	movaps	4*16(r4), %xmm4
	movaps	3*16(r4), %xmm3
	movaps	2*16(r4), %xmm2
	movaps	1*16(r4), %xmm1
	movaps	0*16(r4), %xmm0
	#if 0 < LocalsSize
		add		$Padding + LocalsSize, r4
	#endif
	#if defined __i386__
		pop		r7
		pop		r6
		pop		r5
	#endif
	pop		r3

	ret


#undef	Address
#undef	Argument
#undef	E
#undef	K
#undef	LocalsSize
#undef	LookupS
#undef	Padding
#undef	R
#undef	SaveSize
#undef	STable
#undef	StackFrame
#undef	dr
#undef	drh
#undef	drl
#undef	offset
#undef	t0
#undef	t0d
#undef	ve0
#undef	ve1
#undef	ve2
#undef	ve3
#undef	vt0
#undef	vt1
#undef	vt2
#undef	vt3