// ExpandKeyForEncryption.s
/* This file defines _AESExpandKeyForEncryption. It is designed to be
included in another assembly file with the preprocessor #include directive,
to benefit from some assembly-time calculations.
Written by Eric Postpischil, January 2008.
The comments here do not say much about the algorithm; the code just follows
the FIPS-197 specification. I recommend reading the specification before
working with this code or examining the C code in the parent directory that
illustrates key expansion.
*/
/* Routine:
_AESExpandKeyForEncryption.
Function:
Expand the user's cipher key into the key schedule, as defined in
Federal Information Processing Standards Publication 197 (FIPS-197),
November 26, 2001.
Input:
Constant data:
The following names must be locally defined so the assembler
can calculate certain offsets.
static const Word _OLDAESSubBytesWordTable[4][256].
_OLDAESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
SubBytes is defined in FIPS-197. _OLDAESSubBytesWordTable
differs from _OLDAESEncryptTable in that it does not include
the MixColumn operation. It is used in performing the last
round, which differs from the previous rounds in that it
does not include the MixColumn operation. In this file, it is used
to compute SubWord during key expansion.
static const Byte _OLDAESRcon[].
Round constants, beginning with _OLDAESRcon[1] for the first round
(_OLDAESRcon[0] is padding).
Arguments:
Word *ExpandedKey
Address of output.
const AESKey *Key
Address of user's cipher key.
long Nk
Number of four-byte words in user's cipher key.
Output:
The expanded key is written to *ExpandedKey.
*/
.globl _AESExpandKeyForEncryption
.private_extern _AESExpandKeyForEncryption
_AESExpandKeyForEncryption:
/* Entry, register assignments, and dispatch on key size.

The names r0..r7 (with their d/l/h sub-register forms) and TableSize are
not defined in this file; presumably the including file supplies them.

Arguments (see the routine header): ExpandedKey, Key, Nk.
On i386 they are read from the stack; on x86_64 they arrive in
r7 (E), r6 (K) and r2 (Nk).

On exit from this section, ve0..ve3 hold the first four words of the user
key, E and K hold the two pointers, and the flags still hold the result of
comparing Nk with 6 (movd does not modify flags).
*/
#define dr r0d // Dissection register.
#define drl r0l // Low 8 bits of dissection register.
#define drh r0h // Second-lowest 8 bits of dissection register.
#define t0 r1
#define t0d r1d // Low 32 bits of t0.
#define offset r5 // Address offset and loop sentinel.
#define R r6 // Address of round constant.
#define K r6 // User key pointer, second x86_64 argument.
// R and K overlap.
#define E r7 // Expanded key pointer, first x86_64 argument.
// ve0..ve3 carry the most recently produced key words.
#define ve0 %xmm0
#define ve1 %xmm1
#define ve2 %xmm2
#define ve3 %xmm3
// vt0..vt3 are temporaries for table lookups and the round constant.
#define vt3 %xmm4
#define vt2 %xmm5
#define vt1 %xmm6
#define vt0 %xmm7
// Address of 4-byte entry number index in sub-table number table of STable.
#define LookupS(table, index) (table)*TableSize(STable, index, 4)
// Push new stack frame. r5 must be saved because it is used as offset.
push r5
// Save registers.
#if defined __i386__
push r6
push r7
#define RegisterSaveSize (2*4)
#elif defined __x86_64__
#define RegisterSaveSize (0*8)
// Add pushes of r12 to r15 if used.
#endif
#define LocalsSize 0
#define StackFrame (LocalsSize+RegisterSaveSize)
// Locals plus the registers we pushed after the new stack frame.
/* Allocate locals here, before any use of Argument() and before the
comparison of Nk. Argument() includes LocalsSize in its displacement, so
the stack pointer must already be adjusted when arguments are read, and
sub modifies the flags, so it must not sit between cmp and the
conditional jumps below. With LocalsSize equal to 0 nothing is assembled.
*/
#if 0 < LocalsSize
sub $LocalsSize, r4 // Allocate space on stack.
#endif
#if defined __i386__
// Define location of argument i.
// The 8 skips the saved r5 and the return address.
#define Argument(i) StackFrame+8+4*(i)(r4)
// Load arguments.
mov Argument(0), E
mov Argument(1), K
#define Nk Argument(2)
#elif defined __x86_64__
#define Nk r2 // Number of words in key. Overlaps STable.
#endif
// Compare now; the result is consumed by je/jg after the loads below.
cmp $6, Nk
// Stop using r2 for Nk and start using it for STable.
#undef Nk
#define STable r2
// First words of expanded key are copied from user key.
movd 0*4(K), ve0
movd 1*4(K), ve1
movd 2*4(K), ve2
movd 3*4(K), ve3
// Nk == 6 and Nk > 6 branch away; anything less than 6 is treated as 4.
je EKeyHas6Words
jg EKeyHas8Words
// Fall through to EKeyHas4Words.
/* Case Nk == 4, reached by falling through from the dispatch above.

On entry ve0..ve3 hold the four user key words loaded from K.
Each pass of the loop produces the next four words of the schedule:
ve0 ^= Rcon ^ SubWord(RotWord(ve3)), then ve1 ^= ve0, ve2 ^= ve1,
ve3 ^= ve2, storing each word as it is produced.
*/
EKeyHas4Words:
// K cannot be used after we write to R, since they use the same register.
#if defined __i386__
/* Get the address of local label 0 into STable, then form the addresses
of the two tables relative to it.
*/
call 0f // Push program counter onto stack.
0:
pop STable // Get program counter.
lea _OLDAESRcon-0b(STable), R
lea _OLDAESSubBytesWordTable-0b(STable), STable
#elif defined __x86_64__
lea _OLDAESRcon(%rip), R
lea _OLDAESSubBytesWordTable(%rip), STable
#endif
// With a four-word key, there are ten rounds (eleven 16-byte key blocks).
/* offset starts at minus the number of bytes the loop will generate and E
is advanced by the same amount, so E+offset initially addresses the start
of the output and offset reaches zero when the last block is written.
*/
mov $-10*4*4, offset
sub offset, E
// Store initial words of expanded key, which are copies of user's key.
movd ve0, 0*4(E, offset)
movd ve1, 1*4(E, offset)
movd ve2, 2*4(E, offset)
movd ve3, 3*4(E, offset)
// Loop: one 16-byte block per pass, ten passes.
1:
add $1, R // Advance pointer.
movd ve3, dr // Put previous word into dissection register.
movzx (R), t0d // Get round constant.
movd t0d, vt3
pxor vt3, ve0 // XOR with word from four words back.
// Perform SubWord(RotWord(dr)).
/* The rotation is folded into the choice of sub-table: sub-table i holds
SubBytes(j) << 8*i (see the routine header), so sending byte 0 to
sub-table 3 and bytes 1..3 to sub-tables 0..2 rotates while substituting.
*/
movzx drl, t0
movd LookupS(3, t0), vt0 // Look up byte 0 in table 3.
movzx drh, t0d
movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
shr $16, dr // Bring bytes 2 and 3 down into drl and drh.
movzx drl, t0d
movd LookupS(1, t0), vt2 // Look up byte 2 in table 1.
movzx drh, t0d
movd LookupS(2, t0), vt3 // Look up byte 3 in table 2.
// Combine the four partial results and fold them into ve0.
pxor vt1, vt0
pxor vt3, vt2
pxor vt0, ve0
pxor vt2, ve0
// This add sets the flags tested by jne below; pxor and movd leave them alone.
add $4*4, offset
// Chain to successive words.
movd ve0, 0*4(E, offset)
pxor ve0, ve1
movd ve1, 1*4(E, offset)
pxor ve1, ve2
movd ve2, 2*4(E, offset)
pxor ve2, ve3
movd ve3, 3*4(E, offset)
jne 1b // Continue until offset reaches zero.
// Pop stack and restore registers.
#if 0 < LocalsSize
add $LocalsSize, r4
#endif
#if defined __i386__
// Restore in reverse order of the pushes in the prologue.
pop r7
pop r6
#elif defined __x86_64
// Add pops of r15 to r12 if used.
// NOTE(review): spelled __x86_64 here but __x86_64__ in the prologue; this
// branch is empty, so nothing currently depends on it.
#endif
pop r5
ret
/* Case Nk == 6.

xmm4 and xmm5 are re-aliased from vt3/vt2 to ve4/ve5 for this case so that
six key words can be carried in registers; only vt0 and vt1 remain as
temporaries. The aliases are switched back after the case.

On entry ve0..ve3 hold the first four user key words loaded from K.
*/
// Reset definitions for next case.
#undef vt3
#undef vt2
#define ve4 %xmm4
#define ve5 %xmm5
EKeyHas6Words:
// Load the remaining two user key words while K is still valid.
movd 4*4(K), ve4
movd 5*4(K), ve5
// K cannot be used after we write to R, since they use the same register.
#if defined __i386__
/* Get the address of local label 0 into STable, then form the addresses
of the two tables relative to it.
*/
call 0f // Push program counter onto stack.
0:
pop STable // Get program counter.
lea _OLDAESRcon-0b(STable), R
lea _OLDAESSubBytesWordTable-0b(STable), STable
#elif defined __x86_64__
lea _OLDAESRcon(%rip), R
lea _OLDAESSubBytesWordTable(%rip), STable
#endif
/* With a six-word key, there are twelve rounds (thirteen 16-byte key
blocks).
*/
/* offset starts at minus 12*16 bytes and E is advanced by the same amount;
the loop steps offset by 24 bytes (six words) and ends when it reaches zero.
*/
mov $-12*4*4, offset
sub offset, E
// Store initial words of expanded key, which are copies of user's key.
movd ve0, 0*4(E, offset)
movd ve1, 1*4(E, offset)
movd ve2, 2*4(E, offset)
movd ve3, 3*4(E, offset)
movd ve4, 4*4(E, offset)
movd ve5, 5*4(E, offset)
/* Jump into loop body. The key expansion processes six four-byte words per
iteration. 52 are needed in the key. So only four are needed in the last
iteration.
*/
jmp 2f
1:
// Continue chaining to successive words.
// Words 4 and 5 of the group; skipped on the final pass because the loop
// exits before reaching them.
pxor ve3, ve4
movd ve4, 4*4(E, offset)
pxor ve4, ve5
movd ve5, 5*4(E, offset)
2:
add $1, R // Advance pointer.
movd ve5, dr // Put previous word into dissection register.
movzx (R), t0 // Get round constant.
movd t0d, vt1
pxor vt1, ve0 // XOR with word from six words back.
// Perform SubWord(RotWord(dr)).
/* The rotation is folded into the choice of sub-table: byte 0 goes to
sub-table 3 and bytes 1..3 go to sub-tables 0..2. With only vt0 and vt1
free, results are folded into ve0 two at a time.
*/
movzx drl, t0d
movd LookupS(3, t0), vt0 // Look up byte 0 in table 3.
movzx drh, t0d
movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
shr $16, dr // Bring bytes 2 and 3 down into drl and drh.
movzx drl, t0d
pxor vt1, vt0
pxor vt0, ve0 // Fold in the first two lookups, freeing vt0 and vt1.
movd LookupS(1, t0), vt0 // Look up byte 2 in table 1.
movzx drh, t0d
movd LookupS(2, t0), vt1 // Look up byte 3 in table 2.
pxor vt1, vt0
pxor vt0, ve0
// This add sets the flags tested by jne below; pxor and movd leave them alone.
add $6*4, offset
// Chain to successive words.
movd ve0, 0*4(E, offset)
pxor ve0, ve1
movd ve1, 1*4(E, offset)
pxor ve1, ve2
movd ve2, 2*4(E, offset)
pxor ve2, ve3
movd ve3, 3*4(E, offset)
jne 1b // Continue until offset reaches zero.
// Pop stack and restore registers.
#if 0 < LocalsSize
add $LocalsSize, r4
#endif
#if defined __i386__
// Restore in reverse order of the pushes in the prologue.
pop r7
pop r6
#elif defined __x86_64
// Add pops of r15 to r12 if used.
// NOTE(review): spelled __x86_64 here but __x86_64__ in the prologue; this
// branch is empty, so nothing currently depends on it.
#endif
pop r5
ret
// Reset definitions for next case.
// xmm4 and xmm5 go back to being the temporaries vt3 and vt2.
#undef ve4
#undef ve5
#define vt3 %xmm4
#define vt2 %xmm5
/* Case Nk == 8 (taken for any Nk greater than 6).

On entry ve0..ve3 hold the first four user key words loaded from K.
Only four words are kept in registers; words from eight back are reloaded
from the expanded key already written to memory. Each pass of the loop
produces eight words in two halves: the half at label 2 uses
SubWord(RotWord()) with a round constant, and the half at label 1 uses
SubWord() alone.
*/
EKeyHas8Words:
// Store initial words of expanded key, which are copies of user's key.
// E has not been biased yet, so these go to the start of the output.
movd ve0, 0*4(E)
movd ve1, 1*4(E)
movd ve2, 2*4(E)
movd ve3, 3*4(E)
// Load the second four user key words while K is still valid.
movd 4*4(K), ve0
movd 5*4(K), ve1
movd 6*4(K), ve2
movd 7*4(K), ve3
// K cannot be used after we write to R, since they use the same register.
#if defined __i386__
/* Get the address of local label 0 into STable, then form the addresses
of the two tables relative to it.
*/
call 0f // Push program counter onto stack.
0:
pop STable // Get program counter.
lea _OLDAESRcon-0b(STable), R
lea _OLDAESSubBytesWordTable-0b(STable), STable
#elif defined __x86_64__
lea _OLDAESRcon(%rip), R
lea _OLDAESSubBytesWordTable(%rip), STable
#endif
/* With an eight-word key, there are fourteen rounds (fifteen 16-byte key
blocks).
*/
/* offset starts at minus 14*16 bytes and E is advanced by the same amount;
the loop steps offset by 32 bytes (eight words) and ends when it reaches
zero.
*/
mov $-14*4*4, offset
sub offset, E
// Store initial words of expanded key, which are copies of user's key.
movd ve0, 4*4(E, offset)
movd ve1, 5*4(E, offset)
movd ve2, 6*4(E, offset)
movd ve3, 7*4(E, offset)
/* Jump into loop body. The key expansion processes eight four-byte words per
iteration. 60 are needed in the key. So only four are needed in the last
iteration.
*/
jmp 2f
1:
// Second half of a group: words 4..7 relative to E+offset.
movd ve3, dr // Put previous word into dissection register.
/* Get word from eight words back (it is four words back from where E
currently points, and we use it to prepare the value to be stored
four words beyond where E currently points).
*/
movd -4*4(E, offset), ve0
// Perform SubWord(dr).
// No rotation here, so byte i goes to sub-table i.
movzx drl, t0
movd LookupS(0, t0), vt0 // Look up byte 0 in table 0.
movzx drh, t0d
movd LookupS(1, t0), vt1 // Look up byte 1 in table 1.
shr $16, dr // Bring bytes 2 and 3 down into drl and drh.
movzx drl, t0d
movd LookupS(2, t0), vt2 // Look up byte 2 in table 2.
movzx drh, t0d
movd LookupS(3, t0), vt3 // Look up byte 3 in table 3.
// Combine the four partial results and fold them into ve0.
pxor vt1, vt0
pxor vt3, vt2
pxor vt0, ve0
pxor vt2, ve0
movd -3*4(E, offset), ve1 // Get words from eight words back.
movd -2*4(E, offset), ve2
movd -1*4(E, offset), ve3
// Chain to successive words.
movd ve0, 4*4(E, offset)
pxor ve0, ve1
movd ve1, 5*4(E, offset)
pxor ve1, ve2
movd ve2, 6*4(E, offset)
pxor ve2, ve3
movd ve3, 7*4(E, offset)
2:
// First half of a group: words 0..3 of the next group, with round constant.
add $1, R // Advance pointer.
movd ve3, dr // Put previous word into dissection register.
movzx (R), t0d // Get round constant.
movd t0d, vt1
movd 0*4(E, offset), ve0 // Get word from eight words back.
pxor vt1, ve0
// Perform SubWord(RotWord(dr)).
/* The rotation is folded into the choice of sub-table: byte 0 goes to
sub-table 3 and bytes 1..3 go to sub-tables 0..2.
*/
movzx drl, t0
movd LookupS(3, t0), vt0 // Look up byte 0 in table 3.
movzx drh, t0d
movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
shr $16, dr // Bring bytes 2 and 3 down into drl and drh.
movzx drl, t0d
movd LookupS(1, t0), vt2 // Look up byte 2 in table 1.
movzx drh, t0d
movd LookupS(2, t0), vt3 // Look up byte 3 in table 2.
// Combine the four partial results and fold them into ve0.
pxor vt1, vt0
pxor vt3, vt2
pxor vt0, ve0
pxor vt2, ve0
// Remaining words from eight words back, read before offset advances.
movd 1*4(E, offset), ve1
movd 2*4(E, offset), ve2
movd 3*4(E, offset), ve3
// This add sets the flags tested by jne below; pxor and movd leave them alone.
add $8*4, offset
// Chain to successive words.
movd ve0, 0*4(E, offset)
pxor ve0, ve1
movd ve1, 1*4(E, offset)
pxor ve1, ve2
movd ve2, 2*4(E, offset)
pxor ve2, ve3
movd ve3, 3*4(E, offset)
jne 1b // Continue until offset reaches zero.
// Pop stack and restore registers.
#if 0 < LocalsSize
add $LocalsSize, r4
#endif
#if defined __i386__
// Restore in reverse order of the pushes in the prologue.
pop r7
pop r6
#elif defined __x86_64
// Add pops of r15 to r12 if used.
// NOTE(review): spelled __x86_64 here but __x86_64__ in the prologue; this
// branch is empty, so nothing currently depends on it.
#endif
pop r5
ret
/* Remove the macros defined by this file so they do not leak into the file
that includes it. Nk, ve4 and ve5 were already removed where their use
ended. K is included here because it is defined alongside R and E and was
otherwise left defined.
*/
#undef Address
#undef Argument
#undef E
#undef K
#undef LocalsSize
#undef LookupS
#undef R
#undef RegisterSaveSize
#undef STable
#undef StackFrame
#undef dr
#undef drh
#undef drl
#undef offset
#undef t0
#undef t0d
#undef ve0
#undef ve1
#undef ve2
#undef ve3
#undef vt0
#undef vt1
#undef vt2
#undef vt3