// ExpandKeyForDecryption.s
/* This file defines _AESExpandKeyForDecryption. It is designed to be
included in another assembly file with the preprocessor #include directive,
to benefit from some assembly-time calculations.
Written by Eric Postpischil, January 2008.
The comments here do not say much about the algorithm; the code follows the
FIPS-197 specification. Read the specification before working with this
code, or examine the C code in the parent directory that illustrates key
expansion.
One complication is that this routine both expands the key and applies
InvMixColumn to most of the words in the expanded key. This modifies the
key for use with the Equivalent Inverse Cipher.
During key expansion, there are sequences of four or six words that are
produced like this:
E[i+0] = E[i+0-Nk] ^ f(E[i-1]), where f is some function.
E[i+1] = E[i+1-Nk] ^ E[i+0].
E[i+2] = E[i+2-Nk] ^ E[i+1].
E[i+3] = E[i+3-Nk] ^ E[i+2].
When Nk is four or eight, the sequence stops there. When it is six, it
goes on for two more words. Let I be the InvMixColumn function. For the
Equivalent Inverse Cipher, we want to store I(E[i+0]), I(E[i+1]),
I(E[i+2]), I(E[i+3]) (and two more when Nk is six). However, we do not
need to calculate I four times. In AES' finite field, I is a linear
combination of the four bytes of its input. The ^ operation on the bits
that represent field elements is an addition in the Galois field. So
I(a ^ b) = I(a) ^ I(b). Then we have:
I(E[i+0]) = I(E[i+0-Nk] ^ f(E[i-1])) = I(E[i+0-Nk]) ^ I(f(E[i-1])).
I(E[i+1]) = I(E[i+1-Nk]) ^ I(E[i+0]).
I(E[i+2]) = I(E[i+2-Nk]) ^ I(E[i+1]).
I(E[i+3]) = I(E[i+3-Nk]) ^ I(E[i+2]).
To compute this, we compute I(f(E[i-1])) and XOR it with the previously
stored I(E[i+0-Nk]) to get I(E[i+0]). Then we XOR that with the previously
stored I(E[i+1-Nk]) to get I(E[i+1]), and so on.
Note that to compute I(f(E[i-1])), we need to have E[i-1]. So we have to
compute the pre-InvMixColumn words of the expanded key; it is not
sufficient to have the post-InvMixColumn words.
*/
/* Routine:
_AESExpandKeyForDecryption.
Function:
Expand the user's cipher key into the key schedule, as defined in
Federal Information Processing Standards Publication 197 (FIPS-197),
November 26, 2001.
For decryption, the key is modified as shown in Figure 15 in FIPS-197,
to support the Equivalent Inverse Cipher.
Input:
Constant data:
The following names must be locally defined so the assembler
can calculate certain offsets.
static const Word _OLDAESSubBytesWordTable[4][256].
_OLDAESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
SubBytes is defined in FIPS-197. _OLDAESSubBytesWordTable
differs from _OLDAESEncryptTable in that it does not include
the MixColumn operation. It is used in performing the last
round, which differs from the previous rounds in that it
does not include the MixColumn operation.
static const Word _OLDAESInvMixColumnTable[4][256].
_OLDAESInvMixColumnTable[i][j] contains the contribution of byte
j to element i of the InvMixColumn operation.
The four bytes of the word _OLDAESInvMixColumnTable[0][j] are:
{0xe}*{j}, {0x9}*{j}, {0xd}*{j}, {0xb}*{j},
listed in increasing address order, where multiplication is
performed in the Galois field. {j} designates the element of
the Galois field represented by j. _OLDAESInvMixColumnTable[i][j] has
the same bytes, rotated right in the order shown above.
static const Byte _OLDAESRcon[].
Round constants, beginning with _OLDAESRcon[1] for the first round
(_OLDAESRcon[0] is padding).
Arguments:
Word *ExpandedKey
Address of output.
const AESKey *Key
Address of user's cipher key.
long Nk
Number of four-byte words in user's cipher key.
Output:
The expanded key is written to *ExpandedKey.
*/
/*	Register assignments for this file.  The names r0 through r7 (and their
	d, l and h forms) are not defined here; they are expected to be supplied
	by the file that includes this one.
*/

/*	Dissection register: a 32-bit word is placed here and taken apart one
	byte at a time.  drl is its low 8 bits and drh is its second-lowest
	8 bits.
*/
#define	dr		r0d
#define	drl		r0l
#define	drh		r0h

/*	Scratch register used as a table index; t0d is its low 32 bits. */
#define	t0		r1
#define	t0d		r1d

/*	Table addresses and loop control.
	STable holds the address of the SubBytes table and overlaps Nk.
	ITable holds the address of the InvMixColumn table.
	offset is both an address offset and the loop sentinel.
*/
#define	STable	r2
#define	ITable	r3
#define	offset	r5

/*	Pointers.
	E is the expanded key pointer, the first x86_64 argument.
	K is the user key pointer, the second x86_64 argument.
	R is the address of the current round constant.
	K and R share one register, so K is dead once R has been written.
*/
#define	E		r7
#define	K		r6
#define	R		r6

/*	Untransformed words of the expanded key currently being worked on. */
#define	ve0		%xmm0
#define	ve1		%xmm1
#define	ve2		%xmm2
#define	ve3		%xmm3
#define	ve4		%xmm4
#define	ve5		%xmm5

/*	Vector temporaries.  Note that vt0 is the higher-numbered register. */
#define	vt0		%xmm7
#define	vt1		%xmm6

/*	Memory operands for four-byte table entries: entry idx of sub-table tbl,
	addressed as displacement tbl*TableSize from the table base with idx
	scaled by 4.  TableSize is expected to come from the including file.
*/
#define	LookupS(tbl, idx)	(tbl)*TableSize(STable, idx, 4)
#define	LookupI(tbl, idx)	(tbl)*TableSize(ITable, idx, 4)
/*	InvMixColumn

	In:		dr = 32-bit word to transform.
	Out:	vt0 = InvMixColumn(dr), built by XORing one table entry per
			input byte (byte n is looked up in sub-table n).
	Also modified: dr (shifted right by 16), t0, vt1 and the flags.

	This is an internal subroutine of _AESExpandKeyForDecryption.  It does
	not follow the ABI and relies on the register assignments of its caller.
*/
InvMixColumn:
	movzx	drl, t0						// t0 = byte 0.
	movd	LookupI(0, t0), vt0			// Start the result with table 0.
	movzx	drh, t0d					// t0 = byte 1.
	shr		$16, dr						// Low half consumed; expose bytes 2 and 3.
	movd	LookupI(1, t0), vt1			// Table 1 entry for byte 1.
	pxor	vt1, vt0
	movzx	drl, t0d					// t0 = byte 2.
	movd	LookupI(2, t0), vt1			// Table 2 entry for byte 2.
	movzx	drh, t0d					// t0 = byte 3.
	pxor	vt1, vt0
	movd	LookupI(3, t0), vt1			// Table 3 entry for byte 3.
	pxor	vt1, vt0
	ret
/*	SubWordRotWord

	In:		dr = 32-bit word, vt0 = value to accumulate into.
	Out:	vt0 = vt0 XOR SubWord(RotWord(dr)).
	Also modified: dr (shifted right by 16), t0, vt1 and the flags.

	The rotation is obtained by choosing the sub-table: input byte n is
	looked up in sub-table (n+3) mod 4.
*/
.macro SubWordRotWord
	movzx	drl, t0						// t0 = byte 0.
	movd	LookupS(3, t0), vt1			// Byte 0 goes through table 3.
	movzx	drh, t0d					// t0 = byte 1.
	pxor	vt1, vt0
	movd	LookupS(0, t0), vt1			// Byte 1 goes through table 0.
	shr		$$16, dr					// Low half consumed; expose bytes 2 and 3.
	pxor	vt1, vt0
	movzx	drl, t0d					// t0 = byte 2.
	movd	LookupS(1, t0), vt1			// Byte 2 goes through table 1.
	movzx	drh, t0d					// t0 = byte 3.
	pxor	vt1, vt0
	movd	LookupS(2, t0), vt1			// Byte 3 goes through table 2.
	pxor	vt1, vt0
.endmacro
/*	SubWord

	In:		dr = 32-bit word.
	Out:	vt0 = SubWord(dr); the previous contents of vt0 are discarded.
	Also modified: dr (shifted right by 16), t0, vt1 and the flags.

	Input byte n is looked up in sub-table n.
*/
.macro SubWord
	movzx	drl, t0						// t0 = byte 0.
	movd	LookupS(0, t0), vt0			// Start the result with table 0.
	movzx	drh, t0d					// t0 = byte 1.
	shr		$$16, dr					// Low half consumed; expose bytes 2 and 3.
	movd	LookupS(1, t0), vt1			// Table 1 entry for byte 1.
	pxor	vt1, vt0
	movzx	drl, t0d					// t0 = byte 2.
	movd	LookupS(2, t0), vt1			// Table 2 entry for byte 2.
	movzx	drh, t0d					// t0 = byte 3.
	pxor	vt1, vt0
	movd	LookupS(3, t0), vt1			// Table 3 entry for byte 3.
	pxor	vt1, vt0
.endmacro
/*	_AESExpandKeyForDecryption: entry point and dispatch on key size.

	Arguments are described at the top of this file.  On i386 they are read
	from the stack; on x86_64 the register defines in this file expect
	ExpandedKey in E, Key in K and Nk in r2.

	This part saves the registers the routine modifies, loads the first four
	words of the user key into ve0 through ve3, and then branches on Nk:
		Nk == 6		goes to DKeyHas6Words,
		Nk >  6		goes to DKeyHas8Words,
		otherwise	falls through to DKeyHas4Words.
	Each of those paths restores the registers and returns on its own.
*/
	.globl _AESExpandKeyForDecryption
	.private_extern _AESExpandKeyForDecryption
_AESExpandKeyForDecryption:

	// Push new stack frame.
	push	r5

	// Save registers.
	push	r3
#if defined __i386__
	push	r6
	push	r7
#define	RegisterSaveSize	(3*4)
#elif defined __x86_64__
#define	RegisterSaveSize	(1*8)
	// Add pushes of r12 to r15 if used.
#endif

#define	LocalsSize	0
#define	StackFrame	(LocalsSize+RegisterSaveSize)
	// Locals plus the registers we pushed after the new stack frame.

/*	Define stack offset to storage space for local data.  This is in the red
	zone.  We point far enough down to allow space for eight four-byte words
	plus a return address (4 or 8 bytes on i386 or x86_64) for our internal
	subroutine calls.

	NOTE(review): this storage lies below the stack pointer.  Confirm that
	nothing asynchronous can write there in every environment this is built
	for.
*/
#define	Local	(-8*4-8)

/*	Allocate locals before anything that depends on the stack pointer or on
	the flags.  The Argument offsets below already include LocalsSize, and
	sub modifies the flags, so it must not sit between the compare and the
	conditional branches.  With LocalsSize equal to zero nothing is emitted.
*/
#if 0 < LocalsSize
	sub		$LocalsSize, r4			// Allocate space on stack.
#endif

#if defined __i386__

	// Define location of argument i.
#define	Argument(i)	StackFrame+8+4*(i)(r4)

	// Load arguments.
	mov		Argument(0), E
	mov		Argument(1), K
#define	Nk	Argument(2)

#elif defined __x86_64__

#define	Nk	r2					// Number of words in key.  Overlaps STable.

#endif

	// First words of expanded key are copied from user key.
	movd	0*4(K), ve0
	movd	1*4(K), ve1
	movd	2*4(K), ve2
	movd	3*4(K), ve3

	// Select the path for this key size.  The compare is kept next to its
	// branches so that no flag-modifying instruction can come between them.
	cmp		$6, Nk
	je		DKeyHas6Words
	jg		DKeyHas8Words
	// Fall through to DKeyHas4Words.
/* DKeyHas4Words: expansion of a four-word (Nk = 4) key.
On entry, ve0 through ve3 hold the four words of the user key. The schedule
has eleven 16-byte blocks: block 0 is stored as is, blocks 1 through 9 are
stored with InvMixColumn applied, and block 10 is stored as is. Throughout,
ve0 through ve3 carry the untransformed words of the most recent block.
This path restores the saved registers and returns to the caller.
*/
DKeyHas4Words:
// K cannot be used after we write to R, since they use the same register.
#if defined __i386__
/* Position-independent addressing: the call pushes the address of local
label 0, which is popped into STable and used as the base for the three
table addresses. STable is overwritten last because it holds that base.
*/
call 0f // Push program counter onto stack.
0:
pop STable // Get program counter.
lea _OLDAESRcon-0b(STable), R
lea _OLDAESInvMixColumnTable-0b(STable), ITable
lea _OLDAESSubBytesWordTable-0b(STable), STable
#elif defined __x86_64__
lea _OLDAESRcon(%rip), R
lea _OLDAESInvMixColumnTable(%rip), ITable
lea _OLDAESSubBytesWordTable(%rip), STable
#endif
/* With a four-word key, there are ten rounds (eleven 16-byte key blocks),
nine of which have InvMixColumn applied.
E is advanced by nine blocks and offset starts at minus nine blocks, so
E+offset addresses block 0 now and block 9 when offset reaches zero. This
lets the add that advances offset also set the flags for the loop branch.
*/
mov $-9*4*4, offset
sub offset, E
// Store initial words of expanded key, which are copies of user's key.
movd ve0, 0*4(E, offset)
movd ve1, 1*4(E, offset)
movd ve2, 2*4(E, offset)
movd ve3, 3*4(E, offset)
/* Here is the first iteration of the key expansion. It is separate from the
main loop below because we need to apply InvMixColumn to each of the
outputs, in ve0 through ve3. In the main loop, the technique described at
the top of this file is used to compute the proper outputs while using
InvMixColumn only once.
*/
add $1, R // Advance pointer.
movd ve3, dr // Put previous word into work register.
movzx (R), t0d // Get round constant.
movd t0d, vt0
// vt0 becomes the round constant XOR SubWord(RotWord(previous word)).
SubWordRotWord
pxor vt0, ve0
// Chain to successive words.
pxor ve0, ve1
pxor ve1, ve2
pxor ve2, ve3
// E+offset now addresses block 1.
add $4*4, offset
/* Apply InvMixColumn to each word. The transformed values are stored in
the expanded key. The original values are retained in registers for
further computation.
*/
movd ve0, dr
call InvMixColumn
movd vt0, 0*4(E, offset)
movd ve1, dr
call InvMixColumn
movd vt0, 1*4(E, offset)
movd ve2, dr
call InvMixColumn
movd vt0, 2*4(E, offset)
movd ve3, dr
call InvMixColumn
movd vt0, 3*4(E, offset)
// Here is the main loop. Each pass produces one block, from block 2 up to
// block 9.
1:
add $1, R // Advance pointer.
movd ve3, dr // Put previous word into work register.
movzx (R), t0d // Get round constant.
movd t0d, vt0
SubWordRotWord
pxor vt0, ve0
// Chain to successive words.
pxor ve0, ve1
pxor ve1, ve2
pxor ve2, ve3
/* Dr. Brian Gladman uses a technique with a single XOR here instead
of the previous four. There is some periodic behavior in the key
expansion, and Gladman maintains E[4*i+3] for the latest four
values of i. XORing the value in vt0 with one of these yields its
replacement. However, using this technique requires additional
instructions before the loop (to initialize the values) and after
it (to extract the final values to be stored) and either some way
to rotate or index four values in the loop or a four-fold unrolling
of the loop to provide the indexing. Experiment suggests the
former is not worthwhile. Unrolling the loop might give a small
gain, at the cost of increased use of instruction cache, increased
instructions loads the first time the routine is executed, and
increased code complexity, so I decided against it.
*/
// Apply InvMixColumn to the difference.
movd vt0, dr
call InvMixColumn
// This add must come after the call, because InvMixColumn modifies the
// flags and the branch at the end of the loop depends on this result.
add $4*4, offset
// Chain the transformed difference to previously transformed outputs.
movd (0-4)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 0*4(E, offset)
movd (1-4)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 1*4(E, offset)
movd (2-4)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 2*4(E, offset)
movd (3-4)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 3*4(E, offset)
/* The movd and pxor instructions above do not modify the flags, so this
branch tests the result of the add to offset: continue while offset is
negative, that is, until block 9 has been written.
*/
jl 1b
// Here is the final iteration, which does not perform InvMixColumn.
movd ve3, dr // Put previous word into work register.
// R is not advanced for this round, so the next constant is at 1(R).
movzx 1(R), t0d // Get round constant.
movd t0d, vt0
SubWordRotWord
pxor vt0, ve0
// Chain to successive words. offset is zero here, so these four stores
// write block 10, the block after the last one written by the loop.
movd ve0, 4*4(E, offset)
pxor ve0, ve1
movd ve1, 5*4(E, offset)
pxor ve1, ve2
movd ve2, 6*4(E, offset)
pxor ve2, ve3
movd ve3, 7*4(E, offset)
// Pop stack and restore registers, in the reverse of the order in which
// they were pushed on entry.
#if 0 < LocalsSize
add $LocalsSize, r4
#endif
#if defined __i386__
pop r7
pop r6
#elif defined __x86_64__
// Add pops of r15 to r12 if used.
#endif
pop r3
pop r5
ret
/* DKeyHas6Words: expansion of a six-word (Nk = 6) key.
On entry, ve0 through ve3 hold the first four words of the user key; the
remaining two are loaded into ve4 and ve5 here. The schedule has 52 words
(thirteen 16-byte blocks). Words 0 through 3 and words 48 through 51 are
stored as is; all words in between are stored with InvMixColumn applied.
Throughout, ve0 through ve5 carry the untransformed words of the most
recent group of six. This path restores the saved registers and returns.
*/
DKeyHas6Words:
movd 4*4(K), ve4
movd 5*4(K), ve5
// K cannot be used after we write to R, since they use the same register.
#if defined __i386__
/* Position-independent addressing: the call pushes the address of local
label 0, which is popped into STable and used as the base for the three
table addresses. STable is overwritten last because it holds that base.
*/
call 0f // Push program counter onto stack.
0:
pop STable // Get program counter.
lea _OLDAESRcon-0b(STable), R
lea _OLDAESInvMixColumnTable-0b(STable), ITable
lea _OLDAESSubBytesWordTable-0b(STable), STable
#elif defined __x86_64__
lea _OLDAESRcon(%rip), R
lea _OLDAESInvMixColumnTable(%rip), ITable
lea _OLDAESSubBytesWordTable(%rip), STable
#endif
/* With a six-word key, there are twelve rounds (thirteen 16-byte key
blocks), eleven of which have InvMixColumn applied. The key expansion
proceeds in groups of six four-byte words, so the termination condition
is a bit complicated. E is advanced by ten blocks (40 words) and offset
starts at minus ten blocks, so E+offset addresses word 0. The first
iteration and each pass of the main loop advance offset by six words and
write the six words at E+offset. The loop branch continues while offset
is less than or equal to zero. The last pass leaves offset at 8 bytes,
having written words 42 through 47. Code after the end of the loop
computes the final key block, words 48 through 51, which does not have
InvMixColumn applied.
*/
mov $-10*4*4, offset
sub offset, E
// Store initial words of expanded key, which are copies of user's key.
movd ve0, 0*4(E, offset)
movd ve1, 1*4(E, offset)
movd ve2, 2*4(E, offset)
movd ve3, 3*4(E, offset)
/* The first four words are stored untransformed. After that, words in
the expanded key are transformed by InvMixColumn.
*/
movd ve4, dr
call InvMixColumn
movd vt0, 4*4(E, offset)
movd ve5, dr
call InvMixColumn
movd vt0, 5*4(E, offset)
/* Here is the first iteration of the key expansion. It is separate from the
main loop below because we need to apply InvMixColumn to each of the
outputs, in ve0 through ve5. In the main loop, the technique described at
the top of this file is used to compute the proper outputs while using
InvMixColumn only once.
*/
add $1, R // Advance pointer.
movd ve5, dr // Put previous word into work register.
movzx (R), t0d // Get round constant.
movd t0d, vt0
// vt0 becomes the round constant XOR SubWord(RotWord(previous word)).
SubWordRotWord
pxor vt0, ve0
// Chain to successive words.
pxor ve0, ve1
pxor ve1, ve2
pxor ve2, ve3
pxor ve3, ve4
pxor ve4, ve5
// E+offset now addresses word 6.
add $6*4, offset
/* Apply InvMixColumn to each word. The transformed values are stored in
the expanded key. The original values are retained in registers for
further computation.
*/
movd ve0, dr
call InvMixColumn
movd vt0, 0*4(E, offset)
movd ve1, dr
call InvMixColumn
movd vt0, 1*4(E, offset)
movd ve2, dr
call InvMixColumn
movd vt0, 2*4(E, offset)
movd ve3, dr
call InvMixColumn
movd vt0, 3*4(E, offset)
/* The last two words of this group need no further InvMixColumn calls:
the transformed forms of words 4 and 5 are already stored, so chaining
from the transformed word just written (still in vt0) yields them.
*/
movd (4-6)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 4*4(E, offset)
movd (5-6)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 5*4(E, offset)
// Here is the main loop. Each pass produces six words.
1:
add $1, R // Advance pointer.
movd ve5, dr // Put previous word into work register.
movzx (R), t0d // Get round constant.
movd t0d, vt0
SubWordRotWord
pxor vt0, ve0
// Chain to successive words.
pxor ve0, ve1
pxor ve1, ve2
pxor ve2, ve3
pxor ve3, ve4
pxor ve4, ve5
// Apply InvMixColumn to the difference.
movd vt0, dr
call InvMixColumn
// This add must come after the call, because InvMixColumn modifies the
// flags and the branch at the end of the loop depends on this result.
add $6*4, offset
// Chain the transformed difference to previously transformed outputs.
movd (0-6)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 0*4(E, offset)
movd (1-6)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 1*4(E, offset)
movd (2-6)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 2*4(E, offset)
movd (3-6)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 3*4(E, offset)
movd (4-6)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 4*4(E, offset)
movd (5-6)*4(E, offset), vt1
pxor vt1, vt0
movd vt0, 5*4(E, offset)
/* The movd and pxor instructions above do not modify the flags, so this
branch tests the result of the add to offset: continue while offset is
less than or equal to zero.
*/
jle 1b
// Here is the final iteration, which does not perform InvMixColumn.
movd ve5, dr // Put previous word into work register.
// R is not advanced for this round, so the next constant is at 1(R).
movzx 1(R), t0d // Get round constant.
movd t0d, vt0
SubWordRotWord
pxor vt0, ve0
// Chain to successive words. Only four more words are needed to complete
// the schedule; they follow the six words written by the last loop pass.
movd ve0, 6*4(E, offset)
pxor ve0, ve1
movd ve1, 7*4(E, offset)
pxor ve1, ve2
movd ve2, 8*4(E, offset)
pxor ve2, ve3
movd ve3, 9*4(E, offset)
// Pop stack and restore registers, in the reverse of the order in which
// they were pushed on entry.
#if 0 < LocalsSize
add $LocalsSize, r4
#endif
#if defined __i386__
pop r7
pop r6
#elif defined __x86_64__
// Add pops of r15 to r12 if used.
#endif
pop r3
pop r5
ret
/* DKeyHas8Words: expansion of an eight-word (Nk = 8) key.
On entry, ve0 through ve3 hold the first four words of the user key. The
schedule has 60 words (fifteen 16-byte blocks). Words 0 through 3 and
words 56 through 59 are stored as is; all words in between are stored
with InvMixColumn applied.
Eight untransformed words are needed at a time, which is more than ve0
through ve3 can hold, so they are kept in eight four-byte slots at
Local(r4): slot j holds the most recent untransformed word whose index is
congruent to j modulo 8.
NOTE(review): Local addresses memory below r4, which the comment at its
definition calls the red zone. Confirm that nothing asynchronous can write
there in every environment this file is built for.
This path restores the saved registers and returns to the caller.
*/
DKeyHas8Words:
// Store initial words of expanded key, which are copies of user's key.
// E has not been biased yet, so these go to the start of the output.
movd ve0, 0*4(E)
movd ve1, 1*4(E)
movd ve2, 2*4(E)
movd ve3, 3*4(E)
// Load the second half of the user key over the first half.
movd 4*4(K), ve0
movd 5*4(K), ve1
movd 6*4(K), ve2
movd 7*4(K), ve3
// K cannot be used after we write to R, since they use the same register.
#if defined __i386__
/* Position-independent addressing: the call pushes the address of local
label 0, which is popped into STable and used as the base for the three
table addresses. STable is overwritten last because it holds that base.
*/
call 0f // Push program counter onto stack.
0:
pop STable // Get program counter.
lea _OLDAESRcon-0b(STable), R
lea _OLDAESInvMixColumnTable-0b(STable), ITable
lea _OLDAESSubBytesWordTable-0b(STable), STable
#elif defined __x86_64__
lea _OLDAESRcon(%rip), R
lea _OLDAESInvMixColumnTable(%rip), ITable
lea _OLDAESSubBytesWordTable(%rip), STable
#endif
/* With an eight-word key, there are fourteen rounds (fifteen 16-byte key
blocks), thirteen of which have InvMixColumn applied.
E is advanced by twelve blocks (48 words) and offset starts at minus
twelve blocks, so E+offset addresses word 0. The main loop below ends
when offset reaches zero.
*/
mov $-12*4*4, offset
sub offset, E
// Save untransformed values in stack area.
movd ve0, 4*4+Local(r4)
movd ve1, 5*4+Local(r4)
movd ve2, 6*4+Local(r4)
movd ve3, 7*4+Local(r4)
/* Apply InvMixColumn to words 4 through 7. The transformed values are
stored in the expanded key. The original values are saved in the stack
area for further computation.
*/
movd ve0, dr
call InvMixColumn
movd vt0, 4*4(E, offset)
movd ve1, dr
call InvMixColumn
movd vt0, 5*4(E, offset)
movd ve2, dr
call InvMixColumn
movd vt0, 6*4(E, offset)
movd ve3, dr
call InvMixColumn
movd vt0, 7*4(E, offset)
/* Here is the first iteration of the key expansion. It is separate from the
main loop below because we need to apply InvMixColumn to each of the
outputs, in ve0 through ve3. In the main loop, the technique described at
the top of this file is used to compute the proper outputs while using
InvMixColumn only once.
*/
add $1, R // Advance pointer.
movd ve3, dr // Put previous word into work register.
movzx (R), t0d // Get round constant.
movd t0d, vt0
// vt0 becomes the round constant XOR SubWord(RotWord(previous word)).
SubWordRotWord
// E+offset now addresses word 8. Words 0 through 3 were stored
// untransformed, so they can be read back from the expanded key.
add $8*4, offset
movd (0-8)*4(E, offset), ve0 // Get old word.
pxor vt0, ve0
movd ve0, 0*4+Local(r4) // Save on stack.
movd ve0, dr
call InvMixColumn
movd vt0, 0*4(E, offset) // Write to expanded key.
/* Chain to successive words and apply InvMixColumn to each word. The
transformed values are stored in the expanded key. The original
values are retained in local data for further computation.
*/
movd (1-8)*4(E, offset), ve1 // Get old word.
pxor ve0, ve1 // Chain.
movd ve1, 1*4+Local(r4) // Save on stack.
movd ve1, dr
call InvMixColumn
movd vt0, 1*4(E, offset) // Write to expanded key.
movd (2-8)*4(E, offset), ve2 // Get old word.
pxor ve1, ve2 // Chain.
movd ve2, 2*4+Local(r4) // Save on stack.
movd ve2, dr
call InvMixColumn
movd vt0, 2*4(E, offset) // Write to expanded key.
movd (3-8)*4(E, offset), ve3 // Get old word.
pxor ve2, ve3 // Chain.
movd ve3, 3*4+Local(r4) // Save on stack.
movd ve3, dr
call InvMixColumn
movd vt0, 3*4(E, offset) // Write to expanded key.
/* Second half of the group. SubWord is applied without a rotation or a
round constant. The untransformed words are updated in the stack area,
and the transformed words are obtained by chaining InvMixColumn of the
change to the transformed words 4 through 7 already in the expanded key.
*/
movd ve3, dr // Put previous word into work register.
SubWord
movd 4*4+Local(r4), ve0 // Get old word.
pxor vt0, ve0 // Chain.
movd ve0, 4*4+Local(r4) // Save on stack.
movd 5*4+Local(r4), ve1 // Get old word.
pxor ve0, ve1 // Chain.
movd ve1, 5*4+Local(r4) // Save on stack.
movd 6*4+Local(r4), ve2 // Get old word.
pxor ve1, ve2 // Chain.
movd ve2, 6*4+Local(r4) // Save on stack.
movd 7*4+Local(r4), ve3 // Get old word.
pxor ve2, ve3 // Chain.
movd ve3, 7*4+Local(r4) // Save on stack.
movd vt0, dr // Move change to work register.
call InvMixColumn
movd (4-8)*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, 4*4(E, offset) // Write new word to expanded key.
movd (5-8)*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, 5*4(E, offset) // Write new word to expanded key.
movd (6-8)*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, 6*4(E, offset) // Write new word to expanded key.
movd (7-8)*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, 7*4(E, offset) // Write new word to expanded key.
/* Here is the main loop. In each pass E+offset addresses the previous
group of eight transformed words, and the new group is written eight
words beyond it. offset is advanced at the end of the pass.
*/
1:
add $1, R // Advance pointer.
movd ve3, dr // Put previous word into work register.
movzx (R), t0d // Get round constant.
movd t0d, vt0
SubWordRotWord
movd 0*4+Local(r4), ve0 // Get old word.
pxor vt0, ve0
movd ve0, 0*4+Local(r4) // Save on stack.
// Chain to successive words.
movd 1*4+Local(r4), ve1 // Get old word.
pxor ve0, ve1 // Chain.
movd ve1, 1*4+Local(r4) // Save on stack.
movd 2*4+Local(r4), ve2 // Get old word.
pxor ve1, ve2 // Chain.
movd ve2, 2*4+Local(r4) // Save on stack.
movd 3*4+Local(r4), ve3 // Get old word.
pxor ve2, ve3 // Chain.
movd ve3, 3*4+Local(r4) // Save on stack.
movd vt0, dr // Move change to work register.
call InvMixColumn
movd 0*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, (0+8)*4(E, offset) // Write new word to expanded key.
movd 1*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, (1+8)*4(E, offset) // Write new word to expanded key.
movd 2*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, (2+8)*4(E, offset) // Write new word to expanded key.
movd 3*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, (3+8)*4(E, offset) // Write new word to expanded key.
// Second half of the group: SubWord only, as in the first iteration.
movd ve3, dr // Put previous word into work register.
SubWord
movd 4*4+Local(r4), ve0 // Get old word.
pxor vt0, ve0 // Chain.
movd ve0, 4*4+Local(r4) // Save on stack.
movd 5*4+Local(r4), ve1 // Get old word.
pxor ve0, ve1 // Chain.
movd ve1, 5*4+Local(r4) // Save on stack.
movd 6*4+Local(r4), ve2 // Get old word.
pxor ve1, ve2 // Chain.
movd ve2, 6*4+Local(r4) // Save on stack.
movd 7*4+Local(r4), ve3 // Get old word.
pxor ve2, ve3 // Chain.
movd ve3, 7*4+Local(r4) // Save on stack.
movd vt0, dr // Move change to work register.
call InvMixColumn
movd 4*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, (4+8)*4(E, offset) // Write new word to expanded key.
movd 5*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, (5+8)*4(E, offset) // Write new word to expanded key.
movd 6*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, (6+8)*4(E, offset) // Write new word to expanded key.
movd 7*4(E, offset), vt1 // Get old word.
pxor vt1, vt0 // Chain.
movd vt0, (7+8)*4(E, offset) // Write new word to expanded key.
// Advance to the group just written and loop while offset is negative.
// The loop ends with offset at zero, after words 48 through 55 are stored.
add $8*4, offset
jl 1b
// Here is the final iteration, which does not perform InvMixColumn and
// produces only the four words needed to complete the schedule.
movd ve3, dr // Put previous word into work register.
// R is not advanced for this round, so the next constant is at 1(R).
movzx 1(R), t0d // Get round constant.
movd t0d, vt0
SubWordRotWord
movd 0*4+Local(r4), ve0 // Get old word.
pxor vt0, ve0 // Chain.
movd ve0, (0+8)*4(E, offset)
// Chain to successive words.
movd 1*4+Local(r4), ve1 // Get old word.
pxor ve0, ve1 // Chain.
movd ve1, (1+8)*4(E, offset)
movd 2*4+Local(r4), ve2 // Get old word.
pxor ve1, ve2 // Chain.
movd ve2, (2+8)*4(E, offset)
movd 3*4+Local(r4), ve3 // Get old word.
pxor ve2, ve3 // Chain.
movd ve3, (3+8)*4(E, offset)
// Pop stack and restore registers, in the reverse of the order in which
// they were pushed on entry.
#if 0 < LocalsSize
add $LocalsSize, r4
#endif
#if defined __i386__
pop r7
pop r6
#elif defined __x86_64__
// Add pops of r15 to r12 if used.
#endif
pop r3
pop r5
ret
/*	Remove the preprocessor names used by this file, so that nothing leaks
	into the file that includes it.  Every name defined above is listed,
	including Local.  Address is not defined in the part of the file shown
	here; undefining an undefined name is harmless.
*/
#undef	Address
#undef	Argument
#undef	E
#undef	ITable
#undef	K
#undef	Local
#undef	LocalsSize
#undef	LookupI
#undef	LookupS
#undef	Nk
#undef	R
#undef	RegisterSaveSize
#undef	STable
#undef	StackFrame
#undef	dr
#undef	drh
#undef	drl
#undef	offset
#undef	t0
#undef	t0d
#undef	ve0
#undef	ve1
#undef	ve2
#undef	ve3
#undef	ve4
#undef	ve5
#undef	vt0
#undef	vt1