#include "../AESAssembly.h"


// Generate object code only if this implementation has been requested.
#if defined UseAESedp_IntelAssembly


/*	AESDecryptCBC.s -- Decrypt blocks with AES in Cipher Block Chaining mode.

	Written by Eric Postpischil, January 24, 2008.

	Syntax:  AT&T operand order (source, destination), passed through the C
	preprocessor before assembly.
	Targets: i386 (arguments on the stack) and x86_64 (arguments in
	rdi, rsi, rdx, rcx, r8, r9).
*/


/*	Define a macro to select a value based on architecture.  This reduces some
	of the architecture conditionalization later in the source.
*/
#if defined __i386__
	#define	Arch(i386, x86_64)	i386
#elif defined __x86_64__
	#define	Arch(i386, x86_64)	x86_64
#endif


/*	Rename the general registers.  This makes it easier to keep track of them
	and provides names for the "whole register" that are uniform between i386
	and x86_64.
*/
#if defined __i386__
	#define	r0	%eax	// Available for any use.
	#define	r1	%ecx	// Available for any use, some special purposes (loop).
	#define	r2	%edx	// Available for any use.
	#define	r3	%ebx	// Must be preserved by called routine.
	#define	r4	%esp	// Stack pointer.
	#define	r5	%ebp	// Frame pointer, must preserve, no bare indirect.
	#define	r6	%esi	// Must be preserved by called routine.
	#define	r7	%edi	// Must be preserved by called routine.
#elif defined __x86_64__
	#define	r0	%rax	// Available for any use.
	#define	r1	%rcx	// Available for any use.
	#define	r2	%rdx	// Available for any use.
	#define	r3	%rbx	// Must be preserved by called routine.
	#define	r4	%rsp	// Stack pointer.
	#define	r5	%rbp	// Frame pointer.  Must be preserved by called routine.
	#define	r6	%rsi	// Available for any use.
	#define	r7	%rdi	// Available for any use.
	#define	r8	%r8		// Available for any use.
	#define	r9	%r9		// Available for any use.
	#define	r10	%r10	// Available for any use.
	#define	r11	%r11	// Available for any use.
	#define	r12	%r12	// Must be preserved by called routine.
	#define	r13	%r13	// Must be preserved by called routine.
	#define	r14	%r14	// Must be preserved by called routine.
	#define	r15	%r15	// Must be preserved by called routine.
#else
	#error "Unknown architecture."
#endif


/*	Routine:

		_AESDecryptCBC.

	Function:

		This routine uses _AESDecryptWithExpandedKey to decrypt blocks in
		Cipher Block Chaining mode, which requires chaining the AES state from
		block to block.

		In CBC mode, each output block is (after the underlying decryption)
		XORed with the previous input block.  On the first iteration, the
		previous input block is supplied from a chain buffer.

		Blocks are processed from the last one to the first one.  The final
		input block is copied to the stack before any output is written, and
		each output block is written only after the input block it depends on
		(the preceding one) has been read.

	Input:

		void *O					// Output
		const void *I			// Input
		void *ChainBuffer		// Chain buffer / initial value.
		void *Key				// Expanded Key.
		long Blocks				// Number of 16-byte blocks to process.
		long Rounds				// Number of rounds.

	Output:

		Decrypted text is written to *O.

		The final input block is written to *ChainBuffer.

		If Blocks is zero, nothing is read or written.

	Registers:

		r3 and r5 hold I and O for the whole routine.  Blocks and ChainBuffer
		(and, on x86_64, Rounds and Key) are kept in registers that are
		saved on entry and restored on exit, so they survive the calls to
		_AESDecryptWithExpandedKey.  r0, %xmm0 and %xmm4 are used as scratch.
*/
	.globl _AESDecryptCBC
	.private_extern	_AESDecryptCBC
_AESDecryptCBC:

	// Push new stack frame.
	push	r5

	// Save registers.
	push	r3
	#if defined __i386__
		push	r6
		push	r7
		#define	RegisterSaveSize	(3*4)
	#elif defined __x86_64__
		push	r12
		push	r13
		push	r14
		push	r15
		#define	RegisterSaveSize	(5*8)
	#endif

/*	B is the number of bytes from the top of stack just before the instruction
	that called this routine to the top of stack after we push the frame
	pointer and other registers.  It provides information needed to align our
	stack frame.  (The factor of two counts the return address and the saved
	frame pointer.)
*/
#define	B	(RegisterSaveSize + 2*Arch(4, 8))

/*	Allocate space on the stack for 16 bytes for the AES state, 16 bytes to
	save the chain value, and, on i386, 16 bytes for four four-byte arguments,
	and padding needed to produce 16-byte alignment.

	The sum is parenthesized explicitly before masking.  LocalsSize is
	evaluated both by the preprocessor (in the #if below) and by the assembler
	(in the sub/add instructions), and the explicit grouping keeps the two in
	agreement regardless of how the assembler ranks & against +.
*/
#define	LocalsSize	(((16*2 + Arch(16, 0) + B + 15) & -16) - B)
#define	StackFrame	(LocalsSize + B)

/*	LocalState is the offset from the stack pointer to where we store the
	AES state.
*/
#define	LocalState	Arch(16, 0)

#define	SavedChain	Arch(32, 16)	// Offset to saved chain value.

	#if 0 < LocalsSize
		sub		$LocalsSize, r4	// Allocate space on stack.
	#endif

	// Non-volatile registers.
	#define	I			r3
	#define	O			r5
	#define	Blocks		Arch(r6, r12)
	#define	ChainBuffer	Arch(r7, r13)
	#define	Rounds		Arch(Not used, r14)
	#define	Key			Arch(Not used, r15)

	// Volatile registers.
	#define	t0			r0
	#define	v0			%xmm0
	#define	vState0		%xmm4

	// Arguments passed to us.
	#if defined __i386__
		// Define location of argument i.
		#define	Argument(i)	StackFrame+4*(i)(r4)
	#endif
	#define	ArgO			Arch(Argument(0), r7)
	#define	ArgI			Arch(Argument(1), r6)
	#define	ArgChainBuffer	Arch(Argument(2), r2)
	#define	ArgKey			Arch(Argument(3), r1)
	#define	ArgBlocks		Arch(Argument(4), r8)
	#define	ArgRounds		Arch(Argument(5), r9)

	/*	Registers in which arguments are passed to _AESDecryptWithExpandedKey
		on x86_64.  (On i386 the arguments are stored at 0*4(r4) through
		3*4(r4) instead.)
	*/
	#define	PassedRounds	r1
	#define	PassedKey		r2
	#define	PassedInput		r6
	#define	PassedOutput	r7

	/*	Get some arguments.  We need to move these from the stack (on i386) or
		volatile registers (on x86_64) to non-volatile registers where we can
		use them and keep them during calls to a subroutine.
	*/
	mov		ArgO,			O
	mov		ArgI,			I
	mov		ArgChainBuffer,	ChainBuffer
	mov		ArgBlocks,		Blocks

	/*	Convert Blocks from number of blocks to displacement in bytes.

		A shift is used rather than imul because imul leaves the zero flag
		undefined, whereas shl sets it from the result, and the branch below
		depends on it.
	*/
	shl		$4,				Blocks
	jz		done				// Leave if we were given zero blocks.

	/*	Save last input block to write to ChainBuffer later.  This is done
		before any output is written.
	*/
	movupd	-16(I, Blocks),	v0
	movapd	v0,				SavedChain(r4)

	#if defined __i386__
		// Put arguments we will pass on stack.
		mov		ArgRounds,		t0
		mov		t0,				3*4(r4)
		mov		ArgKey,			t0
		mov		t0,				2*4(r4)
		lea		LocalState(r4),	t0
		mov		t0,				0*4(r4)
	#else
		// Put arguments we will pass into non-volatile registers.
		mov		ArgRounds,		Rounds
		mov		ArgKey,			Key
	#endif

	// Blocks becomes the byte offset of the last block.
	add		$-16,			Blocks
	jle		2f					// Skip main loop if there was only one block.

	/*	Main loop.  Here Blocks is the byte offset of the block being
		decrypted and is greater than zero, so a previous input block exists
		at -16(I, Blocks).
	*/
1:
		#if defined __i386__
			// Pass address of current input block.
			lea		(I, Blocks),	t0
			mov		t0,				1*4(r4)
		#else
			// Pass arguments to subroutine.
			mov		Rounds,			PassedRounds
			mov		Key,			PassedKey
			lea		(I, Blocks),	PassedInput
			lea		LocalState(r4),	PassedOutput
		#endif

		// Decrypt state.
		call	_AESDecryptWithExpandedKey

		// XOR decrypted block with previous chain value.
		movapd	LocalState(r4),	vState0
		movupd	-16(I, Blocks),	v0
		pxor	v0,				vState0

		// Write to output.
		movupd	vState0,		0*4(O, Blocks)

		add		$-16,			Blocks
		jg		1b

2:
	/*	First block is separate because it gets chain value from ChainBuffer
		rather than from the input stream.
	*/
	#if defined __i386__
		// Pass address of current input block.
		lea		(I, Blocks),	t0
		mov		t0,				1*4(r4)
	#else
		// Pass arguments to subroutine.
		mov		Rounds,			PassedRounds
		mov		Key,			PassedKey
		lea		(I, Blocks),	PassedInput
		lea		LocalState(r4),	PassedOutput
	#endif

	// Decrypt state.
	call	_AESDecryptWithExpandedKey

	// XOR decrypted block with previous chain value.
	movapd	LocalState(r4),	vState0
	movupd	(ChainBuffer),	v0
	pxor	v0,				vState0

	// Write to output.
	movupd	vState0,		0*4(O, Blocks)

	// Save state for chaining in future calls.
	movapd	SavedChain(r4),	v0
	movupd	v0,				(ChainBuffer)

done:
	// Pop stack and restore registers.
	#if 0 < LocalsSize
		add		$LocalsSize, r4
	#endif
	#if defined __i386__
		pop		r7
		pop		r6
	#elif defined __x86_64__
		pop		r15
		pop		r14
		pop		r13
		pop		r12
	#endif
	pop		r3
	pop		r5

	ret


#endif	// defined UseAESedp_IntelAssembly