aes_x86_v2.s   [plain text]


/*
 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * ---------------------------------------------------------------------------
 * Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
 *
 * LICENSE TERMS
 *
 * The free distribution and use of this software in both source and binary
 * form is allowed (with or without changes) provided that:
 *
 *   1. distributions of this source code include the above copyright
 *      notice, this list of conditions and the following disclaimer;
 *
 *   2. distributions in binary form include the above copyright
 *      notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other associated materials;
 *
 *   3. the copyright holder's name is not used to endorse products
 *      built using this software without specific written permission.
 *
 * ALTERNATIVELY, provided that this notice is retained in full, this product
 * may be distributed under the terms of the GNU General Public License (GPL),
 * in which case the provisions of the GPL apply INSTEAD OF those given above.
 *
 * DISCLAIMER
 *
 * This software is provided 'as is' with no explicit or implied warranties
 * in respect of its properties, including, but not limited to, correctness
 * and/or fitness for purpose.
 * ---------------------------------------------------------------------------
 * Issue 31/01/2006
 *
 * This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h 
 * and the same define to be set here as well. If AES_V2C is set this file 
 * requires the C files aeskey.c and aestab.c for support.
 *
 * This is a full assembler implementation covering encryption, decryption and
 * key scheduling. It uses 2k bytes of tables but its encryption and decryption
 * performance is very close to that obtained using large tables.  Key schedule
 * expansion is slower for both encryption and decryption but this is likely to
 * be offset by the much smaller load that this version places on the processor
 * cache. I acknowledge the contribution made by Daniel Bernstein to aspects of
 * the design of the AES round function used here.
 *
 * This code provides the standard AES block size (128 bits, 16 bytes) and the
 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
 * interface as my C implementation. The ebx, esi, edi and ebp registers are
 * preserved across calls but eax, ecx and edx and the artihmetic status flags
 * are not.
 */

#include <mach/i386/asm.h>

#define AES_128          /* define if AES with 128 bit keys is needed */
#define AES_192          /* define if AES with 192 bit keys is needed */
#define AES_256          /* define if AES with 256 bit keys is needed */
#define AES_VAR          /* define if a variable key size is needed */
#define ENCRYPTION       /* define if encryption is needed */
#define DECRYPTION       /* define if decryption is needed */
#define AES_REV_DKS      /* define if key decryption schedule is reversed */

#ifndef ASM_X86_V2C
#define ENCRYPTION_KEY_SCHEDULE /* define if enc. key expansion is needed */
#define DECRYPTION_KEY_SCHEDULE /* define if dec. key expansion is needed */
#endif

/*
 * The encryption key schedule has the following in memory layout where N is the
 * number of rounds (10, 12 or 14):
 *
 * lo: | input key (round 0)  |  ; each round is four 32-bit words
 *     | encryption round 1   |
 *     | encryption round 2   |
 *     ....
 *     | encryption round N-1 |
 * hi: | encryption round N   |
 *
 * The decryption key schedule is normally set up so that it has the same
 * layout as above by actually reversing the order of the encryption key
 * schedule in memory (this happens when AES_REV_DKS is set):
 *
 * lo: | decryption round 0   | =              | encryption round N   |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
 * hi: | decryption round N   | =              | input key (round 0)  |
 *
 * with rounds except the first and last modified using inv_mix_column()
 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
 * encryption so that it has to be accessed in reverse when used for
 * decryption (although the inverse mix column modifications are done)
 *
 * lo: | decryption round 0   | =              | input key (round 0)  |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
 * hi: | decryption round N   | =              | encryption round N   |
 *
 * This layout is faster when the assembler key scheduling provided here
 * is used.
 */

/* End of user defines */

#ifdef AES_VAR
#ifndef AES_128
#define AES_128 
#endif
#ifndef AES_192
#define AES_192 
#endif
#ifndef AES_256
#define AES_256 
#endif
#endif

#ifdef AES_VAR
#define KS_LENGTH 60
#else
#ifdef AES_256
#define KS_LENGTH 60
#else
#ifdef AES_192
#define KS_LENGTH 52
#else 
#define KS_LENGTH 44
#endif
#endif
#endif

/*
 * These macros implement stack based local variables
 */
#define	save(r1)			\
    movl    %r1, (%esp);

#define	restore(r1)			\
    movl    (%esp), %r1;

#define	do_call(f, n)			\
    call    EXT(f);			\
    addl    $(n), %esp;

/*
 * finite field multiplies by {02}, {04} and {08}
 */
#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

/*
 * finite field multiplies required in table generation
 */
#define	f3(x) (f2(x) ^ x)
#define	f9(x) (f8(x) ^ x)
#define	fb(x) (f8(x) ^ f2(x) ^ x)
#define	fd(x) (f8(x) ^ f4(x) ^ x)
#define	fe(x) (f8(x) ^ f4(x) ^ f2(x))

#define	etab_0(x) enc_tab+4(,x,8)
#define	etab_1(x) enc_tab+3(,x,8)
#define	etab_2(x) enc_tab+2(,x,8)
#define	etab_3(x) enc_tab+1(,x,8)

#define	etab_b(x) etab_3(x)

#define	btab_0(x) enc_tab+6(,x,8)
#define	btab_1(x) enc_tab+5(,x,8)
#define	btab_2(x) enc_tab+4(,x,8)
#define	btab_3(x) enc_tab+3(,x,8)

/*
 * ROUND FUNCTION.  Build column[2] on ESI and column[3] on EDI that have the
 * round keys pre-loaded. Build column[0] in EBP and column[1] in EBX.
 *
 * Input:
 *
 *   EAX     column[0]
 *   EBX     column[1]
 *   ECX     column[2]
 *   EDX     column[3]
 *   ESI     column key[round][2]
 *   EDI     column key[round][3]
 *   EBP     scratch
 *
 * Output:
 *
 *   EBP     column[0]   unkeyed
 *   EBX     column[1]   unkeyed
 *   ESI     column[2]   keyed
 *   EDI     column[3]   keyed
 *   EAX     scratch
 *   ECX     scratch
 *   EDX     scratch
 */
#define	rnd_fun(m1, m2)			\
    roll    $16, %ebx;			\
					\
    ## m1 ## _zo(esi, cl, 0, ebp);	\
    m1(esi, dh, 1, ebp);		\
    m1(esi, bh, 3, ebp);		\
    ## m1 ## _zo(edi, dl, 0, ebp);	\
    m1(edi, ah, 1, ebp);		\
    m1(edi, bl, 2, ebp);		\
    ## m2 ## _zo(ebp, al, 0, ebp);	\
					\
    shrl    $16, %ebx;			\
    andl    $0xffff0000, %eax;		\
    orl     %ebx, %eax;			\
    shrl    $16, %edx;			\
					\
    m1(ebp, ah, 1, ebx);		\
    m1(ebp, dh, 3, ebx);		\
    m2(ebx, dl, 2, ebx);		\
    m1(ebx, ch, 1, edx);		\
    ## m1 ## _zo(ebx, al, 0, edx);	\
					\
    shrl    $16, %eax;			\
    shrl    $16, %ecx;			\
					\
    m1(ebp, cl, 2, edx);		\
    m1(edi, ch, 3, edx);		\
    m1(esi, al, 2, edx);		\
    m1(ebx, ah, 3, edx)

/*
 * Basic MOV and XOR Operations for normal rounds
 */
#define	nr_xor_zo	nr_xor
#define	nr_xor(r1, r2, r3, r4)		\
    movzbl  %r2, %r4;			\
    xorl    etab_ ## r3(%r4), %r1;

#define	nr_mov_zo	nr_mov
#define	nr_mov(r1, r2, r3, r4)		\
    movzbl  %r2, %r4;			\
    movl    etab_ ## r3(%r4), %r1;

/*
 * Basic MOV and XOR Operations for last round
 */

#if 1

#define	lr_xor_zo(r1, r2, r3, r4)	\
    movzbl  %r2, %r4;			\
    movzbl  etab_b(%r4), %r4;		\
    xor     %r4, %r1;

#define	lr_xor(r1, r2, r3, r4)		\
    movzbl  %r2, %r4;			\
    movzbl  etab_b(%r4), %r4;		\
    shll    $(8*r3), %r4;		\
    xor     %r4, %r1;

#define	lr_mov_zo(r1, r2, r3, r4)	\
    movzbl  %r2, %r4;			\
    movzbl  etab_b(%r4), %r1;

#define	lr_mov(r1, r2, r3, r4)		\
    movzbl  %r2, %r4;			\
    movzbl  etab_b(%r4), %r1;		\
    shll    $(8*r3), %r1;

#else        /* less effective but worth leaving as an option */

#define	lr_xor_zo	lr_xor
#define	lr_xor(r1, r2, r3, r4)			\
    movzbl  %r2, %r4;				\
    mov     btab_ ## r3(%r4), %r4;		\
    andl    $(0x000000ff << 8 * r3), %r4;	\
    xor     %r4, %r1;

#define	lr_mov_zo	lr_mov
#define	lr_mov(r1, r2, r3, r4)			\
    movzbl  %r2, %r4;				\
    mov     btab_ ## r3(%r4), %r1;		\
    andl    $(0x000000ff << 8 * r3), %r1;

#endif

/*
 * Apply S-Box to the 4 bytes in a 32-bit word and rotate left 3 byte positions
 *
 *   r1 : output is xored into this register
 *   r2 : input: a => eax, b => ebx, c => ecx, d => edx
 *   r3 : scratch register
 */

#define	l3s_col(r1, r2, r3)			\
    lr_xor_zo(r1, ## r2 ## h, 0, r3);		\
    lr_xor(r1, ## r2 ## l, 3, r3);		\
    shrl    $16, %e ## r2 ## x;			\
    lr_xor(r1, ## r2 ## h, 2, r3);		\
    lr_xor(r1, ## r2 ## l, 1, r3);

/*
 * offsets to parameters
 */
#define	in_blk		4	/* input byte array address parameter */
#define	out_blk		8	/* output byte array address parameter */
#define	ctx		12	/* AES context structure */
#define	stk_spc		20	/* stack space */

#ifdef  ENCRYPTION

#define ENCRYPTION_TABLE 

#define	enc_round			\
    addl    $16, %ebp;			\
    save(ebp);				\
    movl    8(%ebp), %esi;		\
    movl    12(%ebp), %edi;		\
					\
    rnd_fun(nr_xor, nr_mov);		\
					\
    movl    %ebp, %eax;			\
    movl    %esi, %ecx;			\
    movl    %edi, %edx;			\
    restore(ebp);			\
    xorl    (%ebp), %eax;		\
    xorl    4(%ebp), %ebx;

#define enc_last_round			\
    addl    $16, %ebp;			\
    save(ebp);				\
    movl    8(%ebp), %esi;		\
    movl    12(%ebp), %edi;		\
					\
    rnd_fun(lr_xor, lr_mov);		\
					\
    movl    %ebp, %eax;			\
    restore(ebp);			\
    xorl    (%ebp), %eax;		\
    xorl    4(%ebp), %ebx;

    .section __TEXT, __text

/*
 * AES Encryption Subroutine
 */
Entry(aes_encrypt)

    subl    $stk_spc, %esp
    movl    %ebp, 16(%esp)
    movl    %ebx, 12(%esp)
    movl    %esi, 8(%esp)
    movl    %edi, 4(%esp)

    movl    in_blk+stk_spc(%esp), %esi	/* input pointer */
    movl    (%esi), %eax
    movl    4(%esi), %ebx
    movl    8(%esi), %ecx
    movl    12(%esi), %edx

    movl    ctx+stk_spc(%esp), %ebp	/* key pointer */
    movzbl  4*KS_LENGTH(%ebp), %edi
    xorl    (%ebp), %eax
    xorl    4(%ebp), %ebx
    xorl    8(%ebp), %ecx
    xorl    12(%ebp), %edx

    /*
     * determine the number of rounds
     */
    cmpl    $10*16, %edi
    je     aes_encrypt.3
    cmpl    $12*16, %edi
    je     aes_encrypt.2
    cmpl    $14*16, %edi
    je      aes_encrypt.1
    movl    $-1, %eax
    jmp     aes_encrypt.5

aes_encrypt.1:
    enc_round
    enc_round
aes_encrypt.2:
    enc_round
    enc_round
aes_encrypt.3:
    enc_round
    enc_round
    enc_round
    enc_round
    enc_round
    enc_round
    enc_round
    enc_round
    enc_round
    enc_last_round

    movl    out_blk+stk_spc(%esp), %edx
    movl    %eax, (%edx)
    movl    %ebx, 4(%edx)
    movl    %esi, 8(%edx)
    movl    %edi, 12(%edx)
    xorl    %eax, %eax

aes_encrypt.5:
    movl    16(%esp), %ebp
    movl    12(%esp), %ebx
    movl    8(%esp), %esi
    movl    4(%esp), %edi
    addl    $stk_spc, %esp
    ret

#endif

/*
 * For r2 == 16, or r2 == 24 && r1 == 7, or r2 ==32 && r1 == 6
 */
#define	f_key(r1, r2, rc_val)		\
    l3s_col(esi, a, ebx);		\
    xorl    $rc_val, %esi;		\
					\
    movl    %esi, r1*r2(%ebp);		\
    xorl    %esi, %edi;			\
    movl    %edi, r1*r2+4(%ebp);	\
    xorl    %edi, %ecx;			\
    movl    %ecx, r1*r2+8(%ebp);	\
    xorl    %ecx, %edx;			\
    movl    %edx, r1*r2+12(%ebp);	\
    movl    %edx, %eax;

/*
 * For r2 == 24 && r1 == 0 to 6
 */
#define	f_key_24(r1, r2, rc_val)	\
    f_key(r1, r2, rc_val);		\
					\
    xorl    r1*r2+16-r2(%ebp), %eax;	\
    movl    %eax, r1*r2+16(%ebp);	\
    xorl    r1*r2+20-r2(%ebp), %eax;	\
    movl    %eax, r1*r2+20(%ebp);

/*
 * For r2 ==32 && r1 == 0 to 5
 */
#define	f_key_32(r1, r2, rc_val)	\
    f_key(r1, r2, rc_val);		\
					\
    roll    $8, %eax;			\
    pushl   %edx;			\
    movl    r1*r2+16-r2(%ebp), %edx;	\
    l3s_col(edx, a, ebx);		\
    movl    %edx, %eax;			\
    popl    %edx;			\
    movl    %eax, r1*r2+16(%ebp);	\
    xorl    r1*r2+20-r2(%ebp), %eax;	\
    movl    %eax, r1*r2+20(%ebp);	\
    xorl    r1*r2+24-r2(%ebp), %eax;	\
    movl    %eax, r1*r2+24(%ebp);	\
    xorl    r1*r2+28-r2(%ebp), %eax;	\
    movl    %eax, r1*r2+28(%ebp);

#ifdef ENCRYPTION_KEY_SCHEDULE

#ifdef  AES_128

#ifndef ENCRYPTION_TABLE
#define ENCRYPTION_TABLE 
#endif

Entry(aes_encrypt_key128)

    pushl   %ebp
    pushl   %ebx
    pushl   %esi
    pushl   %edi

    movl    24(%esp), %ebp
    movl    $10*16, 4*KS_LENGTH(%ebp)
    movl    20(%esp), %ebx

    movl    (%ebx), %esi
    movl    %esi, (%ebp)
    movl    4(%ebx), %edi
    movl    %edi, 4(%ebp)
    movl    8(%ebx), %ecx
    movl    %ecx, 8(%ebp)
    movl    12(%ebx), %edx
    movl    %edx, 12(%ebp)
    addl    $16, %ebp
    movl    %edx, %eax

    f_key(0, 16, 1)
    f_key(1, 16, 2)
    f_key(2, 16, 4)
    f_key(3, 16, 8)
    f_key(4, 16, 16)
    f_key(5, 16, 32)
    f_key(6, 16, 64)
    f_key(7, 16, 128)
    f_key(8, 16, 27)
    f_key(9, 16, 54)

    popl    %edi
    popl    %esi
    popl    %ebx
    popl    %ebp
    xorl    %eax, %eax
    ret

#endif

#ifdef  AES_192

#ifndef ENCRYPTION_TABLE
#define ENCRYPTION_TABLE 
#endif

Entry(aes_encrypt_key192)

    pushl   %ebp
    pushl   %ebx
    pushl   %esi
    pushl   %edi

    movl    24(%esp), %ebp
    movl    $12*16, 4*KS_LENGTH(%ebp)
    movl    20(%esp), %ebx

    movl    (%ebx), %esi
    movl    %esi, (%ebp)
    movl    4(%ebx), %edi
    movl    %edi, 4(%ebp)
    movl    8(%ebx), %ecx
    movl    %ecx, 8(%ebp)
    movl    12(%ebx), %edx
    movl    %edx, 12(%ebp)
    movl    16(%ebx), %eax
    movl    %eax, 16(%ebp)
    movl    20(%ebx), %eax
    movl    %eax, 20(%ebp)
    addl    $24, %ebp

    f_key_24(0, 24, 1)
    f_key_24(1, 24, 2)
    f_key_24(2, 24, 4)
    f_key_24(3, 24, 8)
    f_key_24(4, 24, 16)
    f_key_24(5, 24, 32)
    f_key_24(6, 24, 64)
    f_key(7, 24, 128)

    popl    %edi
    popl    %esi
    popl    %ebx
    popl    %ebp
    xorl    %eax, %eax
    ret

#endif

#ifdef  AES_256

#ifndef ENCRYPTION_TABLE
#define ENCRYPTION_TABLE 
#endif

Entry(aes_encrypt_key256)

    pushl   %ebp
    pushl   %ebx
    pushl   %esi
    pushl   %edi

    movl    24(%esp), %ebp
    movl    $14*16, 4*KS_LENGTH(%ebp)
    movl    20(%esp), %ebx

    movl    (%ebx), %esi
    movl    %esi, (%ebp)
    movl    4(%ebx), %edi
    movl    %edi, 4(%ebp)
    movl    8(%ebx), %ecx
    movl    %ecx, 8(%ebp)
    movl    12(%ebx), %edx
    movl    %edx, 12(%ebp)
    movl    16(%ebx), %eax
    movl    %eax, 16(%ebp)
    movl    20(%ebx), %eax
    movl    %eax, 20(%ebp)
    movl    24(%ebx), %eax
    movl    %eax, 24(%ebp)
    movl    28(%ebx), %eax
    movl    %eax, 28(%ebp)
    addl    $32, %ebp

    f_key_32(0, 32, 1)
    f_key_32(1, 32, 2)
    f_key_32(2, 32, 4)
    f_key_32(3, 32, 8)
    f_key_32(4, 32, 16)
    f_key_32(5, 32, 32)
    f_key(6, 32, 64)

    popl    %edi
    popl    %esi
    popl    %ebx
    popl    %ebp
    xorl    %eax, %eax
    ret

#endif

#ifdef  AES_VAR

#ifndef ENCRYPTION_TABLE
#define ENCRYPTION_TABLE 
#endif

Entry(aes_encrypt_key)

    movl    4(%esp), %ecx
    movl    8(%esp), %eax
    movl    12(%esp), %edx
    pushl   %edx
    pushl   %ecx

    cmpl    $16, %eax
    je      aes_encrypt_key.1
    cmpl    $128, %eax
    je      aes_encrypt_key.1

    cmpl    $24, %eax
    je      aes_encrypt_key.2
    cmpl    $192, %eax
    je      aes_encrypt_key.2

    cmpl    $32, %eax
    je      aes_encrypt_key.3
    cmpl    $256, %eax
    je      aes_encrypt_key.3
    movl    $-1, %eax
    addl    $8, %esp
    ret

aes_encrypt_key.1:
    do_call(aes_encrypt_key128, 8)
    ret
aes_encrypt_key.2:
    do_call(aes_encrypt_key192, 8)
    ret
aes_encrypt_key.3:
    do_call(aes_encrypt_key256, 8)
    ret

#endif

#endif

#ifdef ENCRYPTION_TABLE

# S-box data - 256 entries

    .section __DATA, __data
    .align ALIGN

#define u8(x) 0, x, x, f3(x), f2(x), x, x, f3(x)

enc_tab: 
   .byte u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5)
   .byte u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76)
   .byte u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0)
   .byte u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0)
   .byte u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc)
   .byte u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15)
   .byte u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a)
   .byte u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75)
   .byte u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0)
   .byte u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84)
   .byte u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b)
   .byte u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf)
   .byte u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85)
   .byte u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8)
   .byte u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5)
   .byte u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2)
   .byte u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17)
   .byte u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73)
   .byte u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88)
   .byte u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb)
   .byte u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c)
   .byte u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79)
   .byte u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9)
   .byte u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08)
   .byte u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6)
   .byte u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a)
   .byte u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e)
   .byte u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e)
   .byte u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94)
   .byte u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf)
   .byte u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68)
   .byte u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16)

#endif

#ifdef  DECRYPTION

#define DECRYPTION_TABLE 

#define dtab_0(x) dec_tab(,x,8)
#define dtab_1(x) dec_tab+3(,x,8)
#define dtab_2(x) dec_tab+2(,x,8)
#define dtab_3(x) dec_tab+1(,x,8)
#define dtab_x(x) dec_tab+7(,x,8)

#define	irn_fun(m1, m2)			\
    roll    $16, %eax;			\
					\
    ## m1 ## _zo(esi, cl, 0, ebp);	\
    m1(esi, bh, 1, ebp);		\
    m1(esi, al, 2, ebp);		\
    ## m1 ## _zo(edi, dl, 0, ebp);	\
    m1(edi, ch, 1, ebp);		\
    m1(edi, ah, 3, ebp);		\
    ## m2 ## _zo(ebp, bl, 0, ebp);	\
					\
    shrl    $16, %eax;			\
    andl    $0xffff0000, %ebx;		\
    orl     %eax, %ebx;			\
    shrl    $16, %ecx;			\
					\
    m1(ebp, bh, 1, eax);		\
    m1(ebp, ch, 3, eax);		\
    m2(eax, cl, 2, ecx);		\
    ## m1 ## _zo(eax, bl, 0, ecx);	\
    m1(eax, dh, 1, ecx);		\
					\
    shrl    $16, %ebx;			\
    shrl    $16, %edx;			\
					\
    m1(esi, dh, 3, ecx);		\
    m1(ebp, dl, 2, ecx);		\
    m1(eax, bh, 3, ecx);		\
    m1(edi, bl, 2, ecx);

/*
 * Basic MOV and XOR Operations for normal rounds
 */
#define	ni_xor_zo	ni_xor
#define	ni_xor(r1, r2, r3, r4)		\
    movzbl  %r2, %r4;			\
    xorl    dtab_ ## r3 ## (%r4), %r1;

#define	ni_mov_zo	ni_mov
#define	ni_mov(r1, r2, r3, r4)		\
    movzbl  %r2, %r4;			\
    movl    dtab_ ## r3 ## (%r4), %r1;

/*
 * Basic MOV and XOR Operations for last round
 */

#define	li_xor_zo(r1, r2, r3, r4)	\
    movzbl %r2, %r4;			\
    movzbl dtab_x(%r4), %r4;		\
    xor    %r4, %r1;

#define	li_xor(r1, r2, r3, r4)		\
    movzbl %r2, %r4;			\
    movzbl dtab_x(%r4), %r4;		\
    shll   $(8*r3), %r4;		\
    xor    %r4, %r1;

#define	li_mov_zo(r1, r2, r3, r4)	\
    movzbl %r2, %r4;			\
    movzbl dtab_x(%r4), %r1;

#define	li_mov(r1, r2, r3, r4)		\
    movzbl %r2, %r4;			\
    movzbl dtab_x(%r4), %r1;		\
    shl    $(8*r3), %r1;

#ifdef AES_REV_DKS

#define	dec_round			\
    addl    $16, %ebp;			\
    save(ebp);				\
    movl    8(%ebp), %esi;		\
    movl    12(%ebp), %edi;		\
					\
    irn_fun(ni_xor, ni_mov);		\
					\
    movl    %ebp, %ebx;			\
    movl    %esi, %ecx;			\
    movl    %edi, %edx;			\
    restore(ebp);			\
    xorl    (%ebp), %eax;		\
    xorl    4(%ebp), %ebx;

#define	dec_last_round			\
    addl    $16, %ebp;			\
    save(ebp);				\
    movl    8(%ebp), %esi;		\
    movl    12(%ebp), %edi;		\
					\
    irn_fun(li_xor, li_mov);		\
					\
    movl    %ebp, %ebx;			\
    restore(ebp);			\
    xorl    (%ebp), %eax;		\
    xorl    4(%ebp), %ebx;

#else

#define	dec_round			\
    subl    $16, %ebp;			\
    save(ebp);				\
    movl    8(%ebp), %esi;		\
    movl    12(%ebp), %edi;		\
					\
    irn_fun(ni_xor, ni_mov);		\
					\
    movl    %ebp, %ebx;			\
    movl    %esi, %ecx;			\
    movl    %edi, %edx;			\
    restore(ebp);			\
    xorl    (%ebp), %eax;		\
    xorl    4(%ebp), %ebx;

#define	dec_last_round			\
    subl    $16, %ebp;			\
    save(ebp);				\
    movl    8(%ebp), %esi;		\
    movl    12(%ebp), %edi;		\
					\
    irn_fun(li_xor, li_mov);		\
					\
    movl    %ebp, %ebx;			\
    restore(ebp);			\
    xorl    (%ebp), %eax;		\
    xorl    4(%ebp), %ebx;

#endif /* AES_REV_DKS */

    .section __TEXT, __text

/*
 * AES Decryption Subroutine
 */
Entry(aes_decrypt)

    subl    $stk_spc, %esp
    movl    %ebp, 16(%esp)
    movl    %ebx, 12(%esp)
    movl    %esi, 8(%esp)
    movl    %edi, 4(%esp)

    /*
     * input four columns and xor in first round key
     */
    movl    in_blk+stk_spc(%esp), %esi	/* input pointer */
    movl    (%esi), %eax
    movl    4(%esi), %ebx
    movl    8(%esi), %ecx
    movl    12(%esi), %edx
    leal    16(%esi), %esi

    movl    ctx+stk_spc(%esp), %ebp	/* key pointer */
    movzbl  4*KS_LENGTH(%ebp), %edi
#ifndef  AES_REV_DKS		/* if decryption key schedule is not reversed */
    leal    (%ebp,%edi), %ebp	/* we have to access it from the top down */
#endif
    xorl    (%ebp), %eax	/* key schedule */
    xorl    4(%ebp), %ebx
    xorl    8(%ebp), %ecx
    xorl    12(%ebp), %edx

    /*
     * determine the number of rounds
     */
    cmpl    $10*16, %edi
    je     aes_decrypt.3
    cmpl    $12*16, %edi
    je     aes_decrypt.2
    cmpl    $14*16, %edi
    je      aes_decrypt.1
    movl    $-1, %eax
    jmp     aes_decrypt.5

aes_decrypt.1:
    dec_round
    dec_round
aes_decrypt.2:
    dec_round
    dec_round
aes_decrypt.3:
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_last_round

    /*
     * move final values to the output array.
     */
    movl    out_blk+stk_spc(%esp), %ebp
    movl    %eax, (%ebp)
    movl    %ebx, 4(%ebp)
    movl    %esi, 8(%ebp)
    movl    %edi, 12(%ebp)
    xorl    %eax, %eax

aes_decrypt.5:
    movl    16(%esp), %ebp
    movl    12(%esp), %ebx
    movl    8(%esp), %esi
    movl    4(%esp), %edi
    addl    $stk_spc, %esp
    ret

#endif

#define	inv_mix_col			\
    movzbl  %dl, %ebx;			\
    movzbl  etab_b(%ebx), %ebx;		\
    movl    dtab_0(%ebx), %eax;		\
    movzbl  %dh, %ebx;			\
    shrl    $16, %edx;			\
    movzbl  etab_b(%ebx), %ebx;		\
    xorl    dtab_1(%ebx), %eax;		\
    movzbl  %dl, %ebx;			\
    movzbl  etab_b(%ebx), %ebx;		\
    xorl    dtab_2(%ebx), %eax;		\
    movzbl  %dh, %ebx;			\
    movzbl  etab_b(%ebx), %ebx;		\
    xorl    dtab_3(%ebx), %eax;

#ifdef DECRYPTION_KEY_SCHEDULE

#ifdef AES_128

#ifndef DECRYPTION_TABLE
#define DECRYPTION_TABLE 
#endif

Entry(aes_decrypt_key128)

    pushl   %ebp
    pushl   %ebx
    pushl   %esi
    pushl   %edi
    movl    24(%esp), %eax	/* context */
    movl    20(%esp), %edx	/* key */
    pushl   %eax
    pushl   %edx
    do_call(aes_encrypt_key128, 8)
    movl    $10*16, %eax
    movl    24(%esp), %esi	/* pointer to first round key */
    leal    (%esi,%eax), %edi	/* pointer to last round key */
    addl    $32, %esi
				/* the inverse mix column transformation */
    movl    -16(%esi), %edx	/* needs to be applied to all round keys */
    inv_mix_col
    movl    %eax, -16(%esi)	/* transforming the four sub-keys in the */
    movl    -12(%esi), %edx	/* second round key */
    inv_mix_col
    movl    %eax, -12(%esi)	/* transformations for subsequent rounds */
    movl    -8(%esi), %edx	/* can then be made more efficient by */
    inv_mix_col
    movl    %eax, -8(%esi)	/* in the encryption round key ek[r]: */
    movl    -4(%esi), %edx
    inv_mix_col
    movl    %eax, -4(%esi)	/* where n is 1..3. Hence the corresponding */

aes_decrypt_key128.0:
    movl    (%esi), %edx	/* subkeys in the decryption round key dk[r] */
    inv_mix_col
    movl    %eax, (%esi)	/* GF(256): */
    xorl    -12(%esi), %eax
    movl    %eax, 4(%esi)	/* dk[r][n] = dk[r][n-1] ^ dk[r-1][n] */
    xorl    -8(%esi), %eax
    movl    %eax, 8(%esi)	/* So we only need one inverse mix column */
    xorl    -4(%esi), %eax	/* operation (n = 0) for each four word cycle */
    movl    %eax, 12(%esi)	/* in the expanded key. */
    addl    $16, %esi
    cmpl    %esi, %edi
    jg      aes_decrypt_key128.0
    jmp     dec_end

#endif

#ifdef AES_192

#ifndef DECRYPTION_TABLE
#define DECRYPTION_TABLE 
#endif

Entry(aes_decrypt_key192)

    pushl   %ebp
    pushl   %ebx
    pushl   %esi
    pushl   %edi
    movl    24(%esp), %eax	/* context */
    movl    20(%esp), %edx	/* key */
    pushl   %eax
    pushl   %edx
    do_call(aes_encrypt_key192, 8)
    movl    $12*16, %eax
    movl    24(%esp), %esi	/* first round key */
    leal    (%esi,%eax), %edi	/* last round key */
    addl    $48, %esi		/* the first 6 words are the key, of */
				/* which the top 2 words are part of */
    movl    -32(%esi), %edx	/* the second round key and hence */
    inv_mix_col
    movl    %eax, -32(%esi)	/* need to do a further six values prior */
    movl    -28(%esi), %edx	/* to using a more efficient technique */
    inv_mix_col
    movl    %eax, -28(%esi)
				/* dk[r][n] = dk[r][n-1] ^ dk[r-1][n] */
    movl    -24(%esi), %edx
    inv_mix_col
    movl    %eax, -24(%esi)	/* cycle is now 6 words long */
    movl    -20(%esi), %edx
    inv_mix_col
    movl    %eax, -20(%esi)
    movl    -16(%esi), %edx
    inv_mix_col
    movl    %eax, -16(%esi)
    movl    -12(%esi), %edx
    inv_mix_col
    movl    %eax, -12(%esi)
    movl    -8(%esi), %edx
    inv_mix_col
    movl    %eax, -8(%esi)
    movl    -4(%esi), %edx
    inv_mix_col
    movl    %eax, -4(%esi)

aes_decrypt_key192.0:
    movl    (%esi), %edx	/* expanded key is 13 * 4 = 44 32-bit words */
    inv_mix_col
    movl    %eax, (%esi)	/* using inv_mix_col.  We have already done 8 */
    xorl    -20(%esi), %eax	/* of these so 36 are left - hence we need */
    movl    %eax, 4(%esi)	/* exactly 6 loops of six here */
    xorl    -16(%esi), %eax
    movl    %eax, 8(%esi)
    xorl    -12(%esi), %eax
    movl    %eax, 12(%esi)
    xorl    -8(%esi), %eax
    movl    %eax, 16(%esi)
    xorl    -4(%esi), %eax
    movl    %eax, 20(%esi)
    addl    $24, %esi
    cmpl    %esi, %edi
    jg      aes_decrypt_key192.0
    jmp     dec_end

#endif

#ifdef AES_256

#ifndef DECRYPTION_TABLE
#define DECRYPTION_TABLE 
#endif

Entry(aes_decrypt_key256)

    pushl   %ebp
    pushl   %ebx
    pushl   %esi
    pushl   %edi
    movl    24(%esp), %eax
    movl    20(%esp), %edx
    pushl   %eax
    pushl   %edx
    do_call(aes_encrypt_key256, 8)
    movl    $14*16, %eax
    movl    24(%esp), %esi
    leal    (%esi,%eax), %edi
    addl    $64, %esi

    movl    -48(%esi), %edx	/* the primary key is 8 words, of which */
    inv_mix_col
    movl    %eax, -48(%esi)
    movl    -44(%esi), %edx
    inv_mix_col
    movl    %eax, -44(%esi)
    movl    -40(%esi), %edx
    inv_mix_col
    movl    %eax, -40(%esi)
    movl    -36(%esi), %edx
    inv_mix_col
    movl    %eax, -36(%esi)

    movl    -32(%esi), %edx	/* the encryption key expansion cycle is */
    inv_mix_col
    movl    %eax, -32(%esi)	/* start by doing one complete block */
    movl    -28(%esi), %edx
    inv_mix_col
    movl    %eax, -28(%esi)
    movl    -24(%esi), %edx
    inv_mix_col
    movl    %eax, -24(%esi)
    movl    -20(%esi), %edx
    inv_mix_col
    movl    %eax, -20(%esi)
    movl    -16(%esi), %edx
    inv_mix_col
    movl    %eax, -16(%esi)
    movl    -12(%esi), %edx
    inv_mix_col
    movl    %eax, -12(%esi)
    movl    -8(%esi), %edx
    inv_mix_col
    movl    %eax, -8(%esi)
    movl    -4(%esi), %edx
    inv_mix_col
    movl    %eax, -4(%esi)

aes_decrypt_key256.0:
    movl    (%esi), %edx	/* we can now speed up the remaining */
    inv_mix_col
    movl    %eax, (%esi)	/* outlined earlier.  But note that */
    xorl    -28(%esi), %eax	/* there is one extra inverse mix */
    movl    %eax, 4(%esi)	/* column operation as the 256 bit */
    xorl    -24(%esi), %eax	/* key has an extra non-linear step */
    movl    %eax, 8(%esi)	/* for the midway element. */
    xorl    -20(%esi), %eax
    movl    %eax, 12(%esi)	/* the expanded key is 15 * 4 = 60 */
    movl    16(%esi), %edx	/* 32-bit words of which 52 need to */
    inv_mix_col
    movl    %eax, 16(%esi)	/* 12 so 40 are left - which means */
    xorl    -12(%esi), %eax	/* that we need exactly 5 loops of 8 */
    movl    %eax, 20(%esi)
    xorl    -8(%esi), %eax
    movl    %eax, 24(%esi)
    xorl    -4(%esi), %eax
    movl    %eax, 28(%esi)
    addl    $32, %esi
    cmpl    %esi, %edi
    jg      aes_decrypt_key256.0

#endif

dec_end: 

#ifdef AES_REV_DKS

    movl    24(%esp), %esi	/* this reverses the order of the */
dec_end.1:
    movl    (%esi), %eax	/* round keys if required */
    movl    4(%esi), %ebx
    movl    (%edi), %ebp
    movl    4(%edi), %edx
    movl    %ebp, (%esi)
    movl    %edx, 4(%esi)
    movl    %eax, (%edi)
    movl    %ebx, 4(%edi)

    movl    8(%esi), %eax
    movl    12(%esi), %ebx
    movl    8(%edi), %ebp
    movl    12(%edi), %edx
    movl    %ebp, 8(%esi)
    movl    %edx, 12(%esi)
    movl    %eax, 8(%edi)
    movl    %ebx, 12(%edi)

    addl    $16, %esi
    subl    $16, %edi
    cmpl    %esi, %edi
    jg      dec_end.1

#endif

    popl    %edi
    popl    %esi
    popl    %ebx
    popl    %ebp
    xorl    %eax, %eax
    ret

#ifdef AES_VAR

Entry(aes_decrypt_key)

    movl    4(%esp), %ecx
    movl    8(%esp), %eax
    movl    12(%esp), %edx
    pushl   %edx
    pushl   %ecx

    cmpl    $16, %eax
    je      aes_decrypt_key.1
    cmpl    $128, %eax
    je      aes_decrypt_key.1

    cmpl    $24, %eax
    je      aes_decrypt_key.2
    cmpl    $192, %eax
    je      aes_decrypt_key.2

    cmpl    $32, %eax
    je      aes_decrypt_key.3
    cmpl    $256, %eax
    je      aes_decrypt_key.3
    movl    $-1, %eax
    addl    $8, %esp
    ret

aes_decrypt_key.1:
    do_call(aes_decrypt_key128, 8)
    ret
aes_decrypt_key.2:
    do_call(aes_decrypt_key192, 8)
    ret
aes_decrypt_key.3:
    do_call(aes_decrypt_key256, 8)
    ret

#endif

#endif

#ifdef DECRYPTION_TABLE

/*
 * Inverse S-box data - 256 entries
 */

    .section __DATA, __data
    .align ALIGN

#define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x

dec_tab: 
   .byte v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38)
   .byte v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb)
   .byte v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87)
   .byte v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb)
   .byte v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d)
   .byte v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e)
   .byte v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2)
   .byte v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25)
   .byte v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16)
   .byte v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92)
   .byte v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda)
   .byte v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84)
   .byte v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a)
   .byte v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06)
   .byte v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02)
   .byte v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b)
   .byte v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea)
   .byte v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73)
   .byte v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85)
   .byte v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e)
   .byte v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89)
   .byte v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b)
   .byte v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20)
   .byte v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4)
   .byte v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31)
   .byte v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f)
   .byte v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d)
   .byte v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef)
   .byte v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0)
   .byte v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61)
   .byte v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26)
   .byte v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d)

#endif