/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 * 
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */

#include "pixman-sse2.h"

#ifdef USE_SSE2

#ifdef _MSC_VER
#undef inline
#define inline __forceinline
#endif

#ifdef __GNUC__
#    define inline __inline__ __attribute__ ((__always_inline__))
#endif

/* -------------------------------------------------------------------------------------------------
 * Locals
 */

static __m64 xMask0080;
static __m64 xMask00ff;
static __m64 xMask0101;
static __m64 xMaskAlpha;

static __m64 xMask565rgb;
static __m64 xMask565Unpack;

static __m128i Mask0080;
static __m128i Mask00ff;
static __m128i Mask0101;
static __m128i Maskffff;
static __m128i Maskff000000;
static __m128i MaskAlpha;

static __m128i Mask565r;
static __m128i Mask565g1, Mask565g2;
static __m128i Mask565b;
static __m128i MaskRed;
static __m128i MaskGreen;
static __m128i MaskBlue;

static __m128i Mask565FixRB;
static __m128i Mask565FixG;

/* -------------------------------------------------------------------------------------------------
 * SSE2 Inlines
 */
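
/*
 * Throughout this file a 128-bit register holds four 32-bit ARGB pixels.
 * The unpack helpers widen each 8-bit channel into a 16-bit lane (the two
 * low pixels go to *Lo, the two high pixels to *Hi) so that per-channel
 * arithmetic can be done without overflow; the pack helpers reverse the
 * process.
 */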
static inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128());
}

static inline void
unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
{
    *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static inline __m128i
unpack565to8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;
    
    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, Mask565FixRB);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, Mask565FixG);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);
    
    return _mm_or_si128 (rb, g);
}
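
/*
 * The two fix-up ORs above replicate the top bits of each field into the
 * newly opened low bits, so a 5-bit red r becomes (r << 3) | (r >> 2) and
 * a 6-bit green g becomes (g << 2) | (g >> 4).  A scalar sketch of the
 * same conversion (illustrative only, not compiled; alpha is left zero,
 * as in the SSE2 path):
 */
#if 0
static uint32_t
convert565To8888Scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >>  5) & 0x3f;
    uint32_t b =  p        & 0x1f;

    r = (r << 3) | (r >> 2);   /* 5 -> 8 bits */
    g = (g << 2) | (g >> 4);   /* 6 -> 8 bits */
    b = (b << 3) | (b >> 2);   /* 5 -> 8 bits */

    return (r << 16) | (g << 8) | b;
}
#endif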

static inline void
unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack565to8888 (lo);
    hi = unpack565to8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static inline uint16_t
pack565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
}

static inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static inline __m128i
pack565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 ( lo, hi );

    r  = _mm_and_si128 (data , Mask565r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), Mask565g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), Mask565g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data , 3), Mask565b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static inline __m128i
pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1), pack565_2x128_128 (*xmm2, *xmm3));
}

static inline uint32_t
packAlpha (__m128i x)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (_mm_packus_epi16 (_mm_srli_epi32 (x, 24),
                                                                  _mm_setzero_si128 ()),
                                                _mm_setzero_si128 ()));
}

static inline __m128i
expandPixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
}

static inline __m128i
expandAlpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
}

static inline void
expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
}

static inline void
expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
}

static inline void
pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*dataLo, *alphaLo);
    hi = _mm_mullo_epi16 (*dataHi, *alphaHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}
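
/*
 * The multiply above uses the standard divide-by-255 trick: bias the
 * 16-bit product by 0x80, then take the high half of a multiply by
 * 0x101, giving (x * a + 0x80 + ((x * a + 0x80) >> 8)) >> 8, which is
 * x * a / 255 with correct rounding.  A scalar sketch of the same math
 * (illustrative only, not compiled):
 */
#if 0
static uint8_t
mulUn8Scalar (uint8_t x, uint8_t a)
{
    uint16_t t = (uint16_t) (x * a) + 0x80;  /* bias so truncation rounds */
    return (uint8_t) ((t + (t >> 8)) >> 8);  /* same as (t * 0x101) >> 16 */
}
#endif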

static inline void
pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi,
                      __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi,
                      __m128i* retLo, __m128i* retHi)
{
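    /*
     * Computes, per channel,
     * (src * alphaDst + dst * alphaSrc + 0x80) * 0x101 >> 16, i.e.
     * (src * alphaDst + dst * alphaSrc) / 255 with a single shared
     * rounding bias; the saturating adds clamp any 16-bit overflow of
     * the intermediate sum.
     */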
    __m128i lo, hi;
    __m128i mulLo, mulHi;

    lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo);
    hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi);
    mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo);
    mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    lo = _mm_adds_epu16 (lo, mulLo);
    hi = _mm_adds_epu16 (hi, mulHi);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}

static inline void
negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
{
    *negLo = _mm_xor_si128 (dataLo, Mask00ff);
    *negHi = _mm_xor_si128 (dataHi, Mask00ff);
}

static inline void
invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
    *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
    *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
}

static inline void
over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i t1, t2;

    negate_2x128 (*alphaLo, *alphaHi, &t1, &t2);

    pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi);

    *dstLo = _mm_adds_epu8 (*srcLo, *dstLo);
    *dstHi = _mm_adds_epu8 (*srcHi, *dstHi);
}
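
/*
 * over_2x128 is the premultiplied OVER operator, per channel:
 *
 *     dst = src + (1 - src.alpha) * dst
 *
 * where the multiply rounds as in pixMultiply_2x128 and the final add
 * saturates.  A scalar sketch for a single channel (illustrative only,
 * not compiled):
 */
#if 0
static uint8_t
overUn8Scalar (uint8_t s, uint8_t sa, uint8_t d)
{
    uint16_t t = (uint16_t) (d * (255 - sa)) + 0x80;   /* (1 - sa) * d, biased  */
    uint16_t r = (uint16_t) s + ((t + (t >> 8)) >> 8); /* src + rounded product */

    return (uint8_t) (r > 255 ? 255 : r);              /* saturating add */
}
#endif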

static inline void
overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i lo, hi;
    __m128i alphaLo, alphaHi;

    expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);

    lo = _mm_or_si128 (alphaLo, MaskAlpha);
    hi = _mm_or_si128 (alphaHi, MaskAlpha);

    invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);

    pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi);
}

static inline void
inOver_2x128 (__m128i* srcLo,  __m128i* srcHi,  __m128i*  alphaLo, __m128i*  alphaHi,
              __m128i* maskLo, __m128i* maskHi, __m128i* dstLo,   __m128i* dstHi)
{
    __m128i sLo, sHi;
    __m128i aLo, aHi;

    pixMultiply_2x128 (  srcLo,   srcHi, maskLo, maskHi, &sLo, &sHi);
    pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);

    over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi);
}

static inline void
cachePrefetch (__m128i* addr)
{
    _mm_prefetch ((const char*) addr, _MM_HINT_T0);
}

static inline void
cachePrefetchNext (__m128i* addr)
{
    /* prefetch the cache line 64 bytes ahead of the current position */
    _mm_prefetch ((const char*) (addr + 4), _MM_HINT_T0);
}

/* load 4 pixels from a 16-byte aligned address */
static inline __m128i
load128Aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static inline __m128i
load128Unaligned (__m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory at a 16-byte aligned address */
static inline void
save128WriteCombining (__m128i* dst, __m128i data)
{
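    /* non-temporal store: it bypasses the cache, so a store fence
     * (e.g. _mm_sfence ()) is needed before another agent reads the data */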
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels to a 16-byte aligned address */
static inline void
save128Aligned (__m128i* dst, __m128i data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static inline void
save128Unaligned (__m128i* dst, __m128i data)
{
    _mm_storeu_si128 (dst, data);
}

/* -------------------------------------------------------------------------------------------------
 * MMX inlines
 */

static inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64());
}

static inline __m64
expandAlpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
}

static inline __m64
expandAlphaRev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
}

static inline __m64
expandPixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
}

static inline __m64
pixMultiply_1x64 (__m64 data, __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          xMask0080),
                           xMask0101);
}

static inline __m64
pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst),
                                                         xMask0080),
                                          _mm_mullo_pi16 (*dst, *alphaSrc)),
                           xMask0101);
}

static inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, xMask00ff);
}

static inline __m64
invertColors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
}

static inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
}

static inline __m64
inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pixMultiply_1x64 (*src, *mask),
                      pixMultiply_1x64 (*alpha, *mask),
                      *dst);
}

static inline __m64
overRevNonPre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expandAlpha_1x64 (src);

    return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
                                        _mm_or_si64 (alpha, xMaskAlpha)),
                      alpha,
                      dst);
}

static inline uint32_t
pack_1x64_32( __m64 data )
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64()));
}

/* Expand a single 16-bit 565 pixel held in the low word of an MMX
 * register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
 * m = m & (0x1f0003f001f);
 * m = m * (0x8404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top (red) word is shifted by another nibble
 * to avoid it bumping into the middle (green) word
 */
static inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, xMask565rgb);
    p = _mm_mullo_pi16 (p, xMask565Unpack);

    return _mm_srli_pi16 (p, 8);
}

/* -------------------------------------------------------------------------------------------------
 * Compose Core transformations
 */
static inline uint32_t
coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
{
    uint8_t     a;
    __m64       ms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (a)
    {
        ms = unpack_32_1x64 (src);
        return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
    }

    return dst;
}
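
/*
 * The combiners below share a three-phase structure: a scalar head loop
 * runs until dst reaches a 16-byte boundary, a SIMD body processes four
 * pixels per iteration, and a scalar tail handles whatever is left.
 * Only dst alignment is established, so src (and mask) are always read
 * with unaligned loads.
 */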

static inline void
coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t pa;
    uint32_t s, d;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps++;

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        /* Check the alpha channel */
        pa = packAlpha (xmmSrcHi);

        if (pa == 0xffffffff)
        {
            save128Aligned ((__m128i*)pd, xmmSrcHi);
        }
        else if (pa)
        {
            xmmDstHi = load128Aligned ((__m128i*) pd);

            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixels and save */
            save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
        }

        w -= 4;
        ps += 4;
        pd += 4;
    }

    while (w)
    {
        d = *pd;
        s = *ps++;

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
    }
}

static inline void
coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps++;

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);

        /* rebuild the 4 pixels and save */
        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));

        w -= 4;
        ps += 4;
        pd += 4;
    }

    while (w)
    {
        d = *pd;
        s = *ps++;

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
    }
}

static inline uint32_t
coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x64_32(pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}

static inline void
coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
    }
}

static inline void
coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
    }
}

static inline void
coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = *ps++;
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        negate_2x128      (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = *ps++;
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}

static inline void
coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = *ps++;
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
        negate_2x128      (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = *ps++;
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}

static inline uint32_t
coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
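    /* ATOP: dst = src * dst.alpha + dst * (1 - src.alpha) */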
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
    __m64 da = expandAlpha_1x64 (d);

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}

static inline void
coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);

        pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                               &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                               &xmmDstLo, &xmmDstHi );

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
    }
}

static inline uint32_t
coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
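    /* ATOP reverse: dst = src * (1 - dst.alpha) + dst * src.alpha */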
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}

static inline void
coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                               &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                               &xmmDstLo, &xmmDstHi );

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
    }
}

static inline uint32_t
coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
{
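    /* XOR: dst = src * (1 - dst.alpha) + dst * (1 - src.alpha) */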
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 negD = negate_1x64 (expandAlpha_1x64 (d));
    __m64 negS = negate_1x64 (expandAlpha_1x64 (s));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS));
}

static inline void
coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrc = load128Unaligned ((__m128i*) ps);
        xmmDst = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                               &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                               &xmmDstLo, &xmmDstHi );

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
    }
}

static inline void
coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, int width)
{
    int w = width;
    uint32_t s,d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        d = *pd;
        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        save128Aligned( (__m128i*)pd,
                        _mm_adds_epu8( load128Unaligned((__m128i*)ps),
                                       load128Aligned  ((__m128i*)pd)) );
        pd += 4;
        ps += 4;
        w -= 4;
    }

    while (w--)
    {
        s = *ps++;
        d = *pd;
        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
    }
}

static inline uint32_t
coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
{
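    /*
     * SATURATE: when src.alpha exceeds the remaining coverage ~dst.alpha,
     * the source is first scaled by FbIntDiv (da, sa) (an integer
     * approximation of da / sa in the 0-255 range, broadcast from the
     * alpha byte) so the saturating add never overshoots full coverage.
     */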
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}

static inline void
coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, int w)
{
    uint32_t s,d;

    uint32_t packCmp;
    __m128i xmmSrc, xmmDst;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        d = *pd;
        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDst = load128Aligned  ((__m128i*)pd);
        xmmSrc = load128Unaligned((__m128i*)ps);

        packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
                                                      _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));

        /* if any src alpha is greater than its respective ~dst alpha */
        if (packCmp)
        {
            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        }
        else
        {
            save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));

            pd += 4;
            ps += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = *ps++;
        d = *pd;
        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
    }
}

static inline void
coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmMaskLo, xmmMaskHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }
}

static inline uint32_t
coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
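    /* component-alpha OVER: dst = src * mask + dst * (1 - src.alpha * mask) */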
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expandAlpha_1x64 (s);
    __m64 unpkMask = unpack_32_1x64 (mask);
    __m64 unpkDst  = unpack_32_1x64 (dst);

    return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst));
}

static inline void
coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }
}

static inline uint32_t
coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (over_1x64 (d, expandAlpha_1x64 (d),
                                    pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
}

static inline void
coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }
}

static inline void
coreCombineInCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }
}

static inline void
coreCombineInReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}

static inline void
coreCombineOutCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}

static inline void
coreCombineOutReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }
}

static inline uint32_t
coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = expandAlpha_1x64 (d);

    s = pixMultiply_1x64 (s, m);
    m = negate_1x64 (pixMultiply_1x64 (m, sa));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}

static inline void
coreCombineAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }
}

static inline uint32_t
coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expandAlpha_1x64 (d));
    __m64 sa = expandAlpha_1x64 (s);

    s = pixMultiply_1x64 (s, m);
    m = pixMultiply_1x64 (m, sa);

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}

static inline void
coreCombineReverseAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }
}

static inline uint32_t
coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
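    /* component-alpha XOR:
     * dst = dst * (1 - mask * src.alpha) + src * mask * (1 - dst.alpha) */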
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s)));
    __m64 dest     = pixMultiply_1x64 (s, a);
    __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d,
                                              &alphaDst,
                                              &dest,
                                              &alphaSrc));
}

static inline void
coreCombineXorCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }
}

static inline void
coreCombineAddCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load*/
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*)pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
                                                      _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }
}

/* -------------------------------------------------------------------------------------------------
 * fbComposeSetupSSE2
 */
static inline __m64
createMask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static inline __m128i
createMask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static inline __m64
createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

static inline __m128i
createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
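
/*
 * Example: createMask_2x32_128 (0x00ff0000, 0x00000000) builds
 * { 0x00ff0000, 0x00000000, 0x00ff0000, 0x00000000 } (high to low),
 * i.e. the two words repeated once for each 64-bit half of the register.
 */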

/* SSE2 code patch for fbcompose.c */

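/*
 * Thin FASTCALL wrappers that adapt the core combiners above to the
 * function-pointer tables in fbcompose.c.  Each one finishes with
 * _mm_empty() so the MMX state is cleared before returning to code
 * that may use the x87 FPU.
 */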
static FASTCALL void
sse2CombineMaskU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseInUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineOverU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOverUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineOverReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOverReverseUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineInU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineInUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineInReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseInUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineOutU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOutUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineOutReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseOutUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineAtopU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineAtopUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineAtopReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseAtopUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineXorU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineXorUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineAddU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineAddUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineSaturateU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineSaturateUsse2 (dst, src, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineSrcC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineSrcCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineOverC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOverCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineOverReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOverReverseCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineInC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineInCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineInReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineInReverseCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineOutC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOutCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineOutReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOutReverseCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineAtopC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineAtopCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineAtopReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineReverseAtopCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineXorC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineXorCsse2 (dst, src, mask, width);
    _mm_empty();
}

static FASTCALL void
sse2CombineAddC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineAddCsse2 (dst, src, mask, width);
    _mm_empty();
}

void
fbComposeSetupSSE2(void)
{
    static pixman_bool_t initialized = FALSE;

    if (initialized)
	return;
    
    /* check if we have SSE2 support and initialize accordingly */
    if (pixman_have_sse2())
    {
        /* SSE2 constants */
        Mask565r  = createMask_2x32_128 (0x00f80000, 0x00f80000);
        Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000);
        Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0);
        Mask565b  = createMask_2x32_128 (0x0000001f, 0x0000001f);
        MaskRed   = createMask_2x32_128 (0x00f80000, 0x00f80000);
        MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00);
        MaskBlue  = createMask_2x32_128 (0x000000f8, 0x000000f8);
        Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0);
        Mask565FixG = createMask_2x32_128 (0x0000c000, 0x0000c000);
        Mask0080 = createMask_16_128 (0x0080);
        Mask00ff = createMask_16_128 (0x00ff);
        Mask0101 = createMask_16_128 (0x0101);
        Maskffff = createMask_16_128 (0xffff);
        Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);
        MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000);

        /* MMX constants */
        xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f);
        xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840);

        xMask0080 = createMask_16_64 (0x0080);
        xMask00ff = createMask_16_64 (0x00ff);
        xMask0101 = createMask_16_64 (0x0101);
        xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000);

        /* SSE code patch for fbcompose.c */
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = sse2CombineOverU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN] = sse2CombineInU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = sse2CombineOutU;

        pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = sse2CombineAtopU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = sse2CombineXorU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = sse2CombineAddU;

        pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = sse2CombineSaturateU;

        pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = sse2CombineSrcC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = sse2CombineOverC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN] = sse2CombineInC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = sse2CombineOutC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = sse2CombineAtopC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = sse2CombineXorC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = sse2CombineAddC;

        pixman_composeFunctions.combineMaskU = sse2CombineMaskU;
    }

    initialized = TRUE;

    _mm_empty();
}


/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolid_nx8888
 */

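/*
 * The fbComposite* routines below share a common loop structure: a
 * scalar head loop runs until dst is 16-byte aligned, a SIMD body then
 * processes one 16-byte block of destination pixels per iteration with
 * aligned stores, and a scalar tail loop finishes the remainder.
 */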
void
fbCompositeSolid_nx8888sse2 (pixman_op_t op,
			    pixman_image_t * pSrc,
			    pixman_image_t * pMask,
			    pixman_image_t * pDst,
			    int16_t	xSrc,
			    int16_t	ySrc,
			    int16_t	xMask,
			    int16_t	yMask,
			    int16_t	xDst,
			    int16_t	yDst,
			    uint16_t	width,
			    uint16_t	height)
{
    uint32_t	src;
    uint32_t	*dstLine, *dst, d;
    uint16_t	w;
    int	dstStride;
    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    if (src >> 24 == 0)
	return;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);

    while (height--)
    {
        dst = dstLine;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));
            w--;
        }

        cachePrefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)dst);

            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixel data and save */
            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));
            w--;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolid_nx0565
 */
void
fbCompositeSolid_nx0565sse2 (pixman_op_t op,
			    pixman_image_t * pSrc,
			    pixman_image_t * pMask,
			    pixman_image_t * pDst,
			    int16_t	xSrc,
			    int16_t	ySrc,
			    int16_t	xMask,
			    int16_t	yMask,
			    int16_t	xDst,
			    int16_t	yDst,
			    uint16_t	width,
			    uint16_t	height)
{
    uint32_t	src;
    uint16_t	*dstLine, *dst, d;
    uint16_t	w;
    int	        dstStride;
    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    if (src >> 24 == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);

    while (height--)
    {
        dst = dstLine;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;

            *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                                             _mm_movepi64_pi64 (xmmAlpha),
                                                             expand565_16_1x64 (d))));
            w--;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)dst);

        while (w >= 8)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)dst);

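            /* eight r5g6b5 pixels expand into four 2-pixel 8888 vectors */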
            xmmDst = load128Aligned ((__m128i*)dst);

            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);

            over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1);
            over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3);

            xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
            save128Aligned ((__m128i*)dst, xmmDst);

            dst += 8;
            w -= 8;
        }

        while (w--)
        {
            d = *dst;
            *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                                             _mm_movepi64_pi64 (xmmAlpha),
                                                             expand565_16_1x64 (d))));
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8888x8888C
 */

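/*
 * Solid source with an 8888 component-alpha mask.  The 4-pixel loop
 * tests the mask with cmpeq/movemask and skips stores for spans whose
 * mask is entirely zero.
 */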
void
fbCompositeSolidMask_nx8888x8888Csse2 (pixman_op_t op,
				      pixman_image_t * pSrc,
				      pixman_image_t * pMask,
				      pixman_image_t * pDst,
				      int16_t	xSrc,
				      int16_t	ySrc,
				      int16_t	xMask,
				      int16_t	yMask,
				      int16_t	xDst,
				      int16_t	yDst,
				      uint16_t	width,
				      uint16_t	height)
{
    uint32_t	src, srca;
    uint32_t	*dstLine, d;
    uint32_t	*maskLine, m;
    uint32_t    packCmp;
    int	dstStride, maskStride;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDst;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    srca = src >> 24;
    if (srca == 0)
	return;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);

    xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc   = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

    while (height--)
    {
        int w = width;
        uint32_t *pm = (uint32_t *)maskLine;
        uint32_t *pd = (uint32_t *)dstLine;

        dstLine += dstStride;
        maskLine += maskStride;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)pd);
        cachePrefetch ((__m128i*)pm);

        while (w && (unsigned long)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;
                mmxMask = unpack_32_1x64 (m);
                mmxDst = unpack_32_1x64 (d);

                *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                 &mmxAlpha,
                                                 &mmxMask,
                                                 &mmxDst));
            }

            pd++;
            w--;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)pd);
        cachePrefetch ((__m128i*)pm);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)pd);
            cachePrefetchNext ((__m128i*)pm);

            xmmMask = load128Unaligned ((__m128i*)pm);

            packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));

            /* if all bits in mask are zero, packCmp is equal to 0xffff */
            if (packCmp != 0xffff)
            {
                xmmDst = load128Aligned ((__m128i*)pd);

                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
                unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

                save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;
                mmxMask = unpack_32_1x64 (m);
                mmxDst = unpack_32_1x64 (d);

                *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                 &mmxAlpha,
                                                 &mmxMask,
                                                 &mmxDst));
            }

            pd++;
            w--;
        }
    }

    _mm_empty();
}


/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888x8x8888
 */

void
fbCompositeSrc_8888x8x8888sse2 (pixman_op_t op,
			       pixman_image_t * pSrc,
			       pixman_image_t * pMask,
			       pixman_image_t * pDst,
			       int16_t	xSrc,
			       int16_t	ySrc,
			       int16_t      xMask,
			       int16_t      yMask,
			       int16_t      xDst,
			       int16_t      yDst,
			       uint16_t     width,
			       uint16_t     height)
{
    uint32_t	*dstLine, *dst;
    uint32_t	*srcLine, *src;
    uint32_t	mask;
    uint16_t	w;
    int	dstStride, srcStride;

    __m128i xmmMask;
    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
    fbComposeGetSolid (pMask, mask, pDst->bits.format);

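    /* only the alpha byte of the solid mask matters; replicate it
     * across all 16-bit lanes */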
    xmmMask = createMask_16_128 (mask >> 24);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)src);

        while (w && (unsigned long)dst & 15)
        {
            uint32_t s = *src++;
            uint32_t d = *dst;

            __m64 ms = unpack_32_1x64 (s);
            __m64 alpha = expandAlpha_1x64 (ms);
            __m64 mask  = _mm_movepi64_pi64 (xmmMask);
            __m64 dest  = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
                                                &alpha,
                                                &mask,
                                                &dest));

            w--;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)src);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)dst);
            cachePrefetchNext ((__m128i*)src);

            xmmSrc = load128Unaligned ((__m128i*)src);
            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);

            save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t s = *src++;
            uint32_t d = *dst;

            __m64 ms = unpack_32_1x64 (s);
            __m64 alpha = expandAlpha_1x64 (ms);
            __m64 mask  = _mm_movepi64_pi64 (xmmMask);
            __m64 dest  = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
                                                &alpha,
                                                &mask,
                                                &dest));

            w--;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_x888xnx8888
 */
void
fbCompositeSrc_x888xnx8888sse2 (pixman_op_t op,
			       pixman_image_t * pSrc,
			       pixman_image_t * pMask,
			       pixman_image_t * pDst,
			       int16_t	xSrc,
			       int16_t	ySrc,
			       int16_t      xMask,
			       int16_t      yMask,
			       int16_t      xDst,
			       int16_t      yDst,
			       uint16_t     width,
			       uint16_t     height)
{
    uint32_t	*dstLine, *dst;
    uint32_t	*srcLine, *src;
    uint32_t	mask;
    int	dstStride, srcStride;
    uint16_t	w;

    __m128i xmmMask, xmmAlpha;
    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
    fbComposeGetSolid (pMask, mask, pDst->bits.format);

    xmmMask = createMask_16_128 (mask >> 24);
    xmmAlpha = Mask00ff;

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)src);

        while (w && (unsigned long)dst & 15)
        {
            uint32_t s = (*src++) | 0xff000000;
            uint32_t d = *dst;

            __m64 ms    = unpack_32_1x64 (s);
            __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
            __m64 mask  = _mm_movepi64_pi64 (xmmMask);
            __m64 dest  = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
                                                &alpha,
                                                &mask,
                                                &dest));

            w--;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)src);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)dst);
            cachePrefetchNext ((__m128i*)src);

            xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);

            save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t s = (*src++) | 0xff000000;
            uint32_t d = *dst;

            __m64 ms    = unpack_32_1x64 (s);
            __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
            __m64 mask  = _mm_movepi64_pi64 (xmmMask);
            __m64 dest  = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
                                                &alpha,
                                                &mask,
                                                &dest));

            w--;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888x8888
 */
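/* The OVER operator is applied one scanline at a time by the core combiner. */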
void
fbCompositeSrc_8888x8888sse2 (pixman_op_t op,
			     pixman_image_t * pSrc,
			     pixman_image_t * pMask,
			     pixman_image_t * pDst,
			     int16_t	xSrc,
			     int16_t	ySrc,
			     int16_t      xMask,
			     int16_t      yMask,
			     int16_t      xDst,
			     int16_t      yDst,
			     uint16_t     width,
			     uint16_t     height)
{
    int	        dstStride, srcStride;
    uint32_t	*dstLine, *dst;
    uint32_t	*srcLine, *src;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    dst = dstLine;
    src = srcLine;

    while (height--)
    {
        coreCombineOverUsse2 (dst, src, width);

        dst += dstStride;
        src += srcStride;
    }
    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888x0565
 */
static inline uint16_t
fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst)
{
    __m64       ms;

    ms = unpack_32_1x64 (src);
    return pack565_32_16 (pack_1x64_32 (over_1x64 (ms,
                                                   expandAlpha_1x64 (ms),
                                                   expand565_16_1x64 (dst))));
}

void
fbCompositeSrc_8888x0565sse2 (pixman_op_t op,
			     pixman_image_t * pSrc,
			     pixman_image_t * pMask,
			     pixman_image_t * pDst,
			     int16_t      xSrc,
			     int16_t      ySrc,
			     int16_t      xMask,
			     int16_t      yMask,
			     int16_t      xDst,
			     int16_t      yDst,
			     uint16_t     width,
			     uint16_t     height)
{
    uint16_t	*dstLine, *dst, d;
    uint32_t	*srcLine, *src, s;
    int	dstStride, srcStride;
    uint16_t	w;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

#if 0
    /* FIXME
     *
     * This code was copied from the MMX version, FIXME included.
     * If it's a problem there, it's probably a problem here too.
     */
    assert (pSrc->pDrawable == pMask->pDrawable);
#endif

    while (height--)
    {
        dst = dstLine;
        src = srcLine;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;
        srcLine += srcStride;
        w = width;

        /* Align dst on a 16-byte boundary */
        while (w &&
               ((unsigned long)dst & 15))
        {
            s = *src++;
            d = *dst;

            *dst++ = fbCompositeSrc_8888x0565pixel (s, d);
            w--;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        /* It's an 8-pixel loop */
        while (w >= 8)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);

            /* The source pointer may not be 16-byte aligned, so use an unaligned load. */
            xmmSrc = load128Unaligned ((__m128i*) src);
            xmmDst = load128Aligned ((__m128i*) dst);

            /* Unpacking */
            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            /* Load the next 4 source pixels early to overlap the memory read. */
            xmmSrc = load128Unaligned ((__m128i*) (src+4));

            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1);

            /* Unpacking */
            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3);

            save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

            w -= 8;
            dst += 8;
            src += 8;
        }

        while (w--)
        {
            s = *src++;
            d = *dst;

            *dst++ = fbCompositeSrc_8888x0565pixel (s, d);
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8x8888
 */

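/*
 * Solid source with an a8 mask.  The 4-pixel loop special-cases a fully
 * opaque source under a fully set mask word (store the solid color
 * directly) and a zero mask word (leave the destination untouched).
 */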
void
fbCompositeSolidMask_nx8x8888sse2 (pixman_op_t op,
				  pixman_image_t * pSrc,
				  pixman_image_t * pMask,
				  pixman_image_t * pDst,
				  int16_t      xSrc,
				  int16_t      ySrc,
				  int16_t      xMask,
				  int16_t      yMask,
				  int16_t      xDst,
				  int16_t      yDst,
				  uint16_t     width,
				  uint16_t     height)
{
    uint32_t	src, srca;
    uint32_t	*dstLine, *dst;
    uint8_t	*maskLine, *mask;
    int	dstStride, maskStride;
    uint16_t	w;
    uint32_t m, d;

    __m128i xmmSrc, xmmAlpha, xmmDef;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    srca = src >> 24;
    if (srca == 0)
	return;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    xmmDef = createMask_2x32_128 (src, src);
    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc   = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            uint8_t m = *mask++;

            if (m)
            {
                d = *dst;
                mmxMask = expandPixel_8_1x64 (m);
                mmxDest = unpack_32_1x64 (d);

                *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                  &mmxAlpha,
                                                  &mmxMask,
                                                  &mmxDest));
            }

            w--;
            dst++;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            m = *((uint32_t*)mask);

            if (srca == 0xff && m == 0xffffffff)
            {
                save128Aligned ((__m128i*)dst, xmmDef);
            }
            else if (m)
            {
                xmmDst = load128Aligned ((__m128i*) dst);
                xmmMask = unpack_32_1x128 (m);
                xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

                /* Unpacking */
                unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
            }

            w -= 4;
            dst += 4;
            mask += 4;
        }

        while (w)
        {
            uint8_t m = *mask++;

            if (m)
            {
                d = *dst;
                mmxMask = expandPixel_8_1x64 (m);
                mmxDest = unpack_32_1x64 (d);

                *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                  &mmxAlpha,
                                                  &mmxMask,
                                                  &mmxDest));
            }

            w--;
            dst++;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * pixmanFill
 */

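/*
 * Solid fill.  Only 16 bpp (with the 16-bit pattern already replicated
 * into both halves of 'data') and 32 bpp are handled; anything else
 * returns FALSE.
 */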
pixman_bool_t
pixmanFillsse2 (uint32_t *bits,
		 int stride,
		 int bpp,
		 int x,
		 int y,
		 int width,
		 int height,
		 uint32_t data)
{
    uint32_t	byte_width;
    uint8_t	    *byte_line;

    __m128i xmmDef;

    if (bpp == 16 && (data >> 16 != (data & 0xffff)))
	return FALSE;

    if (bpp != 16 && bpp != 32)
	return FALSE;

    if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;
    }
    else
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }

    cachePrefetch ((__m128i*)byte_line);
    xmmDef = createMask_2x32_128 (data, data);

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;
        byte_line += stride;
        w = byte_width;
        cachePrefetchNext ((__m128i*)d);

        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = data;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((unsigned long)d & 15))
        {
            *(uint32_t *)d = data;

            w -= 4;
            d += 4;
        }

        cachePrefetchNext ((__m128i*)d);

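        /* store in descending power-of-two chunks: 128-, 64-, 32- and
         * 16-byte aligned SSE stores, then 4- and 2-byte scalar stores */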
        while (w >= 128)
        {
            cachePrefetch (((__m128i*)d) + 12);

            save128Aligned ((__m128i*)(d),     xmmDef);
            save128Aligned ((__m128i*)(d+16),  xmmDef);
            save128Aligned ((__m128i*)(d+32),  xmmDef);
            save128Aligned ((__m128i*)(d+48),  xmmDef);
            save128Aligned ((__m128i*)(d+64),  xmmDef);
            save128Aligned ((__m128i*)(d+80),  xmmDef);
            save128Aligned ((__m128i*)(d+96),  xmmDef);
            save128Aligned ((__m128i*)(d+112), xmmDef);

            d += 128;
            w -= 128;
        }

        if (w >= 64)
        {
            cachePrefetch (((__m128i*)d) + 8);

            save128Aligned ((__m128i*)(d),     xmmDef);
            save128Aligned ((__m128i*)(d+16),  xmmDef);
            save128Aligned ((__m128i*)(d+32),  xmmDef);
            save128Aligned ((__m128i*)(d+48),  xmmDef);

            d += 64;
            w -= 64;
        }

        cachePrefetchNext ((__m128i*)d);

        if (w >= 32)
        {
            save128Aligned ((__m128i*)(d),     xmmDef);
            save128Aligned ((__m128i*)(d+16),  xmmDef);

            d += 32;
            w -= 32;
        }

        if (w >= 16)
        {
            save128Aligned ((__m128i*)(d),     xmmDef);

            d += 16;
            w -= 16;
        }

        cachePrefetchNext ((__m128i*)d);

        while (w >= 4)
        {
            *(uint32_t *)d = data;

            w -= 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(uint16_t *)d = data;
            w -= 2;
            d += 2;
        }
    }

    _mm_empty();
    return TRUE;
}

void
fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_op_t op,
				     pixman_image_t * pSrc,
				     pixman_image_t * pMask,
				     pixman_image_t * pDst,
				     int16_t      xSrc,
				     int16_t      ySrc,
				     int16_t      xMask,
				     int16_t      yMask,
				     int16_t      xDst,
				     int16_t      yDst,
				     uint16_t     width,
				     uint16_t     height)
{
    uint32_t	src, srca;
    uint32_t	*dstLine, *dst;
    uint8_t	*maskLine, *mask;
    int	dstStride, maskStride;
    uint16_t	w;
    uint32_t    m;

    __m128i xmmSrc, xmmDef;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    srca = src >> 24;
    if (srca == 0)
    {
        pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride,
                        PIXMAN_FORMAT_BPP (pDst->bits.format),
                        xDst, yDst, width, height, 0);
        return;
    }

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    xmmDef = createMask_2x32_128 (src, src);
    xmmSrc = expandPixel_32_1x128 (src);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            uint8_t m = *mask++;

            if (m)
            {
                *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
            }
            else
            {
                *dst = 0;
            }

            w--;
            dst++;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            m = *((uint32_t*)mask);

            if (srca == 0xff && m == 0xffffffff)
            {
                save128Aligned ((__m128i*)dst, xmmDef);
            }
            else if (m)
            {
                xmmMask = unpack_32_1x128 (m);
                xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

                /* Unpacking */
                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

                pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
            }
            else
            {
                save128Aligned ((__m128i*)dst, _mm_setzero_si128());
            }

            w -= 4;
            dst += 4;
            mask += 4;
        }

        while (w)
        {
            uint8_t m = *mask++;

            if (m)
            {
                *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
            }
            else
            {
                *dst = 0;
            }

            w--;
            dst++;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8x0565
 */

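/*
 * Solid source, a8 mask, r5g6b5 destination.  The 8-pixel loop reads
 * the mask four bytes at a time and composites each 4-pixel half
 * separately, skipping halves whose mask word is zero.
 */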
void
fbCompositeSolidMask_nx8x0565sse2 (pixman_op_t op,
				  pixman_image_t * pSrc,
				  pixman_image_t * pMask,
				  pixman_image_t * pDst,
				  int16_t      xSrc,
				  int16_t      ySrc,
				  int16_t      xMask,
				  int16_t      yMask,
				  int16_t      xDst,
				  int16_t      yDst,
				  uint16_t     width,
				  uint16_t     height)
{
    uint32_t	src, srca;
    uint16_t	*dstLine, *dst, d;
    uint8_t	*maskLine, *mask;
    int	dstStride, maskStride;
    uint16_t	w;
    uint32_t m;
    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    srca = src >> 24;
    if (srca == 0)
	return;

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            m = *mask++;

            if (m)
            {
                d = *dst;
                mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
                mmxDest = expand565_16_1x64 (d);

                *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                                 &mmxAlpha,
                                                                 &mmxMask,
                                                                 &mmxDest)));
            }

            w--;
            dst++;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w >= 8)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            xmmDst = load128Aligned ((__m128i*) dst);
            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);

            m = *((uint32_t*)mask);
            mask += 4;

            if (m)
            {
                xmmMask = unpack_32_1x128 (m);
                xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

                /* Unpacking */
                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
            }

            m = *((uint32_t*)mask);
            mask += 4;

            if (m)
            {
                xmmMask = unpack_32_1x128 (m);
                xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

                /* Unpacking */
                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
            }

            save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

            w -= 8;
            dst += 8;
        }

        while (w)
        {
            m = *mask++;

            if (m)
            {
                d = *dst;
                mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
                mmxDest = expand565_16_1x64 (d);

                *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                                 &mmxAlpha,
                                                                 &mmxMask,
                                                                 &mmxDest)));
            }

            w--;
            dst++;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888RevNPx0565
 */

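/*
 * packAlpha() gathers the four source alpha bytes: 0xffffffff means all
 * four pixels are fully opaque, so invertColors_2x128 converts them
 * directly; zero means fully transparent, so the span is skipped;
 * anything else takes the full overRevNonPre path.
 */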
void
fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op,
				  pixman_image_t * pSrc,
				  pixman_image_t * pMask,
				  pixman_image_t * pDst,
				  int16_t      xSrc,
				  int16_t      ySrc,
				  int16_t      xMask,
				  int16_t      yMask,
				  int16_t      xDst,
				  int16_t      yDst,
				  uint16_t     width,
				  uint16_t     height)
{
    uint16_t	*dstLine, *dst, d;
    uint32_t	*srcLine, *src, s;
    int	dstStride, srcStride;
    uint16_t	w;
    uint32_t    packCmp;

    __m64 ms;
    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

#if 0
    /* FIXME
     *
     * This code was copied from the MMX version, FIXME included.
     * If it's a problem there, it's probably a problem here too.
     */
    assert (pSrc->pDrawable == pMask->pDrawable);
#endif

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            s = *src++;
            d = *dst;

            ms = unpack_32_1x64 (s);

            *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
            w--;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        while (w >= 8)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);

            /* First round */
            xmmSrc = load128Unaligned((__m128i*)src);
            xmmDst = load128Aligned  ((__m128i*)dst);

            packCmp = packAlpha (xmmSrc);

            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);

            /* preload the next round */
            xmmSrc = load128Unaligned((__m128i*)(src+4));

            if (packCmp == 0xffffffff)
            {
                invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
            }
            else if (packCmp)
            {
                overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
            }

            /* Second round */
            packCmp = packAlpha (xmmSrc);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);

            if (packCmp == 0xffffffff)
            {
                invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
            }
            else if (packCmp)
            {
                overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
            }

            save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

            w -= 8;
            src += 8;
            dst += 8;
        }

        while (w)
        {
            s = *src++;
            d = *dst;

            ms = unpack_32_1x64 (s);

            *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
            w--;
        }
    }

    _mm_empty();
}

/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888RevNPx8888
 */

void
fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op,
				  pixman_image_t * pSrc,
				  pixman_image_t * pMask,
				  pixman_image_t * pDst,
				  int16_t      xSrc,
				  int16_t      ySrc,
				  int16_t      xMask,
				  int16_t      yMask,
				  int16_t      xDst,
				  int16_t      yDst,
				  uint16_t     width,
				  uint16_t     height)
{
    uint32_t	*dstLine, *dst, d;
    uint32_t	*srcLine, *src, s;
    int	dstStride, srcStride;
    uint16_t	w;
    uint32_t    packCmp;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

#if 0
    /* FIXME
     *
     * This code was copied from the MMX version, FIXME included.
     * If it's a problem there, it's probably a problem here too.
     */
    assert (pSrc->pDrawable == pMask->pDrawable);
#endif

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
        {
            s = *src++;
            d = *dst;

            *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));

            w--;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);

            xmmSrcHi = load128Unaligned((__m128i*)src);

            packCmp = packAlpha (xmmSrcHi);

            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

            if (packCmp == 0xffffffff)
            {
                invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);

                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
            }
            else if (packCmp)
            {
                xmmDstHi = load128Aligned  ((__m128i*)dst);

                unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

                overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);

                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
            }

            w -= 4;
            dst += 4;
            src += 4;
        }

        while (w)
        {
            s = *src++;
            d = *dst;

            *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));

            w--;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8888x0565C
 */

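/*
 * Solid source with an 8888 component-alpha mask over r5g6b5.  Eight
 * destination pixels are unpacked at once and each 4-pixel half is
 * skipped when its mask vector is entirely zero.
 */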
void
fbCompositeSolidMask_nx8888x0565Csse2 (pixman_op_t op,
				      pixman_image_t * pSrc,
				      pixman_image_t * pMask,
				      pixman_image_t * pDst,
				      int16_t      xSrc,
				      int16_t      ySrc,
				      int16_t      xMask,
				      int16_t      yMask,
				      int16_t      xDst,
				      int16_t      yDst,
				      uint16_t     width,
				      uint16_t     height)
{
    uint32_t	src, srca;
    uint16_t	*dstLine, *dst, d;
    uint32_t	*maskLine, *mask, m;
    int	dstStride, maskStride;
    int w;
    uint32_t packCmp;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    srca = src >> 24;
    if (srca == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

    while (height--)
    {
        w = width;
        mask = maskLine;
        dst = dstLine;
        maskLine += maskStride;
        dstLine += dstStride;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                d = *dst;
                mmxMask = unpack_32_1x64 (m);
                mmxDest = expand565_16_1x64 (d);

                *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                                 &mmxAlpha,
                                                                 &mmxMask,
                                                                 &mmxDest)));
            }

            w--;
            dst++;
            mask++;
        }

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w >= 8)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            /* First round */
            xmmMask = load128Unaligned((__m128i*)mask);
            xmmDst = load128Aligned((__m128i*)dst);

            packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));

            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

            /* preload the next round */
            xmmMask = load128Unaligned((__m128i*)(mask+4));

            if (packCmp != 0xffff)
            {
                inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
            }

            /* Second round */
            packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

            if (packCmp != 0xffff)
            {
                inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
            }

            save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

            w -= 8;
            dst += 8;
            mask += 8;
        }

        while (w)
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                d = *dst;
                mmxMask = unpack_32_1x64 (m);
                mmxDest = expand565_16_1x64 (d);

                *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                                 &mmxAlpha,
                                                                 &mmxMask,
                                                                 &mmxDest)));
            }

            w--;
            dst++;
            mask++;
        }
    }

    _mm_empty ();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeIn_nx8x8
 */

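/*
 * For each 8-bit pixel this computes dst = srcAlpha * mask * dst,
 * i.e. the IN operator with a solid source, an a8 mask and an a8
 * destination.
 */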
void
fbCompositeIn_nx8x8sse2 (pixman_op_t op,
			pixman_image_t * pSrc,
			pixman_image_t * pMask,
			pixman_image_t * pDst,
			int16_t      xSrc,
			int16_t      ySrc,
			int16_t      xMask,
			int16_t      yMask,
			int16_t      xDst,
			int16_t      yDst,
			uint16_t     width,
			uint16_t     height)
{
    uint8_t	*dstLine, *dst;
    uint8_t	*maskLine, *mask;
    int	dstStride, maskStride;
    uint16_t	w;
    uint32_t	d, m;
    uint32_t	src;
    uint8_t	sa;

    __m128i xmmAlpha;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    sa = src >> 24;
    if (sa == 0)
        return;

    xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

        /* call prefetch hint to optimize cache load*/
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                               unpack_32_1x64 (d)));
            w--;
        }

        /* issue prefetch hints to warm the cache */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w >= 16)
        {
            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            xmmMask = load128Unaligned((__m128i*)mask);
            xmmDst = load128Aligned((__m128i*)dst);

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
            pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            mask += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                               unpack_32_1x64 (d)));
            w--;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeIn_8x8
 */
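/*
 * a8 source IN a8 destination.  Per pixel: dst = src * dst, using the same
 * normalized 8-bit multiply as above.
 */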

void
fbCompositeIn_8x8sse2 (pixman_op_t op,
		      pixman_image_t * pSrc,
		      pixman_image_t * pMask,
		      pixman_image_t * pDst,
		      int16_t      xSrc,
		      int16_t      ySrc,
		      int16_t      xMask,
		      int16_t      yMask,
		      int16_t      xDst,
		      int16_t      yDst,
		      uint16_t     width,
		      uint16_t     height)
{
    uint8_t	*dstLine, *dst;
    uint8_t	*srcLine, *src;
    int	srcStride, dstStride;
    uint16_t	w;
    uint32_t    s, d;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* issue prefetch hints to warm the cache */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
        {
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
            w--;
        }

        /* issue prefetch hints to warm the cache */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        while (w >= 16)
        {
            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);

            xmmSrc = load128Unaligned((__m128i*)src);
            xmmDst = load128Aligned((__m128i*)dst);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            src += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
            w--;
        }
    }

    _mm_empty ();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrcAdd_8888x8x8
 */
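/*
 * Solid source IN a8 mask, ADDed onto an a8 destination.  Per pixel:
 *
 *     dst = saturateAdd (dst, srcAlpha * mask)
 */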

void
fbCompositeSrcAdd_8888x8x8sse2 (pixman_op_t op,
			       pixman_image_t * pSrc,
			       pixman_image_t * pMask,
			       pixman_image_t * pDst,
			       int16_t      xSrc,
			       int16_t      ySrc,
			       int16_t      xMask,
			       int16_t      yMask,
			       int16_t      xDst,
			       int16_t      yDst,
			       uint16_t     width,
			       uint16_t     height)
{
    uint8_t	*dstLine, *dst;
    uint8_t	*maskLine, *mask;
    int	dstStride, maskStride;
    uint16_t	w;
    uint32_t	src;
    uint8_t	sa;
    uint32_t m, d;

    __m128i xmmAlpha;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    sa = src >> 24;
    if (sa == 0)
        return;

    xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

        /* issue prefetch hints to warm the cache */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                                              unpack_32_1x64 (d)));
            w--;
        }

        /* issue prefetch hints to warm the cache */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w >= 16)
        {
            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            xmmMask = load128Unaligned((__m128i*)mask);
            xmmDst = load128Aligned((__m128i*)dst);

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

            xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo);
            xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            mask += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                                              unpack_32_1x64 (d)));
            w--;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrcAdd_8000x8000
 */
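/*
 * a8 source ADDed onto an a8 destination: dst = saturateAdd (dst, src).
 * The 4-byte-aligned middle of each scanline is delegated to
 * coreCombineAddUsse2; the unaligned head and tail use a branch-free
 * scalar clamp (see the comment in the head loop below).
 */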

void
fbCompositeSrcAdd_8000x8000sse2 (pixman_op_t op,
				pixman_image_t * pSrc,
				pixman_image_t * pMask,
				pixman_image_t * pDst,
				int16_t      xSrc,
				int16_t      ySrc,
				int16_t      xMask,
				int16_t      yMask,
				int16_t      xDst,
				int16_t      yDst,
				uint16_t     width,
				uint16_t     height)
{
    uint8_t	*dstLine, *dst;
    uint8_t	*srcLine, *src;
    int	dstStride, srcStride;
    uint16_t	w;
    uint16_t	t;

    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        src = srcLine;

        /* issue prefetch hints to warm the cache */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;
        srcLine += srcStride;
        w = width;

        /* Small head: handle single bytes until dst is 4-byte aligned */
        while (w && (unsigned long)dst & 3)
        {
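            /* Branch-free saturating add: if the byte sum overflows, t >> 8
             * is 1 and (0 - (t >> 8)) is all ones, so the OR clamps the
             * stored byte to 0xff; otherwise it ORs with 0 and t is stored
             * unchanged.  The tail loop below uses the same trick.
             */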
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }

        coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, w >> 2);

        /* Small tail: skip the bytes already handled above (w rounded down
         * to a multiple of 4), then finish one byte at a time.
         */
        dst += w & 0xfffc;
        src += w & 0xfffc;

        w &= 3;

        while (w)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrcAdd_8888x8888
 */
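/*
 * Saturating per-channel add of one a8r8g8b8 image onto another; each
 * scanline is handed to coreCombineAddUsse2.
 */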
void
fbCompositeSrcAdd_8888x8888sse2 (pixman_op_t	op,
				pixman_image_t * pSrc,
				pixman_image_t * pMask,
				pixman_image_t * pDst,
				int16_t      xSrc,
				int16_t      ySrc,
				int16_t      xMask,
				int16_t      yMask,
				int16_t      xDst,
				int16_t      yDst,
				uint16_t     width,
				uint16_t     height)
{
    uint32_t	*dstLine, *dst;
    uint32_t	*srcLine, *src;
    int	dstStride, srcStride;

    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;

        coreCombineAddUsse2 (dst, src, width);
    }

    _mm_empty();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeCopyAreasse2
 */
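/*
 * pixmanBltsse2 copies a rectangle between two images of equal depth
 * (16 or 32 bpp); any other combination returns FALSE.  Each scanline is
 * copied in stages: 2- and 4-byte moves until the destination is 16-byte
 * aligned, then 64-byte and 16-byte SSE2 moves, then 4- and 2-byte moves
 * for the remainder.
 */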

pixman_bool_t
pixmanBltsse2 (uint32_t *src_bits,
		uint32_t *dst_bits,
		int src_stride,
		int dst_stride,
		int src_bpp,
		int dst_bpp,
		int src_x, int src_y,
		int dst_x, int dst_y,
		int width, int height)
{
    uint8_t *	src_bytes;
    uint8_t *	dst_bytes;
    int		byte_width;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
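        /* The rowstride is given in uint32_t units; convert it to uint16_t
         * units to locate the first pixel, then to bytes once the start
         * addresses are known.  The 32-bpp branch below does the same in
         * uint32_t units.
         */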
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
    }
    else
    {
        return FALSE;
    }

    cachePrefetch ((__m128i*)src_bytes);
    cachePrefetch ((__m128i*)dst_bytes);

    while (height--)
    {
        int w;
        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;
        w = byte_width;

        cachePrefetchNext ((__m128i*)s);
        cachePrefetchNext ((__m128i*)d);

        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }

        while (w >= 4 && ((unsigned long)d & 15))
        {
            *(uint32_t *)d = *(uint32_t *)s;

            w -= 4;
            s += 4;
            d += 4;
        }

        cachePrefetchNext ((__m128i*)s);
        cachePrefetchNext ((__m128i*)d);

        while (w >= 64)
        {
            __m128i xmm0, xmm1, xmm2, xmm3;

            /* 128 bytes ahead */
            cachePrefetch (((__m128i*)s) + 8);
            cachePrefetch (((__m128i*)d) + 8);

            xmm0 = load128Unaligned ((__m128i*)(s));
            xmm1 = load128Unaligned ((__m128i*)(s+16));
            xmm2 = load128Unaligned ((__m128i*)(s+32));
            xmm3 = load128Unaligned ((__m128i*)(s+48));

            save128Aligned ((__m128i*)(d),    xmm0);
            save128Aligned ((__m128i*)(d+16), xmm1);
            save128Aligned ((__m128i*)(d+32), xmm2);
            save128Aligned ((__m128i*)(d+48), xmm3);

            s += 64;
            d += 64;
            w -= 64;
        }

        cachePrefetchNext ((__m128i*)s);
        cachePrefetchNext ((__m128i*)d);

        while (w >= 16)
        {
            save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s));

            w -= 16;
            d += 16;
            s += 16;
        }

        cachePrefetchNext ((__m128i*)s);
        cachePrefetchNext ((__m128i*)d);

        while (w >= 4)
        {
            *(uint32_t *)d = *(uint32_t *)s;

            w -= 4;
            s += 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }
    }

    _mm_empty();

    return TRUE;
}

void
fbCompositeCopyAreasse2 (pixman_op_t       op,
			pixman_image_t *	pSrc,
			pixman_image_t *	pMask,
			pixman_image_t *	pDst,
			int16_t		xSrc,
			int16_t		ySrc,
			int16_t		xMask,
			int16_t		yMask,
			int16_t		xDst,
			int16_t		yDst,
			uint16_t		width,
			uint16_t		height)
{
    pixmanBltsse2 (pSrc->bits.bits,
		    pDst->bits.bits,
		    pSrc->bits.rowstride,
		    pDst->bits.rowstride,
		    PIXMAN_FORMAT_BPP (pSrc->bits.format),
		    PIXMAN_FORMAT_BPP (pDst->bits.format),
		    xSrc, ySrc, xDst, yDst, width, height);
}

#if 0
/* This code is buggy in the MMX version, and the bug was carried over into
 * this SSE2 port, so it remains disabled.  Note that the inOver_1x64 and
 * inOver_2x128 calls below also pass values where the live helpers above
 * take pointers.
 */
void
fbCompositeOver_x888x8x8888sse2 (pixman_op_t      op,
				pixman_image_t * pSrc,
				pixman_image_t * pMask,
				pixman_image_t * pDst,
				int16_t      xSrc,
				int16_t      ySrc,
				int16_t      xMask,
				int16_t      yMask,
				int16_t      xDst,
				int16_t      yDst,
				uint16_t     width,
				uint16_t     height)
{
    uint32_t	*src, *srcLine, s;
    uint32_t    *dst, *dstLine, d;
    uint8_t	    *mask, *maskLine;
    uint32_t    m;
    int		 srcStride, maskStride, dstStride;
    uint16_t w;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    while (height--)
    {
        src = srcLine;
        srcLine += srcStride;
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;

        w = width;

        /* issue prefetch hints to warm the cache */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)mask);

        while (w && (unsigned long)dst & 15)
        {
            s = 0xff000000 | *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            __m64 ms = unpack_32_1x64 (s);

            if (m != 0xff)
            {
                ms = inOver_1x64 (ms,
                                  xMask00ff,
                                  expandAlphaRev_1x64 (unpack_32_1x64 (m)),
                                  unpack_32_1x64 (d));
            }

            *dst++ = pack_1x64_32 (ms);
            w--;
        }

        /* issue prefetch hints to warm the cache */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)mask);

        while (w >= 4)
        {
            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);
            cachePrefetchNext ((__m128i*)mask);

            m = *(uint32_t*) mask;
            xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);

            if (m == 0xffffffff)
            {
                save128Aligned ((__m128i*)dst, xmmSrc);
            }
            else
            {
                xmmDst = load128Aligned ((__m128i*)dst);

                xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
                unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

                inOver_2x128 (xmmSrcLo, xmmSrcHi, Mask00ff, Mask00ff, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);

                save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            m = (uint32_t) *mask++;

            if (m)
            {
                s = 0xff000000 | *src;

                if (m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    d = *dst;

                    *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
                                                      xMask00ff,
                                                      expandAlphaRev_1x64 (unpack_32_1x64 (m)),
                                                      unpack_32_1x64 (d)));
                }

            }

            src++;
            dst++;
            w--;
        }
    }

    _mm_empty();
}
#endif /* #if 0 */

#endif /* USE_SSE2 */