qemu/pixman/pixman/pixman-mmx.c

   1 /*
   2  * Copyright © 2004, 2005 Red Hat, Inc.
   3  * Copyright © 2004 Nicholas Miell
   4  * Copyright © 2005 Trolltech AS
   5  *
   6  * Permission to use, copy, modify, distribute, and sell this software and its
   7  * documentation for any purpose is hereby granted without fee, provided that
   8  * the above copyright notice appear in all copies and that both that
   9  * copyright notice and this permission notice appear in supporting
  10  * documentation, and that the name of Red Hat not be used in advertising or
  11  * publicity pertaining to distribution of the software without specific,
  12  * written prior permission.  Red Hat makes no representations about the
  13  * suitability of this software for any purpose.  It is provided "as is"
  14  * without express or implied warranty.
  15  *
  16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
  17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
  19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
  21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  23  * SOFTWARE.
  24  *
  25  * Author:  Søren Sandmann (sandmann@redhat.com)
  26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
  27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
  28  *
  29  * Based on work by Owen Taylor
  30  */
  31
  32 #ifdef HAVE_CONFIG_H
  33 #include <config.h>
  34 #endif
  35
  36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
  37
  38 #ifdef USE_LOONGSON_MMI
  39 #include <loongson-mmintrin.h>
  40 #else
  41 #include <mmintrin.h>
  42 #endif
  43 #include "pixman-private.h"
  44 #include "pixman-combine32.h"
  45 #include "pixman-inlines.h"
  46
/* Debug trace hook.  When VERBOSE is defined, CHECKPOINT () calls error_f ()
 * with the current function name and line number; otherwise it expands to
 * nothing.
 */
#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif
  52
#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
/* Only compiled for iwMMXt builds with GCC 4.0 - 4.7 (see the #if above).
 * The body is intentionally empty, so every _mm_empty () call in this file
 * is a no-op in that configuration.
 */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}
#endif
  61
  62 #ifdef USE_X86_MMX
  63 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
  64 #  include <xmmintrin.h>
  65 # else
  66 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
  67  * instructions to be generated that we don't want. Just duplicate the
  68  * functions we want to use.  */
  69 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  70 _mm_movemask_pi8 (__m64 __A)
  71 {
  72     int ret;
  73
  74     asm ("pmovmskb %1, %0\n\t"
  75         : "=r" (ret)
  76         : "y" (__A)
  77     );
  78
  79     return ret;
  80 }
  81
  82 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  83 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
  84 {
  85     asm ("pmulhuw %1, %0\n\t"
  86         : "+y" (__A)
  87         : "y" (__B)
  88     );
  89     return __A;
  90 }
  91
  92 #  ifdef __OPTIMIZE__
  93 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  94 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
  95 {
  96     __m64 ret;
  97
  98     asm ("pshufw %2, %1, %0\n\t"
  99         : "=y" (ret)
 100         : "y" (__A), "K" (__N)
 101     );
 102
 103     return ret;
 104 }
 105 #  else
 106 #   define _mm_shuffle_pi16(A, N)                                       \
 107     ({                                                                  \
 108         __m64 ret;                                                      \
 109                                                                         \
 110         asm ("pshufw %2, %1, %0\n\t"                                    \
 111              : "=y" (ret)                                               \
 112              : "y" (A), "K" ((const int8_t)N)                           \
 113         );                                                              \
 114                                                                         \
 115         ret;                                                            \
 116     })
 117 #  endif
 118 # endif
 119 #endif
 120
 121 #ifndef _MSC_VER
 122 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 123  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
 124 #endif
 125
 126 /* Notes about writing mmx code
 127  *
 128  * give memory operands as the second operand. If you give it as the
 129  * first, gcc will first load it into a register, then use that
 130  * register
 131  *
 132  *   ie. use
 133  *
 134  *         _mm_mullo_pi16 (x, mmx_constant);
 135  *
 136  *   not
 137  *
 138  *         _mm_mullo_pi16 (mmx_constant, x);
 139  *
 140  * Also try to minimize dependencies. i.e. when you need a value, try
 141  * to calculate it from a value that was calculated as early as
 142  * possible.
 143  */
 144
 145 /* --------------- MMX primitives ------------------------------------- */
 146
 147 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 148  * the name of the member used to access the data.
 149  * If __m64 requires using mm_cvt* intrinsics functions to convert between
 150  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 151  * If __m64 and uint64_t values can just be cast to each other directly,
 152  * then define USE_M64_CASTS.
 153  * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 154  */
/* Pick exactly one __m64 <-> uint64_t conversion strategy per compiler.
 * The first matching branch wins, so the order of the tests matters
 * (e.g. __ICC and USE_LOONGSON_MMI are tested before __GNUC__).
 */
#ifdef _MSC_VER
/* Struct-style __m64: access the 64-bit payload through member m64_u64. */
# define M64_MEMBER m64_u64
#elif defined(__ICC)
/* Conversions go through the _mm_cvt* intrinsics. */
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
/* __m64 is a double; conversions reinterpret the storage through a pointer. */
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
/* __m64 and uint64_t can be cast to each other directly. */
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif
 177
/* Storage type for one entry of the constant table below.  A plain uint64_t
 * is used whenever one of the cast / cvt-intrinsic / double strategies is
 * selected; otherwise (the struct-style M64_MEMBER case) the entry is stored
 * as an __m64 directly.
 */
#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif
 183
/* Layout of the 64-bit constants used by the helpers in this file; each
 * field is read through the MC () macro as MC (<name without "mmx_">).
 *
 * Field order is significant: the MSVC variant of MMXDATA_INIT ignores the
 * field designator, so the initializer of the table must list its values in
 * exactly this order.
 *
 * The mmx_mask_0 .. mmx_mask_3 fields do not exist when USE_LOONGSON_MMI is
 * defined.
 */
typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;
 209
/* MMXDATA_INIT (field, val) expands to one initializer of the constant table.
 *
 *  - MSVC: the field designator is dropped and the value gets a UI64 suffix,
 *    so initialization is positional.
 *  - struct-style __m64 (M64_MEMBER defined): designated initializer with the
 *    ULL-suffixed value wrapped in braces.
 *  - otherwise: designated initializer with a plain ULL-suffixed value.
 */
#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field =   { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field =   val ## ULL
#endif
 217
/* The constant table itself.  Entries are listed in the same order as the
 * fields of mmx_data_t; keep it that way, because the MSVC form of
 * MMXDATA_INIT initializes positionally and ignores the field name.
 */
static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    /* Each mask clears one 16-bit word (word 0 .. word 3) and keeps the rest. */
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
};
 243
/* MC (x) yields the table entry c.mmx_<x> as an __m64, using whichever
 * conversion matches the strategy selected for this compiler:
 * to_m64 (), a direct cast, a pointer reinterpretation, or (when the entry
 * is already stored as __m64) the field itself.
 */
#ifdef USE_CVT_INTRINSICS
#    define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
#    define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
#    define MC(x) c.mmx_ ## x
#endif
 253
 254 static force_inline __m64
 255 to_m64 (uint64_t x)
 256 {
 257 #ifdef USE_CVT_INTRINSICS
 258     return _mm_cvtsi64_m64 (x);
 259 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
 260     __m64 res;
 261
 262     res.M64_MEMBER = x;
 263     return res;
 264 #elif defined USE_M64_DOUBLE
 265     return *(__m64 *)&x;
 266 #else /* USE_M64_CASTS */
 267     return (__m64)x;
 268 #endif
 269 }
 270
 271 static force_inline uint64_t
 272 to_uint64 (__m64 x)
 273 {
 274 #ifdef USE_CVT_INTRINSICS
 275     return _mm_cvtm64_si64 (x);
 276 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
 277     uint64_t res = x.M64_MEMBER;
 278     return res;
 279 #elif defined USE_M64_DOUBLE
 280     return *(uint64_t *)&x;
 281 #else /* USE_M64_CASTS */
 282     return (uint64_t)x;
 283 #endif
 284 }
 285
 286 static force_inline __m64
 287 shift (__m64 v,
 288        int   s)
 289 {
 290     if (s > 0)
 291         return _mm_slli_si64 (v, s);
 292     else if (s < 0)
 293         return _mm_srli_si64 (v, -s);
 294     else
 295         return v;
 296 }
 297
 298 static force_inline __m64
 299 negate (__m64 mask)
 300 {
 301     return _mm_xor_si64 (mask, MC (4x00ff));
 302 }
 303
 304 /* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
 305  * and maps its result to the same range.
 306  *
 307  * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
 308  * Notation, Notation, Notation", the first of which is
 309  *
 310  *   prod(a, b) = (a * b + 128) / 255.
 311  *
 312  * By approximating the division by 255 as 257/65536 it can be replaced by a
 313  * multiply and a right shift. This is the implementation that we use in
 314  * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended
 315  * 3DNow!, and unavailable at the time of the book's publication) to perform
 316  * the multiplication and right shift in a single operation.
 317  *
 318  *   prod(a, b) = ((a * b + 128) * 257) >> 16.
 319  *
 320  * A third way (how pix_multiply() was implemented prior to 14208344) exists
 321  * also that performs the multiplication by 257 with adds and shifts.
 322  *
 323  * Where temp = a * b + 128
 324  *
 325  *   prod(a, b) = (temp + (temp >> 8)) >> 8.
 326  */
 327 static force_inline __m64
 328 pix_multiply (__m64 a, __m64 b)
 329 {
 330     __m64 res;
 331
 332     res = _mm_mullo_pi16 (a, b);
 333     res = _mm_adds_pu16 (res, MC (4x0080));
 334     res = _mm_mulhi_pu16 (res, MC (4x0101));
 335
 336     return res;
 337 }
 338
 339 static force_inline __m64
 340 pix_add (__m64 a, __m64 b)
 341 {
 342     return _mm_adds_pu8 (a, b);
 343 }
 344
/* Broadcast 16-bit word 3 of @pixel into all four words.  The selector is
 * kept as a literal so it can be encoded as an instruction immediate.
 */
static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}
 350
/* Broadcast 16-bit word 0 of @pixel into all four words (the counterpart of
 * expand_alpha (), which broadcasts word 3).
 */
static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}
 356
/* Swap 16-bit words 0 and 2 of @pixel, leaving words 1 and 3 in place:
 * result word 0 <- word 2, word 1 <- word 1, word 2 <- word 0,
 * word 3 <- word 3.
 */
static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}
 362
 363 static force_inline __m64
 364 over (__m64 src,
 365       __m64 srca,
 366       __m64 dest)
 367 {
 368     return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
 369 }
 370
 371 static force_inline __m64
 372 over_rev_non_pre (__m64 src, __m64 dest)
 373 {
 374     __m64 srca = expand_alpha (src);
 375     __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
 376
 377     return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
 378 }
 379
 380 static force_inline __m64
 381 in (__m64 src, __m64 mask)
 382 {
 383     return pix_multiply (src, mask);
 384 }
 385
 386 #ifndef _MSC_VER
 387 static force_inline __m64
 388 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
 389 {
 390     return over (in (src, mask), pix_multiply (srca, mask), dest);
 391 }
 392
 393 #else
 394
 395 #define in_over(src, srca, mask, dest)                                  \
 396     over (in (src, mask), pix_multiply (srca, mask), dest)
 397
 398 #endif
 399
 400 /* Elemental unaligned loads */
 401
 402 static force_inline __m64 ldq_u(__m64 *p)
 403 {
 404 #ifdef USE_X86_MMX
 405     /* x86's alignment restrictions are very relaxed. */
 406     return *(__m64 *)p;
 407 #elif defined USE_ARM_IWMMXT
 408     int align = (uintptr_t)p & 7;
 409     __m64 *aligned_p;
 410     if (align == 0)
 411         return *p;
 412     aligned_p = (__m64 *)((uintptr_t)p & ~7);
 413     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
 414 #else
 415     struct __una_u64 { __m64 x __attribute__((packed)); };
 416     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
 417     return (__m64) ptr->x;
 418 #endif
 419 }
 420
 421 static force_inline uint32_t ldl_u(const uint32_t *p)
 422 {
 423 #ifdef USE_X86_MMX
 424     /* x86's alignment restrictions are very relaxed. */
 425     return *p;
 426 #else
 427     struct __una_u32 { uint32_t x __attribute__((packed)); };
 428     const struct __una_u32 *ptr = (const struct __una_u32 *) p;
 429     return ptr->x;
 430 #endif
 431 }
 432
 433 static force_inline __m64
 434 load (const uint32_t *v)
 435 {
 436 #ifdef USE_LOONGSON_MMI
 437     __m64 ret;
 438     asm ("lwc1 %0, %1\n\t"
 439         : "=f" (ret)
 440         : "m" (*v)
 441     );
 442     return ret;
 443 #else
 444     return _mm_cvtsi32_si64 (*v);
 445 #endif
 446 }
 447
 448 static force_inline __m64
 449 load8888 (const uint32_t *v)
 450 {
 451 #ifdef USE_LOONGSON_MMI
 452     return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
 453 #else
 454     return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
 455 #endif
 456 }
 457
 458 static force_inline __m64
 459 load8888u (const uint32_t *v)
 460 {
 461     uint32_t l = ldl_u (v);
 462     return load8888 (&l);
 463 }
 464
 465 static force_inline __m64
 466 pack8888 (__m64 lo, __m64 hi)
 467 {
 468     return _mm_packs_pu16 (lo, hi);
 469 }
 470
 471 static force_inline void
 472 store (uint32_t *dest, __m64 v)
 473 {
 474 #ifdef USE_LOONGSON_MMI
 475     asm ("swc1 %1, %0\n\t"
 476         : "=m" (*dest)
 477         : "f" (v)
 478         : "memory"
 479     );
 480 #else
 481     *dest = _mm_cvtsi64_si32 (v);
 482 #endif
 483 }
 484
 485 static force_inline void
 486 store8888 (uint32_t *dest, __m64 v)
 487 {
 488     v = pack8888 (v, _mm_setzero_si64 ());
 489     store (dest, v);
 490 }
 491
 492 static force_inline pixman_bool_t
 493 is_equal (__m64 a, __m64 b)
 494 {
 495 #ifdef USE_LOONGSON_MMI
 496     /* __m64 is double, we can compare directly. */
 497     return a == b;
 498 #else
 499     return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
 500 #endif
 501 }
 502
 503 static force_inline pixman_bool_t
 504 is_opaque (__m64 v)
 505 {
 506 #ifdef USE_LOONGSON_MMI
 507     return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
 508 #else
 509     __m64 ffs = _mm_cmpeq_pi8 (v, v);
 510     return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
 511 #endif
 512 }
 513
 514 static force_inline pixman_bool_t
 515 is_zero (__m64 v)
 516 {
 517     return is_equal (v, _mm_setzero_si64 ());
 518 }
 519
 520 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 521  *
 522  *    00RR00GG00BB
 523  *
 524  * --- Expanding 565 in the low word ---
 525  *
 * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
 527  * m = m & (01f0003f001f);
 528  * m = m * (008404100840);
 529  * m = m >> 8;
 530  *
 531  * Note the trick here - the top word is shifted by another nibble to
 532  * avoid it bumping into the middle word
 533  */
 534 static force_inline __m64
 535 expand565 (__m64 pixel, int pos)
 536 {
 537     __m64 p = pixel;
 538     __m64 t1, t2;
 539
 540     /* move pixel to low 16 bit and zero the rest */
 541 #ifdef USE_LOONGSON_MMI
 542     p = loongson_extract_pi16 (p, pos);
 543 #else
 544     p = shift (shift (p, (3 - pos) * 16), -48);
 545 #endif
 546
 547     t1 = shift (p, 36 - 11);
 548     t2 = shift (p, 16 - 5);
 549
 550     p = _mm_or_si64 (t1, p);
 551     p = _mm_or_si64 (t2, p);
 552     p = _mm_and_si64 (p, MC (565_rgb));
 553
 554     pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
 555     return _mm_srli_pi16 (pixel, 8);
 556 }
 557
 558 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 559  *
 *    AARRGGBBAARRGGBB
 561  */
/* @vin holds four 16-bit 565 pixels.  On return *vout0 holds pixels 0 and 1
 * and *vout1 holds pixels 2 and 3, each as packed 8-bit A, R, G, B bytes
 * (see the lane diagrams on the last two statements).
 *
 * When @full_alpha is non-zero the alpha bytes are 0xff, otherwise 0x00.
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    /* Isolate each channel's bits in all four 16-bit pixels at once. */
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    if (full_alpha)
        /* Comparing a register with itself sets every bit. */
        alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());        /* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());        /* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());        /* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);                   /* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);                       /* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);                /* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);                /* A3 R3 G3 B3 A2 R2 G2 B2 */
}
 587
 588 static force_inline __m64
 589 expand8888 (__m64 in, int pos)
 590 {
 591     if (pos == 0)
 592         return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
 593     else
 594         return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
 595 }
 596
 597 static force_inline __m64
 598 expandx888 (__m64 in, int pos)
 599 {
 600     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
 601 }
 602
 603 static force_inline void
 604 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
 605 {
 606     __m64 v0, v1;
 607     expand_4xpacked565 (vin, &v0, &v1, full_alpha);
 608     *vout0 = expand8888 (v0, 0);
 609     *vout1 = expand8888 (v0, 1);
 610     *vout2 = expand8888 (v1, 0);
 611     *vout3 = expand8888 (v1, 1);
 612 }
 613
 614 static force_inline __m64
 615 pack_565 (__m64 pixel, __m64 target, int pos)
 616 {
 617     __m64 p = pixel;
 618     __m64 t = target;
 619     __m64 r, g, b;
 620
 621     r = _mm_and_si64 (p, MC (565_r));
 622     g = _mm_and_si64 (p, MC (565_g));
 623     b = _mm_and_si64 (p, MC (565_b));
 624
 625 #ifdef USE_LOONGSON_MMI
 626     r = shift (r, -(32 - 8));
 627     g = shift (g, -(16 - 3));
 628     b = shift (b, -(0  + 3));
 629
 630     p = _mm_or_si64 (r, g);
 631     p = _mm_or_si64 (p, b);
 632     return loongson_insert_pi16 (t, p, pos);
 633 #else
 634     r = shift (r, -(32 - 8) + pos * 16);
 635     g = shift (g, -(16 - 3) + pos * 16);
 636     b = shift (b, -(0  + 3) + pos * 16);
 637
 638     if (pos == 0)
 639         t = _mm_and_si64 (t, MC (mask_0));
 640     else if (pos == 1)
 641         t = _mm_and_si64 (t, MC (mask_1));
 642     else if (pos == 2)
 643         t = _mm_and_si64 (t, MC (mask_2));
 644     else if (pos == 3)
 645         t = _mm_and_si64 (t, MC (mask_3));
 646
 647     p = _mm_or_si64 (r, t);
 648     p = _mm_or_si64 (g, p);
 649
 650     return _mm_or_si64 (b, p);
 651 #endif
 652 }
 653
/* Pack four 32-bit pixels (two per input register, @a first) into four
 * 16-bit 565 pixels returned in one register.
 *
 * Per 32-bit pixel: the 0x00f8 bits of both 16-bit halves are combined with
 * one multiply-add, the 0xfc00 bits of the low half are ORed in, and the
 * result is shifted right by 5.
 */
static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    /* madd multiplies the low half by 0x0004 and the high half by 0x2000,
     * then sums the two products into one 32-bit lane. */
    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift(t0, -5);
#ifdef USE_ARM_IWMMXT
    t1 = shift(t1, -5);
    /* Narrow the four 32-bit lanes to 16 bits each. */
    return _mm_packs_pu32 (t0, t1);
#else
    /* Net left shift of 11 for t1, so the two registers can be merged with
     * an OR; the shuffle then reorders the four 16-bit words. */
    t1 = shift(t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
#endif
}
 678
 679 #ifndef _MSC_VER
 680
 681 static force_inline __m64
 682 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
 683 {
 684     return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
 685 }
 686
 687 static force_inline __m64
 688 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
 689 {
 690     x = pix_multiply (x, a);
 691     y = pix_multiply (y, b);
 692
 693     return pix_add (x, y);
 694 }
 695
 696 #else
 697
 698 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
 699
/* Macro form of pack_4x565 (), used in place of the function when _MSC_VER
 * is defined: narrows (v0, v1) and (v2, v3) with pack8888 () and passes both
 * results to pack_4xpacked565 ().
 */
#define pack_4x565(v0, v1, v2, v3) \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
 702
 703 #define pix_add_mul(x, a, y, b)  \
 704     ( x = pix_multiply (x, a),   \
 705       y = pix_multiply (y, b),   \
 706       pix_add (x, y) )
 707
 708 #endif
 709
/* --------------- MMX code paths for fbcompose.c --------------------- */
 711
 712 static force_inline __m64
 713 combine (const uint32_t *src, const uint32_t *mask)
 714 {
 715     __m64 vsrc = load8888 (src);
 716
 717     if (mask)
 718     {
 719         __m64 m = load8888 (mask);
 720
 721         m = expand_alpha (m);
 722         vsrc = pix_multiply (vsrc, m);
 723     }
 724
 725     return vsrc;
 726 }
 727
 728 static force_inline __m64
 729 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
 730 {
 731     vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
 732
 733     if (is_opaque (vsrc))
 734     {
 735         return vsrc;
 736     }
 737     else if (!is_zero (vsrc))
 738     {
 739         return over (vsrc, expand_alpha (vsrc),
 740                      _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
 741     }
 742
 743     return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
 744 }
 745
 746 static void
 747 mmx_combine_over_u (pixman_implementation_t *imp,
 748                     pixman_op_t              op,
 749                     uint32_t *               dest,
 750                     const uint32_t *         src,
 751                     const uint32_t *         mask,
 752                     int                      width)
 753 {
 754     const uint32_t *end = dest + width;
 755
 756     while (dest < end)
 757     {
 758         __m64 vsrc = combine (src, mask);
 759
 760         if (is_opaque (vsrc))
 761         {
 762             store8888 (dest, vsrc);
 763         }
 764         else if (!is_zero (vsrc))
 765         {
 766             __m64 sa = expand_alpha (vsrc);
 767             store8888 (dest, over (vsrc, sa, load8888 (dest)));
 768         }
 769
 770         ++dest;
 771         ++src;
 772         if (mask)
 773             ++mask;
 774     }
 775     _mm_empty ();
 776 }
 777
 778 static void
 779 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
 780                             pixman_op_t              op,
 781                             uint32_t *               dest,
 782                             const uint32_t *         src,
 783                             const uint32_t *         mask,
 784                             int                      width)
 785 {
 786     const uint32_t *end = dest + width;
 787
 788     while (dest < end)
 789     {
 790         __m64 d, da;
 791         __m64 s = combine (src, mask);
 792
 793         d = load8888 (dest);
 794         da = expand_alpha (d);
 795         store8888 (dest, over (d, da, s));
 796
 797         ++dest;
 798         ++src;
 799         if (mask)
 800             mask++;
 801     }
 802     _mm_empty ();
 803 }
 804
 805 static void
 806 mmx_combine_in_u (pixman_implementation_t *imp,
 807                   pixman_op_t              op,
 808                   uint32_t *               dest,
 809                   const uint32_t *         src,
 810                   const uint32_t *         mask,
 811                   int                      width)
 812 {
 813     const uint32_t *end = dest + width;
 814
 815     while (dest < end)
 816     {
 817         __m64 a;
 818         __m64 x = combine (src, mask);
 819
 820         a = load8888 (dest);
 821         a = expand_alpha (a);
 822         x = pix_multiply (x, a);
 823
 824         store8888 (dest, x);
 825
 826         ++dest;
 827         ++src;
 828         if (mask)
 829             mask++;
 830     }
 831     _mm_empty ();
 832 }
 833
 834 static void
 835 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
 836                           pixman_op_t              op,
 837                           uint32_t *               dest,
 838                           const uint32_t *         src,
 839                           const uint32_t *         mask,
 840                           int                      width)
 841 {
 842     const uint32_t *end = dest + width;
 843
 844     while (dest < end)
 845     {
 846         __m64 a = combine (src, mask);
 847         __m64 x;
 848
 849         x = load8888 (dest);
 850         a = expand_alpha (a);
 851         x = pix_multiply (x, a);
 852         store8888 (dest, x);
 853
 854         ++dest;
 855         ++src;
 856         if (mask)
 857             mask++;
 858     }
 859     _mm_empty ();
 860 }
 861
 862 static void
 863 mmx_combine_out_u (pixman_implementation_t *imp,
 864                    pixman_op_t              op,
 865                    uint32_t *               dest,
 866                    const uint32_t *         src,
 867                    const uint32_t *         mask,
 868                    int                      width)
 869 {
 870     const uint32_t *end = dest + width;
 871
 872     while (dest < end)
 873     {
 874         __m64 a;
 875         __m64 x = combine (src, mask);
 876
 877         a = load8888 (dest);
 878         a = expand_alpha (a);
 879         a = negate (a);
 880         x = pix_multiply (x, a);
 881         store8888 (dest, x);
 882
 883         ++dest;
 884         ++src;
 885         if (mask)
 886             mask++;
 887     }
 888     _mm_empty ();
 889 }
 890
 891 static void
 892 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
 893                            pixman_op_t              op,
 894                            uint32_t *               dest,
 895                            const uint32_t *         src,
 896                            const uint32_t *         mask,
 897                            int                      width)
 898 {
 899     const uint32_t *end = dest + width;
 900
 901     while (dest < end)
 902     {
 903         __m64 a = combine (src, mask);
 904         __m64 x;
 905
 906         x = load8888 (dest);
 907         a = expand_alpha (a);
 908         a = negate (a);
 909         x = pix_multiply (x, a);
 910
 911         store8888 (dest, x);
 912
 913         ++dest;
 914         ++src;
 915         if (mask)
 916             mask++;
 917     }
 918     _mm_empty ();
 919 }
 920
 921 static void
 922 mmx_combine_atop_u (pixman_implementation_t *imp,
 923                     pixman_op_t              op,
 924                     uint32_t *               dest,
 925                     const uint32_t *         src,
 926                     const uint32_t *         mask,
 927                     int                      width)
 928 {
 929     const uint32_t *end = dest + width;
 930
 931     while (dest < end)
 932     {
 933         __m64 da, d, sia;
 934         __m64 s = combine (src, mask);
 935
 936         d = load8888 (dest);
 937         sia = expand_alpha (s);
 938         sia = negate (sia);
 939         da = expand_alpha (d);
 940         s = pix_add_mul (s, da, d, sia);
 941         store8888 (dest, s);
 942
 943         ++dest;
 944         ++src;
 945         if (mask)
 946             mask++;
 947     }
 948     _mm_empty ();
 949 }
 950
 951 static void
 952 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
 953                             pixman_op_t              op,
 954                             uint32_t *               dest,
 955                             const uint32_t *         src,
 956                             const uint32_t *         mask,
 957                             int                      width)
 958 {
 959     const uint32_t *end;
 960
 961     end = dest + width;
 962
 963     while (dest < end)
 964     {
 965         __m64 dia, d, sa;
 966         __m64 s = combine (src, mask);
 967
 968         d = load8888 (dest);
 969         sa = expand_alpha (s);
 970         dia = expand_alpha (d);
 971         dia = negate (dia);
 972         s = pix_add_mul (s, dia, d, sa);
 973         store8888 (dest, s);
 974
 975         ++dest;
 976         ++src;
 977         if (mask)
 978             mask++;
 979     }
 980     _mm_empty ();
 981 }
 982
 983 static void
 984 mmx_combine_xor_u (pixman_implementation_t *imp,
 985                    pixman_op_t              op,
 986                    uint32_t *               dest,
 987                    const uint32_t *         src,
 988                    const uint32_t *         mask,
 989                    int                      width)
 990 {
 991     const uint32_t *end = dest + width;
 992
 993     while (dest < end)
 994     {
 995         __m64 dia, d, sia;
 996         __m64 s = combine (src, mask);
 997
 998         d = load8888 (dest);
 999         sia = expand_alpha (s);
1000         dia = expand_alpha (d);
1001         sia = negate (sia);
1002         dia = negate (dia);
1003         s = pix_add_mul (s, dia, d, sia);
1004         store8888 (dest, s);
1005
1006         ++dest;
1007         ++src;
1008         if (mask)
1009             mask++;
1010     }
1011     _mm_empty ();
1012 }
1013
1014 static void
1015 mmx_combine_add_u (pixman_implementation_t *imp,
1016                    pixman_op_t              op,
1017                    uint32_t *               dest,
1018                    const uint32_t *         src,
1019                    const uint32_t *         mask,
1020                    int                      width)
1021 {
1022     const uint32_t *end = dest + width;
1023
1024     while (dest < end)
1025     {
1026         __m64 d;
1027         __m64 s = combine (src, mask);
1028
1029         d = load8888 (dest);
1030         s = pix_add (s, d);
1031         store8888 (dest, s);
1032
1033         ++dest;
1034         ++src;
1035         if (mask)
1036             mask++;
1037     }
1038     _mm_empty ();
1039 }
1040
1041 static void
1042 mmx_combine_saturate_u (pixman_implementation_t *imp,
1043                         pixman_op_t              op,
1044                         uint32_t *               dest,
1045                         const uint32_t *         src,
1046                         const uint32_t *         mask,
1047                         int                      width)
1048 {
1049     const uint32_t *end = dest + width;
1050
1051     while (dest < end)
1052     {
1053         uint32_t s, sa, da;
1054         uint32_t d = *dest;
1055         __m64 ms = combine (src, mask);
1056         __m64 md = load8888 (dest);
1057
1058         store8888(&s, ms);
1059         da = ~d >> 24;
1060         sa = s >> 24;
1061
1062         if (sa > da)
1063         {
1064             uint32_t quot = DIV_UN8 (da, sa) << 24;
1065             __m64 msa = load8888 (&quot);
1066             msa = expand_alpha (msa);
1067             ms = pix_multiply (ms, msa);
1068         }
1069
1070         md = pix_add (md, ms);
1071         store8888 (dest, md);
1072
1073         ++src;
1074         ++dest;
1075         if (mask)
1076             mask++;
1077     }
1078     _mm_empty ();
1079 }
1080
1081 static void
1082 mmx_combine_src_ca (pixman_implementation_t *imp,
1083                     pixman_op_t              op,
1084                     uint32_t *               dest,
1085                     const uint32_t *         src,
1086                     const uint32_t *         mask,
1087                     int                      width)
1088 {
1089     const uint32_t *end = src + width;
1090
1091     while (src < end)
1092     {
1093         __m64 a = load8888 (mask);
1094         __m64 s = load8888 (src);
1095
1096         s = pix_multiply (s, a);
1097         store8888 (dest, s);
1098
1099         ++src;
1100         ++mask;
1101         ++dest;
1102     }
1103     _mm_empty ();
1104 }
1105
1106 static void
1107 mmx_combine_over_ca (pixman_implementation_t *imp,
1108                      pixman_op_t              op,
1109                      uint32_t *               dest,
1110                      const uint32_t *         src,
1111                      const uint32_t *         mask,
1112                      int                      width)
1113 {
1114     const uint32_t *end = src + width;
1115
1116     while (src < end)
1117     {
1118         __m64 a = load8888 (mask);
1119         __m64 s = load8888 (src);
1120         __m64 d = load8888 (dest);
1121         __m64 sa = expand_alpha (s);
1122
1123         store8888 (dest, in_over (s, sa, a, d));
1124
1125         ++src;
1126         ++dest;
1127         ++mask;
1128     }
1129     _mm_empty ();
1130 }
1131
1132 static void
1133 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1134                              pixman_op_t              op,
1135                              uint32_t *               dest,
1136                              const uint32_t *         src,
1137                              const uint32_t *         mask,
1138                              int                      width)
1139 {
1140     const uint32_t *end = src + width;
1141
1142     while (src < end)
1143     {
1144         __m64 a = load8888 (mask);
1145         __m64 s = load8888 (src);
1146         __m64 d = load8888 (dest);
1147         __m64 da = expand_alpha (d);
1148
1149         store8888 (dest, over (d, da, in (s, a)));
1150
1151         ++src;
1152         ++dest;
1153         ++mask;
1154     }
1155     _mm_empty ();
1156 }
1157
1158 static void
1159 mmx_combine_in_ca (pixman_implementation_t *imp,
1160                    pixman_op_t              op,
1161                    uint32_t *               dest,
1162                    const uint32_t *         src,
1163                    const uint32_t *         mask,
1164                    int                      width)
1165 {
1166     const uint32_t *end = src + width;
1167
1168     while (src < end)
1169     {
1170         __m64 a = load8888 (mask);
1171         __m64 s = load8888 (src);
1172         __m64 d = load8888 (dest);
1173         __m64 da = expand_alpha (d);
1174
1175         s = pix_multiply (s, a);
1176         s = pix_multiply (s, da);
1177         store8888 (dest, s);
1178
1179         ++src;
1180         ++dest;
1181         ++mask;
1182     }
1183     _mm_empty ();
1184 }
1185
1186 static void
1187 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1188                            pixman_op_t              op,
1189                            uint32_t *               dest,
1190                            const uint32_t *         src,
1191                            const uint32_t *         mask,
1192                            int                      width)
1193 {
1194     const uint32_t *end = src + width;
1195
1196     while (src < end)
1197     {
1198         __m64 a = load8888 (mask);
1199         __m64 s = load8888 (src);
1200         __m64 d = load8888 (dest);
1201         __m64 sa = expand_alpha (s);
1202
1203         a = pix_multiply (a, sa);
1204         d = pix_multiply (d, a);
1205         store8888 (dest, d);
1206
1207         ++src;
1208         ++dest;
1209         ++mask;
1210     }
1211     _mm_empty ();
1212 }
1213
1214 static void
1215 mmx_combine_out_ca (pixman_implementation_t *imp,
1216                     pixman_op_t              op,
1217                     uint32_t *               dest,
1218                     const uint32_t *         src,
1219                     const uint32_t *         mask,
1220                     int                      width)
1221 {
1222     const uint32_t *end = src + width;
1223
1224     while (src < end)
1225     {
1226         __m64 a = load8888 (mask);
1227         __m64 s = load8888 (src);
1228         __m64 d = load8888 (dest);
1229         __m64 da = expand_alpha (d);
1230
1231         da = negate (da);
1232         s = pix_multiply (s, a);
1233         s = pix_multiply (s, da);
1234         store8888 (dest, s);
1235
1236         ++src;
1237         ++dest;
1238         ++mask;
1239     }
1240     _mm_empty ();
1241 }
1242
1243 static void
1244 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1245                             pixman_op_t              op,
1246                             uint32_t *               dest,
1247                             const uint32_t *         src,
1248                             const uint32_t *         mask,
1249                             int                      width)
1250 {
1251     const uint32_t *end = src + width;
1252
1253     while (src < end)
1254     {
1255         __m64 a = load8888 (mask);
1256         __m64 s = load8888 (src);
1257         __m64 d = load8888 (dest);
1258         __m64 sa = expand_alpha (s);
1259
1260         a = pix_multiply (a, sa);
1261         a = negate (a);
1262         d = pix_multiply (d, a);
1263         store8888 (dest, d);
1264
1265         ++src;
1266         ++dest;
1267         ++mask;
1268     }
1269     _mm_empty ();
1270 }
1271
1272 static void
1273 mmx_combine_atop_ca (pixman_implementation_t *imp,
1274                      pixman_op_t              op,
1275                      uint32_t *               dest,
1276                      const uint32_t *         src,
1277                      const uint32_t *         mask,
1278                      int                      width)
1279 {
1280     const uint32_t *end = src + width;
1281
1282     while (src < end)
1283     {
1284         __m64 a = load8888 (mask);
1285         __m64 s = load8888 (src);
1286         __m64 d = load8888 (dest);
1287         __m64 da = expand_alpha (d);
1288         __m64 sa = expand_alpha (s);
1289
1290         s = pix_multiply (s, a);
1291         a = pix_multiply (a, sa);
1292         a = negate (a);
1293         d = pix_add_mul (d, a, s, da);
1294         store8888 (dest, d);
1295
1296         ++src;
1297         ++dest;
1298         ++mask;
1299     }
1300     _mm_empty ();
1301 }
1302
1303 static void
1304 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1305                              pixman_op_t              op,
1306                              uint32_t *               dest,
1307                              const uint32_t *         src,
1308                              const uint32_t *         mask,
1309                              int                      width)
1310 {
1311     const uint32_t *end = src + width;
1312
1313     while (src < end)
1314     {
1315         __m64 a = load8888 (mask);
1316         __m64 s = load8888 (src);
1317         __m64 d = load8888 (dest);
1318         __m64 da = expand_alpha (d);
1319         __m64 sa = expand_alpha (s);
1320
1321         s = pix_multiply (s, a);
1322         a = pix_multiply (a, sa);
1323         da = negate (da);
1324         d = pix_add_mul (d, a, s, da);
1325         store8888 (dest, d);
1326
1327         ++src;
1328         ++dest;
1329         ++mask;
1330     }
1331     _mm_empty ();
1332 }
1333
1334 static void
1335 mmx_combine_xor_ca (pixman_implementation_t *imp,
1336                     pixman_op_t              op,
1337                     uint32_t *               dest,
1338                     const uint32_t *         src,
1339                     const uint32_t *         mask,
1340                     int                      width)
1341 {
1342     const uint32_t *end = src + width;
1343
1344     while (src < end)
1345     {
1346         __m64 a = load8888 (mask);
1347         __m64 s = load8888 (src);
1348         __m64 d = load8888 (dest);
1349         __m64 da = expand_alpha (d);
1350         __m64 sa = expand_alpha (s);
1351
1352         s = pix_multiply (s, a);
1353         a = pix_multiply (a, sa);
1354         da = negate (da);
1355         a = negate (a);
1356         d = pix_add_mul (d, a, s, da);
1357         store8888 (dest, d);
1358
1359         ++src;
1360         ++dest;
1361         ++mask;
1362     }
1363     _mm_empty ();
1364 }
1365
1366 static void
1367 mmx_combine_add_ca (pixman_implementation_t *imp,
1368                     pixman_op_t              op,
1369                     uint32_t *               dest,
1370                     const uint32_t *         src,
1371                     const uint32_t *         mask,
1372                     int                      width)
1373 {
1374     const uint32_t *end = src + width;
1375
1376     while (src < end)
1377     {
1378         __m64 a = load8888 (mask);
1379         __m64 s = load8888 (src);
1380         __m64 d = load8888 (dest);
1381
1382         s = pix_multiply (s, a);
1383         d = pix_add (s, d);
1384         store8888 (dest, d);
1385
1386         ++src;
1387         ++dest;
1388         ++mask;
1389     }
1390     _mm_empty ();
1391 }
1392
1393 /* ------------- MMX code paths called from fbpict.c -------------------- */
1394
1395 static void
1396 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1397                            pixman_composite_info_t *info)
1398 {
1399     PIXMAN_COMPOSITE_ARGS (info);
1400     uint32_t src;
1401     uint32_t    *dst_line, *dst;
1402     int32_t w;
1403     int dst_stride;
1404     __m64 vsrc, vsrca;
1405
1406     CHECKPOINT ();
1407
1408     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1409
1410     if (src == 0)
1411         return;
1412
1413     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1414
1415     vsrc = load8888 (&src);
1416     vsrca = expand_alpha (vsrc);
1417
1418     while (height--)
1419     {
1420         dst = dst_line;
1421         dst_line += dst_stride;
1422         w = width;
1423
1424         CHECKPOINT ();
1425
1426         while (w && (uintptr_t)dst & 7)
1427         {
1428             store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1429
1430             w--;
1431             dst++;
1432         }
1433
1434         while (w >= 2)
1435         {
1436             __m64 vdest;
1437             __m64 dest0, dest1;
1438
1439             vdest = *(__m64 *)dst;
1440
1441             dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1442             dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1443
1444             *(__m64 *)dst = pack8888 (dest0, dest1);
1445
1446             dst += 2;
1447             w -= 2;
1448         }
1449
1450         CHECKPOINT ();
1451
1452         if (w)
1453         {
1454             store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1455         }
1456     }
1457
1458     _mm_empty ();
1459 }
1460
1461 static void
1462 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1463                            pixman_composite_info_t *info)
1464 {
1465     PIXMAN_COMPOSITE_ARGS (info);
1466     uint32_t src;
1467     uint16_t    *dst_line, *dst;
1468     int32_t w;
1469     int dst_stride;
1470     __m64 vsrc, vsrca;
1471
1472     CHECKPOINT ();
1473
1474     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1475
1476     if (src == 0)
1477         return;
1478
1479     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1480
1481     vsrc = load8888 (&src);
1482     vsrca = expand_alpha (vsrc);
1483
1484     while (height--)
1485     {
1486         dst = dst_line;
1487         dst_line += dst_stride;
1488         w = width;
1489
1490         CHECKPOINT ();
1491
1492         while (w && (uintptr_t)dst & 7)
1493         {
1494             uint64_t d = *dst;
1495             __m64 vdest = expand565 (to_m64 (d), 0);
1496
1497             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1498             *dst = to_uint64 (vdest);
1499
1500             w--;
1501             dst++;
1502         }
1503
1504         while (w >= 4)
1505         {
1506             __m64 vdest = *(__m64 *)dst;
1507             __m64 v0, v1, v2, v3;
1508
1509             expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1510
1511             v0 = over (vsrc, vsrca, v0);
1512             v1 = over (vsrc, vsrca, v1);
1513             v2 = over (vsrc, vsrca, v2);
1514             v3 = over (vsrc, vsrca, v3);
1515
1516             *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1517
1518             dst += 4;
1519             w -= 4;
1520         }
1521
1522         CHECKPOINT ();
1523
1524         while (w)
1525         {
1526             uint64_t d = *dst;
1527             __m64 vdest = expand565 (to_m64 (d), 0);
1528
1529             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1530             *dst = to_uint64 (vdest);
1531
1532             w--;
1533             dst++;
1534         }
1535     }
1536
1537     _mm_empty ();
1538 }
1539
1540 static void
1541 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1542                                    pixman_composite_info_t *info)
1543 {
1544     PIXMAN_COMPOSITE_ARGS (info);
1545     uint32_t src;
1546     uint32_t    *dst_line;
1547     uint32_t    *mask_line;
1548     int dst_stride, mask_stride;
1549     __m64 vsrc, vsrca;
1550
1551     CHECKPOINT ();
1552
1553     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1554
1555     if (src == 0)
1556         return;
1557
1558     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1559     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1560
1561     vsrc = load8888 (&src);
1562     vsrca = expand_alpha (vsrc);
1563
1564     while (height--)
1565     {
1566         int twidth = width;
1567         uint32_t *p = (uint32_t *)mask_line;
1568         uint32_t *q = (uint32_t *)dst_line;
1569
1570         while (twidth && (uintptr_t)q & 7)
1571         {
1572             uint32_t m = *(uint32_t *)p;
1573
1574             if (m)
1575             {
1576                 __m64 vdest = load8888 (q);
1577                 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1578                 store8888 (q, vdest);
1579             }
1580
1581             twidth--;
1582             p++;
1583             q++;
1584         }
1585
1586         while (twidth >= 2)
1587         {
1588             uint32_t m0, m1;
1589             m0 = *p;
1590             m1 = *(p + 1);
1591
1592             if (m0 | m1)
1593             {
1594                 __m64 dest0, dest1;
1595                 __m64 vdest = *(__m64 *)q;
1596
1597                 dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1598                                  expand8888 (vdest, 0));
1599                 dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1600                                  expand8888 (vdest, 1));
1601
1602                 *(__m64 *)q = pack8888 (dest0, dest1);
1603             }
1604
1605             p += 2;
1606             q += 2;
1607             twidth -= 2;
1608         }
1609
1610         if (twidth)
1611         {
1612             uint32_t m = *(uint32_t *)p;
1613
1614             if (m)
1615             {
1616                 __m64 vdest = load8888 (q);
1617                 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1618                 store8888 (q, vdest);
1619             }
1620
1621             twidth--;
1622             p++;
1623             q++;
1624         }
1625
1626         dst_line += dst_stride;
1627         mask_line += mask_stride;
1628     }
1629
1630     _mm_empty ();
1631 }
1632
1633 static void
1634 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1635                                 pixman_composite_info_t *info)
1636 {
1637     PIXMAN_COMPOSITE_ARGS (info);
1638     uint32_t    *dst_line, *dst;
1639     uint32_t    *src_line, *src;
1640     uint32_t mask;
1641     __m64 vmask;
1642     int dst_stride, src_stride;
1643     int32_t w;
1644
1645     CHECKPOINT ();
1646
1647     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1648     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1649
1650     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1651     vmask = expand_alpha (load8888 (&mask));
1652
1653     while (height--)
1654     {
1655         dst = dst_line;
1656         dst_line += dst_stride;
1657         src = src_line;
1658         src_line += src_stride;
1659         w = width;
1660
1661         while (w && (uintptr_t)dst & 7)
1662         {
1663             __m64 s = load8888 (src);
1664             __m64 d = load8888 (dst);
1665
1666             store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1667
1668             w--;
1669             dst++;
1670             src++;
1671         }
1672
1673         while (w >= 2)
1674         {
1675             __m64 vs = ldq_u ((__m64 *)src);
1676             __m64 vd = *(__m64 *)dst;
1677             __m64 vsrc0 = expand8888 (vs, 0);
1678             __m64 vsrc1 = expand8888 (vs, 1);
1679
1680             *(__m64 *)dst = pack8888 (
1681                 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1682                 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1683
1684             w -= 2;
1685             dst += 2;
1686             src += 2;
1687         }
1688
1689         if (w)
1690         {
1691             __m64 s = load8888 (src);
1692             __m64 d = load8888 (dst);
1693
1694             store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1695         }
1696     }
1697
1698     _mm_empty ();
1699 }
1700
1701 static void
1702 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1703                                 pixman_composite_info_t *info)
1704 {
1705     PIXMAN_COMPOSITE_ARGS (info);
1706     uint32_t *dst_line, *dst;
1707     uint32_t *src_line, *src;
1708     uint32_t mask;
1709     __m64 vmask;
1710     int dst_stride, src_stride;
1711     int32_t w;
1712     __m64 srca;
1713
1714     CHECKPOINT ();
1715
1716     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1717     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1718     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1719
1720     vmask = expand_alpha (load8888 (&mask));
1721     srca = MC (4x00ff);
1722
1723     while (height--)
1724     {
1725         dst = dst_line;
1726         dst_line += dst_stride;
1727         src = src_line;
1728         src_line += src_stride;
1729         w = width;
1730
1731         while (w && (uintptr_t)dst & 7)
1732         {
1733             uint32_t ssrc = *src | 0xff000000;
1734             __m64 s = load8888 (&ssrc);
1735             __m64 d = load8888 (dst);
1736
1737             store8888 (dst, in_over (s, srca, vmask, d));
1738
1739             w--;
1740             dst++;
1741             src++;
1742         }
1743
1744         while (w >= 16)
1745         {
1746             __m64 vd0 = *(__m64 *)(dst + 0);
1747             __m64 vd1 = *(__m64 *)(dst + 2);
1748             __m64 vd2 = *(__m64 *)(dst + 4);
1749             __m64 vd3 = *(__m64 *)(dst + 6);
1750             __m64 vd4 = *(__m64 *)(dst + 8);
1751             __m64 vd5 = *(__m64 *)(dst + 10);
1752             __m64 vd6 = *(__m64 *)(dst + 12);
1753             __m64 vd7 = *(__m64 *)(dst + 14);
1754
1755             __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1756             __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1757             __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1758             __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1759             __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1760             __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1761             __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1762             __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1763
1764             vd0 = pack8888 (
1765                 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1766                 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1767
1768             vd1 = pack8888 (
1769                 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1770                 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1771
1772             vd2 = pack8888 (
1773                 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1774                 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1775
1776             vd3 = pack8888 (
1777                 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1778                 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1779
1780             vd4 = pack8888 (
1781                 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1782                 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1783
1784             vd5 = pack8888 (
1785                 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1786                 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1787
1788             vd6 = pack8888 (
1789                 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1790                 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1791
1792             vd7 = pack8888 (
1793                 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1794                 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1795
1796             *(__m64 *)(dst + 0) = vd0;
1797             *(__m64 *)(dst + 2) = vd1;
1798             *(__m64 *)(dst + 4) = vd2;
1799             *(__m64 *)(dst + 6) = vd3;
1800             *(__m64 *)(dst + 8) = vd4;
1801             *(__m64 *)(dst + 10) = vd5;
1802             *(__m64 *)(dst + 12) = vd6;
1803             *(__m64 *)(dst + 14) = vd7;
1804
1805             w -= 16;
1806             dst += 16;
1807             src += 16;
1808         }
1809
1810         while (w)
1811         {
1812             uint32_t ssrc = *src | 0xff000000;
1813             __m64 s = load8888 (&ssrc);
1814             __m64 d = load8888 (dst);
1815
1816             store8888 (dst, in_over (s, srca, vmask, d));
1817
1818             w--;
1819             dst++;
1820             src++;
1821         }
1822     }
1823
1824     _mm_empty ();
1825 }
1826
1827 static void
1828 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1829                               pixman_composite_info_t *info)
1830 {
1831     PIXMAN_COMPOSITE_ARGS (info);
1832     uint32_t *dst_line, *dst;
1833     uint32_t *src_line, *src;
1834     uint32_t s;
1835     int dst_stride, src_stride;
1836     uint8_t a;
1837     int32_t w;
1838
1839     CHECKPOINT ();
1840
1841     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1842     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1843
1844     while (height--)
1845     {
1846         dst = dst_line;
1847         dst_line += dst_stride;
1848         src = src_line;
1849         src_line += src_stride;
1850         w = width;
1851
1852         while (w--)
1853         {
1854             s = *src++;
1855             a = s >> 24;
1856
1857             if (a == 0xff)
1858             {
1859                 *dst = s;
1860             }
1861             else if (s)
1862             {
1863                 __m64 ms, sa;
1864                 ms = load8888 (&s);
1865                 sa = expand_alpha (ms);
1866                 store8888 (dst, over (ms, sa, load8888 (dst)));
1867             }
1868
1869             dst++;
1870         }
1871     }
1872     _mm_empty ();
1873 }
1874
1875 static void
1876 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1877                               pixman_composite_info_t *info)
1878 {
1879     PIXMAN_COMPOSITE_ARGS (info);
1880     uint16_t    *dst_line, *dst;
1881     uint32_t    *src_line, *src;
1882     int dst_stride, src_stride;
1883     int32_t w;
1884
1885     CHECKPOINT ();
1886
1887     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1888     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1889
1890 #if 0
1891     /* FIXME */
1892     assert (src_image->drawable == mask_image->drawable);
1893 #endif
1894
1895     while (height--)
1896     {
1897         dst = dst_line;
1898         dst_line += dst_stride;
1899         src = src_line;
1900         src_line += src_stride;
1901         w = width;
1902
1903         CHECKPOINT ();
1904
1905         while (w && (uintptr_t)dst & 7)
1906         {
1907             __m64 vsrc = load8888 (src);
1908             uint64_t d = *dst;
1909             __m64 vdest = expand565 (to_m64 (d), 0);
1910
1911             vdest = pack_565 (
1912                 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1913
1914             *dst = to_uint64 (vdest);
1915
1916             w--;
1917             dst++;
1918             src++;
1919         }
1920
1921         CHECKPOINT ();
1922
1923         while (w >= 4)
1924         {
1925             __m64 vdest = *(__m64 *)dst;
1926             __m64 v0, v1, v2, v3;
1927             __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1928
1929             expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1930
1931             vsrc0 = load8888 ((src + 0));
1932             vsrc1 = load8888 ((src + 1));
1933             vsrc2 = load8888 ((src + 2));
1934             vsrc3 = load8888 ((src + 3));
1935
1936             v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1937             v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1938             v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1939             v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1940
1941             *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1942
1943             w -= 4;
1944             dst += 4;
1945             src += 4;
1946         }
1947
1948         CHECKPOINT ();
1949
1950         while (w)
1951         {
1952             __m64 vsrc = load8888 (src);
1953             uint64_t d = *dst;
1954             __m64 vdest = expand565 (to_m64 (d), 0);
1955
1956             vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1957
1958             *dst = to_uint64 (vdest);
1959
1960             w--;
1961             dst++;
1962             src++;
1963         }
1964     }
1965
1966     _mm_empty ();
1967 }
1968
1969 static void
1970 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1971                              pixman_composite_info_t *info)
1972 {
1973     PIXMAN_COMPOSITE_ARGS (info);
1974     uint32_t src, srca;
1975     uint32_t *dst_line, *dst;
1976     uint8_t *mask_line, *mask;
1977     int dst_stride, mask_stride;
1978     int32_t w;
1979     __m64 vsrc, vsrca;
1980     uint64_t srcsrc;
1981
1982     CHECKPOINT ();
1983
1984     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1985
1986     srca = src >> 24;
1987     if (src == 0)
1988         return;
1989
1990     srcsrc = (uint64_t)src << 32 | src;
1991
1992     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1993     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1994
1995     vsrc = load8888 (&src);
1996     vsrca = expand_alpha (vsrc);
1997
1998     while (height--)
1999     {
2000         dst = dst_line;
2001         dst_line += dst_stride;
2002         mask = mask_line;
2003         mask_line += mask_stride;
2004         w = width;
2005
2006         CHECKPOINT ();
2007
2008         while (w && (uintptr_t)dst & 7)
2009         {
2010             uint64_t m = *mask;
2011
2012             if (m)
2013             {
2014                 __m64 vdest = in_over (vsrc, vsrca,
2015                                        expand_alpha_rev (to_m64 (m)),
2016                                        load8888 (dst));
2017
2018                 store8888 (dst, vdest);
2019             }
2020
2021             w--;
2022             mask++;
2023             dst++;
2024         }
2025
2026         CHECKPOINT ();
2027
2028         while (w >= 2)
2029         {
2030             uint64_t m0, m1;
2031
2032             m0 = *mask;
2033             m1 = *(mask + 1);
2034
2035             if (srca == 0xff && (m0 & m1) == 0xff)
2036             {
2037                 *(uint64_t *)dst = srcsrc;
2038             }
2039             else if (m0 | m1)
2040             {
2041                 __m64 vdest;
2042                 __m64 dest0, dest1;
2043
2044                 vdest = *(__m64 *)dst;
2045
2046                 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2047                                  expand8888 (vdest, 0));
2048                 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2049                                  expand8888 (vdest, 1));
2050
2051                 *(__m64 *)dst = pack8888 (dest0, dest1);
2052             }
2053
2054             mask += 2;
2055             dst += 2;
2056             w -= 2;
2057         }
2058
2059         CHECKPOINT ();
2060
2061         if (w)
2062         {
2063             uint64_t m = *mask;
2064
2065             if (m)
2066             {
2067                 __m64 vdest = load8888 (dst);
2068
2069                 vdest = in_over (
2070                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2071                 store8888 (dst, vdest);
2072             }
2073         }
2074     }
2075
2076     _mm_empty ();
2077 }
2078
2079 static pixman_bool_t
2080 mmx_fill (pixman_implementation_t *imp,
2081           uint32_t *               bits,
2082           int                      stride,
2083           int                      bpp,
2084           int                      x,
2085           int                      y,
2086           int                      width,
2087           int                      height,
2088           uint32_t                 filler)
2089 {
2090     uint64_t fill;
2091     __m64 vfill;
2092     uint32_t byte_width;
2093     uint8_t     *byte_line;
2094
2095 #if defined __GNUC__ && defined USE_X86_MMX
2096     __m64 v1, v2, v3, v4, v5, v6, v7;
2097 #endif
2098
2099     if (bpp != 16 && bpp != 32 && bpp != 8)
2100         return FALSE;
2101
2102     if (bpp == 8)
2103     {
2104         stride = stride * (int) sizeof (uint32_t) / 1;
2105         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2106         byte_width = width;
2107         stride *= 1;
2108         filler = (filler & 0xff) * 0x01010101;
2109     }
2110     else if (bpp == 16)
2111     {
2112         stride = stride * (int) sizeof (uint32_t) / 2;
2113         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2114         byte_width = 2 * width;
2115         stride *= 2;
2116         filler = (filler & 0xffff) * 0x00010001;
2117     }
2118     else
2119     {
2120         stride = stride * (int) sizeof (uint32_t) / 4;
2121         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2122         byte_width = 4 * width;
2123         stride *= 4;
2124     }
2125
2126     fill = ((uint64_t)filler << 32) | filler;
2127     vfill = to_m64 (fill);
2128
2129 #if defined __GNUC__ && defined USE_X86_MMX
2130     __asm__ (
2131         "movq           %7,     %0\n"
2132         "movq           %7,     %1\n"
2133         "movq           %7,     %2\n"
2134         "movq           %7,     %3\n"
2135         "movq           %7,     %4\n"
2136         "movq           %7,     %5\n"
2137         "movq           %7,     %6\n"
2138         : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2139           "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2140         : "y" (vfill));
2141 #endif
2142
2143     while (height--)
2144     {
2145         int w;
2146         uint8_t *d = byte_line;
2147
2148         byte_line += stride;
2149         w = byte_width;
2150
2151         if (w >= 1 && ((uintptr_t)d & 1))
2152         {
2153             *(uint8_t *)d = (filler & 0xff);
2154             w--;
2155             d++;
2156         }
2157
2158         if (w >= 2 && ((uintptr_t)d & 3))
2159         {
2160             *(uint16_t *)d = filler;
2161             w -= 2;
2162             d += 2;
2163         }
2164
2165         while (w >= 4 && ((uintptr_t)d & 7))
2166         {
2167             *(uint32_t *)d = filler;
2168
2169             w -= 4;
2170             d += 4;
2171         }
2172
2173         while (w >= 64)
2174         {
2175 #if defined __GNUC__ && defined USE_X86_MMX
2176             __asm__ (
2177                 "movq   %1,       (%0)\n"
2178                 "movq   %2,      8(%0)\n"
2179                 "movq   %3,     16(%0)\n"
2180                 "movq   %4,     24(%0)\n"
2181                 "movq   %5,     32(%0)\n"
2182                 "movq   %6,     40(%0)\n"
2183                 "movq   %7,     48(%0)\n"
2184                 "movq   %8,     56(%0)\n"
2185                 :
2186                 : "r" (d),
2187                   "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2188                   "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2189                 : "memory");
2190 #else
2191             *(__m64*) (d +  0) = vfill;
2192             *(__m64*) (d +  8) = vfill;
2193             *(__m64*) (d + 16) = vfill;
2194             *(__m64*) (d + 24) = vfill;
2195             *(__m64*) (d + 32) = vfill;
2196             *(__m64*) (d + 40) = vfill;
2197             *(__m64*) (d + 48) = vfill;
2198             *(__m64*) (d + 56) = vfill;
2199 #endif
2200             w -= 64;
2201             d += 64;
2202         }
2203
2204         while (w >= 4)
2205         {
2206             *(uint32_t *)d = filler;
2207
2208             w -= 4;
2209             d += 4;
2210         }
2211         if (w >= 2)
2212         {
2213             *(uint16_t *)d = filler;
2214             w -= 2;
2215             d += 2;
2216         }
2217         if (w >= 1)
2218         {
2219             *(uint8_t *)d = (filler & 0xff);
2220             w--;
2221             d++;
2222         }
2223
2224     }
2225
2226     _mm_empty ();
2227     return TRUE;
2228 }
2229
2230 static void
2231 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2232                              pixman_composite_info_t *info)
2233 {
2234     PIXMAN_COMPOSITE_ARGS (info);
2235     uint16_t    *dst_line, *dst;
2236     uint32_t    *src_line, *src, s;
2237     int dst_stride, src_stride;
2238     int32_t w;
2239
2240     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2241     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2242
2243     while (height--)
2244     {
2245         dst = dst_line;
2246         dst_line += dst_stride;
2247         src = src_line;
2248         src_line += src_stride;
2249         w = width;
2250
2251         while (w && (uintptr_t)dst & 7)
2252         {
2253             s = *src++;
2254             *dst = convert_8888_to_0565 (s);
2255             dst++;
2256             w--;
2257         }
2258
2259         while (w >= 4)
2260         {
2261             __m64 vdest;
2262             __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2263             __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2264
2265             vdest = pack_4xpacked565 (vsrc0, vsrc1);
2266
2267             *(__m64 *)dst = vdest;
2268
2269             w -= 4;
2270             src += 4;
2271             dst += 4;
2272         }
2273
2274         while (w)
2275         {
2276             s = *src++;
2277             *dst = convert_8888_to_0565 (s);
2278             dst++;
2279             w--;
2280         }
2281     }
2282
2283     _mm_empty ();
2284 }
2285
2286 static void
2287 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2288                             pixman_composite_info_t *info)
2289 {
2290     PIXMAN_COMPOSITE_ARGS (info);
2291     uint32_t src, srca;
2292     uint32_t    *dst_line, *dst;
2293     uint8_t     *mask_line, *mask;
2294     int dst_stride, mask_stride;
2295     int32_t w;
2296     __m64 vsrc;
2297     uint64_t srcsrc;
2298
2299     CHECKPOINT ();
2300
2301     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2302
2303     srca = src >> 24;
2304     if (src == 0)
2305     {
2306         mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2307                   PIXMAN_FORMAT_BPP (dest_image->bits.format),
2308                   dest_x, dest_y, width, height, 0);
2309         return;
2310     }
2311
2312     srcsrc = (uint64_t)src << 32 | src;
2313
2314     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2315     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2316
2317     vsrc = load8888 (&src);
2318
2319     while (height--)
2320     {
2321         dst = dst_line;
2322         dst_line += dst_stride;
2323         mask = mask_line;
2324         mask_line += mask_stride;
2325         w = width;
2326
2327         CHECKPOINT ();
2328
2329         while (w && (uintptr_t)dst & 7)
2330         {
2331             uint64_t m = *mask;
2332
2333             if (m)
2334             {
2335                 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2336
2337                 store8888 (dst, vdest);
2338             }
2339             else
2340             {
2341                 *dst = 0;
2342             }
2343
2344             w--;
2345             mask++;
2346             dst++;
2347         }
2348
2349         CHECKPOINT ();
2350
2351         while (w >= 2)
2352         {
2353             uint64_t m0, m1;
2354             m0 = *mask;
2355             m1 = *(mask + 1);
2356
2357             if (srca == 0xff && (m0 & m1) == 0xff)
2358             {
2359                 *(uint64_t *)dst = srcsrc;
2360             }
2361             else if (m0 | m1)
2362             {
2363                 __m64 dest0, dest1;
2364
2365                 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2366                 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2367
2368                 *(__m64 *)dst = pack8888 (dest0, dest1);
2369             }
2370             else
2371             {
2372                 *(uint64_t *)dst = 0;
2373             }
2374
2375             mask += 2;
2376             dst += 2;
2377             w -= 2;
2378         }
2379
2380         CHECKPOINT ();
2381
2382         if (w)
2383         {
2384             uint64_t m = *mask;
2385
2386             if (m)
2387             {
2388                 __m64 vdest = load8888 (dst);
2389
2390                 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2391                 store8888 (dst, vdest);
2392             }
2393             else
2394             {
2395                 *dst = 0;
2396             }
2397         }
2398     }
2399
2400     _mm_empty ();
2401 }
2402
2403 static void
2404 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2405                              pixman_composite_info_t *info)
2406 {
2407     PIXMAN_COMPOSITE_ARGS (info);
2408     uint32_t src, srca;
2409     uint16_t *dst_line, *dst;
2410     uint8_t *mask_line, *mask;
2411     int dst_stride, mask_stride;
2412     int32_t w;
2413     __m64 vsrc, vsrca, tmp;
2414     __m64 srcsrcsrcsrc;
2415
2416     CHECKPOINT ();
2417
2418     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2419
2420     srca = src >> 24;
2421     if (src == 0)
2422         return;
2423
2424     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2425     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2426
2427     vsrc = load8888 (&src);
2428     vsrca = expand_alpha (vsrc);
2429
2430     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2431     srcsrcsrcsrc = expand_alpha_rev (tmp);
2432
2433     while (height--)
2434     {
2435         dst = dst_line;
2436         dst_line += dst_stride;
2437         mask = mask_line;
2438         mask_line += mask_stride;
2439         w = width;
2440
2441         CHECKPOINT ();
2442
2443         while (w && (uintptr_t)dst & 7)
2444         {
2445             uint64_t m = *mask;
2446
2447             if (m)
2448             {
2449                 uint64_t d = *dst;
2450                 __m64 vd = to_m64 (d);
2451                 __m64 vdest = in_over (
2452                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2453
2454                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2455                 *dst = to_uint64 (vd);
2456             }
2457
2458             w--;
2459             mask++;
2460             dst++;
2461         }
2462
2463         CHECKPOINT ();
2464
2465         while (w >= 4)
2466         {
2467             uint64_t m0, m1, m2, m3;
2468             m0 = *mask;
2469             m1 = *(mask + 1);
2470             m2 = *(mask + 2);
2471             m3 = *(mask + 3);
2472
2473             if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2474             {
2475                 *(__m64 *)dst = srcsrcsrcsrc;
2476             }
2477             else if (m0 | m1 | m2 | m3)
2478             {
2479                 __m64 vdest = *(__m64 *)dst;
2480                 __m64 v0, v1, v2, v3;
2481                 __m64 vm0, vm1, vm2, vm3;
2482
2483                 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2484
2485                 vm0 = to_m64 (m0);
2486                 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2487
2488                 vm1 = to_m64 (m1);
2489                 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2490
2491                 vm2 = to_m64 (m2);
2492                 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2493
2494                 vm3 = to_m64 (m3);
2495                 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2496
2497                 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);;
2498             }
2499
2500             w -= 4;
2501             mask += 4;
2502             dst += 4;
2503         }
2504
2505         CHECKPOINT ();
2506
2507         while (w)
2508         {
2509             uint64_t m = *mask;
2510
2511             if (m)
2512             {
2513                 uint64_t d = *dst;
2514                 __m64 vd = to_m64 (d);
2515                 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2516                                        expand565 (vd, 0));
2517                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2518                 *dst = to_uint64 (vd);
2519             }
2520
2521             w--;
2522             mask++;
2523             dst++;
2524         }
2525     }
2526
2527     _mm_empty ();
2528 }
2529
2530 static void
2531 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2532                                 pixman_composite_info_t *info)
2533 {
2534     PIXMAN_COMPOSITE_ARGS (info);
2535     uint16_t    *dst_line, *dst;
2536     uint32_t    *src_line, *src;
2537     int dst_stride, src_stride;
2538     int32_t w;
2539
2540     CHECKPOINT ();
2541
2542     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2543     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2544
2545 #if 0
2546     /* FIXME */
2547     assert (src_image->drawable == mask_image->drawable);
2548 #endif
2549
2550     while (height--)
2551     {
2552         dst = dst_line;
2553         dst_line += dst_stride;
2554         src = src_line;
2555         src_line += src_stride;
2556         w = width;
2557
2558         CHECKPOINT ();
2559
2560         while (w && (uintptr_t)dst & 7)
2561         {
2562             __m64 vsrc = load8888 (src);
2563             uint64_t d = *dst;
2564             __m64 vdest = expand565 (to_m64 (d), 0);
2565
2566             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2567
2568             *dst = to_uint64 (vdest);
2569
2570             w--;
2571             dst++;
2572             src++;
2573         }
2574
2575         CHECKPOINT ();
2576
2577         while (w >= 4)
2578         {
2579             uint32_t s0, s1, s2, s3;
2580             unsigned char a0, a1, a2, a3;
2581
2582             s0 = *src;
2583             s1 = *(src + 1);
2584             s2 = *(src + 2);
2585             s3 = *(src + 3);
2586
2587             a0 = (s0 >> 24);
2588             a1 = (s1 >> 24);
2589             a2 = (s2 >> 24);
2590             a3 = (s3 >> 24);
2591
2592             if ((a0 & a1 & a2 & a3) == 0xFF)
2593             {
2594                 __m64 v0 = invert_colors (load8888 (&s0));
2595                 __m64 v1 = invert_colors (load8888 (&s1));
2596                 __m64 v2 = invert_colors (load8888 (&s2));
2597                 __m64 v3 = invert_colors (load8888 (&s3));
2598
2599                 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2600             }
2601             else if (s0 | s1 | s2 | s3)
2602             {
2603                 __m64 vdest = *(__m64 *)dst;
2604                 __m64 v0, v1, v2, v3;
2605
2606                 __m64 vsrc0 = load8888 (&s0);
2607                 __m64 vsrc1 = load8888 (&s1);
2608                 __m64 vsrc2 = load8888 (&s2);
2609                 __m64 vsrc3 = load8888 (&s3);
2610
2611                 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2612
2613                 v0 = over_rev_non_pre (vsrc0, v0);
2614                 v1 = over_rev_non_pre (vsrc1, v1);
2615                 v2 = over_rev_non_pre (vsrc2, v2);
2616                 v3 = over_rev_non_pre (vsrc3, v3);
2617
2618                 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2619             }
2620
2621             w -= 4;
2622             dst += 4;
2623             src += 4;
2624         }
2625
2626         CHECKPOINT ();
2627
2628         while (w)
2629         {
2630             __m64 vsrc = load8888 (src);
2631             uint64_t d = *dst;
2632             __m64 vdest = expand565 (to_m64 (d), 0);
2633
2634             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2635
2636             *dst = to_uint64 (vdest);
2637
2638             w--;
2639             dst++;
2640             src++;
2641         }
2642     }
2643
2644     _mm_empty ();
2645 }
2646
2647 static void
2648 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2649                                 pixman_composite_info_t *info)
2650 {
2651     PIXMAN_COMPOSITE_ARGS (info);
2652     uint32_t    *dst_line, *dst;
2653     uint32_t    *src_line, *src;
2654     int dst_stride, src_stride;
2655     int32_t w;
2656
2657     CHECKPOINT ();
2658
2659     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2660     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2661
2662 #if 0
2663     /* FIXME */
2664     assert (src_image->drawable == mask_image->drawable);
2665 #endif
2666
2667     while (height--)
2668     {
2669         dst = dst_line;
2670         dst_line += dst_stride;
2671         src = src_line;
2672         src_line += src_stride;
2673         w = width;
2674
2675         while (w && (uintptr_t)dst & 7)
2676         {
2677             __m64 s = load8888 (src);
2678             __m64 d = load8888 (dst);
2679
2680             store8888 (dst, over_rev_non_pre (s, d));
2681
2682             w--;
2683             dst++;
2684             src++;
2685         }
2686
2687         while (w >= 2)
2688         {
2689             uint32_t s0, s1;
2690             unsigned char a0, a1;
2691             __m64 d0, d1;
2692
2693             s0 = *src;
2694             s1 = *(src + 1);
2695
2696             a0 = (s0 >> 24);
2697             a1 = (s1 >> 24);
2698
2699             if ((a0 & a1) == 0xFF)
2700             {
2701                 d0 = invert_colors (load8888 (&s0));
2702                 d1 = invert_colors (load8888 (&s1));
2703
2704                 *(__m64 *)dst = pack8888 (d0, d1);
2705             }
2706             else if (s0 | s1)
2707             {
2708                 __m64 vdest = *(__m64 *)dst;
2709
2710                 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2711                 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2712
2713                 *(__m64 *)dst = pack8888 (d0, d1);
2714             }
2715
2716             w -= 2;
2717             dst += 2;
2718             src += 2;
2719         }
2720
2721         if (w)
2722         {
2723             __m64 s = load8888 (src);
2724             __m64 d = load8888 (dst);
2725
2726             store8888 (dst, over_rev_non_pre (s, d));
2727         }
2728     }
2729
2730     _mm_empty ();
2731 }
2732
2733 static void
2734 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2735                                    pixman_composite_info_t *info)
2736 {
2737     PIXMAN_COMPOSITE_ARGS (info);
2738     uint32_t src;
2739     uint16_t    *dst_line;
2740     uint32_t    *mask_line;
2741     int dst_stride, mask_stride;
2742     __m64 vsrc, vsrca;
2743
2744     CHECKPOINT ();
2745
2746     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2747
2748     if (src == 0)
2749         return;
2750
2751     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2752     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2753
2754     vsrc = load8888 (&src);
2755     vsrca = expand_alpha (vsrc);
2756
2757     while (height--)
2758     {
2759         int twidth = width;
2760         uint32_t *p = (uint32_t *)mask_line;
2761         uint16_t *q = (uint16_t *)dst_line;
2762
2763         while (twidth && ((uintptr_t)q & 7))
2764         {
2765             uint32_t m = *(uint32_t *)p;
2766
2767             if (m)
2768             {
2769                 uint64_t d = *q;
2770                 __m64 vdest = expand565 (to_m64 (d), 0);
2771                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2772                 *q = to_uint64 (vdest);
2773             }
2774
2775             twidth--;
2776             p++;
2777             q++;
2778         }
2779
2780         while (twidth >= 4)
2781         {
2782             uint32_t m0, m1, m2, m3;
2783
2784             m0 = *p;
2785             m1 = *(p + 1);
2786             m2 = *(p + 2);
2787             m3 = *(p + 3);
2788
2789             if ((m0 | m1 | m2 | m3))
2790             {
2791                 __m64 vdest = *(__m64 *)q;
2792                 __m64 v0, v1, v2, v3;
2793
2794                 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2795
2796                 v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2797                 v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2798                 v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2799                 v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2800
2801                 *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2802             }
2803             twidth -= 4;
2804             p += 4;
2805             q += 4;
2806         }
2807
2808         while (twidth)
2809         {
2810             uint32_t m;
2811
2812             m = *(uint32_t *)p;
2813             if (m)
2814             {
2815                 uint64_t d = *q;
2816                 __m64 vdest = expand565 (to_m64 (d), 0);
2817                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2818                 *q = to_uint64 (vdest);
2819             }
2820
2821             twidth--;
2822             p++;
2823             q++;
2824         }
2825
2826         mask_line += mask_stride;
2827         dst_line += dst_stride;
2828     }
2829
2830     _mm_empty ();
2831 }
2832
2833 static void
2834 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2835                         pixman_composite_info_t *info)
2836 {
2837     PIXMAN_COMPOSITE_ARGS (info);
2838     uint8_t *dst_line, *dst;
2839     uint8_t *mask_line, *mask;
2840     int dst_stride, mask_stride;
2841     int32_t w;
2842     uint32_t src;
2843     uint8_t sa;
2844     __m64 vsrc, vsrca;
2845
2846     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2847     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2848
2849     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2850
2851     sa = src >> 24;
2852
2853     vsrc = load8888 (&src);
2854     vsrca = expand_alpha (vsrc);
2855
2856     while (height--)
2857     {
2858         dst = dst_line;
2859         dst_line += dst_stride;
2860         mask = mask_line;
2861         mask_line += mask_stride;
2862         w = width;
2863
2864         while (w && (uintptr_t)dst & 7)
2865         {
2866             uint16_t tmp;
2867             uint8_t a;
2868             uint32_t m, d;
2869
2870             a = *mask++;
2871             d = *dst;
2872
2873             m = MUL_UN8 (sa, a, tmp);
2874             d = MUL_UN8 (m, d, tmp);
2875
2876             *dst++ = d;
2877             w--;
2878         }
2879
2880         while (w >= 4)
2881         {
2882             __m64 vmask;
2883             __m64 vdest;
2884
2885             vmask = load8888u ((uint32_t *)mask);
2886             vdest = load8888 ((uint32_t *)dst);
2887
2888             store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2889
2890             dst += 4;
2891             mask += 4;
2892             w -= 4;
2893         }
2894
2895         while (w--)
2896         {
2897             uint16_t tmp;
2898             uint8_t a;
2899             uint32_t m, d;
2900
2901             a = *mask++;
2902             d = *dst;
2903
2904             m = MUL_UN8 (sa, a, tmp);
2905             d = MUL_UN8 (m, d, tmp);
2906
2907             *dst++ = d;
2908         }
2909     }
2910
2911     _mm_empty ();
2912 }
2913
2914 static void
2915 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2916                       pixman_composite_info_t *info)
2917 {
2918     PIXMAN_COMPOSITE_ARGS (info);
2919     uint8_t     *dst_line, *dst;
2920     uint8_t     *src_line, *src;
2921     int src_stride, dst_stride;
2922     int32_t w;
2923
2924     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2925     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2926
2927     while (height--)
2928     {
2929         dst = dst_line;
2930         dst_line += dst_stride;
2931         src = src_line;
2932         src_line += src_stride;
2933         w = width;
2934
2935         while (w && (uintptr_t)dst & 3)
2936         {
2937             uint8_t s, d;
2938             uint16_t tmp;
2939
2940             s = *src;
2941             d = *dst;
2942
2943             *dst = MUL_UN8 (s, d, tmp);
2944
2945             src++;
2946             dst++;
2947             w--;
2948         }
2949
2950         while (w >= 4)
2951         {
2952             uint32_t *s = (uint32_t *)src;
2953             uint32_t *d = (uint32_t *)dst;
2954
2955             store8888 (d, in (load8888u (s), load8888 (d)));
2956
2957             w -= 4;
2958             dst += 4;
2959             src += 4;
2960         }
2961
2962         while (w--)
2963         {
2964             uint8_t s, d;
2965             uint16_t tmp;
2966
2967             s = *src;
2968             d = *dst;
2969
2970             *dst = MUL_UN8 (s, d, tmp);
2971
2972             src++;
2973             dst++;
2974         }
2975     }
2976
2977     _mm_empty ();
2978 }
2979
2980 static void
2981 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2982                          pixman_composite_info_t *info)
2983 {
2984     PIXMAN_COMPOSITE_ARGS (info);
2985     uint8_t     *dst_line, *dst;
2986     uint8_t     *mask_line, *mask;
2987     int dst_stride, mask_stride;
2988     int32_t w;
2989     uint32_t src;
2990     uint8_t sa;
2991     __m64 vsrc, vsrca;
2992
2993     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2994     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2995
2996     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2997
2998     sa = src >> 24;
2999
3000     if (src == 0)
3001         return;
3002
3003     vsrc = load8888 (&src);
3004     vsrca = expand_alpha (vsrc);
3005
3006     while (height--)
3007     {
3008         dst = dst_line;
3009         dst_line += dst_stride;
3010         mask = mask_line;
3011         mask_line += mask_stride;
3012         w = width;
3013
3014         while (w && (uintptr_t)dst & 3)
3015         {
3016             uint16_t tmp;
3017             uint16_t a;
3018             uint32_t m, d;
3019             uint32_t r;
3020
3021             a = *mask++;
3022             d = *dst;
3023
3024             m = MUL_UN8 (sa, a, tmp);
3025             r = ADD_UN8 (m, d, tmp);
3026
3027             *dst++ = r;
3028             w--;
3029         }
3030
3031         while (w >= 4)
3032         {
3033             __m64 vmask;
3034             __m64 vdest;
3035
3036             vmask = load8888u ((uint32_t *)mask);
3037             vdest = load8888 ((uint32_t *)dst);
3038
3039             store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3040
3041             dst += 4;
3042             mask += 4;
3043             w -= 4;
3044         }
3045
3046         while (w--)
3047         {
3048             uint16_t tmp;
3049             uint16_t a;
3050             uint32_t m, d;
3051             uint32_t r;
3052
3053             a = *mask++;
3054             d = *dst;
3055
3056             m = MUL_UN8 (sa, a, tmp);
3057             r = ADD_UN8 (m, d, tmp);
3058
3059             *dst++ = r;
3060         }
3061     }
3062
3063     _mm_empty ();
3064 }
3065
3066 static void
3067 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3068                        pixman_composite_info_t *info)
3069 {
3070     PIXMAN_COMPOSITE_ARGS (info);
3071     uint8_t *dst_line, *dst;
3072     uint8_t *src_line, *src;
3073     int dst_stride, src_stride;
3074     int32_t w;
3075     uint8_t s, d;
3076     uint16_t t;
3077
3078     CHECKPOINT ();
3079
3080     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3081     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3082
3083     while (height--)
3084     {
3085         dst = dst_line;
3086         dst_line += dst_stride;
3087         src = src_line;
3088         src_line += src_stride;
3089         w = width;
3090
3091         while (w && (uintptr_t)dst & 7)
3092         {
3093             s = *src;
3094             d = *dst;
3095             t = d + s;
3096             s = t | (0 - (t >> 8));
3097             *dst = s;
3098
3099             dst++;
3100             src++;
3101             w--;
3102         }
3103
3104         while (w >= 8)
3105         {
3106             *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3107             dst += 8;
3108             src += 8;
3109             w -= 8;
3110         }
3111
3112         while (w)
3113         {
3114             s = *src;
3115             d = *dst;
3116             t = d + s;
3117             s = t | (0 - (t >> 8));
3118             *dst = s;
3119
3120             dst++;
3121             src++;
3122             w--;
3123         }
3124     }
3125
3126     _mm_empty ();
3127 }
3128
3129 static void
3130 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3131                              pixman_composite_info_t *info)
3132 {
3133     PIXMAN_COMPOSITE_ARGS (info);
3134     uint16_t    *dst_line, *dst;
3135     uint32_t    d;
3136     uint16_t    *src_line, *src;
3137     uint32_t    s;
3138     int dst_stride, src_stride;
3139     int32_t w;
3140
3141     CHECKPOINT ();
3142
3143     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3144     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3145
3146     while (height--)
3147     {
3148         dst = dst_line;
3149         dst_line += dst_stride;
3150         src = src_line;
3151         src_line += src_stride;
3152         w = width;
3153
3154         while (w && (uintptr_t)dst & 7)
3155         {
3156             s = *src++;
3157             if (s)
3158             {
3159                 d = *dst;
3160                 s = convert_0565_to_8888 (s);
3161                 if (d)
3162                 {
3163                     d = convert_0565_to_8888 (d);
3164                     UN8x4_ADD_UN8x4 (s, d);
3165                 }
3166                 *dst = convert_8888_to_0565 (s);
3167             }
3168             dst++;
3169             w--;
3170         }
3171
3172         while (w >= 4)
3173         {
3174             __m64 vdest = *(__m64 *)dst;
3175             __m64 vsrc = ldq_u ((__m64 *)src);
3176             __m64 vd0, vd1;
3177             __m64 vs0, vs1;
3178
3179             expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3180             expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3181
3182             vd0 = _mm_adds_pu8 (vd0, vs0);
3183             vd1 = _mm_adds_pu8 (vd1, vs1);
3184
3185             *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3186
3187             dst += 4;
3188             src += 4;
3189             w -= 4;
3190         }
3191
3192         while (w--)
3193         {
3194             s = *src++;
3195             if (s)
3196             {
3197                 d = *dst;
3198                 s = convert_0565_to_8888 (s);
3199                 if (d)
3200                 {
3201                     d = convert_0565_to_8888 (d);
3202                     UN8x4_ADD_UN8x4 (s, d);
3203                 }
3204                 *dst = convert_8888_to_0565 (s);
3205             }
3206             dst++;
3207         }
3208     }
3209
3210     _mm_empty ();
3211 }
3212
3213 static void
3214 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3215                              pixman_composite_info_t *info)
3216 {
3217     PIXMAN_COMPOSITE_ARGS (info);
3218     uint32_t    *dst_line, *dst;
3219     uint32_t    *src_line, *src;
3220     int dst_stride, src_stride;
3221     int32_t w;
3222
3223     CHECKPOINT ();
3224
3225     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3226     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3227
3228     while (height--)
3229     {
3230         dst = dst_line;
3231         dst_line += dst_stride;
3232         src = src_line;
3233         src_line += src_stride;
3234         w = width;
3235
3236         while (w && (uintptr_t)dst & 7)
3237         {
3238             store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3239                                       load ((const uint32_t *)dst)));
3240             dst++;
3241             src++;
3242             w--;
3243         }
3244
3245         while (w >= 2)
3246         {
3247             *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3248             dst += 2;
3249             src += 2;
3250             w -= 2;
3251         }
3252
3253         if (w)
3254         {
3255             store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3256                                       load ((const uint32_t *)dst)));
3257
3258         }
3259     }
3260
3261     _mm_empty ();
3262 }
3263
3264 static pixman_bool_t
3265 mmx_blt (pixman_implementation_t *imp,
3266          uint32_t *               src_bits,
3267          uint32_t *               dst_bits,
3268          int                      src_stride,
3269          int                      dst_stride,
3270          int                      src_bpp,
3271          int                      dst_bpp,
3272          int                      src_x,
3273          int                      src_y,
3274          int                      dest_x,
3275          int                      dest_y,
3276          int                      width,
3277          int                      height)
3278 {
3279     uint8_t *   src_bytes;
3280     uint8_t *   dst_bytes;
3281     int byte_width;
3282
3283     if (src_bpp != dst_bpp)
3284         return FALSE;
3285
3286     if (src_bpp == 16)
3287     {
3288         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3289         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3290         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3291         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3292         byte_width = 2 * width;
3293         src_stride *= 2;
3294         dst_stride *= 2;
3295     }
3296     else if (src_bpp == 32)
3297     {
3298         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3299         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3300         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3301         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3302         byte_width = 4 * width;
3303         src_stride *= 4;
3304         dst_stride *= 4;
3305     }
3306     else
3307     {
3308         return FALSE;
3309     }
3310
3311     while (height--)
3312     {
3313         int w;
3314         uint8_t *s = src_bytes;
3315         uint8_t *d = dst_bytes;
3316         src_bytes += src_stride;
3317         dst_bytes += dst_stride;
3318         w = byte_width;
3319
3320         if (w >= 1 && ((uintptr_t)d & 1))
3321         {
3322             *(uint8_t *)d = *(uint8_t *)s;
3323             w -= 1;
3324             s += 1;
3325             d += 1;
3326         }
3327
3328         if (w >= 2 && ((uintptr_t)d & 3))
3329         {
3330             *(uint16_t *)d = *(uint16_t *)s;
3331             w -= 2;
3332             s += 2;
3333             d += 2;
3334         }
3335
3336         while (w >= 4 && ((uintptr_t)d & 7))
3337         {
3338             *(uint32_t *)d = ldl_u ((uint32_t *)s);
3339
3340             w -= 4;
3341             s += 4;
3342             d += 4;
3343         }
3344
3345         while (w >= 64)
3346         {
3347 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3348             __asm__ (
3349                 "movq     (%1),   %%mm0\n"
3350                 "movq    8(%1),   %%mm1\n"
3351                 "movq   16(%1),   %%mm2\n"
3352                 "movq   24(%1),   %%mm3\n"
3353                 "movq   32(%1),   %%mm4\n"
3354                 "movq   40(%1),   %%mm5\n"
3355                 "movq   48(%1),   %%mm6\n"
3356                 "movq   56(%1),   %%mm7\n"
3357
3358                 "movq   %%mm0,    (%0)\n"
3359                 "movq   %%mm1,   8(%0)\n"
3360                 "movq   %%mm2,  16(%0)\n"
3361                 "movq   %%mm3,  24(%0)\n"
3362                 "movq   %%mm4,  32(%0)\n"
3363                 "movq   %%mm5,  40(%0)\n"
3364                 "movq   %%mm6,  48(%0)\n"
3365                 "movq   %%mm7,  56(%0)\n"
3366                 :
3367                 : "r" (d), "r" (s)
3368                 : "memory",
3369                   "%mm0", "%mm1", "%mm2", "%mm3",
3370                   "%mm4", "%mm5", "%mm6", "%mm7");
3371 #else
3372             __m64 v0 = ldq_u ((__m64 *)(s + 0));
3373             __m64 v1 = ldq_u ((__m64 *)(s + 8));
3374             __m64 v2 = ldq_u ((__m64 *)(s + 16));
3375             __m64 v3 = ldq_u ((__m64 *)(s + 24));
3376             __m64 v4 = ldq_u ((__m64 *)(s + 32));
3377             __m64 v5 = ldq_u ((__m64 *)(s + 40));
3378             __m64 v6 = ldq_u ((__m64 *)(s + 48));
3379             __m64 v7 = ldq_u ((__m64 *)(s + 56));
3380             *(__m64 *)(d + 0)  = v0;
3381             *(__m64 *)(d + 8)  = v1;
3382             *(__m64 *)(d + 16) = v2;
3383             *(__m64 *)(d + 24) = v3;
3384             *(__m64 *)(d + 32) = v4;
3385             *(__m64 *)(d + 40) = v5;
3386             *(__m64 *)(d + 48) = v6;
3387             *(__m64 *)(d + 56) = v7;
3388 #endif
3389
3390             w -= 64;
3391             s += 64;
3392             d += 64;
3393         }
3394         while (w >= 4)
3395         {
3396             *(uint32_t *)d = ldl_u ((uint32_t *)s);
3397
3398             w -= 4;
3399             s += 4;
3400             d += 4;
3401         }
3402         if (w >= 2)
3403         {
3404             *(uint16_t *)d = *(uint16_t *)s;
3405             w -= 2;
3406             s += 2;
3407             d += 2;
3408         }
3409     }
3410
3411     _mm_empty ();
3412
3413     return TRUE;
3414 }
3415
3416 static void
3417 mmx_composite_copy_area (pixman_implementation_t *imp,
3418                          pixman_composite_info_t *info)
3419 {
3420     PIXMAN_COMPOSITE_ARGS (info);
3421
3422     mmx_blt (imp, src_image->bits.bits,
3423              dest_image->bits.bits,
3424              src_image->bits.rowstride,
3425              dest_image->bits.rowstride,
3426              PIXMAN_FORMAT_BPP (src_image->bits.format),
3427              PIXMAN_FORMAT_BPP (dest_image->bits.format),
3428              src_x, src_y, dest_x, dest_y, width, height);
3429 }
3430
3431 static void
3432 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3433                                 pixman_composite_info_t *info)
3434 {
3435     PIXMAN_COMPOSITE_ARGS (info);
3436     uint32_t  *src, *src_line;
3437     uint32_t  *dst, *dst_line;
3438     uint8_t  *mask, *mask_line;
3439     int src_stride, mask_stride, dst_stride;
3440     int32_t w;
3441
3442     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3443     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3444     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3445
3446     while (height--)
3447     {
3448         src = src_line;
3449         src_line += src_stride;
3450         dst = dst_line;
3451         dst_line += dst_stride;
3452         mask = mask_line;
3453         mask_line += mask_stride;
3454
3455         w = width;
3456
3457         while (w--)
3458         {
3459             uint64_t m = *mask;
3460
3461             if (m)
3462             {
3463                 uint32_t ssrc = *src | 0xff000000;
3464                 __m64 s = load8888 (&ssrc);
3465
3466                 if (m == 0xff)
3467                 {
3468                     store8888 (dst, s);
3469                 }
3470                 else
3471                 {
3472                     __m64 sa = expand_alpha (s);
3473                     __m64 vm = expand_alpha_rev (to_m64 (m));
3474                     __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3475
3476                     store8888 (dst, vdest);
3477                 }
3478             }
3479
3480             mask++;
3481             dst++;
3482             src++;
3483         }
3484     }
3485
3486     _mm_empty ();
3487 }
3488
3489 static void
3490 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3491                                    pixman_composite_info_t *info)
3492 {
3493     PIXMAN_COMPOSITE_ARGS (info);
3494     uint32_t src;
3495     uint32_t    *dst_line, *dst;
3496     int32_t w;
3497     int dst_stride;
3498     __m64 vsrc;
3499
3500     CHECKPOINT ();
3501
3502     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3503
3504     if (src == 0)
3505         return;
3506
3507     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3508
3509     vsrc = load8888 (&src);
3510
3511     while (height--)
3512     {
3513         dst = dst_line;
3514         dst_line += dst_stride;
3515         w = width;
3516
3517         CHECKPOINT ();
3518
3519         while (w && (uintptr_t)dst & 7)
3520         {
3521             __m64 vdest = load8888 (dst);
3522
3523             store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3524
3525             w--;
3526             dst++;
3527         }
3528
3529         while (w >= 2)
3530         {
3531             __m64 vdest = *(__m64 *)dst;
3532             __m64 dest0 = expand8888 (vdest, 0);
3533             __m64 dest1 = expand8888 (vdest, 1);
3534
3535
3536             dest0 = over (dest0, expand_alpha (dest0), vsrc);
3537             dest1 = over (dest1, expand_alpha (dest1), vsrc);
3538
3539             *(__m64 *)dst = pack8888 (dest0, dest1);
3540
3541             dst += 2;
3542             w -= 2;
3543         }
3544
3545         CHECKPOINT ();
3546
3547         if (w)
3548         {
3549             __m64 vdest = load8888 (dst);
3550
3551             store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3552         }
3553     }
3554
3555     _mm_empty ();
3556 }
3557
/*
 * BSHIFT is 2^BILINEAR_INTERPOLATION_BITS, the full-scale value of a
 * bilinear weight; BMSK is the matching bit mask (BSHIFT - 1).
 */
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
#define BMSK (BSHIFT - 1)
3560
/*
 * Declares the MMX state shared by BILINEAR_INTERPOLATE_ONE_PIXEL and
 * BILINEAR_SKIP_ONE_PIXEL.  It expands to a list of declarations without a
 * trailing semicolon, so it must be used in declaration position and be
 * followed by ';'.  The names wt, wb, unit_x and vx must be in scope.
 *
 *   mm_wt, mm_wb        wt / wb replicated into all four 16-bit lanes
 *   mm_addc7, mm_xorc7  lane constants used to build the horizontal weights
 *   mm_ux               unit_x replicated into all four 16-bit lanes
 *   mm_zero             all-zero register used for byte unpacking
 *   mm_x                running x position in 16-bit lanes (initialised
 *                       from vx; _mm_set_pi16 takes 16-bit values, so
 *                       presumably only the low 16 bits of vx are kept)
 */
#define BILINEAR_DECLARE_VARIABLES                                              \
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);                          \
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);                          \
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);                           \
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);                     \
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);          \
    const __m64 mm_zero = _mm_setzero_si64 ();                                  \
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3569
/*
 * Computes one bilinearly interpolated pixel and assigns it to 'pix'.
 *
 * Requires BILINEAR_DECLARE_VARIABLES to have been expanded in the
 * enclosing scope, plus src_top, src_bottom, vx and unit_x.
 *
 * Steps:
 *  - load two horizontally adjacent 32-bit pixels (64 bits, possibly
 *    unaligned) from each of src_top and src_bottom at index
 *    pixman_fixed_to_int (vx);
 *  - vertical pass: widen each byte to 16 bits and form top * wt +
 *    bottom * wb per channel, keeping only the low 16 bits of each product
 *    (_mm_mullo_pi16), so the weights are assumed small enough for the sum
 *    to fit -- TODO confirm against the callers;
 *  - horizontal weights: with f = mm_x >> (16 - BILINEAR_INTERPOLATION_BITS),
 *    each lane pair of mm_wh is (1 + (BMSK ^ f), f), i.e. (BSHIFT - f, f);
 *  - horizontal pass: interleave the channels of the two columns and use
 *    _mm_madd_pi16 to weight the low half of the loaded pair by BSHIFT - f
 *    and the high half by f;
 *  - shift right by 2 * BILINEAR_INTERPOLATION_BITS and pack back down to
 *    bytes with saturation.
 *
 * Side effects: vx is advanced by unit_x and mm_x by mm_ux.
 *
 * The macro declares the block-local names t, b, t_hi, t_lo, b_hi, b_lo,
 * hi, lo, mm_wh, p and q, so the 'pix' argument must not use any of them.
 */
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                     \
do {                                                                            \
    /* fetch 2x2 pixel block into 2 mmx registers */                            \
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);             \
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);          \
    /* vertical interpolation */                                                \
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);         \
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);         \
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);         \
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);         \
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);                                       \
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);                                       \
    /* calculate horizontal weights */                                          \
    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,               \
                          _mm_srli_pi16 (mm_x,                                  \
                                         16 - BILINEAR_INTERPOLATION_BITS)));   \
    /* horizontal interpolation */                                              \
    __m64 p = _mm_unpacklo_pi16 (lo, hi);                                       \
    __m64 q = _mm_unpackhi_pi16 (lo, hi);                                       \
    vx += unit_x;                                                               \
    lo = _mm_madd_pi16 (p, mm_wh);                                              \
    hi = _mm_madd_pi16 (q, mm_wh);                                              \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                          \
    /* shift and pack the result */                                             \
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);                   \
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);                   \
    lo = _mm_packs_pi32 (lo, hi);                                               \
    lo = _mm_packs_pu16 (lo, lo);                                               \
    pix = lo;                                                                   \
} while (0)
3600
/*
 * Advances the horizontal position by one destination pixel without
 * sampling: vx is stepped by unit_x and mm_x by mm_ux, the same position
 * updates BILINEAR_INTERPOLATE_ONE_PIXEL performs.
 */
#define BILINEAR_SKIP_ONE_PIXEL()                                               \
do {                                                                            \
    vx += unit_x;                                                               \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                          \
} while(0)
3606
3607 static force_inline void
3608 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
3609                                             const uint32_t * mask,
3610                                             const uint32_t * src_top,
3611                                             const uint32_t * src_bottom,
3612                                             int32_t          w,
3613                                             int              wt,
3614                                             int              wb,
3615                                             pixman_fixed_t   vx,
3616                                             pixman_fixed_t   unit_x,
3617                                             pixman_fixed_t   max_vx,
3618                                             pixman_bool_t    zero_src)
3619 {
3620     BILINEAR_DECLARE_VARIABLES;
3621     __m64 pix;
3622
3623     while (w--)
3624     {
3625         BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3626         store (dst, pix);
3627         dst++;
3628     }
3629
3630     _mm_empty ();
3631 }
3632
3633 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3634                                scaled_bilinear_scanline_mmx_8888_8888_SRC,
3635                                uint32_t, uint32_t, uint32_t,
3636                                COVER, FLAG_NONE)
3637 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3638                                scaled_bilinear_scanline_mmx_8888_8888_SRC,
3639                                uint32_t, uint32_t, uint32_t,
3640                                PAD, FLAG_NONE)
3641 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3642                                scaled_bilinear_scanline_mmx_8888_8888_SRC,
3643                                uint32_t, uint32_t, uint32_t,
3644                                NONE, FLAG_NONE)
3645 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3646                                scaled_bilinear_scanline_mmx_8888_8888_SRC,
3647                                uint32_t, uint32_t, uint32_t,
3648                                NORMAL, FLAG_NONE)
3649
3650 static force_inline void
3651 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
3652                                              const uint32_t * mask,
3653                                              const uint32_t * src_top,
3654                                              const uint32_t * src_bottom,
3655                                              int32_t          w,
3656                                              int              wt,
3657                                              int              wb,
3658                                              pixman_fixed_t   vx,
3659                                              pixman_fixed_t   unit_x,
3660                                              pixman_fixed_t   max_vx,
3661                                              pixman_bool_t    zero_src)
3662 {
3663     BILINEAR_DECLARE_VARIABLES;
3664     __m64 pix1, pix2;
3665
3666     while (w)
3667     {
3668         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3669
3670         if (!is_zero (pix1))
3671         {
3672             pix2 = load (dst);
3673             store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3674         }
3675
3676         w--;
3677         dst++;
3678     }
3679
3680     _mm_empty ();
3681 }
3682
3683 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3684                                scaled_bilinear_scanline_mmx_8888_8888_OVER,
3685                                uint32_t, uint32_t, uint32_t,
3686                                COVER, FLAG_NONE)
3687 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3688                                scaled_bilinear_scanline_mmx_8888_8888_OVER,
3689                                uint32_t, uint32_t, uint32_t,
3690                                PAD, FLAG_NONE)
3691 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3692                                scaled_bilinear_scanline_mmx_8888_8888_OVER,
3693                                uint32_t, uint32_t, uint32_t,
3694                                NONE, FLAG_NONE)
3695 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3696                                scaled_bilinear_scanline_mmx_8888_8888_OVER,
3697                                uint32_t, uint32_t, uint32_t,
3698                                NORMAL, FLAG_NONE)
3699
3700 static force_inline void
3701 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
3702                                                const uint8_t  * mask,
3703                                                const uint32_t * src_top,
3704                                                const uint32_t * src_bottom,
3705                                                int32_t          w,
3706                                                int              wt,
3707                                                int              wb,
3708                                                pixman_fixed_t   vx,
3709                                                pixman_fixed_t   unit_x,
3710                                                pixman_fixed_t   max_vx,
3711                                                pixman_bool_t    zero_src)
3712 {
3713     BILINEAR_DECLARE_VARIABLES;
3714     __m64 pix1, pix2;
3715     uint32_t m;
3716
3717     while (w)
3718     {
3719         m = (uint32_t) *mask++;
3720
3721         if (m)
3722         {
3723             BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3724
3725             if (m == 0xff && is_opaque (pix1))
3726             {
3727                 store (dst, pix1);
3728             }
3729             else
3730             {
3731                 __m64 ms, md, ma, msa;
3732
3733                 pix2 = load (dst);
3734                 ma = expand_alpha_rev (to_m64 (m));
3735                 ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3736                 md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3737
3738                 msa = expand_alpha (ms);
3739
3740                 store8888 (dst, (in_over (ms, msa, ma, md)));
3741             }
3742         }
3743         else
3744         {
3745             BILINEAR_SKIP_ONE_PIXEL ();
3746         }
3747
3748         w--;
3749         dst++;
3750     }
3751
3752     _mm_empty ();
3753 }
3754
3755 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3756                                scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3757                                uint32_t, uint8_t, uint32_t,
3758                                COVER, FLAG_HAVE_NON_SOLID_MASK)
3759 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3760                                scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3761                                uint32_t, uint8_t, uint32_t,
3762                                PAD, FLAG_HAVE_NON_SOLID_MASK)
3763 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3764                                scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3765                                uint32_t, uint8_t, uint32_t,
3766                                NONE, FLAG_HAVE_NON_SOLID_MASK)
3767 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3768                                scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3769                                uint32_t, uint8_t, uint32_t,
3770                                NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3771
3772 static uint32_t *
3773 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3774 {
3775     int w = iter->width;
3776     uint32_t *dst = iter->buffer;
3777     uint32_t *src = (uint32_t *)iter->bits;
3778
3779     iter->bits += iter->stride;
3780
3781     while (w && ((uintptr_t)dst) & 7)
3782     {
3783         *dst++ = (*src++) | 0xff000000;
3784         w--;
3785     }
3786
3787     while (w >= 8)
3788     {
3789         __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3790         __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3791         __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3792         __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3793
3794         *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3795         *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3796         *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3797         *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3798
3799         dst += 8;
3800         src += 8;
3801         w -= 8;
3802     }
3803
3804     while (w)
3805     {
3806         *dst++ = (*src++) | 0xff000000;
3807         w--;
3808     }
3809
3810     _mm_empty ();
3811     return iter->buffer;
3812 }
3813
3814 static uint32_t *
3815 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3816 {
3817     int w = iter->width;
3818     uint32_t *dst = iter->buffer;
3819     uint16_t *src = (uint16_t *)iter->bits;
3820
3821     iter->bits += iter->stride;
3822
3823     while (w && ((uintptr_t)dst) & 0x0f)
3824     {
3825         uint16_t s = *src++;
3826
3827         *dst++ = convert_0565_to_8888 (s);
3828         w--;
3829     }
3830
3831     while (w >= 4)
3832     {
3833         __m64 vsrc = ldq_u ((__m64 *)src);
3834         __m64 mm0, mm1;
3835
3836         expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3837
3838         *(__m64 *)(dst + 0) = mm0;
3839         *(__m64 *)(dst + 2) = mm1;
3840
3841         dst += 4;
3842         src += 4;
3843         w -= 4;
3844     }
3845
3846     while (w)
3847     {
3848         uint16_t s = *src++;
3849
3850         *dst++ = convert_0565_to_8888 (s);
3851         w--;
3852     }
3853
3854     _mm_empty ();
3855     return iter->buffer;
3856 }
3857
3858 static uint32_t *
3859 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3860 {
3861     int w = iter->width;
3862     uint32_t *dst = iter->buffer;
3863     uint8_t *src = iter->bits;
3864
3865     iter->bits += iter->stride;
3866
3867     while (w && (((uintptr_t)dst) & 15))
3868     {
3869         *dst++ = *(src++) << 24;
3870         w--;
3871     }
3872
3873     while (w >= 8)
3874     {
3875         __m64 mm0 = ldq_u ((__m64 *)src);
3876
3877         __m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
3878         __m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
3879         __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3880         __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3881         __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3882         __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3883
3884         *(__m64 *)(dst + 0) = mm3;
3885         *(__m64 *)(dst + 2) = mm4;
3886         *(__m64 *)(dst + 4) = mm5;
3887         *(__m64 *)(dst + 6) = mm6;
3888
3889         dst += 8;
3890         src += 8;
3891         w -= 8;
3892     }
3893
3894     while (w)
3895     {
3896         *dst++ = *(src++) << 24;
3897         w--;
3898     }
3899
3900     _mm_empty ();
3901     return iter->buffer;
3902 }
3903
/*
 * Image flag combination used by every entry of mmx_iters: the standard
 * fast-path flags plus identity transform, bits image and
 * FAST_PATH_SAMPLES_COVER_CLIP_NEAREST.
 */
#define IMAGE_FLAGS                                                     \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |                \
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
3907
/*
 * Iterator table for this implementation: one entry for each format that
 * has an MMX scanline fetcher (x8r8g8b8, r5g6b5, a8).  Every entry uses
 * IMAGE_FLAGS, ITER_NARROW and _pixman_iter_init_bits_stride and leaves its
 * last slot NULL.  The list is terminated by a PIXMAN_null entry.
 *
 * NOTE(review): the positional fields are presumably (format, image flags,
 * iterator flags, initializer, scanline fetcher, write-back) -- confirm
 * against the pixman_iter_info_t declaration.
 */
static const pixman_iter_info_t mmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};
3921
3922 static const pixman_fast_path_t mmx_fast_paths[] =
3923 {
3924     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
3925     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
3926     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
3927     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
3928     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
3929     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
3930     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3931     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3932     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
3933     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3934     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3935     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
3936     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3937     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3938     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
3939     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3940     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3941     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
3942     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
3943     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
3944     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
3945     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
3946     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
3947     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
3948     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
3949     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
3950     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
3951     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
3952     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
3953     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
3954     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
3955     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
3956     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
3957     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
3958     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3959     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3960
3961     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
3962     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
3963     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
3964     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
3965     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
3966     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
3967
3968     PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
3969     PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
3970
3971     PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
3972     PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
3973     PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
3974     PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
3975     PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
3976     PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
3977
3978     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
3979     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
3980     PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
3981     PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
3982     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
3983     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
3984     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
3985     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
3986     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
3987     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
3988     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3989     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3990     PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3991     PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3992     PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
3993     PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
3994
3995     PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
3996     PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
3997
3998     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
3999     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
4000     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
4001     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
4002     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
4003     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
4004
4005     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
4006     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
4007     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
4008     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
4009
4010     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
4011     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
4012     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
4013     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),
4014
4015     { PIXMAN_OP_NONE },
4016 };
4017
4018 pixman_implementation_t *
4019 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
4020 {
4021     pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
4022
4023     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
4024     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
4025     imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
4026     imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
4027     imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
4028     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
4029     imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
4030     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
4031     imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
4032     imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
4033     imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
4034
4035     imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
4036     imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
4037     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
4038     imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
4039     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
4040     imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
4041     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
4042     imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
4043     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
4044     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
4045     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
4046
4047     imp->blt = mmx_blt;
4048     imp->fill = mmx_fill;
4049
4050     imp->iter_info = mmx_iters;
4051
4052     return imp;
4053 }
4054
4055 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */