2 * Copyright © 2004, 2005 Red Hat, Inc.
3 * Copyright © 2004 Nicholas Miell
4 * Copyright © 2005 Trolltech AS
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of Red Hat not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. Red Hat makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * SOFTWARE.
25 * Author: Søren Sandmann (sandmann@redhat.com)
26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
29 * Based on work by Owen Taylor
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
45 #include "pixman-inlines.h"
48 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
53 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
54 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
55 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
63 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
64 # include <xmmintrin.h>
66 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
67 * instructions to be generated that we don't want. Just duplicate the
68 * functions we want to use. */
69 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_movemask_pi8 (__m64 __A)
74 asm ("pmovmskb %1, %0\n\t"
82 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
85 asm ("pmulhuw %1, %0\n\t"
93 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
94 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
98 asm ("pshufw %2, %1, %0\n\t"
100 : "y" (__A), "K" (__N)
106 # define _mm_shuffle_pi16(A, N) \
110 asm ("pshufw %2, %1, %0\n\t" \
112 : "y" (A), "K" ((const int8_t)N) \
122 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
123 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
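/* _MM_SHUFFLE packs four 2-bit lane selectors into the immediate byte used
 * by pshufw: for instance _MM_SHUFFLE (3, 3, 3, 3) is 0xff and broadcasts
 * lane 3, which is how expand_alpha () below replicates the alpha word
 * across a pixel.
 */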
126 /* Notes about writing mmx code
128 * give memory operands as the second operand. If you give it as the
129 * first, gcc will first load it into a register, then use that
130 * register.
132 * i.e. use
134 * _mm_mullo_pi16 (x, mmx_constant);
136 * not
138 * _mm_mullo_pi16 (mmx_constant, x);
140 * Also try to minimize dependencies. i.e. when you need a value, try
141 * to calculate it from a value that was calculated as early as
142 * possible.
145 /* --------------- MMX primitives ------------------------------------- */
147 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
148 * the name of the member used to access the data.
149 * If __m64 requires using mm_cvt* intrinsics functions to convert between
150 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
151 * If __m64 and uint64_t values can just be cast to each other directly,
152 * then define USE_M64_CASTS.
153 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
156 # define M64_MEMBER m64_u64
158 # define USE_CVT_INTRINSICS
159 #elif defined(USE_LOONGSON_MMI)
160 # define USE_M64_DOUBLE
161 #elif defined(__GNUC__)
162 # define USE_M64_CASTS
163 #elif defined(__SUNPRO_C)
164 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
165 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
166 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
167 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
169 # define USE_CVT_INTRINSICS
171 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
172 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
174 # define M64_MEMBER l_
178 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
179 typedef uint64_t mmxdatafield;
181 typedef __m64 mmxdatafield;
186 mmxdatafield mmx_4x00ff;
187 mmxdatafield mmx_4x0080;
188 mmxdatafield mmx_565_rgb;
189 mmxdatafield mmx_565_unpack_multiplier;
190 mmxdatafield mmx_565_pack_multiplier;
191 mmxdatafield mmx_565_r;
192 mmxdatafield mmx_565_g;
193 mmxdatafield mmx_565_b;
194 mmxdatafield mmx_packed_565_rb;
195 mmxdatafield mmx_packed_565_g;
196 mmxdatafield mmx_expand_565_g;
197 mmxdatafield mmx_expand_565_b;
198 mmxdatafield mmx_expand_565_r;
199 #ifndef USE_LOONGSON_MMI
200 mmxdatafield mmx_mask_0;
201 mmxdatafield mmx_mask_1;
202 mmxdatafield mmx_mask_2;
203 mmxdatafield mmx_mask_3;
205 mmxdatafield mmx_full_alpha;
206 mmxdatafield mmx_4x0101;
207 mmxdatafield mmx_ff000000;
210 #if defined(_MSC_VER)
211 # define MMXDATA_INIT(field, val) { val ## UI64 }
212 #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
213 # define MMXDATA_INIT(field, val) field = { val ## ULL }
214 #else /* mmxdatafield is an integral type */
215 # define MMXDATA_INIT(field, val) field = val ## ULL
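/* For example, with M64_MEMBER defined,
 *
 *     MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff)
 *
 * expands to ".mmx_4x00ff = { 0x00ff00ff00ff00ffULL }"; with an integral
 * mmxdatafield it expands to ".mmx_4x00ff = 0x00ff00ff00ff00ffULL".
 */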
218 static const mmx_data_t c =
220 MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
221 MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
222 MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
223 MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
224 MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004),
225 MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
226 MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
227 MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
228 MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
229 MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
230 MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
231 MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
232 MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
233 #ifndef USE_LOONGSON_MMI
234 MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
235 MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
236 MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
237 MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
239 MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
240 MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
241 MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),
244 #ifdef USE_CVT_INTRINSICS
245 # define MC(x) to_m64 (c.mmx_ ## x)
246 #elif defined(USE_M64_CASTS)
247 # define MC(x) ((__m64)c.mmx_ ## x)
248 #elif defined(USE_M64_DOUBLE)
249 # define MC(x) (*(__m64 *)&c.mmx_ ## x)
251 # define MC(x) c.mmx_ ## x
254 static force_inline __m64
257 #ifdef USE_CVT_INTRINSICS
258 return _mm_cvtsi64_m64 (x);
259 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
264 #elif defined USE_M64_DOUBLE
266 #else /* USE_M64_CASTS */
271 static force_inline uint64_t
274 #ifdef USE_CVT_INTRINSICS
275 return _mm_cvtm64_si64 (x);
276 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
277 uint64_t res = x.M64_MEMBER;
279 #elif defined USE_M64_DOUBLE
280 return *(uint64_t *)&x;
281 #else /* USE_M64_CASTS */
286 static force_inline __m64
291 return _mm_slli_si64 (v, s);
293 return _mm_srli_si64 (v, -s);
298 static force_inline __m64
301 return _mm_xor_si64 (mask, MC (4x00ff));
304 /* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
305 * and maps its result to the same range.
307 * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
308 * Notation, Notation, Notation", the first of which is
310 * prod(a, b) = (a * b + 128) / 255.
312 * By approximating the division by 255 as 257/65536 it can be replaced by a
313 * multiply and a right shift. This is the implementation that we use in
314 * pix_multiply(), but we use _mm_mulhi_pu16() to multiply by 257 (part of
315 * SSE1 or Extended 3DNow!, and unavailable when the book was published) so
316 * that the multiplication and right shift happen in a single operation.
318 * prod(a, b) = ((a * b + 128) * 257) >> 16.
320 * A third way also exists (it is how pix_multiply() was implemented prior
321 * to 14208344); it performs the multiplication by 257 with adds and shifts.
323 * Where temp = a * b + 128
325 * prod(a, b) = (temp + (temp >> 8)) >> 8.
327 static force_inline __m64
328 pix_multiply (__m64 a, __m64 b)
332 res = _mm_mullo_pi16 (a, b);
333 res = _mm_adds_pu16 (res, MC (4x0080));
334 res = _mm_mulhi_pu16 (res, MC (4x0101));
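/* A standalone sanity check (not part of pixman) that the mulhi form used
 * above and the older add/shift form both compute an exactly rounded
 * division by 255 for every 8-bit input pair:
 *
 *     #include <assert.h>
 *
 *     int
 *     main (void)
 *     {
 *         unsigned a, b;
 *
 *         for (a = 0; a < 256; a++)
 *             for (b = 0; b < 256; b++)
 *             {
 *                 unsigned temp = a * b + 128;
 *                 unsigned exact = (a * b + 127) / 255; // == round (a*b/255)
 *
 *                 assert (((temp * 257) >> 16) == exact);
 *                 assert (((temp + (temp >> 8)) >> 8) == exact);
 *             }
 *         return 0;
 *     }
 */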
339 static force_inline __m64
340 pix_add (__m64 a, __m64 b)
342 return _mm_adds_pu8 (a, b);
345 static force_inline __m64
346 expand_alpha (__m64 pixel)
348 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
351 static force_inline __m64
352 expand_alpha_rev (__m64 pixel)
354 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
357 static force_inline __m64
358 invert_colors (__m64 pixel)
360 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
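/* The three shuffles above: expand_alpha () broadcasts lane 3 (the alpha
 * word of an unpacked 00AA00RR00GG00BB pixel), expand_alpha_rev ()
 * broadcasts lane 0 (used when an 8-bit mask value was loaded into the low
 * word), and invert_colors () swaps lanes 0 and 2, i.e. ARGB <-> ABGR.
 */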
363 static force_inline __m64
368 return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
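/* over () is Porter-Duff OVER on unpacked pixels:
 *
 *     dest = src + dest * (1 - srca)
 *
 * with srca already expanded to all four channels, the multiply done with
 * rounding by pix_multiply (), and a saturating final add.
 */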
371 static force_inline __m64
372 over_rev_non_pre (__m64 src, __m64 dest)
374 __m64 srca = expand_alpha (src);
375 __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
377 return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
380 static force_inline __m64
381 in (__m64 src, __m64 mask)
383 return pix_multiply (src, mask);
387 static force_inline __m64
388 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
390 return over (in (src, mask), pix_multiply (srca, mask), dest);
395 #define in_over(src, srca, mask, dest) \
396 over (in (src, mask), pix_multiply (srca, mask), dest)
400 /* Elemental unaligned loads */
402 static force_inline __m64 ldq_u(__m64 *p)
405 /* x86's alignment restrictions are very relaxed. */
407 #elif defined USE_ARM_IWMMXT
408 int align = (uintptr_t)p & 7;
412 aligned_p = (__m64 *)((uintptr_t)p & ~7);
413 return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
415 struct __una_u64 { __m64 x __attribute__((packed)); };
416 const struct __una_u64 *ptr = (const struct __una_u64 *) p;
417 return (__m64) ptr->x;
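/* The packed-struct trick above is how GCC is told the pointer may be
 * misaligned: reading through a member with __attribute__((packed)) makes
 * the compiler emit whatever unaligned-load sequence the target needs,
 * instead of a plain 8-byte load that could fault on strict-alignment CPUs.
 */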
421 static force_inline uint32_t ldl_u(const uint32_t *p)
424 /* x86's alignment restrictions are very relaxed. */
427 struct __una_u32 { uint32_t x __attribute__((packed)); };
428 const struct __una_u32 *ptr = (const struct __una_u32 *) p;
433 static force_inline __m64
434 load (const uint32_t *v)
436 #ifdef USE_LOONGSON_MMI
438 asm ("lwc1 %0, %1\n\t"
444 return _mm_cvtsi32_si64 (*v);
448 static force_inline __m64
449 load8888 (const uint32_t *v)
451 #ifdef USE_LOONGSON_MMI
452 return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
454 return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
458 static force_inline __m64
459 load8888u (const uint32_t *v)
461 uint32_t l = ldl_u (v);
462 return load8888 (&l);
465 static force_inline __m64
466 pack8888 (__m64 lo, __m64 hi)
468 return _mm_packs_pu16 (lo, hi);
471 static force_inline void
472 store (uint32_t *dest, __m64 v)
474 #ifdef USE_LOONGSON_MMI
475 asm ("swc1 %1, %0\n\t"
481 *dest = _mm_cvtsi64_si32 (v);
485 static force_inline void
486 store8888 (uint32_t *dest, __m64 v)
488 v = pack8888 (v, _mm_setzero_si64 ());
492 static force_inline pixman_bool_t
493 is_equal (__m64 a, __m64 b)
495 #ifdef USE_LOONGSON_MMI
496 /* __m64 is a double here, so we can compare directly. */
499 return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
503 static force_inline pixman_bool_t
506 #ifdef USE_LOONGSON_MMI
507 return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
509 __m64 ffs = _mm_cmpeq_pi8 (v, v);
510 return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
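/* In the unpacked 00AA00RR00GG00BB layout the alpha byte is byte 6, so bit
 * 0x40 of the pmovmskb result is set exactly when alpha == 0xff.
 */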
514 static force_inline pixman_bool_t
517 return is_equal (v, _mm_setzero_si64 ());
520 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
522 * 00RR00GG00BB
524 * --- Expanding 565 in the low word ---
526 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
527 * m = m & (01f0003f001f);
528 * m = m * (008404100840);
529 * m = m >> 8;
531 * Note the trick here - the top word is shifted by another nibble to
532 * avoid it bumping into the middle word
534 static force_inline __m64
535 expand565 (__m64 pixel, int pos)
540 /* move pixel to low 16 bit and zero the rest */
541 #ifdef USE_LOONGSON_MMI
542 p = loongson_extract_pi16 (p, pos);
544 p = shift (shift (p, (3 - pos) * 16), -48);
547 t1 = shift (p, 36 - 11);
548 t2 = shift (p, 16 - 5);
550 p = _mm_or_si64 (t1, p);
551 p = _mm_or_si64 (t2, p);
552 p = _mm_and_si64 (p, MC (565_rgb));
554 pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
555 return _mm_srli_pi16 (pixel, 8);
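/* The multiply step above replicates each field's top bits into its low
 * bits: e.g. the 5-bit blue value b is multiplied by 0x0840 = (1 << 11) |
 * (1 << 6), leaving (b << 3) | (b >> 2) in the high byte of its word, and
 * the final shift right by 8 brings the 8-bit channels into place.
 */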
558 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
562 static force_inline void
563 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
565 __m64 t0, t1, alpha = _mm_setzero_si64 ();
566 __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
567 __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
568 __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
569 if (full_alpha)
570 alpha = _mm_cmpeq_pi32 (alpha, alpha);
572 /* Replicate high bits into empty low bits. */
573 r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
574 g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
575 b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
577 r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
578 g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
579 b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */
581 t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */
582 t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */
584 *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */
585 *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */
588 static force_inline __m64
589 expand8888 (__m64 in, int pos)
592 return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
594 return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
597 static force_inline __m64
598 expandx888 (__m64 in, int pos)
600 return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
603 static force_inline void
604 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
607 expand_4xpacked565 (vin, &v0, &v1, full_alpha);
608 *vout0 = expand8888 (v0, 0);
609 *vout1 = expand8888 (v0, 1);
610 *vout2 = expand8888 (v1, 0);
611 *vout3 = expand8888 (v1, 1);
614 static force_inline __m64
615 pack_565 (__m64 pixel, __m64 target, int pos)
621 r = _mm_and_si64 (p, MC (565_r));
622 g = _mm_and_si64 (p, MC (565_g));
623 b = _mm_and_si64 (p, MC (565_b));
625 #ifdef USE_LOONGSON_MMI
626 r = shift (r, -(32 - 8));
627 g = shift (g, -(16 - 3));
628 b = shift (b, -(0 + 3));
630 p = _mm_or_si64 (r, g);
631 p = _mm_or_si64 (p, b);
632 return loongson_insert_pi16 (t, p, pos);
634 r = shift (r, -(32 - 8) + pos * 16);
635 g = shift (g, -(16 - 3) + pos * 16);
636 b = shift (b, -(0 + 3) + pos * 16);
639 t = _mm_and_si64 (t, MC (mask_0));
641 t = _mm_and_si64 (t, MC (mask_1));
643 t = _mm_and_si64 (t, MC (mask_2));
645 t = _mm_and_si64 (t, MC (mask_3));
647 p = _mm_or_si64 (r, t);
648 p = _mm_or_si64 (g, p);
650 return _mm_or_si64 (b, p);
654 static force_inline __m64
655 pack_4xpacked565 (__m64 a, __m64 b)
657 __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
658 __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
660 __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
661 __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
663 __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
664 __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
666 t0 = _mm_or_si64 (t0, g0);
667 t1 = _mm_or_si64 (t1, g1);
670 #ifdef USE_ARM_IWMMXT
672 return _mm_packs_pu32 (t0, t1);
674 t1 = shift (t1, -5 + 16);
675 return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
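/* The _mm_madd_pi16 above does two field placements at once: per 32-bit
 * pixel it computes (b & 0xf8) * 4 + (r & 0xf8) * 0x2000, which parks red
 * and blue 5 bits above their final 565 positions; or-ing in the masked
 * green byte and shifting right by 5 then yields r5g6b5.
 */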
681 static force_inline __m64
682 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
684 return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
687 static force_inline __m64
688 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
690 x = pix_multiply (x, a);
691 y = pix_multiply (y, b);
693 return pix_add (x, y);
698 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
700 #define pack_4x565(v0, v1, v2, v3) \
701 pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
703 #define pix_add_mul(x, a, y, b) \
704 ( x = pix_multiply (x, a), \
705 y = pix_multiply (y, b), \
710 /* --------------- MMX code patch for fbcompose.c --------------------- */
712 static force_inline __m64
713 combine (const uint32_t *src, const uint32_t *mask)
715 __m64 vsrc = load8888 (src);
719 __m64 m = load8888 (mask);
721 m = expand_alpha (m);
722 vsrc = pix_multiply (vsrc, m);
728 static force_inline __m64
729 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
731 vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
733 if (is_opaque (vsrc))
737 else if (!is_zero (vsrc))
739 return over (vsrc, expand_alpha (vsrc),
740 _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
743 return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
747 mmx_combine_over_u (pixman_implementation_t *imp,
750 const uint32_t * src,
751 const uint32_t * mask,
754 const uint32_t *end = dest + width;
758 __m64 vsrc = combine (src, mask);
760 if (is_opaque (vsrc))
762 store8888 (dest, vsrc);
764 else if (!is_zero (vsrc))
766 __m64 sa = expand_alpha (vsrc);
767 store8888 (dest, over (vsrc, sa, load8888 (dest)));
779 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
782 const uint32_t * src,
783 const uint32_t * mask,
786 const uint32_t *end = dest + width;
791 __m64 s = combine (src, mask);
794 da = expand_alpha (d);
795 store8888 (dest, over (d, da, s));
806 mmx_combine_in_u (pixman_implementation_t *imp,
809 const uint32_t * src,
810 const uint32_t * mask,
813 const uint32_t *end = dest + width;
818 __m64 x = combine (src, mask);
821 a = expand_alpha (a);
822 x = pix_multiply (x, a);
835 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
838 const uint32_t * src,
839 const uint32_t * mask,
842 const uint32_t *end = dest + width;
846 __m64 a = combine (src, mask);
850 a = expand_alpha (a);
851 x = pix_multiply (x, a);
863 mmx_combine_out_u (pixman_implementation_t *imp,
866 const uint32_t * src,
867 const uint32_t * mask,
870 const uint32_t *end = dest + width;
875 __m64 x = combine (src, mask);
878 a = expand_alpha (a);
880 x = pix_multiply (x, a);
892 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
895 const uint32_t * src,
896 const uint32_t * mask,
899 const uint32_t *end = dest + width;
903 __m64 a = combine (src, mask);
907 a = expand_alpha (a);
909 x = pix_multiply (x, a);
922 mmx_combine_atop_u (pixman_implementation_t *imp,
925 const uint32_t * src,
926 const uint32_t * mask,
929 const uint32_t *end = dest + width;
934 __m64 s = combine (src, mask);
937 sia = expand_alpha (s);
939 da = expand_alpha (d);
940 s = pix_add_mul (s, da, d, sia);
952 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
955 const uint32_t * src,
956 const uint32_t * mask,
966 __m64 s = combine (src, mask);
969 sa = expand_alpha (s);
970 dia = expand_alpha (d);
972 s = pix_add_mul (s, dia, d, sa);
984 mmx_combine_xor_u (pixman_implementation_t *imp,
987 const uint32_t * src,
988 const uint32_t * mask,
991 const uint32_t *end = dest + width;
996 __m64 s = combine (src, mask);
999 sia = expand_alpha (s);
1000 dia = expand_alpha (d);
1003 s = pix_add_mul (s, dia, d, sia);
1004 store8888 (dest, s);
1015 mmx_combine_add_u (pixman_implementation_t *imp,
1018 const uint32_t * src,
1019 const uint32_t * mask,
1022 const uint32_t *end = dest + width;
1027 __m64 s = combine (src, mask);
1029 d = load8888 (dest);
1031 store8888 (dest, s);
1042 mmx_combine_saturate_u (pixman_implementation_t *imp,
1045 const uint32_t * src,
1046 const uint32_t * mask,
1049 const uint32_t *end = dest + width;
1055 __m64 ms = combine (src, mask);
1056 __m64 md = load8888 (dest);
1064 uint32_t quot = DIV_UN8 (da, sa) << 24;
1065 __m64 msa = load8888 (&quot);
1066 msa = expand_alpha (msa);
1067 ms = pix_multiply (ms, msa);
1070 md = pix_add (md, ms);
1071 store8888 (dest, md);
1082 mmx_combine_src_ca (pixman_implementation_t *imp,
1085 const uint32_t * src,
1086 const uint32_t * mask,
1089 const uint32_t *end = src + width;
1093 __m64 a = load8888 (mask);
1094 __m64 s = load8888 (src);
1096 s = pix_multiply (s, a);
1097 store8888 (dest, s);
1107 mmx_combine_over_ca (pixman_implementation_t *imp,
1110 const uint32_t * src,
1111 const uint32_t * mask,
1114 const uint32_t *end = src + width;
1118 __m64 a = load8888 (mask);
1119 __m64 s = load8888 (src);
1120 __m64 d = load8888 (dest);
1121 __m64 sa = expand_alpha (s);
1123 store8888 (dest, in_over (s, sa, a, d));
1133 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1136 const uint32_t * src,
1137 const uint32_t * mask,
1140 const uint32_t *end = src + width;
1144 __m64 a = load8888 (mask);
1145 __m64 s = load8888 (src);
1146 __m64 d = load8888 (dest);
1147 __m64 da = expand_alpha (d);
1149 store8888 (dest, over (d, da, in (s, a)));
1159 mmx_combine_in_ca (pixman_implementation_t *imp,
1162 const uint32_t * src,
1163 const uint32_t * mask,
1166 const uint32_t *end = src + width;
1170 __m64 a = load8888 (mask);
1171 __m64 s = load8888 (src);
1172 __m64 d = load8888 (dest);
1173 __m64 da = expand_alpha (d);
1175 s = pix_multiply (s, a);
1176 s = pix_multiply (s, da);
1177 store8888 (dest, s);
1187 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1190 const uint32_t * src,
1191 const uint32_t * mask,
1194 const uint32_t *end = src + width;
1198 __m64 a = load8888 (mask);
1199 __m64 s = load8888 (src);
1200 __m64 d = load8888 (dest);
1201 __m64 sa = expand_alpha (s);
1203 a = pix_multiply (a, sa);
1204 d = pix_multiply (d, a);
1205 store8888 (dest, d);
1215 mmx_combine_out_ca (pixman_implementation_t *imp,
1218 const uint32_t * src,
1219 const uint32_t * mask,
1222 const uint32_t *end = src + width;
1226 __m64 a = load8888 (mask);
1227 __m64 s = load8888 (src);
1228 __m64 d = load8888 (dest);
1229 __m64 da = expand_alpha (d);
1232 s = pix_multiply (s, a);
1233 s = pix_multiply (s, da);
1234 store8888 (dest, s);
1244 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1247 const uint32_t * src,
1248 const uint32_t * mask,
1251 const uint32_t *end = src + width;
1255 __m64 a = load8888 (mask);
1256 __m64 s = load8888 (src);
1257 __m64 d = load8888 (dest);
1258 __m64 sa = expand_alpha (s);
1260 a = pix_multiply (a, sa);
1262 d = pix_multiply (d, a);
1263 store8888 (dest, d);
1273 mmx_combine_atop_ca (pixman_implementation_t *imp,
1276 const uint32_t * src,
1277 const uint32_t * mask,
1280 const uint32_t *end = src + width;
1284 __m64 a = load8888 (mask);
1285 __m64 s = load8888 (src);
1286 __m64 d = load8888 (dest);
1287 __m64 da = expand_alpha (d);
1288 __m64 sa = expand_alpha (s);
1290 s = pix_multiply (s, a);
1291 a = pix_multiply (a, sa);
1293 d = pix_add_mul (d, a, s, da);
1294 store8888 (dest, d);
1304 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1307 const uint32_t * src,
1308 const uint32_t * mask,
1311 const uint32_t *end = src + width;
1315 __m64 a = load8888 (mask);
1316 __m64 s = load8888 (src);
1317 __m64 d = load8888 (dest);
1318 __m64 da = expand_alpha (d);
1319 __m64 sa = expand_alpha (s);
1321 s = pix_multiply (s, a);
1322 a = pix_multiply (a, sa);
1324 d = pix_add_mul (d, a, s, da);
1325 store8888 (dest, d);
1335 mmx_combine_xor_ca (pixman_implementation_t *imp,
1338 const uint32_t * src,
1339 const uint32_t * mask,
1342 const uint32_t *end = src + width;
1346 __m64 a = load8888 (mask);
1347 __m64 s = load8888 (src);
1348 __m64 d = load8888 (dest);
1349 __m64 da = expand_alpha (d);
1350 __m64 sa = expand_alpha (s);
1352 s = pix_multiply (s, a);
1353 a = pix_multiply (a, sa);
1356 d = pix_add_mul (d, a, s, da);
1357 store8888 (dest, d);
1367 mmx_combine_add_ca (pixman_implementation_t *imp,
1370 const uint32_t * src,
1371 const uint32_t * mask,
1374 const uint32_t *end = src + width;
1378 __m64 a = load8888 (mask);
1379 __m64 s = load8888 (src);
1380 __m64 d = load8888 (dest);
1382 s = pix_multiply (s, a);
1384 store8888 (dest, d);
1393 /* ------------- MMX code paths called from fbpict.c -------------------- */
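/* All of the fast paths below share one skeleton: a head loop that handles
 * single pixels until dst reaches 8-byte alignment, a body that processes
 * two or more pixels per aligned 64-bit store, and a tail loop for the
 * remainder.  Source reads go through ldq_u (), so only the destination
 * needs the alignment.
 */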
1396 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1397 pixman_composite_info_t *info)
1399 PIXMAN_COMPOSITE_ARGS (info);
1401 uint32_t *dst_line, *dst;
1408 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1413 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1415 vsrc = load8888 (&src);
1416 vsrca = expand_alpha (vsrc);
1421 dst_line += dst_stride;
1426 while (w && (uintptr_t)dst & 7)
1428 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1439 vdest = *(__m64 *)dst;
1441 dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1442 dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1444 *(__m64 *)dst = pack8888 (dest0, dest1);
1454 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1462 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1463 pixman_composite_info_t *info)
1465 PIXMAN_COMPOSITE_ARGS (info);
1467 uint16_t *dst_line, *dst;
1474 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1479 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1481 vsrc = load8888 (&src);
1482 vsrca = expand_alpha (vsrc);
1487 dst_line += dst_stride;
1492 while (w && (uintptr_t)dst & 7)
1495 __m64 vdest = expand565 (to_m64 (d), 0);
1497 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1498 *dst = to_uint64 (vdest);
1506 __m64 vdest = *(__m64 *)dst;
1507 __m64 v0, v1, v2, v3;
1509 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1511 v0 = over (vsrc, vsrca, v0);
1512 v1 = over (vsrc, vsrca, v1);
1513 v2 = over (vsrc, vsrca, v2);
1514 v3 = over (vsrc, vsrca, v3);
1516 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1527 __m64 vdest = expand565 (to_m64 (d), 0);
1529 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1530 *dst = to_uint64 (vdest);
1541 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1542 pixman_composite_info_t *info)
1544 PIXMAN_COMPOSITE_ARGS (info);
1547 uint32_t *mask_line;
1548 int dst_stride, mask_stride;
1553 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1558 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1559 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1561 vsrc = load8888 (&src);
1562 vsrca = expand_alpha (vsrc);
1567 uint32_t *p = (uint32_t *)mask_line;
1568 uint32_t *q = (uint32_t *)dst_line;
1570 while (twidth && (uintptr_t)q & 7)
1572 uint32_t m = *(uint32_t *)p;
1576 __m64 vdest = load8888 (q);
1577 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1578 store8888 (q, vdest);
1595 __m64 vdest = *(__m64 *)q;
1597 dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1598 expand8888 (vdest, 0));
1599 dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1600 expand8888 (vdest, 1));
1602 *(__m64 *)q = pack8888 (dest0, dest1);
1612 uint32_t m = *(uint32_t *)p;
1616 __m64 vdest = load8888 (q);
1617 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1618 store8888 (q, vdest);
1626 dst_line += dst_stride;
1627 mask_line += mask_stride;
1634 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1635 pixman_composite_info_t *info)
1637 PIXMAN_COMPOSITE_ARGS (info);
1638 uint32_t *dst_line, *dst;
1639 uint32_t *src_line, *src;
1642 int dst_stride, src_stride;
1647 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1648 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1650 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1651 vmask = expand_alpha (load8888 (&mask));
1656 dst_line += dst_stride;
1658 src_line += src_stride;
1661 while (w && (uintptr_t)dst & 7)
1663 __m64 s = load8888 (src);
1664 __m64 d = load8888 (dst);
1666 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1675 __m64 vs = ldq_u ((__m64 *)src);
1676 __m64 vd = *(__m64 *)dst;
1677 __m64 vsrc0 = expand8888 (vs, 0);
1678 __m64 vsrc1 = expand8888 (vs, 1);
1680 *(__m64 *)dst = pack8888 (
1681 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1682 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1691 __m64 s = load8888 (src);
1692 __m64 d = load8888 (dst);
1694 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1702 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1703 pixman_composite_info_t *info)
1705 PIXMAN_COMPOSITE_ARGS (info);
1706 uint32_t *dst_line, *dst;
1707 uint32_t *src_line, *src;
1710 int dst_stride, src_stride;
1716 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1717 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1718 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1720 vmask = expand_alpha (load8888 (&mask));
1726 dst_line += dst_stride;
1728 src_line += src_stride;
1731 while (w && (uintptr_t)dst & 7)
1733 uint32_t ssrc = *src | 0xff000000;
1734 __m64 s = load8888 (&ssrc);
1735 __m64 d = load8888 (dst);
1737 store8888 (dst, in_over (s, srca, vmask, d));
1746 __m64 vd0 = *(__m64 *)(dst + 0);
1747 __m64 vd1 = *(__m64 *)(dst + 2);
1748 __m64 vd2 = *(__m64 *)(dst + 4);
1749 __m64 vd3 = *(__m64 *)(dst + 6);
1750 __m64 vd4 = *(__m64 *)(dst + 8);
1751 __m64 vd5 = *(__m64 *)(dst + 10);
1752 __m64 vd6 = *(__m64 *)(dst + 12);
1753 __m64 vd7 = *(__m64 *)(dst + 14);
1755 __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1756 __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1757 __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1758 __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1759 __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1760 __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1761 __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1762 __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1765 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1766 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1769 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1770 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1773 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1774 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1777 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1778 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1781 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1782 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1785 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1786 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1789 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1790 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1793 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1794 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1796 *(__m64 *)(dst + 0) = vd0;
1797 *(__m64 *)(dst + 2) = vd1;
1798 *(__m64 *)(dst + 4) = vd2;
1799 *(__m64 *)(dst + 6) = vd3;
1800 *(__m64 *)(dst + 8) = vd4;
1801 *(__m64 *)(dst + 10) = vd5;
1802 *(__m64 *)(dst + 12) = vd6;
1803 *(__m64 *)(dst + 14) = vd7;
1812 uint32_t ssrc = *src | 0xff000000;
1813 __m64 s = load8888 (&ssrc);
1814 __m64 d = load8888 (dst);
1816 store8888 (dst, in_over (s, srca, vmask, d));
1828 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1829 pixman_composite_info_t *info)
1831 PIXMAN_COMPOSITE_ARGS (info);
1832 uint32_t *dst_line, *dst;
1833 uint32_t *src_line, *src;
1835 int dst_stride, src_stride;
1841 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1842 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1847 dst_line += dst_stride;
1849 src_line += src_stride;
1865 sa = expand_alpha (ms);
1866 store8888 (dst, over (ms, sa, load8888 (dst)));
1876 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1877 pixman_composite_info_t *info)
1879 PIXMAN_COMPOSITE_ARGS (info);
1880 uint16_t *dst_line, *dst;
1881 uint32_t *src_line, *src;
1882 int dst_stride, src_stride;
1887 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1888 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1892 assert (src_image->drawable == mask_image->drawable);
1898 dst_line += dst_stride;
1900 src_line += src_stride;
1905 while (w && (uintptr_t)dst & 7)
1907 __m64 vsrc = load8888 (src);
1909 __m64 vdest = expand565 (to_m64 (d), 0);
1912 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1914 *dst = to_uint64 (vdest);
1925 __m64 vdest = *(__m64 *)dst;
1926 __m64 v0, v1, v2, v3;
1927 __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1929 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1931 vsrc0 = load8888 ((src + 0));
1932 vsrc1 = load8888 ((src + 1));
1933 vsrc2 = load8888 ((src + 2));
1934 vsrc3 = load8888 ((src + 3));
1936 v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1937 v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1938 v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1939 v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1941 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1952 __m64 vsrc = load8888 (src);
1954 __m64 vdest = expand565 (to_m64 (d), 0);
1956 vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1958 *dst = to_uint64 (vdest);
1970 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1971 pixman_composite_info_t *info)
1973 PIXMAN_COMPOSITE_ARGS (info);
1975 uint32_t *dst_line, *dst;
1976 uint8_t *mask_line, *mask;
1977 int dst_stride, mask_stride;
1984 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1990 srcsrc = (uint64_t)src << 32 | src;
1992 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1993 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1995 vsrc = load8888 (&src);
1996 vsrca = expand_alpha (vsrc);
2001 dst_line += dst_stride;
2003 mask_line += mask_stride;
2008 while (w && (uintptr_t)dst & 7)
2014 __m64 vdest = in_over (vsrc, vsrca,
2015 expand_alpha_rev (to_m64 (m)),
2018 store8888 (dst, vdest);
2035 if (srca == 0xff && (m0 & m1) == 0xff)
2037 *(uint64_t *)dst = srcsrc;
2044 vdest = *(__m64 *)dst;
2046 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2047 expand8888 (vdest, 0));
2048 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2049 expand8888 (vdest, 1));
2051 *(__m64 *)dst = pack8888 (dest0, dest1);
2067 __m64 vdest = load8888 (dst);
2070 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2071 store8888 (dst, vdest);
2079 static pixman_bool_t
2080 mmx_fill (pixman_implementation_t *imp,
2092 uint32_t byte_width;
2095 #if defined __GNUC__ && defined USE_X86_MMX
2096 __m64 v1, v2, v3, v4, v5, v6, v7;
2099 if (bpp != 16 && bpp != 32 && bpp != 8)
2104 stride = stride * (int) sizeof (uint32_t) / 1;
2105 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2108 filler = (filler & 0xff) * 0x01010101;
2112 stride = stride * (int) sizeof (uint32_t) / 2;
2113 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2114 byte_width = 2 * width;
2116 filler = (filler & 0xffff) * 0x00010001;
2120 stride = stride * (int) sizeof (uint32_t) / 4;
2121 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2122 byte_width = 4 * width;
2126 fill = ((uint64_t)filler << 32) | filler;
2127 vfill = to_m64 (fill);
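/* At this point the filler has been replicated out to 64 bits; for example
 * an 8 bpp filler of 0xab becomes 0xabababababababab, so every aligned
 * __m64 store below paints eight pixels.
 */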
2129 #if defined __GNUC__ && defined USE_X86_MMX
2138 : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2139 "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2146 uint8_t *d = byte_line;
2148 byte_line += stride;
2151 if (w >= 1 && ((uintptr_t)d & 1))
2153 *(uint8_t *)d = (filler & 0xff);
2158 if (w >= 2 && ((uintptr_t)d & 3))
2160 *(uint16_t *)d = filler;
2165 while (w >= 4 && ((uintptr_t)d & 7))
2167 *(uint32_t *)d = filler;
2175 #if defined __GNUC__ && defined USE_X86_MMX
2187 "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2188 "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2191 *(__m64*) (d + 0) = vfill;
2192 *(__m64*) (d + 8) = vfill;
2193 *(__m64*) (d + 16) = vfill;
2194 *(__m64*) (d + 24) = vfill;
2195 *(__m64*) (d + 32) = vfill;
2196 *(__m64*) (d + 40) = vfill;
2197 *(__m64*) (d + 48) = vfill;
2198 *(__m64*) (d + 56) = vfill;
2206 *(uint32_t *)d = filler;
2213 *(uint16_t *)d = filler;
2219 *(uint8_t *)d = (filler & 0xff);
2231 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2232 pixman_composite_info_t *info)
2234 PIXMAN_COMPOSITE_ARGS (info);
2235 uint16_t *dst_line, *dst;
2236 uint32_t *src_line, *src, s;
2237 int dst_stride, src_stride;
2240 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2241 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2246 dst_line += dst_stride;
2248 src_line += src_stride;
2251 while (w && (uintptr_t)dst & 7)
2254 *dst = convert_8888_to_0565 (s);
2262 __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2263 __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2265 vdest = pack_4xpacked565 (vsrc0, vsrc1);
2267 *(__m64 *)dst = vdest;
2277 *dst = convert_8888_to_0565 (s);
2287 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2288 pixman_composite_info_t *info)
2290 PIXMAN_COMPOSITE_ARGS (info);
2292 uint32_t *dst_line, *dst;
2293 uint8_t *mask_line, *mask;
2294 int dst_stride, mask_stride;
2301 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2306 mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2307 PIXMAN_FORMAT_BPP (dest_image->bits.format),
2308 dest_x, dest_y, width, height, 0);
2312 srcsrc = (uint64_t)src << 32 | src;
2314 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2315 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2317 vsrc = load8888 (&src);
2322 dst_line += dst_stride;
2324 mask_line += mask_stride;
2329 while (w && (uintptr_t)dst & 7)
2335 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2337 store8888 (dst, vdest);
2357 if (srca == 0xff && (m0 & m1) == 0xff)
2359 *(uint64_t *)dst = srcsrc;
2365 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2366 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2368 *(__m64 *)dst = pack8888 (dest0, dest1);
2372 *(uint64_t *)dst = 0;
2388 __m64 vdest = load8888 (dst);
2390 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2391 store8888 (dst, vdest);
2404 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2405 pixman_composite_info_t *info)
2407 PIXMAN_COMPOSITE_ARGS (info);
2409 uint16_t *dst_line, *dst;
2410 uint8_t *mask_line, *mask;
2411 int dst_stride, mask_stride;
2413 __m64 vsrc, vsrca, tmp;
2418 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2424 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2425 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2427 vsrc = load8888 (&src);
2428 vsrca = expand_alpha (vsrc);
2430 tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2431 srcsrcsrcsrc = expand_alpha_rev (tmp);
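/* tmp holds the solid source packed to 565 in the low word;
 * expand_alpha_rev () broadcasts that word to all four lanes, so
 * srcsrcsrcsrc can be stored directly when four mask values are fully
 * opaque.
 */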
2436 dst_line += dst_stride;
2438 mask_line += mask_stride;
2443 while (w && (uintptr_t)dst & 7)
2450 __m64 vd = to_m64 (d);
2451 __m64 vdest = in_over (
2452 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2454 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2455 *dst = to_uint64 (vd);
2467 uint64_t m0, m1, m2, m3;
2473 if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2475 *(__m64 *)dst = srcsrcsrcsrc;
2477 else if (m0 | m1 | m2 | m3)
2479 __m64 vdest = *(__m64 *)dst;
2480 __m64 v0, v1, v2, v3;
2481 __m64 vm0, vm1, vm2, vm3;
2483 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2486 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2489 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2492 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2495 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2497 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2514 __m64 vd = to_m64 (d);
2515 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2517 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2518 *dst = to_uint64 (vd);
2531 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2532 pixman_composite_info_t *info)
2534 PIXMAN_COMPOSITE_ARGS (info);
2535 uint16_t *dst_line, *dst;
2536 uint32_t *src_line, *src;
2537 int dst_stride, src_stride;
2542 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2543 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2547 assert (src_image->drawable == mask_image->drawable);
2553 dst_line += dst_stride;
2555 src_line += src_stride;
2560 while (w && (uintptr_t)dst & 7)
2562 __m64 vsrc = load8888 (src);
2564 __m64 vdest = expand565 (to_m64 (d), 0);
2566 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2568 *dst = to_uint64 (vdest);
2579 uint32_t s0, s1, s2, s3;
2580 unsigned char a0, a1, a2, a3;
2592 if ((a0 & a1 & a2 & a3) == 0xFF)
2594 __m64 v0 = invert_colors (load8888 (&s0));
2595 __m64 v1 = invert_colors (load8888 (&s1));
2596 __m64 v2 = invert_colors (load8888 (&s2));
2597 __m64 v3 = invert_colors (load8888 (&s3));
2599 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2601 else if (s0 | s1 | s2 | s3)
2603 __m64 vdest = *(__m64 *)dst;
2604 __m64 v0, v1, v2, v3;
2606 __m64 vsrc0 = load8888 (&s0);
2607 __m64 vsrc1 = load8888 (&s1);
2608 __m64 vsrc2 = load8888 (&s2);
2609 __m64 vsrc3 = load8888 (&s3);
2611 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2613 v0 = over_rev_non_pre (vsrc0, v0);
2614 v1 = over_rev_non_pre (vsrc1, v1);
2615 v2 = over_rev_non_pre (vsrc2, v2);
2616 v3 = over_rev_non_pre (vsrc3, v3);
2618 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2630 __m64 vsrc = load8888 (src);
2632 __m64 vdest = expand565 (to_m64 (d), 0);
2634 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2636 *dst = to_uint64 (vdest);
2648 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2649 pixman_composite_info_t *info)
2651 PIXMAN_COMPOSITE_ARGS (info);
2652 uint32_t *dst_line, *dst;
2653 uint32_t *src_line, *src;
2654 int dst_stride, src_stride;
2659 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2660 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2664 assert (src_image->drawable == mask_image->drawable);
2670 dst_line += dst_stride;
2672 src_line += src_stride;
2675 while (w && (uintptr_t)dst & 7)
2677 __m64 s = load8888 (src);
2678 __m64 d = load8888 (dst);
2680 store8888 (dst, over_rev_non_pre (s, d));
2690 unsigned char a0, a1;
2699 if ((a0 & a1) == 0xFF)
2701 d0 = invert_colors (load8888 (&s0));
2702 d1 = invert_colors (load8888 (&s1));
2704 *(__m64 *)dst = pack8888 (d0, d1);
2708 __m64 vdest = *(__m64 *)dst;
2710 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2711 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2713 *(__m64 *)dst = pack8888 (d0, d1);
2723 __m64 s = load8888 (src);
2724 __m64 d = load8888 (dst);
2726 store8888 (dst, over_rev_non_pre (s, d));
2734 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2735 pixman_composite_info_t *info)
2737 PIXMAN_COMPOSITE_ARGS (info);
2740 uint32_t *mask_line;
2741 int dst_stride, mask_stride;
2746 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2751 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2752 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2754 vsrc = load8888 (&src);
2755 vsrca = expand_alpha (vsrc);
2760 uint32_t *p = (uint32_t *)mask_line;
2761 uint16_t *q = (uint16_t *)dst_line;
2763 while (twidth && ((uintptr_t)q & 7))
2765 uint32_t m = *(uint32_t *)p;
2770 __m64 vdest = expand565 (to_m64 (d), 0);
2771 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2772 *q = to_uint64 (vdest);
2782 uint32_t m0, m1, m2, m3;
2789 if ((m0 | m1 | m2 | m3))
2791 __m64 vdest = *(__m64 *)q;
2792 __m64 v0, v1, v2, v3;
2794 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2796 v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2797 v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2798 v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2799 v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2801 *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2816 __m64 vdest = expand565 (to_m64 (d), 0);
2817 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2818 *q = to_uint64 (vdest);
2826 mask_line += mask_stride;
2827 dst_line += dst_stride;
2834 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2835 pixman_composite_info_t *info)
2837 PIXMAN_COMPOSITE_ARGS (info);
2838 uint8_t *dst_line, *dst;
2839 uint8_t *mask_line, *mask;
2840 int dst_stride, mask_stride;
2846 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2847 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2849 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2853 vsrc = load8888 (&src);
2854 vsrca = expand_alpha (vsrc);
2859 dst_line += dst_stride;
2861 mask_line += mask_stride;
2864 while (w && (uintptr_t)dst & 7)
2873 m = MUL_UN8 (sa, a, tmp);
2874 d = MUL_UN8 (m, d, tmp);
2885 vmask = load8888u ((uint32_t *)mask);
2886 vdest = load8888 ((uint32_t *)dst);
2888 store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2904 m = MUL_UN8 (sa, a, tmp);
2905 d = MUL_UN8 (m, d, tmp);
2915 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2916 pixman_composite_info_t *info)
2918 PIXMAN_COMPOSITE_ARGS (info);
2919 uint8_t *dst_line, *dst;
2920 uint8_t *src_line, *src;
2921 int src_stride, dst_stride;
2924 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2925 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2930 dst_line += dst_stride;
2932 src_line += src_stride;
2935 while (w && (uintptr_t)dst & 3)
2943 *dst = MUL_UN8 (s, d, tmp);
2952 uint32_t *s = (uint32_t *)src;
2953 uint32_t *d = (uint32_t *)dst;
2955 store8888 (d, in (load8888u (s), load8888 (d)));
2970 *dst = MUL_UN8 (s, d, tmp);
2981 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2982 pixman_composite_info_t *info)
2984 PIXMAN_COMPOSITE_ARGS (info);
2985 uint8_t *dst_line, *dst;
2986 uint8_t *mask_line, *mask;
2987 int dst_stride, mask_stride;
2993 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2994 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2996 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3003 vsrc = load8888 (&src);
3004 vsrca = expand_alpha (vsrc);
3009 dst_line += dst_stride;
3011 mask_line += mask_stride;
3014 while (w && (uintptr_t)dst & 3)
3024 m = MUL_UN8 (sa, a, tmp);
3025 r = ADD_UN8 (m, d, tmp);
3036 vmask = load8888u ((uint32_t *)mask);
3037 vdest = load8888 ((uint32_t *)dst);
3039 store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3056 m = MUL_UN8 (sa, a, tmp);
3057 r = ADD_UN8 (m, d, tmp);
3067 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3068 pixman_composite_info_t *info)
3070 PIXMAN_COMPOSITE_ARGS (info);
3071 uint8_t *dst_line, *dst;
3072 uint8_t *src_line, *src;
3073 int dst_stride, src_stride;
3080 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3081 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3086 dst_line += dst_stride;
3088 src_line += src_stride;
3091 while (w && (uintptr_t)dst & 7)
3096 s = t | (0 - (t >> 8)); /* if t overflowed 8 bits, 0 - (t >> 8) is all ones: saturate to 0xff */
3106 *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3117 s = t | (0 - (t >> 8));
3130 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3131 pixman_composite_info_t *info)
3133 PIXMAN_COMPOSITE_ARGS (info);
3134 uint16_t *dst_line, *dst;
3136 uint16_t *src_line, *src;
3138 int dst_stride, src_stride;
3143 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3144 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3149 dst_line += dst_stride;
3151 src_line += src_stride;
3154 while (w && (uintptr_t)dst & 7)
3160 s = convert_0565_to_8888 (s);
3163 d = convert_0565_to_8888 (d);
3164 UN8x4_ADD_UN8x4 (s, d);
3166 *dst = convert_8888_to_0565 (s);
3174 __m64 vdest = *(__m64 *)dst;
3175 __m64 vsrc = ldq_u ((__m64 *)src);
3179 expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3180 expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3182 vd0 = _mm_adds_pu8 (vd0, vs0);
3183 vd1 = _mm_adds_pu8 (vd1, vs1);
3185 *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3198 s = convert_0565_to_8888 (s);
3201 d = convert_0565_to_8888 (d);
3202 UN8x4_ADD_UN8x4 (s, d);
3204 *dst = convert_8888_to_0565 (s);
3214 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3215 pixman_composite_info_t *info)
3217 PIXMAN_COMPOSITE_ARGS (info);
3218 uint32_t *dst_line, *dst;
3219 uint32_t *src_line, *src;
3220 int dst_stride, src_stride;
3225 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3226 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3231 dst_line += dst_stride;
3233 src_line += src_stride;
3236 while (w && (uintptr_t)dst & 7)
3238 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3239 load ((const uint32_t *)dst)));
3247 *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3255 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3256 load ((const uint32_t *)dst)));
3264 static pixman_bool_t
3265 mmx_blt (pixman_implementation_t *imp,
3266 uint32_t * src_bits,
3267 uint32_t * dst_bits,
3279 uint8_t * src_bytes;
3280 uint8_t * dst_bytes;
3283 if (src_bpp != dst_bpp)
3288 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3289 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3290 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3291 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3292 byte_width = 2 * width;
3296 else if (src_bpp == 32)
3298 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3299 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3300 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3301 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3302 byte_width = 4 * width;
3314 uint8_t *s = src_bytes;
3315 uint8_t *d = dst_bytes;
3316 src_bytes += src_stride;
3317 dst_bytes += dst_stride;
3320 if (w >= 1 && ((uintptr_t)d & 1))
3322 *(uint8_t *)d = *(uint8_t *)s;
3328 if (w >= 2 && ((uintptr_t)d & 3))
3330 *(uint16_t *)d = *(uint16_t *)s;
3336 while (w >= 4 && ((uintptr_t)d & 7))
3338 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3347 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3349 "movq (%1), %%mm0\n"
3350 "movq 8(%1), %%mm1\n"
3351 "movq 16(%1), %%mm2\n"
3352 "movq 24(%1), %%mm3\n"
3353 "movq 32(%1), %%mm4\n"
3354 "movq 40(%1), %%mm5\n"
3355 "movq 48(%1), %%mm6\n"
3356 "movq 56(%1), %%mm7\n"
3358 "movq %%mm0, (%0)\n"
3359 "movq %%mm1, 8(%0)\n"
3360 "movq %%mm2, 16(%0)\n"
3361 "movq %%mm3, 24(%0)\n"
3362 "movq %%mm4, 32(%0)\n"
3363 "movq %%mm5, 40(%0)\n"
3364 "movq %%mm6, 48(%0)\n"
3365 "movq %%mm7, 56(%0)\n"
3369 "%mm0", "%mm1", "%mm2", "%mm3",
3370 "%mm4", "%mm5", "%mm6", "%mm7");
3372 __m64 v0 = ldq_u ((__m64 *)(s + 0));
3373 __m64 v1 = ldq_u ((__m64 *)(s + 8));
3374 __m64 v2 = ldq_u ((__m64 *)(s + 16));
3375 __m64 v3 = ldq_u ((__m64 *)(s + 24));
3376 __m64 v4 = ldq_u ((__m64 *)(s + 32));
3377 __m64 v5 = ldq_u ((__m64 *)(s + 40));
3378 __m64 v6 = ldq_u ((__m64 *)(s + 48));
3379 __m64 v7 = ldq_u ((__m64 *)(s + 56));
3380 *(__m64 *)(d + 0) = v0;
3381 *(__m64 *)(d + 8) = v1;
3382 *(__m64 *)(d + 16) = v2;
3383 *(__m64 *)(d + 24) = v3;
3384 *(__m64 *)(d + 32) = v4;
3385 *(__m64 *)(d + 40) = v5;
3386 *(__m64 *)(d + 48) = v6;
3387 *(__m64 *)(d + 56) = v7;
3396 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3404 *(uint16_t *)d = *(uint16_t *)s;
3417 mmx_composite_copy_area (pixman_implementation_t *imp,
3418 pixman_composite_info_t *info)
3420 PIXMAN_COMPOSITE_ARGS (info);
3422 mmx_blt (imp, src_image->bits.bits,
3423 dest_image->bits.bits,
3424 src_image->bits.rowstride,
3425 dest_image->bits.rowstride,
3426 PIXMAN_FORMAT_BPP (src_image->bits.format),
3427 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3428 src_x, src_y, dest_x, dest_y, width, height);
3432 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3433 pixman_composite_info_t *info)
3435 PIXMAN_COMPOSITE_ARGS (info);
3436 uint32_t *src, *src_line;
3437 uint32_t *dst, *dst_line;
3438 uint8_t *mask, *mask_line;
3439 int src_stride, mask_stride, dst_stride;
3442 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3443 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3444 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3449 src_line += src_stride;
3451 dst_line += dst_stride;
3453 mask_line += mask_stride;
3463 uint32_t ssrc = *src | 0xff000000;
3464 __m64 s = load8888 (&ssrc);
3472 __m64 sa = expand_alpha (s);
3473 __m64 vm = expand_alpha_rev (to_m64 (m));
3474 __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3476 store8888 (dst, vdest);
3490 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3491 pixman_composite_info_t *info)
3493 PIXMAN_COMPOSITE_ARGS (info);
3495 uint32_t *dst_line, *dst;
3502 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3507 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3509 vsrc = load8888 (&src);
3514 dst_line += dst_stride;
3519 while (w && (uintptr_t)dst & 7)
3521 __m64 vdest = load8888 (dst);
3523 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3531 __m64 vdest = *(__m64 *)dst;
3532 __m64 dest0 = expand8888 (vdest, 0);
3533 __m64 dest1 = expand8888 (vdest, 1);
3536 dest0 = over (dest0, expand_alpha (dest0), vsrc);
3537 dest1 = over (dest1, expand_alpha (dest1), vsrc);
3539 *(__m64 *)dst = pack8888 (dest0, dest1);
3549 __m64 vdest = load8888 (dst);
3551 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3558 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3559 #define BMSK (BSHIFT - 1)
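/* A sketch of what the BILINEAR_* macros below compute, per channel, with
 * vertical weights wt/wb and horizontal phase x taken from the top
 * BILINEAR_INTERPOLATION_BITS of vx (tl/tr are the two top texels, bl/br
 * the two bottom ones):
 *
 *     left  = tl * wt + bl * wb
 *     right = tr * wt + br * wb
 *     pix   = (left * (BSHIFT - x) + right * x)
 *             >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * The weight pair (BSHIFT - x, x) is built branchlessly: mm_xorc7 flips x
 * into BMSK - x and mm_addc7 adds 1 to that lane, giving BSHIFT - x
 * alongside x for _mm_madd_pi16.
 */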
3561 #define BILINEAR_DECLARE_VARIABLES \
3562 const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \
3563 const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \
3564 const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \
3565 const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \
3566 const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \
3567 const __m64 mm_zero = _mm_setzero_si64 (); \
3568 __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3570 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
3572 /* fetch 2x2 pixel block into 2 mmx registers */ \
3573 __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \
3574 __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \
3575 /* vertical interpolation */ \
3576 __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
3577 __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
3578 __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \
3579 __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
3580 __m64 hi = _mm_add_pi16 (t_hi, b_hi); \
3581 __m64 lo = _mm_add_pi16 (t_lo, b_lo); \
3582 /* calculate horizontal weights */ \
3583 __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \
3584 _mm_srli_pi16 (mm_x, \
3585 16 - BILINEAR_INTERPOLATION_BITS))); \
3586 /* horizontal interpolation */ \
3587 __m64 p = _mm_unpacklo_pi16 (lo, hi); \
3588 __m64 q = _mm_unpackhi_pi16 (lo, hi); \
3590 lo = _mm_madd_pi16 (p, mm_wh); \
3591 hi = _mm_madd_pi16 (q, mm_wh); \
3592 mm_x = _mm_add_pi16 (mm_x, mm_ux); \
3593 /* shift and pack the result */ \
3594 hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \
3595 lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \
3596 lo = _mm_packs_pi32 (lo, hi); \
3597 lo = _mm_packs_pu16 (lo, lo); \
#define BILINEAR_SKIP_ONE_PIXEL()					\
do {									\
    vx += unit_x;							\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);					\
} while (0)
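/* A minimal scalar sketch of what BILINEAR_INTERPOLATE_ONE_PIXEL computes
 * for a single channel, kept here purely as documentation.  The helper name
 * and the #if 0 guard are illustrative, not part of the pixman API; tl/tr
 * are the top-left/top-right samples, bl/br the bottom pair.
 */
#if 0
static uint32_t
bilinear_channel_reference (uint32_t tl, uint32_t tr,
			    uint32_t bl, uint32_t br,
			    int wt, int wb, int wl, int wr)
{
    /* wt + wb == BSHIFT and wl + wr == BSHIFT, so the product of two
     * weights needs a shift by 2 * BILINEAR_INTERPOLATION_BITS. */
    uint32_t left  = tl * wt + bl * wb;   /* vertical pass, left column  */
    uint32_t right = tr * wt + br * wb;   /* vertical pass, right column */
    return (left * wl + right * wr) >> (2 * BILINEAR_INTERPOLATION_BITS);
}
#endif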
static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
					    const uint32_t * mask,
					    const uint32_t * src_top,
					    const uint32_t * src_bottom,
					    int32_t          w,
					    int              wt,
					    int              wb,
					    pixman_fixed_t   vx,
					    pixman_fixed_t   unit_x,
					    pixman_fixed_t   max_vx,
					    pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix;

    while (w--) {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
	store (dst, pix);
	dst++;
    }

    _mm_empty ();
}
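/* FAST_BILINEAR_MAINLOOP_COMMON (from pixman-inlines.h) wraps a scanline
 * function like the one above into a full composite entry point: it walks
 * the destination rows, computes the per-row top/bottom weights, and clamps
 * or wraps the source coordinates according to the repeat mode named in its
 * last-but-one argument (COVER, PAD, NONE or NORMAL).
 */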
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)
static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
					     const uint32_t * mask,
					     const uint32_t * src_top,
					     const uint32_t * src_bottom,
					     int32_t          w,
					     int              wt,
					     int              wb,
					     pixman_fixed_t   vx,
					     pixman_fixed_t   unit_x,
					     pixman_fixed_t   max_vx,
					     pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;

    while (w) {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	/* a fully transparent source pixel leaves the destination as-is */
	if (!is_zero (pix1)) {
	    pix2 = load (dst);
	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
	}

	w--;
	dst++;
    }

    _mm_empty ();
}
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)
static force_inline void
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
					       const uint8_t  * mask,
					       const uint32_t * src_top,
					       const uint32_t * src_bottom,
					       int32_t          w,
					       int              wt,
					       int              wb,
					       pixman_fixed_t   vx,
					       pixman_fixed_t   unit_x,
					       pixman_fixed_t   max_vx,
					       pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;
    uint32_t m;

    while (w--) {
	m = (uint32_t) *mask++;

	if (m) {
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	    /* an opaque mask over an opaque pixel is a plain copy */
	    if (m == 0xff && is_opaque (pix1)) {
		store (dst, pix1);
	    } else {
		__m64 ms, md, ma, msa;

		pix2 = load (dst);
		ma = expand_alpha_rev (to_m64 (m));
		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());

		msa = expand_alpha (ms);

		store8888 (dst, (in_over (ms, msa, ma, md)));
	    }
	} else {
	    /* zero mask: skip the pixel but keep the coordinates stepping */
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	dst++;
    }

    _mm_empty ();
}
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
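/* Source iterators.  Each fetcher below converts one scanline of the image
 * into the iterator's 32-bit a8r8g8b8 buffer for the general compositing
 * pipeline: a scalar lead-in runs until the destination is aligned, an
 * unrolled MMX loop handles the bulk of the row, and a scalar tail
 * finishes off the remainder.
 */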
static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 7) {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    while (w >= 8) {
	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));

	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w) {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}
static uint32_t *
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f) {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    while (w >= 4) {
	__m64 vsrc = ldq_u ((__m64 *)src);
	__m64 mm0, mm1;

	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);

	*(__m64 *)(dst + 0) = mm0;
	*(__m64 *)(dst + 2) = mm1;

	dst += 4;
	src += 4;
	w -= 4;
    }

    while (w) {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}
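/* convert_0565_to_8888 widens each channel by bit replication, e.g. a
 * 5-bit red r5 becomes (r5 << 3) | (r5 >> 2), so 0x1f maps to 0xff;
 * expand_4xpacked565 performs the same expansion for four pixels in
 * parallel, with the last argument forcing the alpha channel to opaque.
 */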
static uint32_t *
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15)) {
	*dst++ = *(src++) << 24;
	w--;
    }

    while (w >= 8) {
	__m64 mm0 = ldq_u ((__m64 *)src);

	/* interleave with zeros so each a8 byte lands in the top byte of
	 * a 32-bit pixel: 8 x a8 -> 8 x (a8 << 24) */
	__m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64 (), mm0);
	__m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64 (), mm0);
	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm1);
	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm1);
	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm2);
	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm2);

	*(__m64 *)(dst + 0) = mm3;
	*(__m64 *)(dst + 2) = mm4;
	*(__m64 *)(dst + 4) = mm5;
	*(__m64 *)(dst + 6) = mm6;

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w) {
	*dst++ = *(src++) << 24;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}
#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t mmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),

    { PIXMAN_OP_NONE },
};
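/* The fast-path table is searched in order and the first entry whose
 * operator, formats and flags all match is used, so more specialized
 * entries must come before more general ones.
 */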
pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->iter_info = mmx_iters;

    return imp;
}
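/* A sketch of how this constructor is meant to be used: the MMX
 * implementation is layered over a more general fallback and delegates to
 * it for anything the fast paths and combiners above do not cover.  The
 * have_mmx() check below is hypothetical; pixman's real CPU detection
 * lives elsewhere.
 */
#if 0
pixman_implementation_t *imp = _pixman_implementation_create_general ();

if (have_mmx ())    /* hypothetical CPU-feature check */
    imp = _pixman_implementation_create_mmx (imp);
#endif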
#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */