2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
33 /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
34 #define PSHUFD_IS_FAST 0
36 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
37 #include <emmintrin.h> /* for SSE2 intrinsics */
38 #include "pixman-private.h"
39 #include "pixman-combine32.h"
40 #include "pixman-inlines.h"
/* File-scope SSE2 constant masks used by the pack/unpack and blend helpers
 * below.  They are written once at implementation-setup time (initializer
 * not visible in this chunk) and read-only afterwards.
 */
42 static __m128i mask_0080;
43 static __m128i mask_00ff;
44 static __m128i mask_0101;
45 static __m128i mask_ffff;
46 static __m128i mask_ff000000;
47 static __m128i mask_alpha;
/* Channel masks for r5g6b5 pack/unpack. */
49 static __m128i mask_565_r;
50 static __m128i mask_565_g1, mask_565_g2;
51 static __m128i mask_565_b;
52 static __m128i mask_red;
53 static __m128i mask_green;
54 static __m128i mask_blue;
/* Used when widening 565 to 8888 to replicate the top bits downward. */
56 static __m128i mask_565_fix_rb;
57 static __m128i mask_565_fix_g;
59 static __m128i mask_565_rb;
60 static __m128i mask_565_pack_multiplier;
62 static force_inline __m128i
63 unpack_32_1x128 (uint32_t data)
65 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
68 static force_inline void
69 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
71 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
72 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
75 static force_inline __m128i
76 unpack_565_to_8888 (__m128i lo)
78 __m128i r, g, b, rb, t;
80 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
81 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
82 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
84 rb = _mm_or_si128 (r, b);
85 t = _mm_and_si128 (rb, mask_565_fix_rb);
86 t = _mm_srli_epi32 (t, 5);
87 rb = _mm_or_si128 (rb, t);
89 t = _mm_and_si128 (g, mask_565_fix_g);
90 t = _mm_srli_epi32 (t, 6);
91 g = _mm_or_si128 (g, t);
93 return _mm_or_si128 (rb, g);
96 static force_inline void
97 unpack_565_128_4x128 (__m128i data,
105 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
106 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
108 lo = unpack_565_to_8888 (lo);
109 hi = unpack_565_to_8888 (hi);
111 unpack_128_2x128 (lo, data0, data1);
112 unpack_128_2x128 (hi, data2, data3);
115 static force_inline uint16_t
116 pack_565_32_16 (uint32_t pixel)
118 return (uint16_t) (((pixel >> 8) & 0xf800) |
119 ((pixel >> 5) & 0x07e0) |
120 ((pixel >> 3) & 0x001f));
123 static force_inline __m128i
124 pack_2x128_128 (__m128i lo, __m128i hi)
126 return _mm_packus_epi16 (lo, hi);
129 static force_inline __m128i
130 pack_565_2packedx128_128 (__m128i lo, __m128i hi)
132 __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
133 __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
135 __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
136 __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
138 __m128i g0 = _mm_and_si128 (lo, mask_green);
139 __m128i g1 = _mm_and_si128 (hi, mask_green);
141 t0 = _mm_or_si128 (t0, g0);
142 t1 = _mm_or_si128 (t1, g1);
144 /* Simulates _mm_packus_epi32 */
145 t0 = _mm_slli_epi32 (t0, 16 - 5);
146 t1 = _mm_slli_epi32 (t1, 16 - 5);
147 t0 = _mm_srai_epi32 (t0, 16);
148 t1 = _mm_srai_epi32 (t1, 16);
149 return _mm_packs_epi32 (t0, t1);
152 static force_inline __m128i
153 pack_565_2x128_128 (__m128i lo, __m128i hi)
156 __m128i r, g1, g2, b;
158 data = pack_2x128_128 (lo, hi);
160 r = _mm_and_si128 (data, mask_565_r);
161 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
162 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
163 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
165 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
168 static force_inline __m128i
169 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
171 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
172 pack_565_2x128_128 (*xmm2, *xmm3));
175 static force_inline int
176 is_opaque (__m128i x)
178 __m128i ffs = _mm_cmpeq_epi8 (x, x);
180 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
183 static force_inline int
186 return _mm_movemask_epi8 (
187 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
190 static force_inline int
191 is_transparent (__m128i x)
193 return (_mm_movemask_epi8 (
194 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
197 static force_inline __m128i
198 expand_pixel_32_1x128 (uint32_t data)
200 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
203 static force_inline __m128i
204 expand_alpha_1x128 (__m128i data)
206 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
207 _MM_SHUFFLE (3, 3, 3, 3)),
208 _MM_SHUFFLE (3, 3, 3, 3));
211 static force_inline void
212 expand_alpha_2x128 (__m128i data_lo,
219 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
220 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
222 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
223 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
226 static force_inline void
227 expand_alpha_rev_2x128 (__m128i data_lo,
234 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
235 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
236 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
237 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
240 static force_inline void
241 pix_multiply_2x128 (__m128i* data_lo,
250 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
251 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
252 lo = _mm_adds_epu16 (lo, mask_0080);
253 hi = _mm_adds_epu16 (hi, mask_0080);
254 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
255 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
258 static force_inline void
259 pix_add_multiply_2x128 (__m128i* src_lo,
261 __m128i* alpha_dst_lo,
262 __m128i* alpha_dst_hi,
265 __m128i* alpha_src_lo,
266 __m128i* alpha_src_hi,
270 __m128i t1_lo, t1_hi;
271 __m128i t2_lo, t2_hi;
273 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
274 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
276 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
277 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
280 static force_inline void
281 negate_2x128 (__m128i data_lo,
286 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
287 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
290 static force_inline void
291 invert_colors_2x128 (__m128i data_lo,
298 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
299 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
300 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
301 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
304 static force_inline void
305 over_2x128 (__m128i* src_lo,
314 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
316 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
318 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
319 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
322 static force_inline void
323 over_rev_non_pre_2x128 (__m128i src_lo,
329 __m128i alpha_lo, alpha_hi;
331 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
333 lo = _mm_or_si128 (alpha_lo, mask_alpha);
334 hi = _mm_or_si128 (alpha_hi, mask_alpha);
336 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
338 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
340 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
343 static force_inline void
344 in_over_2x128 (__m128i* src_lo,
356 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
357 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
359 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
362 /* load 4 pixels from a 16-byte boundary aligned address */
363 static force_inline __m128i
364 load_128_aligned (__m128i* src)
366 return _mm_load_si128 (src);
369 /* load 4 pixels from a unaligned address */
370 static force_inline __m128i
371 load_128_unaligned (const __m128i* src)
373 return _mm_loadu_si128 (src);
376 /* save 4 pixels using Write Combining memory on a 16-byte
377 * boundary aligned address
379 static force_inline void
380 save_128_write_combining (__m128i* dst,
383 _mm_stream_si128 (dst, data);
386 /* save 4 pixels on a 16-byte boundary aligned address */
387 static force_inline void
388 save_128_aligned (__m128i* dst,
391 _mm_store_si128 (dst, data);
394 /* save 4 pixels on a unaligned address */
395 static force_inline void
396 save_128_unaligned (__m128i* dst,
399 _mm_storeu_si128 (dst, data);
402 static force_inline __m128i
403 load_32_1x128 (uint32_t data)
405 return _mm_cvtsi32_si128 (data);
408 static force_inline __m128i
409 expand_alpha_rev_1x128 (__m128i data)
411 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
414 static force_inline __m128i
415 expand_pixel_8_1x128 (uint8_t data)
417 return _mm_shufflelo_epi16 (
418 unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
421 static force_inline __m128i
422 pix_multiply_1x128 (__m128i data,
425 return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
430 static force_inline __m128i
431 pix_add_multiply_1x128 (__m128i* src,
436 __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
437 __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
439 return _mm_adds_epu8 (t1, t2);
442 static force_inline __m128i
443 negate_1x128 (__m128i data)
445 return _mm_xor_si128 (data, mask_00ff);
448 static force_inline __m128i
449 invert_colors_1x128 (__m128i data)
451 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
454 static force_inline __m128i
455 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
457 return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
460 static force_inline __m128i
461 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
463 return over_1x128 (pix_multiply_1x128 (*src, *mask),
464 pix_multiply_1x128 (*alpha, *mask),
468 static force_inline __m128i
469 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
471 __m128i alpha = expand_alpha_1x128 (src);
473 return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
474 _mm_or_si128 (alpha, mask_alpha)),
479 static force_inline uint32_t
480 pack_1x128_32 (__m128i data)
482 return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
485 static force_inline __m128i
486 expand565_16_1x128 (uint16_t pixel)
488 __m128i m = _mm_cvtsi32_si128 (pixel);
490 m = unpack_565_to_8888 (m);
492 return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
495 static force_inline uint32_t
496 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
509 xmms = unpack_32_1x128 (src);
510 return pack_1x128_32 (
511 over_1x128 (xmms, expand_alpha_1x128 (xmms),
512 unpack_32_1x128 (dst)));
518 static force_inline uint32_t
519 combine1 (const uint32_t *ps, const uint32_t *pm)
527 mm = unpack_32_1x128 (*pm);
528 mm = expand_alpha_1x128 (mm);
530 ms = unpack_32_1x128 (s);
531 ms = pix_multiply_1x128 (ms, mm);
533 s = pack_1x128_32 (ms);
539 static force_inline __m128i
540 combine4 (const __m128i *ps, const __m128i *pm)
542 __m128i xmm_src_lo, xmm_src_hi;
543 __m128i xmm_msk_lo, xmm_msk_hi;
548 xmm_msk_lo = load_128_unaligned (pm);
550 if (is_transparent (xmm_msk_lo))
551 return _mm_setzero_si128 ();
554 s = load_128_unaligned (ps);
558 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
559 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
561 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
563 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
564 &xmm_msk_lo, &xmm_msk_hi,
565 &xmm_src_lo, &xmm_src_hi);
567 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
/* OVER combiner, masked variant: leading scalar loop to 16-byte-align pd,
 * 4-pixel SIMD main loop with an is_opaque fast path, scalar tail.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
573 static force_inline void
574 core_combine_over_u_sse2_mask (uint32_t * pd,
581 /* Align dst on a 16-byte boundary */
582 while (w && ((uintptr_t)pd & 15))
585 s = combine1 (ps, pm);
588 *pd = core_combine_over_u_pixel_sse2 (s, d);
597 __m128i mask = load_128_unaligned ((__m128i *)pm);
602 __m128i src_hi, src_lo;
603 __m128i mask_hi, mask_lo;
604 __m128i alpha_hi, alpha_lo;
606 src = load_128_unaligned ((__m128i *)ps);
/* If src*mask is fully opaque, dst is overwritten wholesale. */
608 if (is_opaque (_mm_and_si128 (src, mask)))
610 save_128_aligned ((__m128i *)pd, src);
614 __m128i dst = load_128_aligned ((__m128i *)pd);
615 __m128i dst_hi, dst_lo;
617 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
618 unpack_128_2x128 (src, &src_lo, &src_hi);
620 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
621 pix_multiply_2x128 (&src_lo, &src_hi,
625 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
627 expand_alpha_2x128 (src_lo, src_hi,
628 &alpha_lo, &alpha_hi);
630 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
635 pack_2x128_128 (dst_lo, dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
647 s = combine1 (ps, pm);
650 *pd = core_combine_over_u_pixel_sse2 (s, d);
/* OVER combiner, unmasked variant: same align/SIMD/tail structure as the
 * masked version but loads src directly.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
659 static force_inline void
660 core_combine_over_u_sse2_no_mask (uint32_t * pd,
666 /* Align dst on a 16-byte boundary */
667 while (w && ((uintptr_t)pd & 15))
673 *pd = core_combine_over_u_pixel_sse2 (s, d);
682 __m128i src_hi, src_lo, dst_hi, dst_lo;
683 __m128i alpha_hi, alpha_lo;
685 src = load_128_unaligned ((__m128i *)ps);
691 save_128_aligned ((__m128i *)pd, src);
695 __m128i dst = load_128_aligned ((__m128i *)pd);
697 unpack_128_2x128 (src, &src_lo, &src_hi);
698 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
700 expand_alpha_2x128 (src_lo, src_hi,
701 &alpha_lo, &alpha_hi);
702 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
707 pack_2x128_128 (dst_lo, dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
721 *pd = core_combine_over_u_pixel_sse2 (s, d);
/* Entry point for the OVER combiner: dispatches to the masked or
 * unmasked worker.
 * NOTE(review): the `if (pm) ... else ...` lines appear to have been
 * dropped from this listing; comments only — restore from upstream.
 */
729 static force_inline void
730 sse2_combine_over_u (pixman_implementation_t *imp,
738 core_combine_over_u_sse2_mask (pd, ps, pm, w);
740 core_combine_over_u_sse2_no_mask (pd, ps, w);
/* OVER_REVERSE combiner: dst OVER src, written to pd.  Align / 4-pixel
 * SIMD loop / scalar tail structure.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
744 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
753 __m128i xmm_dst_lo, xmm_dst_hi;
754 __m128i xmm_src_lo, xmm_src_hi;
755 __m128i xmm_alpha_lo, xmm_alpha_hi;
757 /* Align dst on a 16-byte boundary */
759 ((uintptr_t)pd & 15))
762 s = combine1 (ps, pm);
764 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
773 /* I'm loading unaligned because I'm not sure
774 * about the address alignment.
776 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
777 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
779 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
780 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
782 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
783 &xmm_alpha_lo, &xmm_alpha_hi);
785 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
786 &xmm_alpha_lo, &xmm_alpha_hi,
787 &xmm_src_lo, &xmm_src_hi);
789 /* rebuild the 4 pixel data and save */
790 save_128_aligned ((__m128i*)pd,
791 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
804 s = combine1 (ps, pm);
806 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
814 static force_inline uint32_t
815 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
817 uint32_t maska = src >> 24;
823 else if (maska != 0xff)
825 return pack_1x128_32 (
826 pix_multiply_1x128 (unpack_32_1x128 (dst),
827 expand_alpha_1x128 (unpack_32_1x128 (src))));
/* IN combiner: src scaled by dst's alpha.  Align / 4-pixel SIMD loop /
 * scalar tail structure.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
834 sse2_combine_in_u (pixman_implementation_t *imp,
843 __m128i xmm_src_lo, xmm_src_hi;
844 __m128i xmm_dst_lo, xmm_dst_hi;
846 while (w && ((uintptr_t)pd & 15))
848 s = combine1 (ps, pm);
851 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
860 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
861 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
863 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
864 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
866 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
867 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
868 &xmm_dst_lo, &xmm_dst_hi,
869 &xmm_dst_lo, &xmm_dst_hi);
871 save_128_aligned ((__m128i*)pd,
872 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
883 s = combine1 (ps, pm);
886 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
/* IN_REVERSE combiner: dst scaled by src's alpha.  Align / 4-pixel SIMD
 * loop / scalar tail structure.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
895 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
904 __m128i xmm_src_lo, xmm_src_hi;
905 __m128i xmm_dst_lo, xmm_dst_hi;
907 while (w && ((uintptr_t)pd & 15))
909 s = combine1 (ps, pm);
912 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
921 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
922 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
924 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
925 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
927 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
928 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
929 &xmm_src_lo, &xmm_src_hi,
930 &xmm_dst_lo, &xmm_dst_hi);
933 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
944 s = combine1 (ps, pm);
947 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
/* OUT_REVERSE combiner: dst scaled by (255 - src alpha).  Align /
 * 4-pixel SIMD loop / scalar tail structure.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
956 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
963 while (w && ((uintptr_t)pd & 15))
965 uint32_t s = combine1 (ps, pm);
968 *pd++ = pack_1x128_32 (
970 unpack_32_1x128 (d), negate_1x128 (
971 expand_alpha_1x128 (unpack_32_1x128 (s)))));
981 __m128i xmm_src_lo, xmm_src_hi;
982 __m128i xmm_dst_lo, xmm_dst_hi;
984 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
985 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
987 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
988 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
990 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
991 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
993 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
994 &xmm_src_lo, &xmm_src_hi,
995 &xmm_dst_lo, &xmm_dst_hi);
998 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
1010 uint32_t s = combine1 (ps, pm);
1013 *pd++ = pack_1x128_32 (
1014 pix_multiply_1x128 (
1015 unpack_32_1x128 (d), negate_1x128 (
1016 expand_alpha_1x128 (unpack_32_1x128 (s)))));
/* OUT combiner: src scaled by (255 - dst alpha).  Align / 4-pixel SIMD
 * loop / scalar tail structure.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1025 sse2_combine_out_u (pixman_implementation_t *imp,
1028 const uint32_t * ps,
1029 const uint32_t * pm,
1032 while (w && ((uintptr_t)pd & 15))
1034 uint32_t s = combine1 (ps, pm);
1037 *pd++ = pack_1x128_32 (
1038 pix_multiply_1x128 (
1039 unpack_32_1x128 (s), negate_1x128 (
1040 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1049 __m128i xmm_src_lo, xmm_src_hi;
1050 __m128i xmm_dst_lo, xmm_dst_hi;
1052 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1053 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1055 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1056 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1058 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1059 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1061 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1062 &xmm_dst_lo, &xmm_dst_hi,
1063 &xmm_dst_lo, &xmm_dst_hi);
1066 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
1077 uint32_t s = combine1 (ps, pm);
1080 *pd++ = pack_1x128_32 (
1081 pix_multiply_1x128 (
1082 unpack_32_1x128 (s), negate_1x128 (
1083 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1091 static force_inline uint32_t
1092 core_combine_atop_u_pixel_sse2 (uint32_t src,
1095 __m128i s = unpack_32_1x128 (src);
1096 __m128i d = unpack_32_1x128 (dst);
1098 __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1099 __m128i da = expand_alpha_1x128 (d);
1101 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
/* ATOP combiner.  Align / 4-pixel SIMD loop / scalar tail structure.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1105 sse2_combine_atop_u (pixman_implementation_t *imp,
1108 const uint32_t * ps,
1109 const uint32_t * pm,
1114 __m128i xmm_src_lo, xmm_src_hi;
1115 __m128i xmm_dst_lo, xmm_dst_hi;
1116 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1117 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1119 while (w && ((uintptr_t)pd & 15))
1121 s = combine1 (ps, pm);
1124 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1133 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1134 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1136 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1137 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1139 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1140 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1141 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1142 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1144 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1145 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1147 pix_add_multiply_2x128 (
1148 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1149 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1150 &xmm_dst_lo, &xmm_dst_hi);
1153 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
1164 s = combine1 (ps, pm);
1167 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1175 static force_inline uint32_t
1176 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1179 __m128i s = unpack_32_1x128 (src);
1180 __m128i d = unpack_32_1x128 (dst);
1182 __m128i sa = expand_alpha_1x128 (s);
1183 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1185 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
/* ATOP_REVERSE combiner.  Align / 4-pixel SIMD loop / scalar tail.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1189 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1192 const uint32_t * ps,
1193 const uint32_t * pm,
1198 __m128i xmm_src_lo, xmm_src_hi;
1199 __m128i xmm_dst_lo, xmm_dst_hi;
1200 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1201 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1203 while (w && ((uintptr_t)pd & 15))
1205 s = combine1 (ps, pm);
1208 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1217 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1218 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1220 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1221 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1223 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1224 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1225 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1226 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1228 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1229 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1231 pix_add_multiply_2x128 (
1232 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1233 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1234 &xmm_dst_lo, &xmm_dst_hi);
1237 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
1248 s = combine1 (ps, pm);
1251 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1259 static force_inline uint32_t
1260 core_combine_xor_u_pixel_sse2 (uint32_t src,
1263 __m128i s = unpack_32_1x128 (src);
1264 __m128i d = unpack_32_1x128 (dst);
1266 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1267 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1269 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
/* XOR combiner.  Align / 4-pixel SIMD loop / scalar tail structure.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1273 sse2_combine_xor_u (pixman_implementation_t *imp,
1276 const uint32_t * src,
1277 const uint32_t * mask,
1283 const uint32_t* ps = src;
1284 const uint32_t* pm = mask;
1286 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1287 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1288 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1289 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1291 while (w && ((uintptr_t)pd & 15))
1293 s = combine1 (ps, pm);
1296 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1305 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1306 xmm_dst = load_128_aligned ((__m128i*) pd);
1308 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1309 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1311 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1312 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1313 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1314 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1316 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1317 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1318 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1319 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1321 pix_add_multiply_2x128 (
1322 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1323 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1324 &xmm_dst_lo, &xmm_dst_hi);
1327 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
1338 s = combine1 (ps, pm);
1341 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
/* ADD combiner: saturated per-channel addition of src and dst.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1349 static force_inline void
1350 sse2_combine_add_u (pixman_implementation_t *imp,
1353 const uint32_t * src,
1354 const uint32_t * mask,
1360 const uint32_t* ps = src;
1361 const uint32_t* pm = mask;
1363 while (w && (uintptr_t)pd & 15)
1365 s = combine1 (ps, pm);
1371 *pd++ = _mm_cvtsi128_si32 (
1372 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1380 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1383 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
/* Scalar tail for the remaining (w % 4) pixels. */
1394 s = combine1 (ps, pm);
1398 *pd++ = _mm_cvtsi128_si32 (
1399 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1405 static force_inline uint32_t
1406 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1409 __m128i ms = unpack_32_1x128 (src);
1410 __m128i md = unpack_32_1x128 (dst);
1411 uint32_t sa = src >> 24;
1412 uint32_t da = ~dst >> 24;
1416 ms = pix_multiply_1x128 (
1417 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1420 return pack_1x128_32 (_mm_adds_epu16 (md, ms));
/* SATURATE combiner: SIMD fast path when no pixel needs scaling,
 * otherwise falls back to four scalar pixels.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1424 sse2_combine_saturate_u (pixman_implementation_t *imp,
1427 const uint32_t * ps,
1428 const uint32_t * pm,
1434 __m128i xmm_src, xmm_dst;
1436 while (w && (uintptr_t)pd & 15)
1438 s = combine1 (ps, pm);
1441 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1450 xmm_dst = load_128_aligned ((__m128i*)pd);
1451 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1453 pack_cmp = _mm_movemask_epi8 (
1455 _mm_srli_epi32 (xmm_src, 24),
1456 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1458 /* if some alpha src is greater than respective ~alpha dst */
1461 s = combine1 (ps++, pm);
1463 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1467 s = combine1 (ps++, pm);
1469 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1473 s = combine1 (ps++, pm);
1475 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1479 s = combine1 (ps++, pm);
1481 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
/* All four pixels safe: plain saturated add. */
1487 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
/* Scalar tail for the remaining (w % 4) pixels. */
1500 s = combine1 (ps, pm);
1503 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
/* SRC combiner, component-alpha: dst = src * mask per channel.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1511 sse2_combine_src_ca (pixman_implementation_t *imp,
1514 const uint32_t * ps,
1515 const uint32_t * pm,
1520 __m128i xmm_src_lo, xmm_src_hi;
1521 __m128i xmm_mask_lo, xmm_mask_hi;
1522 __m128i xmm_dst_lo, xmm_dst_hi;
1524 while (w && (uintptr_t)pd & 15)
1528 *pd++ = pack_1x128_32 (
1529 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1535 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1536 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1538 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1539 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1541 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1542 &xmm_mask_lo, &xmm_mask_hi,
1543 &xmm_dst_lo, &xmm_dst_hi);
1546 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
1558 *pd++ = pack_1x128_32 (
1559 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1564 static force_inline uint32_t
1565 core_combine_over_ca_pixel_sse2 (uint32_t src,
1569 __m128i s = unpack_32_1x128 (src);
1570 __m128i expAlpha = expand_alpha_1x128 (s);
1571 __m128i unpk_mask = unpack_32_1x128 (mask);
1572 __m128i unpk_dst = unpack_32_1x128 (dst);
1574 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
/* OVER combiner, component-alpha.  Align / 4-pixel SIMD loop / tail.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1578 sse2_combine_over_ca (pixman_implementation_t *imp,
1581 const uint32_t * ps,
1582 const uint32_t * pm,
1587 __m128i xmm_alpha_lo, xmm_alpha_hi;
1588 __m128i xmm_src_lo, xmm_src_hi;
1589 __m128i xmm_dst_lo, xmm_dst_hi;
1590 __m128i xmm_mask_lo, xmm_mask_hi;
1592 while (w && (uintptr_t)pd & 15)
1598 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1604 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1605 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1606 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1608 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1609 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1610 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1612 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1613 &xmm_alpha_lo, &xmm_alpha_hi);
1615 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1616 &xmm_alpha_lo, &xmm_alpha_hi,
1617 &xmm_mask_lo, &xmm_mask_hi,
1618 &xmm_dst_lo, &xmm_dst_hi);
1621 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
1635 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1640 static force_inline uint32_t
1641 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1645 __m128i d = unpack_32_1x128 (dst);
1647 return pack_1x128_32 (
1648 over_1x128 (d, expand_alpha_1x128 (d),
1649 pix_multiply_1x128 (unpack_32_1x128 (src),
1650 unpack_32_1x128 (mask))));
/* OVER_REVERSE combiner, component-alpha.  Align / SIMD loop / tail.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1654 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1657 const uint32_t * ps,
1658 const uint32_t * pm,
1663 __m128i xmm_alpha_lo, xmm_alpha_hi;
1664 __m128i xmm_src_lo, xmm_src_hi;
1665 __m128i xmm_dst_lo, xmm_dst_hi;
1666 __m128i xmm_mask_lo, xmm_mask_hi;
1668 while (w && (uintptr_t)pd & 15)
1674 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1680 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1681 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1682 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1684 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1685 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1686 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1688 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1689 &xmm_alpha_lo, &xmm_alpha_hi);
1690 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1691 &xmm_mask_lo, &xmm_mask_hi,
1692 &xmm_mask_lo, &xmm_mask_hi);
1694 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1695 &xmm_alpha_lo, &xmm_alpha_hi,
1696 &xmm_mask_lo, &xmm_mask_hi);
1699 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
1713 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
/* IN combiner, component-alpha: dst = (src * mask) * dst_alpha.
 * NOTE(review): this listing has dropped lines (braces, loop headers and
 * pointer increments); comments only — restore from upstream before use.
 */
1719 sse2_combine_in_ca (pixman_implementation_t *imp,
1722 const uint32_t * ps,
1723 const uint32_t * pm,
1728 __m128i xmm_alpha_lo, xmm_alpha_hi;
1729 __m128i xmm_src_lo, xmm_src_hi;
1730 __m128i xmm_dst_lo, xmm_dst_hi;
1731 __m128i xmm_mask_lo, xmm_mask_hi;
1733 while (w && (uintptr_t)pd & 15)
1739 *pd++ = pack_1x128_32 (
1740 pix_multiply_1x128 (
1741 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1742 expand_alpha_1x128 (unpack_32_1x128 (d))));
1749 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1750 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1751 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1753 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1754 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1755 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1757 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1758 &xmm_alpha_lo, &xmm_alpha_hi);
1760 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1761 &xmm_mask_lo, &xmm_mask_hi,
1762 &xmm_dst_lo, &xmm_dst_hi);
1764 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1765 &xmm_alpha_lo, &xmm_alpha_hi,
1766 &xmm_dst_lo, &xmm_dst_hi);
1769 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* Scalar tail for the remaining (w % 4) pixels. */
1783 *pd++ = pack_1x128_32 (
1784 pix_multiply_1x128 (
1785 pix_multiply_1x128 (
1786 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1787 expand_alpha_1x128 (unpack_32_1x128 (d))));
/* Component-alpha IN-reverse: per channel, *pd = dst * (mask * alpha(src)).
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
1794 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1797 const uint32_t * ps,
1798 const uint32_t * pm,
1803 __m128i xmm_alpha_lo, xmm_alpha_hi;
1804 __m128i xmm_src_lo, xmm_src_hi;
1805 __m128i xmm_dst_lo, xmm_dst_hi;
1806 __m128i xmm_mask_lo, xmm_mask_hi;
/* scalar head loop until pd reaches 16-byte alignment */
1808 while (w && (uintptr_t)pd & 15)
1814 *pd++ = pack_1x128_32 (
1815 pix_multiply_1x128 (
1816 unpack_32_1x128 (d),
1817 pix_multiply_1x128 (unpack_32_1x128 (m),
1818 expand_alpha_1x128 (unpack_32_1x128 (s)))));
/* 4-pixel SIMD body */
1824 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1825 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1826 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1828 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1829 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1830 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
/* alpha(src) -> alpha; mask*alpha -> alpha; dst*alpha -> dst */
1832 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1833 &xmm_alpha_lo, &xmm_alpha_hi);
1834 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1835 &xmm_alpha_lo, &xmm_alpha_hi,
1836 &xmm_alpha_lo, &xmm_alpha_hi);
1838 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1839 &xmm_alpha_lo, &xmm_alpha_hi,
1840 &xmm_dst_lo, &xmm_dst_hi);
1843 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
1857 *pd++ = pack_1x128_32 (
1858 pix_multiply_1x128 (
1859 unpack_32_1x128 (d),
1860 pix_multiply_1x128 (unpack_32_1x128 (m),
1861 expand_alpha_1x128 (unpack_32_1x128 (s)))));
/* Component-alpha OUT: per channel, *pd = (src * mask) * (1 - alpha(dst)).
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
1867 sse2_combine_out_ca (pixman_implementation_t *imp,
1870 const uint32_t * ps,
1871 const uint32_t * pm,
1876 __m128i xmm_alpha_lo, xmm_alpha_hi;
1877 __m128i xmm_src_lo, xmm_src_hi;
1878 __m128i xmm_dst_lo, xmm_dst_hi;
1879 __m128i xmm_mask_lo, xmm_mask_hi;
/* scalar head loop until pd reaches 16-byte alignment */
1881 while (w && (uintptr_t)pd & 15)
1887 *pd++ = pack_1x128_32 (
1888 pix_multiply_1x128 (
1889 pix_multiply_1x128 (
1890 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1891 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
/* 4-pixel SIMD body */
1897 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1898 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1899 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1901 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1902 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1903 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
/* alpha = 1 - alpha(dst); dst = (src*mask)*alpha */
1905 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1906 &xmm_alpha_lo, &xmm_alpha_hi);
1907 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1908 &xmm_alpha_lo, &xmm_alpha_hi);
1910 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1911 &xmm_mask_lo, &xmm_mask_hi,
1912 &xmm_dst_lo, &xmm_dst_hi);
1913 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1914 &xmm_alpha_lo, &xmm_alpha_hi,
1915 &xmm_dst_lo, &xmm_dst_hi);
1918 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
1932 *pd++ = pack_1x128_32 (
1933 pix_multiply_1x128 (
1934 pix_multiply_1x128 (
1935 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1936 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
/* Component-alpha OUT-reverse: per channel,
 * *pd = dst * (1 - mask * alpha(src)).
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
1943 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1946 const uint32_t * ps,
1947 const uint32_t * pm,
1952 __m128i xmm_alpha_lo, xmm_alpha_hi;
1953 __m128i xmm_src_lo, xmm_src_hi;
1954 __m128i xmm_dst_lo, xmm_dst_hi;
1955 __m128i xmm_mask_lo, xmm_mask_hi;
/* scalar head loop until pd reaches 16-byte alignment */
1957 while (w && (uintptr_t)pd & 15)
1963 *pd++ = pack_1x128_32 (
1964 pix_multiply_1x128 (
1965 unpack_32_1x128 (d),
1966 negate_1x128 (pix_multiply_1x128 (
1967 unpack_32_1x128 (m),
1968 expand_alpha_1x128 (unpack_32_1x128 (s))))));
/* 4-pixel SIMD body */
1974 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1975 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1976 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1978 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1979 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1980 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
/* mask = 1 - mask*alpha(src); dst = dst*mask */
1982 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1983 &xmm_alpha_lo, &xmm_alpha_hi);
1985 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1986 &xmm_alpha_lo, &xmm_alpha_hi,
1987 &xmm_mask_lo, &xmm_mask_hi);
1989 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1990 &xmm_mask_lo, &xmm_mask_hi);
1992 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1993 &xmm_mask_lo, &xmm_mask_hi,
1994 &xmm_dst_lo, &xmm_dst_hi);
1997 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
2011 *pd++ = pack_1x128_32 (
2012 pix_multiply_1x128 (
2013 unpack_32_1x128 (d),
2014 negate_1x128 (pix_multiply_1x128 (
2015 unpack_32_1x128 (m),
2016 expand_alpha_1x128 (unpack_32_1x128 (s))))));
/* Single-pixel component-alpha ATOP:
 * result = dst*(1 - mask*alpha(src)) + (src*mask)*alpha(dst),
 * computed via pix_add_multiply_1x128 (d*m + s*da).
 * NOTE(review): extract is missing lines (braces/params elided);
 * surviving code kept verbatim. */
2021 static force_inline uint32_t
2022 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2026 __m128i m = unpack_32_1x128 (mask);
2027 __m128i s = unpack_32_1x128 (src);
2028 __m128i d = unpack_32_1x128 (dst);
2029 __m128i sa = expand_alpha_1x128 (s);
2030 __m128i da = expand_alpha_1x128 (d);
2032 s = pix_multiply_1x128 (s, m);
/* m becomes the complement factor for dst */
2033 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2035 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
/* Component-alpha ATOP, scanline form; SIMD body mirrors
 * core_combine_atop_ca_pixel_sse2 on 4 pixels at a time.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2039 sse2_combine_atop_ca (pixman_implementation_t *imp,
2042 const uint32_t * ps,
2043 const uint32_t * pm,
2048 __m128i xmm_src_lo, xmm_src_hi;
2049 __m128i xmm_dst_lo, xmm_dst_hi;
2050 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2051 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2052 __m128i xmm_mask_lo, xmm_mask_hi;
/* scalar head loop until pd reaches 16-byte alignment */
2054 while (w && (uintptr_t)pd & 15)
2060 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
/* 4-pixel SIMD body */
2066 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2067 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2068 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2071 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2072 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2075 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2076 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2077 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2080 &xmm_mask_lo, &xmm_mask_hi,
2081 &xmm_src_lo, &xmm_src_hi);
2082 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2083 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2084 &xmm_mask_lo, &xmm_mask_hi);
/* mask = 1 - mask*alpha(src) */
2086 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2088 pix_add_multiply_2x128 (
2089 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2090 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2091 &xmm_dst_lo, &xmm_dst_hi);
2094 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
2108 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
/* Single-pixel component-alpha ATOP-reverse:
 * result = dst*(mask*alpha(src)) + (src*mask)*(1 - alpha(dst)).
 * NOTE(review): extract is missing lines (braces/params elided);
 * surviving code kept verbatim. */
2113 static force_inline uint32_t
2114 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2118 __m128i m = unpack_32_1x128 (mask);
2119 __m128i s = unpack_32_1x128 (src);
2120 __m128i d = unpack_32_1x128 (dst);
/* da is complemented here, unlike the forward-ATOP pixel helper */
2122 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2123 __m128i sa = expand_alpha_1x128 (s);
2125 s = pix_multiply_1x128 (s, m);
2126 m = pix_multiply_1x128 (m, sa);
2128 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
/* Component-alpha ATOP-reverse, scanline form; SIMD body mirrors
 * core_combine_reverse_atop_ca_pixel_sse2 on 4 pixels at a time.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2132 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2135 const uint32_t * ps,
2136 const uint32_t * pm,
2141 __m128i xmm_src_lo, xmm_src_hi;
2142 __m128i xmm_dst_lo, xmm_dst_hi;
2143 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2144 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2145 __m128i xmm_mask_lo, xmm_mask_hi;
/* scalar head loop until pd reaches 16-byte alignment */
2147 while (w && (uintptr_t)pd & 15)
2153 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
/* 4-pixel SIMD body */
2159 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2160 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2161 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2164 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2165 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2168 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2169 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2170 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2173 &xmm_mask_lo, &xmm_mask_hi,
2174 &xmm_src_lo, &xmm_src_hi);
2175 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2176 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2177 &xmm_mask_lo, &xmm_mask_hi);
/* here the dst alpha is complemented (reverse of combine_atop_ca) */
2179 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2180 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182 pix_add_multiply_2x128 (
2183 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2184 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2185 &xmm_dst_lo, &xmm_dst_hi);
2188 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
2202 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
/* Single-pixel component-alpha XOR:
 * result = dst*(1 - mask*alpha(src)) + (src*mask)*(1 - alpha(dst)).
 * NOTE(review): extract is missing lines (the return expression is
 * truncated mid-call); surviving code kept verbatim. */
2207 static force_inline uint32_t
2208 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2212 __m128i a = unpack_32_1x128 (mask);
2213 __m128i s = unpack_32_1x128 (src);
2214 __m128i d = unpack_32_1x128 (dst);
/* complement of mask*alpha(src): weight applied to dst */
2216 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2217 a, expand_alpha_1x128 (s)));
2218 __m128i dest = pix_multiply_1x128 (s, a);
/* complement of alpha(dst): weight applied to src*mask */
2219 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2221 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
/* Component-alpha XOR, scanline form; SIMD body mirrors
 * core_combine_xor_ca_pixel_sse2 on 4 pixels at a time.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2228 sse2_combine_xor_ca (pixman_implementation_t *imp,
2231 const uint32_t * ps,
2232 const uint32_t * pm,
2237 __m128i xmm_src_lo, xmm_src_hi;
2238 __m128i xmm_dst_lo, xmm_dst_hi;
2239 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2240 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2241 __m128i xmm_mask_lo, xmm_mask_hi;
/* scalar head loop until pd reaches 16-byte alignment */
2243 while (w && (uintptr_t)pd & 15)
2249 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
/* 4-pixel SIMD body */
2255 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2256 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2257 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2259 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2260 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2261 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2263 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2264 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2265 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2266 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2268 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2269 &xmm_mask_lo, &xmm_mask_hi,
2270 &xmm_src_lo, &xmm_src_hi);
2271 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2272 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2273 &xmm_mask_lo, &xmm_mask_hi);
/* XOR complements both weights, unlike either ATOP variant */
2275 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2276 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2277 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2278 &xmm_mask_lo, &xmm_mask_hi);
2280 pix_add_multiply_2x128 (
2281 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2282 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2283 &xmm_dst_lo, &xmm_dst_hi);
2286 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
2300 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
/* Component-alpha ADD: *pd = saturate(src*mask + dst), per channel,
 * using _mm_adds_epu8 for the saturating byte add.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2306 sse2_combine_add_ca (pixman_implementation_t *imp,
2309 const uint32_t * ps,
2310 const uint32_t * pm,
2315 __m128i xmm_src_lo, xmm_src_hi;
2316 __m128i xmm_dst_lo, xmm_dst_hi;
2317 __m128i xmm_mask_lo, xmm_mask_hi;
/* scalar head loop until pd reaches 16-byte alignment */
2319 while (w && (uintptr_t)pd & 15)
2325 *pd++ = pack_1x128_32 (
2326 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2327 unpack_32_1x128 (m)),
2328 unpack_32_1x128 (d)));
/* 4-pixel SIMD body: src*mask then saturating add with dst */
2334 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2335 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2336 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2338 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2339 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2340 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2342 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2343 &xmm_mask_lo, &xmm_mask_hi,
2344 &xmm_src_lo, &xmm_src_hi);
2347 (__m128i*)pd, pack_2x128_128 (
2348 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2349 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
/* scalar tail */
2363 *pd++ = pack_1x128_32 (
2364 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2365 unpack_32_1x128 (m)),
2366 unpack_32_1x128 (d)));
/* Broadcast a 16-bit value into all eight lanes of a 128-bit vector.
 * NOTE(review): braces elided in this extract; code kept verbatim. */
2371 static force_inline __m128i
2372 create_mask_16_128 (uint16_t mask)
2374 return _mm_set1_epi16 (mask);
2377 /* Work around a code generation bug in Sun Studio 12. */
/* On Sun Studio the function form miscompiles, so a macro is used
 * instead; both produce {mask0, mask1, mask0, mask1} in the 128-bit
 * register. NOTE(review): the #else/#endif lines are elided in this
 * extract; code kept verbatim. */
2378 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2379 # define create_mask_2x32_128(mask0, mask1) \
2380 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2382 static force_inline __m128i
2383 create_mask_2x32_128 (uint32_t mask0,
2386 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
/* Fast path: solid (n) source OVER an a8r8g8b8 destination.
 * The solid color and its alpha are expanded once outside the loops.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2391 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2392 pixman_composite_info_t *info)
2394 PIXMAN_COMPOSITE_ARGS (info);
2396 uint32_t *dst_line, *dst, d;
2399 __m128i xmm_src, xmm_alpha;
2400 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2402 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2407 PIXMAN_IMAGE_GET_LINE (
2408 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
/* hoist the loop-invariant expanded color and alpha */
2410 xmm_src = expand_pixel_32_1x128 (src);
2411 xmm_alpha = expand_alpha_1x128 (xmm_src);
2417 dst_line += dst_stride;
/* scalar head loop until dst reaches 16-byte alignment */
2420 while (w && (uintptr_t)dst & 15)
2423 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2425 unpack_32_1x128 (d)));
/* 4-pixel SIMD body */
2431 xmm_dst = load_128_aligned ((__m128i*)dst);
2433 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2435 over_2x128 (&xmm_src, &xmm_src,
2436 &xmm_alpha, &xmm_alpha,
2437 &xmm_dst_lo, &xmm_dst_hi);
2439 /* rebuid the 4 pixel data and save*/
2441 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
2450 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2452 unpack_32_1x128 (d)));
/* Fast path: solid (n) source OVER an r5g6b5 destination.
 * 565 pixels are expanded to 4x128 wide form, blended, and repacked.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2460 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2461 pixman_composite_info_t *info)
2463 PIXMAN_COMPOSITE_ARGS (info);
2465 uint16_t *dst_line, *dst, d;
2468 __m128i xmm_src, xmm_alpha;
2469 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2471 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2476 PIXMAN_IMAGE_GET_LINE (
2477 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
/* hoist the loop-invariant expanded color and alpha */
2479 xmm_src = expand_pixel_32_1x128 (src);
2480 xmm_alpha = expand_alpha_1x128 (xmm_src);
2486 dst_line += dst_stride;
/* scalar head loop until dst reaches 16-byte alignment */
2489 while (w && (uintptr_t)dst & 15)
2493 *dst++ = pack_565_32_16 (
2494 pack_1x128_32 (over_1x128 (xmm_src,
2496 expand565_16_1x128 (d))));
/* 8-pixel SIMD body: one aligned 128-bit load holds 8 565 pixels */
2502 xmm_dst = load_128_aligned ((__m128i*)dst);
2504 unpack_565_128_4x128 (xmm_dst,
2505 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507 over_2x128 (&xmm_src, &xmm_src,
2508 &xmm_alpha, &xmm_alpha,
2509 &xmm_dst0, &xmm_dst1);
2510 over_2x128 (&xmm_src, &xmm_src,
2511 &xmm_alpha, &xmm_alpha,
2512 &xmm_dst2, &xmm_dst3);
2514 xmm_dst = pack_565_4x128_128 (
2515 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2517 save_128_aligned ((__m128i*)dst, xmm_dst);
/* scalar tail */
2526 *dst++ = pack_565_32_16 (
2527 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2528 expand565_16_1x128 (d))));
/* Fast path: solid source ADDed through a component-alpha a8r8g8b8 mask
 * into an a8r8g8b8 destination.  All-zero 4-pixel mask groups are skipped
 * via the cmpeq/movemask test (pack_cmp == 0xffff means mask is zero).
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2535 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2536 pixman_composite_info_t *info)
2538 PIXMAN_COMPOSITE_ARGS (info);
2540 uint32_t *dst_line, d;
2541 uint32_t *mask_line, m;
2543 int dst_stride, mask_stride;
2547 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549 __m128i mmx_src, mmx_mask, mmx_dest;
2551 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2556 PIXMAN_IMAGE_GET_LINE (
2557 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2558 PIXMAN_IMAGE_GET_LINE (
2559 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
/* widen the solid color to 8x16 form once */
2561 xmm_src = _mm_unpacklo_epi8 (
2562 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2568 const uint32_t *pm = (uint32_t *)mask_line;
2569 uint32_t *pd = (uint32_t *)dst_line;
2571 dst_line += dst_stride;
2572 mask_line += mask_stride;
/* scalar head loop until pd reaches 16-byte alignment */
2574 while (w && (uintptr_t)pd & 15)
2582 mmx_mask = unpack_32_1x128 (m);
2583 mmx_dest = unpack_32_1x128 (d);
2585 *pd = pack_1x128_32 (
2586 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
/* 4-pixel SIMD body with zero-mask skip */
2596 xmm_mask = load_128_unaligned ((__m128i*)pm);
2600 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2602 /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
2603 if (pack_cmp != 0xffff)
2605 xmm_dst = load_128_aligned ((__m128i*)pd);
2607 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2609 pix_multiply_2x128 (&xmm_src, &xmm_src,
2610 &xmm_mask_lo, &xmm_mask_hi,
2611 &xmm_mask_lo, &xmm_mask_hi);
2612 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2615 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
/* scalar tail */
2631 mmx_mask = unpack_32_1x128 (m);
2632 mmx_dest = unpack_32_1x128 (d);
2634 *pd = pack_1x128_32 (
2635 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
/* Fast path: solid source, component-alpha a8r8g8b8 mask, OVER an
 * a8r8g8b8 destination (in_over = (src IN mask) OVER dst).  All-zero
 * 4-pixel mask groups are skipped via the cmpeq/movemask test.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2647 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2648 pixman_composite_info_t *info)
2650 PIXMAN_COMPOSITE_ARGS (info);
2652 uint32_t *dst_line, d;
2653 uint32_t *mask_line, m;
2655 int dst_stride, mask_stride;
2657 __m128i xmm_src, xmm_alpha;
2658 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2659 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2661 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2663 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2668 PIXMAN_IMAGE_GET_LINE (
2669 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2670 PIXMAN_IMAGE_GET_LINE (
2671 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
/* widen the solid color and its alpha once */
2673 xmm_src = _mm_unpacklo_epi8 (
2674 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2675 xmm_alpha = expand_alpha_1x128 (xmm_src);
2677 mmx_alpha = xmm_alpha;
2682 const uint32_t *pm = (uint32_t *)mask_line;
2683 uint32_t *pd = (uint32_t *)dst_line;
2685 dst_line += dst_stride;
2686 mask_line += mask_stride;
/* scalar head loop until pd reaches 16-byte alignment */
2688 while (w && (uintptr_t)pd & 15)
2695 mmx_mask = unpack_32_1x128 (m);
2696 mmx_dest = unpack_32_1x128 (d);
2698 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
/* 4-pixel SIMD body with zero-mask skip */
2710 xmm_mask = load_128_unaligned ((__m128i*)pm);
2714 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2716 /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
2717 if (pack_cmp != 0xffff)
2719 xmm_dst = load_128_aligned ((__m128i*)pd);
2721 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2722 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2724 in_over_2x128 (&xmm_src, &xmm_src,
2725 &xmm_alpha, &xmm_alpha,
2726 &xmm_mask_lo, &xmm_mask_hi,
2727 &xmm_dst_lo, &xmm_dst_hi);
2730 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
2745 mmx_mask = unpack_32_1x128 (m);
2746 mmx_dest = unpack_32_1x128 (d);
2748 *pd = pack_1x128_32 (
2749 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
/* Fast path: a8r8g8b8 source, solid mask (only its alpha byte is used,
 * broadcast via create_mask_16_128 (mask >> 24)), OVER a8r8g8b8 dest.
 * All-zero 4-pixel source groups are skipped (is_zero test).
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2760 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2761 pixman_composite_info_t *info)
2763 PIXMAN_COMPOSITE_ARGS (info);
2764 uint32_t *dst_line, *dst;
2765 uint32_t *src_line, *src;
2768 int dst_stride, src_stride;
2771 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2772 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2773 __m128i xmm_alpha_lo, xmm_alpha_hi;
2775 PIXMAN_IMAGE_GET_LINE (
2776 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2777 PIXMAN_IMAGE_GET_LINE (
2778 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2780 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
/* only the alpha channel of the solid mask matters */
2782 xmm_mask = create_mask_16_128 (mask >> 24);
2787 dst_line += dst_stride;
2789 src_line += src_stride;
/* scalar head loop until dst reaches 16-byte alignment */
2792 while (w && (uintptr_t)dst & 15)
2794 uint32_t s = *src++;
2800 __m128i ms = unpack_32_1x128 (s);
2801 __m128i alpha = expand_alpha_1x128 (ms);
2802 __m128i dest = xmm_mask;
2803 __m128i alpha_dst = unpack_32_1x128 (d);
2805 *dst = pack_1x128_32 (
2806 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
/* 4-pixel SIMD body, skipping fully transparent source groups */
2814 xmm_src = load_128_unaligned ((__m128i*)src);
2816 if (!is_zero (xmm_src))
2818 xmm_dst = load_128_aligned ((__m128i*)dst);
2820 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2821 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2822 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2823 &xmm_alpha_lo, &xmm_alpha_hi);
2825 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2826 &xmm_alpha_lo, &xmm_alpha_hi,
2827 &xmm_mask, &xmm_mask,
2828 &xmm_dst_lo, &xmm_dst_hi);
2831 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
2841 uint32_t s = *src++;
2847 __m128i ms = unpack_32_1x128 (s);
2848 __m128i alpha = expand_alpha_1x128 (ms);
2849 __m128i mask = xmm_mask;
2850 __m128i dest = unpack_32_1x128 (d);
2852 *dst = pack_1x128_32 (
2853 in_over_1x128 (&ms, &alpha, &mask, &dest));
/* Fast path: SRC conversion of x8r8g8b8 to r5g6b5; the SIMD body packs
 * two 128-bit source loads (8 pixels) into one aligned 565 store.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2864 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2865 pixman_composite_info_t *info)
2867 PIXMAN_COMPOSITE_ARGS (info);
2868 uint16_t *dst_line, *dst;
2869 uint32_t *src_line, *src, s;
2870 int dst_stride, src_stride;
2873 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2874 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2879 dst_line += dst_stride;
2881 src_line += src_stride;
/* scalar head loop until dst reaches 16-byte alignment */
2884 while (w && (uintptr_t)dst & 15)
2887 *dst = convert_8888_to_0565 (s);
/* 8-pixel SIMD body */
2894 __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2895 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2897 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
/* scalar tail */
2907 *dst = convert_8888_to_0565 (s);
/* Fast path: SRC copy of x8r8g8b8 to a8r8g8b8 — a straight copy with the
 * alpha byte forced to 0xff (OR with mask_ff000000); main loop handles
 * 16 pixels per iteration.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2915 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2916 pixman_composite_info_t *info)
2918 PIXMAN_COMPOSITE_ARGS (info);
2919 uint32_t *dst_line, *dst;
2920 uint32_t *src_line, *src;
2922 int dst_stride, src_stride;
2925 PIXMAN_IMAGE_GET_LINE (
2926 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2927 PIXMAN_IMAGE_GET_LINE (
2928 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2933 dst_line += dst_stride;
2935 src_line += src_stride;
/* scalar head loop until dst reaches 16-byte alignment */
2938 while (w && (uintptr_t)dst & 15)
2940 *dst++ = *src++ | 0xff000000;
/* 16-pixel SIMD body */
2946 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2948 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2949 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2950 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2951 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2953 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2954 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2955 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2956 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
/* scalar tail */
2965 *dst++ = *src++ | 0xff000000;
/* Fast path: x8r8g8b8 source (alpha forced opaque), solid mask (alpha
 * byte only), OVER a8r8g8b8 dest.  Since the source is opaque, source
 * alpha is the constant mask_00ff.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
2973 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2974 pixman_composite_info_t *info)
2976 PIXMAN_COMPOSITE_ARGS (info);
2977 uint32_t *dst_line, *dst;
2978 uint32_t *src_line, *src;
2980 int dst_stride, src_stride;
2983 __m128i xmm_mask, xmm_alpha;
2984 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2985 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2987 PIXMAN_IMAGE_GET_LINE (
2988 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2989 PIXMAN_IMAGE_GET_LINE (
2990 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2992 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2994 xmm_mask = create_mask_16_128 (mask >> 24);
/* opaque source => alpha is constant 0xff in every channel */
2995 xmm_alpha = mask_00ff;
3000 dst_line += dst_stride;
3002 src_line += src_stride;
/* scalar head loop until dst reaches 16-byte alignment */
3005 while (w && (uintptr_t)dst & 15)
3007 uint32_t s = (*src++) | 0xff000000;
3010 __m128i src = unpack_32_1x128 (s);
3011 __m128i alpha = xmm_alpha;
3012 __m128i mask = xmm_mask;
3013 __m128i dest = unpack_32_1x128 (d);
3015 *dst++ = pack_1x128_32 (
3016 in_over_1x128 (&src, &alpha, &mask, &dest));
/* 4-pixel SIMD body */
3023 xmm_src = _mm_or_si128 (
3024 load_128_unaligned ((__m128i*)src), mask_ff000000);
3025 xmm_dst = load_128_aligned ((__m128i*)dst);
3027 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3028 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3030 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3031 &xmm_alpha, &xmm_alpha,
3032 &xmm_mask, &xmm_mask,
3033 &xmm_dst_lo, &xmm_dst_hi);
3036 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
3046 uint32_t s = (*src++) | 0xff000000;
3049 __m128i src = unpack_32_1x128 (s);
3050 __m128i alpha = xmm_alpha;
3051 __m128i mask = xmm_mask;
3052 __m128i dest = unpack_32_1x128 (d);
3054 *dst++ = pack_1x128_32 (
3055 in_over_1x128 (&src, &alpha, &mask, &dest));
/* Fast path: a8r8g8b8 OVER a8r8g8b8 — per scanline this simply delegates
 * to the unified combiner sse2_combine_over_u with a NULL mask.
 * NOTE(review): extract is missing lines (the scanline loop header is
 * elided); surviving code kept verbatim. */
3064 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3065 pixman_composite_info_t *info)
3067 PIXMAN_COMPOSITE_ARGS (info);
3068 int dst_stride, src_stride;
3069 uint32_t *dst_line, *dst;
3070 uint32_t *src_line, *src;
3072 PIXMAN_IMAGE_GET_LINE (
3073 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3074 PIXMAN_IMAGE_GET_LINE (
3075 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3082 sse2_combine_over_u (imp, op, dst, src, NULL, width);
/* One a8r8g8b8 pixel OVER one r5g6b5 pixel: expand the 565 dest, blend,
 * repack to 565.  NOTE(review): braces and the over_1x128 wrapper line
 * are elided in this extract; surviving code kept verbatim. */
3089 static force_inline uint16_t
3090 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3094 ms = unpack_32_1x128 (src);
3095 return pack_565_32_16 (
3098 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
/* Fast path: a8r8g8b8 OVER r5g6b5; 8 pixels per SIMD iteration.  The
 * second source load is issued early, between the first unpack and the
 * first blend, to overlap memory latency (per the original comments).
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
3102 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3103 pixman_composite_info_t *info)
3105 PIXMAN_COMPOSITE_ARGS (info);
3106 uint16_t *dst_line, *dst, d;
3107 uint32_t *src_line, *src, s;
3108 int dst_stride, src_stride;
3111 __m128i xmm_alpha_lo, xmm_alpha_hi;
3112 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3113 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3115 PIXMAN_IMAGE_GET_LINE (
3116 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3117 PIXMAN_IMAGE_GET_LINE (
3118 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3125 dst_line += dst_stride;
3126 src_line += src_stride;
3129 /* Align dst on a 16-byte boundary */
3131 ((uintptr_t)dst & 15))
3136 *dst++ = composite_over_8888_0565pixel (s, d);
3140 /* It's a 8 pixel loop */
3143 /* I'm loading unaligned because I'm not sure
3144 * about the address alignment.
3146 xmm_src = load_128_unaligned ((__m128i*) src);
3147 xmm_dst = load_128_aligned ((__m128i*) dst);
3150 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3151 unpack_565_128_4x128 (xmm_dst,
3152 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3153 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3154 &xmm_alpha_lo, &xmm_alpha_hi);
3156 /* I'm loading next 4 pixels from memory
3157 * before to optimze the memory read.
3159 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3161 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3162 &xmm_alpha_lo, &xmm_alpha_hi,
3163 &xmm_dst0, &xmm_dst1);
/* second group of 4 pixels */
3166 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3167 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3168 &xmm_alpha_lo, &xmm_alpha_hi);
3170 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3171 &xmm_alpha_lo, &xmm_alpha_hi,
3172 &xmm_dst2, &xmm_dst3);
3175 (__m128i*)dst, pack_565_4x128_128 (
3176 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
/* scalar tail */
3188 *dst++ = composite_over_8888_0565pixel (s, d);
/* Fast path: solid source, a8 mask, OVER a8r8g8b8 dest.  The SIMD body
 * reads 4 mask bytes at once; a fully-opaque group with an opaque solid
 * (srca == 0xff && m == 0xffffffff) short-circuits to storing xmm_def.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
3195 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3196 pixman_composite_info_t *info)
3198 PIXMAN_COMPOSITE_ARGS (info);
3200 uint32_t *dst_line, *dst;
3201 uint8_t *mask_line, *mask;
3202 int dst_stride, mask_stride;
3206 __m128i xmm_src, xmm_alpha, xmm_def;
3207 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3208 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3210 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3212 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3218 PIXMAN_IMAGE_GET_LINE (
3219 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3220 PIXMAN_IMAGE_GET_LINE (
3221 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
/* xmm_def: the packed solid color, stored directly when fully opaque */
3223 xmm_def = create_mask_2x32_128 (src, src);
3224 xmm_src = expand_pixel_32_1x128 (src);
3225 xmm_alpha = expand_alpha_1x128 (xmm_src);
3227 mmx_alpha = xmm_alpha;
3232 dst_line += dst_stride;
3234 mask_line += mask_stride;
/* scalar head loop until dst reaches 16-byte alignment */
3237 while (w && (uintptr_t)dst & 15)
3239 uint8_t m = *mask++;
3244 mmx_mask = expand_pixel_8_1x128 (m);
3245 mmx_dest = unpack_32_1x128 (d);
3247 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
/* 4-pixel SIMD body: 4 mask bytes read as one uint32_t */
3259 m = *((uint32_t*)mask);
3261 if (srca == 0xff && m == 0xffffffff)
3263 save_128_aligned ((__m128i*)dst, xmm_def);
3267 xmm_dst = load_128_aligned ((__m128i*) dst);
3268 xmm_mask = unpack_32_1x128 (m);
3269 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3272 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3273 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3275 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3276 &xmm_mask_lo, &xmm_mask_hi);
3278 in_over_2x128 (&xmm_src, &xmm_src,
3279 &xmm_alpha, &xmm_alpha,
3280 &xmm_mask_lo, &xmm_mask_hi,
3281 &xmm_dst_lo, &xmm_dst_hi);
3284 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
/* scalar tail */
3294 uint8_t m = *mask++;
3299 mmx_mask = expand_pixel_8_1x128 (m);
3300 mmx_dest = unpack_32_1x128 (d);
3302 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3315 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
/* On 32-bit GCC the incoming stack may not be 16-byte aligned; force it
 * so the aligned SSE stores below are safe. */
3316 __attribute__((__force_align_arg_pointer__))
/* Solid fill for 8/16/32 bpp: replicates the filler to 8/16/32 bits,
 * aligns the destination byte-by-byte, then stores 128/64/32/16 bytes
 * per iteration with aligned SSE stores, finishing with scalar tails.
 * NOTE(review): extract is missing lines (embedded line numbers jump —
 * the bpp switch labels and loop headers are elided); surviving code
 * kept verbatim. */
3318 static pixman_bool_t
3319 sse2_fill (pixman_implementation_t *imp,
3329 uint32_t byte_width;
/* 8 bpp case */
3339 stride = stride * (int) sizeof (uint32_t) / 1;
3340 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
/* replicate the byte into 16 bits */
3346 filler = (w << 16) | w;
/* 16 bpp case */
3350 stride = stride * (int) sizeof (uint32_t) / 2;
3351 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3352 byte_width = 2 * width;
/* replicate the 16-bit value into 32 bits */
3355 filler = (filler & 0xffff) * 0x00010001;
/* 32 bpp case */
3359 stride = stride * (int) sizeof (uint32_t) / 4;
3360 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3361 byte_width = 4 * width;
3369 xmm_def = create_mask_2x32_128 (filler, filler);
3374 uint8_t *d = byte_line;
3375 byte_line += stride;
/* align d: byte, then halfword, then word stores up to 16-byte boundary */
3378 if (w >= 1 && ((uintptr_t)d & 1))
3380 *(uint8_t *)d = filler;
3385 while (w >= 2 && ((uintptr_t)d & 3))
3387 *(uint16_t *)d = filler;
3392 while (w >= 4 && ((uintptr_t)d & 15))
3394 *(uint32_t *)d = filler;
/* 128-byte blocks */
3402 save_128_aligned ((__m128i*)(d), xmm_def);
3403 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3404 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3405 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3406 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3407 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3408 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3409 save_128_aligned ((__m128i*)(d + 112), xmm_def);
/* 64-byte block */
3417 save_128_aligned ((__m128i*)(d), xmm_def);
3418 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3419 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3420 save_128_aligned ((__m128i*)(d + 48), xmm_def);
/* 32-byte block */
3428 save_128_aligned ((__m128i*)(d), xmm_def);
3429 save_128_aligned ((__m128i*)(d + 16), xmm_def);
/* 16-byte block */
3437 save_128_aligned ((__m128i*)(d), xmm_def);
/* scalar tails: word, halfword, byte */
3445 *(uint32_t *)d = filler;
3453 *(uint16_t *)d = filler;
3460 *(uint8_t *)d = filler;
/* Fast path: solid source through an a8 mask, SRC into a8r8g8b8.  A zero
 * source becomes an sse2_fill with 0; in the SIMD body a fully-opaque
 * group stores the packed color (xmm_def) and a zero group stores zeros.
 * NOTE(review): extract is missing lines (embedded line numbers jump);
 * surviving code kept verbatim. */
3470 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3471 pixman_composite_info_t *info)
3473 PIXMAN_COMPOSITE_ARGS (info);
3475 uint32_t *dst_line, *dst;
3476 uint8_t *mask_line, *mask;
3477 int dst_stride, mask_stride;
3481 __m128i xmm_src, xmm_def;
3482 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3484 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
/* transparent solid: SRC degenerates to a zero fill of the rect */
3489 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3490 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3491 dest_x, dest_y, width, height, 0);
3495 PIXMAN_IMAGE_GET_LINE (
3496 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3497 PIXMAN_IMAGE_GET_LINE (
3498 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3500 xmm_def = create_mask_2x32_128 (src, src);
3501 xmm_src = expand_pixel_32_1x128 (src);
3506 dst_line += dst_stride;
3508 mask_line += mask_stride;
/* scalar head loop until dst reaches 16-byte alignment */
3511 while (w && (uintptr_t)dst & 15)
3513 uint8_t m = *mask++;
3517 *dst = pack_1x128_32 (
3518 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
/* 4-pixel SIMD body: 4 mask bytes read as one uint32_t */
3531 m = *((uint32_t*)mask);
3533 if (srca == 0xff && m == 0xffffffff)
3535 save_128_aligned ((__m128i*)dst, xmm_def);
3539 xmm_mask = unpack_32_1x128 (m);
3540 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3543 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3545 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3546 &xmm_mask_lo, &xmm_mask_hi);
3548 pix_multiply_2x128 (&xmm_src, &xmm_src,
3549 &xmm_mask_lo, &xmm_mask_hi,
3550 &xmm_mask_lo, &xmm_mask_hi);
3553 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
/* all-zero mask group: SRC writes zeros */
3557 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
/* scalar tail */
3567 uint8_t m = *mask++;
3571 *dst = pack_1x128_32 (
3572 pix_multiply_1x128 (
3573 xmm_src, expand_pixel_8_1x128 (m)));
3588 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3589 pixman_composite_info_t *info)
3591 PIXMAN_COMPOSITE_ARGS (info);
3593 uint16_t *dst_line, *dst, d;
3594 uint8_t *mask_line, *mask;
3595 int dst_stride, mask_stride;
3598 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3600 __m128i xmm_src, xmm_alpha;
3601 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3602 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3604 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3609 PIXMAN_IMAGE_GET_LINE (
3610 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3611 PIXMAN_IMAGE_GET_LINE (
3612 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3614 xmm_src = expand_pixel_32_1x128 (src);
3615 xmm_alpha = expand_alpha_1x128 (xmm_src);
3617 mmx_alpha = xmm_alpha;
3622 dst_line += dst_stride;
3624 mask_line += mask_stride;
3627 while (w && (uintptr_t)dst & 15)
3634 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3635 mmx_dest = expand565_16_1x128 (d);
3637 *dst = pack_565_32_16 (
3640 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3649 xmm_dst = load_128_aligned ((__m128i*) dst);
3650 unpack_565_128_4x128 (xmm_dst,
3651 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3653 m = *((uint32_t*)mask);
3658 xmm_mask = unpack_32_1x128 (m);
3659 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3662 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3664 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3665 &xmm_mask_lo, &xmm_mask_hi);
3667 in_over_2x128 (&xmm_src, &xmm_src,
3668 &xmm_alpha, &xmm_alpha,
3669 &xmm_mask_lo, &xmm_mask_hi,
3670 &xmm_dst0, &xmm_dst1);
3673 m = *((uint32_t*)mask);
3678 xmm_mask = unpack_32_1x128 (m);
3679 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3682 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3684 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3685 &xmm_mask_lo, &xmm_mask_hi);
3686 in_over_2x128 (&xmm_src, &xmm_src,
3687 &xmm_alpha, &xmm_alpha,
3688 &xmm_mask_lo, &xmm_mask_hi,
3689 &xmm_dst2, &xmm_dst3);
3693 (__m128i*)dst, pack_565_4x128_128 (
3694 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3707 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3708 mmx_dest = expand565_16_1x128 (d);
3710 *dst = pack_565_32_16 (
3713 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3724 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3725 pixman_composite_info_t *info)
3727 PIXMAN_COMPOSITE_ARGS (info);
3728 uint16_t *dst_line, *dst, d;
3729 uint32_t *src_line, *src, s;
3730 int dst_stride, src_stride;
3732 uint32_t opaque, zero;
3735 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3736 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3738 PIXMAN_IMAGE_GET_LINE (
3739 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3740 PIXMAN_IMAGE_GET_LINE (
3741 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3746 dst_line += dst_stride;
3748 src_line += src_stride;
3751 while (w && (uintptr_t)dst & 15)
3756 ms = unpack_32_1x128 (s);
3758 *dst++ = pack_565_32_16 (
3760 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3767 xmm_src = load_128_unaligned ((__m128i*)src);
3768 xmm_dst = load_128_aligned ((__m128i*)dst);
3770 opaque = is_opaque (xmm_src);
3771 zero = is_zero (xmm_src);
3773 unpack_565_128_4x128 (xmm_dst,
3774 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3775 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3777 /* preload next round */
3778 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3782 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3783 &xmm_dst0, &xmm_dst1);
3787 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3788 &xmm_dst0, &xmm_dst1);
3792 opaque = is_opaque (xmm_src);
3793 zero = is_zero (xmm_src);
3795 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3799 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3800 &xmm_dst2, &xmm_dst3);
3804 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3805 &xmm_dst2, &xmm_dst3);
3809 (__m128i*)dst, pack_565_4x128_128 (
3810 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3822 ms = unpack_32_1x128 (s);
3824 *dst++ = pack_565_32_16 (
3826 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3834 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3835 pixman_composite_info_t *info)
3837 PIXMAN_COMPOSITE_ARGS (info);
3838 uint32_t *dst_line, *dst, d;
3839 uint32_t *src_line, *src, s;
3840 int dst_stride, src_stride;
3842 uint32_t opaque, zero;
3844 __m128i xmm_src_lo, xmm_src_hi;
3845 __m128i xmm_dst_lo, xmm_dst_hi;
3847 PIXMAN_IMAGE_GET_LINE (
3848 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3849 PIXMAN_IMAGE_GET_LINE (
3850 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3855 dst_line += dst_stride;
3857 src_line += src_stride;
3860 while (w && (uintptr_t)dst & 15)
3865 *dst++ = pack_1x128_32 (
3866 over_rev_non_pre_1x128 (
3867 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3874 xmm_src_hi = load_128_unaligned ((__m128i*)src);
3876 opaque = is_opaque (xmm_src_hi);
3877 zero = is_zero (xmm_src_hi);
3879 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3883 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3884 &xmm_dst_lo, &xmm_dst_hi);
3887 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3891 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3893 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3895 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3896 &xmm_dst_lo, &xmm_dst_hi);
3899 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3912 *dst++ = pack_1x128_32 (
3913 over_rev_non_pre_1x128 (
3914 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3923 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3924 pixman_composite_info_t *info)
3926 PIXMAN_COMPOSITE_ARGS (info);
3928 uint16_t *dst_line, *dst, d;
3929 uint32_t *mask_line, *mask, m;
3930 int dst_stride, mask_stride;
3934 __m128i xmm_src, xmm_alpha;
3935 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3936 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3938 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3940 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3945 PIXMAN_IMAGE_GET_LINE (
3946 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3947 PIXMAN_IMAGE_GET_LINE (
3948 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3950 xmm_src = expand_pixel_32_1x128 (src);
3951 xmm_alpha = expand_alpha_1x128 (xmm_src);
3953 mmx_alpha = xmm_alpha;
3960 mask_line += mask_stride;
3961 dst_line += dst_stride;
3963 while (w && ((uintptr_t)dst & 15))
3965 m = *(uint32_t *) mask;
3970 mmx_mask = unpack_32_1x128 (m);
3971 mmx_dest = expand565_16_1x128 (d);
3973 *dst = pack_565_32_16 (
3976 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3987 xmm_mask = load_128_unaligned ((__m128i*)mask);
3988 xmm_dst = load_128_aligned ((__m128i*)dst);
3990 pack_cmp = _mm_movemask_epi8 (
3991 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3993 unpack_565_128_4x128 (xmm_dst,
3994 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3995 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3997 /* preload next round */
3998 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4000 /* blend only when at least one of the four mask pixels is non-zero */
4001 if (pack_cmp != 0xffff)
4003 in_over_2x128 (&xmm_src, &xmm_src,
4004 &xmm_alpha, &xmm_alpha,
4005 &xmm_mask_lo, &xmm_mask_hi,
4006 &xmm_dst0, &xmm_dst1);
4010 pack_cmp = _mm_movemask_epi8 (
4011 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4013 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4015 if (pack_cmp != 0xffff)
4017 in_over_2x128 (&xmm_src, &xmm_src,
4018 &xmm_alpha, &xmm_alpha,
4019 &xmm_mask_lo, &xmm_mask_hi,
4020 &xmm_dst2, &xmm_dst3);
4024 (__m128i*)dst, pack_565_4x128_128 (
4025 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4034 m = *(uint32_t *) mask;
4039 mmx_mask = unpack_32_1x128 (m);
4040 mmx_dest = expand565_16_1x128 (d);
4042 *dst = pack_565_32_16 (
4045 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4057 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4058 pixman_composite_info_t *info)
4060 PIXMAN_COMPOSITE_ARGS (info);
4061 uint8_t *dst_line, *dst;
4062 uint8_t *mask_line, *mask;
4063 int dst_stride, mask_stride;
4069 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4070 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4072 PIXMAN_IMAGE_GET_LINE (
4073 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4074 PIXMAN_IMAGE_GET_LINE (
4075 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4077 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4079 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4084 dst_line += dst_stride;
4086 mask_line += mask_stride;
4089 while (w && ((uintptr_t)dst & 15))
4091 m = (uint32_t) *mask++;
4092 d = (uint32_t) *dst;
4094 *dst++ = (uint8_t) pack_1x128_32 (
4095 pix_multiply_1x128 (
4096 pix_multiply_1x128 (xmm_alpha,
4097 unpack_32_1x128 (m)),
4098 unpack_32_1x128 (d)));
4104 xmm_mask = load_128_unaligned ((__m128i*)mask);
4105 xmm_dst = load_128_aligned ((__m128i*)dst);
4107 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4108 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4110 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4111 &xmm_mask_lo, &xmm_mask_hi,
4112 &xmm_mask_lo, &xmm_mask_hi);
4114 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4115 &xmm_dst_lo, &xmm_dst_hi,
4116 &xmm_dst_lo, &xmm_dst_hi);
4119 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4128 m = (uint32_t) *mask++;
4129 d = (uint32_t) *dst;
4131 *dst++ = (uint8_t) pack_1x128_32 (
4132 pix_multiply_1x128 (
4133 pix_multiply_1x128 (
4134 xmm_alpha, unpack_32_1x128 (m)),
4135 unpack_32_1x128 (d)));
4143 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4144 pixman_composite_info_t *info)
4146 PIXMAN_COMPOSITE_ARGS (info);
4147 uint8_t *dst_line, *dst;
4154 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4156 PIXMAN_IMAGE_GET_LINE (
4157 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4159 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4161 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4170 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4171 8, dest_x, dest_y, width, height, src);
4179 dst_line += dst_stride;
4182 while (w && ((uintptr_t)dst & 15))
4184 d = (uint32_t) *dst;
4186 *dst++ = (uint8_t) pack_1x128_32 (
4187 pix_multiply_1x128 (
4189 unpack_32_1x128 (d)));
4195 xmm_dst = load_128_aligned ((__m128i*)dst);
4197 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4199 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4200 &xmm_dst_lo, &xmm_dst_hi,
4201 &xmm_dst_lo, &xmm_dst_hi);
4204 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4212 d = (uint32_t) *dst;
4214 *dst++ = (uint8_t) pack_1x128_32 (
4215 pix_multiply_1x128 (
4217 unpack_32_1x128 (d)));
4225 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4226 pixman_composite_info_t *info)
4228 PIXMAN_COMPOSITE_ARGS (info);
4229 uint8_t *dst_line, *dst;
4230 uint8_t *src_line, *src;
4231 int src_stride, dst_stride;
4235 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4236 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4238 PIXMAN_IMAGE_GET_LINE (
4239 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4240 PIXMAN_IMAGE_GET_LINE (
4241 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4246 dst_line += dst_stride;
4248 src_line += src_stride;
4251 while (w && ((uintptr_t)dst & 15))
4253 s = (uint32_t) *src++;
4254 d = (uint32_t) *dst;
4256 *dst++ = (uint8_t) pack_1x128_32 (
4257 pix_multiply_1x128 (
4258 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4264 xmm_src = load_128_unaligned ((__m128i*)src);
4265 xmm_dst = load_128_aligned ((__m128i*)dst);
4267 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4268 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4270 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4271 &xmm_dst_lo, &xmm_dst_hi,
4272 &xmm_dst_lo, &xmm_dst_hi);
4275 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4284 s = (uint32_t) *src++;
4285 d = (uint32_t) *dst;
4287 *dst++ = (uint8_t) pack_1x128_32 (
4288 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4296 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4297 pixman_composite_info_t *info)
4299 PIXMAN_COMPOSITE_ARGS (info);
4300 uint8_t *dst_line, *dst;
4301 uint8_t *mask_line, *mask;
4302 int dst_stride, mask_stride;
4308 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4309 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4311 PIXMAN_IMAGE_GET_LINE (
4312 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4313 PIXMAN_IMAGE_GET_LINE (
4314 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4316 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4318 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4323 dst_line += dst_stride;
4325 mask_line += mask_stride;
4328 while (w && ((uintptr_t)dst & 15))
4330 m = (uint32_t) *mask++;
4331 d = (uint32_t) *dst;
4333 *dst++ = (uint8_t) pack_1x128_32 (
4335 pix_multiply_1x128 (
4336 xmm_alpha, unpack_32_1x128 (m)),
4337 unpack_32_1x128 (d)));
4343 xmm_mask = load_128_unaligned ((__m128i*)mask);
4344 xmm_dst = load_128_aligned ((__m128i*)dst);
4346 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4347 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4349 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4350 &xmm_mask_lo, &xmm_mask_hi,
4351 &xmm_mask_lo, &xmm_mask_hi);
4353 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4354 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4357 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4366 m = (uint32_t) *mask++;
4367 d = (uint32_t) *dst;
4369 *dst++ = (uint8_t) pack_1x128_32 (
4371 pix_multiply_1x128 (
4372 xmm_alpha, unpack_32_1x128 (m)),
4373 unpack_32_1x128 (d)));
4382 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4383 pixman_composite_info_t *info)
4385 PIXMAN_COMPOSITE_ARGS (info);
4386 uint8_t *dst_line, *dst;
4393 PIXMAN_IMAGE_GET_LINE (
4394 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4396 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4405 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4406 8, dest_x, dest_y, width, height, 0xff);
4411 src = (src << 24) | (src << 16) | (src << 8) | src;
4412 xmm_src = _mm_set_epi32 (src, src, src, src);
4417 dst_line += dst_stride;
4420 while (w && ((uintptr_t)dst & 15))
4422 *dst = (uint8_t)_mm_cvtsi128_si32 (
4425 _mm_cvtsi32_si128 (*dst)));
4434 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4442 *dst = (uint8_t)_mm_cvtsi128_si32 (
4445 _mm_cvtsi32_si128 (*dst)));
4455 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4456 pixman_composite_info_t *info)
4458 PIXMAN_COMPOSITE_ARGS (info);
4459 uint8_t *dst_line, *dst;
4460 uint8_t *src_line, *src;
4461 int dst_stride, src_stride;
4465 PIXMAN_IMAGE_GET_LINE (
4466 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4467 PIXMAN_IMAGE_GET_LINE (
4468 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4475 dst_line += dst_stride;
4476 src_line += src_stride;
4480 while (w && (uintptr_t)dst & 3)
4482 t = (*dst) + (*src++);
4483 *dst++ = t | (0 - (t >> 8));
4487 sse2_combine_add_u (imp, op,
4488 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4498 t = (*dst) + (*src++);
4499 *dst++ = t | (0 - (t >> 8));
4507 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4508 pixman_composite_info_t *info)
4510 PIXMAN_COMPOSITE_ARGS (info);
4511 uint32_t *dst_line, *dst;
4512 uint32_t *src_line, *src;
4513 int dst_stride, src_stride;
4515 PIXMAN_IMAGE_GET_LINE (
4516 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4517 PIXMAN_IMAGE_GET_LINE (
4518 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4523 dst_line += dst_stride;
4525 src_line += src_stride;
4527 sse2_combine_add_u (imp, op, dst, src, NULL, width);
4532 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4533 pixman_composite_info_t *info)
4535 PIXMAN_COMPOSITE_ARGS (info);
4536 uint32_t *dst_line, *dst, src;
4541 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4543 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4549 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4550 dest_x, dest_y, width, height, ~0);
4555 xmm_src = _mm_set_epi32 (src, src, src, src);
4562 dst_line += dst_stride;
4564 while (w && (uintptr_t)dst & 15)
4568 _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4576 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4586 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4587 _mm_cvtsi32_si128 (d)));
4593 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4594 pixman_composite_info_t *info)
4596 PIXMAN_COMPOSITE_ARGS (info);
4597 uint32_t *dst_line, *dst;
4598 uint8_t *mask_line, *mask;
4599 int dst_stride, mask_stride;
4605 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4608 xmm_src = expand_pixel_32_1x128 (src);
4610 PIXMAN_IMAGE_GET_LINE (
4611 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4612 PIXMAN_IMAGE_GET_LINE (
4613 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4618 dst_line += dst_stride;
4620 mask_line += mask_stride;
4623 while (w && ((uintptr_t)dst & 15))
4625 uint8_t m = *mask++;
4628 *dst = pack_1x128_32
4630 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4631 unpack_32_1x128 (*dst)));
4639 uint32_t m = *(uint32_t*)mask;
4642 __m128i xmm_mask_lo, xmm_mask_hi;
4643 __m128i xmm_dst_lo, xmm_dst_hi;
4645 __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4647 _mm_unpacklo_epi8 (unpack_32_1x128(m),
4648 _mm_setzero_si128 ());
4650 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4651 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4653 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4654 &xmm_mask_lo, &xmm_mask_hi);
4656 pix_multiply_2x128 (&xmm_src, &xmm_src,
4657 &xmm_mask_lo, &xmm_mask_hi,
4658 &xmm_mask_lo, &xmm_mask_hi);
4660 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4661 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4664 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4674 uint8_t m = *mask++;
4677 *dst = pack_1x128_32
4679 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4680 unpack_32_1x128 (*dst)));
4688 static pixman_bool_t
4689 sse2_blt (pixman_implementation_t *imp,
4690 uint32_t * src_bits,
4691 uint32_t * dst_bits,
4703 uint8_t * src_bytes;
4704 uint8_t * dst_bytes;
4707 if (src_bpp != dst_bpp)
4712 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4713 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4714 src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4715 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4716 byte_width = 2 * width;
4720 else if (src_bpp == 32)
4722 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4723 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4724 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4725 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4726 byte_width = 4 * width;
4738 uint8_t *s = src_bytes;
4739 uint8_t *d = dst_bytes;
4740 src_bytes += src_stride;
4741 dst_bytes += dst_stride;
4744 while (w >= 2 && ((uintptr_t)d & 3))
4746 *(uint16_t *)d = *(uint16_t *)s;
4752 while (w >= 4 && ((uintptr_t)d & 15))
4754 *(uint32_t *)d = *(uint32_t *)s;
4763 __m128i xmm0, xmm1, xmm2, xmm3;
4765 xmm0 = load_128_unaligned ((__m128i*)(s));
4766 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4767 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4768 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4770 save_128_aligned ((__m128i*)(d), xmm0);
4771 save_128_aligned ((__m128i*)(d + 16), xmm1);
4772 save_128_aligned ((__m128i*)(d + 32), xmm2);
4773 save_128_aligned ((__m128i*)(d + 48), xmm3);
4782 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4791 *(uint32_t *)d = *(uint32_t *)s;
4800 *(uint16_t *)d = *(uint16_t *)s;
4811 sse2_composite_copy_area (pixman_implementation_t *imp,
4812 pixman_composite_info_t *info)
4814 PIXMAN_COMPOSITE_ARGS (info);
4815 sse2_blt (imp, src_image->bits.bits,
4816 dest_image->bits.bits,
4817 src_image->bits.rowstride,
4818 dest_image->bits.rowstride,
4819 PIXMAN_FORMAT_BPP (src_image->bits.format),
4820 PIXMAN_FORMAT_BPP (dest_image->bits.format),
4821 src_x, src_y, dest_x, dest_y, width, height);
4825 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4826 pixman_composite_info_t *info)
4828 PIXMAN_COMPOSITE_ARGS (info);
4829 uint32_t *src, *src_line, s;
4830 uint32_t *dst, *dst_line, d;
4831 uint8_t *mask, *mask_line;
4833 int src_stride, mask_stride, dst_stride;
4837 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4838 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4839 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4841 PIXMAN_IMAGE_GET_LINE (
4842 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4843 PIXMAN_IMAGE_GET_LINE (
4844 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4845 PIXMAN_IMAGE_GET_LINE (
4846 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4851 src_line += src_stride;
4853 dst_line += dst_stride;
4855 mask_line += mask_stride;
4859 while (w && (uintptr_t)dst & 15)
4861 s = 0xff000000 | *src++;
4862 m = (uint32_t) *mask++;
4864 ms = unpack_32_1x128 (s);
4868 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4869 __m128i md = unpack_32_1x128 (d);
4871 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4874 *dst++ = pack_1x128_32 (ms);
4880 m = *(uint32_t*) mask;
4881 xmm_src = _mm_or_si128 (
4882 load_128_unaligned ((__m128i*)src), mask_ff000000);
4884 if (m == 0xffffffff)
4886 save_128_aligned ((__m128i*)dst, xmm_src);
4890 xmm_dst = load_128_aligned ((__m128i*)dst);
4892 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4894 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4895 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4896 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4898 expand_alpha_rev_2x128 (
4899 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4901 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4902 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4903 &xmm_dst_lo, &xmm_dst_hi);
4905 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4916 m = (uint32_t) *mask++;
4920 s = 0xff000000 | *src;
4932 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4933 md = unpack_32_1x128 (d);
4934 ms = unpack_32_1x128 (s);
4936 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4950 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4951 pixman_composite_info_t *info)
4953 PIXMAN_COMPOSITE_ARGS (info);
4954 uint32_t *src, *src_line, s;
4955 uint32_t *dst, *dst_line, d;
4956 uint8_t *mask, *mask_line;
4958 int src_stride, mask_stride, dst_stride;
4961 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4962 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4963 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4965 PIXMAN_IMAGE_GET_LINE (
4966 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4967 PIXMAN_IMAGE_GET_LINE (
4968 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4969 PIXMAN_IMAGE_GET_LINE (
4970 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4975 src_line += src_stride;
4977 dst_line += dst_stride;
4979 mask_line += mask_stride;
4983 while (w && (uintptr_t)dst & 15)
4988 m = (uint32_t) *mask++;
4995 if (sa == 0xff && m == 0xff)
5001 __m128i ms, md, ma, msa;
5003 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5004 ms = unpack_32_1x128 (s);
5005 md = unpack_32_1x128 (d);
5007 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5009 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5019 m = *(uint32_t *) mask;
5023 xmm_src = load_128_unaligned ((__m128i*)src);
5025 if (m == 0xffffffff && is_opaque (xmm_src))
5027 save_128_aligned ((__m128i *)dst, xmm_src);
5031 xmm_dst = load_128_aligned ((__m128i *)dst);
5033 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5035 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5036 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5037 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5039 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5040 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5042 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5043 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5045 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5060 m = (uint32_t) *mask++;
5067 if (sa == 0xff && m == 0xff)
5073 __m128i ms, md, ma, msa;
5075 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5076 ms = unpack_32_1x128 (s);
5077 md = unpack_32_1x128 (d);
5079 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5081 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5093 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5094 pixman_composite_info_t *info)
5096 PIXMAN_COMPOSITE_ARGS (info);
5098 uint32_t *dst_line, *dst;
5100 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5101 __m128i xmm_dsta_hi, xmm_dsta_lo;
5105 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5110 PIXMAN_IMAGE_GET_LINE (
5111 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5113 xmm_src = expand_pixel_32_1x128 (src);
5119 dst_line += dst_stride;
5122 while (w && (uintptr_t)dst & 15)
5126 vd = unpack_32_1x128 (*dst);
5128 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5136 __m128i tmp_lo, tmp_hi;
5138 xmm_dst = load_128_aligned ((__m128i*)dst);
5140 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5141 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5146 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5147 &xmm_dsta_lo, &xmm_dsta_hi,
5151 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5161 vd = unpack_32_1x128 (*dst);
5163 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5174 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5175 pixman_composite_info_t *info)
5177 PIXMAN_COMPOSITE_ARGS (info);
5178 uint32_t *src, *src_line, s;
5179 uint32_t *dst, *dst_line, d;
5180 uint32_t *mask, *mask_line;
5182 int src_stride, mask_stride, dst_stride;
5185 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5186 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5187 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5189 PIXMAN_IMAGE_GET_LINE (
5190 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5191 PIXMAN_IMAGE_GET_LINE (
5192 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5193 PIXMAN_IMAGE_GET_LINE (
5194 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5199 src_line += src_stride;
5201 dst_line += dst_stride;
5203 mask_line += mask_stride;
5207 while (w && (uintptr_t)dst & 15)
5212 m = (*mask++) >> 24;
5219 if (sa == 0xff && m == 0xff)
5225 __m128i ms, md, ma, msa;
5227 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5228 ms = unpack_32_1x128 (s);
5229 md = unpack_32_1x128 (d);
5231 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5233 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5243 xmm_mask = load_128_unaligned ((__m128i*)mask);
5245 if (!is_transparent (xmm_mask))
5247 xmm_src = load_128_unaligned ((__m128i*)src);
5249 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5251 save_128_aligned ((__m128i *)dst, xmm_src);
5255 xmm_dst = load_128_aligned ((__m128i *)dst);
5257 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5258 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5259 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5261 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5262 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5264 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5265 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5267 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5282 m = (*mask++) >> 24;
5289 if (sa == 0xff && m == 0xff)
5295 __m128i ms, md, ma, msa;
5297 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5298 ms = unpack_32_1x128 (s);
5299 md = unpack_32_1x128 (d);
5301 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5303 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5314 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5315 static force_inline void
5316 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5320 pixman_fixed_t unit_x,
5321 pixman_fixed_t src_width_fixed,
5322 pixman_bool_t fully_transparent_src)
5325 const uint32_t* pm = NULL;
5327 __m128i xmm_dst_lo, xmm_dst_hi;
5328 __m128i xmm_src_lo, xmm_src_hi;
5329 __m128i xmm_alpha_lo, xmm_alpha_hi;
5331 if (fully_transparent_src)
5334 /* Align dst on a 16-byte boundary */
5335 while (w && ((uintptr_t)pd & 15))
5338 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5341 vx -= src_width_fixed;
5343 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5352 uint32_t tmp1, tmp2, tmp3, tmp4;
5354 tmp1 = *(ps + pixman_fixed_to_int (vx));
5357 vx -= src_width_fixed;
5358 tmp2 = *(ps + pixman_fixed_to_int (vx));
5361 vx -= src_width_fixed;
5362 tmp3 = *(ps + pixman_fixed_to_int (vx));
5365 vx -= src_width_fixed;
5366 tmp4 = *(ps + pixman_fixed_to_int (vx));
5369 vx -= src_width_fixed;
5371 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5373 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5375 if (is_opaque (xmm_src_hi))
5377 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5379 else if (!is_zero (xmm_src_hi))
5381 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5383 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5384 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5386 expand_alpha_2x128 (
5387 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5389 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5390 &xmm_alpha_lo, &xmm_alpha_hi,
5391 &xmm_dst_lo, &xmm_dst_hi);
5393 /* rebuild the 4 pixel data and save */
5394 save_128_aligned ((__m128i*)pd,
5395 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5407 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5410 vx -= src_width_fixed;
5412 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5421 scaled_nearest_scanline_sse2_8888_8888_OVER,
5422 uint32_t, uint32_t, COVER)
5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5424 scaled_nearest_scanline_sse2_8888_8888_OVER,
5425 uint32_t, uint32_t, NONE)
5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5427 scaled_nearest_scanline_sse2_8888_8888_OVER,
5428 uint32_t, uint32_t, PAD)
5429 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5430 scaled_nearest_scanline_sse2_8888_8888_OVER,
5431 uint32_t, uint32_t, NORMAL)
5433 static force_inline void
5434 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5436 const uint32_t * src,
5439 pixman_fixed_t unit_x,
5440 pixman_fixed_t src_width_fixed,
5441 pixman_bool_t zero_src)
5444 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5445 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5446 __m128i xmm_alpha_lo, xmm_alpha_hi;
5448 if (zero_src || (*mask >> 24) == 0)
5451 xmm_mask = create_mask_16_128 (*mask >> 24);
5453 while (w && (uintptr_t)dst & 15)
5455 uint32_t s = *(src + pixman_fixed_to_int (vx));
5458 vx -= src_width_fixed;
5464 __m128i ms = unpack_32_1x128 (s);
5465 __m128i alpha = expand_alpha_1x128 (ms);
5466 __m128i dest = xmm_mask;
5467 __m128i alpha_dst = unpack_32_1x128 (d);
5469 *dst = pack_1x128_32 (
5470 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5478 uint32_t tmp1, tmp2, tmp3, tmp4;
5480 tmp1 = *(src + pixman_fixed_to_int (vx));
5483 vx -= src_width_fixed;
5484 tmp2 = *(src + pixman_fixed_to_int (vx));
5487 vx -= src_width_fixed;
5488 tmp3 = *(src + pixman_fixed_to_int (vx));
5491 vx -= src_width_fixed;
5492 tmp4 = *(src + pixman_fixed_to_int (vx));
5495 vx -= src_width_fixed;
5497 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5499 if (!is_zero (xmm_src))
5501 xmm_dst = load_128_aligned ((__m128i*)dst);
5503 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5504 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5505 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5506 &xmm_alpha_lo, &xmm_alpha_hi);
5508 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5509 &xmm_alpha_lo, &xmm_alpha_hi,
5510 &xmm_mask, &xmm_mask,
5511 &xmm_dst_lo, &xmm_dst_hi);
5514 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5523 uint32_t s = *(src + pixman_fixed_to_int (vx));
5526 vx -= src_width_fixed;
5532 __m128i ms = unpack_32_1x128 (s);
5533 __m128i alpha = expand_alpha_1x128 (ms);
5534 __m128i mask = xmm_mask;
5535 __m128i dest = unpack_32_1x128 (d);
5537 *dst = pack_1x128_32 (
5538 in_over_1x128 (&ms, &alpha, &mask, &dest));
5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5548 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5549 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5551 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5552 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5554 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5555 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5556 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5557 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5558 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5562 /***********************************************************************************/
5564 # define BILINEAR_DECLARE_VARIABLES \
5565 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5566 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5567 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5568 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5569 unit_x, -unit_x, unit_x, -unit_x); \
5570 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5571 unit_x * 4, -unit_x * 4, \
5572 unit_x * 4, -unit_x * 4, \
5573 unit_x * 4, -unit_x * 4); \
5574 const __m128i xmm_zero = _mm_setzero_si128 (); \
5575 __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
5576 vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
5577 vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
5578 vx + unit_x * 0, -(vx + 1) - unit_x * 0); \
5579 __m128i xmm_wh_state;
5581 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \
5583 int phase = phase_; \
5584 __m128i xmm_wh, xmm_a, xmm_b; \
5585 /* fetch 2x2 pixel block into sse2 registers */ \
5586 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5587 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5589 /* vertical interpolation */ \
5590 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5591 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5592 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5593 /* calculate horizontal weights */ \
5596 xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5597 16 - BILINEAR_INTERPOLATION_BITS)); \
5598 xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \
5601 xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \
5603 /* horizontal interpolation */ \
5604 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
5605 xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \
5606 /* shift the result */ \
5607 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5610 #else /************************************************************************/
5612 # define BILINEAR_DECLARE_VARIABLES \
5613 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5614 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5615 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5616 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5617 unit_x, -unit_x, unit_x, -unit_x); \
5618 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5619 unit_x * 4, -unit_x * 4, \
5620 unit_x * 4, -unit_x * 4, \
5621 unit_x * 4, -unit_x * 4); \
5622 const __m128i xmm_zero = _mm_setzero_si128 (); \
5623 __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \
5624 vx, -(vx + 1), vx, -(vx + 1))
5626 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \
5628 __m128i xmm_wh, xmm_a, xmm_b; \
5629 /* fetch 2x2 pixel block into sse2 registers */ \
5630 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5631 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5632 (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \
5634 /* vertical interpolation */ \
5635 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5636 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5637 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5638 /* calculate horizontal weights */ \
5639 xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5640 16 - BILINEAR_INTERPOLATION_BITS)); \
5641 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5642 /* horizontal interpolation */ \
5643 xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
5644 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \
5645 /* shift the result */ \
5646 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5649 /***********************************************************************************/
5653 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix); \
5656 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \
5657 xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \
5658 xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \
5659 pix = _mm_cvtsi128_si32 (xmm_pix); \
5662 #define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix); \
5664 __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \
5665 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \
5666 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \
5667 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \
5668 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \
5669 xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \
5670 xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \
5671 pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \
5674 #define BILINEAR_SKIP_ONE_PIXEL() \
5677 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5680 #define BILINEAR_SKIP_FOUR_PIXELS() \
5683 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
5686 /***********************************************************************************/
5688 static force_inline void
5689 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
5690 const uint32_t * mask,
5691 const uint32_t * src_top,
5692 const uint32_t * src_bottom,
5697 pixman_fixed_t unit_x_,
5698 pixman_fixed_t max_vx,
5699 pixman_bool_t zero_src)
5702 intptr_t unit_x = unit_x_;
5703 BILINEAR_DECLARE_VARIABLES;
5704 uint32_t pix1, pix2;
5706 while (w && ((uintptr_t)dst & 15))
5708 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5713 while ((w -= 4) >= 0) {
5715 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5716 _mm_store_si128 ((__m128i *)dst, xmm_src);
5722 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5723 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5730 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5736 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5737 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5738 uint32_t, uint32_t, uint32_t,
5740 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5741 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5742 uint32_t, uint32_t, uint32_t,
5744 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5745 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5746 uint32_t, uint32_t, uint32_t,
5748 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5749 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5750 uint32_t, uint32_t, uint32_t,
5753 static force_inline void
5754 scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t * dst,
5755 const uint32_t * mask,
5756 const uint32_t * src_top,
5757 const uint32_t * src_bottom,
5762 pixman_fixed_t unit_x_,
5763 pixman_fixed_t max_vx,
5764 pixman_bool_t zero_src)
5767 intptr_t unit_x = unit_x_;
5768 BILINEAR_DECLARE_VARIABLES;
5769 uint32_t pix1, pix2;
5771 while (w && ((uintptr_t)dst & 15))
5773 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5774 *dst++ = pix1 | 0xFF000000;
5778 while ((w -= 4) >= 0) {
5780 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5781 _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
5787 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5788 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5789 *dst++ = pix1 | 0xFF000000;
5790 *dst++ = pix2 | 0xFF000000;
5795 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5796 *dst = pix1 | 0xFF000000;
5800 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
5801 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5802 uint32_t, uint32_t, uint32_t,
5804 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
5805 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5806 uint32_t, uint32_t, uint32_t,
5808 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
5809 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5810 uint32_t, uint32_t, uint32_t,
5813 static force_inline void
5814 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
5815 const uint32_t * mask,
5816 const uint32_t * src_top,
5817 const uint32_t * src_bottom,
5822 pixman_fixed_t unit_x_,
5823 pixman_fixed_t max_vx,
5824 pixman_bool_t zero_src)
5827 intptr_t unit_x = unit_x_;
5828 BILINEAR_DECLARE_VARIABLES;
5829 uint32_t pix1, pix2;
5831 while (w && ((uintptr_t)dst & 15))
5833 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5838 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5848 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5849 __m128i xmm_alpha_hi, xmm_alpha_lo;
5851 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5853 if (!is_zero (xmm_src))
5855 if (is_opaque (xmm_src))
5857 save_128_aligned ((__m128i *)dst, xmm_src);
5861 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5863 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5864 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5866 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5867 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5868 &xmm_dst_lo, &xmm_dst_hi);
5870 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5880 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5885 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5893 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5894 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5895 uint32_t, uint32_t, uint32_t,
5897 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5898 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5899 uint32_t, uint32_t, uint32_t,
5901 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5902 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5903 uint32_t, uint32_t, uint32_t,
5905 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5906 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5907 uint32_t, uint32_t, uint32_t,
5910 static force_inline void
5911 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
5912 const uint8_t * mask,
5913 const uint32_t * src_top,
5914 const uint32_t * src_bottom,
5919 pixman_fixed_t unit_x_,
5920 pixman_fixed_t max_vx,
5921 pixman_bool_t zero_src)
5924 intptr_t unit_x = unit_x_;
5925 BILINEAR_DECLARE_VARIABLES;
5926 uint32_t pix1, pix2;
5929 while (w && ((uintptr_t)dst & 15))
5933 m = (uint32_t) *mask++;
5937 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5940 if (sa == 0xff && m == 0xff)
5946 __m128i ms, md, ma, msa;
5949 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5950 ms = unpack_32_1x128 (pix1);
5951 md = unpack_32_1x128 (pix2);
5953 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5955 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5960 BILINEAR_SKIP_ONE_PIXEL ();
5969 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5970 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5971 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5973 m = *(uint32_t*)mask;
5977 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5979 if (m == 0xffffffff && is_opaque (xmm_src))
5981 save_128_aligned ((__m128i *)dst, xmm_src);
5985 xmm_dst = load_128_aligned ((__m128i *)dst);
5987 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5989 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5990 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5991 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5993 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5994 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5996 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5997 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5999 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6004 BILINEAR_SKIP_FOUR_PIXELS ();
6016 m = (uint32_t) *mask++;
6020 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6023 if (sa == 0xff && m == 0xff)
6029 __m128i ms, md, ma, msa;
6032 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
6033 ms = unpack_32_1x128 (pix1);
6034 md = unpack_32_1x128 (pix2);
6036 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
6038 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
6043 BILINEAR_SKIP_ONE_PIXEL ();
6051 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
6052 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6053 uint32_t, uint8_t, uint32_t,
6054 COVER, FLAG_HAVE_NON_SOLID_MASK)
6055 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
6056 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6057 uint32_t, uint8_t, uint32_t,
6058 PAD, FLAG_HAVE_NON_SOLID_MASK)
6059 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
6060 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6061 uint32_t, uint8_t, uint32_t,
6062 NONE, FLAG_HAVE_NON_SOLID_MASK)
6063 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
6064 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6065 uint32_t, uint8_t, uint32_t,
6066 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
6068 static force_inline void
6069 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
6070 const uint32_t * mask,
6071 const uint32_t * src_top,
6072 const uint32_t * src_bottom,
6077 pixman_fixed_t unit_x_,
6078 pixman_fixed_t max_vx,
6079 pixman_bool_t zero_src)
6082 intptr_t unit_x = unit_x_;
6083 BILINEAR_DECLARE_VARIABLES;
6087 if (zero_src || (*mask >> 24) == 0)
6090 xmm_mask = create_mask_16_128 (*mask >> 24);
6092 while (w && ((uintptr_t)dst & 15))
6094 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6099 __m128i ms = unpack_32_1x128 (pix1);
6100 __m128i alpha = expand_alpha_1x128 (ms);
6101 __m128i dest = xmm_mask;
6102 __m128i alpha_dst = unpack_32_1x128 (d);
6104 *dst = pack_1x128_32
6105 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6115 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
6117 if (!is_zero (xmm_src))
6119 __m128i xmm_src_lo, xmm_src_hi;
6120 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6121 __m128i xmm_alpha_lo, xmm_alpha_hi;
6123 xmm_dst = load_128_aligned ((__m128i*)dst);
6125 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6126 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6127 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6128 &xmm_alpha_lo, &xmm_alpha_hi);
6130 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6131 &xmm_alpha_lo, &xmm_alpha_hi,
6132 &xmm_mask, &xmm_mask,
6133 &xmm_dst_lo, &xmm_dst_hi);
6136 ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6145 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6150 __m128i ms = unpack_32_1x128 (pix1);
6151 __m128i alpha = expand_alpha_1x128 (ms);
6152 __m128i dest = xmm_mask;
6153 __m128i alpha_dst = unpack_32_1x128 (d);
6155 *dst = pack_1x128_32
6156 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6164 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6165 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6166 uint32_t, uint32_t, uint32_t,
6167 COVER, FLAG_HAVE_SOLID_MASK)
6168 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6169 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6170 uint32_t, uint32_t, uint32_t,
6171 PAD, FLAG_HAVE_SOLID_MASK)
6172 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6173 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6174 uint32_t, uint32_t, uint32_t,
6175 NONE, FLAG_HAVE_SOLID_MASK)
6176 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6177 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6178 uint32_t, uint32_t, uint32_t,
6179 NORMAL, FLAG_HAVE_SOLID_MASK)
6181 static const pixman_fast_path_t sse2_fast_paths[] =
6183 /* PIXMAN_OP_OVER */
6184 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6185 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6186 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6187 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6188 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6189 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6190 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6191 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6192 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6193 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6194 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6195 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6196 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6197 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6198 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6199 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6200 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6201 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6202 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6203 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6204 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6205 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6206 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6207 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6208 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6209 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6210 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6211 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6212 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6213 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6214 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6215 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6216 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6217 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6218 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6219 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6220 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6221 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6222 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6223 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6224 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6225 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6226 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6227 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6228 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6229 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6230 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6232 /* PIXMAN_OP_OVER_REVERSE */
6233 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6234 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6237 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6238 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6239 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6240 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6241 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6242 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6243 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6244 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6245 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6246 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6247 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6248 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6249 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6250 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6253 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6254 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6255 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6256 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6257 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6258 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6259 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6260 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6261 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6262 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6263 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6264 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6265 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6266 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6267 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6268 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6269 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6270 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6273 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6274 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6275 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6277 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6278 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6279 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6280 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6281 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6282 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6283 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6284 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6285 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6286 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6287 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6288 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6289 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6290 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6291 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6292 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6294 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6295 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6296 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6297 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6298 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6299 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6300 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6301 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6303 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6304 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6305 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
6306 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6307 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6308 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
6310 SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6311 SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6312 SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6313 SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6314 SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6315 SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6317 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6318 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6319 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6320 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6322 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6323 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6324 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6325 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6327 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
6328 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
6329 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
6330 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
6336 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6338 int w = iter->width;
6339 __m128i ff000000 = mask_ff000000;
6340 uint32_t *dst = iter->buffer;
6341 uint32_t *src = (uint32_t *)iter->bits;
6343 iter->bits += iter->stride;
6345 while (w && ((uintptr_t)dst) & 0x0f)
6347 *dst++ = (*src++) | 0xff000000;
6354 (__m128i *)dst, _mm_or_si128 (
6355 load_128_unaligned ((__m128i *)src), ff000000));
6364 *dst++ = (*src++) | 0xff000000;
6368 return iter->buffer;
6372 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6374 int w = iter->width;
6375 uint32_t *dst = iter->buffer;
6376 uint16_t *src = (uint16_t *)iter->bits;
6377 __m128i ff000000 = mask_ff000000;
6379 iter->bits += iter->stride;
6381 while (w && ((uintptr_t)dst) & 0x0f)
6383 uint16_t s = *src++;
6385 *dst++ = convert_0565_to_8888 (s);
6393 s = _mm_loadu_si128 ((__m128i *)src);
6395 lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6396 hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6398 save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6399 save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6408 uint16_t s = *src++;
6410 *dst++ = convert_0565_to_8888 (s);
6414 return iter->buffer;
6418 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6420 int w = iter->width;
6421 uint32_t *dst = iter->buffer;
6422 uint8_t *src = iter->bits;
6423 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6425 iter->bits += iter->stride;
6427 while (w && (((uintptr_t)dst) & 15))
6429 *dst++ = *(src++) << 24;
6435 xmm0 = _mm_loadu_si128((__m128i *)src);
6437 xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
6438 xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
6439 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6440 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6441 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6442 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6444 _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
6445 _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
6446 _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
6447 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6456 *dst++ = *(src++) << 24;
6460 return iter->buffer;
6463 #define IMAGE_FLAGS \
6464 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
6465 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
6467 static const pixman_iter_info_t sse2_iters[] =
6469 { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
6470 _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
6472 { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
6473 _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
6475 { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
6476 _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
6481 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6482 __attribute__((__force_align_arg_pointer__))
6484 pixman_implementation_t *
6485 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6487 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6489 /* SSE2 constants */
6490 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6491 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6492 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6493 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6494 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6495 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6496 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6497 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6498 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
6499 mask_0080 = create_mask_16_128 (0x0080);
6500 mask_00ff = create_mask_16_128 (0x00ff);
6501 mask_0101 = create_mask_16_128 (0x0101);
6502 mask_ffff = create_mask_16_128 (0xffff);
6503 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6504 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6505 mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
6506 mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
6508 /* Set up function pointers */
6509 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6510 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6511 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6512 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6513 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6514 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6515 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6516 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6517 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6518 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6520 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6522 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6523 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6524 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6525 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6526 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6527 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6528 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6529 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6530 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6531 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6532 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6534 imp->blt = sse2_blt;
6535 imp->fill = sse2_fill;
6537 imp->iter_info = sse2_iters;