2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
52 #include "pixman-private.h"
53 #include "pixman-arm-neon-asm.h"
55 /* Global configuration options and preferences */
58 * The code can optionally make use of unaligned memory accesses to improve
59 * performance of handling leading/trailing pixels for each scanline.
60 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
61 * example in linux if unaligned memory accesses are not configured to
62 * generate.exceptions.
64 .set RESPECT_STRICT_ALIGNMENT, 1
67 * Set default prefetch type. There is a choice between the following options:
69 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
70 * as NOP to workaround some HW bugs or for whatever other reason)
72 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
73 * advanced prefetch intruduces heavy overhead)
75 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
76 * which can run ARM and NEON instructions simultaneously so that extra ARM
77 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
79 * Note: some types of function can't support advanced prefetch and fallback
80 * to simple one (those which handle 24bpp pixels)
82 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
84 /* Prefetch distance in pixels for simple prefetch */
85 .set PREFETCH_DISTANCE_SIMPLE, 64
88 * Implementation of pixman_composite_over_8888_0565_asm_neon
90 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
91 * performs OVER compositing operation. Function fast_composite_over_8888_0565
92 * from pixman-fast-path.c does the same in C and can be used as a reference.
94 * First we need to have some NEON assembly code which can do the actual
95 * operation on the pixels and provide it to the template macro.
97 * Template macro quite conveniently takes care of emitting all the necessary
98 * code for memory reading and writing (including quite tricky cases of
99 * handling unaligned leading/trailing pixels), so we only need to deal with
100 * the data in NEON registers.
102 * NEON registers allocation in general is recommented to be the following:
103 * d0, d1, d2, d3 - contain loaded source pixel data
104 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
105 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
106 * d28, d29, d30, d31 - place for storing the result (destination pixels)
108 * As can be seen above, four 64-bit NEON registers are used for keeping
109 * intermediate pixel data and up to 8 pixels can be processed in one step
110 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
112 * This particular function uses the following registers allocation:
113 * d0, d1, d2, d3 - contain loaded source pixel data
114 * d4, d5 - contain loaded destination pixels (they are needed)
115 * d28, d29 - place for storing the result (destination pixels)
119 * Step one. We need to have some code to do some arithmetics on pixel data.
120 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
121 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
122 * perform all the needed calculations and write the result to {d28, d29}.
123 * The rationale for having two macros and not just one will be explained
124 * later. In practice, any single monolitic function which does the work can
125 * be split into two parts in any arbitrary way without affecting correctness.
127 * There is one special trick here too. Common template macro can optionally
128 * make our life a bit easier by doing R, G, B, A color components
129 * deinterleaving for 32bpp pixel formats (and this feature is used in
130 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
131 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
132 * actually use d0 register for blue channel (a vector of eight 8-bit
133 * values), d1 register for green, d2 for red and d3 for alpha. This
134 * simple conversion can be also done with a few NEON instructions:
136 * Packed to planar conversion:
142 * Planar to packed conversion:
148 * But pixel can be loaded directly in planar format using VLD4.8 NEON
149 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
150 * desirable, that's why deinterleaving is optional.
152 * But anyway, here is the code:
154 .macro pixman_composite_over_8888_0565_process_pixblock_head
155 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
156 and put data into d6 - red, d7 - green, d30 - blue */
161 vmvn.8 d3, d3 /* invert source alpha */
163 vshrn.u16 d30, q2, #2
164 /* now do alpha blending, storing results in 8-bit planar format
165 into d16 - red, d19 - green, d18 - blue */
168 vmull.u8 q12, d3, d30
169 vrshr.u16 q13, q10, #8
170 vrshr.u16 q3, q11, #8
171 vrshr.u16 q15, q12, #8
172 vraddhn.u16 d20, q10, q13
173 vraddhn.u16 d23, q11, q3
174 vraddhn.u16 d22, q12, q15
177 .macro pixman_composite_over_8888_0565_process_pixblock_tail
178 /* ... continue alpha blending */
179 vqadd.u8 d16, d2, d20
181 /* convert the result to r5g6b5 and store it into {d28, d29} */
182 vshll.u8 q14, d16, #8
186 vsri.u16 q14, q9, #11
190 * OK, now we got almost everything that we need. Using the above two
191 * macros, the work can be done right. But now we want to optimize
192 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
193 * a lot from good code scheduling and software pipelining.
195 * Let's construct some code, which will run in the core main loop.
196 * Some pseudo-code of the main loop will look like this:
204 * It may look a bit weird, but this setup allows to hide instruction
205 * latencies better and also utilize dual-issue capability more
206 * efficiently (make pairs of load-store and ALU instructions).
208 * So what we need now is a '*_tail_head' macro, which will be used
209 * in the core main loop. A trivial straightforward implementation
210 * of this macro would look like this:
212 * pixman_composite_over_8888_0565_process_pixblock_tail
213 * vst1.16 {d28, d29}, [DST_W, :128]!
214 * vld1.16 {d4, d5}, [DST_R, :128]!
215 * vld4.32 {d0, d1, d2, d3}, [SRC]!
216 * pixman_composite_over_8888_0565_process_pixblock_head
219 * Now it also got some VLD/VST instructions. We simply can't move from
220 * processing one block of pixels to the other one with just arithmetics.
221 * The previously processed data needs to be written to memory and new
222 * data needs to be fetched. Fortunately, this main loop does not deal
223 * with partial leading/trailing pixels and can load/store a full block
224 * of pixels in a bulk. Additionally, destination buffer is already
225 * 16 bytes aligned here (which is good for performance).
227 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
228 * are the aliases for ARM registers which are used as pointers for
229 * accessing data. We maintain separate pointers for reading and writing
230 * destination buffer (DST_R and DST_W).
232 * Another new thing is 'cache_preload' macro. It is used for prefetching
233 * data into CPU L2 cache and improve performance when dealing with large
234 * images which are far larger than cache size. It uses one argument
235 * (actually two, but they need to be the same here) - number of pixels
236 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
237 * details about this macro. Moreover, if good performance is needed
238 * the code from this macro needs to be copied into '*_tail_head' macro
239 * and mixed with the rest of code for optimal instructions scheduling.
240 * We are actually doing it below.
242 * Now after all the explanations, here is the optimized code.
243 * Different instruction streams (originaling from '*_head', '*_tail'
244 * and 'cache_preload' macro) use different indentation levels for
245 * better readability. Actually taking the code from one of these
246 * indentation levels and ignoring a few VLD/VST instructions would
247 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
253 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
254 vqadd.u8 d16, d2, d20
255 vld1.16 {d4, d5}, [DST_R, :128]!
261 vshll.u8 q14, d16, #8
262 PF add PF_X, PF_X, #8
266 PF addne PF_X, PF_X, #8
268 PF subne PF_CTL, PF_CTL, #1
270 vshrn.u16 d30, q2, #2
272 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
274 vmull.u8 q12, d3, d30
275 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
279 vrshr.u16 q13, q10, #8
280 PF subge PF_X, PF_X, ORIG_W
281 vrshr.u16 q3, q11, #8
282 vrshr.u16 q15, q12, #8
283 PF subges PF_CTL, PF_CTL, #0x10
284 vsri.u16 q14, q9, #11
285 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
286 vraddhn.u16 d20, q10, q13
287 vraddhn.u16 d23, q11, q3
288 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
289 vraddhn.u16 d22, q12, q15
290 vst1.16 {d28, d29}, [DST_W, :128]!
295 /* If we did not care much about the performance, we would just use this... */
296 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
297 pixman_composite_over_8888_0565_process_pixblock_tail
298 vst1.16 {d28, d29}, [DST_W, :128]!
299 vld1.16 {d4, d5}, [DST_R, :128]!
301 pixman_composite_over_8888_0565_process_pixblock_head
308 * And now the final part. We are using 'generate_composite_function' macro
309 * to put all the stuff together. We are specifying the name of the function
310 * which we want to get, number of bits per pixel for the source, mask and
311 * destination (0 if unused, like mask in this case). Next come some bit
313 * FLAG_DST_READWRITE - tells that the destination buffer is both read
314 * and written, for write-only buffer we would use
315 * FLAG_DST_WRITEONLY flag instead
316 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
317 * and separate color channels for 32bpp format.
318 * The next things are:
319 * - the number of pixels processed per iteration (8 in this case, because
320 * that's the maximum what can fit into four 64-bit NEON registers).
321 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
322 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
323 * prefetch distance can be selected by running some benchmarks.
325 * After that we specify some macros, these are 'default_init',
326 * 'default_cleanup' here which are empty (but it is possible to have custom
327 * init/cleanup macros to be able to save/restore some extra NEON registers
328 * like d8-d15 or do anything else) followed by
329 * 'pixman_composite_over_8888_0565_process_pixblock_head',
330 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
331 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
332 * which we got implemented above.
334 * The last part is the NEON registers allocation scheme.
336 generate_composite_function \
337 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
338 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
339 8, /* number of pixels, processed in a single block */ \
340 5, /* prefetch distance */ \
343 pixman_composite_over_8888_0565_process_pixblock_head, \
344 pixman_composite_over_8888_0565_process_pixblock_tail, \
345 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
346 28, /* dst_w_basereg */ \
347 4, /* dst_r_basereg */ \
348 0, /* src_basereg */ \
349 24 /* mask_basereg */
351 /******************************************************************************/
353 .macro pixman_composite_over_n_0565_process_pixblock_head
354 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
355 and put data into d6 - red, d7 - green, d30 - blue */
361 vshrn.u16 d30, q2, #2
362 /* now do alpha blending, storing results in 8-bit planar format
363 into d16 - red, d19 - green, d18 - blue */
366 vmull.u8 q12, d3, d30
367 vrshr.u16 q13, q10, #8
368 vrshr.u16 q3, q11, #8
369 vrshr.u16 q15, q12, #8
370 vraddhn.u16 d20, q10, q13
371 vraddhn.u16 d23, q11, q3
372 vraddhn.u16 d22, q12, q15
375 .macro pixman_composite_over_n_0565_process_pixblock_tail
376 /* ... continue alpha blending */
377 vqadd.u8 d16, d2, d20
379 /* convert the result to r5g6b5 and store it into {d28, d29} */
380 vshll.u8 q14, d16, #8
384 vsri.u16 q14, q9, #11
387 /* TODO: expand macros and do better instructions scheduling */
388 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
389 pixman_composite_over_n_0565_process_pixblock_tail
390 vld1.16 {d4, d5}, [DST_R, :128]!
391 vst1.16 {d28, d29}, [DST_W, :128]!
392 pixman_composite_over_n_0565_process_pixblock_head
396 .macro pixman_composite_over_n_0565_init
397 add DUMMY, sp, #ARGS_STACK_OFFSET
398 vld1.32 {d3[0]}, [DUMMY]
403 vmvn.8 d3, d3 /* invert source alpha */
406 generate_composite_function \
407 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
408 FLAG_DST_READWRITE, \
409 8, /* number of pixels, processed in a single block */ \
410 5, /* prefetch distance */ \
411 pixman_composite_over_n_0565_init, \
413 pixman_composite_over_n_0565_process_pixblock_head, \
414 pixman_composite_over_n_0565_process_pixblock_tail, \
415 pixman_composite_over_n_0565_process_pixblock_tail_head, \
416 28, /* dst_w_basereg */ \
417 4, /* dst_r_basereg */ \
418 0, /* src_basereg */ \
419 24 /* mask_basereg */
421 /******************************************************************************/
423 .macro pixman_composite_src_8888_0565_process_pixblock_head
429 .macro pixman_composite_src_8888_0565_process_pixblock_tail
431 vsri.u16 q14, q9, #11
434 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
436 PF add PF_X, PF_X, #8
439 PF addne PF_X, PF_X, #8
440 PF subne PF_CTL, PF_CTL, #1
441 vsri.u16 q14, q9, #11
443 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
445 vst1.16 {d28, d29}, [DST_W, :128]!
446 PF subge PF_X, PF_X, ORIG_W
447 PF subges PF_CTL, PF_CTL, #0x10
449 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
453 generate_composite_function \
454 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
455 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
456 8, /* number of pixels, processed in a single block */ \
457 10, /* prefetch distance */ \
460 pixman_composite_src_8888_0565_process_pixblock_head, \
461 pixman_composite_src_8888_0565_process_pixblock_tail, \
462 pixman_composite_src_8888_0565_process_pixblock_tail_head
464 /******************************************************************************/
466 .macro pixman_composite_src_0565_8888_process_pixblock_head
467 vshrn.u16 d30, q0, #8
468 vshrn.u16 d29, q0, #3
473 vshrn.u16 d28, q0, #2
476 .macro pixman_composite_src_0565_8888_process_pixblock_tail
479 /* TODO: expand macros and do better instructions scheduling */
480 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
481 pixman_composite_src_0565_8888_process_pixblock_tail
482 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
484 pixman_composite_src_0565_8888_process_pixblock_head
488 generate_composite_function \
489 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
490 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
491 8, /* number of pixels, processed in a single block */ \
492 10, /* prefetch distance */ \
495 pixman_composite_src_0565_8888_process_pixblock_head, \
496 pixman_composite_src_0565_8888_process_pixblock_tail, \
497 pixman_composite_src_0565_8888_process_pixblock_tail_head
499 /******************************************************************************/
501 .macro pixman_composite_add_8_8_process_pixblock_head
506 .macro pixman_composite_add_8_8_process_pixblock_tail
509 .macro pixman_composite_add_8_8_process_pixblock_tail_head
511 PF add PF_X, PF_X, #32
513 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
514 PF addne PF_X, PF_X, #32
515 PF subne PF_CTL, PF_CTL, #1
516 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
518 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
519 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
520 PF subge PF_X, PF_X, ORIG_W
521 PF subges PF_CTL, PF_CTL, #0x10
523 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
524 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
528 generate_composite_function \
529 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
530 FLAG_DST_READWRITE, \
531 32, /* number of pixels, processed in a single block */ \
532 10, /* prefetch distance */ \
535 pixman_composite_add_8_8_process_pixblock_head, \
536 pixman_composite_add_8_8_process_pixblock_tail, \
537 pixman_composite_add_8_8_process_pixblock_tail_head
539 /******************************************************************************/
541 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
543 PF add PF_X, PF_X, #8
545 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
546 PF addne PF_X, PF_X, #8
547 PF subne PF_CTL, PF_CTL, #1
548 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
550 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
551 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
552 PF subge PF_X, PF_X, ORIG_W
553 PF subges PF_CTL, PF_CTL, #0x10
555 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
556 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
560 generate_composite_function \
561 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
562 FLAG_DST_READWRITE, \
563 8, /* number of pixels, processed in a single block */ \
564 10, /* prefetch distance */ \
567 pixman_composite_add_8_8_process_pixblock_head, \
568 pixman_composite_add_8_8_process_pixblock_tail, \
569 pixman_composite_add_8888_8888_process_pixblock_tail_head
571 generate_composite_function_single_scanline \
572 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
573 FLAG_DST_READWRITE, \
574 8, /* number of pixels, processed in a single block */ \
577 pixman_composite_add_8_8_process_pixblock_head, \
578 pixman_composite_add_8_8_process_pixblock_tail, \
579 pixman_composite_add_8888_8888_process_pixblock_tail_head
581 /******************************************************************************/
583 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
584 vmvn.8 d24, d3 /* get inverted alpha */
585 /* do alpha blending */
588 vmull.u8 q10, d24, d6
589 vmull.u8 q11, d24, d7
592 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
593 vrshr.u16 q14, q8, #8
594 vrshr.u16 q15, q9, #8
595 vrshr.u16 q12, q10, #8
596 vrshr.u16 q13, q11, #8
597 vraddhn.u16 d28, q14, q8
598 vraddhn.u16 d29, q15, q9
599 vraddhn.u16 d30, q12, q10
600 vraddhn.u16 d31, q13, q11
603 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
604 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
605 vrshr.u16 q14, q8, #8
606 PF add PF_X, PF_X, #8
608 vrshr.u16 q15, q9, #8
609 vrshr.u16 q12, q10, #8
610 vrshr.u16 q13, q11, #8
611 PF addne PF_X, PF_X, #8
612 PF subne PF_CTL, PF_CTL, #1
613 vraddhn.u16 d28, q14, q8
614 vraddhn.u16 d29, q15, q9
616 vraddhn.u16 d30, q12, q10
617 vraddhn.u16 d31, q13, q11
619 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
621 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
622 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
623 PF subge PF_X, PF_X, ORIG_W
625 PF subges PF_CTL, PF_CTL, #0x10
627 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
628 vmull.u8 q10, d22, d6
629 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
630 vmull.u8 q11, d22, d7
633 generate_composite_function_single_scanline \
634 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
635 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
636 8, /* number of pixels, processed in a single block */ \
639 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
640 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
641 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
643 /******************************************************************************/
645 .macro pixman_composite_over_8888_8888_process_pixblock_head
646 pixman_composite_out_reverse_8888_8888_process_pixblock_head
649 .macro pixman_composite_over_8888_8888_process_pixblock_tail
650 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
651 vqadd.u8 q14, q0, q14
652 vqadd.u8 q15, q1, q15
655 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
656 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
657 vrshr.u16 q14, q8, #8
658 PF add PF_X, PF_X, #8
660 vrshr.u16 q15, q9, #8
661 vrshr.u16 q12, q10, #8
662 vrshr.u16 q13, q11, #8
663 PF addne PF_X, PF_X, #8
664 PF subne PF_CTL, PF_CTL, #1
665 vraddhn.u16 d28, q14, q8
666 vraddhn.u16 d29, q15, q9
668 vraddhn.u16 d30, q12, q10
669 vraddhn.u16 d31, q13, q11
670 vqadd.u8 q14, q0, q14
671 vqadd.u8 q15, q1, q15
673 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
675 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
676 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
677 PF subge PF_X, PF_X, ORIG_W
679 PF subges PF_CTL, PF_CTL, #0x10
681 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
682 vmull.u8 q10, d22, d6
683 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
684 vmull.u8 q11, d22, d7
687 generate_composite_function \
688 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
689 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
690 8, /* number of pixels, processed in a single block */ \
691 5, /* prefetch distance */ \
694 pixman_composite_over_8888_8888_process_pixblock_head, \
695 pixman_composite_over_8888_8888_process_pixblock_tail, \
696 pixman_composite_over_8888_8888_process_pixblock_tail_head
698 generate_composite_function_single_scanline \
699 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
700 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
701 8, /* number of pixels, processed in a single block */ \
704 pixman_composite_over_8888_8888_process_pixblock_head, \
705 pixman_composite_over_8888_8888_process_pixblock_tail, \
706 pixman_composite_over_8888_8888_process_pixblock_tail_head
708 /******************************************************************************/
710 .macro pixman_composite_over_n_8888_process_pixblock_head
711 /* deinterleaved source pixels in {d0, d1, d2, d3} */
712 /* inverted alpha in {d24} */
713 /* destination pixels in {d4, d5, d6, d7} */
716 vmull.u8 q10, d24, d6
717 vmull.u8 q11, d24, d7
720 .macro pixman_composite_over_n_8888_process_pixblock_tail
721 vrshr.u16 q14, q8, #8
722 vrshr.u16 q15, q9, #8
723 vrshr.u16 q2, q10, #8
724 vrshr.u16 q3, q11, #8
725 vraddhn.u16 d28, q14, q8
726 vraddhn.u16 d29, q15, q9
727 vraddhn.u16 d30, q2, q10
728 vraddhn.u16 d31, q3, q11
729 vqadd.u8 q14, q0, q14
730 vqadd.u8 q15, q1, q15
733 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
734 vrshr.u16 q14, q8, #8
735 vrshr.u16 q15, q9, #8
736 vrshr.u16 q2, q10, #8
737 vrshr.u16 q3, q11, #8
738 vraddhn.u16 d28, q14, q8
739 vraddhn.u16 d29, q15, q9
740 vraddhn.u16 d30, q2, q10
741 vraddhn.u16 d31, q3, q11
742 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
743 vqadd.u8 q14, q0, q14
744 PF add PF_X, PF_X, #8
746 PF addne PF_X, PF_X, #8
747 PF subne PF_CTL, PF_CTL, #1
748 vqadd.u8 q15, q1, q15
751 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
753 PF subge PF_X, PF_X, ORIG_W
754 vmull.u8 q10, d24, d6
755 PF subges PF_CTL, PF_CTL, #0x10
756 vmull.u8 q11, d24, d7
757 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
758 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
761 .macro pixman_composite_over_n_8888_init
762 add DUMMY, sp, #ARGS_STACK_OFFSET
763 vld1.32 {d3[0]}, [DUMMY]
768 vmvn.8 d24, d3 /* get inverted alpha */
771 generate_composite_function \
772 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
773 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
774 8, /* number of pixels, processed in a single block */ \
775 5, /* prefetch distance */ \
776 pixman_composite_over_n_8888_init, \
778 pixman_composite_over_8888_8888_process_pixblock_head, \
779 pixman_composite_over_8888_8888_process_pixblock_tail, \
780 pixman_composite_over_n_8888_process_pixblock_tail_head
782 /******************************************************************************/
784 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
785 vrshr.u16 q14, q8, #8
786 PF add PF_X, PF_X, #8
788 vrshr.u16 q15, q9, #8
789 vrshr.u16 q12, q10, #8
790 vrshr.u16 q13, q11, #8
791 PF addne PF_X, PF_X, #8
792 PF subne PF_CTL, PF_CTL, #1
793 vraddhn.u16 d28, q14, q8
794 vraddhn.u16 d29, q15, q9
796 vraddhn.u16 d30, q12, q10
797 vraddhn.u16 d31, q13, q11
798 vqadd.u8 q14, q0, q14
799 vqadd.u8 q15, q1, q15
800 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
802 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
803 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
804 PF subge PF_X, PF_X, ORIG_W
806 PF subges PF_CTL, PF_CTL, #0x10
808 vmull.u8 q10, d22, d6
809 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
810 vmull.u8 q11, d22, d7
813 .macro pixman_composite_over_reverse_n_8888_init
814 add DUMMY, sp, #ARGS_STACK_OFFSET
815 vld1.32 {d7[0]}, [DUMMY]
822 generate_composite_function \
823 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
824 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
825 8, /* number of pixels, processed in a single block */ \
826 5, /* prefetch distance */ \
827 pixman_composite_over_reverse_n_8888_init, \
829 pixman_composite_over_8888_8888_process_pixblock_head, \
830 pixman_composite_over_8888_8888_process_pixblock_tail, \
831 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
832 28, /* dst_w_basereg */ \
833 0, /* dst_r_basereg */ \
834 4, /* src_basereg */ \
835 24 /* mask_basereg */
837 /******************************************************************************/
839 .macro pixman_composite_over_8888_8_0565_process_pixblock_head
840 vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
842 vmull.u8 q6, d24, d10
843 vmull.u8 q7, d24, d11
844 vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
847 vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
849 vrshr.u16 q10, q6, #8
850 vrshr.u16 q11, q7, #8
851 vraddhn.u16 d0, q0, q8
852 vraddhn.u16 d1, q1, q9
853 vraddhn.u16 d2, q6, q10
854 vraddhn.u16 d3, q7, q11
855 vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
858 vshrn.u16 d30, q2, #2
859 vmull.u8 q8, d3, d6 /* now do alpha blending */
861 vmull.u8 q10, d3, d30
864 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail
865 /* 3 cycle bubble (after vmull.u8) */
866 vrshr.u16 q13, q8, #8
867 vrshr.u16 q11, q9, #8
868 vrshr.u16 q15, q10, #8
869 vraddhn.u16 d16, q8, q13
870 vraddhn.u16 d27, q9, q11
871 vraddhn.u16 d26, q10, q15
872 vqadd.u8 d16, d2, d16
875 vshll.u8 q14, d16, #8 /* convert to 16bpp */
880 vsri.u16 q14, q9, #11
883 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
884 vld1.16 {d4, d5}, [DST_R, :128]!
889 vmull.u8 q6, d24, d10
890 vrshr.u16 q13, q8, #8
891 vrshr.u16 q11, q9, #8
892 vrshr.u16 q15, q10, #8
893 vraddhn.u16 d16, q8, q13
894 vraddhn.u16 d27, q9, q11
895 vraddhn.u16 d26, q10, q15
896 vqadd.u8 d16, d2, d16
899 vshll.u8 q14, d16, #8
904 vmull.u8 q7, d24, d11
905 vsri.u16 q14, q9, #11
912 vrshr.u16 q10, q6, #8
913 vrshr.u16 q11, q7, #8
914 vraddhn.u16 d0, q0, q8
915 vraddhn.u16 d1, q1, q9
916 vraddhn.u16 d2, q6, q10
917 vraddhn.u16 d3, q7, q11
921 vshrn.u16 d30, q2, #2
922 vst1.16 {d28, d29}, [DST_W, :128]!
925 vmull.u8 q10, d3, d30
928 generate_composite_function \
929 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
930 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
931 8, /* number of pixels, processed in a single block */ \
932 5, /* prefetch distance */ \
933 default_init_need_all_regs, \
934 default_cleanup_need_all_regs, \
935 pixman_composite_over_8888_8_0565_process_pixblock_head, \
936 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
937 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
938 28, /* dst_w_basereg */ \
939 4, /* dst_r_basereg */ \
940 8, /* src_basereg */ \
941 24 /* mask_basereg */
943 /******************************************************************************/
946 * This function needs a special initialization of solid mask.
947 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
948 * offset, split into color components and replicated in d8-d11
949 * registers. Additionally, this function needs all the NEON registers,
950 * so it has to save d8-d15 registers which are callee saved according
951 * to ABI. These registers are restored from 'cleanup' macro. All the
952 * other NEON registers are caller saved, so can be clobbered freely
953 * without introducing any problems.
955 .macro pixman_composite_over_n_8_0565_init
956 add DUMMY, sp, #ARGS_STACK_OFFSET
958 vld1.32 {d11[0]}, [DUMMY]
965 .macro pixman_composite_over_n_8_0565_cleanup
969 generate_composite_function \
970 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
971 FLAG_DST_READWRITE, \
972 8, /* number of pixels, processed in a single block */ \
973 5, /* prefetch distance */ \
974 pixman_composite_over_n_8_0565_init, \
975 pixman_composite_over_n_8_0565_cleanup, \
976 pixman_composite_over_8888_8_0565_process_pixblock_head, \
977 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
978 pixman_composite_over_8888_8_0565_process_pixblock_tail_head
980 /******************************************************************************/
982 .macro pixman_composite_over_8888_n_0565_init
983 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
985 vld1.32 {d24[0]}, [DUMMY]
989 .macro pixman_composite_over_8888_n_0565_cleanup
993 generate_composite_function \
994 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
995 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
996 8, /* number of pixels, processed in a single block */ \
997 5, /* prefetch distance */ \
998 pixman_composite_over_8888_n_0565_init, \
999 pixman_composite_over_8888_n_0565_cleanup, \
1000 pixman_composite_over_8888_8_0565_process_pixblock_head, \
1001 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
1002 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
1003 28, /* dst_w_basereg */ \
1004 4, /* dst_r_basereg */ \
1005 8, /* src_basereg */ \
1006 24 /* mask_basereg */
1008 /******************************************************************************/
/*
 * src_0565_0565: plain r5g6b5 -> r5g6b5 copy.  head/tail have no work to do
 * (bodies empty/elided); tail_head just stores the 16 loaded pixels and
 * issues the prefetch for the next block.
 */
1010 .macro pixman_composite_src_0565_0565_process_pixblock_head
1013 .macro pixman_composite_src_0565_0565_process_pixblock_tail
1016 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
1017 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1019 cache_preload 16, 16
/* src_bpp=16, mask_bpp=0, dst_bpp=16; init/cleanup arguments are elided in this view. */
1022 generate_composite_function \
1023 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
1024 FLAG_DST_WRITEONLY, \
1025 16, /* number of pixels, processed in a single block */ \
1026 10, /* prefetch distance */ \
1029 pixman_composite_src_0565_0565_process_pixblock_head, \
1030 pixman_composite_src_0565_0565_process_pixblock_tail, \
1031 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
1032 0, /* dst_w_basereg */ \
1033 0, /* dst_r_basereg */ \
1034 0, /* src_basereg */ \
1035 0 /* mask_basereg */
1037 /******************************************************************************/
/*
 * src_n_8: fill an a8 destination with a solid value.  No per-pixel work is
 * needed, so head/tail are empty and tail_head only stores 32 bytes.
 */
1039 .macro pixman_composite_src_n_8_process_pixblock_head
1042 .macro pixman_composite_src_n_8_process_pixblock_tail
1045 .macro pixman_composite_src_n_8_process_pixblock_tail_head
1046 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * init loads the solid value into d0[0] and replicates it across d0 with
 * vsli shifts; additional splat/copy steps appear elided in this view --
 * TODO confirm the full replication into d1-d3 against upstream.
 */
1049 .macro pixman_composite_src_n_8_init
1050 add DUMMY, sp, #ARGS_STACK_OFFSET
1051 vld1.32 {d0[0]}, [DUMMY]
1053 vsli.u64 d0, d0, #16
1054 vsli.u64 d0, d0, #32
/* Paired cleanup macro; its body is not visible in this view. */
1059 .macro pixman_composite_src_n_8_cleanup
/* Solid fill: src_bpp=0, mask_bpp=0, dst_bpp=8; no prefetch needed (distance 0). */
1062 generate_composite_function \
1063 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1064 FLAG_DST_WRITEONLY, \
1065 32, /* number of pixels, processed in a single block */ \
1066 0, /* prefetch distance */ \
1067 pixman_composite_src_n_8_init, \
1068 pixman_composite_src_n_8_cleanup, \
1069 pixman_composite_src_n_8_process_pixblock_head, \
1070 pixman_composite_src_n_8_process_pixblock_tail, \
1071 pixman_composite_src_n_8_process_pixblock_tail_head, \
1072 0, /* dst_w_basereg */ \
1073 0, /* dst_r_basereg */ \
1074 0, /* src_basereg */ \
1075 0 /* mask_basereg */
1077 /******************************************************************************/
/*
 * src_n_0565: fill an r5g6b5 destination with a solid value.  head/tail are
 * empty; tail_head stores 16 pixels per iteration.
 */
1079 .macro pixman_composite_src_n_0565_process_pixblock_head
1082 .macro pixman_composite_src_n_0565_process_pixblock_tail
1085 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
1086 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * init replicates the 16-bit solid value across d0 via vsli; the copies
 * into d1-d3 are not visible in this view -- TODO confirm upstream.
 */
1089 .macro pixman_composite_src_n_0565_init
1090 add DUMMY, sp, #ARGS_STACK_OFFSET
1091 vld1.32 {d0[0]}, [DUMMY]
1092 vsli.u64 d0, d0, #16
1093 vsli.u64 d0, d0, #32
/* Paired cleanup macro; its body is not visible in this view. */
1098 .macro pixman_composite_src_n_0565_cleanup
/* Solid fill: src_bpp=0, mask_bpp=0, dst_bpp=16; prefetch distance 0. */
1101 generate_composite_function \
1102 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1103 FLAG_DST_WRITEONLY, \
1104 16, /* number of pixels, processed in a single block */ \
1105 0, /* prefetch distance */ \
1106 pixman_composite_src_n_0565_init, \
1107 pixman_composite_src_n_0565_cleanup, \
1108 pixman_composite_src_n_0565_process_pixblock_head, \
1109 pixman_composite_src_n_0565_process_pixblock_tail, \
1110 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1111 0, /* dst_w_basereg */ \
1112 0, /* dst_r_basereg */ \
1113 0, /* src_basereg */ \
1114 0 /* mask_basereg */
1116 /******************************************************************************/
/*
 * src_n_8888: fill an a8r8g8b8 destination with a solid value.  head/tail
 * are empty; tail_head stores 8 pixels per iteration.
 */
1118 .macro pixman_composite_src_n_8888_process_pixblock_head
1121 .macro pixman_composite_src_n_8888_process_pixblock_tail
1124 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
1125 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * init replicates the 32-bit solid pixel across d0; the copies into
 * d1-d3 are not visible in this view -- TODO confirm upstream.
 */
1128 .macro pixman_composite_src_n_8888_init
1129 add DUMMY, sp, #ARGS_STACK_OFFSET
1130 vld1.32 {d0[0]}, [DUMMY]
1131 vsli.u64 d0, d0, #32
/* Paired cleanup macro; its body is not visible in this view. */
1136 .macro pixman_composite_src_n_8888_cleanup
/* Solid fill: src_bpp=0, mask_bpp=0, dst_bpp=32; prefetch distance 0. */
1139 generate_composite_function \
1140 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1141 FLAG_DST_WRITEONLY, \
1142 8, /* number of pixels, processed in a single block */ \
1143 0, /* prefetch distance */ \
1144 pixman_composite_src_n_8888_init, \
1145 pixman_composite_src_n_8888_cleanup, \
1146 pixman_composite_src_n_8888_process_pixblock_head, \
1147 pixman_composite_src_n_8888_process_pixblock_tail, \
1148 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1149 0, /* dst_w_basereg */ \
1150 0, /* dst_r_basereg */ \
1151 0, /* src_basereg */ \
1152 0 /* mask_basereg */
1154 /******************************************************************************/
/*
 * src_8888_8888: straight 32bpp -> 32bpp copy; head/tail empty, tail_head
 * stores the 8 loaded pixels.  init/cleanup args are elided in this view.
 */
1156 .macro pixman_composite_src_8888_8888_process_pixblock_head
1159 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1162 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1163 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1168 generate_composite_function \
1169 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1170 FLAG_DST_WRITEONLY, \
1171 8, /* number of pixels, processed in a single block */ \
1172 10, /* prefetch distance */ \
1175 pixman_composite_src_8888_8888_process_pixblock_head, \
1176 pixman_composite_src_8888_8888_process_pixblock_tail, \
1177 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1178 0, /* dst_w_basereg */ \
1179 0, /* dst_r_basereg */ \
1180 0, /* src_basereg */ \
1181 0 /* mask_basereg */
1183 /******************************************************************************/
/*
 * src_x888_8888: copy x8r8g8b8 to a8r8g8b8, forcing the alpha byte on.
 * The head body (ORing the alpha mask from q2 into the pixels) is elided
 * in this view -- TODO confirm against upstream.
 */
1185 .macro pixman_composite_src_x888_8888_process_pixblock_head
1190 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1193 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1194 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * init builds the 0xFF000000 per-pixel alpha mask in q2 by shifting a
 * byte pattern up 24 bits; the instruction seeding q2 is elided here.
 */
1201 .macro pixman_composite_src_x888_8888_init
1203 vshl.u32 q2, q2, #24
/* src_bpp=32, mask_bpp=0, dst_bpp=32; the cleanup argument is elided in this view. */
1206 generate_composite_function \
1207 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1208 FLAG_DST_WRITEONLY, \
1209 8, /* number of pixels, processed in a single block */ \
1210 10, /* prefetch distance */ \
1211 pixman_composite_src_x888_8888_init, \
1213 pixman_composite_src_x888_8888_process_pixblock_head, \
1214 pixman_composite_src_x888_8888_process_pixblock_tail, \
1215 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1216 0, /* dst_w_basereg */ \
1217 0, /* dst_r_basereg */ \
1218 0, /* src_basereg */ \
1219 0 /* mask_basereg */
1221 /******************************************************************************/
/*
 * src_n_8_8888: solid source IN a8 mask -> a8r8g8b8 destination.
 * Each channel of the solid color is multiplied by the 8-bit mask; the
 * vmull + vrsra(self, #8) + vrshrn(#8) sequence is the usual rounded
 * approximation of (x * m) / 255.
 */
1223 .macro pixman_composite_src_n_8_8888_process_pixblock_head
1224 /* expecting solid source in {d0, d1, d2, d3} */
1225 /* mask is in d24 (d25, d26, d27 are unused) */
1228 vmull.u8 q8, d24, d0
1229 vmull.u8 q9, d24, d1
1230 vmull.u8 q10, d24, d2
1231 vmull.u8 q11, d24, d3
1232 vrsra.u16 q8, q8, #8
1233 vrsra.u16 q9, q9, #8
1234 vrsra.u16 q10, q10, #8
1235 vrsra.u16 q11, q11, #8
/* tail narrows the 16-bit partial results back to 8 bits per channel. */
1238 .macro pixman_composite_src_n_8_8888_process_pixblock_tail
1239 vrshrn.u16 d28, q8, #8
1240 vrshrn.u16 d29, q9, #8
1241 vrshrn.u16 d30, q10, #8
1242 vrshrn.u16 d31, q11, #8
/*
 * tail_head interleaves the tail of block N with the head of block N+1,
 * with PF-prefixed prefetch bookkeeping woven between NEON instructions
 * (advance PF_X, step PF_MASK to the next scanline when a row completes).
 */
1245 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1247 PF add PF_X, PF_X, #8
1248 vrshrn.u16 d28, q8, #8
1249 PF tst PF_CTL, #0x0F
1250 vrshrn.u16 d29, q9, #8
1251 PF addne PF_X, PF_X, #8
1252 vrshrn.u16 d30, q10, #8
1253 PF subne PF_CTL, PF_CTL, #1
1254 vrshrn.u16 d31, q11, #8
1256 vmull.u8 q8, d24, d0
1257 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1258 vmull.u8 q9, d24, d1
1259 PF subge PF_X, PF_X, ORIG_W
1260 vmull.u8 q10, d24, d2
1261 PF subges PF_CTL, PF_CTL, #0x10
1262 vmull.u8 q11, d24, d3
1263 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1264 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1265 vrsra.u16 q8, q8, #8
1266 vrsra.u16 q9, q9, #8
1267 vrsra.u16 q10, q10, #8
1268 vrsra.u16 q11, q11, #8
/*
 * init loads the solid color into d3[0]; the per-channel splat into
 * d0-d3 is elided in this view -- TODO confirm upstream.
 */
1271 .macro pixman_composite_src_n_8_8888_init
1272 add DUMMY, sp, #ARGS_STACK_OFFSET
1273 vld1.32 {d3[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
1280 .macro pixman_composite_src_n_8_8888_cleanup
/* src_bpp=0 (solid), mask_bpp=8, dst_bpp=32; trailing basereg args elided in this view. */
1283 generate_composite_function \
1284 pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
1285 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1286 8, /* number of pixels, processed in a single block */ \
1287 5, /* prefetch distance */ \
1288 pixman_composite_src_n_8_8888_init, \
1289 pixman_composite_src_n_8_8888_cleanup, \
1290 pixman_composite_src_n_8_8888_process_pixblock_head, \
1291 pixman_composite_src_n_8_8888_process_pixblock_tail, \
1292 pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
1294 /******************************************************************************/
/*
 * src_n_8_8: solid 8-bit source IN a8 mask -> a8 destination.
 * Mask bytes (d24-d27) are multiplied by the replicated source byte in
 * d16; vmull + vrsra(self, #8) + vrshrn(#8) approximates (m * s) / 255
 * with rounding.
 */
1296 .macro pixman_composite_src_n_8_8_process_pixblock_head
1297 vmull.u8 q0, d24, d16
1298 vmull.u8 q1, d25, d16
1299 vmull.u8 q2, d26, d16
1300 vmull.u8 q3, d27, d16
1301 vrsra.u16 q0, q0, #8
1302 vrsra.u16 q1, q1, #8
1303 vrsra.u16 q2, q2, #8
1304 vrsra.u16 q3, q3, #8
/* tail narrows the four 16-bit partial products into d28-d31. */
1307 .macro pixman_composite_src_n_8_8_process_pixblock_tail
1308 vrshrn.u16 d28, q0, #8
1309 vrshrn.u16 d29, q1, #8
1310 vrshrn.u16 d30, q2, #8
1311 vrshrn.u16 d31, q3, #8
/*
 * tail_head: tail of block N interleaved with head of block N+1 plus
 * PF prefetch bookkeeping for the mask scanlines.
 */
1314 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1316 PF add PF_X, PF_X, #8
1317 vrshrn.u16 d28, q0, #8
1318 PF tst PF_CTL, #0x0F
1319 vrshrn.u16 d29, q1, #8
1320 PF addne PF_X, PF_X, #8
1321 vrshrn.u16 d30, q2, #8
1322 PF subne PF_CTL, PF_CTL, #1
1323 vrshrn.u16 d31, q3, #8
1325 vmull.u8 q0, d24, d16
1326 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1327 vmull.u8 q1, d25, d16
1328 PF subge PF_X, PF_X, ORIG_W
1329 vmull.u8 q2, d26, d16
1330 PF subges PF_CTL, PF_CTL, #0x10
1331 vmull.u8 q3, d27, d16
1332 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1333 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1334 vrsra.u16 q0, q0, #8
1335 vrsra.u16 q1, q1, #8
1336 vrsra.u16 q2, q2, #8
1337 vrsra.u16 q3, q3, #8
/*
 * init loads the solid value into d16[0]; the splat across d16 is elided
 * in this view -- TODO confirm upstream.
 */
1340 .macro pixman_composite_src_n_8_8_init
1341 add DUMMY, sp, #ARGS_STACK_OFFSET
1342 vld1.32 {d16[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
1346 .macro pixman_composite_src_n_8_8_cleanup
/* src_bpp=0 (solid), mask_bpp=8, dst_bpp=8. */
1349 generate_composite_function \
1350 pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
1351 FLAG_DST_WRITEONLY, \
1352 32, /* number of pixels, processed in a single block */ \
1353 5, /* prefetch distance */ \
1354 pixman_composite_src_n_8_8_init, \
1355 pixman_composite_src_n_8_8_cleanup, \
1356 pixman_composite_src_n_8_8_process_pixblock_head, \
1357 pixman_composite_src_n_8_8_process_pixblock_tail, \
1358 pixman_composite_src_n_8_8_process_pixblock_tail_head
1360 /******************************************************************************/
/*
 * over_n_8_8888: solid source IN a8 mask, OVER a8r8g8b8 destination.
 * head: src' = src * mask (rounded /255 via vrshr+vraddhn), then start
 * dest * (255 - src'.alpha).  tail finishes that product and saturate-adds
 * the masked source.  tail_head overlaps both with loads, stores and
 * PF prefetch bookkeeping for dst and mask scanlines.
 */
1362 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1363 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1364 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1365 /* and destination data in {d4, d5, d6, d7} */
1366 /* mask is in d24 (d25, d26, d27 are unused) */
1369 vmull.u8 q6, d24, d8
1370 vmull.u8 q7, d24, d9
1371 vmull.u8 q8, d24, d10
1372 vmull.u8 q9, d24, d11
1373 vrshr.u16 q10, q6, #8
1374 vrshr.u16 q11, q7, #8
1375 vrshr.u16 q12, q8, #8
1376 vrshr.u16 q13, q9, #8
1377 vraddhn.u16 d0, q6, q10
1378 vraddhn.u16 d1, q7, q11
1379 vraddhn.u16 d2, q8, q12
1380 vraddhn.u16 d3, q9, q13
1381 vmvn.8 d25, d3 /* get inverted alpha */
1382 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1383 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1384 /* now do alpha blending */
1385 vmull.u8 q8, d25, d4
1386 vmull.u8 q9, d25, d5
1387 vmull.u8 q10, d25, d6
1388 vmull.u8 q11, d25, d7
/* tail: finish dest*(1-alpha)/255 and saturate-add the masked source. */
1391 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
1392 vrshr.u16 q14, q8, #8
1393 vrshr.u16 q15, q9, #8
1394 vrshr.u16 q6, q10, #8
1395 vrshr.u16 q7, q11, #8
1396 vraddhn.u16 d28, q14, q8
1397 vraddhn.u16 d29, q15, q9
1398 vraddhn.u16 d30, q6, q10
1399 vraddhn.u16 d31, q7, q11
1400 vqadd.u8 q14, q0, q14
1401 vqadd.u8 q15, q1, q15
/* tail_head: software-pipelined tail+head with dst/mask prefetch control. */
1404 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1405 vrshr.u16 q14, q8, #8
1406 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1407 vrshr.u16 q15, q9, #8
1409 vrshr.u16 q6, q10, #8
1410 PF add PF_X, PF_X, #8
1411 vrshr.u16 q7, q11, #8
1412 PF tst PF_CTL, #0x0F
1413 vraddhn.u16 d28, q14, q8
1414 PF addne PF_X, PF_X, #8
1415 vraddhn.u16 d29, q15, q9
1416 PF subne PF_CTL, PF_CTL, #1
1417 vraddhn.u16 d30, q6, q10
1419 vraddhn.u16 d31, q7, q11
1420 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1421 vmull.u8 q6, d24, d8
1422 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1423 vmull.u8 q7, d24, d9
1424 PF subge PF_X, PF_X, ORIG_W
1425 vmull.u8 q8, d24, d10
1426 PF subges PF_CTL, PF_CTL, #0x10
1427 vmull.u8 q9, d24, d11
1428 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1429 vqadd.u8 q14, q0, q14
1430 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1431 vqadd.u8 q15, q1, q15
1432 vrshr.u16 q10, q6, #8
1433 vrshr.u16 q11, q7, #8
1434 vrshr.u16 q12, q8, #8
1435 vrshr.u16 q13, q9, #8
1436 vraddhn.u16 d0, q6, q10
1437 vraddhn.u16 d1, q7, q11
1438 vraddhn.u16 d2, q8, q12
1439 vraddhn.u16 d3, q9, q13
1440 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1442 vmull.u8 q8, d25, d4
1443 vmull.u8 q9, d25, d5
1444 vmull.u8 q10, d25, d6
1445 vmull.u8 q11, d25, d7
/*
 * init loads the solid color into d11[0]; the deinterleave/splat into
 * d8-d11 is elided in this view -- TODO confirm upstream.
 */
1448 .macro pixman_composite_over_n_8_8888_init
1449 add DUMMY, sp, #ARGS_STACK_OFFSET
1451 vld1.32 {d11[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
1458 .macro pixman_composite_over_n_8_8888_cleanup
/* src_bpp=0 (solid), mask_bpp=8, dst_bpp=32. */
1462 generate_composite_function \
1463 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1464 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1465 8, /* number of pixels, processed in a single block */ \
1466 5, /* prefetch distance */ \
1467 pixman_composite_over_n_8_8888_init, \
1468 pixman_composite_over_n_8_8888_cleanup, \
1469 pixman_composite_over_n_8_8888_process_pixblock_head, \
1470 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1471 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1473 /******************************************************************************/
/*
 * over_n_8_8: solid source IN a8 mask, OVER a8 destination.
 * head: masked source = mask(d24-d27) * src(d8), rounded /255 via
 * vrshr+vraddhn, then start dest * mask-complement products (the vmvn of
 * the mask into d24-d27 is elided in this view -- TODO confirm upstream).
 */
1475 .macro pixman_composite_over_n_8_8_process_pixblock_head
1476 vmull.u8 q0, d24, d8
1477 vmull.u8 q1, d25, d8
1478 vmull.u8 q6, d26, d8
1479 vmull.u8 q7, d27, d8
1480 vrshr.u16 q10, q0, #8
1481 vrshr.u16 q11, q1, #8
1482 vrshr.u16 q12, q6, #8
1483 vrshr.u16 q13, q7, #8
1484 vraddhn.u16 d0, q0, q10
1485 vraddhn.u16 d1, q1, q11
1486 vraddhn.u16 d2, q6, q12
1487 vraddhn.u16 d3, q7, q13
1490 vmull.u8 q8, d24, d4
1491 vmull.u8 q9, d25, d5
1492 vmull.u8 q10, d26, d6
1493 vmull.u8 q11, d27, d7
/* tail: finish the dest products and saturate-add the masked source. */
1496 .macro pixman_composite_over_n_8_8_process_pixblock_tail
1497 vrshr.u16 q14, q8, #8
1498 vrshr.u16 q15, q9, #8
1499 vrshr.u16 q12, q10, #8
1500 vrshr.u16 q13, q11, #8
1501 vraddhn.u16 d28, q14, q8
1502 vraddhn.u16 d29, q15, q9
1503 vraddhn.u16 d30, q12, q10
1504 vraddhn.u16 d31, q13, q11
1505 vqadd.u8 q14, q0, q14
1506 vqadd.u8 q15, q1, q15
1509 /* TODO: expand macros and do better instructions scheduling */
/* Unscheduled tail_head: plain tail, reload dest, preload, store, head. */
1510 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1511 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1512 pixman_composite_over_n_8_8_process_pixblock_tail
1514 cache_preload 32, 32
1515 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1516 pixman_composite_over_n_8_8_process_pixblock_head
/* init loads the solid value into d8[0]; its splat is elided in this view. */
1519 .macro pixman_composite_over_n_8_8_init
1520 add DUMMY, sp, #ARGS_STACK_OFFSET
1522 vld1.32 {d8[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
1526 .macro pixman_composite_over_n_8_8_cleanup
/* src_bpp=0 (solid), mask_bpp=8, dst_bpp=8. */
1530 generate_composite_function \
1531 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1532 FLAG_DST_READWRITE, \
1533 32, /* number of pixels, processed in a single block */ \
1534 5, /* prefetch distance */ \
1535 pixman_composite_over_n_8_8_init, \
1536 pixman_composite_over_n_8_8_cleanup, \
1537 pixman_composite_over_n_8_8_process_pixblock_head, \
1538 pixman_composite_over_n_8_8_process_pixblock_tail, \
1539 pixman_composite_over_n_8_8_process_pixblock_tail_head
1541 /******************************************************************************/
/*
 * over_n_8888_8888_ca: solid source with a component-alpha (per-channel)
 * a8r8g8b8 mask, OVER a8r8g8b8 destination.  head computes both the
 * per-channel masked source (mask * src) and the per-channel effective
 * mask (src.alpha * mask), each rounded /255 via vrshr+vraddhn; the mask
 * complement step before the final dest multiplies is elided in this view.
 */
1543 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1545 * 'combine_mask_ca' replacement
1547 * input: solid src (n) in {d8, d9, d10, d11}
1548 * dest in {d4, d5, d6, d7 }
1549 * mask in {d24, d25, d26, d27}
1550 * output: updated src in {d0, d1, d2, d3 }
1551 * updated mask in {d24, d25, d26, d3 }
1553 vmull.u8 q0, d24, d8
1554 vmull.u8 q1, d25, d9
1555 vmull.u8 q6, d26, d10
1556 vmull.u8 q7, d27, d11
1557 vmull.u8 q9, d11, d25
1558 vmull.u8 q12, d11, d24
1559 vmull.u8 q13, d11, d26
1560 vrshr.u16 q8, q0, #8
1561 vrshr.u16 q10, q1, #8
1562 vrshr.u16 q11, q6, #8
1563 vraddhn.u16 d0, q0, q8
1564 vraddhn.u16 d1, q1, q10
1565 vraddhn.u16 d2, q6, q11
1566 vrshr.u16 q11, q12, #8
1567 vrshr.u16 q8, q9, #8
1568 vrshr.u16 q6, q13, #8
1569 vrshr.u16 q10, q7, #8
1570 vraddhn.u16 d24, q12, q11
1571 vraddhn.u16 d25, q9, q8
1572 vraddhn.u16 d26, q13, q6
1573 vraddhn.u16 d3, q7, q10
1575 * 'combine_over_ca' replacement
1577 * output: updated dest in {d28, d29, d30, d31}
1581 vmull.u8 q8, d24, d4
1582 vmull.u8 q9, d25, d5
1584 vmull.u8 q10, d26, d6
1585 vmull.u8 q11, d27, d7
/* tail: finish dest*(1-mask)/255 and saturate-add the masked source. */
1588 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1589 /* ... continue 'combine_over_ca' replacement */
1590 vrshr.u16 q14, q8, #8
1591 vrshr.u16 q15, q9, #8
1592 vrshr.u16 q6, q10, #8
1593 vrshr.u16 q7, q11, #8
1594 vraddhn.u16 d28, q14, q8
1595 vraddhn.u16 d29, q15, q9
1596 vraddhn.u16 d30, q6, q10
1597 vraddhn.u16 d31, q7, q11
1598 vqadd.u8 q14, q0, q14
1599 vqadd.u8 q15, q1, q15
/* tail_head: expanded tail, dest reload, full head (via macro), store. */
1602 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1603 vrshr.u16 q14, q8, #8
1604 vrshr.u16 q15, q9, #8
1605 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1606 vrshr.u16 q6, q10, #8
1607 vrshr.u16 q7, q11, #8
1608 vraddhn.u16 d28, q14, q8
1609 vraddhn.u16 d29, q15, q9
1610 vraddhn.u16 d30, q6, q10
1611 vraddhn.u16 d31, q7, q11
1613 vqadd.u8 q14, q0, q14
1614 vqadd.u8 q15, q1, q15
1616 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1617 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init loads the solid color into d11[0]; the d8-d11 splat is elided here. */
1620 .macro pixman_composite_over_n_8888_8888_ca_init
1621 add DUMMY, sp, #ARGS_STACK_OFFSET
1623 vld1.32 {d11[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
1630 .macro pixman_composite_over_n_8888_8888_ca_cleanup
/* src_bpp=0 (solid), mask_bpp=32 (component alpha), dst_bpp=32. */
1634 generate_composite_function \
1635 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1636 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1637 8, /* number of pixels, processed in a single block */ \
1638 5, /* prefetch distance */ \
1639 pixman_composite_over_n_8888_8888_ca_init, \
1640 pixman_composite_over_n_8888_8888_ca_cleanup, \
1641 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1642 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1643 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1645 /******************************************************************************/
/*
 * over_n_8888_0565_ca: solid source with a component-alpha a8r8g8b8 mask,
 * OVER an r5g6b5 destination.  head combines src with the per-channel
 * mask, unpacks the r5g6b5 dest pixels in {d4,d5} to 8-bit planes
 * (d16=blue, d17=green, d18=red), and starts the dest * mask-complement
 * products (the complement step itself is elided in this view).
 */
1647 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
1649 * 'combine_mask_ca' replacement
1651 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1652 * mask in {d24, d25, d26} [B, G, R]
1653 * output: updated src in {d0, d1, d2 } [B, G, R]
1654 * updated mask in {d24, d25, d26} [B, G, R]
1656 vmull.u8 q0, d24, d8
1657 vmull.u8 q1, d25, d9
1658 vmull.u8 q6, d26, d10
1659 vmull.u8 q9, d11, d25
1660 vmull.u8 q12, d11, d24
1661 vmull.u8 q13, d11, d26
1662 vrshr.u16 q8, q0, #8
1663 vrshr.u16 q10, q1, #8
1664 vrshr.u16 q11, q6, #8
1665 vraddhn.u16 d0, q0, q8
1666 vraddhn.u16 d1, q1, q10
1667 vraddhn.u16 d2, q6, q11
1668 vrshr.u16 q11, q12, #8
1669 vrshr.u16 q8, q9, #8
1670 vrshr.u16 q6, q13, #8
1671 vraddhn.u16 d24, q12, q11
1672 vraddhn.u16 d25, q9, q8
1674 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1675 * and put data into d16 - blue, d17 - green, d18 - red
1677 vshrn.u16 d17, q2, #3
1678 vshrn.u16 d18, q2, #8
1679 vraddhn.u16 d26, q13, q6
1681 vsri.u8 d18, d18, #5
1682 vsri.u8 d17, d17, #6
1684 * 'combine_over_ca' replacement
1686 * output: updated dest in d16 - blue, d17 - green, d18 - red
1689 vshrn.u16 d16, q2, #2
1691 vmull.u8 q6, d16, d24
1692 vmull.u8 q7, d17, d25
1693 vmull.u8 q11, d18, d26
/* tail: finish blending per plane, then repack B/G/R planes to r5g6b5. */
1696 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
1697 /* ... continue 'combine_over_ca' replacement */
1698 vrshr.u16 q10, q6, #8
1699 vrshr.u16 q14, q7, #8
1700 vrshr.u16 q15, q11, #8
1701 vraddhn.u16 d16, q10, q6
1702 vraddhn.u16 d17, q14, q7
1703 vraddhn.u16 d18, q15, q11
1705 vqadd.u8 d18, d2, d18
1707 * convert the results in d16, d17, d18 to r5g6b5 and store
1708 * them into {d28, d29}
1710 vshll.u8 q14, d18, #8
1711 vshll.u8 q10, d17, #8
1712 vshll.u8 q15, d16, #8
1713 vsri.u16 q14, q10, #5
1714 vsri.u16 q14, q15, #11
/*
 * tail_head: fully expanded tail + head, software pipelined; the red
 * result flows through d22 here instead of d18 to ease scheduling.
 */
1717 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1719 vrshr.u16 q10, q6, #8
1720 vrshr.u16 q14, q7, #8
1721 vld1.16 {d4, d5}, [DST_R, :128]!
1722 vrshr.u16 q15, q11, #8
1723 vraddhn.u16 d16, q10, q6
1724 vraddhn.u16 d17, q14, q7
1725 vraddhn.u16 d22, q15, q11
1726 /* process_pixblock_head */
1728 * 'combine_mask_ca' replacement
1730 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1731 * mask in {d24, d25, d26} [B, G, R]
1732 * output: updated src in {d0, d1, d2 } [B, G, R]
1733 * updated mask in {d24, d25, d26} [B, G, R]
1735 vmull.u8 q6, d26, d10
1737 vmull.u8 q0, d24, d8
1738 vqadd.u8 d22, d2, d22
1739 vmull.u8 q1, d25, d9
1741 * convert the result in d16, d17, d22 to r5g6b5 and store
1742 * it into {d28, d29}
1744 vshll.u8 q14, d22, #8
1745 vshll.u8 q10, d17, #8
1746 vshll.u8 q15, d16, #8
1747 vmull.u8 q9, d11, d25
1748 vsri.u16 q14, q10, #5
1749 vmull.u8 q12, d11, d24
1750 vmull.u8 q13, d11, d26
1751 vsri.u16 q14, q15, #11
1753 vrshr.u16 q8, q0, #8
1754 vrshr.u16 q10, q1, #8
1755 vrshr.u16 q11, q6, #8
1756 vraddhn.u16 d0, q0, q8
1757 vraddhn.u16 d1, q1, q10
1758 vraddhn.u16 d2, q6, q11
1759 vrshr.u16 q11, q12, #8
1760 vrshr.u16 q8, q9, #8
1761 vrshr.u16 q6, q13, #8
1762 vraddhn.u16 d24, q12, q11
1763 vraddhn.u16 d25, q9, q8
1765 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
1766 * 8-bit format and put data into d16 - blue, d17 - green,
1769 vshrn.u16 d17, q2, #3
1770 vshrn.u16 d18, q2, #8
1771 vraddhn.u16 d26, q13, q6
1773 vsri.u8 d17, d17, #6
1774 vsri.u8 d18, d18, #5
1776 * 'combine_over_ca' replacement
1778 * output: updated dest in d16 - blue, d17 - green, d18 - red
1781 vshrn.u16 d16, q2, #2
1783 vmull.u8 q7, d17, d25
1784 vmull.u8 q6, d16, d24
1785 vmull.u8 q11, d18, d26
1786 vst1.16 {d28, d29}, [DST_W, :128]!
/* init loads the solid color into d11[0]; the d8-d11 splat is elided here. */
1789 .macro pixman_composite_over_n_8888_0565_ca_init
1790 add DUMMY, sp, #ARGS_STACK_OFFSET
1792 vld1.32 {d11[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
1799 .macro pixman_composite_over_n_8888_0565_ca_cleanup
/* src_bpp=0 (solid), mask_bpp=32 (component alpha), dst_bpp=16. */
1803 generate_composite_function \
1804 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
1805 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1806 8, /* number of pixels, processed in a single block */ \
1807 5, /* prefetch distance */ \
1808 pixman_composite_over_n_8888_0565_ca_init, \
1809 pixman_composite_over_n_8888_0565_ca_cleanup, \
1810 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
1811 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
1812 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1814 /******************************************************************************/
/*
 * in_n_8: IN operator with a solid alpha (replicated into d3) applied to
 * an a8 destination.  Only the d6/d7 multiplies are visible in the head;
 * the matching d4/d5 multiplies into q8/q9 (consumed by the tail) appear
 * elided in this view -- TODO confirm upstream.
 */
1816 .macro pixman_composite_in_n_8_process_pixblock_head
1817 /* expecting source data in {d0, d1, d2, d3} */
1818 /* and destination data in {d4, d5, d6, d7} */
1821 vmull.u8 q10, d6, d3
1822 vmull.u8 q11, d7, d3
/* tail: rounded /255 reduction of the four 16-bit products into d28-d31. */
1825 .macro pixman_composite_in_n_8_process_pixblock_tail
1826 vrshr.u16 q14, q8, #8
1827 vrshr.u16 q15, q9, #8
1828 vrshr.u16 q12, q10, #8
1829 vrshr.u16 q13, q11, #8
1830 vraddhn.u16 d28, q8, q14
1831 vraddhn.u16 d29, q9, q15
1832 vraddhn.u16 d30, q10, q12
1833 vraddhn.u16 d31, q11, q13
/* Unscheduled tail_head: tail, dest reload, preload, head, store. */
1836 .macro pixman_composite_in_n_8_process_pixblock_tail_head
1837 pixman_composite_in_n_8_process_pixblock_tail
1838 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1839 cache_preload 32, 32
1840 pixman_composite_in_n_8_process_pixblock_head
1841 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init loads the solid value into d3[0]; its splat is elided in this view. */
1844 .macro pixman_composite_in_n_8_init
1845 add DUMMY, sp, #ARGS_STACK_OFFSET
1846 vld1.32 {d3[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
1850 .macro pixman_composite_in_n_8_cleanup
/* src_bpp=0 (solid), mask_bpp=0, dst_bpp=8. */
1853 generate_composite_function \
1854 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1855 FLAG_DST_READWRITE, \
1856 32, /* number of pixels, processed in a single block */ \
1857 5, /* prefetch distance */ \
1858 pixman_composite_in_n_8_init, \
1859 pixman_composite_in_n_8_cleanup, \
1860 pixman_composite_in_n_8_process_pixblock_head, \
1861 pixman_composite_in_n_8_process_pixblock_tail, \
1862 pixman_composite_in_n_8_process_pixblock_tail_head, \
1863 28, /* dst_w_basereg */ \
1864 4, /* dst_r_basereg */ \
1865 0, /* src_basereg */ \
1866 24 /* mask_basereg */
/*
 * add_n_8_8: ADD of (solid alpha d11 * a8 mask d24-d27) into an a8 dest.
 * head computes mask*alpha with rounded /255 (vrshr+vraddhn) and
 * saturate-adds the dest bytes; the tail macro is empty (body elided),
 * all work happens in the head.
 */
1868 .macro pixman_composite_add_n_8_8_process_pixblock_head
1869 /* expecting source data in {d8, d9, d10, d11} */
1870 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1871 /* and destination data in {d4, d5, d6, d7} */
1872 /* mask is in d24, d25, d26, d27 */
1873 vmull.u8 q0, d24, d11
1874 vmull.u8 q1, d25, d11
1875 vmull.u8 q6, d26, d11
1876 vmull.u8 q7, d27, d11
1877 vrshr.u16 q10, q0, #8
1878 vrshr.u16 q11, q1, #8
1879 vrshr.u16 q12, q6, #8
1880 vrshr.u16 q13, q7, #8
1881 vraddhn.u16 d0, q0, q10
1882 vraddhn.u16 d1, q1, q11
1883 vraddhn.u16 d2, q6, q12
1884 vraddhn.u16 d3, q7, q13
1885 vqadd.u8 q14, q0, q2
1886 vqadd.u8 q15, q1, q3
/* tail is a no-op: the head already produces the final d28-d31 result. */
1889 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1892 /* TODO: expand macros and do better instructions scheduling */
1893 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1894 pixman_composite_add_n_8_8_process_pixblock_tail
1895 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1896 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1898 cache_preload 32, 32
1899 pixman_composite_add_n_8_8_process_pixblock_head
/* init loads the solid value into d11[0]; its splat is elided in this view. */
1902 .macro pixman_composite_add_n_8_8_init
1903 add DUMMY, sp, #ARGS_STACK_OFFSET
1905 vld1.32 {d11[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
1909 .macro pixman_composite_add_n_8_8_cleanup
/* src_bpp=0 (solid), mask_bpp=8, dst_bpp=8. */
1913 generate_composite_function \
1914 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1915 FLAG_DST_READWRITE, \
1916 32, /* number of pixels, processed in a single block */ \
1917 5, /* prefetch distance */ \
1918 pixman_composite_add_n_8_8_init, \
1919 pixman_composite_add_n_8_8_cleanup, \
1920 pixman_composite_add_n_8_8_process_pixblock_head, \
1921 pixman_composite_add_n_8_8_process_pixblock_tail, \
1922 pixman_composite_add_n_8_8_process_pixblock_tail_head
1924 /******************************************************************************/
/*
 * add_8_8_8: ADD of (a8 src * a8 mask) into an a8 destination.
 * head multiplies src bytes by mask bytes, reduces with rounded /255
 * (vrshr+vraddhn) and saturate-adds the destination; tail is a no-op.
 */
1926 .macro pixman_composite_add_8_8_8_process_pixblock_head
1927 /* expecting source data in {d0, d1, d2, d3} */
1928 /* destination data in {d4, d5, d6, d7} */
1929 /* mask in {d24, d25, d26, d27} */
1930 vmull.u8 q8, d24, d0
1931 vmull.u8 q9, d25, d1
1932 vmull.u8 q10, d26, d2
1933 vmull.u8 q11, d27, d3
1934 vrshr.u16 q0, q8, #8
1935 vrshr.u16 q1, q9, #8
1936 vrshr.u16 q12, q10, #8
1937 vrshr.u16 q13, q11, #8
1938 vraddhn.u16 d0, q0, q8
1939 vraddhn.u16 d1, q1, q9
1940 vraddhn.u16 d2, q12, q10
1941 vraddhn.u16 d3, q13, q11
1942 vqadd.u8 q14, q0, q2
1943 vqadd.u8 q15, q1, q3
/* tail is a no-op: the head already produces the final d28-d31 result. */
1946 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1949 /* TODO: expand macros and do better instructions scheduling */
1950 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1951 pixman_composite_add_8_8_8_process_pixblock_tail
1952 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1953 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1956 cache_preload 32, 32
1957 pixman_composite_add_8_8_8_process_pixblock_head
/* init/cleanup have nothing to set up for this path (bodies empty/elided). */
1960 .macro pixman_composite_add_8_8_8_init
1963 .macro pixman_composite_add_8_8_8_cleanup
/* src_bpp=8, mask_bpp=8, dst_bpp=8. */
1966 generate_composite_function \
1967 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1968 FLAG_DST_READWRITE, \
1969 32, /* number of pixels, processed in a single block */ \
1970 5, /* prefetch distance */ \
1971 pixman_composite_add_8_8_8_init, \
1972 pixman_composite_add_8_8_8_cleanup, \
1973 pixman_composite_add_8_8_8_process_pixblock_head, \
1974 pixman_composite_add_8_8_8_process_pixblock_tail, \
1975 pixman_composite_add_8_8_8_process_pixblock_tail_head
1977 /******************************************************************************/
/*
 * add_8888_8888_8888: ADD of (a8r8g8b8 src * mask alpha d27) into an
 * a8r8g8b8 destination.  head multiplies each src channel by the mask
 * alpha using the vmull + vrsra(self, #8) idiom (rounded x*m/255);
 * tail narrows and saturate-adds the destination.  The "cycle bubble"
 * comments document pipeline gaps the tail_head scheduling fills.
 */
1979 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1980 /* expecting source data in {d0, d1, d2, d3} */
1981 /* destination data in {d4, d5, d6, d7} */
1982 /* mask in {d24, d25, d26, d27} */
1983 vmull.u8 q8, d27, d0
1984 vmull.u8 q9, d27, d1
1985 vmull.u8 q10, d27, d2
1986 vmull.u8 q11, d27, d3
1987 /* 1 cycle bubble */
1988 vrsra.u16 q8, q8, #8
1989 vrsra.u16 q9, q9, #8
1990 vrsra.u16 q10, q10, #8
1991 vrsra.u16 q11, q11, #8
/* tail: narrow the partial sums and saturate-add the destination. */
1994 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1995 /* 2 cycle bubble */
1996 vrshrn.u16 d28, q8, #8
1997 vrshrn.u16 d29, q9, #8
1998 vrshrn.u16 d30, q10, #8
1999 vrshrn.u16 d31, q11, #8
2000 vqadd.u8 q14, q2, q14
2001 /* 1 cycle bubble */
2002 vqadd.u8 q15, q3, q15
/* tail_head: hand-interleaved tail+head with the dest load/store folded in. */
2005 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2007 vrshrn.u16 d28, q8, #8
2009 vrshrn.u16 d29, q9, #8
2010 vmull.u8 q8, d27, d0
2011 vrshrn.u16 d30, q10, #8
2012 vmull.u8 q9, d27, d1
2013 vrshrn.u16 d31, q11, #8
2014 vqadd.u8 q14, q2, q14
2015 vmull.u8 q10, d27, d2
2016 vqadd.u8 q15, q3, q15
2017 vmull.u8 q11, d27, d3
2018 vrsra.u16 q8, q8, #8
2019 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2020 vrsra.u16 q9, q9, #8
2021 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2022 vrsra.u16 q10, q10, #8
2026 vrsra.u16 q11, q11, #8
/*
 * Three entry points sharing the add_8888_8888_8888 pixblock code:
 * the full-image function, a single-scanline variant used by the general
 * compositing path, and an a8-masked variant (mask alpha read from d27).
 * init/cleanup argument lines are elided from this view for the first two.
 */
2029 generate_composite_function \
2030 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
2031 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2032 8, /* number of pixels, processed in a single block */ \
2033 10, /* prefetch distance */ \
2036 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2037 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2038 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
/* Scanline variant: no prefetch-distance parameter for single-scanline code. */
2040 generate_composite_function_single_scanline \
2041 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
2042 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2043 8, /* number of pixels, processed in a single block */ \
2046 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2047 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2048 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2050 /******************************************************************************/
/* a8 mask variant: mask_basereg 27 places the 8-bit mask where the shared head expects it. */
2052 generate_composite_function \
2053 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
2054 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2055 8, /* number of pixels, processed in a single block */ \
2056 5, /* prefetch distance */ \
2059 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2060 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2061 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2062 28, /* dst_w_basereg */ \
2063 4, /* dst_r_basereg */ \
2064 0, /* src_basereg */ \
2065 27 /* mask_basereg */
2067 /******************************************************************************/
/*
 * add_n_8_8888: solid source with a8 mask, ADDed into a8r8g8b8 dest.
 * init loads the solid color into d3[0] (the splat into d0-d3 is elided
 * in this view); the shared add_8888_8888_8888 pixblock code does the rest.
 */
2069 .macro pixman_composite_add_n_8_8888_init
2070 add DUMMY, sp, #ARGS_STACK_OFFSET
2071 vld1.32 {d3[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
2078 .macro pixman_composite_add_n_8_8888_cleanup
/* src_bpp=0 (solid), mask_bpp=8, dst_bpp=32; mask_basereg 27 matches the shared head. */
2081 generate_composite_function \
2082 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
2083 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2084 8, /* number of pixels, processed in a single block */ \
2085 5, /* prefetch distance */ \
2086 pixman_composite_add_n_8_8888_init, \
2087 pixman_composite_add_n_8_8888_cleanup, \
2088 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2089 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2090 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2091 28, /* dst_w_basereg */ \
2092 4, /* dst_r_basereg */ \
2093 0, /* src_basereg */ \
2094 27 /* mask_basereg */
2096 /******************************************************************************/
/*
 * add_8888_n_8888: a8r8g8b8 source with a solid mask, ADDed into
 * a8r8g8b8 dest.  init loads the solid mask (third stack arg, offset +8)
 * into d27[0] -- exactly where the shared head reads the mask alpha.
 */
2098 .macro pixman_composite_add_8888_n_8888_init
2099 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2100 vld1.32 {d27[0]}, [DUMMY]
/* Paired cleanup macro; its body is not visible in this view. */
2104 .macro pixman_composite_add_8888_n_8888_cleanup
/* src_bpp=32, mask_bpp=0 (solid), dst_bpp=32. */
2107 generate_composite_function \
2108 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
2109 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2110 8, /* number of pixels, processed in a single block */ \
2111 5, /* prefetch distance */ \
2112 pixman_composite_add_8888_n_8888_init, \
2113 pixman_composite_add_8888_n_8888_cleanup, \
2114 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2115 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2116 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2117 28, /* dst_w_basereg */ \
2118 4, /* dst_r_basereg */ \
2119 0, /* src_basereg */ \
2120 27 /* mask_basereg */
2122 /******************************************************************************/
/*
 * out_reverse_8888_n_8888 pixel block: computes dst * (1 - alpha(src * mask)).
 * The vmull/vrshr/vraddhn triples implement the standard NEON rounding
 * approximation of (x * a) / 255: t = x*a; result = (t + rshr(t, 8)) >> 8
 * with rounding at each step.
 */
2124 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2125 /* expecting source data in {d0, d1, d2, d3} */
2126 /* destination data in {d4, d5, d6, d7} */
2127 /* solid mask is in d15 */
/* multiply all four source channels by the solid mask */
2130 vmull.u8 q8, d15, d3
2131 vmull.u8 q6, d15, d2
2132 vmull.u8 q5, d15, d1
2133 vmull.u8 q4, d15, d0
/* rounding division by 255: src channels become src*mask/255 in d0-d3 */
2134 vrshr.u16 q13, q8, #8
2135 vrshr.u16 q12, q6, #8
2136 vrshr.u16 q11, q5, #8
2137 vrshr.u16 q10, q4, #8
2138 vraddhn.u16 d3, q8, q13
2139 vraddhn.u16 d2, q6, q12
2140 vraddhn.u16 d1, q5, q11
2141 vraddhn.u16 d0, q4, q10
2142 vmvn.8 d24, d3 /* get inverted alpha */
2143 /* now do alpha blending */
2144 vmull.u8 q8, d24, d4
2145 vmull.u8 q9, d24, d5
2146 vmull.u8 q10, d24, d6
2147 vmull.u8 q11, d24, d7
/* tail: finish the rounding /255 of dst * (255 - alpha), result in d28-d31 */
2150 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2151 vrshr.u16 q14, q8, #8
2152 vrshr.u16 q15, q9, #8
2153 vrshr.u16 q12, q10, #8
2154 vrshr.u16 q13, q11, #8
2155 vraddhn.u16 d28, q14, q8
2156 vraddhn.u16 d29, q15, q9
2157 vraddhn.u16 d30, q12, q10
2158 vraddhn.u16 d31, q13, q11
2161 /* TODO: expand macros and do better instructions scheduling */
/* Unscheduled tail_head: finish previous block, reload dst, start next, store. */
2162 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
2163 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! /* load next 8 dst pixels (deinterleaved) */
2164 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2168 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2169 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous 8 result pixels */
/*
 * Single-scanline OUT_REVERSE with a per-pixel 8888 mask.
 * Note: a comma was added after the tail_head argument for consistency
 * with the other generate_composite_function* invocations (gas accepts
 * whitespace as a macro-argument separator, but the explicit comma is
 * the convention used everywhere else in this file).
 */
2172 generate_composite_function_single_scanline \
2173 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
2174 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2175 8, /* number of pixels, processed in a single block */ \
2176 default_init_need_all_regs, \
2177 default_cleanup_need_all_regs, \
2178 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
2179 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
2180 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
2181 28, /* dst_w_basereg */ \
2182 4, /* dst_r_basereg */ \
2183 0, /* src_basereg */ \
2184 12 /* mask_basereg */
2186 /******************************************************************************/
/*
 * over_8888_n_8888: OVER of an a8r8g8b8 source, modulated by a solid mask,
 * onto an a8r8g8b8 destination.  Builds on the out_reverse head/tail and
 * adds the (saturating) addition of the mask-modulated source.
 */
2188 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
2189 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2192 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
2193 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2194 vqadd.u8 q14, q0, q14 /* result = modulated src + dst*(1-alpha), saturating */
2195 vqadd.u8 q15, q1, q15
2198 /* TODO: expand macros and do better instructions scheduling */
2199 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2200 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! /* load next 8 dst pixels */
2201 pixman_composite_over_8888_n_8888_process_pixblock_tail
2204 pixman_composite_over_8888_n_8888_process_pixblock_head
2205 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous 8 result pixels */
2208 .macro pixman_composite_over_8888_n_8888_init
2211 vld1.32 {d15[0]}, [DUMMY] /* load 32-bit solid mask value into d15 */
2215 .macro pixman_composite_over_8888_n_8888_cleanup
2219 generate_composite_function \
2220 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
2221 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2222 8, /* number of pixels, processed in a single block */ \
2223 5, /* prefetch distance */ \
2224 pixman_composite_over_8888_n_8888_init, \
2225 pixman_composite_over_8888_n_8888_cleanup, \
2226 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2227 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2228 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2230 /******************************************************************************/
2232 /* TODO: expand macros and do better instructions scheduling */
/* Per-pixel 8888 mask variant: same pixel math as over_8888_n_8888, but the
   mask is reloaded per block by the framework (mask_basereg 12). */
2233 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
2234 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! /* load next 8 dst pixels */
2235 pixman_composite_over_8888_n_8888_process_pixblock_tail
2239 pixman_composite_over_8888_n_8888_process_pixblock_head
2240 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous 8 result pixels */
/*
 * OVER with per-pixel 8888 mask, full composite function.
 * Note: comma added after the tail_head argument for consistency with
 * sibling invocations (gas also accepts whitespace as a separator).
 */
2243 generate_composite_function \
2244 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
2245 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2246 8, /* number of pixels, processed in a single block */ \
2247 5, /* prefetch distance */ \
2248 default_init_need_all_regs, \
2249 default_cleanup_need_all_regs, \
2250 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2251 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2252 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
2253 28, /* dst_w_basereg */ \
2254 4, /* dst_r_basereg */ \
2255 0, /* src_basereg */ \
2256 12 /* mask_basereg */
/*
 * Single-scanline OVER with per-pixel 8888 mask.
 * Note: comma added after the tail_head argument for consistency with
 * sibling invocations (gas also accepts whitespace as a separator).
 */
2258 generate_composite_function_single_scanline \
2259 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
2260 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2261 8, /* number of pixels, processed in a single block */ \
2262 default_init_need_all_regs, \
2263 default_cleanup_need_all_regs, \
2264 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2265 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2266 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
2267 28, /* dst_w_basereg */ \
2268 4, /* dst_r_basereg */ \
2269 0, /* src_basereg */ \
2270 12 /* mask_basereg */
2272 /******************************************************************************/
2274 /* TODO: expand macros and do better instructions scheduling */
/* a8-mask variant: mask expansion is handled by the framework
   (mask_basereg 15); pixel math is shared with over_8888_n_8888. */
2275 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
2276 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! /* load next 8 dst pixels */
2277 pixman_composite_over_8888_n_8888_process_pixblock_tail
2281 pixman_composite_over_8888_n_8888_process_pixblock_head
2282 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous 8 result pixels */
/*
 * OVER of a8r8g8b8 source with a8 mask onto a8r8g8b8 destination.
 * Note: comma added after the tail_head argument for consistency with
 * sibling invocations (gas also accepts whitespace as a separator).
 */
2285 generate_composite_function \
2286 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
2287 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2288 8, /* number of pixels, processed in a single block */ \
2289 5, /* prefetch distance */ \
2290 default_init_need_all_regs, \
2291 default_cleanup_need_all_regs, \
2292 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2293 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2294 pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
2295 28, /* dst_w_basereg */ \
2296 4, /* dst_r_basereg */ \
2297 0, /* src_basereg */ \
2298 15 /* mask_basereg */
2300 /******************************************************************************/
/*
 * src_0888_0888: plain copy of packed 24bpp (r8g8b8) pixels.
 * head/tail are empty; the tail_head just stores the 3-plane data.
 */
2302 .macro pixman_composite_src_0888_0888_process_pixblock_head
2305 .macro pixman_composite_src_0888_0888_process_pixblock_tail
2308 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
2309 vst3.8 {d0, d1, d2}, [DST_W]! /* store 8 pixels, 3 bytes each (deinterleaved planes) */
2314 generate_composite_function \
2315 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
2316 FLAG_DST_WRITEONLY, \
2317 8, /* number of pixels, processed in a single block */ \
2318 10, /* prefetch distance */ \
2321 pixman_composite_src_0888_0888_process_pixblock_head, \
2322 pixman_composite_src_0888_0888_process_pixblock_tail, \
2323 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
2324 0, /* dst_w_basereg */ \
2325 0, /* dst_r_basereg */ \
2326 0, /* src_basereg */ \
2327 0 /* mask_basereg */
2329 /******************************************************************************/
/*
 * src_0888_8888_rev: convert packed 24bpp source to 32bpp x8r8g8b8 with
 * reversed component order; the alpha plane (d3) is presumably set up
 * once in the init macro — TODO confirm against the full source.
 */
2331 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
2335 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
2338 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
2339 vst4.8 {d0, d1, d2, d3}, [DST_W]! /* store 8 pixels as 4 deinterleaved planes */
2345 .macro pixman_composite_src_0888_8888_rev_init
2349 generate_composite_function \
2350 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2351 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2352 8, /* number of pixels, processed in a single block */ \
2353 10, /* prefetch distance */ \
2354 pixman_composite_src_0888_8888_rev_init, \
2356 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
2357 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
2358 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
2359 0, /* dst_w_basereg */ \
2360 0, /* dst_r_basereg */ \
2361 0, /* src_basereg */ \
2362 0 /* mask_basereg */
2364 /******************************************************************************/
/*
 * src_0888_0565_rev: convert packed 24bpp (component-reversed) source to
 * r5g6b5.  The conversion packs 8-bit channels into a 16-bit pixel by
 * shifting one channel up (vshll) and inserting the others (vsri).
 */
2366 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
2371 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
2372 vshll.u8 q14, d0, #8 /* place first channel in the top bits */
2373 vsri.u16 q14, q8, #5 /* insert 6-bit green field */
2374 vsri.u16 q14, q9, #11 /* insert 5-bit low field */
2377 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2378 vshll.u8 q14, d0, #8
2380 vsri.u16 q14, q8, #5
2381 vsri.u16 q14, q9, #11
2383 vst1.16 {d28, d29}, [DST_W, :128]! /* store 8 r5g6b5 pixels */
2387 generate_composite_function \
2388 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2389 FLAG_DST_WRITEONLY, \
2390 8, /* number of pixels, processed in a single block */ \
2391 10, /* prefetch distance */ \
2394 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2395 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2396 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2397 28, /* dst_w_basereg */ \
2398 0, /* dst_r_basereg */ \
2399 0, /* src_basereg */ \
2400 0 /* mask_basereg */
2402 /******************************************************************************/
/*
 * src_pixbuf_8888: convert "pixbuf" source to premultiplied a8r8g8b8.
 * Color channels are multiplied by alpha (d3) with the usual rounding
 * /255 sequence (vmull + vrshr + vraddhn).  Result channel order
 * (d28..d30) differs from the rpixbuf variant below — presumably this
 * performs the R/B swap; confirm against the full source.
 */
2404 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2407 vmull.u8 q10, d3, d2 /* channel * alpha (widening) */
2410 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2411 vrshr.u16 q11, q8, #8
2413 vrshr.u16 q12, q9, #8
2414 vrshr.u16 q13, q10, #8
2415 vraddhn.u16 d30, q11, q8 /* rounded (x*a)/255 */
2416 vraddhn.u16 d29, q12, q9
2417 vraddhn.u16 d28, q13, q10
/* tail_head interleaves the tail math with prefetch control (PF_*) and store */
2420 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2421 vrshr.u16 q11, q8, #8
2423 vrshr.u16 q12, q9, #8
2424 vrshr.u16 q13, q10, #8
2426 vraddhn.u16 d30, q11, q8
2427 PF add PF_X, PF_X, #8 /* advance prefetch position by one block */
2429 PF addne PF_X, PF_X, #8
2430 PF subne PF_CTL, PF_CTL, #1
2431 vraddhn.u16 d29, q12, q9
2432 vraddhn.u16 d28, q13, q10
2435 vmull.u8 q10, d3, d2
2436 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous 8 pixels */
2438 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] /* prefetch source line */
2439 PF subge PF_X, PF_X, ORIG_W
2440 PF subges PF_CTL, PF_CTL, #0x10
2441 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! /* touch next scanline */
2444 generate_composite_function \
2445 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2446 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2447 8, /* number of pixels, processed in a single block */ \
2448 10, /* prefetch distance */ \
2451 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2452 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2453 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2454 28, /* dst_w_basereg */ \
2455 0, /* dst_r_basereg */ \
2456 0, /* src_basereg */ \
2457 0 /* mask_basereg */
2459 /******************************************************************************/
/*
 * src_rpixbuf_8888: same premultiply-by-alpha pipeline as src_pixbuf_8888,
 * but with the result channels written in the opposite order
 * (d28/d29/d30 vs d30/d29/d28), i.e. without the channel swap.
 */
2461 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2464 vmull.u8 q10, d3, d2 /* channel * alpha (widening) */
2467 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2468 vrshr.u16 q11, q8, #8
2470 vrshr.u16 q12, q9, #8
2471 vrshr.u16 q13, q10, #8
2472 vraddhn.u16 d28, q11, q8 /* rounded (x*a)/255 */
2473 vraddhn.u16 d29, q12, q9
2474 vraddhn.u16 d30, q13, q10
/* tail_head interleaves the tail math with prefetch control (PF_*) and store */
2477 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2478 vrshr.u16 q11, q8, #8
2480 vrshr.u16 q12, q9, #8
2481 vrshr.u16 q13, q10, #8
2483 vraddhn.u16 d28, q11, q8
2484 PF add PF_X, PF_X, #8 /* advance prefetch position by one block */
2486 PF addne PF_X, PF_X, #8
2487 PF subne PF_CTL, PF_CTL, #1
2488 vraddhn.u16 d29, q12, q9
2489 vraddhn.u16 d30, q13, q10
2492 vmull.u8 q10, d3, d2
2493 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous 8 pixels */
2495 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] /* prefetch source line */
2496 PF subge PF_X, PF_X, ORIG_W
2497 PF subges PF_CTL, PF_CTL, #0x10
2498 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! /* touch next scanline */
2501 generate_composite_function \
2502 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2503 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2504 8, /* number of pixels, processed in a single block */ \
2505 10, /* prefetch distance */ \
2508 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2509 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2510 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2511 28, /* dst_w_basereg */ \
2512 0, /* dst_r_basereg */ \
2513 0, /* src_basereg */ \
2514 0 /* mask_basereg */
2516 /******************************************************************************/
/*
 * over_0565_8_0565: OVER of an r5g6b5 source with an a8 mask onto an
 * r5g6b5 destination.  Both source and destination are first expanded
 * to planar x888, the blend is done in 8-bit channels, and the result
 * is packed back to r5g6b5.
 */
2518 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
2519 /* mask is in d15 */
2520 convert_0565_to_x888 q4, d2, d1, d0
2521 convert_0565_to_x888 q5, d6, d5, d4
2522 /* source pixel data is in {d0, d1, d2, XX} */
2523 /* destination pixel data is in {d4, d5, d6, XX} */
/* multiply source channels by the mask */
2525 vmull.u8 q6, d15, d2
2526 vmull.u8 q5, d15, d1
2527 vmull.u8 q4, d15, d0
2530 vmull.u8 q13, d7, d6 /* dst channel * inverted alpha (d7) — see full source for d7 setup */
/* rounding /255 of the mask-modulated source back into d0-d2 */
2531 vrshr.u16 q12, q6, #8
2532 vrshr.u16 q11, q5, #8
2533 vrshr.u16 q10, q4, #8
2534 vraddhn.u16 d2, q6, q12
2535 vraddhn.u16 d1, q5, q11
2536 vraddhn.u16 d0, q4, q10
2539 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
/* finish rounding /255 of dst*(1-alpha), then add the source */
2540 vrshr.u16 q14, q8, #8
2541 vrshr.u16 q15, q9, #8
2542 vrshr.u16 q12, q13, #8
2543 vraddhn.u16 d28, q14, q8
2544 vraddhn.u16 d29, q15, q9
2545 vraddhn.u16 d30, q12, q13
2546 vqadd.u8 q0, q0, q14 /* saturating add of modulated source */
2547 vqadd.u8 q1, q1, q15
2548 /* 32bpp result is in {d0, d1, d2, XX} */
2549 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2552 /* TODO: expand macros and do better instructions scheduling */
2553 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2555 pixman_composite_over_0565_8_0565_process_pixblock_tail
2557 vld1.16 {d10, d11}, [DST_R, :128]! /* load next 8 r5g6b5 dst pixels */
2559 pixman_composite_over_0565_8_0565_process_pixblock_head
2560 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous 8 result pixels */
2563 generate_composite_function \
2564 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2565 FLAG_DST_READWRITE, \
2566 8, /* number of pixels, processed in a single block */ \
2567 5, /* prefetch distance */ \
2568 default_init_need_all_regs, \
2569 default_cleanup_need_all_regs, \
2570 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2571 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2572 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2573 28, /* dst_w_basereg */ \
2574 10, /* dst_r_basereg */ \
2575 8, /* src_basereg */ \
2576 15 /* mask_basereg */
2578 /******************************************************************************/
/*
 * over_0565_n_0565: OVER of an r5g6b5 source with a solid (constant) mask
 * onto an r5g6b5 destination.  Only the init differs from the a8-mask
 * variant: the solid mask is loaded from the stack into d15.
 */
2580 .macro pixman_composite_over_0565_n_0565_init
2581 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) /* DUMMY = address of the solid mask argument */
2583 vld1.32 {d15[0]}, [DUMMY] /* load solid mask into d15 */
2587 .macro pixman_composite_over_0565_n_0565_cleanup
/* Reuses the over_0565_8_0565 pixel-block macros; only init/cleanup differ. */
2591 generate_composite_function \
2592 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2593 FLAG_DST_READWRITE, \
2594 8, /* number of pixels, processed in a single block */ \
2595 5, /* prefetch distance */ \
2596 pixman_composite_over_0565_n_0565_init, \
2597 pixman_composite_over_0565_n_0565_cleanup, \
2598 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2599 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2600 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2601 28, /* dst_w_basereg */ \
2602 10, /* dst_r_basereg */ \
2603 8, /* src_basereg */ \
2604 15 /* mask_basereg */
2606 /******************************************************************************/
/*
 * add_0565_8_0565: saturating ADD of an r5g6b5 source, modulated by an
 * a8 mask, onto an r5g6b5 destination.  Source and destination are
 * expanded to x888, the source is multiplied by the mask (rounding /255),
 * and the result is packed back to r5g6b5.
 */
2608 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
2609 /* mask is in d15 */
2610 convert_0565_to_x888 q4, d2, d1, d0
2611 convert_0565_to_x888 q5, d6, d5, d4
2612 /* source pixel data is in {d0, d1, d2, XX} */
2613 /* destination pixel data is in {d4, d5, d6, XX} */
/* src * mask with rounding /255, result back in d0-d2 */
2614 vmull.u8 q6, d15, d2
2615 vmull.u8 q5, d15, d1
2616 vmull.u8 q4, d15, d0
2617 vrshr.u16 q12, q6, #8
2618 vrshr.u16 q11, q5, #8
2619 vrshr.u16 q10, q4, #8
2620 vraddhn.u16 d2, q6, q12
2621 vraddhn.u16 d1, q5, q11
2622 vraddhn.u16 d0, q4, q10
2625 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2628 /* 32bpp result is in {d0, d1, d2, XX} */
2629 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2632 /* TODO: expand macros and do better instructions scheduling */
2633 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2635 pixman_composite_add_0565_8_0565_process_pixblock_tail
2637 vld1.16 {d10, d11}, [DST_R, :128]! /* load next 8 r5g6b5 dst pixels */
2639 pixman_composite_add_0565_8_0565_process_pixblock_head
2640 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous 8 result pixels */
2643 generate_composite_function \
2644 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2645 FLAG_DST_READWRITE, \
2646 8, /* number of pixels, processed in a single block */ \
2647 5, /* prefetch distance */ \
2648 default_init_need_all_regs, \
2649 default_cleanup_need_all_regs, \
2650 pixman_composite_add_0565_8_0565_process_pixblock_head, \
2651 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2652 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2653 28, /* dst_w_basereg */ \
2654 10, /* dst_r_basereg */ \
2655 8, /* src_basereg */ \
2656 15 /* mask_basereg */
2658 /******************************************************************************/
/*
 * out_reverse_8_0565: dst = dst * (1 - mask), where the mask is a8 (d15)
 * and the destination is r5g6b5.  The destination is expanded to x888,
 * scaled by the inverted mask (rounding /255), and repacked.
 */
2660 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2661 /* mask is in d15 */
2662 convert_0565_to_x888 q5, d6, d5, d4
2663 /* destination pixel data is in {d4, d5, d6, xx} */
2664 vmvn.8 d24, d15 /* get inverted alpha */
2665 /* now do alpha blending */
2666 vmull.u8 q8, d24, d4
2667 vmull.u8 q9, d24, d5
2668 vmull.u8 q10, d24, d6
2671 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
/* rounding /255, then pack back to r5g6b5 */
2672 vrshr.u16 q14, q8, #8
2673 vrshr.u16 q15, q9, #8
2674 vrshr.u16 q12, q10, #8
2675 vraddhn.u16 d0, q14, q8
2676 vraddhn.u16 d1, q15, q9
2677 vraddhn.u16 d2, q12, q10
2678 /* 32bpp result is in {d0, d1, d2, XX} */
2679 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2682 /* TODO: expand macros and do better instructions scheduling */
2683 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2685 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2686 vld1.16 {d10, d11}, [DST_R, :128]! /* load next 8 r5g6b5 dst pixels */
2688 pixman_composite_out_reverse_8_0565_process_pixblock_head
2689 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous 8 result pixels */
2692 generate_composite_function \
2693 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2694 FLAG_DST_READWRITE, \
2695 8, /* number of pixels, processed in a single block */ \
2696 5, /* prefetch distance */ \
2697 default_init_need_all_regs, \
2698 default_cleanup_need_all_regs, \
2699 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2700 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2701 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2702 28, /* dst_w_basereg */ \
2703 10, /* dst_r_basereg */ \
2704 15, /* src_basereg */ \
2705 0 /* mask_basereg */
2707 /******************************************************************************/
/*
 * out_reverse_8_8888: dst = dst * (1 - mask), where the mask is a8 (d0)
 * and the destination is a8r8g8b8.  All four destination channels are
 * scaled by the inverted mask with the rounding /255 sequence.
 */
2709 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
2711 /* destination pixel data is in {d4, d5, d6, d7} */
2712 vmvn.8 d1, d0 /* get inverted alpha */
2713 /* now do alpha blending */
2716 vmull.u8 q10, d1, d6
2717 vmull.u8 q11, d1, d7
2720 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
/* rounding /255 of dst * (255 - mask); result in d28-d31 */
2721 vrshr.u16 q14, q8, #8
2722 vrshr.u16 q15, q9, #8
2723 vrshr.u16 q12, q10, #8
2724 vrshr.u16 q13, q11, #8
2725 vraddhn.u16 d28, q14, q8
2726 vraddhn.u16 d29, q15, q9
2727 vraddhn.u16 d30, q12, q10
2728 vraddhn.u16 d31, q13, q11
2729 /* 32bpp result is in {d28, d29, d30, d31} */
2732 /* TODO: expand macros and do better instructions scheduling */
2733 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
2735 pixman_composite_out_reverse_8_8888_process_pixblock_tail
2736 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! /* load next 8 dst pixels */
2738 pixman_composite_out_reverse_8_8888_process_pixblock_head
2739 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous 8 result pixels */
2742 generate_composite_function \
2743 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
2744 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2745 8, /* number of pixels, processed in a single block */ \
2746 5, /* prefetch distance */ \
2749 pixman_composite_out_reverse_8_8888_process_pixblock_head, \
2750 pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
2751 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
2752 28, /* dst_w_basereg */ \
2753 4, /* dst_r_basereg */ \
2754 0, /* src_basereg */ \
2755 0 /* mask_basereg */
2757 /******************************************************************************/
/* Nearest-neighbour scaled scanline functions: these reuse the existing
   unscaled pixel-block macros and only change the fetch loop generated by
   generate_composite_function_nearest_scanline. */

/* nearest 8888 OVER 8888 */
2759 generate_composite_function_nearest_scanline \
2760 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2761 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2762 8, /* number of pixels, processed in a single block */ \
2765 pixman_composite_over_8888_8888_process_pixblock_head, \
2766 pixman_composite_over_8888_8888_process_pixblock_tail, \
2767 pixman_composite_over_8888_8888_process_pixblock_tail_head

/* nearest 8888 OVER 0565 */
2769 generate_composite_function_nearest_scanline \
2770 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2771 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2772 8, /* number of pixels, processed in a single block */ \
2775 pixman_composite_over_8888_0565_process_pixblock_head, \
2776 pixman_composite_over_8888_0565_process_pixblock_tail, \
2777 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2778 28, /* dst_w_basereg */ \
2779 4, /* dst_r_basereg */ \
2780 0, /* src_basereg */ \
2781 24 /* mask_basereg */

/* nearest 8888 SRC 0565 (format conversion copy) */
2783 generate_composite_function_nearest_scanline \
2784 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2785 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2786 8, /* number of pixels, processed in a single block */ \
2789 pixman_composite_src_8888_0565_process_pixblock_head, \
2790 pixman_composite_src_8888_0565_process_pixblock_tail, \
2791 pixman_composite_src_8888_0565_process_pixblock_tail_head

/* nearest 0565 SRC 8888 (format conversion copy) */
2793 generate_composite_function_nearest_scanline \
2794 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2795 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2796 8, /* number of pixels, processed in a single block */ \
2799 pixman_composite_src_0565_8888_process_pixblock_head, \
2800 pixman_composite_src_0565_8888_process_pixblock_tail, \
2801 pixman_composite_src_0565_8888_process_pixblock_tail_head

/* nearest 8888 with a8 mask OVER 0565 */
2803 generate_composite_function_nearest_scanline \
2804 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2805 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2806 8, /* number of pixels, processed in a single block */ \
2807 default_init_need_all_regs, \
2808 default_cleanup_need_all_regs, \
2809 pixman_composite_over_8888_8_0565_process_pixblock_head, \
2810 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2811 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2812 28, /* dst_w_basereg */ \
2813 4, /* dst_r_basereg */ \
2814 8, /* src_basereg */ \
2815 24 /* mask_basereg */

/* nearest 0565 with a8 mask OVER 0565 */
2817 generate_composite_function_nearest_scanline \
2818 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2819 FLAG_DST_READWRITE, \
2820 8, /* number of pixels, processed in a single block */ \
2821 default_init_need_all_regs, \
2822 default_cleanup_need_all_regs, \
2823 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2824 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2825 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2826 28, /* dst_w_basereg */ \
2827 10, /* dst_r_basereg */ \
2828 8, /* src_basereg */ \
2829 15 /* mask_basereg */
2831 /******************************************************************************/
2833 /* Supplementary macro for setting function attributes */
/* Declares a global function symbol; additional directives (e.g. .global,
   .func) are presumably emitted by the elided lines — confirm in full source. */
2834 .macro pixman_asm_function fname
2839 .type fname, %function /* mark the symbol as a function for the ELF symbol table */
2845 * Bilinear scaling support code which tries to provide pixel fetching, color
2846 * format conversion, and interpolation as separate macros which can be used
2847 * as the basic building blocks for constructing bilinear scanline functions.
/*
 * bilinear_load_8888: fetch the two vertically adjacent a8r8g8b8 pixels
 * for one output pixel.  X is a 16.16 fixed-point coordinate; STRIDE is
 * the byte distance between the top and bottom scanlines.
 */
2850 .macro bilinear_load_8888 reg1, reg2, tmp
2851 mov TMP1, X, asr #16 /* integer part of the 16.16 x coordinate */
2853 add TMP1, TOP, TMP1, asl #2 /* byte address: TOP + x*4 */
2854 vld1.32 {reg1}, [TMP1], STRIDE /* top pixel; advance to bottom line */
2855 vld1.32 {reg2}, [TMP1] /* bottom pixel */
/*
 * bilinear_load_0565: same as above for r5g6b5, converting the loaded
 * pixel pair to x888 on the fly.
 */
2858 .macro bilinear_load_0565 reg1, reg2, tmp
2859 mov TMP1, X, asr #16
2861 add TMP1, TOP, TMP1, asl #1 /* byte address: TOP + x*2 */
2862 vld1.32 {reg2[0]}, [TMP1], STRIDE
2863 vld1.32 {reg2[1]}, [TMP1]
2864 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
/*
 * Load two (or four) 8888 pixels pairs and do the vertical interpolation:
 * acc = top * d28 + bottom * d29, where d28/d29 hold the vertical weights.
 */
2867 .macro bilinear_load_and_vertical_interpolate_two_8888 \
2868 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2870 bilinear_load_8888 reg1, reg2, tmp1
2871 vmull.u8 acc1, reg1, d28 /* top * weight_top */
2872 vmlal.u8 acc1, reg2, d29 /* + bottom * weight_bottom */
2873 bilinear_load_8888 reg3, reg4, tmp2
2874 vmull.u8 acc2, reg3, d28
2875 vmlal.u8 acc2, reg4, d29
/* Four-pixel version: simply two back-to-back two-pixel interpolations. */
2878 .macro bilinear_load_and_vertical_interpolate_four_8888 \
2879 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2880 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2882 bilinear_load_and_vertical_interpolate_two_8888 \
2883 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2884 bilinear_load_and_vertical_interpolate_two_8888 \
2885 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
/*
 * 0565 variants: load the r5g6b5 pixel pairs for two output pixels,
 * expand to x888 planes, then vertically interpolate with the weights
 * in d28/d29.  The four-pixel version interleaves the loads, the
 * 0565->x888 expansion (vzip steps) and the multiplies for better
 * pipeline utilization.
 */
2888 .macro bilinear_load_and_vertical_interpolate_two_0565 \
2889 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2891 mov TMP1, X, asr #16 /* integer x of first pixel */
2893 add TMP1, TOP, TMP1, asl #1
2894 mov TMP2, X, asr #16 /* integer x of second pixel */
2896 add TMP2, TOP, TMP2, asl #1
2897 vld1.32 {acc2lo[0]}, [TMP1], STRIDE /* top pixels */
2898 vld1.32 {acc2hi[0]}, [TMP2], STRIDE
2899 vld1.32 {acc2lo[1]}, [TMP1] /* bottom pixels */
2900 vld1.32 {acc2hi[1]}, [TMP2]
2901 convert_0565_to_x888 acc2, reg3, reg2, reg1
2906 vmull.u8 acc1, reg1, d28 /* top * weight_top */
2907 vmlal.u8 acc1, reg2, d29 /* + bottom * weight_bottom */
2908 vmull.u8 acc2, reg3, d28
2909 vmlal.u8 acc2, reg4, d29
2912 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2913 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2914 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2916 mov TMP1, X, asr #16
2918 add TMP1, TOP, TMP1, asl #1
2919 mov TMP2, X, asr #16
2921 add TMP2, TOP, TMP2, asl #1
2922 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
2923 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
2924 vld1.32 {xacc2lo[1]}, [TMP1]
2925 vld1.32 {xacc2hi[1]}, [TMP2]
2926 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2927 mov TMP1, X, asr #16
2929 add TMP1, TOP, TMP1, asl #1
2930 mov TMP2, X, asr #16
2932 add TMP2, TOP, TMP2, asl #1
/* y-pair loads interleaved with the x-pair register shuffles */
2933 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
2934 vzip.u8 xreg1, xreg3
2935 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
2936 vzip.u8 xreg2, xreg4
2937 vld1.32 {yacc2lo[1]}, [TMP1]
2938 vzip.u8 xreg3, xreg4
2939 vld1.32 {yacc2hi[1]}, [TMP2]
2940 vzip.u8 xreg1, xreg2
2941 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2942 vmull.u8 xacc1, xreg1, d28
2943 vzip.u8 yreg1, yreg3
2944 vmlal.u8 xacc1, xreg2, d29
2945 vzip.u8 yreg2, yreg4
2946 vmull.u8 xacc2, xreg3, d28
2947 vzip.u8 yreg3, yreg4
2948 vmlal.u8 xacc2, xreg4, d29
2949 vzip.u8 yreg1, yreg2
2950 vmull.u8 yacc1, yreg1, d28
2951 vmlal.u8 yacc1, yreg2, d29
2952 vmull.u8 yacc2, yreg3, d28
2953 vmlal.u8 yacc2, yreg4, d29
/*
 * Store numpix (4/2/1) interpolated pixels.  The .if/.elseif selection on
 * numpix is in the elided lines; each branch uses the narrowest aligned
 * store that fits.
 */
2956 .macro bilinear_store_8888 numpix, tmp1, tmp2
2958 vst1.32 {d0, d1}, [OUT, :128]! /* 4 pixels */
2960 vst1.32 {d0}, [OUT, :64]! /* 2 pixels */
2962 vst1.32 {d0[0]}, [OUT, :32]! /* 1 pixel */
2964 .error bilinear_store_8888 numpix is unsupported
/* 0565 variant: pack x888 result to r5g6b5 first, then store. */
2968 .macro bilinear_store_0565 numpix, tmp1, tmp2
2973 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2975 vst1.16 {d2}, [OUT, :64]! /* 4 pixels */
2977 vst1.32 {d2[0]}, [OUT, :32]! /* 2 pixels */
2979 vst1.16 {d2[0]}, [OUT, :16]! /* 1 pixel */
2981 .error bilinear_store_0565 numpix is unsupported
/*
 * Interpolate and store a single (trailing) pixel.  Vertical blend via
 * vmull/vmlal with weights d28/d29, then horizontal blend in 16-bit via
 * vmlsl/vmlal with the horizontal weight d30, finally narrowing by
 * 2*BILINEAR_INTERPOLATION_BITS.  The "bubble" comments mark NEON
 * result-latency stalls in this unpipelined path.
 */
2985 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2986 bilinear_load_&src_fmt d0, d1, d2
2987 vmull.u8 q1, d0, d28 /* vertical: top * weight_top */
2988 vmlal.u8 q1, d1, d29 /* + bottom * weight_bottom */
2989 /* 5 cycles bubble */
2990 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
2991 vmlsl.u16 q0, d2, d30 /* horizontal: left * (1 - frac) */
2992 vmlal.u16 q0, d3, d30 /* + right * frac */
2993 /* 5 cycles bubble */
2994 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
2995 /* 3 cycles bubble */
2997 /* 1 cycle bubble */
2998 bilinear_store_&dst_fmt 1, q2, q3
/*
 * Interpolate and store two pixels: vertical blend via the load-and-
 * interpolate helper, horizontal blend per pixel with the weights in
 * d30/d31, then advance the fixed-point x accumulator (q12 += q13).
 */
3001 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
3002 bilinear_load_and_vertical_interpolate_two_&src_fmt \
3003 q1, q11, d0, d1, d20, d21, d22, d23
3004 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3005 vmlsl.u16 q0, d2, d30 /* pixel 0 horizontal blend */
3006 vmlal.u16 q0, d3, d30
3007 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
3008 vmlsl.u16 q10, d22, d31 /* pixel 1 horizontal blend */
3009 vmlal.u16 q10, d23, d31
3010 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3011 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
3012 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) /* new horizontal weights */
3013 vadd.u16 q12, q12, q13 /* advance x accumulator by the step */
3015 bilinear_store_&dst_fmt 2, q2, q3
/*
 * Interpolate and store four pixels (generic, unspecialized path).
 * Two pairs are vertically blended by the four-pixel load helper, then
 * each pair gets the horizontal blend; the x accumulator q12 is advanced
 * and the next weights extracted into q15 along the way.
 */
3018 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
3019 bilinear_load_and_vertical_interpolate_four_&src_fmt \
3020 q1, q11, d0, d1, d20, d21, d22, d23 \
3021 q3, q9, d4, d5, d16, d17, d18, d19
3023 sub TMP1, TMP1, STRIDE /* rewind pointer advanced by the last load */
3024 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3025 vmlsl.u16 q0, d2, d30 /* pixel 0 horizontal blend */
3026 vmlal.u16 q0, d3, d30
3027 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
3028 vmlsl.u16 q10, d22, d31 /* pixel 1 horizontal blend */
3029 vmlal.u16 q10, d23, d31
3030 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) /* weights for pixels 2-3 */
3031 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
3032 vmlsl.u16 q2, d6, d30 /* pixel 2 horizontal blend */
3033 vmlal.u16 q2, d7, d30
3034 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
3036 vmlsl.u16 q8, d18, d31 /* pixel 3 horizontal blend */
3037 vmlal.u16 q8, d19, d31
3038 vadd.u16 q12, q12, q13 /* advance x accumulator */
3039 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3040 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
3041 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3042 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
3043 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) /* weights for next block */
3046 vadd.u16 q12, q12, q13
3047 bilinear_store_&dst_fmt 4, q2, q3
/*
 * Dispatch wrappers: if a specialized, hand-scheduled implementation for
 * this src_fmt/dst_fmt pair was defined (signalled by the corresponding
 * have_bilinear_interpolate_* symbol), use its head/tail/tail_head parts;
 * otherwise fall back to the generic four-pixel macro above.  The
 * eight-pixel variants are built from two four-pixel steps when no
 * specialized eight-pixel code exists.
 */
3050 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3051 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3052 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
3054 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3058 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3059 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3060 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
3064 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3065 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3066 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
3068 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3072 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3073 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3074 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
3076 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3077 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3081 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3082 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3083 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
3085 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3089 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3090 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3091 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
3093 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3094 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
/* Flag bits for generate_bilinear_scanline_func (combined with '|'). */
3098 .set BILINEAR_FLAG_UNROLL_4, 0 /* default: 4 pixels per main-loop iteration */
3099 .set BILINEAR_FLAG_UNROLL_8, 1 /* unroll the main loop to 8 pixels */
3100 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 /* function may clobber d8-d15 (saved/restored) */
3103 * Main template macro for generating NEON optimized bilinear scanline
3106 * Bilinear scanline scaler macro template uses the following arguments:
3107 * fname - name of the function to generate
3108 * src_fmt - source color format (8888 or 0565)
3109 * dst_fmt - destination color format (8888 or 0565)
3110 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
3111 * prefetch_distance - prefetch in the source image by that many
3115 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
3116 src_bpp_shift, dst_bpp_shift, \
3117 prefetch_distance, flags
3119 pixman_asm_function fname
3136 push {r4, r5, r6, r7, r8, r9} /* save callee-saved registers used below */
3137 mov PF_OFFS, #prefetch_distance
3138 ldmia ip, {WB, X, UX, WIDTH} /* load stack-passed args: weights, x, x-step, width */
3139 mul PF_OFFS, PF_OFFS, UX /* prefetch offset in 16.16 source units */
3141 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3145 sub STRIDE, BOTTOM, TOP /* byte distance between the two source scanlines */
3155 vadd.u16 d25, d25, d26
3157 /* ensure good destination alignment */
/* peel one pixel if OUT is not 2-pixel aligned */
3160 tst OUT, #(1 << dst_bpp_shift)
3162 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3163 vadd.u16 q12, q12, q13
3164 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3165 sub WIDTH, WIDTH, #1
3167 vadd.u16 q13, q13, q13
3168 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3169 vadd.u16 q12, q12, q13
/* peel two pixels if OUT is not 4-pixel aligned */
3173 tst OUT, #(1 << (dst_bpp_shift + 1))
3175 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3176 sub WIDTH, WIDTH, #2
3178 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
3179 /*********** 8 pixels per iteration *****************/
/* peel four pixels if OUT is not 8-pixel aligned */
3182 tst OUT, #(1 << (dst_bpp_shift + 2))
3184 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3185 sub WIDTH, WIDTH, #4
3187 subs WIDTH, WIDTH, #8
3189 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3190 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3191 subs WIDTH, WIDTH, #8
/* software-pipelined main loop (branch targets in elided lines) */
3194 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3195 subs WIDTH, WIDTH, #8
3198 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3202 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3205 /*********** 4 pixels per iteration *****************/
3206 subs WIDTH, WIDTH, #4
3208 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3209 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3210 subs WIDTH, WIDTH, #4
3213 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3214 subs WIDTH, WIDTH, #4
3217 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3219 /****************************************************/
3221 /* handle the remaining trailing pixels */
3224 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3228 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3230 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3233 pop {r4, r5, r6, r7, r8, r9} /* restore callee-saved registers */
3253 /*****************************************************************************/
/* Register that an optimized 8888->8888 four-pixel variant exists, so the
 * generic dispatcher macros pick the _head/_tail/_tail_head triple that
 * follows instead of the generic four-pixel implementation. */
3255 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
3257 .macro bilinear_interpolate_four_pixels_8888_8888_head
/* First stage of the pipelined a8r8g8b8 -> a8r8g8b8 four-pixel step.
 * For each output pixel n: TMPn = TOP + (X >> 16) * 4 is the byte address
 * of the left source pixel pair on the top scanline (4 bytes per pixel);
 * the matching 'X += UX' updates sit on lines elided from this excerpt.
 * Each vld1 pair loads {left,right} pixels from the top scanline, then
 * (pointer advanced by STRIDE) from the bottom scanline; q8..q11 hold the
 * vertical blend vmull/vmlal with weights d28/d29 (presumably top/bottom
 * vertical weights, set up outside this excerpt — verify). */
3258 mov TMP1, X, asr #16
3260 add TMP1, TOP, TMP1, asl #2
3261 mov TMP2, X, asr #16
3263 add TMP2, TOP, TMP2, asl #2
3265 vld1.32 {d22}, [TMP1], STRIDE
3266 vld1.32 {d23}, [TMP1]
3267 mov TMP3, X, asr #16
3269 add TMP3, TOP, TMP3, asl #2
3270 vmull.u8 q8, d22, d28
3271 vmlal.u8 q8, d23, d29
3273 vld1.32 {d22}, [TMP2], STRIDE
3274 vld1.32 {d23}, [TMP2]
3275 mov TMP4, X, asr #16
3277 add TMP4, TOP, TMP4, asl #2
3278 vmull.u8 q9, d22, d28
3279 vmlal.u8 q9, d23, d29
3281 vld1.32 {d22}, [TMP3], STRIDE
3282 vld1.32 {d23}, [TMP3]
3283 vmull.u8 q10, d22, d28
3284 vmlal.u8 q10, d23, d29
/* Horizontal pass for pixel 0:
 * q0 = (left << BITS) - left*w + right*w, with w = d30 taken from q15. */
3286 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3287 vmlsl.u16 q0, d16, d30
3288 vmlal.u16 q0, d17, d30
3291 vld1.32 {d16}, [TMP4], STRIDE
3292 vld1.32 {d17}, [TMP4]
3294 vmull.u8 q11, d16, d28
3295 vmlal.u8 q11, d17, d29
/* Start the horizontal pass for pixel 1 (completed in the tail). */
3297 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3298 vmlsl.u16 q1, d18, d31
3301 .macro bilinear_interpolate_four_pixels_8888_8888_tail
/* Drain stage: finish the horizontal interpolation for pixels 1..3
 * (q1..q3), advance the horizontal weight accumulator q12 by q13 and
 * refresh q15, narrow the 32-bit results back down and store four
 * a8r8g8b8 pixels.  NOTE(review): the u16 -> u8 narrowing that fills
 * d6/d7 (q3) before the store is on lines elided from this excerpt. */
3302 vmlal.u16 q1, d19, d31
3303 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3304 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3305 vmlsl.u16 q2, d20, d30
3306 vmlal.u16 q2, d21, d30
3307 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3308 vmlsl.u16 q3, d22, d31
3309 vmlal.u16 q3, d23, d31
3310 vadd.u16 q12, q12, q13
/* Discard the combined (2 * BITS) fractional bits of both blend passes. */
3311 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3312 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3313 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3314 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3315 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3318 vadd.u16 q12, q12, q13
/* Store 4 output pixels; OUT is known 128-bit aligned (see the alignment
 * peeling in generate_bilinear_scanline_func). */
3319 vst1.32 {d6, d7}, [OUT, :128]!
3322 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
/* Steady-state loop body: interleaves the TAIL of the previous four-pixel
 * group (finishing q1..q3 horizontal blends, narrowing, storing) with the
 * HEAD of the next group (address generation, loads, vertical blends), so
 * loads/stores overlap arithmetic.  Register roles are as in the _head
 * macro above; 'X += UX' updates and cache-preload lines are elided from
 * this excerpt. */
3323 mov TMP1, X, asr #16
3325 add TMP1, TOP, TMP1, asl #2
3326 mov TMP2, X, asr #16
3328 add TMP2, TOP, TMP2, asl #2
/* --- tail of previous group --- */
3329 vmlal.u16 q1, d19, d31
3330 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3331 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3332 vmlsl.u16 q2, d20, d30
3333 vmlal.u16 q2, d21, d30
3334 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
/* --- head of next group starts loading under the remaining ALU work --- */
3335 vld1.32 {d20}, [TMP1], STRIDE
3336 vmlsl.u16 q3, d22, d31
3337 vmlal.u16 q3, d23, d31
3338 vld1.32 {d21}, [TMP1]
3339 vmull.u8 q8, d20, d28
3340 vmlal.u8 q8, d21, d29
3341 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3342 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3343 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3344 vld1.32 {d22}, [TMP2], STRIDE
3345 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3346 vadd.u16 q12, q12, q13
3347 vld1.32 {d23}, [TMP2]
3348 vmull.u8 q9, d22, d28
3349 mov TMP3, X, asr #16
3351 add TMP3, TOP, TMP3, asl #2
3352 mov TMP4, X, asr #16
3354 add TMP4, TOP, TMP4, asl #2
3355 vmlal.u8 q9, d23, d29
3356 vld1.32 {d22}, [TMP3], STRIDE
3357 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3358 vld1.32 {d23}, [TMP3]
3359 vmull.u8 q10, d22, d28
3360 vmlal.u8 q10, d23, d29
3362 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3364 vmlsl.u16 q0, d16, d30
3365 vmlal.u16 q0, d17, d30
3367 vld1.32 {d16}, [TMP4], STRIDE
3368 vadd.u16 q12, q12, q13
3369 vld1.32 {d17}, [TMP4]
3371 vmull.u8 q11, d16, d28
3372 vmlal.u8 q11, d17, d29
/* Store the previous group's 4 pixels (NOTE(review): the u16 -> u8
 * narrowing into d6/d7 is on elided lines). */
3373 vst1.32 {d6, d7}, [OUT, :128]!
3374 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3375 vmlsl.u16 q1, d18, d31
3378 /*****************************************************************************/
/* Register the specialized a8r8g8b8 -> r5g6b5 eight-pixel variant so the
 * eight-pixel dispatcher macros use the triple below. */
3380 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1
3382 .macro bilinear_interpolate_eight_pixels_8888_0565_head
/* Pipeline fill for the 8888 -> 0565 eight-pixel step: essentially two
 * back-to-back four-pixel HEAD sequences (same register roles as the
 * 8888_8888 macros: TMPn = TOP + (X >> 16) * 4, top/bottom loads via
 * STRIDE, vertical blend into q8..q11 with weights d28/d29, horizontal
 * blend into q0..q3 with weights d30/d31 from q15).  'X += UX' updates
 * are on lines elided from this excerpt. */
3383 mov TMP1, X, asr #16
3385 add TMP1, TOP, TMP1, asl #2
3386 mov TMP2, X, asr #16
3388 add TMP2, TOP, TMP2, asl #2
3389 vld1.32 {d20}, [TMP1], STRIDE
3390 vld1.32 {d21}, [TMP1]
3391 vmull.u8 q8, d20, d28
3392 vmlal.u8 q8, d21, d29
3393 vld1.32 {d22}, [TMP2], STRIDE
3394 vld1.32 {d23}, [TMP2]
3395 vmull.u8 q9, d22, d28
3396 mov TMP3, X, asr #16
3398 add TMP3, TOP, TMP3, asl #2
3399 mov TMP4, X, asr #16
3401 add TMP4, TOP, TMP4, asl #2
3402 vmlal.u8 q9, d23, d29
3403 vld1.32 {d22}, [TMP3], STRIDE
3404 vld1.32 {d23}, [TMP3]
3405 vmull.u8 q10, d22, d28
3406 vmlal.u8 q10, d23, d29
3407 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3408 vmlsl.u16 q0, d16, d30
3409 vmlal.u16 q0, d17, d30
3411 vld1.32 {d16}, [TMP4], STRIDE
3412 vld1.32 {d17}, [TMP4]
3414 vmull.u8 q11, d16, d28
3415 vmlal.u8 q11, d17, d29
3416 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3417 vmlsl.u16 q1, d18, d31
/* --- second group of four pixels --- */
3419 mov TMP1, X, asr #16
3421 add TMP1, TOP, TMP1, asl #2
3422 mov TMP2, X, asr #16
3424 add TMP2, TOP, TMP2, asl #2
3425 vmlal.u16 q1, d19, d31
3426 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3427 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3428 vmlsl.u16 q2, d20, d30
3429 vmlal.u16 q2, d21, d30
3430 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3431 vld1.32 {d20}, [TMP1], STRIDE
3432 vmlsl.u16 q3, d22, d31
3433 vmlal.u16 q3, d23, d31
3434 vld1.32 {d21}, [TMP1]
3435 vmull.u8 q8, d20, d28
3436 vmlal.u8 q8, d21, d29
/* Drop the combined fractional bits of the first four results. */
3437 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3438 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3439 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3440 vld1.32 {d22}, [TMP2], STRIDE
3441 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3442 vadd.u16 q12, q12, q13
3443 vld1.32 {d23}, [TMP2]
3444 vmull.u8 q9, d22, d28
3445 mov TMP3, X, asr #16
3447 add TMP3, TOP, TMP3, asl #2
3448 mov TMP4, X, asr #16
3450 add TMP4, TOP, TMP4, asl #2
3451 vmlal.u8 q9, d23, d29
3452 vld1.32 {d22}, [TMP3], STRIDE
3453 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3454 vld1.32 {d23}, [TMP3]
3455 vmull.u8 q10, d22, d28
3456 vmlal.u8 q10, d23, d29
3458 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3460 vmlsl.u16 q0, d16, d30
3461 vmlal.u16 q0, d17, d30
3463 vld1.32 {d16}, [TMP4], STRIDE
3464 vadd.u16 q12, q12, q13
3465 vld1.32 {d17}, [TMP4]
3467 vmull.u8 q11, d16, d28
3468 vmlal.u8 q11, d17, d29
3469 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3470 vmlsl.u16 q1, d18, d31
3473 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
/* Drain stage: finish the horizontal blends for the in-flight pixels,
 * advance the weight accumulator q12, then pack the eight a8r8g8b8
 * results to r5g6b5 and store them.  NOTE(review): several packing lines
 * (the narrowing into 8-bit and the vshll/vsri of the other channels into
 * q7) are elided from this excerpt; the visible vshll.u8 #8 / vsri #11
 * pair is the usual NEON 8888 -> 565 bit-insertion sequence — verify
 * against the full file. */
3474 vmlal.u16 q1, d19, d31
3475 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3476 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3477 vmlsl.u16 q2, d20, d30
3478 vmlal.u16 q2, d21, d30
3479 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3480 vmlsl.u16 q3, d22, d31
3481 vmlal.u16 q3, d23, d31
3482 vadd.u16 q12, q12, q13
3483 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3484 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3485 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3486 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3487 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3490 vadd.u16 q12, q12, q13
3497 vshll.u8 q5, d10, #8
3500 vsri.u16 q5, q7, #11
/* Store 8 r5g6b5 pixels (16 bytes) to the 128-bit aligned OUT. */
3501 vst1.32 {d10, d11}, [OUT, :128]!
3504 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
/* Steady-state eight-pixel loop body for 8888 -> 0565: interleaves the
 * TAIL of the previous eight-pixel group (horizontal blends, narrowing,
 * 565 packing, store) with the HEAD of the next (address generation,
 * loads, vertical blends).  Register roles match the _head macro above;
 * 'X += UX' updates, cache preloads and part of the 565 packing are on
 * lines elided from this excerpt. */
3505 mov TMP1, X, asr #16
3507 add TMP1, TOP, TMP1, asl #2
3508 mov TMP2, X, asr #16
3510 add TMP2, TOP, TMP2, asl #2
/* --- finish first half of the previous group --- */
3511 vmlal.u16 q1, d19, d31
3512 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3514 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3515 vmlsl.u16 q2, d20, d30
3516 vmlal.u16 q2, d21, d30
3517 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3518 vld1.32 {d20}, [TMP1], STRIDE
3519 vmlsl.u16 q3, d22, d31
3520 vmlal.u16 q3, d23, d31
3521 vld1.32 {d21}, [TMP1]
3522 vmull.u8 q8, d20, d28
3523 vmlal.u8 q8, d21, d29
3524 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3525 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3526 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3527 vld1.32 {d22}, [TMP2], STRIDE
3528 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3529 vadd.u16 q12, q12, q13
3530 vld1.32 {d23}, [TMP2]
3531 vmull.u8 q9, d22, d28
3532 mov TMP3, X, asr #16
3534 add TMP3, TOP, TMP3, asl #2
3535 mov TMP4, X, asr #16
3537 add TMP4, TOP, TMP4, asl #2
3538 vmlal.u8 q9, d23, d29
3539 vld1.32 {d22}, [TMP3], STRIDE
3540 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3541 vld1.32 {d23}, [TMP3]
3542 vmull.u8 q10, d22, d28
3543 vmlal.u8 q10, d23, d29
3545 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3547 vmlsl.u16 q0, d16, d30
3548 vmlal.u16 q0, d17, d30
3550 vld1.32 {d16}, [TMP4], STRIDE
3551 vadd.u16 q12, q12, q13
3552 vld1.32 {d17}, [TMP4]
3554 vmull.u8 q11, d16, d28
3555 vmlal.u8 q11, d17, d29
3557 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3558 vmlsl.u16 q1, d18, d31
/* --- second group of four next-iteration pixels --- */
3560 mov TMP1, X, asr #16
3562 add TMP1, TOP, TMP1, asl #2
3563 mov TMP2, X, asr #16
3565 add TMP2, TOP, TMP2, asl #2
3566 vmlal.u16 q1, d19, d31
3568 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3569 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3571 vmlsl.u16 q2, d20, d30
3572 vmlal.u16 q2, d21, d30
3573 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3574 vld1.32 {d20}, [TMP1], STRIDE
3575 vmlsl.u16 q3, d22, d31
3576 vmlal.u16 q3, d23, d31
3577 vld1.32 {d21}, [TMP1]
3578 vmull.u8 q8, d20, d28
3579 vmlal.u8 q8, d21, d29
/* 565 packing of the previous group (partially elided — see _tail). */
3581 vshll.u8 q5, d10, #8
3583 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3585 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3586 vsri.u16 q5, q7, #11
3587 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3588 vld1.32 {d22}, [TMP2], STRIDE
3589 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3590 vadd.u16 q12, q12, q13
3591 vld1.32 {d23}, [TMP2]
3592 vmull.u8 q9, d22, d28
3593 mov TMP3, X, asr #16
3595 add TMP3, TOP, TMP3, asl #2
3596 mov TMP4, X, asr #16
3598 add TMP4, TOP, TMP4, asl #2
3599 vmlal.u8 q9, d23, d29
3600 vld1.32 {d22}, [TMP3], STRIDE
3601 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3602 vld1.32 {d23}, [TMP3]
3603 vmull.u8 q10, d22, d28
3604 vmlal.u8 q10, d23, d29
3606 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3608 vmlsl.u16 q0, d16, d30
3609 vmlal.u16 q0, d17, d30
3611 vld1.32 {d16}, [TMP4], STRIDE
3612 vadd.u16 q12, q12, q13
3613 vld1.32 {d17}, [TMP4]
3615 vmull.u8 q11, d16, d28
3616 vmlal.u8 q11, d17, d29
3617 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
/* Store the previous group's 8 r5g6b5 pixels. */
3618 vst1.32 {d10, d11}, [OUT, :128]!
3619 vmlsl.u16 q1, d18, d31
3621 /*****************************************************************************/
/* Instantiate the bilinear SRC scanline scalers for each supported
 * format pair.  Argument order: fname, src_fmt, dst_fmt, src_bpp_shift,
 * dst_bpp_shift, prefetch_distance, flags. */

/* a8r8g8b8 -> a8r8g8b8: 4-byte src/dst, prefetch 28, 4-pixel main loop. */
3623 generate_bilinear_scanline_func \
3624 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
3625 2, 2, 28, BILINEAR_FLAG_UNROLL_4

/* a8r8g8b8 -> r5g6b5: uses the specialized eight-pixel 8888_0565 macros
 * (registered via have_bilinear_interpolate_eight_pixels_8888_0565). */
3627 generate_bilinear_scanline_func \
3628 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
3629 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

/* r5g6b5 -> x8r8g8b8: 2-byte src, 4-byte dst. */
3631 generate_bilinear_scanline_func \
3632 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
3633 1, 2, 28, BILINEAR_FLAG_UNROLL_4

/* r5g6b5 -> r5g6b5: 2-byte src/dst. */
3635 generate_bilinear_scanline_func \
3636 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
3637 1, 1, 28, BILINEAR_FLAG_UNROLL_4