 * Copyright © 2011 SCore Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 * Author: Taekyun Kim (tkq.kim@samsung.com)
 * This file contains scaled bilinear scanline functions implemented
 * using Siarhei's older bilinear macro template.
 *
 * << General scanline function procedures >>
 *  1. bilinear interpolate source pixels
 *  2. load mask pixels
 *  3. load destination pixels
 *  4. duplicate mask to fill whole register
 *  5. interleave source & destination pixels
 *  6. apply mask to source pixels
 *  7. combine source & destination pixels
 *  8. deinterleave final result
 *  9. store destination pixels
 *
 * All registers with a single number (e.g. src0, tmp0) are 64-bit registers.
 * Registers with double numbers (e.g. src01, dst01) are 128-bit registers.
 * All temp registers can be used freely outside the code block.
 * Assume that the symbols (register .req) OUT and MASK are defined by the
 * callers of these macro blocks.
 *
 * There can be lots of pipeline stalls inside a code block and between
 * code blocks. Further optimizations will be done by new macro templates
 * using the head/tail_head/tail scheme.
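 *
 * As a rough orientation only, the per-chunk flow implemented by the
 * macros below corresponds to the following C-like sketch (the names
 * are illustrative, not actual pixman functions):
 *
 *     void scanline_chunk (void)
 *     {
 *         uint8x8_t src  = bilinear_interpolate_src ();  // step 1
 *         uint8x8_t mask = load_mask ();                 // step 2
 *         uint8x8_t dst  = load_dst ();                  // step 3
 *         mask = duplicate_mask (mask);                  // step 4
 *         interleave (&src, &dst);                       // step 5
 *         src  = apply_mask (src, mask);                 // step 6
 *         dst  = combine (src, dst);                     // step 7
 *         deinterleave (&dst);                           // step 8
 *         store_dst (dst);                               // step 9
 *     }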
/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined (__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"
 * Bilinear macros from pixman-arm-neon-asm.S

/* Supplementary macro for setting function attributes */
.macro pixman_asm_function fname
    .type fname, %function
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
.macro bilinear_load_8888 reg1, reg2, tmp
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {reg1}, [TMP1], STRIDE
    vld1.32   {reg2}, [TMP1]

.macro bilinear_load_0565 reg1, reg2, tmp
    add       TMP1, TOP, TMP1, asl #1
    vld1.32   {reg2[0]}, [TMP1], STRIDE
    vld1.32   {reg2[1]}, [TMP1]
    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
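/*
 * Address math for the loads above: TMP1 holds the integer part of the
 * source x coordinate (X >> 16), so TOP + TMP1 * pixel_size points at
 * the left neighbour in the top scanline.  Each vld1 pulls in the two
 * horizontally adjacent pixels that straddle x; STRIDE is preset to
 * BOTTOM - TOP, so the post-incremented second load reads the matching
 * pair from the bottom scanline with no extra address arithmetic.
 */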
.macro bilinear_load_and_vertical_interpolate_two_8888 \
        acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    bilinear_load_8888 reg3, reg4, tmp2
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
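/*
 * The vertical pass above computes, per 8-bit channel,
 *     acc = top * d28 + bottom * d29
 * as a widened 16-bit result, where d28/d29 hold the duplicated
 * top/bottom scanline weights.  The horizontal pass later folds in the
 * per-pixel x weights (d30/d31) and narrows the result back down.
 */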
.macro bilinear_load_and_vertical_interpolate_four_8888 \
        xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
        yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
        xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
        yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
.macro bilinear_load_and_vertical_interpolate_two_0565 \
        acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    add       TMP1, TOP, TMP1, asl #1
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
    vld1.32   {acc2lo[1]}, [TMP1]
    vld1.32   {acc2hi[1]}, [TMP2]
    convert_0565_to_x888 acc2, reg3, reg2, reg1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.macro bilinear_load_and_vertical_interpolate_four_0565 \
        xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
        yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    add       TMP1, TOP, TMP1, asl #1
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {xacc2lo[1]}, [TMP1]
    vld1.32   {xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
    add       TMP1, TOP, TMP1, asl #1
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {yacc2lo[1]}, [TMP1]
    vld1.32   {yacc2hi[1]}, [TMP2]
    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
    vmull.u8  xacc1, xreg1, d28
    vmlal.u8  xacc1, xreg2, d29
    vmull.u8  xacc2, xreg3, d28
    vmlal.u8  xacc2, xreg4, d29
    vmull.u8  yacc1, yreg1, d28
    vmlal.u8  yacc1, yreg2, d29
    vmull.u8  yacc2, yreg3, d28
    vmlal.u8  yacc2, yreg4, d29
.macro bilinear_store_8888 numpix, tmp1, tmp2
    vst1.32   {d0, d1}, [OUT]!
    vst1.32   {d0[0]}, [OUT, :32]!
    .error bilinear_store_8888 numpix is unsupported

.macro bilinear_store_0565 numpix, tmp1, tmp2
    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
    vst1.32   {d2[0]}, [OUT]!
    vst1.16   {d2[0]}, [OUT]!
    .error bilinear_store_0565 numpix is unsupported
 * Macros for loading mask pixels into register 'mask'.
 * The vdup must be done somewhere else.

.macro bilinear_load_mask_x numpix, mask

.macro bilinear_load_mask_8 numpix, mask
    vld1.32   {mask[0]}, [MASK]!
    vld1.16   {mask[0]}, [MASK]!
    vld1.8    {mask[0]}, [MASK]!
    .error bilinear_load_mask_8 numpix is unsupported
    pld       [MASK, #prefetch_offset]

.macro bilinear_load_mask mask_fmt, numpix, mask
    bilinear_load_mask_&mask_fmt numpix, mask
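/*
 * The '&' pastes mask_fmt into the invoked macro name, so 'x' (no mask)
 * dispatches to the empty bilinear_load_mask_x variant and '8' (an a8
 * mask) to the loading variant, whose three vld1 widths correspond to
 * numpix = 4, 2 and 1 mask pixels respectively.
 */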
 * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
 * Interleave should be done somewhere else.

.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01

.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01

.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
    vld1.32   {dst0, dst1}, [OUT]
    vld1.32   {dst0}, [OUT]
    vld1.32   {dst0[0]}, [OUT]
    .error bilinear_load_dst_8888 numpix is unsupported
    pld       [OUT, #(prefetch_offset * 4)]

.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01

.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01

.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
 * Macros for duplicating a partially loaded mask to fill the entire register.
 * We will apply the mask to interleaved source pixels, that is
 *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * So we need to duplicate the loaded mask across the whole register.
 * For two pixels the layout is
 *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * and some optimizations are possible here, including the last-pixel case.
.macro bilinear_duplicate_mask_x numpix, mask

.macro bilinear_duplicate_mask_8 numpix, mask
    vdup.32   mask, mask[0]
    vdup.16   mask, mask[0]
    .error bilinear_duplicate_mask_8 numpix is unsupported

.macro bilinear_duplicate_mask mask_fmt, numpix, mask
    bilinear_duplicate_mask_&mask_fmt numpix, mask
 * Macros for interleaving src and dst pixels into rrrr gggg bbbb aaaa form.
 * Interleave should be done when a mask is enabled or the operator is 'over'.

.macro bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_x_src \
        numpix, src0, src1, src01, dst0, dst1, dst01

.macro bilinear_interleave_src_dst_x_over \
        numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_x_add \
        numpix, src0, src1, src01, dst0, dst1, dst01

.macro bilinear_interleave_src_dst_8_src \
        numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_8_over \
        numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_8_add \
        numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst \
        mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave_src_dst_&mask_fmt&_&op \
        numpix, src0, src1, src01, dst0, dst1, dst01
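/*
 * The interleave step transposes the pixels from packed
 * (a0r0g0b0, a1r1g1b1, ...) byte order into planar per-channel order
 * (r0r1r2r3..., g0g1g2g3..., ...), typically with vuzp.8, so that a
 * single vmull.u8 can multiply a whole channel vector by the
 * duplicated mask.
 */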
 * Macros for applying masks to src pixels (see the combine_mask_u() function).
 * src and dst should be in interleaved form.
 * The mask register should be in the form (m0, m1, m2, m3).

.macro bilinear_apply_mask_to_src_x \
        numpix, src0, src1, src01, mask, \
        tmp01, tmp23, tmp45, tmp67

.macro bilinear_apply_mask_to_src_8 \
        numpix, src0, src1, src01, mask, \
        tmp01, tmp23, tmp45, tmp67

    vmull.u8  tmp01, src0, mask
    vmull.u8  tmp23, src1, mask
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    vraddhn.u16 src0, tmp45, tmp01
    vraddhn.u16 src1, tmp67, tmp23
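/*
 * The vrshr + vraddhn pair is the usual NEON rounding division by 255:
 * for t = src * mask (16-bit) it computes (t + ((t + 128) >> 8) + 128) >> 8,
 * which equals round(t / 255.0) exactly for all t in [0, 255 * 255].
 * A scalar C model of this step (illustrative only):
 *
 *     static uint8_t mul_div_255 (uint8_t a, uint8_t m)
 *     {
 *         uint16_t t = (uint16_t)(a * m);           // vmull.u8
 *         uint16_t h = (uint16_t)((t + 128) >> 8);  // vrshr.u16 #8
 *         return (uint8_t)((t + h + 128) >> 8);     // vraddhn.u16
 *     }
 */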
.macro bilinear_apply_mask_to_src \
        mask_fmt, numpix, src0, src1, src01, mask, \
        tmp01, tmp23, tmp45, tmp67

    bilinear_apply_mask_to_src_&mask_fmt \
        numpix, src0, src1, src01, mask, \
        tmp01, tmp23, tmp45, tmp67
 * Macros for combining src and destination pixels.
 * Whether interleaving is needed depends on the operator 'op'.

.macro bilinear_combine_src \
        numpix, src0, src1, src01, dst0, dst1, dst01, \
        tmp01, tmp23, tmp45, tmp67, tmp8

.macro bilinear_combine_over \
        numpix, src0, src1, src01, dst0, dst1, dst01, \
        tmp01, tmp23, tmp45, tmp67, tmp8

    vdup.32   tmp8, src1[1]
    vmull.u8  tmp01, dst0, tmp8
    vmull.u8  tmp23, dst1, tmp8
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    vraddhn.u16 dst0, tmp45, tmp01
    vraddhn.u16 dst1, tmp67, tmp23
    vqadd.u8  src01, dst01, src01
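/*
 * OVER combining implements the Porter-Duff formula
 *     result = src + dst * (255 - src.alpha) / 255
 * per channel: tmp8 holds the broadcast interpolated source alpha
 * (complemented to 255 - a before the multiplies), dst is scaled by it
 * with the same vmull/vrshr/vraddhn divide-by-255 idiom as the mask
 * step, and the final vqadd.u8 adds src with unsigned saturation.
 */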
.macro bilinear_combine_add \
        numpix, src0, src1, src01, dst0, dst1, dst01, \
        tmp01, tmp23, tmp45, tmp67, tmp8

    vqadd.u8  src01, dst01, src01

.macro bilinear_combine \
        op, numpix, src0, src1, src01, dst0, dst1, dst01, \
        tmp01, tmp23, tmp45, tmp67, tmp8

    bilinear_combine_&op \
        numpix, src0, src1, src01, dst0, dst1, dst01, \
        tmp01, tmp23, tmp45, tmp67, tmp8
 * Macros for final deinterleaving of destination pixels, if needed.

.macro bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_&src_fmt d0, d1, d2
    bilinear_load_mask mask_fmt, 1, d4
    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
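    /*
     * Horizontal pass: with wx = d30 holding the fractional x weight in
     * BILINEAR_INTERPOLATION_BITS of precision, the three instructions
     * above compute q0 = left * (2^BITS - wx) + right * wx, written as
     * (left << BITS) - left * wx + right * wx.  The vshrn by
     * 2 * BILINEAR_INTERPOLATION_BITS below strips the combined vertical
     * and horizontal weight scale from the 32-bit accumulators.
     */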
    /* 5 cycles bubble */
    bilinear_duplicate_mask mask_fmt, 1, d4
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    bilinear_interleave_src_dst \
        mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
        mask_fmt, 1, d0, d1, q0, d4, \
        op, 1, d0, d1, q0, d18, d19, q9, \
    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
    bilinear_store_&dst_fmt 1, q2, q3
.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
        q1, q11, d0, d1, d20, d21, d22, d23
    bilinear_load_mask mask_fmt, 2, d4
    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 2, d4
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interleave_src_dst \
        mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
        mask_fmt, 2, d0, d1, q0, d4, \
        op, 2, d0, d1, q0, d18, d19, q9, \
    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
    bilinear_store_&dst_fmt 2, q2, q3
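/*
 * Weight bookkeeping used throughout: q12 carries the fractional parts
 * of the source x coordinates for the pixels in flight, and q13 carries
 * the matching x increments.  vshr.u16 q15, q12, #(16 - BITS) turns the
 * top bits of each fraction into the next horizontal weights (d30/d31),
 * and vadd.u16 q12, q12, q13 advances the coordinates for the next
 * group of pixels.
 */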
.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
        q1, q11, d0, d1, d20, d21, d22, d23 \
        q3, q9, d4, d5, d16, d17, d18, d19
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    bilinear_load_mask mask_fmt, 4, d22
    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 4, d22
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interleave_src_dst \
        mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
    bilinear_apply_mask_to_src \
        mask_fmt, 4, d0, d1, q0, d22, \
        op, 4, d0, d1, q0, d2, d3, q1, \
    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
    bilinear_store_&dst_fmt 4, q2, q3
.set BILINEAR_FLAG_USE_MASK,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
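/*
 * BILINEAR_FLAG_USE_MASK selects the masked variant of the generated
 * function (an extra MASK pointer argument and register).  Judging from
 * the register usage below, BILINEAR_FLAG_USE_ALL_NEON_REGS additionally
 * lets the function use the callee-saved NEON registers d8-d15, which
 * then must be saved and restored in the prologue/epilogue.
 */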
 * Main template macro for generating NEON-optimized bilinear scanline functions.
 *
 * The bilinear scanline generator macro takes the following arguments:
 *  fname - name of the function to generate
 *  src_fmt - source color format (8888 or 0565)
 *  dst_fmt - destination color format (8888 or 0565)
 *  src/dst_bpp_shift - (1 << bpp_shift) is the size of a src/dst pixel in bytes
 *  process_last_pixel - code block that interpolates one pixel and does not
 *                       update the horizontal weight
 *  process_two_pixels - code block that interpolates two pixels and updates
 *                       the horizontal weights
 *  process_four_pixels - code block that interpolates four pixels and updates
 *                        the horizontal weights
 *  process_pixblock_head - head part of the middle loop
 *  process_pixblock_tail - tail part of the middle loop
 *  process_pixblock_tail_head - tail_head part of the middle loop
 *  pixblock_size - number of pixels processed in a single middle-loop iteration
 *  prefetch_distance - prefetch in the source image by that many pixels ahead
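 *
 * From the argument and stack layout in the prologue below, the generated
 * functions are callable from C with signatures along these lines (a
 * sketch; parameter names are illustrative):
 *
 *     void
 *     pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon (
 *         uint32_t       *dst,     // r0 (OUT)
 *         const uint32_t *top,     // r1, upper source scanline
 *         const uint32_t *bottom,  // r2, lower source scanline
 *         int             wt,      // r3, top vertical weight
 *         int             wb,      // stack, bottom vertical weight
 *         pixman_fixed_t  x,       // stack, 16.16 start position
 *         pixman_fixed_t  ux,      // stack, 16.16 step
 *         int             width);  // stack, pixels to produce
 *
 * Masked variants take an extra mask pointer as the second argument,
 * which is why their prologue also loads WT from the stack.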
.macro generate_bilinear_scanline_func \
        src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
        bilinear_process_last_pixel, \
        bilinear_process_two_pixels, \
        bilinear_process_four_pixels, \
        bilinear_process_pixblock_head, \
        bilinear_process_pixblock_tail, \
        bilinear_process_pixblock_tail_head, \

pixman_asm_function fname
.if pixblock_size == 8
.elseif pixblock_size == 4
    .error unsupported pixblock size

.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}

.set prefetch_offset, prefetch_distance
    push      {r4, r5, r6, r7, r8, r9, r10, ip}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WT, WB, X, UX, WIDTH}
    mul       PF_OFFS, PF_OFFS, UX
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    sub       STRIDE, BOTTOM, TOP
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment */
    tst       OUT, #(1 << dst_bpp_shift)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_process_last_pixel
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    bilinear_process_two_pixels
.if pixblock_size == 8
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    bilinear_process_four_pixels
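/*
 * Alignment head: each tst above checks one low bit of the output
 * pointer.  Handling 1, then 2, then (for 8-pixel blocks) 4 leading
 * pixels individually brings OUT up to pixblock alignment, so the main
 * loop can use :128-aligned vst1 stores.  The vadd.u16 q13, q13, q13
 * doubles the per-iteration x advance once pixels start being
 * processed in pairs.
 */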
    subs      WIDTH, WIDTH, #pixblock_size
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_process_pixblock_head
    subs      WIDTH, WIDTH, #pixblock_size
    bilinear_process_pixblock_tail_head
    subs      WIDTH, WIDTH, #pixblock_size
    bilinear_process_pixblock_tail
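/*
 * Classic software pipelining: 'head' issues the loads and first-stage
 * arithmetic for a pixel block, 'tail' finishes the previous block, and
 * 'tail_head' in the loop body overlaps the tail of block N with the
 * head of block N+1 to hide memory and multiplier latencies.
 */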
.if pixblock_size == 8
    bilinear_process_four_pixels

    /* handle the remaining trailing pixels */
    bilinear_process_two_pixels
    bilinear_process_last_pixel

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    pop       {r4, r5, r6, r7, r8, r9}
    pop       {r4, r5, r6, r7, r8, r9, r10, ip}

.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
/* src_8888_8_8888 */
.macro bilinear_src_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, src

.macro bilinear_src_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, src

.macro bilinear_src_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, src

.macro bilinear_src_8888_8_8888_process_pixblock_head
    bilinear_src_8888_8_8888_process_four_pixels

.macro bilinear_src_8888_8_8888_process_pixblock_tail

.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    bilinear_src_8888_8_8888_process_pixblock_tail
    bilinear_src_8888_8_8888_process_pixblock_head
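/*
 * For the masked SRC variants there is no hand-scheduled pipeline:
 * 'head' simply processes a whole four-pixel group, 'tail' is empty,
 * and 'tail_head' degenerates to tail followed by head, trading some
 * scheduling efficiency for much simpler code.
 */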
/* src_8888_8_0565 */
.macro bilinear_src_8888_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 0565, src

.macro bilinear_src_8888_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 0565, src

.macro bilinear_src_8888_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 0565, src

.macro bilinear_src_8888_8_0565_process_pixblock_head
    bilinear_src_8888_8_0565_process_four_pixels

.macro bilinear_src_8888_8_0565_process_pixblock_tail

.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    bilinear_src_8888_8_0565_process_pixblock_tail
    bilinear_src_8888_8_0565_process_pixblock_head

/* src_0565_8_x888 */
.macro bilinear_src_0565_8_x888_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 8888, src

.macro bilinear_src_0565_8_x888_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 8888, src

.macro bilinear_src_0565_8_x888_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 8888, src

.macro bilinear_src_0565_8_x888_process_pixblock_head
    bilinear_src_0565_8_x888_process_four_pixels

.macro bilinear_src_0565_8_x888_process_pixblock_tail

.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    bilinear_src_0565_8_x888_process_pixblock_tail
    bilinear_src_0565_8_x888_process_pixblock_head

/* src_0565_8_0565 */
.macro bilinear_src_0565_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 0565, src

.macro bilinear_src_0565_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 0565, src

.macro bilinear_src_0565_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 0565, src

.macro bilinear_src_0565_8_0565_process_pixblock_head
    bilinear_src_0565_8_0565_process_four_pixels

.macro bilinear_src_0565_8_0565_process_pixblock_tail

.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    bilinear_src_0565_8_0565_process_pixblock_tail
    bilinear_src_0565_8_0565_process_pixblock_head
.macro bilinear_over_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, over

.macro bilinear_over_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, over

.macro bilinear_over_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, over

.macro bilinear_over_8888_8888_process_pixblock_head
    add       TMP1, TOP, TMP1, asl #2
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28
    vmlal.u8  q8, d23, d29
    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
.macro bilinear_over_8888_8888_process_pixblock_tail
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    pld       [OUT, #(prefetch_offset * 4)]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vrshr.u16 q1, q11, #8
    vrshr.u16 q10, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q10, q2
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.macro bilinear_over_8888_8888_process_pixblock_tail_head
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    add       TMP1, TOP, TMP1, asl #2
    vmlsl.u16 q2, d20, d30
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmull.u8  q11, d2, d4
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vrshr.u16 q1, q11, #8
    vmlal.u16 q0, d17, d30
    vrshr.u16 q8, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q8, q2
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
    vadd.u16  q12, q12, q13
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
/* over_8888_8_8888 */
.macro bilinear_over_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, over

.macro bilinear_over_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over

.macro bilinear_over_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, over

.macro bilinear_over_8888_8_8888_process_pixblock_head
    mov       TMP1, X, asr #16
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {d0}, [TMP1], STRIDE
    mov       TMP2, X, asr #16
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d1}, [TMP1]
    mov       TMP3, X, asr #16
    add       TMP3, TOP, TMP3, asl #2
    vld1.32   {d2}, [TMP2], STRIDE
    mov       TMP4, X, asr #16
    add       TMP4, TOP, TMP4, asl #2
    vld1.32   {d3}, [TMP2]
    vmull.u8  q2, d0, d28
    vmull.u8  q3, d2, d28
    vmlal.u8  q2, d1, d29
    vmlal.u8  q3, d3, d29
    vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2}, [TMP3], STRIDE
    vld1.32   {d3}, [TMP3]
    vld1.32   {d4}, [TMP4], STRIDE
    vld1.32   {d5}, [TMP4]
    vmull.u8  q3, d2, d28
    vmlal.u8  q3, d3, d29
    vmull.u8  q1, d4, d28
    vmlal.u8  q1, d5, d29
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22[0]}, [MASK]!
    pld       [MASK, #prefetch_offset]
    vadd.u16  q12, q12, q13
.macro bilinear_over_8888_8_8888_process_pixblock_tail
    vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d18, d19}, [OUT, :128]
    vmull.u8  q10, d16, d22
    vmull.u8  q11, d17, d22
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
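/*
 * vrsra.u16 q10, q10, #8 followed by vrshrn.u16 #8 is a second form of
 * the divide-by-255 idiom: q10 += (q10 + 128) >> 8, then the rounding
 * narrowing shift produces (q10 + 128) >> 8, together giving
 * round(src * mask / 255) just like the vrshr/vraddhn pair used in
 * bilinear_apply_mask_to_src_8.
 */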
    vmull.u8  q10, d18, d22
    vmull.u8  q11, d19, d22
    vrshr.u16 q9, q10, #8
    vrshr.u16 q0, q11, #8
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q0, q11
    vst1.32   {d18, d19}, [OUT, :128]!
.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
    vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
    mov       TMP1, X, asr #16
    add       TMP1, TOP, TMP1, asl #2
    vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d0}, [TMP1], STRIDE
    mov       TMP2, X, asr #16
    add       TMP2, TOP, TMP2, asl #2
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vld1.32   {d1}, [TMP1]
    mov       TMP3, X, asr #16
    add       TMP3, TOP, TMP3, asl #2
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vld1.32   {d2}, [TMP2], STRIDE
    mov       TMP4, X, asr #16
    add       TMP4, TOP, TMP4, asl #2
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d3}, [TMP2]
    vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmull.u8  q2, d0, d28
    vmull.u8  q3, d2, d28
    vld1.32   {d18, d19}, [OUT, :128]
    pld       [OUT, #(prefetch_offset * 4)]
    vmlal.u8  q2, d1, d29
    vmlal.u8  q3, d3, d29
    vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmull.u8  q10, d16, d22
    vmull.u8  q11, d17, d22
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vld1.32   {d2}, [TMP3], STRIDE
    vld1.32   {d3}, [TMP3]
    vld1.32   {d4}, [TMP4], STRIDE
    vmull.u8  q10, d18, d22
    vmull.u8  q11, d19, d22
    vld1.32   {d5}, [TMP4]
    vmull.u8  q3, d2, d28
    vrshr.u16 q9, q10, #8
    vrshr.u16 q15, q11, #8
    vmlal.u8  q3, d3, d29
    vmull.u8  q1, d4, d28
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q15, q11
    vmlal.u8  q1, d5, d29
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22[0]}, [MASK]!
    vadd.u16  q12, q12, q13
    vst1.32   {d18, d19}, [OUT, :128]!
.macro bilinear_add_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, add

.macro bilinear_add_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add

.macro bilinear_add_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, add

.macro bilinear_add_8888_8888_process_pixblock_head
    bilinear_add_8888_8888_process_four_pixels

.macro bilinear_add_8888_8888_process_pixblock_tail

.macro bilinear_add_8888_8888_process_pixblock_tail_head
    bilinear_add_8888_8888_process_pixblock_tail
    bilinear_add_8888_8888_process_pixblock_head

/* add_8888_8_8888 */
.macro bilinear_add_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, add

.macro bilinear_add_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, add

.macro bilinear_add_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, add

.macro bilinear_add_8888_8_8888_process_pixblock_head
    bilinear_add_8888_8_8888_process_four_pixels

.macro bilinear_add_8888_8_8888_process_pixblock_tail

.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
    bilinear_add_8888_8_8888_process_pixblock_tail
    bilinear_add_8888_8_8888_process_pixblock_head
/* Bilinear scanline functions */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
    bilinear_src_8888_8_8888_process_last_pixel, \
    bilinear_src_8888_8_8888_process_two_pixels, \
    bilinear_src_8888_8_8888_process_four_pixels, \
    bilinear_src_8888_8_8888_process_pixblock_head, \
    bilinear_src_8888_8_8888_process_pixblock_tail, \
    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
    bilinear_src_8888_8_0565_process_last_pixel, \
    bilinear_src_8888_8_0565_process_two_pixels, \
    bilinear_src_8888_8_0565_process_four_pixels, \
    bilinear_src_8888_8_0565_process_pixblock_head, \
    bilinear_src_8888_8_0565_process_pixblock_tail, \
    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
    bilinear_src_0565_8_x888_process_last_pixel, \
    bilinear_src_0565_8_x888_process_two_pixels, \
    bilinear_src_0565_8_x888_process_four_pixels, \
    bilinear_src_0565_8_x888_process_pixblock_head, \
    bilinear_src_0565_8_x888_process_pixblock_tail, \
    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
    bilinear_src_0565_8_0565_process_last_pixel, \
    bilinear_src_0565_8_0565_process_two_pixels, \
    bilinear_src_0565_8_0565_process_four_pixels, \
    bilinear_src_0565_8_0565_process_pixblock_head, \
    bilinear_src_0565_8_0565_process_pixblock_tail, \
    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
    bilinear_over_8888_8888_process_last_pixel, \
    bilinear_over_8888_8888_process_two_pixels, \
    bilinear_over_8888_8888_process_four_pixels, \
    bilinear_over_8888_8888_process_pixblock_head, \
    bilinear_over_8888_8888_process_pixblock_tail, \
    bilinear_over_8888_8888_process_pixblock_tail_head, \

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
    bilinear_over_8888_8_8888_process_last_pixel, \
    bilinear_over_8888_8_8888_process_two_pixels, \
    bilinear_over_8888_8_8888_process_four_pixels, \
    bilinear_over_8888_8_8888_process_pixblock_head, \
    bilinear_over_8888_8_8888_process_pixblock_tail, \
    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
    bilinear_add_8888_8888_process_last_pixel, \
    bilinear_add_8888_8888_process_two_pixels, \
    bilinear_add_8888_8888_process_four_pixels, \
    bilinear_add_8888_8888_process_pixblock_head, \
    bilinear_add_8888_8888_process_pixblock_tail, \
    bilinear_add_8888_8888_process_pixblock_tail_head, \

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
    bilinear_add_8888_8_8888_process_last_pixel, \
    bilinear_add_8888_8_8888_process_two_pixels, \
    bilinear_add_8888_8_8888_process_four_pixels, \
    bilinear_add_8888_8_8888_process_pixblock_head, \
    bilinear_add_8888_8_8888_process_pixblock_tail, \
    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK