qemu/pixman/pixman/pixman-arm-neon-asm.h

   1 /*
   2  * Copyright © 2009 Nokia Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  *
  23  * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
  24  */
  25
  26 /*
  27  * This file contains a macro ('generate_composite_function') which can
  28  * construct 2D image processing functions, based on a common template.
  29  * Any combinations of source, destination and mask images with 8bpp,
  30  * 16bpp, 24bpp, 32bpp color formats are supported.
  31  *
  32  * This macro takes care of:
  33  *  - handling of leading and trailing unaligned pixels
  34  *  - doing most of the work related to L2 cache preload
  35  *  - encouraging the use of software pipelining for better instruction
  36  *    scheduling
  37  *
  38  * The user of this macro has to provide some configuration parameters
  39  * (bit depths for the images, prefetch distance, etc.) and a set of
  40  * macros, which should implement basic code chunks responsible for
  41  * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
  42  * examples.
  43  *
  44  * TODO:
  45  *  - try overlapped pixel method (from Ian Rickards) when processing
  46  *    exactly two blocks of pixels
  47  *  - maybe add an option to do reverse scanline processing
  48  */
  49
  50 /*
  51  * Bit flags for 'generate_composite_function' macro which are used
  52  * to tune generated functions behavior.
  53  */
  54 .set FLAG_DST_WRITEONLY,       0 /* no flag bits set: dst_r_bpp becomes 0 */
  55 .set FLAG_DST_READWRITE,       1 /* bit 0: dst_r_bpp is set to dst_w_bpp */
  56 .set FLAG_DEINTERLEAVE_32BPP,  2 /* bit 1: sets DEINTERLEAVE_32BPP_ENABLED */
  57
  58 /*
  59  * Offset in stack where mask and source pointer/stride can be accessed
  60  * from 'init' macro. This is useful for doing special handling for solid mask.
  61  */
  62 .set ARGS_STACK_OFFSET,        40 /* 4 bytes * 10 registers of 'push {r4-r12, lr}' */
  63
  64 /*
  65  * Constants for selecting preferable prefetch type.
  66  */
  67 .set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
  68 .set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
                                      /* (emitted by 'cache_preload_simple') */
  69 .set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
                                      /* (instructions wrapped in 'PF') */
  70
  71 /*
  72  * Definitions of supplementary pixld/pixst macros (for partial load/store of
  73  * pixel data).
  74  */
  75
  76 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
     /*
      * Emits 'op.elem_size' on one D register, d(reg1), at the address held
      * in mem_operand, with address writeback. When abits > 0, it is added
      * to the address operand as the ':abits' alignment qualifier.
      */
  77 .if abits > 0
  78     op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
  79 .else
  80     op&.&elem_size {d&reg1}, [&mem_operand&]!
  81 .endif
  82 .endm
  83
  84 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
     /*
      * Two register variant of 'pixldst1': emits 'op.elem_size' on d(reg1)
      * and d(reg2) with address writeback, adding the ':abits' alignment
      * qualifier only when abits > 0.
      */
  85 .if abits > 0
  86     op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
  87 .else
  88     op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
  89 .endif
  90 .endm
  91
  92 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
     /*
      * Four register variant of 'pixldst1': emits 'op.elem_size' on d(reg1),
      * d(reg2), d(reg3) and d(reg4) with address writeback, adding the
      * ':abits' alignment qualifier only when abits > 0.
      */
  93 .if abits > 0
  94     op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
  95 .else
  96     op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
  97 .endif
  98 .endm
  99
 100 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
     /*
      * Emits 'op.elem_size' on a single lane, d(reg1)[idx], with address
      * writeback. 'abits' is accepted so that the signature matches the
      * other helpers, but it is not used: no alignment qualifier is emitted.
      */
 101     op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
 102 .endm
 103
 104 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
     /*
      * Emits 'op.elem_size' on three D registers with address writeback.
      * There is no alignment parameter. In this file it is only used with
      * vld3/vst3 for 24bpp data.
      */
 105     op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
 106 .endm
 107
 108 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
     /*
      * Single lane counterpart of 'pixldst3': emits 'op.elem_size' on lane
      * 'idx' of each of the three D registers, with address writeback.
      */
 109     op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
 110 .endm
 111
 112 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
     /*
      * Transfers 'numbytes' bytes with instruction 'op' between memory and
      * the register slots which start at d(basereg). The slot is selected by
      * the size of the transfer:
      *   32 bytes - d(basereg+4) .. d(basereg+7)
      *   16 bytes - d(basereg+2), d(basereg+3)
      *    8 bytes - d(basereg+1)
      *    4 bytes - 32-bit lane 1 of d(basereg+0)
      *    2 bytes - 16-bit lane 1 of d(basereg+0)
      *    1 byte  - 8-bit lane 1 of d(basereg+0)
      * so chunks of different sizes never overlap each other. When
      * RESPECT_STRICT_ALIGNMENT is nonzero and elem_size is smaller than the
      * chunk, the 4 and 2 byte chunks are transferred as several lanes of
      * 'elem_size' bits which cover the same bytes of d(basereg+0).
      * Any other size is rejected at assembly time.
      */
 113 .if numbytes == 32
 114     pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
 115                               %(basereg+6), %(basereg+7), mem_operand, abits
 116 .elseif numbytes == 16
 117     pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
 118 .elseif numbytes == 8
 119     pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
 120 .elseif numbytes == 4
 121     .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
 122         pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
 123     .elseif elem_size == 16
             /* 16-bit lanes 2 and 3 are the same bytes as 32-bit lane 1 */
 124         pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
 125         pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
 126     .else
             /* 8-bit lanes 4 to 7 are the same bytes as 32-bit lane 1 */
 127         pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
 128         pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
 129         pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
 130         pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
 131     .endif
 132 .elseif numbytes == 2
 133     .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
 134         pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
 135     .else
             /* 8-bit lanes 2 and 3 are the same bytes as 16-bit lane 1 */
 136         pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
 137         pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
 138     .endif
 139 .elseif numbytes == 1
 140     pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
 141 .else
 142     .error "unsupported size: numbytes"
 143 .endif
 144 .endm
 145
 146 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
     /*
      * Loads 'numpix' pixels of 'bpp' bits each from the address held in
      * mem_operand, which is advanced past them. Expands to nothing when
      * bpp is 0.
      *  - 8 pixels of 32bpp with DEINTERLEAVE_32BPP_ENABLED set: one vld4.8
      *    into d(basereg+4) .. d(basereg+7)
      *  - 24bpp: vld3.8 into d(basereg+3) .. d(basereg+5) for 8 pixels, or
      *    into lanes 4-7, 2-3 or 1 of d(basereg+0) .. d(basereg+2) for
      *    4, 2 or 1 pixels
      *  - anything else: vld1 through 'pixldst'
      * 'abits' (0 by default, meaning no qualifier) is the address alignment
      * in bits; the 24bpp paths do not use it.
      */
 147 .if bpp > 0
 148 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
 149     pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
 150                       %(basereg+6), %(basereg+7), mem_operand, abits
 151 .elseif (bpp == 24) && (numpix == 8)
 152     pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
 153 .elseif (bpp == 24) && (numpix == 4)
 154     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
 155     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
 156     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
 157     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
 158 .elseif (bpp == 24) && (numpix == 2)
 159     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
 160     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
 161 .elseif (bpp == 24) && (numpix == 1)
 162     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
 163 .else
 164     pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
 165 .endif
 166 .endif
 167 .endm
 168
 169 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
     /*
      * Store counterpart of 'pixld': writes 'numpix' pixels of 'bpp' bits
      * each to the address held in mem_operand, which is advanced past them.
      * The same register slots are used as in 'pixld', with vst4.8, vst3.8
      * and vst1 in place of the loads. Expands to nothing when bpp is 0.
      * 'abits' (0 by default, meaning no qualifier) is the address alignment
      * in bits; the 24bpp paths do not use it.
      */
 170 .if bpp > 0
 171 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
 172     pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
 173                       %(basereg+6), %(basereg+7), mem_operand, abits
 174 .elseif (bpp == 24) && (numpix == 8)
 175     pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
 176 .elseif (bpp == 24) && (numpix == 4)
 177     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
 178     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
 179     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
 180     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
 181 .elseif (bpp == 24) && (numpix == 2)
 182     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
 183     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
 184 .elseif (bpp == 24) && (numpix == 1)
 185     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
 186 .else
 187     pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
 188 .endif
 189 .endif
 190 .endm
 191
 192 .macro pixld_a numpix, bpp, basereg, mem_operand
     /*
      * Same as 'pixld', but passes an address alignment hint equal to the
      * size of the whole transfer in bits (bpp * numpix), capped at 128.
      */
 193 .if (bpp * numpix) <= 128
 194     pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
 195 .else
 196     pixld numpix, bpp, basereg, mem_operand, 128
 197 .endif
 198 .endm
 199
 200 .macro pixst_a numpix, bpp, basereg, mem_operand
     /*
      * Same as 'pixst', but passes an address alignment hint equal to the
      * size of the whole transfer in bits (bpp * numpix), capped at 128.
      */
 201 .if (bpp * numpix) <= 128
 202     pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
 203 .else
 204     pixst numpix, bpp, basereg, mem_operand, 128
 205 .endif
 206 .endm
 207
 208 /*
 209  * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X and
 210  * SRC_WIDTH_FIXED register aliases to be defined)
 211  */
 212 .macro pixld1_s elem_size, reg1, mem_operand
     /*
      * Fills d(reg1) with pixels gathered from the scanline which starts at
      * mem_operand: four pixels (lanes 0-3) for elem_size 16, two pixels
      * (lanes 0-1) for elem_size 32. Any other elem_size is an assembly
      * error. For every pixel:
      *  - the pixel index is 'VX asr 16'; it is scaled by the pixel size
      *    in bytes and added to mem_operand (which itself is not modified)
      *  - VX is advanced by UNIT_X, then SRC_WIDTH_FIXED is subtracted
      *    from it repeatedly (local label 5) for as long as the result is
      *    not negative
      * TMP1 and TMP2 are used alternately for the addresses. For elem_size
      * 16 the loads are interleaved with the address calculations of the
      * following pixels.
      * NOTE(review): VX looks like a 16.16 fixed point coordinate which is
      * kept negative - confirm against the code which sets it up.
      */
 213 .if elem_size == 16
 214     mov     TMP1, VX, asr #16
 215     adds    VX, VX, UNIT_X
 216 5:  subpls  VX, VX, SRC_WIDTH_FIXED
 217     bpl     5b
 218     add     TMP1, mem_operand, TMP1, asl #1
 219     mov     TMP2, VX, asr #16
 220     adds    VX, VX, UNIT_X
 221 5:  subpls  VX, VX, SRC_WIDTH_FIXED
 222     bpl     5b
 223     add     TMP2, mem_operand, TMP2, asl #1
 224     vld1.16 {d&reg1&[0]}, [TMP1, :16]
 225     mov     TMP1, VX, asr #16
 226     adds    VX, VX, UNIT_X
 227 5:  subpls  VX, VX, SRC_WIDTH_FIXED
 228     bpl     5b
 229     add     TMP1, mem_operand, TMP1, asl #1
 230     vld1.16 {d&reg1&[1]}, [TMP2, :16]
 231     mov     TMP2, VX, asr #16
 232     adds    VX, VX, UNIT_X
 233 5:  subpls  VX, VX, SRC_WIDTH_FIXED
 234     bpl     5b
 235     add     TMP2, mem_operand, TMP2, asl #1
 236     vld1.16 {d&reg1&[2]}, [TMP1, :16]
 237     vld1.16 {d&reg1&[3]}, [TMP2, :16]
 238 .elseif elem_size == 32
 239     mov     TMP1, VX, asr #16
 240     adds    VX, VX, UNIT_X
 241 5:  subpls  VX, VX, SRC_WIDTH_FIXED
 242     bpl     5b
 243     add     TMP1, mem_operand, TMP1, asl #2
 244     mov     TMP2, VX, asr #16
 245     adds    VX, VX, UNIT_X
 246 5:  subpls  VX, VX, SRC_WIDTH_FIXED
 247     bpl     5b
 248     add     TMP2, mem_operand, TMP2, asl #2
 249     vld1.32 {d&reg1&[0]}, [TMP1, :32]
 250     vld1.32 {d&reg1&[1]}, [TMP2, :32]
 251 .else
 252     .error "unsupported"
 253 .endif
 254 .endm
 255
 256 .macro pixld2_s elem_size, reg1, reg2, mem_operand
     /*
      * Fills d(reg1) and d(reg2) by running 'pixld1_s' once for each of
      * them. The first branch is disabled by '.if 0' and never assembled;
      * unlike 'pixld1_s' it contains no SRC_WIDTH_FIXED adjustment of VX.
      */
 257 .if 0 /* elem_size == 32 */
 258     mov     TMP1, VX, asr #16
 259     add     VX, VX, UNIT_X, asl #1
 260     add     TMP1, mem_operand, TMP1, asl #2
 261     mov     TMP2, VX, asr #16
 262     sub     VX, VX, UNIT_X
 263     add     TMP2, mem_operand, TMP2, asl #2
 264     vld1.32 {d&reg1&[0]}, [TMP1, :32]
 265     mov     TMP1, VX, asr #16
 266     add     VX, VX, UNIT_X, asl #1
 267     add     TMP1, mem_operand, TMP1, asl #2
 268     vld1.32 {d&reg2&[0]}, [TMP2, :32]
 269     mov     TMP2, VX, asr #16
 270     add     VX, VX, UNIT_X
 271     add     TMP2, mem_operand, TMP2, asl #2
 272     vld1.32 {d&reg1&[1]}, [TMP1, :32]
 273     vld1.32 {d&reg2&[1]}, [TMP2, :32]
 274 .else
 275     pixld1_s elem_size, reg1, mem_operand
 276     pixld1_s elem_size, reg2, mem_operand
 277 .endif
 278 .endm
 279
 280 .macro pixld0_s elem_size, reg1, idx, mem_operand
     /*
      * Loads a single pixel for nearest scaling into lane 'idx' of d(reg1).
      * The pixel index is 'VX asr 16'; it is scaled by the pixel size in
      * bytes and added to mem_operand (which itself is not modified).
      * VX is then advanced by UNIT_X and SRC_WIDTH_FIXED is subtracted from
      * it repeatedly (local label 5) for as long as the result is not
      * negative. TMP1 is used as a scratch register.
      *
      * Only 16 and 32 bit elements are implemented. Any other elem_size is
      * rejected with an assembly error, like in 'pixld1_s', instead of
      * silently expanding to nothing and leaving the register lane unloaded.
      */
 281 .if elem_size == 16
 282     mov     TMP1, VX, asr #16
 283     adds    VX, VX, UNIT_X
 284 5:  subpls  VX, VX, SRC_WIDTH_FIXED
 285     bpl     5b
 286     add     TMP1, mem_operand, TMP1, asl #1
 287     vld1.16 {d&reg1&[idx]}, [TMP1, :16]
 288 .elseif elem_size == 32
 289     mov     TMP1, VX, asr #16
 290     adds    VX, VX, UNIT_X
 291 5:  subpls  VX, VX, SRC_WIDTH_FIXED
 292     bpl     5b
 293     add     TMP1, mem_operand, TMP1, asl #2
 294     vld1.32 {d&reg1&[idx]}, [TMP1, :32]
     .else
         .error "unsupported"
 295 .endif
 296 .endm
 297
 298 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
     /*
      * Nearest scaling counterpart of 'pixldst': gathers 'numbytes' bytes
      * worth of 'elem_size'-bit pixels into the register slot which matches
      * the size: d(basereg+4) .. d(basereg+7) for 32 bytes, d(basereg+2)
      * and d(basereg+3) for 16, d(basereg+1) for 8 and lanes of
      * d(basereg+0) for anything smaller. A 32 byte block is additionally
      * passed through 'pixdeinterleave'. Any other size is rejected at
      * assembly time.
      * Note that 'pixld1_s' and 'pixld0_s' only implement 16 and 32 bit
      * elements.
      */
 299 .if numbytes == 32
 300     pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
 301     pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
 302     pixdeinterleave elem_size, %(basereg+4)
 303 .elseif numbytes == 16
 304     pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
 305 .elseif numbytes == 8
 306     pixld1_s elem_size, %(basereg+1), mem_operand
 307 .elseif numbytes == 4
 308     .if elem_size == 32
 309         pixld0_s elem_size, %(basereg+0), 1, mem_operand
 310     .elseif elem_size == 16
 311         pixld0_s elem_size, %(basereg+0), 2, mem_operand
 312         pixld0_s elem_size, %(basereg+0), 3, mem_operand
 313     .else
 314         pixld0_s elem_size, %(basereg+0), 4, mem_operand
 315         pixld0_s elem_size, %(basereg+0), 5, mem_operand
 316         pixld0_s elem_size, %(basereg+0), 6, mem_operand
 317         pixld0_s elem_size, %(basereg+0), 7, mem_operand
 318     .endif
 319 .elseif numbytes == 2
 320     .if elem_size == 16
 321         pixld0_s elem_size, %(basereg+0), 1, mem_operand
 322     .else
 323         pixld0_s elem_size, %(basereg+0), 2, mem_operand
 324         pixld0_s elem_size, %(basereg+0), 3, mem_operand
 325     .endif
 326 .elseif numbytes == 1
 327     pixld0_s elem_size, %(basereg+0), 1, mem_operand
 328 .else
 329     .error "unsupported size: numbytes"
 330 .endif
 331 .endm
 332
 333 .macro pixld_s numpix, bpp, basereg, mem_operand
     /*
      * Nearest scaling counterpart of 'pixld': fetches 'numpix' pixels of
      * 'bpp' bits each through 'pixld_s_internal'. Expands to nothing when
      * bpp is 0.
      */
 334 .if bpp > 0
 335     pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
 336 .endif
 337 .endm
 338
 339 .macro vuzp8 reg1, reg2
     /* vuzp.8 on D registers which are given by their numbers */
 340     vuzp.8 d&reg1, d&reg2
 341 .endm
 342
 343 .macro vzip8 reg1, reg2
     /* vzip.8 on D registers which are given by their numbers */
 344     vzip.8 d&reg1, d&reg2
 345 .endm
 346
 347 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
 348 .macro pixdeinterleave bpp, basereg
     /*
      * Works in place on d(basereg) .. d(basereg+3). Expands to nothing
      * unless bpp is 32 and DEINTERLEAVE_32BPP_ENABLED is nonzero.
      */
 349 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
 350     vuzp8 %(basereg+0), %(basereg+1)
 351     vuzp8 %(basereg+2), %(basereg+3)
 352     vuzp8 %(basereg+1), %(basereg+3)
 353     vuzp8 %(basereg+0), %(basereg+2)
 354 .endif
 355 .endm
 356
 357 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
 358 .macro pixinterleave bpp, basereg
     /*
      * Works in place on d(basereg) .. d(basereg+3), using vzip.8 on the
      * same register pairs as 'pixdeinterleave' in the opposite order.
      * Expands to nothing unless bpp is 32 and DEINTERLEAVE_32BPP_ENABLED
      * is nonzero.
      */
 359 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
 360     vzip8 %(basereg+0), %(basereg+2)
 361     vzip8 %(basereg+1), %(basereg+3)
 362     vzip8 %(basereg+2), %(basereg+3)
 363     vzip8 %(basereg+0), %(basereg+1)
 364 .endif
 365 .endm
 366
 367 /*
 368  * This is a macro for implementing cache preload. The main idea is that
 369  * cache preload logic is mostly independent from the rest of pixels
 370  * processing code. It starts at the top left pixel and moves forward
 371  * across pixels and can jump across scanlines. Prefetch distance is
 372  * handled in an 'incremental' way: it starts from 0 and advances to the
 373  * optimal distance over time. After reaching optimal prefetch distance,
 374  * it is kept constant. There are some checks which prevent prefetching
 375  * unneeded pixel lines below the image (but it still can prefetch a bit
 376  * more data on the right side of the image - not a big issue and may
 377  * be actually helpful when rendering text glyphs). Additional trick is
 378  * the use of LDR instruction for prefetch instead of PLD when moving to
 379  * the next line, the point is that we have a high chance of getting TLB
 380  * miss in this case, and PLD would be useless.
 381  *
 382  * This sounds like it may introduce a noticeable overhead (when working with
 383  * fully cached data). But in reality, due to having a separate pipeline and
 384  * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
 385  * execute simultaneously with NEON and be completely shadowed by it. Thus
 386  * we get no performance overhead at all (*). This looks like a very nice
 387  * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
 388  * but still can implement some rather advanced prefetch logic in software
 389  * for almost zero cost!
 390  *
 391  * (*) The overhead of the prefetcher is visible when running some trivial
 392  * pixels processing like simple copy. Anyway, having prefetch is a must
 393  * when working with the graphics data.
 394  */
 395 .macro PF a, x:vararg
     /*
      * Emits instruction 'a' with operands 'x' only when the advanced
      * prefetch type is selected for the function which is currently being
      * generated; expands to nothing otherwise.
      */
 396 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
 397     a x
 398 .endif
 399 .endm
 400
 401 .macro cache_preload std_increment, boost_increment
     /*
      * One step of the advanced prefetcher. Every instruction goes through
      * 'PF', so nothing is emitted for the other prefetch types. The body
      * is also skipped when no source, mask or readable destination exists.
      *
      * std_increment   - number of pixels always added to PF_X (if nonzero)
      * boost_increment - number of pixels added to PF_X in addition while
      *                   the low 4 bits of PF_CTL are nonzero; these bits
      *                   are decremented on each such step
      *
      * The bits of PF_CTL above the low 4 are decremented (by 0x10) every
      * time PF_X reaches ORIG_W. See the PF_CTL setup in
      * 'generate_composite_function' for its initial value.
      */
 402 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
 403 .if regs_shortage
 404     PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
 405 .endif
 406 .if std_increment != 0
 407     PF add PF_X, PF_X, #std_increment
 408 .endif
         /* the prefetch distance keeps growing while PF_CTL low bits != 0 */
 409     PF tst PF_CTL, #0xF
 410     PF addne PF_X, PF_X, #boost_increment
 411     PF subne PF_CTL, PF_CTL, #1
         /* the flags from this compare are consumed by subge/subges below */
 412     PF cmp PF_X, ORIG_W
 413 .if src_bpp_shift >= 0
 414     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 415 .endif
 416 .if dst_r_bpp != 0
 417     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 418 .endif
 419 .if mask_bpp_shift >= 0
 420     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
 421 .endif
         /* PF_X >= ORIG_W: wrap PF_X around and count down one scanline */
 422     PF subge PF_X, PF_X, ORIG_W
 423     PF subges PF_CTL, PF_CTL, #0x10
         /*
          * If the condition still holds after the PF_CTL update, move the
          * prefetch line pointers forward by one stride (address writeback)
          * with a dummy byte load from the new position.
          */
 424 .if src_bpp_shift >= 0
 425     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 426 .endif
 427 .if dst_r_bpp != 0
 428     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 429 .endif
 430 .if mask_bpp_shift >= 0
 431     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
 432 .endif
 433 .endif
 434 .endm
 435
 436 .macro cache_preload_simple
     /*
      * Simple prefetcher: issues one pld at PREFETCH_DISTANCE_SIMPLE pixels
      * ahead of each of the SRC, DST_R and MASK pointers whose bpp is
      * nonzero. Expands to nothing unless the simple prefetch type is
      * selected for the function which is currently being generated.
      */
 437 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
 438 .if src_bpp > 0
 439     pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
 440 .endif
 441 .if dst_r_bpp > 0
 442     pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
 443 .endif
 444 .if mask_bpp > 0
 445     pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
 446 .endif
 447 .endif
 448 .endm
 449
 450 .macro fetch_mask_pixblock
     /*
      * Loads one full block (pixblock_size pixels) of mask data from MASK.
      * 'pixld' places a transfer of N bytes at register offset N / 8 from
      * the base register it is given, and a full block is
      * pixblock_size * mask_bpp / 8 bytes. Subtracting
      * pixblock_size * mask_bpp / 64 cancels that offset, so the data
      * starts exactly at d(mask_basereg).
      */
 451     pixld       pixblock_size, mask_bpp, \
 452                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
 453 .endm
 454
 455 /*
 456  * Macro which is used to process leading pixels until destination
 457  * pointer is properly aligned (at a 16-byte boundary). When destination
 458  * buffer uses 24bpp format, this is unnecessary, or even pointless.
 459  */
 460 .macro ensure_destination_ptr_alignment process_pixblock_head, \
 461                                         process_pixblock_tail, \
 462                                         process_pixblock_tail_head
     /*
      * Nothing is emitted except for label 2 when dst_w_bpp is 24. At run
      * time everything is skipped (branch to label 2) when DST_R is already
      * 16-byte aligned.
      *
      * Otherwise the leading pixels are handled as one partial block:
      *  1. for every power of two 'lowbit' (in bytes) which is at least one
      *     pixel wide and smaller than a full pixel block, the bit is
      *     tested in DST_R and, if it is set, that many bytes worth of
      *     pixels are loaded from SRC, MASK and DST_R (or DST_R is just
      *     advanced when the destination is write-only); W and PF_X are
      *     updated accordingly. No test is emitted for lowbit 16.
      *  2. the loaded data is processed by process_pixblock_head and
      *     process_pixblock_tail
      *  3. the same bit tests are repeated on DST_W to store the results
      *
      * process_pixblock_tail_head is accepted but not used here.
      * 'pixld_src' is defined by 'generate_composite_function'.
      */
 463 .if dst_w_bpp != 24
 464     tst         DST_R, #0xF
 465     beq         2f
 466
 467 .irp lowbit, 1, 2, 4, 8, 16
 468 local skip1
 469 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
 470 .if lowbit < 16 /* we don't need more than 16-byte alignment */
 471     tst         DST_R, #lowbit
 472     beq         1f
 473 .endif
 474     pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
 475     pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
 476 .if dst_r_bpp > 0
 477     pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
 478 .else
         /* write-only destination: just step the read pointer forward */
 479     add         DST_R, DST_R, #lowbit
 480 .endif
 481     PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
 482     sub         W, W, #(lowbit * 8 / dst_w_bpp)
 483 1:
 484 .endif
 485 .endr
 486     pixdeinterleave src_bpp, src_basereg
 487     pixdeinterleave mask_bpp, mask_basereg
 488     pixdeinterleave dst_r_bpp, dst_r_basereg
 489
 490     process_pixblock_head
 491     cache_preload 0, pixblock_size
 492     cache_preload_simple
 493     process_pixblock_tail
 494
 495     pixinterleave dst_w_bpp, dst_w_basereg
 496 .irp lowbit, 1, 2, 4, 8, 16
 497 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
 498 .if lowbit < 16 /* we don't need more than 16-byte alignment */
 499     tst         DST_W, #lowbit
 500     beq         1f
 501 .endif
 502     pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
 503 1:
 504 .endif
 505 .endr
 506 .endif
 507 2:
 508 .endm
 509
 510 /*
 511  * Special code for processing up to (pixblock_size - 1) remaining
 512  * trailing pixels. As SIMD processing performs operation on
 513  * pixblock_size pixels, anything smaller than this has to be loaded
 514  * and stored in a special way. Loading and storing of pixel data is
 515  * performed in such a way that we fill some 'slots' in the NEON
 516  * registers (some slots naturally are unused), then perform compositing
 517  * operation as usual. In the end, the data is taken from these 'slots'
 518  * and saved to memory.
 519  *
 520  * cache_preload_flag - allows to suppress prefetch if
 521  *                      set to 0
 522  * dst_aligned_flag   - selects whether destination buffer
 523  *                      is aligned
 524  */
 525 .macro process_trailing_pixels cache_preload_flag, \
 526                                dst_aligned_flag, \
 527                                process_pixblock_head, \
 528                                process_pixblock_tail, \
 529                                process_pixblock_tail_head
     /*
      * At run time everything is skipped (branch to label 2) when
      * W has no bits set below pixblock_size. Otherwise, for every power of
      * two 'chunk_size' below pixblock_size (largest first), the matching
      * bit of W is tested and, if it is set, that many pixels are loaded.
      * After processing, the same tests on W select the chunks to store;
      * W itself is not modified by this macro.
      * process_pixblock_tail_head is accepted but not used here.
      * 'pixld_src' is defined by 'generate_composite_function'.
      */
 530     tst         W, #(pixblock_size - 1)
 531     beq         2f
 532 .irp chunk_size, 16, 8, 4, 2, 1
 533 .if pixblock_size > chunk_size
 534     tst         W, #chunk_size
 535     beq         1f
 536     pixld_src   chunk_size, src_bpp, src_basereg, SRC
 537     pixld       chunk_size, mask_bpp, mask_basereg, MASK
 538 .if dst_aligned_flag != 0
 539     pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
 540 .else
 541     pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
 542 .endif
 543 .if cache_preload_flag != 0
 544     PF add      PF_X, PF_X, #chunk_size
 545 .endif
 546 1:
 547 .endif
 548 .endr
 549     pixdeinterleave src_bpp, src_basereg
 550     pixdeinterleave mask_bpp, mask_basereg
 551     pixdeinterleave dst_r_bpp, dst_r_basereg
 552
 553     process_pixblock_head
 554 .if cache_preload_flag != 0
 555     cache_preload 0, pixblock_size
 556     cache_preload_simple
 557 .endif
 558     process_pixblock_tail
 559     pixinterleave dst_w_bpp, dst_w_basereg
 560 .irp chunk_size, 16, 8, 4, 2, 1
 561 .if pixblock_size > chunk_size
 562     tst         W, #chunk_size
 563     beq         1f
 564 .if dst_aligned_flag != 0
 565     pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
 566 .else
 567     pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
 568 .endif
 569 1:
 570 .endif
 571 .endr
 572 2:
 573 .endm
 574
 575 /*
 576  * Macro, which performs all the needed operations to switch to the next
 577  * scanline and start the next loop iteration unless all the scanlines
 578  * are already processed.
 579  */
 580 .macro advance_to_next_scanline start_of_loop_label
     /*
      * Restores W to the original width, moves DST_W, SRC and MASK to the
      * start of the next scanline, decrements H and branches to
      * start_of_loop_label while the result is not negative.
      *
      * Each pointer is advanced by its stride and then moved back by W
      * pixels. The second step is left out for 24bpp buffers, whose
      * stride registers are reduced by 3 * W in
      * 'generate_composite_function'.
      */
 581 .if regs_shortage
 582     ldrd        W, [sp] /* load W and H (width and height) from stack */
 583 .else
 584     mov         W, ORIG_W
 585 .endif
 586     add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
 587 .if src_bpp != 0
 588     add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
 589 .endif
 590 .if mask_bpp != 0
 591     add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
 592 .endif
 593 .if (dst_w_bpp != 24)
 594     sub         DST_W, DST_W, W, lsl #dst_bpp_shift
 595 .endif
 596 .if (src_bpp != 24) && (src_bpp != 0)
 597     sub         SRC, SRC, W, lsl #src_bpp_shift
 598 .endif
 599 .if (mask_bpp != 24) && (mask_bpp != 0)
 600     sub         MASK, MASK, W, lsl #mask_bpp_shift
 601 .endif
         /* the flags set here are still intact at the 'bge' below */
 602     subs        H, H, #1
 603     mov         DST_R, DST_W
 604 .if regs_shortage
 605     str         H, [sp, #4] /* save updated height to stack */
 606 .endif
 607     bge         start_of_loop_label
 608 .endm
 609
 610 /*
 611  * Registers are allocated in the following way by default:
 612  * d0, d1, d2, d3     - reserved for loading source pixel data
 613  * d4, d5, d6, d7     - reserved for loading destination pixel data
 614  * d24, d25, d26, d27 - reserved for loading mask pixel data
 615  * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 616  */
 617 .macro generate_composite_function fname, \
 618                                    src_bpp_, \
 619                                    mask_bpp_, \
 620                                    dst_w_bpp_, \
 621                                    flags, \
 622                                    pixblock_size_, \
 623                                    prefetch_distance, \
 624                                    init, \
 625                                    cleanup, \
 626                                    process_pixblock_head, \
 627                                    process_pixblock_tail, \
 628                                    process_pixblock_tail_head, \
 629                                    dst_w_basereg_ = 28, \
 630                                    dst_r_basereg_ = 4, \
 631                                    src_basereg_   = 0, \
 632                                    mask_basereg_  = 24
 633
 634     .func fname
 635     .global fname
 636     /* For ELF format also set function visibility to hidden */
 637 #ifdef __ELF__
 638     .hidden fname
 639     .type fname, %function
 640 #endif
 641 fname:
 642     push        {r4-r12, lr}        /* save all registers */
 643
 644 /*
 645  * Select prefetch type for this function. If prefetch distance is
 646  * set to 0, prefetch is disabled (NONE). If one of the color formats
 647  * is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
 648  */
 649     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
 650 .if prefetch_distance == 0
 651     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 652 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
 653         ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
 654     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
 655 .endif
 656
 657 /*
 658  * Make some macro arguments globally visible and accessible
 659  * from other macros
 660  */
 661     .set src_bpp, src_bpp_
 662     .set mask_bpp, mask_bpp_
 663     .set dst_w_bpp, dst_w_bpp_
 664     .set pixblock_size, pixblock_size_
 665     .set dst_w_basereg, dst_w_basereg_
 666     .set dst_r_basereg, dst_r_basereg_
 667     .set src_basereg, src_basereg_
 668     .set mask_basereg, mask_basereg_
 669
 670     .macro pixld_src x:vararg
 671         pixld x
 672     .endm
 673     .macro fetch_src_pixblock
 674         pixld_src   pixblock_size, src_bpp, \
 675                     (src_basereg - pixblock_size * src_bpp / 64), SRC
 676     .endm
 677 /*
 678  * Assign symbolic names to registers
 679  */
 680     W           .req        r0      /* width (is updated during processing) */
 681     H           .req        r1      /* height (is updated during processing) */
 682     DST_W       .req        r2      /* destination buffer pointer for writes */
 683     DST_STRIDE  .req        r3      /* destination image stride */
 684     SRC         .req        r4      /* source buffer pointer */
 685     SRC_STRIDE  .req        r5      /* source image stride */
 686     DST_R       .req        r6      /* destination buffer pointer for reads */
 687
 688     MASK        .req        r7      /* mask pointer */
 689     MASK_STRIDE .req        r8      /* mask stride */
 690
 691     PF_CTL      .req        r9      /* combined lines counter and prefetch */
 692                                     /* distance increment counter */
 693     PF_X        .req        r10     /* pixel index in a scanline for current */
 694                                     /* prefetch position */
 695     PF_SRC      .req        r11     /* pointer to source scanline start */
 696                                     /* for prefetch purposes */
 697     PF_DST      .req        r12     /* pointer to destination scanline start */
 698                                     /* for prefetch purposes */
 699     PF_MASK     .req        r14     /* pointer to mask scanline start */
 700                                     /* for prefetch purposes */
 701 /*
 702  * Check whether we have enough registers for all the local variables.
 703  * If we don't have enough registers, original width and height are
 704  * kept on top of stack (and 'regs_shortage' variable is set to indicate
 705  * this for the rest of code). Even if there are enough registers, the
 706  * allocation scheme may be a bit different depending on whether source
 707  * or mask is not used.
 708  */
 709 .if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
 710     ORIG_W      .req        r10     /* saved original width */
 711     DUMMY       .req        r12     /* temporary register */
 712     .set        regs_shortage, 0
 713 .elseif mask_bpp == 0
 714     ORIG_W      .req        r7      /* saved original width */
 715     DUMMY       .req        r8      /* temporary register */
 716     .set        regs_shortage, 0
 717 .elseif src_bpp == 0
 718     ORIG_W      .req        r4      /* saved original width */
 719     DUMMY       .req        r5      /* temporary register */
 720     .set        regs_shortage, 0
 721 .else
 722     ORIG_W      .req        r1      /* saved original width */
 723     DUMMY       .req        r1      /* temporary register */
 724     .set        regs_shortage, 1
 725 .endif
 726
 727     .set mask_bpp_shift, -1
 728 .if src_bpp == 32
 729     .set src_bpp_shift, 2
 730 .elseif src_bpp == 24
 731     .set src_bpp_shift, 0
 732 .elseif src_bpp == 16
 733     .set src_bpp_shift, 1
 734 .elseif src_bpp == 8
 735     .set src_bpp_shift, 0
 736 .elseif src_bpp == 0
 737     .set src_bpp_shift, -1
 738 .else
 739     .error "requested src bpp (src_bpp) is not supported"
 740 .endif
 741 .if mask_bpp == 32
 742     .set mask_bpp_shift, 2
 743 .elseif mask_bpp == 24
 744     .set mask_bpp_shift, 0
 745 .elseif mask_bpp == 8
 746     .set mask_bpp_shift, 0
 747 .elseif mask_bpp == 0
 748     .set mask_bpp_shift, -1
 749 .else
 750     .error "requested mask bpp (mask_bpp) is not supported"
 751 .endif
 752 .if dst_w_bpp == 32
 753     .set dst_bpp_shift, 2
 754 .elseif dst_w_bpp == 24
 755     .set dst_bpp_shift, 0
 756 .elseif dst_w_bpp == 16
 757     .set dst_bpp_shift, 1
 758 .elseif dst_w_bpp == 8
 759     .set dst_bpp_shift, 0
 760 .else
 761     .error "requested dst bpp (dst_w_bpp) is not supported"
 762 .endif
 763
 764 .if (((flags) & FLAG_DST_READWRITE) != 0)
 765     .set dst_r_bpp, dst_w_bpp
 766 .else
 767     .set dst_r_bpp, 0
 768 .endif
 769 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
 770     .set DEINTERLEAVE_32BPP_ENABLED, 1
 771 .else
 772     .set DEINTERLEAVE_32BPP_ENABLED, 0
 773 .endif
 774
 775 .if prefetch_distance < 0 || prefetch_distance > 15
 776     .error "invalid prefetch distance (prefetch_distance)"
 777 .endif
 778
 779 .if src_bpp > 0
 780     ldr         SRC, [sp, #40]
 781 .endif
 782 .if mask_bpp > 0
 783     ldr         MASK, [sp, #48]
 784 .endif
 785     PF mov      PF_X, #0
 786 .if src_bpp > 0
 787     ldr         SRC_STRIDE, [sp, #44]
 788 .endif
 789 .if mask_bpp > 0
 790     ldr         MASK_STRIDE, [sp, #52]
 791 .endif
 792     mov         DST_R, DST_W
 793
 794 .if src_bpp == 24
 795     sub         SRC_STRIDE, SRC_STRIDE, W
 796     sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
 797 .endif
 798 .if mask_bpp == 24
 799     sub         MASK_STRIDE, MASK_STRIDE, W
 800     sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
 801 .endif
 802 .if dst_w_bpp == 24
 803     sub         DST_STRIDE, DST_STRIDE, W
 804     sub         DST_STRIDE, DST_STRIDE, W, lsl #1
 805 .endif
 806
 807 /*
 808  * Setup advanced prefetcher initial state
 809  */
 810     PF mov      PF_SRC, SRC
 811     PF mov      PF_DST, DST_R
 812     PF mov      PF_MASK, MASK
 813     /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
 814     PF mov      PF_CTL, H, lsl #4
 815     PF add      PF_CTL, #(prefetch_distance - 0x10)
 816
 817     init
 818 .if regs_shortage
 819     push        {r0, r1}
 820 .endif
 821     subs        H, H, #1
 822 .if regs_shortage
 823     str         H, [sp, #4] /* save updated height to stack */
 824 .else
 825     mov         ORIG_W, W
 826 .endif
 827     blt         9f
 828     cmp         W, #(pixblock_size * 2)
 829     blt         8f
 830 /*
 831  * This is the start of the pipelined loop, which is optimized for
 832  * long scanlines
 833  */
 834 0:
 835     ensure_destination_ptr_alignment process_pixblock_head, \
 836                                      process_pixblock_tail, \
 837                                      process_pixblock_tail_head
 838
 839     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
 840     pixld_a     pixblock_size, dst_r_bpp, \
 841                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
 842     fetch_src_pixblock
 843     pixld       pixblock_size, mask_bpp, \
 844                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
 845     PF add      PF_X, PF_X, #pixblock_size
 846     process_pixblock_head
 847     cache_preload 0, pixblock_size
 848     cache_preload_simple
 849     subs        W, W, #(pixblock_size * 2)
 850     blt         2f
 851 1:
 852     process_pixblock_tail_head
 853     cache_preload_simple
 854     subs        W, W, #pixblock_size
 855     bge         1b
 856 2:
 857     process_pixblock_tail
 858     pixst_a     pixblock_size, dst_w_bpp, \
 859                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
 860
 861     /* Process the remaining trailing pixels in the scanline */
 862     process_trailing_pixels 1, 1, \
 863                             process_pixblock_head, \
 864                             process_pixblock_tail, \
 865                             process_pixblock_tail_head
 866     advance_to_next_scanline 0b
 867
 868 .if regs_shortage
 869     pop         {r0, r1}
 870 .endif
 871     cleanup
 872     pop         {r4-r12, pc}  /* exit */
 873 /*
 874  * This is the start of the loop, designed to process images with small width
 875  * (less than pixblock_size * 2 pixels). In this case neither pipelining
 876  * nor prefetch are used.
 877  */
 878 8:
 879     /* Process exactly pixblock_size pixels if needed */
 880     tst         W, #pixblock_size
 881     beq         1f
 882     pixld       pixblock_size, dst_r_bpp, \
 883                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
 884     fetch_src_pixblock
 885     pixld       pixblock_size, mask_bpp, \
 886                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
 887     process_pixblock_head
 888     process_pixblock_tail
 889     pixst       pixblock_size, dst_w_bpp, \
 890                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
 891 1:
 892     /* Process the remaining trailing pixels in the scanline */
 893     process_trailing_pixels 0, 0, \
 894                             process_pixblock_head, \
 895                             process_pixblock_tail, \
 896                             process_pixblock_tail_head
 897     advance_to_next_scanline 8b
 898 9:
 899 .if regs_shortage
 900     pop         {r0, r1}
 901 .endif
 902     cleanup
 903     pop         {r4-r12, pc}  /* exit */
 904
 905     .purgem     fetch_src_pixblock
 906     .purgem     pixld_src
 907
 908     .unreq      SRC
 909     .unreq      MASK
 910     .unreq      DST_R
 911     .unreq      DST_W
 912     .unreq      ORIG_W
 913     .unreq      W
 914     .unreq      H
 915     .unreq      SRC_STRIDE
 916     .unreq      DST_STRIDE
 917     .unreq      MASK_STRIDE
 918     .unreq      PF_CTL
 919     .unreq      PF_X
 920     .unreq      PF_SRC
 921     .unreq      PF_DST
 922     .unreq      PF_MASK
 923     .unreq      DUMMY
 924     .endfunc
 925 .endm
 926
 927 /*
 928  * A simplified variant of function generation template for a single
 929  * scanline processing (for implementing pixman combine functions)
      *
      * Macro arguments:
      *   use_nearest_scaling - non-zero selects the nearest scaling register
      *                         layout, its stack prologue/epilogue and a
      *                         'pixld_s' based source fetch; zero selects
      *                         the plain layout which returns with 'bx lr'
      *   fname               - symbol of the generated function (made global,
      *                         and hidden when __ELF__ is defined)
      *   src_bpp_, mask_bpp_, dst_w_bpp_
      *                       - bits per pixel values, published through .set
      *                         as src_bpp, mask_bpp and dst_w_bpp
      *   flags               - tested for FLAG_DST_READWRITE and
      *                         FLAG_DEINTERLEAVE_32BPP
      *   pixblock_size_      - number of pixels per block, published as
      *                         pixblock_size
      *   init, cleanup       - macros expanded once after the register setup
      *                         and once before every exit, respectively
      *   process_pixblock_head, process_pixblock_tail,
      *   process_pixblock_tail_head
      *                       - pixel processing macros, expanded directly in
      *                         the main loop and also handed over to
      *                         ensure_destination_ptr_alignment and
      *                         process_trailing_pixels
      *   dst_w_basereg_, dst_r_basereg_, src_basereg_, mask_basereg_
      *                       - base register numbers used in the pixld/pixst
      *                         register arguments (defaults: 28, 4, 0, 24)
      *
      * The helper macros used here (pixld, pixld_a, pixld_s, pixst_a,
      * ensure_destination_ptr_alignment, process_trailing_pixels) are defined
      * outside of this template.
 930  */
 931 .macro generate_composite_function_scanline        use_nearest_scaling, \
 932                                                    fname, \
 933                                                    src_bpp_, \
 934                                                    mask_bpp_, \
 935                                                    dst_w_bpp_, \
 936                                                    flags, \
 937                                                    pixblock_size_, \
 938                                                    init, \
 939                                                    cleanup, \
 940                                                    process_pixblock_head, \
 941                                                    process_pixblock_tail, \
 942                                                    process_pixblock_tail_head, \
 943                                                    dst_w_basereg_ = 28, \
 944                                                    dst_r_basereg_ = 4, \
 945                                                    src_basereg_   = 0, \
 946                                                    mask_basereg_  = 24
 947
 948     .func fname
 949     .global fname
 950     /* For ELF format also set function visibility to hidden */
 951 #ifdef __ELF__
 952     .hidden fname
 953     .type fname, %function
 954 #endif
 955 fname:
         /* The scanline template always selects PREFETCH_TYPE_NONE */
 956     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 957 /*
 958  * Make some macro arguments globally visible and accessible
 959  * from other macros
 960  */
 961     .set src_bpp, src_bpp_
 962     .set mask_bpp, mask_bpp_
 963     .set dst_w_bpp, dst_w_bpp_
 964     .set pixblock_size, pixblock_size_
 965     .set dst_w_basereg, dst_w_basereg_
 966     .set dst_r_basereg, dst_r_basereg_
 967     .set src_basereg, src_basereg_
 968     .set mask_basereg, mask_basereg_
 969
 970 .if use_nearest_scaling != 0
 971     /*
 972      * Assign symbolic names to registers for nearest scaling
 973      */
 974     W           .req        r0      /* width (is updated during processing) */
 975     DST_W       .req        r1      /* destination buffer pointer for writes */
 976     SRC         .req        r2      /* source buffer pointer */
 977     VX          .req        r3      /* not referenced in this template; presumably used by pixld_s - TODO confirm */
 978     UNIT_X      .req        ip      /* loaded below from the first stack word */
 979     MASK        .req        lr      /* mask pointer, loaded from the stack if mask_bpp != 0 */
 980     TMP1        .req        r4      /* not referenced in this template; presumably scratch for pixld_s - TODO confirm */
 981     TMP2        .req        r5      /* not referenced in this template; presumably scratch for pixld_s - TODO confirm */
 982     DST_R       .req        r6      /* destination buffer pointer for reads */
 983     SRC_WIDTH_FIXED .req        r7      /* loaded below from the second stack word */
 984
         /* Source pixels are fetched through pixld_s in this variant */
 985     .macro pixld_src x:vararg
 986         pixld_s x
 987     .endm
 988
         /*
          * UNIT_X is read from the first stack word before the push moves sp.
          * The push stores six registers (24 bytes), so the stack words that
          * followed it at entry are addressed with a 24 byte bias afterwards.
          * lr is saved by the push before it gets reused as MASK.
          */
 989     ldr         UNIT_X, [sp]
 990     push        {r4-r8, lr}
 991     ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
 992     .if mask_bpp != 0
 993     ldr         MASK, [sp, #(24 + 8)]
 994     .endif
 995 .else
 996     /*
 997      * Assign symbolic names to registers
 998      */
 999     W           .req        r0      /* width (is updated during processing) */
1000     DST_W       .req        r1      /* destination buffer pointer for writes */
1001     SRC         .req        r2      /* source buffer pointer */
1002     DST_R       .req        ip      /* destination buffer pointer for reads */
1003     MASK        .req        r3      /* mask pointer */
1004
         /* Source pixels are fetched through plain pixld in this variant */
1005     .macro pixld_src x:vararg
1006         pixld x
1007     .endm
1008 .endif
1009
     /* dst_r_bpp is non-zero only when FLAG_DST_READWRITE is set in flags */
1010 .if (((flags) & FLAG_DST_READWRITE) != 0)
1011     .set dst_r_bpp, dst_w_bpp
1012 .else
1013     .set dst_r_bpp, 0
1014 .endif
1015 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
1016     .set DEINTERLEAVE_32BPP_ENABLED, 1
1017 .else
1018     .set DEINTERLEAVE_32BPP_ENABLED, 0
1019 .endif
1020
         /* Fetch one block of source pixels using the pixld_src selected above */
1021     .macro fetch_src_pixblock
1022         pixld_src   pixblock_size, src_bpp, \
1023                     (src_basereg - pixblock_size * src_bpp / 64), SRC
1024     .endm
1025
1026     init
1027     mov         DST_R, DST_W    /* reads start at the same address as writes */
1028
         /* Fewer than pixblock_size pixels: skip alignment and the main loop */
1029     cmp         W, #pixblock_size
1030     blt         8f
1031
1032     ensure_destination_ptr_alignment process_pixblock_head, \
1033                                      process_pixblock_tail, \
1034                                      process_pixblock_tail_head
1035
         /* Go to 7 when W is smaller than pixblock_size at this point */
1036     subs        W, W, #pixblock_size
1037     blt         7f
1038
1039     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
1040     pixld_a     pixblock_size, dst_r_bpp, \
1041                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
1042     fetch_src_pixblock
1043     pixld       pixblock_size, mask_bpp, \
1044                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
1045     process_pixblock_head
         /* Skip the tail_head loop when no further full block follows */
1046     subs        W, W, #pixblock_size
1047     blt         2f
1048 1:
1049     process_pixblock_tail_head
1050     subs        W, W, #pixblock_size
1051     bge         1b
1052 2:
1053     process_pixblock_tail
1054     pixst_a     pixblock_size, dst_w_bpp, \
1055                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1056 7:
1057     /* Process the remaining trailing pixels in the scanline (dst aligned) */
1058     process_trailing_pixels 0, 1, \
1059                             process_pixblock_head, \
1060                             process_pixblock_tail, \
1061                             process_pixblock_tail_head
1062
1063     cleanup
1064 .if use_nearest_scaling != 0
1065     pop         {r4-r8, pc}  /* exit */
1066 .else
1067     bx          lr  /* exit */
1068 .endif
1069 8:
1070     /* Process the remaining trailing pixels in the scanline (dst unaligned) */
1071     process_trailing_pixels 0, 0, \
1072                             process_pixblock_head, \
1073                             process_pixblock_tail, \
1074                             process_pixblock_tail_head
1075
1076     cleanup
1077
     /*
      * Second exit path. The register aliases and local macros created by
      * this expansion are released afterwards, so that a later expansion of
      * the template can define them afresh.
      */
1078 .if use_nearest_scaling != 0
1079     pop         {r4-r8, pc}  /* exit */
1080
1081     .unreq      DST_R
1082     .unreq      SRC
1083     .unreq      W
1084     .unreq      VX
1085     .unreq      UNIT_X
1086     .unreq      TMP1
1087     .unreq      TMP2
1088     .unreq      DST_W
1089     .unreq      MASK
1090     .unreq      SRC_WIDTH_FIXED
1091
1092 .else
1093     bx          lr  /* exit */
1094
1095     .unreq      SRC
1096     .unreq      MASK
1097     .unreq      DST_R
1098     .unreq      DST_W
1099     .unreq      W
1100 .endif
1101
1102     .purgem     fetch_src_pixblock
1103     .purgem     pixld_src
1104
1105     .endfunc
1106 .endm
1107
1108 .macro generate_composite_function_single_scanline x:vararg
         /*
          * Forward every argument unchanged to the scanline template with
          * use_nearest_scaling = 0 (plain, non-scaled register layout).
          */
1109     generate_composite_function_scanline 0, x
1110 .endm
1111
1112 .macro generate_composite_function_nearest_scanline x:vararg
         /*
          * Forward every argument unchanged to the scanline template with
          * use_nearest_scaling = 1 (nearest scaling register layout).
          */
1113     generate_composite_function_scanline 1, x
1114 .endm
1115
1116 /* Default prologue/epilogue, nothing special needs to be done */
1117
1118 .macro default_init
         /* Default prologue: intentionally empty, expands to no instructions */
1119 .endm
1120
1121 .macro default_cleanup
         /* Default epilogue: intentionally empty, expands to no instructions */
1122 .endm
1123
1124 /*
1125  * Prologue/epilogue variant which additionally saves/restores d8-d15
1126  * registers (they need to be saved/restored by callee according to ABI).
1127  * This is required if the code needs to use all the NEON registers.
1128  */
1129
1130 .macro default_init_need_all_regs
         /*
          * Prologue: store d8-d15 below sp with writeback (decrement before),
          * i.e. the explicit store-multiple spelling of pushing d8-d15.
          */
1131     vstmdb      sp!, {d8-d15}
1132 .endm
1133
1134 .macro default_cleanup_need_all_regs
         /*
          * Epilogue: reload d8-d15 from sp with writeback (increment after),
          * i.e. the explicit load-multiple spelling of popping d8-d15.
          */
1135     vldmia      sp!, {d8-d15}
1136 .endm
1137
1138 /******************************************************************************/
1139
1140 /*
1141  * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
1142  * into a planar a8r8g8b8 format (with a, r, g, b color components
1143  * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
      *
      * Each 5 or 6 bit component is widened to 8 bits by replicating its
      * top bits into the vacated low bits; alpha is set to 255.
1144  *
1145  * Warning: the conversion is destructive and the original
1146  *          value (in) is lost.
      *
      * Note: out_a, out_r and out_g are written before the last read of
      *       'in'; only out_b is written after it.
1147  */
1148 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
1149     vshrn.u16   out_r, in,    #8     /* out_r = RRRRRGGG (bits 15..8) */
1150     vshrn.u16   out_g, in,    #3     /* out_g = GGGGGGBB (bits 10..3) */
1151     vsli.u16    in,    in,    #5     /* in = G6:B5:B5, low 5 bits (B) kept */
1152     vmov.u8     out_a, #255          /* opaque alpha */
1153     vsri.u8     out_r, out_r, #5     /* keep R5, low 3 bits = top 3 bits of R */
1154     vsri.u8     out_g, out_g, #6     /* keep G6, low 2 bits = top 2 bits of G */
1155     vshrn.u16   out_b, in,    #2     /* out_b = B5 followed by top 3 bits of B */
1156 .endm
1157
1158 .macro convert_0565_to_x888 in, out_r, out_g, out_b
         /*
          * Same instruction sequence as convert_0565_to_8888 without the
          * alpha fill: r5g6b5 pixels in 128-bit register (in) are expanded
          * into 8-bit r, g, b planes in 64-bit registers out_r, out_g, out_b.
          * The value of 'in' is modified (vsli) and therefore lost.
          */
1159     vshrn.u16   out_r, in,    #8     /* out_r = RRRRRGGG (bits 15..8) */
1160     vshrn.u16   out_g, in,    #3     /* out_g = GGGGGGBB (bits 10..3) */
1161     vsli.u16    in,    in,    #5     /* in = G6:B5:B5, low 5 bits (B) kept */
1162     vsri.u8     out_r, out_r, #5     /* keep R5, low 3 bits = top 3 bits of R */
1163     vsri.u8     out_g, out_g, #6     /* keep G6, low 2 bits = top 2 bits of G */
1164     vshrn.u16   out_b, in,    #2     /* out_b = B5 followed by top 3 bits of B */
1165 .endm
1166
1167 /*
1168  * Conversion from planar 8-bit r, g, b color components (in 64-bit
1169  * registers in_r, in_g, in_b respectively; no alpha input is taken) into
1170  * 8 r5g6b5 pixels packed in 128-bit register (out). Requires two temporary
1171  * 128-bit registers (tmp1, tmp2)
      *
      * The low bits of every component are dropped (truncation): 5 bits of
      * red, 6 bits of green and 5 bits of blue survive.
1172  */
1173 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
1174     vshll.u8    tmp1, in_g, #8      /* G in the top byte of each 16-bit lane */
1175     vshll.u8    out, in_r, #8       /* R in the top byte of each 16-bit lane */
1176     vshll.u8    tmp2, in_b, #8      /* B in the top byte of each 16-bit lane */
1177     vsri.u16    out, tmp1, #5       /* keep top 5 bits (R), insert G below them */
1178     vsri.u16    out, tmp2, #11      /* keep top 11 bits (R:G), insert top 5 bits of B */
1179 .endm
1180
1181 /*
1182  * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
1183  * returned in (out0, out1) registers pair. Requires one temporary
1184  * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
1185  * value from 'in' is lost
      *
      * 'out0' and 'tmp' are written while 'in' is still needed, so they
      * must be distinct from 'in'. The unused top byte (x) of every output
      * pixel is zero, as 'out1' is produced by a logical shift right.
1186  */
1187 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
1188     vshl.u16    out0, in,   #5  /* G top 6 bits */
1189     vshl.u16    tmp,  in,   #11 /* B top 5 bits */
1190     vsri.u16    in,   in,   #5  /* R is ready in top bits */
1191     vsri.u16    out0, out0, #6  /* G is ready in top bits */
1192     vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
1193     vshr.u16    out1, in,   #8  /* R is in place */
1194     vsri.u16    out0, tmp,  #8  /* G & B is in place */
1195     vzip.u16    out0, out1      /* everything is in place */
1196 .endm