qemu/pixman/pixman/pixman-arm-simd-asm.S

   1 /*
   2  * Copyright © 2012 Raspberry Pi Foundation
   3  * Copyright © 2012 RISC OS Open Ltd
   4  *
   5  * Permission to use, copy, modify, distribute, and sell this software and its
   6  * documentation for any purpose is hereby granted without fee, provided that
   7  * the above copyright notice appear in all copies and that both that
   8  * copyright notice and this permission notice appear in supporting
   9  * documentation, and that the name of the copyright holders not be used in
  10  * advertising or publicity pertaining to distribution of the software without
  11  * specific, written prior permission.  The copyright holders make no
  12  * representations about the suitability of this software for any purpose.  It
  13  * is provided "as is" without express or implied warranty.
  14  *
  15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
  16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
  18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
  20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  22  * SOFTWARE.
  23  *
  24  * Author:  Ben Avison (bavison@riscosopen.org)
  25  *
  26  */
  27
  28 /* Prevent the stack from becoming executable */
  29 #if defined(__linux__) && defined(__ELF__)
  30 .section .note.GNU-stack,"",%progbits
  31 #endif
  32
   33         .text
   34         .arch armv6                     /* permit ARMv6 instructions (SEL, UXTB16, UQADD8, ...) */
   35         .object_arch armv4              /* but record ARMv4 as the architecture in the object attributes */
   36         .arm                            /* ARM (not Thumb) instruction encoding */
   37         .altmacro                       /* enables the %(expr) argument evaluation used by the macros below */
   38         .p2align 2                      /* align to 2^2 = 4 bytes */
  39
  40 #include "pixman-arm-simd-asm.h"
  41
  42 /* A head macro should do all processing which results in an output of up to
  43  * 16 bytes, as far as the final load instruction. The corresponding tail macro
  44  * should complete the processing of the up-to-16 bytes. The calling macro will
  45  * sometimes choose to insert a preload or a decrement of X between them.
  46  *   cond           ARM condition code for code block
  47  *   numbytes       Number of output bytes that should be generated this time
  48  *   firstreg       First WK register in which to place output
  49  *   unaligned_src  Whether to use non-wordaligned loads of source image
  50  *   unaligned_mask Whether to use non-wordaligned loads of mask image
  51  *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
  52  */
  53
   54 .macro blit_init
              /* Init hook for the plain copy ("blit") functions. It only passes
               * STRIDE_D and STRIDE_S to line_saved_regs; blit_inner_loop later
               * aliases those two registers as WK4/WK5.
               * NOTE(review): line_saved_regs comes from the included header -
               * presumably it lists registers to be preserved across each line so
               * that they can double as work registers; confirm there. */
   55         line_saved_regs STRIDE_D, STRIDE_S
   56 .endm
  57
   58 .macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
              /* Process-head hook for blits: load numbytes of source pixels from SRC
               * into the WK registers starting at WK<firstreg>, under condition cond.
               * A blit needs no per-pixel arithmetic, so the matching tail is
               * nop_macro. unaligned_mask and preload are accepted but not used. */
   59         pixld   cond, numbytes, firstreg, SRC, unaligned_src
   60 .endm
  61
   62 .macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
              /* Inner loop for blits that moves 32 bytes per iteration using eight
               * work registers. WK4-WK7 do not exist by default here, so they are
               * aliased onto STRIDE_D, STRIDE_S, MASK and STRIDE_M for the duration
               * of the macro. process_head, process_tail, unaligned_mask and
               * dst_alignment are accepted but not used. */
   63     WK4     .req    STRIDE_D
   64     WK5     .req    STRIDE_S
   65     WK6     .req    MASK
   66     WK7     .req    STRIDE_M
   67 110:    pixld   , 16, 0, SRC, unaligned_src
   68         pixld   , 16, 4, SRC, unaligned_src
              /* Prefetch at SRC + SCRATCH.
               * NOTE(review): presumably SCRATCH holds the prefetch offset set up by
               * the calling framework - confirm in the header. */
   69         pld     [SRC, SCRATCH]
   70         pixst   , 16, 0, DST
   71         pixst   , 16, 4, DST
              /* 32 bytes = 32*8/src_bpp pixels; keep looping while X did not go negative */
   72         subs    X, X, #32*8/src_bpp
   73         bhs     110b
   74     .unreq  WK4
   75     .unreq  WK5
   76     .unreq  WK6
   77     .unreq  WK7
   78 .endm
  79
   80 generate_composite_function \
   81     pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, /* name; then presumably src, mask, dst bits per pixel (see header) */ \
   82     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   83     4, /* prefetch distance */ \
   84     blit_init, /* init */ \
   85     nop_macro, /* newline */ \
   86     nop_macro, /* cleanup */ \
   87     blit_process_head, /* process head */ \
   88     nop_macro, /* process tail */ \
   89     blit_inner_loop /* inner loop (optional argument, omitted by most other functions in this file) */
  90
   91 generate_composite_function \
   92     pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, /* name; then presumably src, mask, dst bits per pixel (see header) */ \
   93     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   94     4, /* prefetch distance */ \
   95     blit_init, /* init */ \
   96     nop_macro, /* newline */ \
   97     nop_macro, /* cleanup */ \
   98     blit_process_head, /* process head */ \
   99     nop_macro, /* process tail */ \
  100     blit_inner_loop /* inner loop (optional argument, omitted by most other functions in this file) */
 101
  102 generate_composite_function \
  103     pixman_composite_src_8_8_asm_armv6, 8, 0, 8, /* name; then presumably src, mask, dst bits per pixel (see header) */ \
  104     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
  105     3, /* prefetch distance */ \
  106     blit_init, /* init */ \
  107     nop_macro, /* newline */ \
  108     nop_macro, /* cleanup */ \
  109     blit_process_head, /* process head */ \
  110     nop_macro, /* process tail */ \
  111     blit_inner_loop /* inner loop (optional argument, omitted by most other functions in this file) */
 112
 113 /******************************************************************************/
 114
  115 .macro src_n_8888_init
              /* Init hook for the 32bpp solid fill: fetch the word at
               * [sp, #ARGS_STACK_OFFSET] (presumably the constant source colour
               * argument) and replicate it into SRC, STRIDE_S, MASK and STRIDE_M.
               * Those are exactly the four registers fill_process_tail aliases as
               * WK4-WK7, so up to 16 bytes of fill data are ready to store. */
  116         ldr     SRC, [sp, #ARGS_STACK_OFFSET]
  117         mov     STRIDE_S, SRC
  118         mov     MASK, SRC
  119         mov     STRIDE_M, SRC
  120 .endm
 121
 122 .macro src_n_0565_init
 123         ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
 124         orr     SRC, SRC, lsl #16
 125         mov     STRIDE_S, SRC
 126         mov     MASK, SRC
 127         mov     STRIDE_M, SRC
 128 .endm
 129
 130 .macro src_n_8_init
 131         ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
 132         orr     SRC, SRC, lsl #8
 133         orr     SRC, SRC, lsl #16
 134         mov     STRIDE_S, SRC
 135         mov     MASK, SRC
 136         mov     STRIDE_M, SRC
 137 .endm
 138
  139 .macro fill_process_tail  cond, numbytes, firstreg
              /* Process-tail hook for the solid fills: store numbytes of the
               * pre-replicated fill value to DST under condition cond. The value
               * lives in SRC, STRIDE_S, MASK and STRIDE_M (set up by the
               * src_n_*_init macros), which are aliased as WK4-WK7 here so that pixst
               * can address them. firstreg is ignored: the store always starts at
               * WK4. */
  140     WK4     .req    SRC
  141     WK5     .req    STRIDE_S
  142     WK6     .req    MASK
  143     WK7     .req    STRIDE_M
  144         pixst   cond, numbytes, 4, DST
  145     .unreq  WK4
  146     .unreq  WK5
  147     .unreq  WK6
  148     .unreq  WK7
  149 .endm
 150
 151 generate_composite_function \
 152     pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
 153     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
 154     0, /* prefetch distance doesn't apply */ \
 155     src_n_8888_init \
 156     nop_macro, /* newline */ \
 157     nop_macro /* cleanup */ \
 158     nop_macro /* process head */ \
 159     fill_process_tail
 160
 161 generate_composite_function \
 162     pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
 163     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
 164     0, /* prefetch distance doesn't apply */ \
 165     src_n_0565_init \
 166     nop_macro, /* newline */ \
 167     nop_macro /* cleanup */ \
 168     nop_macro /* process head */ \
 169     fill_process_tail
 170
 171 generate_composite_function \
 172     pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
 173     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
 174     0, /* prefetch distance doesn't apply */ \
 175     src_n_8_init \
 176     nop_macro, /* newline */ \
 177     nop_macro /* cleanup */ \
 178     nop_macro /* process head */ \
 179     fill_process_tail
 180
 181 /******************************************************************************/
 182
  183 .macro src_x888_8888_pixel, cond, reg
              /* Force the top (alpha) byte of WK<reg> to 0xFF, executed under
               * condition cond. The other three bytes are left unchanged. */
  184         orr&cond WK&reg, WK&reg, #0xFF000000
  185 .endm
 186
  187 .macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
              /* Process-head hook: load numbytes of source pixels from SRC into
               * WK<firstreg> onwards under condition cond. The alpha fix-up is done
               * in the matching process_tail. unaligned_mask and preload are
               * accepted but not used. */
  188         pixld   cond, numbytes, firstreg, SRC, unaligned_src
  189 .endm
 190
 191 .macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
 192         src_x888_8888_pixel cond, %(firstreg+0)
 193  .if numbytes >= 8
 194         src_x888_8888_pixel cond, %(firstreg+1)
 195   .if numbytes == 16
 196         src_x888_8888_pixel cond, %(firstreg+2)
 197         src_x888_8888_pixel cond, %(firstreg+3)
 198   .endif
 199  .endif
 200 .endm
 201
  202 generate_composite_function \
  203     pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, /* name; then presumably src, mask, dst bits per pixel (see header) */ \
  204     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
  205     3, /* prefetch distance */ \
  206     nop_macro, /* init */ \
  207     nop_macro, /* newline */ \
  208     nop_macro, /* cleanup */ \
  209     pixman_composite_src_x888_8888_process_head, /* process head */ \
  210     pixman_composite_src_x888_8888_process_tail /* process tail */
 211
 212 /******************************************************************************/
 213
  214 .macro src_0565_8888_init
              /* Init hook for the r5g6b5 -> a8r8g8b8 conversion. Sets up the
               * constants and the GE flags that src_0565_8888_2pixels and
               * src_0565_8888_1pixel rely on. Clobbers SCRATCH. */
  215         /* Hold loop invariants in MASK and STRIDE_M */
              /* MASK selects the 6-bit green field of both 16-bit pixels in a word */
  216         ldr     MASK, =0x07E007E0
              /* STRIDE_M is the opaque alpha byte that gets ORed into each result */
  217         mov     STRIDE_M, #0xFF000000
  218         /* Set GE[3:0] to 1010 so SEL instructions do what we want */
              /* 0x80 + 0x80 carries out of bytes 1 and 3 only, so UADD8 sets GE[1] and GE[3] */
  219         ldr     SCRATCH, =0x80008000
  220         uadd8   SCRATCH, SCRATCH, SCRATCH
  221 .endm
 222
  223 .macro src_0565_8888_2pixels, reg1, reg2
              /* Expand the two r5g6b5 pixels packed in WK<reg1> into two a8r8g8b8
               * pixels. In the bit diagrams below, lower-case letters are the pixel
               * in the low halfword and upper-case letters the pixel in the high
               * halfword. On exit WK<reg1> holds the low-halfword pixel and
               * WK<reg2> the high-halfword pixel, both with alpha 0xFF.
               * Requires MASK = 0x07E007E0, STRIDE_M = 0xFF000000 and GE[3:0] = 1010
               * (see src_0565_8888_init). Clobbers SCRATCH. */
  224         and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
  225         bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
  226         orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
  227         mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
  228         mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
  229         bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
  230         orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
  231         orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
  232         pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
  233         sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
  234         mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
  235         pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
  236         sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
  237         orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
  238         orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
  239 .endm
 240
  241 /* Alternative implementation of src_0565_8888_2pixels, kept for reference only.
  242    It doesn't need STRIDE_M, but is one instruction longer. It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
 243         and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
 244         bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
 245         orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
 246         mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
 247         mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
 248         bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
 249         mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
 250         mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
 251         orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
 252         orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
 253         pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
 254         pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
 255         sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
 256         sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
 257         orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
 258         orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
 259 */
 260
  261 .macro src_0565_8888_1pixel, reg
              /* Expand the single r5g6b5 pixel in the low halfword of WK<reg> into
               * an a8r8g8b8 pixel with alpha 0xFF, in place. The bit diagrams assume
               * the upper halfword of WK<reg> is zero on entry.
               * Requires MASK = 0x07E007E0 and GE[3:0] = 1010 (see
               * src_0565_8888_init), so that SEL takes bytes 3 and 1 from its first
               * source operand and bytes 2 and 0 from its second. Clobbers SCRATCH. */
  262         bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
  263         and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
  264         mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
  265         mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
  266         orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
  267         orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
  268         pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
  269         sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
  270         orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
  271 .endm
 272
  273 .macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
              /* Process-head hook: load the source pixels needed to produce
               * numbytes of 32bpp output. The source is 16bpp, so only numbytes/2
               * bytes are read:
               *   16 output bytes -> 8 source bytes into WK<firstreg> and WK<firstreg+2>
               *    8 output bytes -> 4 source bytes into WK<firstreg>
               *    4 output bytes -> 2 source bytes into WK<firstreg>
               * The 16-byte case leaves WK<firstreg+1> and WK<firstreg+3> free for
               * the second output pixel of each pair. cond is not forwarded (the
               * loads are unconditional); unaligned_mask and preload are unused. */
  274  .if numbytes == 16
  275         pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
  276  .elseif numbytes == 8
  277         pixld   , 4, firstreg, SRC, unaligned_src
  278  .elseif numbytes == 4
  279         pixld   , 2, firstreg, SRC, unaligned_src
  280  .endif
  281 .endm
 282
  283 .macro src_0565_8888_process_tail   cond, numbytes, firstreg
              /* Process-tail hook: convert the 16bpp pixels loaded by the head into
               * numbytes of a8r8g8b8 output in WK<firstreg> onwards. Pixels are
               * converted in pairs where possible; any numbytes other than 16 or 8
               * falls through to the single-pixel conversion. cond is unused. */
  284  .if numbytes == 16
  285         src_0565_8888_2pixels firstreg, %(firstreg+1)
  286         src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
  287  .elseif numbytes == 8
  288         src_0565_8888_2pixels firstreg, %(firstreg+1)
  289  .else
  290         src_0565_8888_1pixel firstreg
  291  .endif
  292 .endm
 293
  294 generate_composite_function \
  295     pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, /* name; then presumably src, mask, dst bits per pixel (see header) */ \
  296     FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
  297     3, /* prefetch distance */ \
  298     src_0565_8888_init, /* init */ \
  299     nop_macro, /* newline */ \
  300     nop_macro, /* cleanup */ \
  301     src_0565_8888_process_head, /* process head */ \
  302     src_0565_8888_process_tail /* process tail */
 303
 304 /******************************************************************************/
 305
  306 .macro add_8_8_8pixels  cond, dst1, dst2
              /* Per-byte unsigned saturating add of eight source bytes into eight
               * destination bytes, under condition cond: WK<dst1> += MASK and
               * WK<dst2> += STRIDE_M. MASK and STRIDE_M hold source data here;
               * add_8_8_process_head loads them through the WK4/WK5 aliases. */
  307         uqadd8&cond  WK&dst1, WK&dst1, MASK
  308         uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
  309 .endm
 310
  311 .macro add_8_8_4pixels  cond, dst
              /* Per-byte unsigned saturating add of the four source bytes held in
               * MASK into WK<dst>, under condition cond. */
  312         uqadd8&cond  WK&dst, WK&dst, MASK
  313 .endm
 314
  315 .macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
              /* Process-head hook for a8 + a8 ADD. Source bytes are loaded into
               * MASK/STRIDE_M (temporarily aliased as WK4/WK5) and destination bytes
               * into WK<firstreg> onwards. Only two source registers are available,
               * so the 16-byte case is split: the first 8 bytes are added here in the
               * head, then the next 8 source bytes are loaded for the tail to add.
               * unaligned_mask and preload are unused. */
  316     WK4     .req    MASK
  317     WK5     .req    STRIDE_M
  318  .if numbytes == 16
  319         pixld   cond, 8, 4, SRC, unaligned_src
  320         pixld   cond, 16, firstreg, DST, 0
  321         add_8_8_8pixels cond, firstreg, %(firstreg+1)
  322         pixld   cond, 8, 4, SRC, unaligned_src
  323  .else
  324         pixld   cond, numbytes, 4, SRC, unaligned_src
  325         pixld   cond, numbytes, firstreg, DST, 0
  326  .endif
  327     .unreq  WK4
  328     .unreq  WK5
  329 .endm
 330
  331 .macro add_8_8_process_tail  cond, numbytes, firstreg
              /* Process-tail hook for a8 + a8 ADD: perform the saturating adds that
               * the head left outstanding. For 16 bytes only the second half
               * (WK<firstreg+2>, WK<firstreg+3>) remains, because the head already
               * added the first half. For 8 bytes both registers are added; otherwise
               * a single register is added. */
  332  .if numbytes == 16
  333         add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
  334  .elseif numbytes == 8
  335         add_8_8_8pixels cond, firstreg, %(firstreg+1)
  336  .else
  337         add_8_8_4pixels cond, firstreg
  338  .endif
  339 .endm
 340
  341 generate_composite_function \
  342     pixman_composite_add_8_8_asm_armv6, 8, 0, 8, /* name; then presumably src, mask, dst bits per pixel (see header) */ \
  343     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
  344     2, /* prefetch distance */ \
  345     nop_macro, /* init */ \
  346     nop_macro, /* newline */ \
  347     nop_macro, /* cleanup */ \
  348     add_8_8_process_head, /* process head */ \
  349     add_8_8_process_tail /* process tail */
 350
 351 /******************************************************************************/
 352
  353 .macro over_8888_8888_init
              /* Init hook for a8r8g8b8 OVER a8r8g8b8: sets up the rounding constant
               * and GE flags used by over_8888_8888_1pixel, and passes STRIDE_D,
               * STRIDE_S and ORIG_W to line_saved_regs because the process macros
               * alias them as work registers. Clobbers SCRATCH. */
  354         /* Hold loop invariant in MASK */
  355         ldr     MASK, =0x00800080
  356         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
              /* 0x80 + 0x80 carries out of bytes 0 and 2 only, so UADD8 sets GE[0] and GE[2] */
  357         uadd8   SCRATCH, MASK, MASK
  358         line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
  359 .endm
 360
  361 .macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
              /* Process-head hook: load numbytes of source pixels into
               * WK<4+firstreg> onwards and the same number of destination bytes into
               * WK<firstreg> onwards. WK4-WK7 are aliased onto STRIDE_D, STRIDE_S,
               * STRIDE_M and ORIG_W while the loads are emitted. Loads are
               * unconditional (cond is not forwarded); unaligned_mask and preload are
               * unused. */
  362     WK4     .req    STRIDE_D
  363     WK5     .req    STRIDE_S
  364     WK6     .req    STRIDE_M
  365     WK7     .req    ORIG_W
  366         pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
  367         pixld   , numbytes, firstreg, DST, 0
  368     .unreq  WK4
  369     .unreq  WK5
  370     .unreq  WK6
  371     .unreq  WK7
  372 .endm
 373
 374 .macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
 375         /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
 376         teq     WK&reg0, #0
 377  .if numbytes > 4
 378         teqeq   WK&reg1, #0
 379   .if numbytes > 8
 380         teqeq   WK&reg2, #0
 381         teqeq   WK&reg3, #0
 382   .endif
 383  .endif
 384 .endm
 385
  386 .macro over_8888_8888_prepare  next
              /* Reduce the source pixel in WK<next> to just its alpha byte (top byte
               * shifted down to bits 7:0), which is the form over_8888_8888_1pixel
               * expects on entry. The full pixel is reloaded from memory later. */
  387         mov     WK&next, WK&next, lsr #24
  388 .endm
 389
  390 .macro over_8888_8888_1pixel src, dst, offset, next
              /* Composite one premultiplied source pixel OVER one destination pixel:
               * dst = src + dst * (255 - src.alpha) / 255, per byte, with saturation.
               *   src     WK index holding the source alpha in bits 7:0 on entry (see
               *           over_8888_8888_prepare); the full source pixel is reloaded
               *           into it from [SRC, #offset] part-way through
               *   dst     WK index holding the destination pixel; receives the result
               *   offset  (negative) byte offset of this source pixel from SRC
               *   next    WK index of the following source pixel, which is reduced to
               *           its alpha here unless this is the last pixel (offset == -4)
               * Requires MASK = 0x00800080 and GE[3:0] = 0101 (see
               * over_8888_8888_init). Clobbers SCRATCH. */
  391         /* src = destination component multiplier */
  392         rsb     WK&src, WK&src, #255
  393         /* Split even/odd bytes of dst into SCRATCH/dst */
  394         uxtb16  SCRATCH, WK&dst
  395         uxtb16  WK&dst, WK&dst, ror #8
  396         /* Multiply through, adding 0.5 to the upper byte of result for rounding */
  397         mla     SCRATCH, SCRATCH, WK&src, MASK
  398         mla     WK&dst, WK&dst, WK&src, MASK
  399         /* Where we would have had a stall between the result of the first MLA and the shifter input,
  400          * reload the complete source pixel */
  401         ldr     WK&src, [SRC, #offset]
  402         /* Multiply by 257/256 to approximate 256/255 */
  403         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
  404         /* In this stall, start processing the next pixel */
  405  .if offset < -4
  406         mov     WK&next, WK&next, lsr #24
  407  .endif
  408         uxtab16 WK&dst, WK&dst, WK&dst, ror #8
  409         /* Recombine even/odd bytes of multiplied destination */
  410         mov     SCRATCH, SCRATCH, ror #8
  411         sel     WK&dst, SCRATCH, WK&dst
  412         /* Saturated add of source to multiplied destination */
  413         uqadd8  WK&dst, WK&dst, WK&src
  414 .endm
 415
  416 .macro over_8888_8888_process_tail  cond, numbytes, firstreg
              /* Process-tail hook: composite the numbytes/4 source pixels in
               * WK<4+firstreg> onwards over the destination pixels in WK<firstreg>
               * onwards and store the result to DST. If all source pixels are zero
               * the destination is unchanged, so both the arithmetic and the store
               * are skipped by branching to local label 10. The flags are modified
               * by the transparency test. cond is unused. */
  417     WK4     .req    STRIDE_D
  418     WK5     .req    STRIDE_S
  419     WK6     .req    STRIDE_M
  420     WK7     .req    ORIG_W
  421         over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
  422         beq     10f
  423         over_8888_8888_prepare  %(4+firstreg)
              /* PROCESS_REG walks the destination registers; PROCESS_OFF is the offset
               * of the current source pixel relative to SRC, which the reload inside
               * over_8888_8888_1pixel uses (it starts at -numbytes and ends at -4) */
  424  .set PROCESS_REG, firstreg
  425  .set PROCESS_OFF, -numbytes
  426  .rept numbytes / 4
  427         over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  428   .set PROCESS_REG, PROCESS_REG+1
  429   .set PROCESS_OFF, PROCESS_OFF+4
  430  .endr
  431         pixst   , numbytes, firstreg, DST
  432 10:
  433     .unreq  WK4
  434     .unreq  WK5
  435     .unreq  WK6
  436     .unreq  WK7
  437 .endm
 438
 439 generate_composite_function \
 440     pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
 441     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
 442     2, /* prefetch distance */ \
 443     over_8888_8888_init, \
 444     nop_macro, /* newline */ \
 445     nop_macro, /* cleanup */ \
 446     over_8888_8888_process_head, \
 447     over_8888_8888_process_tail
 448
 449 /******************************************************************************/
 450
 451 /* Multiply each byte of a word by a byte.
 452  * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 453  * word  Register containing 4 bytes
 454  * byte  Register containing byte multiplier (bits 8-31 must be 0)
 455  * tmp   Scratch register
 456  * half  Register containing the constant 0x00800080
 457  * GE[3:0] bits must contain 0101
 458  */
  459 .macro mul_8888_8  word, byte, tmp, half
              /* On exit each byte of word has been multiplied by byte and rescaled
               * back to 8 bits (the *257/256 step approximates division by 255).
               * tmp is clobbered; byte and half are only read. */
  460         /* Split even/odd bytes of word apart */
  461         uxtb16  tmp, word
  462         uxtb16  word, word, ror #8
  463         /* Multiply bytes together with rounding, then by 257/256 */
  464         mla     tmp, tmp, byte, half
  465         mla     word, word, byte, half /* 1 stall follows */
  466         uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
  467         uxtab16 word, word, word, ror #8
  468         /* Recombine bytes */
              /* With GE = 0101, SEL takes bytes 0 and 2 from tmp and bytes 1 and 3 from word */
  469         mov     tmp, tmp, ror #8
  470         sel     word, tmp, word
  471 .endm
 472
 473 /******************************************************************************/
 474
  475 .macro over_8888_n_8888_init
              /* Init hook for a8r8g8b8 IN constant-mask OVER a8r8g8b8. Leaves the
               * mask alpha (0-255) in MASK, the rounding constant in STRIDE_M and
               * GE[3:0] = 0101, as required by mul_8888_8. Passes Y, STRIDE_D,
               * STRIDE_S and ORIG_W to line_saved_regs because the process macros
               * alias them as WK4-WK7. Clobbers SCRATCH. */
  476         /* Mask is constant */
  477         ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
  478         /* Hold loop invariant in STRIDE_M */
  479         ldr     STRIDE_M, =0x00800080
  480         /* We only want the alpha bits of the constant mask */
  481         mov     MASK, MASK, lsr #24
  482         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
  483         uadd8   SCRATCH, STRIDE_M, STRIDE_M
  484         line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
  485 .endm
 486
  487 .macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
              /* Process-head hook: load numbytes of source pixels into
               * WK<4+(firstreg%2)> onwards and numbytes of destination pixels into
               * WK<firstreg> onwards. WK4-WK7 are aliased onto Y, STRIDE_D, STRIDE_S
               * and ORIG_W while the loads are emitted. Loads are unconditional (cond
               * is not forwarded); unaligned_mask and preload are unused. */
  488     WK4     .req    Y
  489     WK5     .req    STRIDE_D
  490     WK6     .req    STRIDE_S
  491     WK7     .req    ORIG_W
  492         pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
  493         pixld   , numbytes, firstreg, DST, 0
  494     .unreq  WK4
  495     .unreq  WK5
  496     .unreq  WK6
  497     .unreq  WK7
  498 .endm
 499
  500 .macro over_8888_n_8888_1pixel src, dst
              /* Composite one pixel: scale the source pixel WK<src> by the constant
               * mask alpha, then combine it OVER the destination pixel WK<dst>.
               * Expects WK6 to hold 255 (set by over_8888_n_8888_process_tail) and
               * uses WK7 as a temporary, so it must be expanded while those aliases
               * are in force. Clobbers SCRATCH, WK7 and WK<src>. */
              /* src = src * mask alpha */
  501         mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
              /* WK7 = 255 - alpha of the masked source */
  502         sub     WK7, WK6, WK&src, lsr #24
              /* dst = dst * (255 - alpha) */
  503         mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
              /* dst = saturate(dst + src), per byte */
  504         uqadd8  WK&dst, WK&dst, WK&src
  505 .endm
 506
  507 .macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
              /* Process-tail hook: composite numbytes/4 pixels and store them to DST.
               * If every source pixel is zero, the arithmetic and the store are
               * skipped by branching to local label 10. The flags are modified by the
               * transparency test. cond is unused. */
  508     WK4     .req    Y
  509     WK5     .req    STRIDE_D
  510     WK6     .req    STRIDE_S
  511     WK7     .req    ORIG_W
  512         over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
  513         beq     10f
              /* WK6 becomes the constant 255 used by over_8888_n_8888_1pixel; in the
               * 16-byte case this overwrites the third source pixel, hence the
               * reload below */
  514         mov     WK6, #255
  515  .set PROCESS_REG, firstreg
  516  .rept numbytes / 4
  517   .if numbytes == 16 && PROCESS_REG == 2
  518         /* We're using WK6 and WK7 as temporaries, so half way through
  519          * 4 pixels, reload the second two source pixels but this time
  520          * into WK4 and WK5 */
  521         ldmdb   SRC, {WK4, WK5}
  522   .endif
  523         over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  524   .set PROCESS_REG, PROCESS_REG+1
  525  .endr
  526         pixst   , numbytes, firstreg, DST
  527 10:
  528     .unreq  WK4
  529     .unreq  WK5
  530     .unreq  WK6
  531     .unreq  WK7
  532 .endm
 533
 534 generate_composite_function \
 535     pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
 536     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
 537     2, /* prefetch distance */ \
 538     over_8888_n_8888_init, \
 539     nop_macro, /* newline */ \
 540     nop_macro, /* cleanup */ \
 541     over_8888_n_8888_process_head, \
 542     over_8888_n_8888_process_tail
 543
 544 /******************************************************************************/
 545
  546 .macro over_n_8_8888_init
              /* Init hook for constant-source IN a8-mask OVER a8r8g8b8. Leaves the
               * source's even bytes (0 and 2) in STRIDE_S and its odd bytes (1 and 3)
               * in SRC, each zero-extended to 16 bits, and sets GE[3:0] = 0101.
               * Passes Y, STRIDE_D, STRIDE_M and ORIG_W to line_saved_regs because the
               * process macros use them as temporaries. Clobbers SCRATCH. */
  547         /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
  548         ldr     SRC, [sp, #ARGS_STACK_OFFSET]
  549         /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
  550         ldr     SCRATCH, =0x00800080
  551         uxtb16  STRIDE_S, SRC
  552         uxtb16  SRC, SRC, ror #8
  553         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
  554         uadd8   SCRATCH, SCRATCH, SCRATCH
  555         line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
  556 .endm
 557
  558 .macro over_n_8_8888_newline
              /* Newline hook: (re)load the rounding constant 0x00800080 into STRIDE_D,
               * which over_n_8_8888_1pixel uses as the MLA addend.
               * NOTE(review): presumably needed on every line because STRIDE_D is one
               * of the line_saved_regs and does not hold this value at line start -
               * confirm against the header. */
  559         ldr     STRIDE_D, =0x00800080
              /* Emit the literal pool here and jump over it so it is not executed */
  560         b       1f
  561  .ltorg
  562 1:
  563 .endm
 564
  565 .macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
              /* Process-head hook: load one 8-bit mask value per output pixel
               * (numbytes/4 bytes) into STRIDE_M, aliased as WK4 for pixld, and
               * numbytes of destination pixels into WK<firstreg> onwards. Loads are
               * unconditional (cond is not forwarded); unaligned_src and preload are
               * unused. */
  566     WK4     .req    STRIDE_M
  567         pixld   , numbytes/4, 4, MASK, unaligned_mask
  568         pixld   , numbytes, firstreg, DST, 0
  569     .unreq  WK4
  570 .endm
 571
  572 .macro over_n_8_8888_1pixel src, dst
              /* Composite one pixel: scale the constant source by one mask byte and
               * combine it OVER the destination pixel WK<dst>.
               *   src  index (0-3) of the mask byte within WK4 to use
               *   dst  WK index of the destination pixel; receives the result
               * Expects STRIDE_S/SRC to hold the even/odd source bytes (see
               * over_n_8_8888_init), STRIDE_D = 0x00800080 (see
               * over_n_8_8888_newline), GE[3:0] = 0101, and WK4 to be aliased to the
               * register holding the mask bytes. Clobbers Y, SCRATCH and ORIG_W. */
              /* Y = mask byte number <src> */
  573         uxtb    Y, WK4, ror #src*8
  574         /* Trailing part of multiplication of source */
  575         mla     SCRATCH, STRIDE_S, Y, STRIDE_D
  576         mla     Y, SRC, Y, STRIDE_D
  577         mov     ORIG_W, #255
  578         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
  579         uxtab16 Y, Y, Y, ror #8
  580         mov     SCRATCH, SCRATCH, ror #8
              /* ORIG_W = 255 - alpha of the masked source (top byte of Y) */
  581         sub     ORIG_W, ORIG_W, Y, lsr #24
              /* Y = masked source pixel: bytes 0 and 2 from SCRATCH, bytes 1 and 3 from Y */
  582         sel     Y, SCRATCH, Y
  583         /* Then multiply the destination */
  584         mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
  585         uqadd8  WK&dst, WK&dst, Y
  586 .endm
 587
  588 .macro over_n_8_8888_process_tail  cond, numbytes, firstreg
              /* Process-tail hook: composite numbytes/4 pixels and store them to DST.
               * If the mask word in WK4 (STRIDE_M) is entirely zero the destination is
               * unchanged, so the arithmetic and the store are skipped by branching to
               * local label 10. The flags are modified by the test. cond is unused. */
  589     WK4     .req    STRIDE_M
  590         teq     WK4, #0
  591         beq     10f
              /* PROCESS_REG walks the destination registers; PROCESS_REG-firstreg is
               * the index of the matching mask byte within WK4 */
  592  .set PROCESS_REG, firstreg
  593  .rept numbytes / 4
  594         over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
  595   .set PROCESS_REG, PROCESS_REG+1
  596  .endr
  597         pixst   , numbytes, firstreg, DST
  598 10:
  599     .unreq  WK4
  600 .endm
 601
 602 generate_composite_function \
 603     pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
 604     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
 605     2, /* prefetch distance */ \
 606     over_n_8_8888_init, \
 607     over_n_8_8888_newline, \
 608     nop_macro, /* cleanup */ \
 609     over_n_8_8888_process_head, \
 610     over_n_8_8888_process_tail
 611
 612 /******************************************************************************/
 613