 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 * Author: Ben Avison (bavison@riscosopen.org)
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *   processed, another cacheline is preloaded (the exact distance ahead is
 *   determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *   cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *   are preloaded to deal with data (if any) that hangs off the end of the
 *   last iteration of the inner loop, plus any trailing bytes that were not
 *   enough to make up one whole iteration of the inner loop
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
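 *
 * As a purely illustrative sketch (not generated code), the per-row
 * strategy for the "wide" case therefore amounts to roughly:
 *
 *     preload the leading cachelines of each channel
 *     while at least one whole inner-loop block remains:
 *         process one block of output
 *         preload the cacheline prefetch_distance lines ahead on each channel
 *     preload any trailing cachelines not yet covered
 *     process the remaining (up to 15) trailing bytes
 *
 * In the "medium" and "narrow" cases only the up-front preload and the
 * processing steps remain.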
 * Determine whether we put the arguments on the stack for debugging.
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the generated functions' behaviour.
.set FLAG_DST_WRITEONLY, 0
.set FLAG_DST_READWRITE, 1
.set FLAG_COND_EXEC, 0
.set FLAG_BRANCH_OVER, 2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR, 4
.set FLAG_PROCESS_DOESNT_STORE, 0
.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS, 0
.set FLAG_SPILL_LINE_VARS_WIDE, 16
.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
.set FLAG_SPILL_LINE_VARS, 48
.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
 * Offset into stack where mask and source pointer/stride can be accessed.
.set ARGS_STACK_OFFSET, (9*4+9*4)
.set ARGS_STACK_OFFSET, (9*4)
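/* Note: the 9*4 matches the nine registers saved by the "push {r4-r11, lr}"
 * in the function prologue below. When the arguments are also copied to the
 * stack for debugging (see the comment above), a further nine words sit
 * between sp and the stacked arguments, hence the doubled offset; they are
 * discarded again with the matching "add sp, sp, #9*4" before exit. */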
 * Constants for selecting preferable prefetch type.
.set PREFETCH_TYPE_NONE, 0
.set PREFETCH_TYPE_STANDARD, 1
 * Definitions of macros for load/store of pixel data.
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
op&r&cond WK&reg2, [base], #4
op&r&cond WK&reg3, [base], #4
op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
.elseif numbytes == 8
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
op&m&cond&ia base!, {WK&reg0,WK&reg1}
.elseif numbytes == 4
op&r&cond WK&reg0, [base], #4
.elseif numbytes == 2
op&r&cond&h WK&reg0, [base], #2
.elseif numbytes == 1
op&r&cond&b WK&reg0, [base], #1
.error "unsupported size: numbytes"
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
.elseif numbytes == 8
stm&cond&db base, {WK&reg0,WK&reg1}
.elseif numbytes == 4
str&cond WK&reg0, [base, #-4]
.elseif numbytes == 2
str&cond&h WK&reg0, [base, #-2]
.elseif numbytes == 1
str&cond&b WK&reg0, [base, #-1]
.error "unsupported size: numbytes"
.macro pixld cond, numbytes, firstreg, base, unaligned
pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.macro pixst cond, numbytes, firstreg, base
.if (flags) & FLAG_DST_READWRITE
pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
.macro preload_leading_step1 bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
PF bic, ptr, base, #31
.rept prefetch_distance+1
PF pld, [ptr, #OFFSET]
.set OFFSET, OFFSET+32
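/* For example, with prefetch_distance = 3 this issues four plds, covering
 * the cacheline containing the (rounded-down) base pointer and the next
 * three, so there is no gap before the inner-loop prefetches take over. */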
.macro preload_leading_step2 bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 * inner_loop_offset = (src+leading_bytes)&31
 * extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
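 *
 * A worked example (illustrative numbers only): with an 8bpp destination
 * and a 32bpp source, suppose dst&15 = 7 and src&31 = 28. Then
 * leading_bytes = 9*32/8 = 36, inner_loop_offset = (28+36)&31 = 0, and
 * extra_needed = 36, which is > 32, so two extra cacheline preloads are
 * issued beyond those covered by preload_leading_step1.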
/* The test can be simplified further when preloading the destination */
.if bpp/dst_w_bpp == 4
PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
PF and, SCRATCH, SCRATCH, #31
PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
PF pld, [ptr, #32*(prefetch_distance+2)]
PF mov, SCRATCH, base, lsl #32-5
PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
60: PF pld, [ptr, #32*(prefetch_distance+1)]
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
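/* IS_END_OF_GROUP(INDEX,SIZE) is nonzero when INDEX is the last index of a
 * group of SIZE consecutive subblocks (or always, when SIZE < 2); for
 * example, with SIZE = 4 it is true for INDEX = 3, 7, 11, ... This is what
 * spaces the preload_middle prefetches out to one per group of subblocks. */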
.macro preload_middle bpp, base, scratch_holds_offset
/* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
.if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
.if scratch_holds_offset
PF pld, [base, SCRATCH]
PF bic, SCRATCH, base, #31
PF pld, [SCRATCH, #32*prefetch_distance]
.macro preload_trailing bpp, bpp_shift, base
.if bpp*pix_per_block > 256
/* Calculations are more complex if more than one fetch per block */
PF and, WK1, base, #31
PF add, WK1, WK1, WK0, lsl #bpp_shift
PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
PF bic, SCRATCH, base, #31
80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
PF add, SCRATCH, SCRATCH, #32
PF subs, WK1, WK1, #32
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
PF mov, SCRATCH, base, lsl #32-5
PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
PF adceqs, SCRATCH, SCRATCH, #0
/* The instruction above has two effects: it ensures Z is only
 * set if C was clear (so Z indicates that both shifted quantities
 * were 0), and clears C if Z was set (so C indicates that the sum
 * of the shifted quantities was strictly greater than 32) */
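/* Informally, writing "sum" for the total of the two 5-bit shifted
 * quantities (base & 31 plus the remaining byte count modulo 32), the
 * flags now encode three cases:
 *   sum == 0        ->  Z set,   C clear
 *   0 < sum <= 32   ->  Z clear, C clear
 *   sum > 32        ->  Z clear, C set
 * which is then used to issue the 0, 1 or 2 extra preloads mentioned in
 * the comment above. */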
PF bic, SCRATCH, base, #31
PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
.macro preload_line narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 * code path rather than the "medium" one - because in the narrow case,
 * the row of pixels is known to output no more than 30 bytes, so
 * (assuming the source pixels are no wider than the destination
 * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 * meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 * destination) that's being preloaded, or 0 if this channel is not used
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
.if narrow_case && (bpp <= dst_w_bpp)
/* In these cases, each line for each channel is in either 1 or 2 cache lines */
PF bic, WK0, base, #31
PF add, WK1, base, X, lsl #bpp_shift
PF bic, WK1, WK1, #31
PF bic, WK0, base, #31
PF add, WK1, base, X, lsl #bpp_shift
PF bic, WK1, WK1, #31
91: PF add, WK0, WK0, #32
.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
sub&cond X, X, #8*numbytes/dst_w_bpp
process_tail cond, numbytes, firstreg
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst cond, numbytes, firstreg, DST
.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_BRANCH_OVER
conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
.if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
/* Can't interleave reads and writes */
conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_PROCESS_CORRUPTS_PSR
conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
/* Can interleave reads and writes for better scheduling */
process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
sub&cond1 X, X, #8*numbytes1/dst_w_bpp
sub&cond2 X, X, #8*numbytes2/dst_w_bpp
process_tail cond1, numbytes1, firstreg1
process_tail cond2, numbytes2, firstreg2
pixst cond1, numbytes1, firstreg1, DST
pixst cond2, numbytes2, firstreg2, DST
.macro test_bits_1_0_ptr
movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
.macro test_bits_3_2_ptr
movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3,2 of DST */
.macro leading_15bytes process_head, process_tail
/* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
/* Use unaligned loads in all cases for simplicity */
conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
.elseif dst_w_bpp == 16
conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.macro test_bits_3_2_pix
movs SCRATCH, X, lsl #dst_bpp_shift+32-3
.macro test_bits_1_0_pix
movs SCRATCH, X, lsl #dst_bpp_shift+32-1
movs SCRATCH, X, lsr #1
.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
.elseif dst_w_bpp == 8
conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
.set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
.rept pix_per_block*dst_w_bpp/128
process_head , 16, 0, unaligned_src, unaligned_mask, 1
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle src_bpp, SRC, 1
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle mask_bpp, MASK, 1
preload_middle src_bpp, SRC, 0
preload_middle mask_bpp, MASK, 0
.if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
/* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
 * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
 * preloads for, to achieve staggered prefetches for multiple channels, because there are
 * always two STMs per prefetch, so there is always an opposite STM on which to put the
 * preload. Note, no need to BIC the base register here */
PF pld, [DST, #32*prefetch_distance - dst_alignment]
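/* (dst_alignment is 0 or 16, chosen in wide_case_inner_loop_and_trailing_pixels
 * according to which 16-byte half of a cacheline DST started in, so that by
 * the time this pld executes the address works out to a whole multiple of 32.) */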
.if !((flags) & FLAG_PROCESS_DOES_STORE)
.set SUBBLOCK, SUBBLOCK+1
subs X, X, #pix_per_block
.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
/* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
PF and, WK0, X, #pix_per_block-1
preload_trailing src_bpp, src_bpp_shift, SRC
preload_trailing mask_bpp, mask_bpp_shift, MASK
preload_trailing dst_r_bpp, dst_bpp_shift, DST
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
process_head , 16, 0, unaligned_src, unaligned_mask, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
subs X, X, #128/dst_w_bpp
/* Trailing pixels */
tst X, #128/dst_w_bpp - 1
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
tst X, #16*8/dst_w_bpp
conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
/* Trailing pixels */
/* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
/* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
.if mask_bpp == 8 || mask_bpp == 16
.if src_bpp == 8 || src_bpp == 16
action process_head, process_tail, process_inner_loop, exit_label, 0, 0
.if src_bpp == 8 || src_bpp == 16
action process_head, process_tail, process_inner_loop, exit_label, 1, 0
.if mask_bpp == 8 || mask_bpp == 16
.if src_bpp == 8 || src_bpp == 16
action process_head, process_tail, process_inner_loop, exit_label, 0, 1
.if src_bpp == 8 || src_bpp == 16
action process_head, process_tail, process_inner_loop, exit_label, 1, 1
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
/* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
/* This is ldmia sp,{} */
.word 0xE89D0000 | LINE_SAVED_REGS
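/* For example, if line_saved_regs was given STRIDE_D and ORIG_W, then
 * LINE_SAVED_REGS = (1<<3)|(1<<14) and the word above assembles to
 * "ldmia sp, {r3, lr}". */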
.if (LINE_SAVED_REGS) & (1<<1)
add DST, DST, STRIDE_D
add SRC, SRC, STRIDE_S
add MASK, MASK, STRIDE_M
.if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
.macro generate_composite_function fname, \
prefetch_distance_, \
/* For ELF format also set function visibility to hidden */
.type fname, %function
 * Make some macro arguments globally visible and accessible
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set prefetch_distance, prefetch_distance_
 * Select prefetch type for this function.
.if prefetch_distance == 0
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
.set src_bpp_shift, 2
.elseif src_bpp == 24
.set src_bpp_shift, 0
.elseif src_bpp == 16
.set src_bpp_shift, 1
.set src_bpp_shift, 0
.set src_bpp_shift, -1
.error "requested src bpp (src_bpp) is not supported"
.set mask_bpp_shift, 2
.elseif mask_bpp == 24
.set mask_bpp_shift, 0
.elseif mask_bpp == 8
.set mask_bpp_shift, 0
.elseif mask_bpp == 0
.set mask_bpp_shift, -1
.error "requested mask bpp (mask_bpp) is not supported"
.set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
.set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
.set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
.set dst_bpp_shift, 0
.error "requested dst bpp (dst_w_bpp) is not supported"
.if (((flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.set pix_per_block, 16*8/dst_w_bpp
.if 32*8/src_bpp > pix_per_block
.set pix_per_block, 32*8/src_bpp
.if 32*8/mask_bpp > pix_per_block
.set pix_per_block, 32*8/mask_bpp
.if 32*8/dst_r_bpp > pix_per_block
.set pix_per_block, 32*8/dst_r_bpp
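/* For example (illustrative only): with a 32bpp destination and an 8bpp
 * source, 16*8/32 = 4 pixels per 16-byte write, but 32*8/8 = 32 pixels per
 * 32-byte source cacheline, so pix_per_block becomes 32 and each inner-loop
 * block covers a whole cacheline of the narrowest channel. */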
/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 * Assign symbolic names to registers
X .req r0 /* pixels to go on this line */
Y .req r1 /* lines to go */
DST .req r2 /* destination pixel pointer */
STRIDE_D .req r3 /* destination stride (bytes, minus width) */
SRC .req r4 /* source pixel pointer */
STRIDE_S .req r5 /* source stride (bytes, minus width) */
MASK .req r6 /* mask pixel pointer (if applicable) */
STRIDE_M .req r7 /* mask stride (bytes, minus width) */
WK0 .req r8 /* pixel data registers */
ORIG_W .req r14 /* width (pixels) */
push {r4-r11, lr} /* save all registers */
ldr SRC, [sp, #ARGS_STACK_OFFSET]
ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
lsl STRIDE_S, #src_bpp_shift
sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
lsl STRIDE_M, #mask_bpp_shift
sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
/* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
cmp X, #2*16*8/dst_w_bpp - 1
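/* (For example, with a 32bpp destination this constant evaluates to 7:
 * rows narrower than that cannot be guaranteed to contain one aligned
 * 16-byte write whatever the destination alignment, so they take the
 * narrow path below.) */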
.if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
/* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
cmp X, #(prefetch_distance+3)*pix_per_block - 1
/* Adjust X so that the decrement instruction can also test for
 * inner loop termination. We want it to stop when there are
 * (prefetch_distance+1) complete blocks to go. */
sub X, X, #(prefetch_distance+2)*pix_per_block
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
preload_leading_step1 dst_r_bpp, WK3, DST
rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
.if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
PF and, WK0, WK0, #15
preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
leading_15bytes process_head, process_tail
154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, SRC, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.ifc "process_inner_loop",""
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
160: /* Medium case */
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
preload_line 0, dst_r_bpp, dst_bpp_shift, DST
sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
leading_15bytes process_head, process_tail
164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
preload_line 1, dst_r_bpp, dst_bpp_shift, DST
process_head , 1, 0, 1, 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
.elseif dst_w_bpp == 16
process_head , 2, 0, 1, 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
.if (flags) & FLAG_SPILL_LINE_VARS
add sp, sp, #LINE_SAVED_REG_COUNT*4
add sp, sp, #9*4 /* junk the debug copy of arguments */
pop {r4-r11, pc} /* exit */
.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.ifc "SAVED_REG","STRIDE_D"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.ifc "SAVED_REG","STRIDE_S"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.ifc "SAVED_REG","STRIDE_M"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.ifc "SAVED_REG","ORIG_W"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.macro nop_macro x:vararg