 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 * Author: Ben Avison (bavison@riscosopen.org)
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *   processed, another cacheline is preloaded (the exact distance ahead is
 *   determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *   cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *   are preloaded to deal with data (if any) that hangs off the end of the
 *   last iteration of the inner loop, plus any trailing bytes that were not
 *   enough to make up one whole iteration of the inner loop
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
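 *
 * As a purely illustrative sketch (not generated code), the per-row
 * strategy for the "wide" case therefore amounts to roughly:
 *
 *     preload the leading cachelines of each channel
 *     while at least one whole inner-loop block remains:
 *         process one block of output
 *         preload the cacheline prefetch_distance lines ahead on each channel
 *     preload any trailing cachelines not yet covered
 *     process the remaining (up to 15) trailing bytes
 *
 * In the "medium" and "narrow" cases only the up-front preload and the
 * processing steps remain.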
 * Determine whether we put the arguments on the stack for debugging.
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the generated functions' behaviour.
.set FLAG_DST_WRITEONLY, 0
.set FLAG_DST_READWRITE, 1
.set FLAG_COND_EXEC, 0
.set FLAG_BRANCH_OVER, 2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR, 4
.set FLAG_PROCESS_DOESNT_STORE, 0
.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS, 0
.set FLAG_SPILL_LINE_VARS_WIDE, 16
.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
.set FLAG_SPILL_LINE_VARS, 48
.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
 * Offset into stack where mask and source pointer/stride can be accessed.
.set ARGS_STACK_OFFSET, (9*4+9*4)
.set ARGS_STACK_OFFSET, (9*4)
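/* Note: the 9*4 matches the nine registers saved by the "push {r4-r11, lr}"
 * in the function prologue below. When the arguments are also copied to the
 * stack for debugging (see the comment above), a further nine words sit
 * between sp and the stacked arguments, hence the doubled offset; they are
 * discarded again with the matching "add sp, sp, #9*4" before exit. */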
 * Constants for selecting preferable prefetch type.
.set PREFETCH_TYPE_NONE, 0
.set PREFETCH_TYPE_STANDARD, 1
 * Definitions of macros for load/store of pixel data.
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
op&r&cond WK&reg2, [base], #4
op&r&cond WK&reg3, [base], #4
op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
.elseif numbytes == 8
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
op&m&cond&ia base!, {WK&reg0,WK&reg1}
.elseif numbytes == 4
op&r&cond WK&reg0, [base], #4
.elseif numbytes == 2
op&r&cond&h WK&reg0, [base], #2
.elseif numbytes == 1
op&r&cond&b WK&reg0, [base], #1
.error "unsupported size: numbytes"
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
.elseif numbytes == 8
stm&cond&db base, {WK&reg0,WK&reg1}
.elseif numbytes == 4
str&cond WK&reg0, [base, #-4]
.elseif numbytes == 2
str&cond&h WK&reg0, [base, #-2]
.elseif numbytes == 1
str&cond&b WK&reg0, [base, #-1]
.error "unsupported size: numbytes"
.macro pixld cond, numbytes, firstreg, base, unaligned
pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.macro pixst cond, numbytes, firstreg, base
.if (flags) & FLAG_DST_READWRITE
pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
.macro preload_leading_step1 bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
PF bic, ptr, base, #31
.rept prefetch_distance+1
PF pld, [ptr, #OFFSET]
.set OFFSET, OFFSET+32
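/* For example, with prefetch_distance = 3 this issues four plds, covering
 * the cacheline containing the (rounded-down) base pointer and the next
 * three, so there is no gap before the inner-loop prefetches take over. */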
.macro preload_leading_step2 bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 * inner_loop_offset = (src+leading_bytes)&31
 * extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
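 *
 * A worked example (illustrative numbers only): with an 8bpp destination
 * and a 32bpp source, suppose dst&15 = 7 and src&31 = 28. Then
 * leading_bytes = 9*32/8 = 36, inner_loop_offset = (28+36)&31 = 0, and
 * extra_needed = 36, which is > 32, so two extra cacheline preloads are
 * issued beyond those covered by preload_leading_step1.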
/* The test can be simplified further when preloading the destination */
.if bpp/dst_w_bpp == 4
PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
PF and, SCRATCH, SCRATCH, #31
PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
PF pld, [ptr, #32*(prefetch_distance+2)]
PF mov, SCRATCH, base, lsl #32-5
PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
60: PF pld, [ptr, #32*(prefetch_distance+1)]
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
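/* IS_END_OF_GROUP(INDEX,SIZE) is nonzero when INDEX is the last index of a
 * group of SIZE consecutive subblocks (or always, when SIZE < 2); for
 * example, with SIZE = 4 it is true for INDEX = 3, 7, 11, ... This is what
 * spaces the preload_middle prefetches out to one per group of subblocks. */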
.macro preload_middle bpp, base, scratch_holds_offset
/* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
.if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
.if scratch_holds_offset
PF pld, [base, SCRATCH]
PF bic, SCRATCH, base, #31
PF pld, [SCRATCH, #32*prefetch_distance]
.macro preload_trailing bpp, bpp_shift, base
.if bpp*pix_per_block > 256
/* Calculations are more complex if more than one fetch per block */
PF and, WK1, base, #31
PF add, WK1, WK1, WK0, lsl #bpp_shift
PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
PF bic, SCRATCH, base, #31
80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
PF add, SCRATCH, SCRATCH, #32
PF subs, WK1, WK1, #32
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
PF mov, SCRATCH, base, lsl #32-5
PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
PF adceqs, SCRATCH, SCRATCH, #0
/* The instruction above has two effects: it ensures Z is only
 * set if C was clear (so Z indicates that both shifted quantities
 * were 0), and clears C if Z was set (so C indicates that the sum
 * of the shifted quantities was strictly greater than 32) */
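/* Informally, writing "sum" for the total of the two 5-bit shifted
 * quantities (base & 31 plus the remaining byte count modulo 32), the
 * flags now encode three cases:
 *   sum == 0        ->  Z set,   C clear
 *   0 < sum <= 32   ->  Z clear, C clear
 *   sum > 32        ->  Z clear, C set
 * which is then used to issue the 0, 1 or 2 extra preloads mentioned in
 * the comment above. */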
PF bic, SCRATCH, base, #31
PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
.macro preload_line narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 * code path rather than the "medium" one - because in the narrow case,
 * the row of pixels is known to output no more than 30 bytes, so
 * (assuming the source pixels are no wider than the destination
 * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 * meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 * destination) that's being preloaded, or 0 if this channel is not used
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
.if narrow_case && (bpp <= dst_w_bpp)
/* In these cases, each line for each channel is in either 1 or 2 cache lines */
PF bic, WK0, base, #31
PF add, WK1, base, X, lsl #bpp_shift
PF bic, WK1, WK1, #31
PF bic, WK0, base, #31
PF add, WK1, base, X, lsl #bpp_shift
PF bic, WK1, WK1, #31
91: PF add, WK0, WK0, #32
.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
sub&cond X, X, #8*numbytes/dst_w_bpp
process_tail cond, numbytes, firstreg
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst cond, numbytes, firstreg, DST
.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_BRANCH_OVER
conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
.if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
/* Can't interleave reads and writes */
conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_PROCESS_CORRUPTS_PSR
conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
/* Can interleave reads and writes for better scheduling */
process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
sub&cond1 X, X, #8*numbytes1/dst_w_bpp
sub&cond2 X, X, #8*numbytes2/dst_w_bpp
process_tail cond1, numbytes1, firstreg1
process_tail cond2, numbytes2, firstreg2
pixst cond1, numbytes1, firstreg1, DST
pixst cond2, numbytes2, firstreg2, DST
.macro test_bits_1_0_ptr
movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
.macro test_bits_3_2_ptr
movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3,2 of DST */
.macro leading_15bytes process_head, process_tail
/* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
/* Use unaligned loads in all cases for simplicity */
conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
.elseif dst_w_bpp == 16
conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.macro test_bits_3_2_pix
movs SCRATCH, X, lsl #dst_bpp_shift+32-3
.macro test_bits_1_0_pix
movs SCRATCH, X, lsl #dst_bpp_shift+32-1
movs SCRATCH, X, lsr #1
.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
.elseif dst_w_bpp == 8
conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
.set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
.rept pix_per_block*dst_w_bpp/128
process_head , 16, 0, unaligned_src, unaligned_mask, 1
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle src_bpp, SRC, 1
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle mask_bpp, MASK, 1
preload_middle src_bpp, SRC, 0
preload_middle mask_bpp, MASK, 0
.if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
/* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
 * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
 * preloads for, to achieve staggered prefetches for multiple channels, because there are
 * always two STMs per prefetch, so there is always an opposite STM on which to put the
 * preload. Note, no need to BIC the base register here */
PF pld, [DST, #32*prefetch_distance - dst_alignment]
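/* (dst_alignment is 0 or 16, chosen in wide_case_inner_loop_and_trailing_pixels
 * according to which 16-byte half of a cacheline DST started in, so that by
 * the time this pld executes the address works out to a whole multiple of 32.) */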
.if !((flags) & FLAG_PROCESS_DOES_STORE)
.set SUBBLOCK, SUBBLOCK+1
subs X, X, #pix_per_block
.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
/* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
PF and, WK0, X, #pix_per_block-1
preload_trailing src_bpp, src_bpp_shift, SRC
preload_trailing mask_bpp, mask_bpp_shift, MASK
preload_trailing dst_r_bpp, dst_bpp_shift, DST
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
process_head , 16, 0, unaligned_src, unaligned_mask, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
subs X, X, #128/dst_w_bpp
/* Trailing pixels */
tst X, #128/dst_w_bpp - 1
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
tst X, #16*8/dst_w_bpp
conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
/* Trailing pixels */
/* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
/* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
.if mask_bpp == 8 || mask_bpp == 16
.if src_bpp == 8 || src_bpp == 16
action process_head, process_tail, process_inner_loop, exit_label, 0, 0
.if src_bpp == 8 || src_bpp == 16
action process_head, process_tail, process_inner_loop, exit_label, 1, 0
.if mask_bpp == 8 || mask_bpp == 16
.if src_bpp == 8 || src_bpp == 16
action process_head, process_tail, process_inner_loop, exit_label, 0, 1
.if src_bpp == 8 || src_bpp == 16
action process_head, process_tail, process_inner_loop, exit_label, 1, 1
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
/* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
/* This is ldmia sp,{} */
.word 0xE89D0000 | LINE_SAVED_REGS
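/* For example, if line_saved_regs was given STRIDE_D and ORIG_W, then
 * LINE_SAVED_REGS = (1<<3)|(1<<14) and the word above assembles to
 * "ldmia sp, {r3, lr}". */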
.if (LINE_SAVED_REGS) & (1<<1)
add DST, DST, STRIDE_D
add SRC, SRC, STRIDE_S
add MASK, MASK, STRIDE_M
.if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
.macro generate_composite_function fname, \
prefetch_distance_, \
/* For ELF format also set function visibility to hidden */
.type fname, %function
 * Make some macro arguments globally visible and accessible
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set prefetch_distance, prefetch_distance_
 * Select prefetch type for this function.
.if prefetch_distance == 0
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
.set src_bpp_shift, 2
.elseif src_bpp == 24
.set src_bpp_shift, 0
.elseif src_bpp == 16
.set src_bpp_shift, 1
.set src_bpp_shift, 0
.set src_bpp_shift, -1
.error "requested src bpp (src_bpp) is not supported"
.set mask_bpp_shift, 2
.elseif mask_bpp == 24
.set mask_bpp_shift, 0
.elseif mask_bpp == 8
.set mask_bpp_shift, 0
.elseif mask_bpp == 0
.set mask_bpp_shift, -1
.error "requested mask bpp (mask_bpp) is not supported"
.set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
.set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
.set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
.set dst_bpp_shift, 0
.error "requested dst bpp (dst_w_bpp) is not supported"
.if (((flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.set pix_per_block, 16*8/dst_w_bpp
.if 32*8/src_bpp > pix_per_block
.set pix_per_block, 32*8/src_bpp
.if 32*8/mask_bpp > pix_per_block
.set pix_per_block, 32*8/mask_bpp
.if 32*8/dst_r_bpp > pix_per_block
.set pix_per_block, 32*8/dst_r_bpp
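/* For example (illustrative only): with a 32bpp destination and an 8bpp
 * source, 16*8/32 = 4 pixels per 16-byte write, but 32*8/8 = 32 pixels per
 * 32-byte source cacheline, so pix_per_block becomes 32 and each inner-loop
 * block covers a whole cacheline of the narrowest channel. */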
/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 * Assign symbolic names to registers
X .req r0 /* pixels to go on this line */
Y .req r1 /* lines to go */
DST .req r2 /* destination pixel pointer */
STRIDE_D .req r3 /* destination stride (bytes, minus width) */
SRC .req r4 /* source pixel pointer */
STRIDE_S .req r5 /* source stride (bytes, minus width) */
MASK .req r6 /* mask pixel pointer (if applicable) */
STRIDE_M .req r7 /* mask stride (bytes, minus width) */
WK0 .req r8 /* pixel data registers */
ORIG_W .req r14 /* width (pixels) */
push {r4-r11, lr} /* save all registers */
ldr SRC, [sp, #ARGS_STACK_OFFSET]
ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
lsl STRIDE_S, #src_bpp_shift
sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
lsl STRIDE_M, #mask_bpp_shift
sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
/* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
cmp X, #2*16*8/dst_w_bpp - 1
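/* (For example, with a 32bpp destination this constant evaluates to 7:
 * rows narrower than that cannot be guaranteed to contain one aligned
 * 16-byte write whatever the destination alignment, so they take the
 * narrow path below.) */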
.if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
/* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
cmp X, #(prefetch_distance+3)*pix_per_block - 1
/* Adjust X so that the decrement instruction can also test for
 * inner loop termination. We want it to stop when there are
 * (prefetch_distance+1) complete blocks to go. */
sub X, X, #(prefetch_distance+2)*pix_per_block
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
preload_leading_step1 dst_r_bpp, WK3, DST
rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
.if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
PF and, WK0, WK0, #15
preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
leading_15bytes process_head, process_tail
154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, SRC, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.ifc "process_inner_loop",""
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
160: /* Medium case */
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
preload_line 0, dst_r_bpp, dst_bpp_shift, DST
sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
leading_15bytes process_head, process_tail
164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
preload_line 1, dst_r_bpp, dst_bpp_shift, DST
process_head , 1, 0, 1, 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
.elseif dst_w_bpp == 16
process_head , 2, 0, 1, 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
.if (flags) & FLAG_SPILL_LINE_VARS
add sp, sp, #LINE_SAVED_REG_COUNT*4
add sp, sp, #9*4 /* junk the debug copy of arguments */
pop {r4-r11, pc} /* exit */
.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.ifc "SAVED_REG","STRIDE_D"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.ifc "SAVED_REG","STRIDE_S"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.ifc "SAVED_REG","STRIDE_M"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.ifc "SAVED_REG","ORIG_W"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.macro nop_macro x:vararg