Add qemu 2.4.0
[kvmfornfv.git] / qemu / pixman / pixman / pixman-arm-simd-asm.S
1 /*
2  * Copyright © 2012 Raspberry Pi Foundation
3  * Copyright © 2012 RISC OS Open Ltd
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of the copyright holders not be used in
10  * advertising or publicity pertaining to distribution of the software without
11  * specific, written prior permission.  The copyright holders make no
12  * representations about the suitability of this software for any purpose.  It
13  * is provided "as is" without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Ben Avison (bavison@riscosopen.org)
25  *
26  */
27
28 /* Prevent the stack from becoming executable */
29 #if defined(__linux__) && defined(__ELF__)
30 .section .note.GNU-stack,"",%progbits
31 #endif
32
33         .text
34         .arch armv6
35         .object_arch armv4
36         .arm
37         .altmacro
38         .p2align 2
39
40 #include "pixman-arm-simd-asm.h"
41
42 /* A head macro should do all processing which results in an output of up to
43  * 16 bytes, as far as the final load instruction. The corresponding tail macro
44  * should complete the processing of the up-to-16 bytes. The calling macro will
45  * sometimes choose to insert a preload or a decrement of X between them.
46  *   cond           ARM condition code for code block
47  *   numbytes       Number of output bytes that should be generated this time
48  *   firstreg       First WK register in which to place output
49  *   unaligned_src  Whether to use non-wordaligned loads of source image
50  *   unaligned_mask Whether to use non-wordaligned loads of mask image
51  *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
52  */
53
/* Init hook shared by the straight-copy ("src") blitters instantiated below.
 * It hands STRIDE_D and STRIDE_S to line_saved_regs, which is defined outside
 * this file (presumably so that both are preserved per line - confirm in
 * pixman-arm-simd-asm.h). blit_inner_loop reuses both registers as WK4/WK5. */
54 .macro blit_init
55         line_saved_regs STRIDE_D, STRIDE_S
56 .endm
57
/* Process-head for straight copies: forwards cond, numbytes and firstreg to
 * pixld with SRC as the base register, so the only "processing" is the load
 * itself (the blitters below pair it with nop_macro as the tail).
 * unaligned_mask and preload are accepted but not used. */
58 .macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
59         pixld   cond, numbytes, firstreg, SRC, unaligned_src
60 .endm
61
/* Unrolled inner loop used by the straight-copy blitters. Each iteration
 * issues two 16-byte pixld operations and two 16-byte pixst operations, using
 * eight WK registers. WK4-WK7 are aliased onto STRIDE_D, STRIDE_S, MASK and
 * STRIDE_M for the duration of the loop and released again at the end.
 * process_head, process_tail, unaligned_mask and dst_alignment are part of
 * the macro signature but are not referenced by this implementation. */
62 .macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
63     WK4     .req    STRIDE_D
64     WK5     .req    STRIDE_S
65     WK6     .req    MASK
66     WK7     .req    STRIDE_M
        /* Load into WK0.. and WK4.., prefetch at SRC+SCRATCH, then store both groups */
67 110:    pixld   , 16, 0, SRC, unaligned_src
68         pixld   , 16, 4, SRC, unaligned_src
69         pld     [SRC, SCRATCH]
70         pixst   , 16, 0, DST
71         pixst   , 16, 4, DST
        /* X drops by the number of pixels in 32 bytes; loop again while the
         * subtraction did not borrow (HS = carry set) */
72         subs    X, X, #32*8/src_bpp
73         bhs     110b
74     .unreq  WK4
75     .unreq  WK5
76     .unreq  WK6
77     .unreq  WK7
78 .endm
79
/* Instantiate the 8888-to-8888 copy routine from the blit_* macros above.
 * NOTE(review): the three numbers after the function name look like source,
 * mask and destination bits-per-pixel - confirm against
 * generate_composite_function, which is defined outside this file. */
80 generate_composite_function \
81     pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
82     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
83     4, /* prefetch distance */ \
84     blit_init, \
85     nop_macro, /* newline */ \
86     nop_macro, /* cleanup */ \
87     blit_process_head, \
88     nop_macro, /* process tail */ \
89     blit_inner_loop
90
/* Instantiate the 0565-to-0565 copy routine; identical to the 8888 copy
 * instantiation except for the name and the 16/0/16 size arguments
 * (presumably bits-per-pixel for source, mask, destination - confirm). */
91 generate_composite_function \
92     pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
93     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
94     4, /* prefetch distance */ \
95     blit_init, \
96     nop_macro, /* newline */ \
97     nop_macro, /* cleanup */ \
98     blit_process_head, \
99     nop_macro, /* process tail */ \
100     blit_inner_loop
101
/* Instantiate the 8-to-8 copy routine. Same macros as the other two copy
 * instantiations, but with 8/0/8 size arguments (presumably bits-per-pixel -
 * confirm) and a prefetch distance of 3 instead of 4. */
102 generate_composite_function \
103     pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
104     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
105     3, /* prefetch distance */ \
106     blit_init, \
107     nop_macro, /* newline */ \
108     nop_macro, /* cleanup */ \
109     blit_process_head, \
110     nop_macro, /* process tail */ \
111     blit_inner_loop
112
113 /******************************************************************************/
114
/* Init for the 32bpp solid fill: fetch the word at [sp, #ARGS_STACK_OFFSET]
 * (presumably the solid source colour - confirm against the C prototype) and
 * copy it into STRIDE_S, MASK and STRIDE_M as well. fill_process_tail names
 * these four registers WK4-WK7, so together they hold four copies of the
 * loaded word. */
115 .macro src_n_8888_init
116         ldr     SRC, [sp, #ARGS_STACK_OFFSET]
117         mov     STRIDE_S, SRC
118         mov     MASK, SRC
119         mov     STRIDE_M, SRC
120 .endm
121
/* Init for the 16bpp solid fill: load a halfword from the stack argument
 * slot, duplicate it into both halves of SRC, then replicate that word into
 * STRIDE_S, MASK and STRIDE_M (the registers fill_process_tail uses as
 * WK5-WK7). */
122 .macro src_n_0565_init
123         ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        /* SRC = halfword | (halfword << 16) */
124         orr     SRC, SRC, lsl #16
125         mov     STRIDE_S, SRC
126         mov     MASK, SRC
127         mov     STRIDE_M, SRC
128 .endm
129
/* Init for the 8bpp solid fill: load a single byte from the stack argument
 * slot, replicate it into all four bytes of SRC, then copy that word into
 * STRIDE_S, MASK and STRIDE_M (the registers fill_process_tail uses as
 * WK5-WK7). */
130 .macro src_n_8_init
131         ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        /* Byte -> both bytes of the low halfword -> all four bytes */
132         orr     SRC, SRC, lsl #8
133         orr     SRC, SRC, lsl #16
134         mov     STRIDE_S, SRC
135         mov     MASK, SRC
136         mov     STRIDE_M, SRC
137 .endm
138
/* Process-tail for the solid fills: issue a pixst of numbytes to DST starting
 * at WK4. WK4-WK7 are aliased to SRC, STRIDE_S, MASK and STRIDE_M, i.e. the
 * four registers the src_n_*_init macros fill with the replicated colour.
 * firstreg is ignored - the store always starts at register 4. */
139 .macro fill_process_tail  cond, numbytes, firstreg
140     WK4     .req    SRC
141     WK5     .req    STRIDE_S
142     WK6     .req    MASK
143     WK7     .req    STRIDE_M
144         pixst   cond, numbytes, 4, DST
145     .unreq  WK4
146     .unreq  WK5
147     .unreq  WK6
148     .unreq  WK7
149 .endm
150
/* Instantiate the 32bpp solid-fill routine: no per-pixel head processing, the
 * tail stores the colour replicated by src_n_8888_init.
 * Every macro argument is now explicitly comma-separated, matching the other
 * instantiations in this file; previously the flags, init, cleanup and
 * process-head arguments relied on whitespace-only argument splitting. */
151 generate_composite_function \
152     pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
153     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
154     0, /* prefetch distance doesn't apply */ \
155     src_n_8888_init, \
156     nop_macro, /* newline */ \
157     nop_macro, /* cleanup */ \
158     nop_macro, /* process head */ \
159     fill_process_tail
160
/* Instantiate the 16bpp solid-fill routine: no per-pixel head processing, the
 * tail stores the colour replicated by src_n_0565_init.
 * Every macro argument is now explicitly comma-separated, matching the other
 * instantiations in this file; previously the flags, init, cleanup and
 * process-head arguments relied on whitespace-only argument splitting. */
161 generate_composite_function \
162     pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
163     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
164     0, /* prefetch distance doesn't apply */ \
165     src_n_0565_init, \
166     nop_macro, /* newline */ \
167     nop_macro, /* cleanup */ \
168     nop_macro, /* process head */ \
169     fill_process_tail
170
/* Instantiate the 8bpp solid-fill routine: no per-pixel head processing, the
 * tail stores the colour replicated by src_n_8_init.
 * Every macro argument is now explicitly comma-separated, matching the other
 * instantiations in this file; previously the flags, init, cleanup and
 * process-head arguments relied on whitespace-only argument splitting. */
171 generate_composite_function \
172     pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
173     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
174     0, /* prefetch distance doesn't apply */ \
175     src_n_8_init, \
176     nop_macro, /* newline */ \
177     nop_macro, /* cleanup */ \
178     nop_macro, /* process head */ \
179     fill_process_tail
180
181 /******************************************************************************/
182
/* Set the top byte of WK<reg> to 0xFF, leaving the low 24 bits untouched.
 *   cond  optional ARM condition code appended to the instruction
 *   reg   index of the WK register to modify */
183 .macro src_x888_8888_pixel, cond, reg
184         orr&cond WK&reg, WK&reg, #0xFF000000
185 .endm
186
/* Process-head for x888 -> 8888: forwards cond, numbytes and firstreg to pixld
 * with SRC as the base register; the top-byte fix-up happens in the matching
 * process_tail. unaligned_mask and preload are accepted but not used. */
187 .macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
188         pixld   cond, numbytes, firstreg, SRC, unaligned_src
189 .endm
190
/* Process-tail for x888 -> 8888: apply src_x888_8888_pixel to every WK
 * register covered by this block. WK<firstreg> is always processed;
 * WK<firstreg+1> is added when numbytes >= 8, and WK<firstreg+2> and
 * WK<firstreg+3> when numbytes == 16. */
191 .macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
192         src_x888_8888_pixel cond, %(firstreg+0)
193  .if numbytes >= 8
194         src_x888_8888_pixel cond, %(firstreg+1)
195   .if numbytes == 16
196         src_x888_8888_pixel cond, %(firstreg+2)
197         src_x888_8888_pixel cond, %(firstreg+3)
198   .endif
199  .endif
200 .endm
201
/* Instantiate the x888 -> 8888 routine: plain load in the head, top byte
 * forced to 0xFF in the tail. No init, newline or cleanup work is needed, and
 * no custom inner loop is supplied. */
202 generate_composite_function \
203     pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
204     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
205     3, /* prefetch distance */ \
206     nop_macro, /* init */ \
207     nop_macro, /* newline */ \
208     nop_macro, /* cleanup */ \
209     pixman_composite_src_x888_8888_process_head, \
210     pixman_composite_src_x888_8888_process_tail
211
212 /******************************************************************************/
213
/* Init for 0565 -> 8888 conversion. Sets up the state the pixel macros rely
 * on: MASK selects the 6-bit green field in each halfword, STRIDE_M holds the
 * 0xFF000000 constant ORed into the top byte by src_0565_8888_2pixels, and the
 * GE flags are primed for SEL. SCRATCH is clobbered. */
214 .macro src_0565_8888_init
215         /* Hold loop invariants in MASK and STRIDE_M */
216         ldr     MASK, =0x07E007E0
217         mov     STRIDE_M, #0xFF000000
218         /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        /* (0x80 + 0x80 carries out of bytes 1 and 3 only) */
219         ldr     SCRATCH, =0x80008000
220         uadd8   SCRATCH, SCRATCH, SCRATCH
221 .endm
222
/* Expand the two 0565 pixels packed in WK<reg1> into two 8888 pixels.
 * On exit WK<reg1> holds the pixel that was in the low halfword (lower-case
 * letters in the bit diagrams) and WK<reg2> the pixel that was in the high
 * halfword (upper-case letters). Each field is widened to 8 bits by
 * replicating its top bits into the low bits.
 * Requires MASK = 0x07E007E0, STRIDE_M = 0xFF000000 and GE[3:0] = 1010, as
 * set up by src_0565_8888_init. Clobbers SCRATCH. */
223 .macro src_0565_8888_2pixels, reg1, reg2
224         and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
225         bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
226         orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
227         mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
228         mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
229         bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
230         orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
231         orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
232         pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
233         sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
234         mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
235         pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
236         sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
237         orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
238         orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
239 .endm
240
241 /* This version doesn't need STRIDE_M, but is one instruction longer.
242    It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
243         and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
244         bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
245         orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
246         mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
247         mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
248         bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
249         mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
250         mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
251         orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
252         orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
253         pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
254         pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
255         sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
256         sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
257         orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
258         orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
259 */
260
/* Expand the single 0565 pixel in the low halfword of WK<reg> into an 8888
 * pixel, in place. The bit diagrams assume the upper halfword of WK<reg> is
 * zero on entry. Requires MASK = 0x07E007E0 and GE[3:0] = 1010 (see
 * src_0565_8888_init); the 0xFF top byte is supplied as an immediate rather
 * than from STRIDE_M. Clobbers SCRATCH. */
261 .macro src_0565_8888_1pixel, reg
262         bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
263         and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
264         mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
265         mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
266         orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
267         orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
268         pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
269         sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
270         orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
271 .endm
272
/* Process-head for 0565 -> 8888. numbytes counts OUTPUT bytes, so only half
 * as many source bytes are loaded:
 *   16 output bytes -> 8 source bytes into WK<firstreg> and WK<firstreg+2>
 *    8 output bytes -> 4 source bytes into WK<firstreg>
 *    4 output bytes -> 2 source bytes into WK<firstreg>
 * The loads are issued with an empty condition field; cond, unaligned_mask
 * and preload are not used. */
273 .macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
274  .if numbytes == 16
275         pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
276  .elseif numbytes == 8
277         pixld   , 4, firstreg, SRC, unaligned_src
278  .elseif numbytes == 4
279         pixld   , 2, firstreg, SRC, unaligned_src
280  .endif
281 .endm
282
/* Process-tail for 0565 -> 8888: expand the packed pixels loaded by the head.
 *   16 output bytes: two pair conversions, WK<firstreg> -> firstreg/firstreg+1
 *                    and WK<firstreg+2> -> firstreg+2/firstreg+3
 *    8 output bytes: one pair conversion into firstreg/firstreg+1
 *    otherwise:      one single-pixel conversion in WK<firstreg>
 * cond is not used. */
283 .macro src_0565_8888_process_tail   cond, numbytes, firstreg
284  .if numbytes == 16
285         src_0565_8888_2pixels firstreg, %(firstreg+1)
286         src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
287  .elseif numbytes == 8
288         src_0565_8888_2pixels firstreg, %(firstreg+1)
289  .else
290         src_0565_8888_1pixel firstreg
291  .endif
292 .endm
293
/* Instantiate the 0565 -> 8888 conversion routine. Unlike the copy and fill
 * routines above it is built with FLAG_BRANCH_OVER rather than
 * FLAG_COND_EXEC; its process macros emit unconditional instructions. */
294 generate_composite_function \
295     pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
296     FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
297     3, /* prefetch distance */ \
298     src_0565_8888_init, \
299     nop_macro, /* newline */ \
300     nop_macro, /* cleanup */ \
301     src_0565_8888_process_head, \
302     src_0565_8888_process_tail
303
304 /******************************************************************************/
305
/* Byte-wise unsigned saturating add of eight 8-bit pixels: MASK is added into
 * WK<dst1> and STRIDE_M into WK<dst2>. add_8_8_process_head loads the source
 * into these two registers under the names WK4/WK5. cond is an optional
 * condition code appended to both instructions. */
306 .macro add_8_8_8pixels  cond, dst1, dst2
307         uqadd8&cond  WK&dst1, WK&dst1, MASK
308         uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
309 .endm
310
/* Byte-wise unsigned saturating add of up to four 8-bit pixels: MASK is added
 * into WK<dst>. cond is an optional condition code appended to the
 * instruction. */
311 .macro add_8_8_4pixels  cond, dst
312         uqadd8&cond  WK&dst, WK&dst, MASK
313 .endm
314
/* Process-head for ADD 8 on 8. WK4/WK5 are temporary names for MASK/STRIDE_M,
 * which receive the source bytes.
 * For 16 bytes only two source registers are available, so the work is split:
 * load the first 8 source bytes, load all 16 destination bytes, add the first
 * 8, then load the second 8 source bytes over WK4/WK5 for the tail to add.
 * For smaller sizes the head just loads source and destination; the tail
 * does the add. unaligned_mask and preload are not used. */
315 .macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
316     WK4     .req    MASK
317     WK5     .req    STRIDE_M
318  .if numbytes == 16
319         pixld   cond, 8, 4, SRC, unaligned_src
320         pixld   cond, 16, firstreg, DST, 0
321         add_8_8_8pixels cond, firstreg, %(firstreg+1)
322         pixld   cond, 8, 4, SRC, unaligned_src
323  .else
324         pixld   cond, numbytes, 4, SRC, unaligned_src
325         pixld   cond, numbytes, firstreg, DST, 0
326  .endif
327     .unreq  WK4
328     .unreq  WK5
329 .endm
330
/* Process-tail for ADD 8 on 8: perform the saturating add(s) the head left
 * outstanding.
 *   16 bytes: add the second 8 source bytes into WK<firstreg+2>/WK<firstreg+3>
 *             (the head already handled the first 8)
 *    8 bytes: add into WK<firstreg>/WK<firstreg+1>
 *   otherwise: add into WK<firstreg> only */
331 .macro add_8_8_process_tail  cond, numbytes, firstreg
332  .if numbytes == 16
333         add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
334  .elseif numbytes == 8
335         add_8_8_8pixels cond, firstreg, %(firstreg+1)
336  .else
337         add_8_8_4pixels cond, firstreg
338  .endif
339 .endm
340
/* Instantiate the ADD 8-on-8 routine. The destination is read as well as
 * written (FLAG_DST_READWRITE); no init, newline or cleanup work is needed. */
341 generate_composite_function \
342     pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
343     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
344     2, /* prefetch distance */ \
345     nop_macro, /* init */ \
346     nop_macro, /* newline */ \
347     nop_macro, /* cleanup */ \
348     add_8_8_process_head, \
349     add_8_8_process_tail
350
351 /******************************************************************************/
352
/* Init for OVER 8888 on 8888: load the 0x00800080 rounding constant into
 * MASK, prime the GE flags for SEL, and pass STRIDE_D, STRIDE_S and ORIG_W to
 * line_saved_regs (the process macros reuse them as WK4, WK5 and WK7).
 * SCRATCH is clobbered. */
353 .macro over_8888_8888_init
354         /* Hold loop invariant in MASK */
355         ldr     MASK, =0x00800080
356         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        /* (0x80 + 0x80 carries out of bytes 0 and 2 only) */
357         uadd8   SCRATCH, MASK, MASK
358         line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
359 .endm
360
/* Process-head for OVER 8888 on 8888: load numbytes of source pixels into the
 * WK registers starting at WK<4+firstreg> and numbytes of destination pixels
 * starting at WK<firstreg>. WK4-WK7 are temporary names for STRIDE_D,
 * STRIDE_S, STRIDE_M and ORIG_W. The loads are issued with an empty condition
 * field; cond, unaligned_mask and preload are not used. */
361 .macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
362     WK4     .req    STRIDE_D
363     WK5     .req    STRIDE_S
364     WK6     .req    STRIDE_M
365     WK7     .req    ORIG_W
366         pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
367         pixld   , numbytes, firstreg, DST, 0
368     .unreq  WK4
369     .unreq  WK5
370     .unreq  WK6
371     .unreq  WK7
372 .endm
373
/* Test whether a group of pixels is entirely zero. Leaves the Z flag set
 * (EQ) only if every tested register is 0; each later comparison runs only
 * while all earlier ones were equal.
 *   numbytes   4 tests reg0 only, 8 tests reg0-reg1, more tests reg0-reg3
 *   reg0-reg3  indices of the WK registers to test */
374 .macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
375         /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
376         teq     WK&reg0, #0
377  .if numbytes > 4
378         teqeq   WK&reg1, #0
379   .if numbytes > 8
380         teqeq   WK&reg2, #0
381         teqeq   WK&reg3, #0
382   .endif
383  .endif
384 .endm
385
/* Replace WK<next> with its top byte moved down to bits 0-7 (for an 8888
 * source pixel that is the alpha value over_8888_8888_1pixel expects). */
386 .macro over_8888_8888_prepare  next
387         mov     WK&next, WK&next, lsr #24
388 .endm
389
/* Blend one pixel: dst = src + dst * (255 - src_alpha) / 255, per byte, with
 * rounding and a saturating final add.
 *   src     index of the WK register holding the source alpha, already
 *           shifted down to bits 0-7; reloaded with the full source pixel
 *           part-way through
 *   dst     index of the WK register holding the destination pixel (result)
 *   offset  byte offset from SRC at which the full source pixel is re-read
 *   next    index of the WK register holding the next source pixel; its
 *           alpha is shifted down here, except when offset is -4 or above
 * Requires MASK = 0x00800080 and GE[3:0] = 0101 (see over_8888_8888_init).
 * Clobbers SCRATCH. */
390 .macro over_8888_8888_1pixel src, dst, offset, next
391         /* src = destination component multiplier */
392         rsb     WK&src, WK&src, #255
393         /* Split even/odd bytes of dst into SCRATCH/dst */
394         uxtb16  SCRATCH, WK&dst
395         uxtb16  WK&dst, WK&dst, ror #8
396         /* Multiply through, adding 0.5 to the upper byte of result for rounding */
397         mla     SCRATCH, SCRATCH, WK&src, MASK
398         mla     WK&dst, WK&dst, WK&src, MASK
399         /* Where we would have had a stall between the result of the first MLA and the shifter input,
400          * reload the complete source pixel */
401         ldr     WK&src, [SRC, #offset]
402         /* Multiply by 257/256 to approximate 256/255 */
403         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
404         /* In this stall, start processing the next pixel */
405  .if offset < -4
406         mov     WK&next, WK&next, lsr #24
407  .endif
408         uxtab16 WK&dst, WK&dst, WK&dst, ror #8
409         /* Recombine even/odd bytes of multiplied destination */
410         mov     SCRATCH, SCRATCH, ror #8
411         sel     WK&dst, SCRATCH, WK&dst
412         /* Saturated add of source to multiplied destination */
413         uqadd8  WK&dst, WK&dst, WK&src
414 .endm
415
/* Process-tail for OVER 8888 on 8888. If every loaded source pixel is zero,
 * branch to local label 10 and skip both the blend and the store. Otherwise
 * blend numbytes/4 pixels and store the result to DST.
 * PROCESS_REG walks the destination registers from firstreg (the matching
 * source register is 4 higher); PROCESS_OFF walks the offsets -numbytes..-4
 * at which each source pixel is re-read relative to SRC.
 * The TEQ chain changes the condition flags. cond is not used. */
416 .macro over_8888_8888_process_tail  cond, numbytes, firstreg
417     WK4     .req    STRIDE_D
418     WK5     .req    STRIDE_S
419     WK6     .req    STRIDE_M
420     WK7     .req    ORIG_W
421         over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
422         beq     10f
        /* Shift the first source alpha down; each 1pixel call prepares the next one */
423         over_8888_8888_prepare  %(4+firstreg)
424  .set PROCESS_REG, firstreg
425  .set PROCESS_OFF, -numbytes
426  .rept numbytes / 4
427         over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
428   .set PROCESS_REG, PROCESS_REG+1
429   .set PROCESS_OFF, PROCESS_OFF+4
430  .endr
431         pixst   , numbytes, firstreg, DST
432 10:
433     .unreq  WK4
434     .unreq  WK5
435     .unreq  WK6
436     .unreq  WK7
437 .endm
438
/* Instantiate the OVER 8888-on-8888 routine. The tail performs the store
 * itself (FLAG_PROCESS_DOES_STORE) and changes the condition flags
 * (FLAG_PROCESS_CORRUPTS_PSR).
 * Every macro argument is now explicitly comma-separated, matching the other
 * instantiations in this file; previously the destination size and the flags
 * expression relied on whitespace-only argument splitting. */
439 generate_composite_function \
440     pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
441     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
442     2, /* prefetch distance */ \
443     over_8888_8888_init, \
444     nop_macro, /* newline */ \
445     nop_macro, /* cleanup */ \
446     over_8888_8888_process_head, \
447     over_8888_8888_process_tail
448
449 /******************************************************************************/
450
451 /* Multiply each byte of a word by a byte.
452  * Useful when there aren't any obvious ways to fill the stalls with other instructions.
453  * word  Register containing 4 bytes
454  * byte  Register containing byte multiplier (bits 8-31 must be 0)
455  * tmp   Scratch register
456  * half  Register containing the constant 0x00800080
457  * GE[3:0] bits must contain 0101
458  */
/* The result replaces word; tmp is overwritten; byte and half are only read.
 * Arguments are full register names, not WK indices. */
459 .macro mul_8888_8  word, byte, tmp, half
460         /* Split even/odd bytes of word apart */
461         uxtb16  tmp, word
462         uxtb16  word, word, ror #8
463         /* Multiply bytes together with rounding, then by 257/256 */
464         mla     tmp, tmp, byte, half
465         mla     word, word, byte, half /* 1 stall follows */
466         uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
467         uxtab16 word, word, word, ror #8
468         /* Recombine bytes */
469         mov     tmp, tmp, ror #8
470         sel     word, tmp, word
471 .endm
472
473 /******************************************************************************/
474
/* Init for OVER 8888 on 8888 with a constant mask. On exit MASK holds the top
 * byte of the word at [sp, #ARGS_STACK_OFFSET+8] in bits 0-7, STRIDE_M holds
 * the 0x00800080 rounding constant, and the GE flags are primed for SEL.
 * Y, STRIDE_D, STRIDE_S and ORIG_W are passed to line_saved_regs (the process
 * macros reuse them as WK4-WK7). SCRATCH is clobbered. */
475 .macro over_8888_n_8888_init
476         /* Mask is constant */
477         ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
478         /* Hold loop invariant in STRIDE_M */
479         ldr     STRIDE_M, =0x00800080
480         /* We only want the alpha bits of the constant mask */
481         mov     MASK, MASK, lsr #24
482         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
483         uadd8   SCRATCH, STRIDE_M, STRIDE_M
484         line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
485 .endm
486
/* Process-head for OVER 8888 on 8888 with a constant mask: load numbytes of
 * source pixels starting at WK<4+(firstreg mod 2)> and numbytes of
 * destination pixels starting at WK<firstreg>. Folding firstreg modulo 2
 * keeps the first source pixels in WK4/WK5. WK4-WK7 are temporary names for
 * Y, STRIDE_D, STRIDE_S and ORIG_W. cond, unaligned_mask and preload are not
 * used. */
487 .macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
488     WK4     .req    Y
489     WK5     .req    STRIDE_D
490     WK6     .req    STRIDE_S
491     WK7     .req    ORIG_W
492         pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
493         pixld   , numbytes, firstreg, DST, 0
494     .unreq  WK4
495     .unreq  WK5
496     .unreq  WK6
497     .unreq  WK7
498 .endm
499
/* Blend one pixel with a constant mask:
 *   src' = src * MASK            (mul_8888_8)
 *   WK7  = WK6 - top byte of src'
 *   dst  = dst * WK7 + src'      (mul_8888_8, then saturating byte add)
 * src and dst are WK register indices. Expects WK6 to hold 255 (the process
 * tail sets it) and WK6/WK7 to be bound by the caller. Overwrites WK<src>,
 * WK<dst>, WK7 and SCRATCH. */
500 .macro over_8888_n_8888_1pixel src, dst
501         mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
502         sub     WK7, WK6, WK&src, lsr #24
503         mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
504         uqadd8  WK&dst, WK&dst, WK&src
505 .endm
506
/* Process-tail for OVER 8888 on 8888 with a constant mask. If every loaded
 * source pixel is zero, branch to local label 10 and skip both the blend and
 * the store. Otherwise blend numbytes/4 pixels and store them to DST.
 * WK6 and WK7 double as temporaries (WK6 = 255, WK7 = per-pixel multiplier),
 * which destroys the third and fourth source pixels of a 16-byte block; they
 * are therefore re-read from the two words just below SRC into WK4/WK5
 * before the third pixel is processed. cond is not used. */
507 .macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
508     WK4     .req    Y
509     WK5     .req    STRIDE_D
510     WK6     .req    STRIDE_S
511     WK7     .req    ORIG_W
512         over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
513         beq     10f
514         mov     WK6, #255
515  .set PROCESS_REG, firstreg
516  .rept numbytes / 4
517   .if numbytes == 16 && PROCESS_REG == 2
518         /* We're using WK6 and WK7 as temporaries, so half way through
519          * 4 pixels, reload the second two source pixels but this time
520          * into WK4 and WK5 */
521         ldmdb   SRC, {WK4, WK5}
522   .endif
523         over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
524   .set PROCESS_REG, PROCESS_REG+1
525  .endr
526         pixst   , numbytes, firstreg, DST
527 10:
528     .unreq  WK4
529     .unreq  WK5
530     .unreq  WK6
531     .unreq  WK7
532 .endm
533
/* Instantiate the OVER 8888-on-8888 routine with a constant mask. The tail
 * performs the store itself (FLAG_PROCESS_DOES_STORE) and changes the
 * condition flags (FLAG_PROCESS_CORRUPTS_PSR).
 * Every macro argument is now explicitly comma-separated, matching the other
 * instantiations in this file; previously the destination size and the flags
 * expression relied on whitespace-only argument splitting. */
534 generate_composite_function \
535     pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
536     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
537     2, /* prefetch distance */ \
538     over_8888_n_8888_init, \
539     nop_macro, /* newline */ \
540     nop_macro, /* cleanup */ \
541     over_8888_n_8888_process_head, \
542     over_8888_n_8888_process_tail
543
544 /******************************************************************************/
545
/* Init for OVER of a constant source through an 8-bit mask onto 8888.
 * The word at [sp, #ARGS_STACK_OFFSET] is split once, up front: STRIDE_S gets
 * its even bytes (0 and 2) and SRC its odd bytes (1 and 3), each zero-extended
 * to 16 bits. The GE flags are primed for SEL, and Y, STRIDE_D, STRIDE_M and
 * ORIG_W are passed to line_saved_regs. SCRATCH is clobbered. */
546 .macro over_n_8_8888_init
547         /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
548         ldr     SRC, [sp, #ARGS_STACK_OFFSET]
549         /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
550         ldr     SCRATCH, =0x00800080
551         uxtb16  STRIDE_S, SRC
552         uxtb16  SRC, SRC, ror #8
553         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
554         uadd8   SCRATCH, SCRATCH, SCRATCH
555         line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
556 .endm
557
/* Newline hook for over_n_8_8888: load the 0x00800080 rounding constant into
 * STRIDE_D (the pixel macros pass STRIDE_D as the "half" operand). The
 * literal pool is emitted right here with .ltorg, so execution branches over
 * it to local label 1. */
558 .macro over_n_8_8888_newline
559         ldr     STRIDE_D, =0x00800080
560         b       1f
561  .ltorg
562 1:
563 .endm
564
/* Process-head for over_n_8_8888: load numbytes/4 mask bytes (one per output
 * pixel) into WK4, which is a temporary name for STRIDE_M, and numbytes of
 * destination pixels starting at WK<firstreg>. cond, unaligned_src and
 * preload are not used. */
565 .macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
566     WK4     .req    STRIDE_M
567         pixld   , numbytes/4, 4, MASK, unaligned_mask
568         pixld   , numbytes, firstreg, DST, 0
569     .unreq  WK4
570 .endm
571
/* Blend one pixel of constant source through one mask byte.
 *   src  which byte of WK4 (0-3) holds this pixel's mask value
 *   dst  index of the WK register holding the destination pixel (result)
 * Steps: extract the mask byte into Y; multiply the pre-split source halves
 * (STRIDE_S = even bytes, SRC = odd bytes) by it with rounding and recombine
 * into Y; derive ORIG_W = 255 - top byte of the masked source; scale the
 * destination by ORIG_W and add the masked source with saturation.
 * Requires STRIDE_D = 0x00800080 and GE[3:0] = 0101; WK4 must be bound by the
 * caller. Overwrites Y, ORIG_W and SCRATCH. */
572 .macro over_n_8_8888_1pixel src, dst
573         uxtb    Y, WK4, ror #src*8
574         /* Trailing part of multiplication of source */
575         mla     SCRATCH, STRIDE_S, Y, STRIDE_D
576         mla     Y, SRC, Y, STRIDE_D
577         mov     ORIG_W, #255
578         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
579         uxtab16 Y, Y, Y, ror #8
580         mov     SCRATCH, SCRATCH, ror #8
581         sub     ORIG_W, ORIG_W, Y, lsr #24
582         sel     Y, SCRATCH, Y
583         /* Then multiply the destination */
584         mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
585         uqadd8  WK&dst, WK&dst, Y
586 .endm
587
/* Process-tail for over_n_8_8888. If the mask word loaded into WK4 is zero,
 * branch to local label 10 and skip both the blend and the store. Otherwise
 * blend numbytes/4 pixels - pixel i uses mask byte i of WK4 and destination
 * register WK<firstreg+i> - and store the result to DST. The TEQ changes the
 * condition flags. cond is not used. */
588 .macro over_n_8_8888_process_tail  cond, numbytes, firstreg
589     WK4     .req    STRIDE_M
590         teq     WK4, #0
591         beq     10f
592  .set PROCESS_REG, firstreg
593  .rept numbytes / 4
594         over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
595   .set PROCESS_REG, PROCESS_REG+1
596  .endr
597         pixst   , numbytes, firstreg, DST
598 10:
599     .unreq  WK4
600 .endm
601
/* Instantiate the OVER routine for a constant source through an 8-bit mask
 * onto 8888. It is the only instantiation here with a real newline hook
 * (over_n_8_8888_newline).
 * Every macro argument is now explicitly comma-separated, matching the other
 * instantiations in this file; previously the destination size and the flags
 * expression relied on whitespace-only argument splitting. */
602 generate_composite_function \
603     pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
604     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
605     2, /* prefetch distance */ \
606     over_n_8_8888_init, \
607     over_n_8_8888_newline, \
608     nop_macro, /* cleanup */ \
609     over_n_8_8888_process_head, \
610     over_n_8_8888_process_tail
611
612 /******************************************************************************/
613