Add the RT Linux 4.1.3-rt3 kernel as base
diff --git a/kernel/arch/powerpc/lib/copy_32.S b/kernel/arch/powerpc/lib/copy_32.S
new file mode 100644
index 0000000..6813f80
--- /dev/null
+++ b/kernel/arch/powerpc/lib/copy_32.S
@@ -0,0 +1,391 @@
+/*
+ * Memory copy functions for 32-bit PowerPC.
+ *
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/processor.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/ppc_asm.h>
+
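+/*
+ * COPY_16_BYTES moves one aligned 16-byte block: four word loads into
+ * r7-r10 followed by four word stores.  The update forms (lwzu/stwu)
+ * on the last pair advance r4 and r6 by 16, so the macro can simply
+ * be repeated to cover a whole cacheline.
+ */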
+#define COPY_16_BYTES          \
+       lwz     r7,4(r4);       \
+       lwz     r8,8(r4);       \
+       lwz     r9,12(r4);      \
+       lwzu    r10,16(r4);     \
+       stw     r7,4(r6);       \
+       stw     r8,8(r6);       \
+       stw     r9,12(r6);      \
+       stwu    r10,16(r6)
+
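+/*
+ * COPY_16_BYTES_WITHEX(n) is the same 16-byte copy, but each load and
+ * store carries a numeric label (8n0..8n7) so it can be listed in
+ * __ex_table.  COPY_16_BYTES_EXCODE(n) supplies the matching fixup:
+ * on a fault in block n it subtracts the 16*n bytes already copied in
+ * the current cacheline from r5 and branches to the shared read (104f)
+ * or write (105f) fault handler further down.
+ */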
+#define COPY_16_BYTES_WITHEX(n)        \
+8 ## n ## 0:                   \
+       lwz     r7,4(r4);       \
+8 ## n ## 1:                   \
+       lwz     r8,8(r4);       \
+8 ## n ## 2:                   \
+       lwz     r9,12(r4);      \
+8 ## n ## 3:                   \
+       lwzu    r10,16(r4);     \
+8 ## n ## 4:                   \
+       stw     r7,4(r6);       \
+8 ## n ## 5:                   \
+       stw     r8,8(r6);       \
+8 ## n ## 6:                   \
+       stw     r9,12(r6);      \
+8 ## n ## 7:                   \
+       stwu    r10,16(r6)
+
+#define COPY_16_BYTES_EXCODE(n)                        \
+9 ## n ## 0:                                   \
+       addi    r5,r5,-(16 * n);                \
+       b       104f;                           \
+9 ## n ## 1:                                   \
+       addi    r5,r5,-(16 * n);                \
+       b       105f;                           \
+.section __ex_table,"a";                       \
+       .align  2;                              \
+       .long   8 ## n ## 0b,9 ## n ## 0b;      \
+       .long   8 ## n ## 1b,9 ## n ## 0b;      \
+       .long   8 ## n ## 2b,9 ## n ## 0b;      \
+       .long   8 ## n ## 3b,9 ## n ## 0b;      \
+       .long   8 ## n ## 4b,9 ## n ## 1b;      \
+       .long   8 ## n ## 5b,9 ## n ## 1b;      \
+       .long   8 ## n ## 6b,9 ## n ## 1b;      \
+       .long   8 ## n ## 7b,9 ## n ## 1b;      \
+       .text
+
+       .text
+       .stabs  "arch/powerpc/lib/",N_SO,0,0,0f
+       .stabs  "copy_32.S",N_SO,0,0,0f
+0:
+
+CACHELINE_BYTES = L1_CACHE_BYTES
+LG_CACHELINE_BYTES = L1_CACHE_SHIFT
+CACHELINE_MASK = (L1_CACHE_BYTES-1)
+
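+/*
+ * void *memset(void *s, int c, size_t n): r3 = s, r4 = c, r5 = n,
+ * returns r3 unmodified.  The two rlwimi instructions replicate the
+ * low byte of r4 into all four bytes of the word so the fill can run
+ * a word at a time; any trailing sub-word bytes are stored one at a
+ * time at label 8:.
+ */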
+_GLOBAL(memset)
+       rlwimi  r4,r4,8,16,23
+       rlwimi  r4,r4,16,0,15
+       addi    r6,r3,-4
+       cmplwi  0,r5,4
+       blt     7f
+       stwu    r4,4(r6)
+       beqlr
+       andi.   r0,r6,3
+       add     r5,r0,r5
+       subf    r6,r0,r6
+       srwi    r0,r5,2
+       mtctr   r0
+       bdz     6f
+1:     stwu    r4,4(r6)
+       bdnz    1b
+6:     andi.   r5,r5,3
+7:     cmpwi   0,r5,0
+       beqlr
+       mtctr   r5
+       addi    r6,r6,3
+8:     stbu    r4,1(r6)
+       bdnz    8b
+       blr
+
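+/*
+ * void *memmove(void *dest, const void *src, size_t n):
+ * r3 = dest, r4 = src, r5 = n.  When dest lies above src a forward
+ * copy could clobber not-yet-read source bytes, so we branch to
+ * backwards_memcpy; otherwise we fall straight through into memcpy.
+ */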
+_GLOBAL(memmove)
+       cmplw   0,r3,r4
+       bgt     backwards_memcpy
+       /* fall through */
+
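+/*
+ * void *memcpy(void *dest, const void *src, size_t n):
+ * forward copy.  The destination is first brought to word alignment
+ * with a byte loop (labels 5:/6:), the bulk is moved 8 bytes per
+ * iteration with paired lwz/stw (label 1:), and the remainder is
+ * finished with at most one word plus a byte loop.
+ */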
+_GLOBAL(memcpy)
+       srwi.   r7,r5,3                 /* r7 = len >> 3 = 8-byte chunks */
+       addi    r6,r3,-4
+       addi    r4,r4,-4
+       beq     2f                      /* if less than 8 bytes to do */
+       andi.   r0,r6,3                 /* get dest word aligned */
+       mtctr   r7
+       bne     5f
+1:     lwz     r7,4(r4)
+       lwzu    r8,8(r4)
+       stw     r7,4(r6)
+       stwu    r8,8(r6)
+       bdnz    1b
+       andi.   r5,r5,7
+2:     cmplwi  0,r5,4
+       blt     3f
+       lwzu    r0,4(r4)
+       addi    r5,r5,-4
+       stwu    r0,4(r6)
+3:     cmpwi   0,r5,0
+       beqlr
+       mtctr   r5
+       addi    r4,r4,3
+       addi    r6,r6,3
+4:     lbzu    r0,1(r4)
+       stbu    r0,1(r6)
+       bdnz    4b
+       blr
+5:     subfic  r0,r0,4
+       mtctr   r0
+6:     lbz     r7,4(r4)
+       addi    r4,r4,1
+       stb     r7,4(r6)
+       addi    r6,r6,1
+       bdnz    6b
+       subf    r5,r0,r5
+       rlwinm. r7,r5,32-3,3,31
+       beq     2b
+       mtctr   r7
+       b       1b
+
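+/*
+ * backwards_memcpy: same scheme as memcpy, but r4 and r6 start at
+ * src + n and dest + n and all accesses walk downwards, which makes
+ * it safe for overlapping buffers with dest > src.
+ */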
+_GLOBAL(backwards_memcpy)
+       rlwinm. r7,r5,32-3,3,31         /* r7 = r5 >> 3 */
+       add     r6,r3,r5
+       add     r4,r4,r5
+       beq     2f
+       andi.   r0,r6,3
+       mtctr   r7
+       bne     5f
+1:     lwz     r7,-4(r4)
+       lwzu    r8,-8(r4)
+       stw     r7,-4(r6)
+       stwu    r8,-8(r6)
+       bdnz    1b
+       andi.   r5,r5,7
+2:     cmplwi  0,r5,4
+       blt     3f
+       lwzu    r0,-4(r4)
+       subi    r5,r5,4
+       stwu    r0,-4(r6)
+3:     cmpwi   0,r5,0
+       beqlr
+       mtctr   r5
+4:     lbzu    r0,-1(r4)
+       stbu    r0,-1(r6)
+       bdnz    4b
+       blr
+5:     mtctr   r0
+6:     lbzu    r7,-1(r4)
+       stbu    r7,-1(r6)
+       bdnz    6b
+       subf    r5,r0,r5
+       rlwinm. r7,r5,32-3,3,31
+       beq     2b
+       mtctr   r7
+       b       1b
+
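+/*
+ * __copy_tofrom_user(to, from, size): r3 = to, r4 = from, r5 = size.
+ * Returns the number of bytes that could NOT be copied, i.e. 0 on
+ * success.  The copy aligns the destination to a cacheline, streams
+ * whole cachelines using dcbt prefetch on the source and dcbz on the
+ * destination, then mops up the remainder with word and byte loops.
+ * Every access that can touch user memory has an __ex_table entry
+ * pointing at the fixup code after the main body.
+ */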
+_GLOBAL(__copy_tofrom_user)
+       addi    r4,r4,-4
+       addi    r6,r3,-4
+       neg     r0,r3
+       andi.   r0,r0,CACHELINE_MASK    /* # bytes to start of cache line */
+       beq     58f
+
+       cmplw   0,r5,r0                 /* is this more than total to do? */
+       blt     63f                     /* if not much to do */
+       andi.   r8,r0,3                 /* get it word-aligned first */
+       mtctr   r8
+       beq+    61f
+70:    lbz     r9,4(r4)                /* do some bytes */
+71:    stb     r9,4(r6)
+       addi    r4,r4,1
+       addi    r6,r6,1
+       bdnz    70b
+61:    subf    r5,r0,r5
+       srwi.   r0,r0,2
+       mtctr   r0
+       beq     58f
+72:    lwzu    r9,4(r4)                /* do some words */
+73:    stwu    r9,4(r6)
+       bdnz    72b
+
+       .section __ex_table,"a"
+       .align  2
+       .long   70b,100f
+       .long   71b,101f
+       .long   72b,102f
+       .long   73b,103f
+       .text
+
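+/*
+ * The table above sends faults in the initial alignment loops to
+ * labels 100-103 after the main body, where r9 records read (0) or
+ * write (1) and r3/r5 are set up so the common exit at 99: can
+ * compute the number of bytes left uncopied.
+ */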
+58:    srwi.   r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
+       clrlwi  r5,r5,32-LG_CACHELINE_BYTES
+       li      r11,4
+       beq     63f
+
+       /* Here we decide how far ahead to prefetch the source */
+       li      r3,4
+       cmpwi   r0,1
+       li      r7,0
+       ble     114f
+       li      r7,1
+#if MAX_COPY_PREFETCH > 1
+       /* Heuristically, for large transfers we prefetch
+          MAX_COPY_PREFETCH cachelines ahead.  For small transfers
+          we prefetch 1 cacheline ahead. */
+       cmpwi   r0,MAX_COPY_PREFETCH
+       ble     112f
+       li      r7,MAX_COPY_PREFETCH
+112:   mtctr   r7
+111:   dcbt    r3,r4
+       addi    r3,r3,CACHELINE_BYTES
+       bdnz    111b
+#else
+       dcbt    r3,r4
+       addi    r3,r3,CACHELINE_BYTES
+#endif /* MAX_COPY_PREFETCH > 1 */
+
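+/*
+ * Here r7 holds the chosen prefetch distance in cachelines: 0 if only
+ * one line is to be copied, MAX_COPY_PREFETCH for large copies, and 1
+ * otherwise.  The loop below runs r8 = r0 - r7 times; if r7 was
+ * non-zero we then branch back to 114: once more to copy the lines
+ * that were prefetched ahead.
+ */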
+114:   subf    r8,r7,r0
+       mr      r0,r7
+       mtctr   r8
+
+53:    dcbt    r3,r4
+54:    dcbz    r11,r6
+       .section __ex_table,"a"
+       .align  2
+       .long   54b,105f
+       .text
+/* the main body of the cacheline loop */
+       COPY_16_BYTES_WITHEX(0)
+#if L1_CACHE_BYTES >= 32
+       COPY_16_BYTES_WITHEX(1)
+#if L1_CACHE_BYTES >= 64
+       COPY_16_BYTES_WITHEX(2)
+       COPY_16_BYTES_WITHEX(3)
+#if L1_CACHE_BYTES >= 128
+       COPY_16_BYTES_WITHEX(4)
+       COPY_16_BYTES_WITHEX(5)
+       COPY_16_BYTES_WITHEX(6)
+       COPY_16_BYTES_WITHEX(7)
+#endif
+#endif
+#endif
+       bdnz    53b
+       cmpwi   r0,0
+       li      r3,4
+       li      r7,0
+       bne     114b
+
+63:    srwi.   r0,r5,2
+       mtctr   r0
+       beq     64f
+30:    lwzu    r0,4(r4)
+31:    stwu    r0,4(r6)
+       bdnz    30b
+
+64:    andi.   r0,r5,3
+       mtctr   r0
+       beq+    65f
+40:    lbz     r0,4(r4)
+41:    stb     r0,4(r6)
+       addi    r4,r4,1
+       addi    r6,r6,1
+       bdnz    40b
+65:    li      r3,0
+       blr
+
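+/*
+ * Fault fixup code.  Nothing below is reached on the normal path: the
+ * __ex_table entries redirect a faulting load or store to one of the
+ * numeric labels here, which note whether it was a read (r9 = 0) or a
+ * write (r9 = 1) fault and then work out how many bytes were left
+ * uncopied.
+ */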
+/* read fault, initial single-byte copy */
+100:   li      r9,0
+       b       90f
+/* write fault, initial single-byte copy */
+101:   li      r9,1
+90:    subf    r5,r8,r5
+       li      r3,0
+       b       99f
+/* read fault, initial word copy */
+102:   li      r9,0
+       b       91f
+/* write fault, initial word copy */
+103:   li      r9,1
+91:    li      r3,2
+       b       99f
+
+/*
+ * This code handles faults in the cacheline loop and branches to
+ * either 104f (if in the read part) or 105f (if in the write part),
+ * after updating r5 to account for the bytes already copied within
+ * the current cacheline.
+ */
+       COPY_16_BYTES_EXCODE(0)
+#if L1_CACHE_BYTES >= 32
+       COPY_16_BYTES_EXCODE(1)
+#if L1_CACHE_BYTES >= 64
+       COPY_16_BYTES_EXCODE(2)
+       COPY_16_BYTES_EXCODE(3)
+#if L1_CACHE_BYTES >= 128
+       COPY_16_BYTES_EXCODE(4)
+       COPY_16_BYTES_EXCODE(5)
+       COPY_16_BYTES_EXCODE(6)
+       COPY_16_BYTES_EXCODE(7)
+#endif
+#endif
+#endif
+
+/* read fault in cacheline loop */
+104:   li      r9,0
+       b       92f
+/* fault on dcbz (effectively a write fault) */
+/* or write fault in cacheline loop */
+105:   li      r9,1
+92:    li      r3,LG_CACHELINE_BYTES
+       mfctr   r8
+       add     r0,r0,r8
+       b       106f
+/* read fault in final word loop */
+108:   li      r9,0
+       b       93f
+/* write fault in final word loop */
+109:   li      r9,1
+93:    andi.   r5,r5,3
+       li      r3,2
+       b       99f
+/* read fault in final byte loop */
+110:   li      r9,0
+       b       94f
+/* write fault in final byte loop */
+111:   li      r9,1
+94:    li      r5,0
+       li      r3,0
+/*
+ * At this stage the number of bytes not copied is
+ * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
+ */
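+/*
+ * Illustrative example: a write fault at 31: in the word loop lands
+ * at 109/93 with r3 = 2, so every count left in ctr stands for 4
+ * bytes; with ctr = 5 words remaining and 3 trailing bytes in r5 the
+ * caller is told that 5*4 + 3 = 23 bytes were not copied.
+ */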
+99:    mfctr   r0
+106:   slw     r3,r0,r3
+       add.    r3,r3,r5
+       beq     120f                    /* shouldn't happen */
+       cmpwi   0,r9,0
+       bne     120f
+/* for a read fault, first try to continue the copy one byte at a time */
+       mtctr   r3
+130:   lbz     r0,4(r4)
+131:   stb     r0,4(r6)
+       addi    r4,r4,1
+       addi    r6,r6,1
+       bdnz    130b
+/* then clear out the destination: r3 bytes starting at 4(r6) */
+132:   mfctr   r3
+       srwi.   r0,r3,2
+       li      r9,0
+       mtctr   r0
+       beq     113f
+112:   stwu    r9,4(r6)
+       bdnz    112b
+113:   andi.   r0,r3,3
+       mtctr   r0
+       beq     120f
+114:   stb     r9,4(r6)
+       addi    r6,r6,1
+       bdnz    114b
+120:   blr
+
+       .section __ex_table,"a"
+       .align  2
+       .long   30b,108b
+       .long   31b,109b
+       .long   40b,110b
+       .long   41b,111b
+       .long   130b,132b
+       .long   131b,120b
+       .long   112b,120b
+       .long   114b,120b
+       .text