Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / arch / arm64 / crypto / aes-modes.S
diff --git a/kernel/arch/arm64/crypto/aes-modes.S b/kernel/arch/arm64/crypto/aes-modes.S
new file mode 100644 (file)
index 0000000..f6e372c
--- /dev/null
@@ -0,0 +1,532 @@
+/*
+ * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* included by aes-ce.S and aes-neon.S */
+
+       .text
+       .align          4
+
+/*
+ * There are several ways to instantiate this code:
+ * - no interleave, all inline
+ * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
+ * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
+ * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
+ * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
+ *
+ * Macros imported by this code:
+ * - enc_prepare       - setup NEON registers for encryption
+ * - dec_prepare       - setup NEON registers for decryption
+ * - enc_switch_key    - change to new key after having prepared for encryption
+ * - encrypt_block     - encrypt a single block
+ * - decrypt_block     - decrypt a single block
+ * - encrypt_block2x   - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
+ * - decrypt_block2x   - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
+ * - encrypt_block4x   - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
+ * - decrypt_block4x   - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
+ */
+
+#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
+/*
+ * Out-of-line interleave: the 2x/4x block cipher routines are emitted
+ * once below and the do_* macros reach them with 'bl'.  Since 'bl'
+ * clobbers the link register, every mode routine needs a stack frame:
+ * FRAME_PUSH/FRAME_POP save and restore x29/x30.
+ */
+#define FRAME_PUSH     stp x29, x30, [sp,#-16]! ; mov x29, sp
+#define FRAME_POP      ldp x29, x30, [sp],#16
+
+#if INTERLEAVE == 2
+
+/* encrypt the blocks in v0/v1 in parallel (w3 = rounds, x2 = round keys) */
+aes_encrypt_block2x:
+       encrypt_block2x v0, v1, w3, x2, x6, w7
+       ret
+ENDPROC(aes_encrypt_block2x)
+
+/* decrypt the blocks in v0/v1 in parallel (w3 = rounds, x2 = round keys) */
+aes_decrypt_block2x:
+       decrypt_block2x v0, v1, w3, x2, x6, w7
+       ret
+ENDPROC(aes_decrypt_block2x)
+
+#elif INTERLEAVE == 4
+
+/* encrypt the blocks in v0-v3 in parallel (w3 = rounds, x2 = round keys) */
+aes_encrypt_block4x:
+       encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+       ret
+ENDPROC(aes_encrypt_block4x)
+
+/* decrypt the blocks in v0-v3 in parallel (w3 = rounds, x2 = round keys) */
+aes_decrypt_block4x:
+       decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+       ret
+ENDPROC(aes_decrypt_block4x)
+
+#else
+#error INTERLEAVE should equal 2 or 4
+#endif
+
+       /* call out to the shared interleaved cores above */
+       .macro          do_encrypt_block2x
+       bl              aes_encrypt_block2x
+       .endm
+
+       .macro          do_decrypt_block2x
+       bl              aes_decrypt_block2x
+       .endm
+
+       .macro          do_encrypt_block4x
+       bl              aes_encrypt_block4x
+       .endm
+
+       .macro          do_decrypt_block4x
+       bl              aes_decrypt_block4x
+       .endm
+
+#else
+/*
+ * Inline variant (or no interleave at all): the block cipher macros
+ * are expanded in place, so no calls are made and no stack frame is
+ * required.
+ */
+#define FRAME_PUSH
+#define FRAME_POP
+
+       .macro          do_encrypt_block2x
+       encrypt_block2x v0, v1, w3, x2, x6, w7
+       .endm
+
+       .macro          do_decrypt_block2x
+       decrypt_block2x v0, v1, w3, x2, x6, w7
+       .endm
+
+       .macro          do_encrypt_block4x
+       encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+       .endm
+
+       .macro          do_decrypt_block4x
+       decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+       .endm
+
+#endif
+
+       /*
+        * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                 int blocks, int first)
+        * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                 int blocks, int first)
+        */
+
+/*
+ * ECB encrypt: x0 = out, x1 = in, x2 = round keys, w3 = rounds,
+ * w4 = number of 16-byte blocks, w5 = 'first' flag.  enc_prepare loads
+ * the round keys into NEON registers on the first call only; when
+ * w5 == 0 the keys are assumed to still be resident from a prior call.
+ */
+AES_ENTRY(aes_ecb_encrypt)
+       FRAME_PUSH
+       cbz             w5, .LecbencloopNx              /* keys already loaded? */
+
+       enc_prepare     w3, x2, x5
+
+.LecbencloopNx:
+#if INTERLEAVE >= 2
+       subs            w4, w4, #INTERLEAVE
+       bmi             .Lecbenc1x                      /* < INTERLEAVE blocks left */
+#if INTERLEAVE == 2
+       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
+       do_encrypt_block2x
+       st1             {v0.16b-v1.16b}, [x0], #32
+#else
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
+       do_encrypt_block4x
+       st1             {v0.16b-v3.16b}, [x0], #64
+#endif
+       b               .LecbencloopNx
+.Lecbenc1x:
+       adds            w4, w4, #INTERLEAVE             /* undo the subtraction */
+       beq             .Lecbencout
+#endif
+.Lecbencloop:
+       ld1             {v0.16b}, [x1], #16             /* get next pt block */
+       encrypt_block   v0, w3, x2, x5, w6
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1
+       bne             .Lecbencloop
+.Lecbencout:
+       FRAME_POP
+       ret
+AES_ENDPROC(aes_ecb_encrypt)
+
+
+/*
+ * ECB decrypt: same register convention as aes_ecb_encrypt above
+ * (x0 = out, x1 = in, x2 = round keys, w3 = rounds, w4 = blocks,
+ * w5 = 'first'), but using the decryption key schedule via dec_prepare.
+ */
+AES_ENTRY(aes_ecb_decrypt)
+       FRAME_PUSH
+       cbz             w5, .LecbdecloopNx              /* keys already loaded? */
+
+       dec_prepare     w3, x2, x5
+
+.LecbdecloopNx:
+#if INTERLEAVE >= 2
+       subs            w4, w4, #INTERLEAVE
+       bmi             .Lecbdec1x                      /* < INTERLEAVE blocks left */
+#if INTERLEAVE == 2
+       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
+       do_decrypt_block2x
+       st1             {v0.16b-v1.16b}, [x0], #32
+#else
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
+       do_decrypt_block4x
+       st1             {v0.16b-v3.16b}, [x0], #64
+#endif
+       b               .LecbdecloopNx
+.Lecbdec1x:
+       adds            w4, w4, #INTERLEAVE             /* undo the subtraction */
+       beq             .Lecbdecout
+#endif
+.Lecbdecloop:
+       ld1             {v0.16b}, [x1], #16             /* get next ct block */
+       decrypt_block   v0, w3, x2, x5, w6
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1
+       bne             .Lecbdecloop
+.Lecbdecout:
+       FRAME_POP
+       ret
+AES_ENDPROC(aes_ecb_decrypt)
+
+
+       /*
+        * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                 int blocks, u8 iv[], int first)
+        * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                 int blocks, u8 iv[], int first)
+        */
+
+/*
+ * CBC encrypt: x0 = out, x1 = in, x2 = round keys, w3 = rounds,
+ * w4 = blocks, x5 = iv, w6 = 'first' flag.  Each output block feeds
+ * into the next, so this mode is inherently serial: no interleaving,
+ * no calls, hence no stack frame.  When w6 == 0 the chaining value is
+ * assumed to still be live in v0 from the previous call (note the iv
+ * load is skipped) -- NOTE(review): relies on NEON state surviving
+ * between calls; confirm against the C-side caller.
+ */
+AES_ENTRY(aes_cbc_encrypt)
+       cbz             w6, .Lcbcencloop
+
+       ld1             {v0.16b}, [x5]                  /* get iv */
+       enc_prepare     w3, x2, x5
+
+.Lcbcencloop:
+       ld1             {v1.16b}, [x1], #16             /* get next pt block */
+       eor             v0.16b, v0.16b, v1.16b          /* ..and xor with iv */
+       encrypt_block   v0, w3, x2, x5, w6
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1
+       bne             .Lcbcencloop
+       ret
+AES_ENDPROC(aes_cbc_encrypt)
+
+
+/*
+ * CBC decrypt: x0 = out, x1 = in, x2 = round keys, w3 = rounds,
+ * w4 = blocks, x5 = iv, w6 = 'first' flag.  v7 carries the chaining
+ * value (iv, then previous ciphertext block) across iterations.
+ * Decryption can be interleaved because each block only needs the
+ * *input* ciphertext of its predecessor, which is saved before the
+ * in-place decrypt clobbers v0..v3.
+ */
+AES_ENTRY(aes_cbc_decrypt)
+       FRAME_PUSH
+       cbz             w6, .LcbcdecloopNx
+
+       ld1             {v7.16b}, [x5]                  /* get iv */
+       dec_prepare     w3, x2, x5
+
+.LcbcdecloopNx:
+#if INTERLEAVE >= 2
+       subs            w4, w4, #INTERLEAVE
+       bmi             .Lcbcdec1x                      /* < INTERLEAVE blocks left */
+#if INTERLEAVE == 2
+       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
+       mov             v2.16b, v0.16b                  /* save ct for chaining */
+       mov             v3.16b, v1.16b
+       do_decrypt_block2x
+       eor             v0.16b, v0.16b, v7.16b          /* xor with prev ct/iv */
+       eor             v1.16b, v1.16b, v2.16b
+       mov             v7.16b, v3.16b                  /* last ct is next iv */
+       st1             {v0.16b-v1.16b}, [x0], #32
+#else
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
+       mov             v4.16b, v0.16b                  /* save ct for chaining */
+       mov             v5.16b, v1.16b
+       mov             v6.16b, v2.16b
+       do_decrypt_block4x
+       sub             x1, x1, #16
+       eor             v0.16b, v0.16b, v7.16b          /* xor with prev ct/iv */
+       eor             v1.16b, v1.16b, v4.16b
+       ld1             {v7.16b}, [x1], #16             /* reload 1 ct block */
+       eor             v2.16b, v2.16b, v5.16b
+       eor             v3.16b, v3.16b, v6.16b
+       st1             {v0.16b-v3.16b}, [x0], #64
+#endif
+       b               .LcbcdecloopNx
+.Lcbcdec1x:
+       adds            w4, w4, #INTERLEAVE             /* undo the subtraction */
+       beq             .Lcbcdecout
+#endif
+.Lcbcdecloop:
+       ld1             {v1.16b}, [x1], #16             /* get next ct block */
+       mov             v0.16b, v1.16b                  /* ...and copy to v0 */
+       decrypt_block   v0, w3, x2, x5, w6
+       eor             v0.16b, v0.16b, v7.16b          /* xor with iv => pt */
+       mov             v7.16b, v1.16b                  /* ct is next iv */
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1
+       bne             .Lcbcdecloop
+.Lcbcdecout:
+       FRAME_POP
+       ret
+AES_ENDPROC(aes_cbc_decrypt)
+
+
+       /*
+        * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                 int blocks, u8 ctr[], int first)
+        */
+
+/*
+ * CTR: x0 = out, x1 = in, x2 = round keys, w3 = rounds, w4 = blocks,
+ * x5 = counter block, w6 = 'first' flag.  v4 holds the big-endian
+ * counter block; on entry x5 is repurposed to cache the byte-swapped
+ * (CPU-order) low 64 bits of the counter.  A negative block count
+ * (w4 < 0 after the final decrement) signals a trailing half block.
+ */
+AES_ENTRY(aes_ctr_encrypt)
+       FRAME_PUSH
+       cbnz            w6, .Lctrfirst          /* 1st time around? */
+       umov            x5, v4.d[1]             /* keep swabbed ctr in reg */
+       rev             x5, x5
+#if INTERLEAVE >= 2
+       cmn             w5, w4                  /* 32 bit overflow? */
+       bcs             .Lctrinc                /* yes: use carry-safe slow path */
+       add             x5, x5, #1              /* increment BE ctr */
+       b               .LctrincNx
+#else
+       b               .Lctrinc
+#endif
+.Lctrfirst:
+       enc_prepare     w3, x2, x6
+       ld1             {v4.16b}, [x5]
+       umov            x5, v4.d[1]             /* keep swabbed ctr in reg */
+       rev             x5, x5
+#if INTERLEAVE >= 2
+       cmn             w5, w4                  /* 32 bit overflow? */
+       bcs             .Lctrloop               /* yes: use carry-safe slow path */
+.LctrloopNx:
+       subs            w4, w4, #INTERLEAVE
+       bmi             .Lctr1x                 /* < INTERLEAVE blocks left */
+#if INTERLEAVE == 2
+       mov             v0.8b, v4.8b            /* copy invariant upper half */
+       mov             v1.8b, v4.8b
+       rev             x7, x5
+       add             x5, x5, #1
+       ins             v0.d[1], x7             /* counter for block 1 */
+       rev             x7, x5
+       add             x5, x5, #1
+       ins             v1.d[1], x7             /* counter for block 2 */
+       ld1             {v2.16b-v3.16b}, [x1], #32      /* get 2 input blocks */
+       do_encrypt_block2x
+       eor             v0.16b, v0.16b, v2.16b          /* xor keystream with input */
+       eor             v1.16b, v1.16b, v3.16b
+       st1             {v0.16b-v1.16b}, [x0], #32
+#else
+       ldr             q8, =0x30000000200000001        /* addends 1,2,3[,0] */
+       dup             v7.4s, w5                       /* broadcast low ctr word */
+       mov             v0.16b, v4.16b
+       add             v7.4s, v7.4s, v8.4s             /* ctr+1, ctr+2, ctr+3 */
+       mov             v1.16b, v4.16b
+       rev32           v8.16b, v7.16b                  /* back to big-endian */
+       mov             v2.16b, v4.16b
+       mov             v3.16b, v4.16b
+       mov             v1.s[3], v8.s[0]                /* patch in per-block ctrs */
+       mov             v2.s[3], v8.s[1]
+       mov             v3.s[3], v8.s[2]
+       ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
+       do_encrypt_block4x
+       eor             v0.16b, v5.16b, v0.16b          /* xor keystream with input */
+       ld1             {v5.16b}, [x1], #16             /* get 1 input block  */
+       eor             v1.16b, v6.16b, v1.16b
+       eor             v2.16b, v7.16b, v2.16b
+       eor             v3.16b, v5.16b, v3.16b
+       st1             {v0.16b-v3.16b}, [x0], #64
+       add             x5, x5, #INTERLEAVE
+#endif
+       cbz             w4, .LctroutNx
+.LctrincNx:
+       rev             x7, x5
+       ins             v4.d[1], x7             /* write ctr back into v4 */
+       b               .LctrloopNx
+.LctroutNx:
+       sub             x5, x5, #1              /* went one increment too far */
+       rev             x7, x5
+       ins             v4.d[1], x7
+       b               .Lctrout
+.Lctr1x:
+       adds            w4, w4, #INTERLEAVE     /* undo the subtraction */
+       beq             .Lctrout
+#endif
+.Lctrloop:
+       mov             v0.16b, v4.16b
+       encrypt_block   v0, w3, x2, x6, w7
+       subs            w4, w4, #1
+       bmi             .Lctrhalfblock          /* blocks < 0 means 1/2 block */
+       ld1             {v3.16b}, [x1], #16
+       eor             v3.16b, v0.16b, v3.16b
+       st1             {v3.16b}, [x0], #16
+       beq             .Lctrout
+.Lctrinc:
+       adds            x5, x5, #1              /* increment BE ctr */
+       rev             x7, x5
+       ins             v4.d[1], x7
+       bcc             .Lctrloop               /* no overflow? */
+       umov            x7, v4.d[0]             /* load upper word of ctr  */
+       rev             x7, x7                  /* ... to handle the carry */
+       add             x7, x7, #1
+       rev             x7, x7
+       ins             v4.d[0], x7
+       b               .Lctrloop
+.Lctrhalfblock:
+       ld1             {v3.8b}, [x1]           /* final 8-byte partial block */
+       eor             v3.8b, v0.8b, v3.8b
+       st1             {v3.8b}, [x0]
+.Lctrout:
+       FRAME_POP
+       ret
+AES_ENDPROC(aes_ctr_encrypt)
+       .ltorg                                  /* literal pool for 'ldr q8, =...' */
+
+
+       /*
+        * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+        *                 int blocks, u8 const rk2[], u8 iv[], int first)
+        * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+        *                 int blocks, u8 const rk2[], u8 iv[], int first)
+        */
+
+       /*
+        * Advance the XTS tweak: multiply \in by x in GF(2^128), i.e.
+        * shift the 128-bit value left by one bit and, if the top bit
+        * was set, xor the low byte with the reduction constant 0x87.
+        * \const must hold .Lxts_mul_x; \tmp is clobbered.  The sshr
+        * builds a per-doubleword carry mask, the add doubles both
+        * halves, and the ext swaps the mask halves so the low half's
+        * carry lands in the high half while the high half's carry
+        * becomes the 0x87 reduction into the low half.
+        */
+       .macro          next_tweak, out, in, const, tmp
+       sshr            \tmp\().2d,  \in\().2d,   #63
+       and             \tmp\().16b, \tmp\().16b, \const\().16b
+       add             \out\().2d,  \in\().2d,   \in\().2d
+       ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+       eor             \out\().16b, \out\().16b, \tmp\().16b
+       .endm
+
+/* XTS multiplier: low dword = 1 (inter-half carry), high dword = 0x87 */
+.Lxts_mul_x:
+       .word           1, 0, 0x87, 0
+
+/*
+ * XTS encrypt: x0 = out, x1 = in, x2 = rk1 (data key), w3 = rounds,
+ * w4 = blocks, x5 = rk2 (tweak key), x6 = iv, w7 = 'first' flag.
+ * On the first call the iv is encrypted with rk2 to form the initial
+ * tweak, then enc_switch_key swaps in the data key.  v4 carries the
+ * current tweak across calls; v7 holds the .Lxts_mul_x constant.
+ */
+AES_ENTRY(aes_xts_encrypt)
+       FRAME_PUSH
+       cbz             w7, .LxtsencloopNx
+
+       ld1             {v4.16b}, [x6]
+       enc_prepare     w3, x5, x6
+       encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
+       enc_switch_key  w3, x2, x6
+       ldr             q7, .Lxts_mul_x
+       b               .LxtsencNx
+
+.LxtsencloopNx:
+       ldr             q7, .Lxts_mul_x                 /* reload: 4-way path reuses v7 */
+       next_tweak      v4, v4, v7, v8
+.LxtsencNx:
+#if INTERLEAVE >= 2
+       subs            w4, w4, #INTERLEAVE
+       bmi             .Lxtsenc1x                      /* < INTERLEAVE blocks left */
+#if INTERLEAVE == 2
+       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
+       next_tweak      v5, v4, v7, v8
+       eor             v0.16b, v0.16b, v4.16b          /* pre-whiten with tweaks */
+       eor             v1.16b, v1.16b, v5.16b
+       do_encrypt_block2x
+       eor             v0.16b, v0.16b, v4.16b          /* post-whiten with tweaks */
+       eor             v1.16b, v1.16b, v5.16b
+       st1             {v0.16b-v1.16b}, [x0], #32
+       cbz             w4, .LxtsencoutNx
+       next_tweak      v4, v5, v7, v8
+       b               .LxtsencNx
+.LxtsencoutNx:
+       mov             v4.16b, v5.16b                  /* save last tweak */
+       b               .Lxtsencout
+#else
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
+       next_tweak      v5, v4, v7, v8
+       eor             v0.16b, v0.16b, v4.16b          /* pre-whiten with tweaks */
+       next_tweak      v6, v5, v7, v8
+       eor             v1.16b, v1.16b, v5.16b
+       eor             v2.16b, v2.16b, v6.16b
+       next_tweak      v7, v6, v7, v8                  /* clobbers the constant */
+       eor             v3.16b, v3.16b, v7.16b
+       do_encrypt_block4x
+       eor             v3.16b, v3.16b, v7.16b          /* post-whiten with tweaks */
+       eor             v0.16b, v0.16b, v4.16b
+       eor             v1.16b, v1.16b, v5.16b
+       eor             v2.16b, v2.16b, v6.16b
+       st1             {v0.16b-v3.16b}, [x0], #64
+       mov             v4.16b, v7.16b                  /* save last tweak */
+       cbz             w4, .Lxtsencout
+       b               .LxtsencloopNx
+#endif
+.Lxtsenc1x:
+       adds            w4, w4, #INTERLEAVE             /* undo the subtraction */
+       beq             .Lxtsencout
+#endif
+.Lxtsencloop:
+       ld1             {v1.16b}, [x1], #16
+       eor             v0.16b, v1.16b, v4.16b          /* pre-whiten with tweak */
+       encrypt_block   v0, w3, x2, x6, w7
+       eor             v0.16b, v0.16b, v4.16b          /* post-whiten with tweak */
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1
+       beq             .Lxtsencout
+       next_tweak      v4, v4, v7, v8
+       b               .Lxtsencloop
+.Lxtsencout:
+       FRAME_POP
+       ret
+AES_ENDPROC(aes_xts_encrypt)
+
+
+/*
+ * XTS decrypt: same register convention as aes_xts_encrypt (x0 = out,
+ * x1 = in, x2 = rk1, w3 = rounds, w4 = blocks, x5 = rk2, x6 = iv,
+ * w7 = 'first').  The tweak is still produced by *encrypting* the iv
+ * with rk2; only the data blocks go through the decryption schedule
+ * (dec_prepare with rk1).
+ */
+AES_ENTRY(aes_xts_decrypt)
+       FRAME_PUSH
+       cbz             w7, .LxtsdecloopNx
+
+       ld1             {v4.16b}, [x6]
+       enc_prepare     w3, x5, x6
+       encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
+       dec_prepare     w3, x2, x6
+       ldr             q7, .Lxts_mul_x
+       b               .LxtsdecNx
+
+.LxtsdecloopNx:
+       ldr             q7, .Lxts_mul_x                 /* reload: 4-way path reuses v7 */
+       next_tweak      v4, v4, v7, v8
+.LxtsdecNx:
+#if INTERLEAVE >= 2
+       subs            w4, w4, #INTERLEAVE
+       bmi             .Lxtsdec1x                      /* < INTERLEAVE blocks left */
+#if INTERLEAVE == 2
+       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
+       next_tweak      v5, v4, v7, v8
+       eor             v0.16b, v0.16b, v4.16b          /* pre-whiten with tweaks */
+       eor             v1.16b, v1.16b, v5.16b
+       do_decrypt_block2x
+       eor             v0.16b, v0.16b, v4.16b          /* post-whiten with tweaks */
+       eor             v1.16b, v1.16b, v5.16b
+       st1             {v0.16b-v1.16b}, [x0], #32
+       cbz             w4, .LxtsdecoutNx
+       next_tweak      v4, v5, v7, v8
+       b               .LxtsdecNx
+.LxtsdecoutNx:
+       mov             v4.16b, v5.16b                  /* save last tweak */
+       b               .Lxtsdecout
+#else
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
+       next_tweak      v5, v4, v7, v8
+       eor             v0.16b, v0.16b, v4.16b          /* pre-whiten with tweaks */
+       next_tweak      v6, v5, v7, v8
+       eor             v1.16b, v1.16b, v5.16b
+       eor             v2.16b, v2.16b, v6.16b
+       next_tweak      v7, v6, v7, v8                  /* clobbers the constant */
+       eor             v3.16b, v3.16b, v7.16b
+       do_decrypt_block4x
+       eor             v3.16b, v3.16b, v7.16b          /* post-whiten with tweaks */
+       eor             v0.16b, v0.16b, v4.16b
+       eor             v1.16b, v1.16b, v5.16b
+       eor             v2.16b, v2.16b, v6.16b
+       st1             {v0.16b-v3.16b}, [x0], #64
+       mov             v4.16b, v7.16b                  /* save last tweak */
+       cbz             w4, .Lxtsdecout
+       b               .LxtsdecloopNx
+#endif
+.Lxtsdec1x:
+       adds            w4, w4, #INTERLEAVE             /* undo the subtraction */
+       beq             .Lxtsdecout
+#endif
+.Lxtsdecloop:
+       ld1             {v1.16b}, [x1], #16
+       eor             v0.16b, v1.16b, v4.16b          /* pre-whiten with tweak */
+       decrypt_block   v0, w3, x2, x6, w7
+       eor             v0.16b, v0.16b, v4.16b          /* post-whiten with tweak */
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1
+       beq             .Lxtsdecout
+       next_tweak      v4, v4, v7, v8
+       b               .Lxtsdecloop
+.Lxtsdecout:
+       FRAME_POP
+       ret
+AES_ENDPROC(aes_xts_decrypt)