/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 32

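# ROT8 and ROT16 are vpshufb byte-shuffle masks: within every 32-bit lane
# they select bytes (3,0,1,2) and (2,3,0,1) respectively, i.e. a left
# rotation of each dword by 8 and by 16 bits. CTRINC holds the dwords
# 0..7, the per-block increments added to the counter word of the eight
# parallel states.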
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
        .octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
        .octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC: .octa 0x00000003000000020000000100000000
        .octa 0x00000007000000060000000500000004

.text

ENTRY(chacha20_8block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: 8 data blocks output, o
        # %rdx: 8 data blocks input, i
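        # The routine reads and writes exactly 8 * 64 = 512 bytes and
        # clobbers %ecx, %r8 and all of ymm0-ymm15.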

        # This function encrypts eight consecutive ChaCha20 blocks by loading
        # the state matrix in AVX registers eight times. As we need some
        # scratch registers, we save the first four registers on the stack. The
        # algorithm performs each operation on the corresponding word of each
        # state matrix, hence requires no word shuffling. For the final XORing
        # step we transpose the matrix by interleaving 32-, 64- and then 128-bit
        # words, which allows us to do the XOR in AVX registers. 8/16-bit word
        # rotation is done with the slightly better performing byte shuffling,
        # 7/12-bit word rotation uses traditional shift+OR.
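        #
        # For reference, one ChaCha20 quarter-round on words (a, b, c, d) is:
        #   a += b; d ^= a; d = rotl32(d, 16)
        #   c += d; b ^= c; b = rotl32(b, 12)
        #   a += b; d ^= a; d = rotl32(d,  8)
        #   c += d; b ^= c; b = rotl32(b,  7)
        # Here every ymm register holds the same state word of all eight
        # blocks, so each vector instruction below advances that step of the
        # quarter-round for all eight blocks at once.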

        vzeroupper
        # 4 * 32 byte stack, 32-byte aligned
        mov             %rsp, %r8
        and             $~31, %rsp
        sub             $0x80, %rsp
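        # %r8 keeps the caller's %rsp so it can be restored before ret,
        # since the stack pointer is realigned above.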

        # x0..15[0-7] = s[0..15]
        vpbroadcastd    0x00(%rdi),%ymm0
        vpbroadcastd    0x04(%rdi),%ymm1
        vpbroadcastd    0x08(%rdi),%ymm2
        vpbroadcastd    0x0c(%rdi),%ymm3
        vpbroadcastd    0x10(%rdi),%ymm4
        vpbroadcastd    0x14(%rdi),%ymm5
        vpbroadcastd    0x18(%rdi),%ymm6
        vpbroadcastd    0x1c(%rdi),%ymm7
        vpbroadcastd    0x20(%rdi),%ymm8
        vpbroadcastd    0x24(%rdi),%ymm9
        vpbroadcastd    0x28(%rdi),%ymm10
        vpbroadcastd    0x2c(%rdi),%ymm11
        vpbroadcastd    0x30(%rdi),%ymm12
        vpbroadcastd    0x34(%rdi),%ymm13
        vpbroadcastd    0x38(%rdi),%ymm14
        vpbroadcastd    0x3c(%rdi),%ymm15
        # x0..3 on stack
        vmovdqa         %ymm0,0x00(%rsp)
        vmovdqa         %ymm1,0x20(%rsp)
        vmovdqa         %ymm2,0x40(%rsp)
        vmovdqa         %ymm3,0x60(%rsp)

        vmovdqa         CTRINC(%rip),%ymm1
        vmovdqa         ROT8(%rip),%ymm2
        vmovdqa         ROT16(%rip),%ymm3

        # x12 += counter values 0-7
        vpaddd          %ymm1,%ymm12,%ymm12

        mov             $10,%ecx

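        # Each pass of the loop below performs one "double round": a column
        # round over (x0,x4,x8,x12)..(x3,x7,x11,x15) followed by a diagonal
        # round over (x0,x5,x10,x15)..(x3,x4,x9,x14). Ten passes give the
        # 20 rounds of ChaCha20.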
.Ldoubleround8:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        vpaddd          0x00(%rsp),%ymm4,%ymm0
        vmovdqa         %ymm0,0x00(%rsp)
        vpxor           %ymm0,%ymm12,%ymm12
        vpshufb         %ymm3,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        vpaddd          0x20(%rsp),%ymm5,%ymm0
        vmovdqa         %ymm0,0x20(%rsp)
        vpxor           %ymm0,%ymm13,%ymm13
        vpshufb         %ymm3,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        vpaddd          0x40(%rsp),%ymm6,%ymm0
        vmovdqa         %ymm0,0x40(%rsp)
        vpxor           %ymm0,%ymm14,%ymm14
        vpshufb         %ymm3,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vpaddd          0x60(%rsp),%ymm7,%ymm0
        vmovdqa         %ymm0,0x60(%rsp)
        vpxor           %ymm0,%ymm15,%ymm15
        vpshufb         %ymm3,%ymm15,%ymm15

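        # AVX2 has no packed dword-rotate instruction, so rotations by 12 and
        # by 7 are built from shift-left, shift-right and OR; only the 8- and
        # 16-bit rotations can use the vpshufb byte shuffles loaded above.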
        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        vpaddd          %ymm12,%ymm8,%ymm8
        vpxor           %ymm8,%ymm4,%ymm4
        vpslld          $12,%ymm4,%ymm0
        vpsrld          $20,%ymm4,%ymm4
        vpor            %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        vpaddd          %ymm13,%ymm9,%ymm9
        vpxor           %ymm9,%ymm5,%ymm5
        vpslld          $12,%ymm5,%ymm0
        vpsrld          $20,%ymm5,%ymm5
        vpor            %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        vpaddd          %ymm14,%ymm10,%ymm10
        vpxor           %ymm10,%ymm6,%ymm6
        vpslld          $12,%ymm6,%ymm0
        vpsrld          $20,%ymm6,%ymm6
        vpor            %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vpaddd          %ymm15,%ymm11,%ymm11
        vpxor           %ymm11,%ymm7,%ymm7
        vpslld          $12,%ymm7,%ymm0
        vpsrld          $20,%ymm7,%ymm7
        vpor            %ymm0,%ymm7,%ymm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        vpaddd          0x00(%rsp),%ymm4,%ymm0
        vmovdqa         %ymm0,0x00(%rsp)
        vpxor           %ymm0,%ymm12,%ymm12
        vpshufb         %ymm2,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        vpaddd          0x20(%rsp),%ymm5,%ymm0
        vmovdqa         %ymm0,0x20(%rsp)
        vpxor           %ymm0,%ymm13,%ymm13
        vpshufb         %ymm2,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        vpaddd          0x40(%rsp),%ymm6,%ymm0
        vmovdqa         %ymm0,0x40(%rsp)
        vpxor           %ymm0,%ymm14,%ymm14
        vpshufb         %ymm2,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vpaddd          0x60(%rsp),%ymm7,%ymm0
        vmovdqa         %ymm0,0x60(%rsp)
        vpxor           %ymm0,%ymm15,%ymm15
        vpshufb         %ymm2,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        vpaddd          %ymm12,%ymm8,%ymm8
        vpxor           %ymm8,%ymm4,%ymm4
        vpslld          $7,%ymm4,%ymm0
        vpsrld          $25,%ymm4,%ymm4
        vpor            %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        vpaddd          %ymm13,%ymm9,%ymm9
        vpxor           %ymm9,%ymm5,%ymm5
        vpslld          $7,%ymm5,%ymm0
        vpsrld          $25,%ymm5,%ymm5
        vpor            %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        vpaddd          %ymm14,%ymm10,%ymm10
        vpxor           %ymm10,%ymm6,%ymm6
        vpslld          $7,%ymm6,%ymm0
        vpsrld          $25,%ymm6,%ymm6
        vpor            %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vpaddd          %ymm15,%ymm11,%ymm11
        vpxor           %ymm11,%ymm7,%ymm7
        vpslld          $7,%ymm7,%ymm0
        vpsrld          $25,%ymm7,%ymm7
        vpor            %ymm0,%ymm7,%ymm7

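        # Second half of the double round: the same quarter-round pattern,
        # now applied to the diagonals (x0,x5,x10,x15), (x1,x6,x11,x12),
        # (x2,x7,x8,x13) and (x3,x4,x9,x14).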
        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        vpaddd          0x00(%rsp),%ymm5,%ymm0
        vmovdqa         %ymm0,0x00(%rsp)
        vpxor           %ymm0,%ymm15,%ymm15
        vpshufb         %ymm3,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        vpaddd          0x20(%rsp),%ymm6,%ymm0
        vmovdqa         %ymm0,0x20(%rsp)
        vpxor           %ymm0,%ymm12,%ymm12
        vpshufb         %ymm3,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        vpaddd          0x40(%rsp),%ymm7,%ymm0
        vmovdqa         %ymm0,0x40(%rsp)
        vpxor           %ymm0,%ymm13,%ymm13
        vpshufb         %ymm3,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vpaddd          0x60(%rsp),%ymm4,%ymm0
        vmovdqa         %ymm0,0x60(%rsp)
        vpxor           %ymm0,%ymm14,%ymm14
        vpshufb         %ymm3,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        vpaddd          %ymm15,%ymm10,%ymm10
        vpxor           %ymm10,%ymm5,%ymm5
        vpslld          $12,%ymm5,%ymm0
        vpsrld          $20,%ymm5,%ymm5
        vpor            %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        vpaddd          %ymm12,%ymm11,%ymm11
        vpxor           %ymm11,%ymm6,%ymm6
        vpslld          $12,%ymm6,%ymm0
        vpsrld          $20,%ymm6,%ymm6
        vpor            %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        vpaddd          %ymm13,%ymm8,%ymm8
        vpxor           %ymm8,%ymm7,%ymm7
        vpslld          $12,%ymm7,%ymm0
        vpsrld          $20,%ymm7,%ymm7
        vpor            %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vpaddd          %ymm14,%ymm9,%ymm9
        vpxor           %ymm9,%ymm4,%ymm4
        vpslld          $12,%ymm4,%ymm0
        vpsrld          $20,%ymm4,%ymm4
        vpor            %ymm0,%ymm4,%ymm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        vpaddd          0x00(%rsp),%ymm5,%ymm0
        vmovdqa         %ymm0,0x00(%rsp)
        vpxor           %ymm0,%ymm15,%ymm15
        vpshufb         %ymm2,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        vpaddd          0x20(%rsp),%ymm6,%ymm0
        vmovdqa         %ymm0,0x20(%rsp)
        vpxor           %ymm0,%ymm12,%ymm12
        vpshufb         %ymm2,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        vpaddd          0x40(%rsp),%ymm7,%ymm0
        vmovdqa         %ymm0,0x40(%rsp)
        vpxor           %ymm0,%ymm13,%ymm13
        vpshufb         %ymm2,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vpaddd          0x60(%rsp),%ymm4,%ymm0
        vmovdqa         %ymm0,0x60(%rsp)
        vpxor           %ymm0,%ymm14,%ymm14
        vpshufb         %ymm2,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        vpaddd          %ymm15,%ymm10,%ymm10
        vpxor           %ymm10,%ymm5,%ymm5
        vpslld          $7,%ymm5,%ymm0
        vpsrld          $25,%ymm5,%ymm5
        vpor            %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        vpaddd          %ymm12,%ymm11,%ymm11
        vpxor           %ymm11,%ymm6,%ymm6
        vpslld          $7,%ymm6,%ymm0
        vpsrld          $25,%ymm6,%ymm6
        vpor            %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        vpaddd          %ymm13,%ymm8,%ymm8
        vpxor           %ymm8,%ymm7,%ymm7
        vpslld          $7,%ymm7,%ymm0
        vpsrld          $25,%ymm7,%ymm7
        vpor            %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vpaddd          %ymm14,%ymm9,%ymm9
        vpxor           %ymm9,%ymm4,%ymm4
        vpslld          $7,%ymm4,%ymm0
        vpsrld          $25,%ymm4,%ymm4
        vpor            %ymm0,%ymm4,%ymm4

        dec             %ecx
        jnz             .Ldoubleround8

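        # Feed-forward: add the original input state back into the permuted
        # state, as the ChaCha20 block function requires, to form the
        # keystream words.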
        # x0..15[0-7] += s[0..15]
        vpbroadcastd    0x00(%rdi),%ymm0
        vpaddd          0x00(%rsp),%ymm0,%ymm0
        vmovdqa         %ymm0,0x00(%rsp)
        vpbroadcastd    0x04(%rdi),%ymm0
        vpaddd          0x20(%rsp),%ymm0,%ymm0
        vmovdqa         %ymm0,0x20(%rsp)
        vpbroadcastd    0x08(%rdi),%ymm0
        vpaddd          0x40(%rsp),%ymm0,%ymm0
        vmovdqa         %ymm0,0x40(%rsp)
        vpbroadcastd    0x0c(%rdi),%ymm0
        vpaddd          0x60(%rsp),%ymm0,%ymm0
        vmovdqa         %ymm0,0x60(%rsp)
        vpbroadcastd    0x10(%rdi),%ymm0
        vpaddd          %ymm0,%ymm4,%ymm4
        vpbroadcastd    0x14(%rdi),%ymm0
        vpaddd          %ymm0,%ymm5,%ymm5
        vpbroadcastd    0x18(%rdi),%ymm0
        vpaddd          %ymm0,%ymm6,%ymm6
        vpbroadcastd    0x1c(%rdi),%ymm0
        vpaddd          %ymm0,%ymm7,%ymm7
        vpbroadcastd    0x20(%rdi),%ymm0
        vpaddd          %ymm0,%ymm8,%ymm8
        vpbroadcastd    0x24(%rdi),%ymm0
        vpaddd          %ymm0,%ymm9,%ymm9
        vpbroadcastd    0x28(%rdi),%ymm0
        vpaddd          %ymm0,%ymm10,%ymm10
        vpbroadcastd    0x2c(%rdi),%ymm0
        vpaddd          %ymm0,%ymm11,%ymm11
        vpbroadcastd    0x30(%rdi),%ymm0
        vpaddd          %ymm0,%ymm12,%ymm12
        vpbroadcastd    0x34(%rdi),%ymm0
        vpaddd          %ymm0,%ymm13,%ymm13
        vpbroadcastd    0x38(%rdi),%ymm0
        vpaddd          %ymm0,%ymm14,%ymm14
        vpbroadcastd    0x3c(%rdi),%ymm0
        vpaddd          %ymm0,%ymm15,%ymm15

        # x12 += counter values 0-7
        vpaddd          %ymm1,%ymm12,%ymm12

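        # At this point register/stack row n holds word n of all eight
        # keystream blocks. The 32-, 64- and 128-bit interleaves below
        # transpose this 16x8 dword matrix so that each ymm register (or
        # spilled row) ends up holding a contiguous 32-byte half of one
        # 64-byte keystream block.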
        # interleave 32-bit words in state n, n+1
        vmovdqa         0x00(%rsp),%ymm0
        vmovdqa         0x20(%rsp),%ymm1
        vpunpckldq      %ymm1,%ymm0,%ymm2
        vpunpckhdq      %ymm1,%ymm0,%ymm1
        vmovdqa         %ymm2,0x00(%rsp)
        vmovdqa         %ymm1,0x20(%rsp)
        vmovdqa         0x40(%rsp),%ymm0
        vmovdqa         0x60(%rsp),%ymm1
        vpunpckldq      %ymm1,%ymm0,%ymm2
        vpunpckhdq      %ymm1,%ymm0,%ymm1
        vmovdqa         %ymm2,0x40(%rsp)
        vmovdqa         %ymm1,0x60(%rsp)
        vmovdqa         %ymm4,%ymm0
        vpunpckldq      %ymm5,%ymm0,%ymm4
        vpunpckhdq      %ymm5,%ymm0,%ymm5
        vmovdqa         %ymm6,%ymm0
        vpunpckldq      %ymm7,%ymm0,%ymm6
        vpunpckhdq      %ymm7,%ymm0,%ymm7
        vmovdqa         %ymm8,%ymm0
        vpunpckldq      %ymm9,%ymm0,%ymm8
        vpunpckhdq      %ymm9,%ymm0,%ymm9
        vmovdqa         %ymm10,%ymm0
        vpunpckldq      %ymm11,%ymm0,%ymm10
        vpunpckhdq      %ymm11,%ymm0,%ymm11
        vmovdqa         %ymm12,%ymm0
        vpunpckldq      %ymm13,%ymm0,%ymm12
        vpunpckhdq      %ymm13,%ymm0,%ymm13
        vmovdqa         %ymm14,%ymm0
        vpunpckldq      %ymm15,%ymm0,%ymm14
        vpunpckhdq      %ymm15,%ymm0,%ymm15

        # interleave 64-bit words in state n, n+2
        vmovdqa         0x00(%rsp),%ymm0
        vmovdqa         0x40(%rsp),%ymm2
        vpunpcklqdq     %ymm2,%ymm0,%ymm1
        vpunpckhqdq     %ymm2,%ymm0,%ymm2
        vmovdqa         %ymm1,0x00(%rsp)
        vmovdqa         %ymm2,0x40(%rsp)
        vmovdqa         0x20(%rsp),%ymm0
        vmovdqa         0x60(%rsp),%ymm2
        vpunpcklqdq     %ymm2,%ymm0,%ymm1
        vpunpckhqdq     %ymm2,%ymm0,%ymm2
        vmovdqa         %ymm1,0x20(%rsp)
        vmovdqa         %ymm2,0x60(%rsp)
        vmovdqa         %ymm4,%ymm0
        vpunpcklqdq     %ymm6,%ymm0,%ymm4
        vpunpckhqdq     %ymm6,%ymm0,%ymm6
        vmovdqa         %ymm5,%ymm0
        vpunpcklqdq     %ymm7,%ymm0,%ymm5
        vpunpckhqdq     %ymm7,%ymm0,%ymm7
        vmovdqa         %ymm8,%ymm0
        vpunpcklqdq     %ymm10,%ymm0,%ymm8
        vpunpckhqdq     %ymm10,%ymm0,%ymm10
        vmovdqa         %ymm9,%ymm0
        vpunpcklqdq     %ymm11,%ymm0,%ymm9
        vpunpckhqdq     %ymm11,%ymm0,%ymm11
        vmovdqa         %ymm12,%ymm0
        vpunpcklqdq     %ymm14,%ymm0,%ymm12
        vpunpckhqdq     %ymm14,%ymm0,%ymm14
        vmovdqa         %ymm13,%ymm0
        vpunpcklqdq     %ymm15,%ymm0,%ymm13
        vpunpckhqdq     %ymm15,%ymm0,%ymm15

        # interleave 128-bit words in state n, n+4
        vmovdqa         0x00(%rsp),%ymm0
        vperm2i128      $0x20,%ymm4,%ymm0,%ymm1
        vperm2i128      $0x31,%ymm4,%ymm0,%ymm4
        vmovdqa         %ymm1,0x00(%rsp)
        vmovdqa         0x20(%rsp),%ymm0
        vperm2i128      $0x20,%ymm5,%ymm0,%ymm1
        vperm2i128      $0x31,%ymm5,%ymm0,%ymm5
        vmovdqa         %ymm1,0x20(%rsp)
        vmovdqa         0x40(%rsp),%ymm0
        vperm2i128      $0x20,%ymm6,%ymm0,%ymm1
        vperm2i128      $0x31,%ymm6,%ymm0,%ymm6
        vmovdqa         %ymm1,0x40(%rsp)
        vmovdqa         0x60(%rsp),%ymm0
        vperm2i128      $0x20,%ymm7,%ymm0,%ymm1
        vperm2i128      $0x31,%ymm7,%ymm0,%ymm7
        vmovdqa         %ymm1,0x60(%rsp)
        vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
        vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
        vmovdqa         %ymm0,%ymm8
        vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
        vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
        vmovdqa         %ymm0,%ymm9
        vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
        vperm2i128      $0x31,%ymm14,%ymm10,%ymm14
        vmovdqa         %ymm0,%ymm10
        vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
        vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
        vmovdqa         %ymm0,%ymm11

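        # Each 64-byte block n occupies output bytes n*0x40..n*0x40+0x3f; its
        # low 32 bytes come from a stack row or ymm4-ymm7 and its high 32
        # bytes from ymm8-ymm15, which is why the store offsets below are not
        # sequential in register order.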
        # xor with corresponding input, write to output
        vmovdqa         0x00(%rsp),%ymm0
        vpxor           0x0000(%rdx),%ymm0,%ymm0
        vmovdqu         %ymm0,0x0000(%rsi)
        vmovdqa         0x20(%rsp),%ymm0
        vpxor           0x0080(%rdx),%ymm0,%ymm0
        vmovdqu         %ymm0,0x0080(%rsi)
        vmovdqa         0x40(%rsp),%ymm0
        vpxor           0x0040(%rdx),%ymm0,%ymm0
        vmovdqu         %ymm0,0x0040(%rsi)
        vmovdqa         0x60(%rsp),%ymm0
        vpxor           0x00c0(%rdx),%ymm0,%ymm0
        vmovdqu         %ymm0,0x00c0(%rsi)
        vpxor           0x0100(%rdx),%ymm4,%ymm4
        vmovdqu         %ymm4,0x0100(%rsi)
        vpxor           0x0180(%rdx),%ymm5,%ymm5
        vmovdqu         %ymm5,0x0180(%rsi)
        vpxor           0x0140(%rdx),%ymm6,%ymm6
        vmovdqu         %ymm6,0x0140(%rsi)
        vpxor           0x01c0(%rdx),%ymm7,%ymm7
        vmovdqu         %ymm7,0x01c0(%rsi)
        vpxor           0x0020(%rdx),%ymm8,%ymm8
        vmovdqu         %ymm8,0x0020(%rsi)
        vpxor           0x00a0(%rdx),%ymm9,%ymm9
        vmovdqu         %ymm9,0x00a0(%rsi)
        vpxor           0x0060(%rdx),%ymm10,%ymm10
        vmovdqu         %ymm10,0x0060(%rsi)
        vpxor           0x00e0(%rdx),%ymm11,%ymm11
        vmovdqu         %ymm11,0x00e0(%rsi)
        vpxor           0x0120(%rdx),%ymm12,%ymm12
        vmovdqu         %ymm12,0x0120(%rsi)
        vpxor           0x01a0(%rdx),%ymm13,%ymm13
        vmovdqu         %ymm13,0x01a0(%rsi)
        vpxor           0x0160(%rdx),%ymm14,%ymm14
        vmovdqu         %ymm14,0x0160(%rsi)
        vpxor           0x01e0(%rdx),%ymm15,%ymm15
        vmovdqu         %ymm15,0x01e0(%rsi)

        vzeroupper
        mov             %r8,%rsp
        ret
ENDPROC(chacha20_8block_xor_avx2)