# kvmfornfv.git: kernel/arch/x86/crypto/chacha20-ssse3-x86_64.S (raw update to linux-4.4.6-rt14)
/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 16

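# pshufb masks used below to rotate each 32-bit word left by 8 and 16 bits,
# plus the per-block counter increments (0,1,2,3) that
# chacha20_4block_xor_ssse3 adds to the four parallel block counters.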
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC: .octa 0x00000003000000020000000100000000

.text

ENTRY(chacha20_block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: 1 data block output, o
        # %rdx: 1 data block input, i

        # This function encrypts one ChaCha20 block by loading the state matrix
        # in four SSE registers. It performs matrix operations on four words in
        # parallel, but requires shuffling to rearrange the words after each
        # round. 8/16-bit word rotation is done with the slightly better
        # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
        # traditional shift+OR.
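        #
        # For reference, the quarter-round computed on whole rows below
        # (x0..x3 each hold four 32-bit words, (a,b,c,d) = (x0,x1,x2,x3)):
        #
        #   a += b; d ^= a; d = rotl32(d, 16)
        #   c += d; b ^= c; b = rotl32(b, 12)
        #   a += b; d ^= a; d = rotl32(d, 8)
        #   c += d; b ^= c; b = rotl32(b, 7)
        #
        # The pshufd shuffles between the two halves of .Ldoubleround switch
        # the quarter-rounds from the columns to the diagonals of the state.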

        # x0..3 = s0..3
        movdqa          0x00(%rdi),%xmm0
        movdqa          0x10(%rdi),%xmm1
        movdqa          0x20(%rdi),%xmm2
        movdqa          0x30(%rdi),%xmm3
        movdqa          %xmm0,%xmm8
        movdqa          %xmm1,%xmm9
        movdqa          %xmm2,%xmm10
        movdqa          %xmm3,%xmm11

        movdqa          ROT8(%rip),%xmm4
        movdqa          ROT16(%rip),%xmm5

        mov     $10,%ecx

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm3,%xmm3
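        # x1/x2/x3 are now rotated by one, two and three words respectively, so
        # the next four quarter-rounds operate on the diagonals of the state
        # matrix instead of its columns.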

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm3,%xmm3

        dec             %ecx
        jnz             .Ldoubleround

        # o0 = i0 ^ (x0 + s0)
        movdqu          0x00(%rdx),%xmm4
        paddd           %xmm8,%xmm0
        pxor            %xmm4,%xmm0
        movdqu          %xmm0,0x00(%rsi)
        # o1 = i1 ^ (x1 + s1)
        movdqu          0x10(%rdx),%xmm5
        paddd           %xmm9,%xmm1
        pxor            %xmm5,%xmm1
        movdqu          %xmm1,0x10(%rsi)
        # o2 = i2 ^ (x2 + s2)
        movdqu          0x20(%rdx),%xmm6
        paddd           %xmm10,%xmm2
        pxor            %xmm6,%xmm2
        movdqu          %xmm2,0x20(%rsi)
        # o3 = i3 ^ (x3 + s3)
        movdqu          0x30(%rdx),%xmm7
        paddd           %xmm11,%xmm3
        pxor            %xmm7,%xmm3
        movdqu          %xmm3,0x30(%rsi)

        ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: 4 data blocks output, o
        # %rdx: 4 data blocks input, i

        # This function encrypts four consecutive ChaCha20 blocks by loading
        # the state matrix in SSE registers four times. As we need some scratch
        # registers, we save the first four registers on the stack. The
        # algorithm performs each operation on the corresponding word of each
        # state matrix, hence requires no word shuffling. For the final XORing
        # step we transpose the matrix by interleaving 32- and then 64-bit
        # words, which allows us to do XOR in SSE registers. 8/16-bit word
        # rotation is done with the slightly better performing SSSE3 byte
        # shuffling, 7/12-bit word rotation uses traditional shift+OR.
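        #
        # Register/stack layout inside .Ldoubleround4: state words x4..x15 stay
        # in %xmm4..%xmm15, one word per register with the four blocks in the
        # four 32-bit lanes; x0..x3 are spilled to the 64-byte aligned scratch
        # area at (%rsp) and are reloaded through the scratch register %xmm0.
        # %xmm1, %xmm2 and %xmm3 hold CTRINC, ROT8 and ROT16 across the loop.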

        # Save the original stack pointer and carve out a 64-byte aligned
        # 0x80-byte scratch area for x0..x3.
        mov             %rsp,%r11
        sub             $0x80,%rsp
        and             $~63,%rsp

        # x0..15[0-3] = s0..3[0..3]
        movq            0x00(%rdi),%xmm1
        pshufd          $0x00,%xmm1,%xmm0
        pshufd          $0x55,%xmm1,%xmm1
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        movq            0x10(%rdi),%xmm5
        pshufd          $0x00,%xmm5,%xmm4
        pshufd          $0x55,%xmm5,%xmm5
        movq            0x18(%rdi),%xmm7
        pshufd          $0x00,%xmm7,%xmm6
        pshufd          $0x55,%xmm7,%xmm7
        movq            0x20(%rdi),%xmm9
        pshufd          $0x00,%xmm9,%xmm8
        pshufd          $0x55,%xmm9,%xmm9
        movq            0x28(%rdi),%xmm11
        pshufd          $0x00,%xmm11,%xmm10
        pshufd          $0x55,%xmm11,%xmm11
        movq            0x30(%rdi),%xmm13
        pshufd          $0x00,%xmm13,%xmm12
        pshufd          $0x55,%xmm13,%xmm13
        movq            0x38(%rdi),%xmm15
        pshufd          $0x00,%xmm15,%xmm14
        pshufd          $0x55,%xmm15,%xmm15
        # x0..3 on stack
        movdqa          %xmm0,0x00(%rsp)
        movdqa          %xmm1,0x10(%rsp)
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm3,0x30(%rsp)

        movdqa          CTRINC(%rip),%xmm1
        movdqa          ROT8(%rip),%xmm2
        movdqa          ROT16(%rip),%xmm3

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12

        mov             $10,%ecx

.Ldoubleround4:
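        # column round: quarter-rounds on (x0,x4,x8,x12) .. (x3,x7,x11,x15)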
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7

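        # diagonal round: quarter-rounds on (x0,x5,x10,x15) .. (x3,x4,x9,x14)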
        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4

        dec             %ecx
        jnz             .Ldoubleround4

        # x0[0-3] += s0[0]
        # x1[0-3] += s0[1]
        movq            0x00(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x00(%rsp),%xmm2
        movdqa          %xmm2,0x00(%rsp)
        paddd           0x10(%rsp),%xmm3
        movdqa          %xmm3,0x10(%rsp)
        # x2[0-3] += s0[2]
        # x3[0-3] += s0[3]
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x20(%rsp),%xmm2
        movdqa          %xmm2,0x20(%rsp)
        paddd           0x30(%rsp),%xmm3
        movdqa          %xmm3,0x30(%rsp)

        # x4[0-3] += s1[0]
        # x5[0-3] += s1[1]
        movq            0x10(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm4
        paddd           %xmm3,%xmm5
        # x6[0-3] += s1[2]
        # x7[0-3] += s1[3]
        movq            0x18(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm6
        paddd           %xmm3,%xmm7

        # x8[0-3] += s2[0]
        # x9[0-3] += s2[1]
        movq            0x20(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm8
        paddd           %xmm3,%xmm9
        # x10[0-3] += s2[2]
        # x11[0-3] += s2[3]
        movq            0x28(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm10
        paddd           %xmm3,%xmm11

        # x12[0-3] += s3[0]
        # x13[0-3] += s3[1]
        movq            0x30(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm12
        paddd           %xmm3,%xmm13
        # x14[0-3] += s3[2]
        # x15[0-3] += s3[3]
        movq            0x38(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm14
        paddd           %xmm3,%xmm15

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12

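        # The two interleave passes below transpose the 4x4 word layout: after
        # them each 128-bit value holds four consecutive words of a single
        # block (rather than one word from each of the four blocks), so the
        # keystream can be XORed against the input 16 bytes at a time.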
        # interleave 32-bit words in state n, n+1
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x10(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x10(%rsp)
        movdqa          0x20(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpckldq       %xmm5,%xmm4
        punpckhdq       %xmm5,%xmm0
        movdqa          %xmm0,%xmm5
        movdqa          %xmm6,%xmm0
        punpckldq       %xmm7,%xmm6
        punpckhdq       %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpckldq       %xmm9,%xmm8
        punpckhdq       %xmm9,%xmm0
        movdqa          %xmm0,%xmm9
        movdqa          %xmm10,%xmm0
        punpckldq       %xmm11,%xmm10
        punpckhdq       %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpckldq       %xmm13,%xmm12
        punpckhdq       %xmm13,%xmm0
        movdqa          %xmm0,%xmm13
        movdqa          %xmm14,%xmm0
        punpckldq       %xmm15,%xmm14
        punpckhdq       %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

        # interleave 64-bit words in state n, n+2
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x20(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x20(%rsp)
        movdqa          0x10(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x10(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpcklqdq      %xmm6,%xmm4
        punpckhqdq      %xmm6,%xmm0
        movdqa          %xmm0,%xmm6
        movdqa          %xmm5,%xmm0
        punpcklqdq      %xmm7,%xmm5
        punpckhqdq      %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpcklqdq      %xmm10,%xmm8
        punpckhqdq      %xmm10,%xmm0
        movdqa          %xmm0,%xmm10
        movdqa          %xmm9,%xmm0
        punpcklqdq      %xmm11,%xmm9
        punpckhqdq      %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpcklqdq      %xmm14,%xmm12
        punpckhqdq      %xmm14,%xmm0
        movdqa          %xmm0,%xmm14
        movdqa          %xmm13,%xmm0
        punpcklqdq      %xmm15,%xmm13
        punpckhqdq      %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

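        # The unpack order above leaves the blocks permuted (the stack rows now
        # hold blocks 0, 2, 1, 3, and likewise for the register groups), so the
        # loads and stores below pick the matching 16-byte chunk of each
        # 64-byte block, with block n starting at offset n*0x40.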
        # xor with corresponding input, write to output
        movdqa          0x00(%rsp),%xmm0
        movdqu          0x00(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x00(%rsi)
        movdqa          0x10(%rsp),%xmm0
        movdqu          0x80(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x80(%rsi)
        movdqa          0x20(%rsp),%xmm0
        movdqu          0x40(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x40(%rsi)
        movdqa          0x30(%rsp),%xmm0
        movdqu          0xc0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xc0(%rsi)
        movdqu          0x10(%rdx),%xmm1
        pxor            %xmm1,%xmm4
        movdqu          %xmm4,0x10(%rsi)
        movdqu          0x90(%rdx),%xmm1
        pxor            %xmm1,%xmm5
        movdqu          %xmm5,0x90(%rsi)
        movdqu          0x50(%rdx),%xmm1
        pxor            %xmm1,%xmm6
        movdqu          %xmm6,0x50(%rsi)
        movdqu          0xd0(%rdx),%xmm1
        pxor            %xmm1,%xmm7
        movdqu          %xmm7,0xd0(%rsi)
        movdqu          0x20(%rdx),%xmm1
        pxor            %xmm1,%xmm8
        movdqu          %xmm8,0x20(%rsi)
        movdqu          0xa0(%rdx),%xmm1
        pxor            %xmm1,%xmm9
        movdqu          %xmm9,0xa0(%rsi)
        movdqu          0x60(%rdx),%xmm1
        pxor            %xmm1,%xmm10
        movdqu          %xmm10,0x60(%rsi)
        movdqu          0xe0(%rdx),%xmm1
        pxor            %xmm1,%xmm11
        movdqu          %xmm11,0xe0(%rsi)
        movdqu          0x30(%rdx),%xmm1
        pxor            %xmm1,%xmm12
        movdqu          %xmm12,0x30(%rsi)
        movdqu          0xb0(%rdx),%xmm1
        pxor            %xmm1,%xmm13
        movdqu          %xmm13,0xb0(%rsi)
        movdqu          0x70(%rdx),%xmm1
        pxor            %xmm1,%xmm14
        movdqu          %xmm14,0x70(%rsi)
        movdqu          0xf0(%rdx),%xmm1
        pxor            %xmm1,%xmm15
        movdqu          %xmm15,0xf0(%rsi)

        # restore the original stack pointer
        mov             %r11,%rsp
        ret
ENDPROC(chacha20_4block_xor_ssse3)