# kernel/arch/x86/crypto/salsa20-x86_64-asm_64.S - Salsa20 stream cipher, x86_64
#include <linux/linkage.h>

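# Annotation (added summary, not part of the generated code): this file
# provides the three asm entry points used by the kernel's Salsa20 glue code.
# salsa20_keysetup(x, k, kbits) writes the key and the diagonal constants into
# the 64-byte state x[], salsa20_ivsetup(x, iv) writes the 8-byte IV and
# clears the block counter, and salsa20_encrypt_bytes(x, m, out, bytes)
# generates keystream one 64-byte block at a time and XORs it over m into out.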
# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
        mov     %rsp,%r11
        and     $31,%r11
        add     $256,%r11
        sub     %r11,%rsp
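        # Annotation: the prologue carves out a scratch frame: %r11 is set to
        # (%rsp & 31) + 256, so subtracting it 32-byte aligns %rsp and leaves
        # at least 256 bytes of scratch space.  %r11 (saved as r11_stack
        # below) is added back at ._done to restore the caller's %rsp.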
        # x = arg1
        mov     %rdi,%r8
        # m = arg2
        mov     %rsi,%rsi
        # out = arg3
        mov     %rdx,%rdi
        # bytes = arg4
        mov     %rcx,%rdx
        #               unsigned>? bytes - 0
        cmp     $0,%rdx
        # comment:fp stack unchanged by jump
        # goto done if !unsigned>
        jbe     ._done
        # comment:fp stack unchanged by fallthrough
# start:
._start:
        # r11_stack = r11
        movq    %r11,0(%rsp)
        # r12_stack = r12
        movq    %r12,8(%rsp)
        # r13_stack = r13
        movq    %r13,16(%rsp)
        # r14_stack = r14
        movq    %r14,24(%rsp)
        # r15_stack = r15
        movq    %r15,32(%rsp)
        # rbx_stack = rbx
        movq    %rbx,40(%rsp)
        # rbp_stack = rbp
        movq    %rbp,48(%rsp)
        # in0 = *(uint64 *) (x + 0)
        movq    0(%r8),%rcx
        # in2 = *(uint64 *) (x + 8)
        movq    8(%r8),%r9
        # in4 = *(uint64 *) (x + 16)
        movq    16(%r8),%rax
        # in6 = *(uint64 *) (x + 24)
        movq    24(%r8),%r10
        # in8 = *(uint64 *) (x + 32)
        movq    32(%r8),%r11
        # in10 = *(uint64 *) (x + 40)
        movq    40(%r8),%r12
        # in12 = *(uint64 *) (x + 48)
        movq    48(%r8),%r13
        # in14 = *(uint64 *) (x + 56)
        movq    56(%r8),%r14
        # j0 = in0
        movq    %rcx,56(%rsp)
        # j2 = in2
        movq    %r9,64(%rsp)
        # j4 = in4
        movq    %rax,72(%rsp)
        # j6 = in6
        movq    %r10,80(%rsp)
        # j8 = in8
        movq    %r11,88(%rsp)
        # j10 = in10
        movq    %r12,96(%rsp)
        # j12 = in12
        movq    %r13,104(%rsp)
        # j14 = in14
        movq    %r14,112(%rsp)
        # x_backup = x
        movq    %r8,120(%rsp)
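        # Annotation: j0..j14 keep a copy of the eight 64-bit input words
        # (the sixteen 32-bit state words).  They are added back into the
        # working words after the rounds (the Salsa20 feed-forward), and j8
        # also serves as the 64-bit block counter updated once per block.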
# bytesatleast1:
._bytesatleast1:
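        # Annotation: each pass of this outer loop emits 64 keystream bytes.
        # When fewer than 64 message bytes remain, the input is first staged
        # in the 64-byte tmp area at 192(%rsp) and the real output pointer is
        # parked in ctarget (128(%rsp)); the partial block is copied back out
        # just before ._bytesatleast64.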
        #                   unsigned<? bytes - 64
        cmp     $64,%rdx
        # comment:fp stack unchanged by jump
        #   goto nocopy if !unsigned<
        jae     ._nocopy
        #     ctarget = out
        movq    %rdi,128(%rsp)
        #     out = &tmp
        leaq    192(%rsp),%rdi
        #     i = bytes
        mov     %rdx,%rcx
        #     while (i) { *out++ = *m++; --i }
        rep     movsb
        #     out = &tmp
        leaq    192(%rsp),%rdi
        #     m = &tmp
        leaq    192(%rsp),%rsi
        # comment:fp stack unchanged by fallthrough
#   nocopy:
._nocopy:
        #   out_backup = out
        movq    %rdi,136(%rsp)
        #   m_backup = m
        movq    %rsi,144(%rsp)
        #   bytes_backup = bytes
        movq    %rdx,152(%rsp)
        #   x1 = j0
        movq    56(%rsp),%rdi
        #   x0 = x1
        mov     %rdi,%rdx
        #   (uint64) x1 >>= 32
        shr     $32,%rdi
        #               x3 = j2
        movq    64(%rsp),%rsi
        #               x2 = x3
        mov     %rsi,%rcx
        #               (uint64) x3 >>= 32
        shr     $32,%rsi
        #   x5 = j4
        movq    72(%rsp),%r8
        #   x4 = x5
        mov     %r8,%r9
        #   (uint64) x5 >>= 32
        shr     $32,%r8
        #   x5_stack = x5
        movq    %r8,160(%rsp)
        #               x7 = j6
        movq    80(%rsp),%r8
        #               x6 = x7
        mov     %r8,%rax
        #               (uint64) x7 >>= 32
        shr     $32,%r8
        #   x9 = j8
        movq    88(%rsp),%r10
        #   x8 = x9
        mov     %r10,%r11
        #   (uint64) x9 >>= 32
        shr     $32,%r10
        #               x11 = j10
        movq    96(%rsp),%r12
        #               x10 = x11
        mov     %r12,%r13
        #               x10_stack = x10
        movq    %r13,168(%rsp)
        #               (uint64) x11 >>= 32
        shr     $32,%r12
        #   x13 = j12
        movq    104(%rsp),%r13
        #   x12 = x13
        mov     %r13,%r14
        #   (uint64) x13 >>= 32
        shr     $32,%r13
        #               x15 = j14
        movq    112(%rsp),%r15
        #               x14 = x15
        mov     %r15,%rbx
        #               (uint64) x15 >>= 32
        shr     $32,%r15
        #               x15_stack = x15
        movq    %r15,176(%rsp)
        #   i = 20
        mov     $20,%r15
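        # Annotation: each 64-bit j word has just been split into two 32-bit
        # working words x0..x15; x5, x10 and x15 overflow into stack slots
        # because the round function needs more words than there are free
        # registers.  i counts remaining rounds: the unrolled body below runs
        # two double-rounds (four rounds) per pass, so i falls from 20 to 0
        # in steps of 4.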
#   mainloop:
._mainloop:
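        # Annotation: every lea/rol/xor triple below is one step of a Salsa20
        # quarter-round.  As a C-like sketch, a quarter-round on words
        # (a, b, c, d) is:
        #       b ^= rol32(a + d, 7);
        #       c ^= rol32(b + a, 9);
        #       d ^= rol32(c + b, 13);
        #       a ^= rol32(d + c, 18);
        # The first half of the body applies this down the columns of the
        # 4x4 state, the second half across the rows (together one
        # double-round), and the body then repeats both once more.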
        #   i_backup = i
        movq    %r15,184(%rsp)
        #               x5 = x5_stack
        movq    160(%rsp),%r15
        # a = x12 + x0
        lea     (%r14,%rdx),%rbp
        # (uint32) a <<<= 7
        rol     $7,%ebp
        # x4 ^= a
        xor     %rbp,%r9
        #               b = x1 + x5
        lea     (%rdi,%r15),%rbp
        #               (uint32) b <<<= 7
        rol     $7,%ebp
        #               x9 ^= b
        xor     %rbp,%r10
        # a = x0 + x4
        lea     (%rdx,%r9),%rbp
        # (uint32) a <<<= 9
        rol     $9,%ebp
        # x8 ^= a
        xor     %rbp,%r11
        #               b = x5 + x9
        lea     (%r15,%r10),%rbp
        #               (uint32) b <<<= 9
        rol     $9,%ebp
        #               x13 ^= b
        xor     %rbp,%r13
        # a = x4 + x8
        lea     (%r9,%r11),%rbp
        # (uint32) a <<<= 13
        rol     $13,%ebp
        # x12 ^= a
        xor     %rbp,%r14
        #               b = x9 + x13
        lea     (%r10,%r13),%rbp
        #               (uint32) b <<<= 13
        rol     $13,%ebp
        #               x1 ^= b
        xor     %rbp,%rdi
        # a = x8 + x12
        lea     (%r11,%r14),%rbp
        # (uint32) a <<<= 18
        rol     $18,%ebp
        # x0 ^= a
        xor     %rbp,%rdx
        #               b = x13 + x1
        lea     (%r13,%rdi),%rbp
        #               (uint32) b <<<= 18
        rol     $18,%ebp
        #               x5 ^= b
        xor     %rbp,%r15
        #                               x10 = x10_stack
        movq    168(%rsp),%rbp
        #               x5_stack = x5
        movq    %r15,160(%rsp)
        #                               c = x6 + x10
        lea     (%rax,%rbp),%r15
        #                               (uint32) c <<<= 7
        rol     $7,%r15d
        #                               x14 ^= c
        xor     %r15,%rbx
        #                               c = x10 + x14
        lea     (%rbp,%rbx),%r15
        #                               (uint32) c <<<= 9
        rol     $9,%r15d
        #                               x2 ^= c
        xor     %r15,%rcx
        #                               c = x14 + x2
        lea     (%rbx,%rcx),%r15
        #                               (uint32) c <<<= 13
        rol     $13,%r15d
        #                               x6 ^= c
        xor     %r15,%rax
        #                               c = x2 + x6
        lea     (%rcx,%rax),%r15
        #                               (uint32) c <<<= 18
        rol     $18,%r15d
        #                               x10 ^= c
        xor     %r15,%rbp
        #                                               x15 = x15_stack
        movq    176(%rsp),%r15
        #                               x10_stack = x10
        movq    %rbp,168(%rsp)
        #                                               d = x11 + x15
        lea     (%r12,%r15),%rbp
        #                                               (uint32) d <<<= 7
        rol     $7,%ebp
        #                                               x3 ^= d
        xor     %rbp,%rsi
        #                                               d = x15 + x3
        lea     (%r15,%rsi),%rbp
        #                                               (uint32) d <<<= 9
        rol     $9,%ebp
        #                                               x7 ^= d
        xor     %rbp,%r8
        #                                               d = x3 + x7
        lea     (%rsi,%r8),%rbp
        #                                               (uint32) d <<<= 13
        rol     $13,%ebp
        #                                               x11 ^= d
        xor     %rbp,%r12
        #                                               d = x7 + x11
        lea     (%r8,%r12),%rbp
        #                                               (uint32) d <<<= 18
        rol     $18,%ebp
        #                                               x15 ^= d
        xor     %rbp,%r15
        #                                               x15_stack = x15
        movq    %r15,176(%rsp)
        #               x5 = x5_stack
        movq    160(%rsp),%r15
        # a = x3 + x0
        lea     (%rsi,%rdx),%rbp
        # (uint32) a <<<= 7
        rol     $7,%ebp
        # x1 ^= a
        xor     %rbp,%rdi
        #               b = x4 + x5
        lea     (%r9,%r15),%rbp
        #               (uint32) b <<<= 7
        rol     $7,%ebp
        #               x6 ^= b
        xor     %rbp,%rax
        # a = x0 + x1
        lea     (%rdx,%rdi),%rbp
        # (uint32) a <<<= 9
        rol     $9,%ebp
        # x2 ^= a
        xor     %rbp,%rcx
        #               b = x5 + x6
        lea     (%r15,%rax),%rbp
        #               (uint32) b <<<= 9
        rol     $9,%ebp
        #               x7 ^= b
        xor     %rbp,%r8
        # a = x1 + x2
        lea     (%rdi,%rcx),%rbp
        # (uint32) a <<<= 13
        rol     $13,%ebp
        # x3 ^= a
        xor     %rbp,%rsi
        #               b = x6 + x7
        lea     (%rax,%r8),%rbp
        #               (uint32) b <<<= 13
        rol     $13,%ebp
        #               x4 ^= b
        xor     %rbp,%r9
        # a = x2 + x3
        lea     (%rcx,%rsi),%rbp
        # (uint32) a <<<= 18
        rol     $18,%ebp
        # x0 ^= a
        xor     %rbp,%rdx
        #               b = x7 + x4
        lea     (%r8,%r9),%rbp
        #               (uint32) b <<<= 18
        rol     $18,%ebp
        #               x5 ^= b
        xor     %rbp,%r15
        #                               x10 = x10_stack
        movq    168(%rsp),%rbp
        #               x5_stack = x5
        movq    %r15,160(%rsp)
        #                               c = x9 + x10
        lea     (%r10,%rbp),%r15
        #                               (uint32) c <<<= 7
        rol     $7,%r15d
        #                               x11 ^= c
        xor     %r15,%r12
        #                               c = x10 + x11
        lea     (%rbp,%r12),%r15
        #                               (uint32) c <<<= 9
        rol     $9,%r15d
        #                               x8 ^= c
        xor     %r15,%r11
        #                               c = x11 + x8
        lea     (%r12,%r11),%r15
        #                               (uint32) c <<<= 13
        rol     $13,%r15d
        #                               x9 ^= c
        xor     %r15,%r10
        #                               c = x8 + x9
        lea     (%r11,%r10),%r15
        #                               (uint32) c <<<= 18
        rol     $18,%r15d
        #                               x10 ^= c
        xor     %r15,%rbp
        #                                               x15 = x15_stack
        movq    176(%rsp),%r15
        #                               x10_stack = x10
        movq    %rbp,168(%rsp)
        #                                               d = x14 + x15
        lea     (%rbx,%r15),%rbp
        #                                               (uint32) d <<<= 7
        rol     $7,%ebp
        #                                               x12 ^= d
        xor     %rbp,%r14
        #                                               d = x15 + x12
        lea     (%r15,%r14),%rbp
        #                                               (uint32) d <<<= 9
        rol     $9,%ebp
        #                                               x13 ^= d
        xor     %rbp,%r13
        #                                               d = x12 + x13
        lea     (%r14,%r13),%rbp
        #                                               (uint32) d <<<= 13
        rol     $13,%ebp
        #                                               x14 ^= d
        xor     %rbp,%rbx
        #                                               d = x13 + x14
        lea     (%r13,%rbx),%rbp
        #                                               (uint32) d <<<= 18
        rol     $18,%ebp
        #                                               x15 ^= d
        xor     %rbp,%r15
        #                                               x15_stack = x15
        movq    %r15,176(%rsp)
        #               x5 = x5_stack
        movq    160(%rsp),%r15
        # a = x12 + x0
        lea     (%r14,%rdx),%rbp
        # (uint32) a <<<= 7
        rol     $7,%ebp
        # x4 ^= a
        xor     %rbp,%r9
        #               b = x1 + x5
        lea     (%rdi,%r15),%rbp
        #               (uint32) b <<<= 7
        rol     $7,%ebp
        #               x9 ^= b
        xor     %rbp,%r10
        # a = x0 + x4
        lea     (%rdx,%r9),%rbp
        # (uint32) a <<<= 9
        rol     $9,%ebp
        # x8 ^= a
        xor     %rbp,%r11
        #               b = x5 + x9
        lea     (%r15,%r10),%rbp
        #               (uint32) b <<<= 9
        rol     $9,%ebp
        #               x13 ^= b
        xor     %rbp,%r13
        # a = x4 + x8
        lea     (%r9,%r11),%rbp
        # (uint32) a <<<= 13
        rol     $13,%ebp
        # x12 ^= a
        xor     %rbp,%r14
        #               b = x9 + x13
        lea     (%r10,%r13),%rbp
        #               (uint32) b <<<= 13
        rol     $13,%ebp
        #               x1 ^= b
        xor     %rbp,%rdi
        # a = x8 + x12
        lea     (%r11,%r14),%rbp
        # (uint32) a <<<= 18
        rol     $18,%ebp
        # x0 ^= a
        xor     %rbp,%rdx
        #               b = x13 + x1
        lea     (%r13,%rdi),%rbp
        #               (uint32) b <<<= 18
        rol     $18,%ebp
        #               x5 ^= b
        xor     %rbp,%r15
        #                               x10 = x10_stack
        movq    168(%rsp),%rbp
        #               x5_stack = x5
        movq    %r15,160(%rsp)
        #                               c = x6 + x10
        lea     (%rax,%rbp),%r15
        #                               (uint32) c <<<= 7
        rol     $7,%r15d
        #                               x14 ^= c
        xor     %r15,%rbx
        #                               c = x10 + x14
        lea     (%rbp,%rbx),%r15
        #                               (uint32) c <<<= 9
        rol     $9,%r15d
        #                               x2 ^= c
        xor     %r15,%rcx
        #                               c = x14 + x2
        lea     (%rbx,%rcx),%r15
        #                               (uint32) c <<<= 13
        rol     $13,%r15d
        #                               x6 ^= c
        xor     %r15,%rax
        #                               c = x2 + x6
        lea     (%rcx,%rax),%r15
        #                               (uint32) c <<<= 18
        rol     $18,%r15d
        #                               x10 ^= c
        xor     %r15,%rbp
        #                                               x15 = x15_stack
        movq    176(%rsp),%r15
        #                               x10_stack = x10
        movq    %rbp,168(%rsp)
        #                                               d = x11 + x15
        lea     (%r12,%r15),%rbp
        #                                               (uint32) d <<<= 7
        rol     $7,%ebp
        #                                               x3 ^= d
        xor     %rbp,%rsi
        #                                               d = x15 + x3
        lea     (%r15,%rsi),%rbp
        #                                               (uint32) d <<<= 9
        rol     $9,%ebp
        #                                               x7 ^= d
        xor     %rbp,%r8
        #                                               d = x3 + x7
        lea     (%rsi,%r8),%rbp
        #                                               (uint32) d <<<= 13
        rol     $13,%ebp
        #                                               x11 ^= d
        xor     %rbp,%r12
        #                                               d = x7 + x11
        lea     (%r8,%r12),%rbp
        #                                               (uint32) d <<<= 18
        rol     $18,%ebp
        #                                               x15 ^= d
        xor     %rbp,%r15
        #                                               x15_stack = x15
        movq    %r15,176(%rsp)
        #               x5 = x5_stack
        movq    160(%rsp),%r15
        # a = x3 + x0
        lea     (%rsi,%rdx),%rbp
        # (uint32) a <<<= 7
        rol     $7,%ebp
        # x1 ^= a
        xor     %rbp,%rdi
        #               b = x4 + x5
        lea     (%r9,%r15),%rbp
        #               (uint32) b <<<= 7
        rol     $7,%ebp
        #               x6 ^= b
        xor     %rbp,%rax
        # a = x0 + x1
        lea     (%rdx,%rdi),%rbp
        # (uint32) a <<<= 9
        rol     $9,%ebp
        # x2 ^= a
        xor     %rbp,%rcx
        #               b = x5 + x6
        lea     (%r15,%rax),%rbp
        #               (uint32) b <<<= 9
        rol     $9,%ebp
        #               x7 ^= b
        xor     %rbp,%r8
        # a = x1 + x2
        lea     (%rdi,%rcx),%rbp
        # (uint32) a <<<= 13
        rol     $13,%ebp
        # x3 ^= a
        xor     %rbp,%rsi
        #               b = x6 + x7
        lea     (%rax,%r8),%rbp
        #               (uint32) b <<<= 13
        rol     $13,%ebp
        #               x4 ^= b
        xor     %rbp,%r9
        # a = x2 + x3
        lea     (%rcx,%rsi),%rbp
        # (uint32) a <<<= 18
        rol     $18,%ebp
        # x0 ^= a
        xor     %rbp,%rdx
        #               b = x7 + x4
        lea     (%r8,%r9),%rbp
        #               (uint32) b <<<= 18
        rol     $18,%ebp
        #               x5 ^= b
        xor     %rbp,%r15
        #                               x10 = x10_stack
        movq    168(%rsp),%rbp
        #               x5_stack = x5
        movq    %r15,160(%rsp)
        #                               c = x9 + x10
        lea     (%r10,%rbp),%r15
        #                               (uint32) c <<<= 7
        rol     $7,%r15d
        #                               x11 ^= c
        xor     %r15,%r12
        #                               c = x10 + x11
        lea     (%rbp,%r12),%r15
        #                               (uint32) c <<<= 9
        rol     $9,%r15d
        #                               x8 ^= c
        xor     %r15,%r11
        #                               c = x11 + x8
        lea     (%r12,%r11),%r15
        #                               (uint32) c <<<= 13
        rol     $13,%r15d
        #                               x9 ^= c
        xor     %r15,%r10
        #                               c = x8 + x9
        lea     (%r11,%r10),%r15
        #                               (uint32) c <<<= 18
        rol     $18,%r15d
        #                               x10 ^= c
        xor     %r15,%rbp
        #                                               x15 = x15_stack
        movq    176(%rsp),%r15
        #                               x10_stack = x10
        movq    %rbp,168(%rsp)
        #                                               d = x14 + x15
        lea     (%rbx,%r15),%rbp
        #                                               (uint32) d <<<= 7
        rol     $7,%ebp
        #                                               x12 ^= d
        xor     %rbp,%r14
        #                                               d = x15 + x12
        lea     (%r15,%r14),%rbp
        #                                               (uint32) d <<<= 9
        rol     $9,%ebp
        #                                               x13 ^= d
        xor     %rbp,%r13
        #                                               d = x12 + x13
        lea     (%r14,%r13),%rbp
        #                                               (uint32) d <<<= 13
        rol     $13,%ebp
        #                                               x14 ^= d
        xor     %rbp,%rbx
        #                                               d = x13 + x14
        lea     (%r13,%rbx),%rbp
        #                                               (uint32) d <<<= 18
        rol     $18,%ebp
        #                                               x15 ^= d
        xor     %rbp,%r15
        #                                               x15_stack = x15
        movq    %r15,176(%rsp)
        #   i = i_backup
        movq    184(%rsp),%r15
        #                  unsigned>? i -= 4
        sub     $4,%r15
        # comment:fp stack unchanged by jump
        # goto mainloop if unsigned>
        ja      ._mainloop
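        # Annotation: rounds done; the feed-forward follows.  Each 64-bit
        # register packs two 32-bit state words, so the low word gets its j
        # value with a 32-bit addl (which also clears the upper half), while
        # the high word is added via shl/addq/shr/shl so no carry can spill
        # into the low word; a final 64-bit add recombines the two halves.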
        #   (uint32) x2 += j2
        addl    64(%rsp),%ecx
        #   x3 <<= 32
        shl     $32,%rsi
        #   x3 += j2
        addq    64(%rsp),%rsi
        #   (uint64) x3 >>= 32
        shr     $32,%rsi
        #   x3 <<= 32
        shl     $32,%rsi
        #   x2 += x3
        add     %rsi,%rcx
        #   (uint32) x6 += j6
        addl    80(%rsp),%eax
        #   x7 <<= 32
        shl     $32,%r8
        #   x7 += j6
        addq    80(%rsp),%r8
        #   (uint64) x7 >>= 32
        shr     $32,%r8
        #   x7 <<= 32
        shl     $32,%r8
        #   x6 += x7
        add     %r8,%rax
        #   (uint32) x8 += j8
        addl    88(%rsp),%r11d
        #   x9 <<= 32
        shl     $32,%r10
        #   x9 += j8
        addq    88(%rsp),%r10
        #   (uint64) x9 >>= 32
        shr     $32,%r10
        #   x9 <<= 32
        shl     $32,%r10
        #   x8 += x9
        add     %r10,%r11
        #   (uint32) x12 += j12
        addl    104(%rsp),%r14d
        #   x13 <<= 32
        shl     $32,%r13
        #   x13 += j12
        addq    104(%rsp),%r13
        #   (uint64) x13 >>= 32
        shr     $32,%r13
        #   x13 <<= 32
        shl     $32,%r13
        #   x12 += x13
        add     %r13,%r14
        #   (uint32) x0 += j0
        addl    56(%rsp),%edx
        #   x1 <<= 32
        shl     $32,%rdi
        #   x1 += j0
        addq    56(%rsp),%rdi
        #   (uint64) x1 >>= 32
        shr     $32,%rdi
        #   x1 <<= 32
        shl     $32,%rdi
        #   x0 += x1
        add     %rdi,%rdx
        #   x5 = x5_stack
        movq    160(%rsp),%rdi
        #   (uint32) x4 += j4
        addl    72(%rsp),%r9d
        #   x5 <<= 32
        shl     $32,%rdi
        #   x5 += j4
        addq    72(%rsp),%rdi
        #   (uint64) x5 >>= 32
        shr     $32,%rdi
        #   x5 <<= 32
        shl     $32,%rdi
        #   x4 += x5
        add     %rdi,%r9
        #   x10 = x10_stack
        movq    168(%rsp),%r8
        #   (uint32) x10 += j10
        addl    96(%rsp),%r8d
        #   x11 <<= 32
        shl     $32,%r12
        #   x11 += j10
        addq    96(%rsp),%r12
        #   (uint64) x11 >>= 32
        shr     $32,%r12
        #   x11 <<= 32
        shl     $32,%r12
        #   x10 += x11
        add     %r12,%r8
        #   x15 = x15_stack
        movq    176(%rsp),%rdi
        #   (uint32) x14 += j14
        addl    112(%rsp),%ebx
        #   x15 <<= 32
        shl     $32,%rdi
        #   x15 += j14
        addq    112(%rsp),%rdi
        #   (uint64) x15 >>= 32
        shr     $32,%rdi
        #   x15 <<= 32
        shl     $32,%rdi
        #   x14 += x15
        add     %rdi,%rbx
        #   out = out_backup
        movq    136(%rsp),%rdi
        #   m = m_backup
        movq    144(%rsp),%rsi
        #   x0 ^= *(uint64 *) (m + 0)
        xorq    0(%rsi),%rdx
        #   *(uint64 *) (out + 0) = x0
        movq    %rdx,0(%rdi)
        #   x2 ^= *(uint64 *) (m + 8)
        xorq    8(%rsi),%rcx
        #   *(uint64 *) (out + 8) = x2
        movq    %rcx,8(%rdi)
        #   x4 ^= *(uint64 *) (m + 16)
        xorq    16(%rsi),%r9
        #   *(uint64 *) (out + 16) = x4
        movq    %r9,16(%rdi)
        #   x6 ^= *(uint64 *) (m + 24)
        xorq    24(%rsi),%rax
        #   *(uint64 *) (out + 24) = x6
        movq    %rax,24(%rdi)
        #   x8 ^= *(uint64 *) (m + 32)
        xorq    32(%rsi),%r11
        #   *(uint64 *) (out + 32) = x8
        movq    %r11,32(%rdi)
        #   x10 ^= *(uint64 *) (m + 40)
        xorq    40(%rsi),%r8
        #   *(uint64 *) (out + 40) = x10
        movq    %r8,40(%rdi)
        #   x12 ^= *(uint64 *) (m + 48)
        xorq    48(%rsi),%r14
        #   *(uint64 *) (out + 48) = x12
        movq    %r14,48(%rdi)
        #   x14 ^= *(uint64 *) (m + 56)
        xorq    56(%rsi),%rbx
        #   *(uint64 *) (out + 56) = x14
        movq    %rbx,56(%rdi)
        #   bytes = bytes_backup
        movq    152(%rsp),%rdx
        #   in8 = j8
        movq    88(%rsp),%rcx
        #   in8 += 1
        add     $1,%rcx
        #   j8 = in8
        movq    %rcx,88(%rsp)
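        # Annotation: j8 is the 64-bit block counter that salsa20_ivsetup
        # cleared; adding 1 advances it by one 64-byte block, and it is
        # written back to the state at x + 32 before returning.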
        #                          unsigned>? unsigned<? bytes - 64
        cmp     $64,%rdx
        # comment:fp stack unchanged by jump
        #   goto bytesatleast65 if unsigned>
        ja      ._bytesatleast65
        # comment:fp stack unchanged by jump
        #     goto bytesatleast64 if !unsigned<
        jae     ._bytesatleast64
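        # Annotation: fewer than 64 bytes were requested, so the full block
        # just written to tmp is copied back, byte by byte, to the real
        # destination saved in ctarget before falling through to the common
        # exit path.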
        #       m = out
        mov     %rdi,%rsi
        #       out = ctarget
        movq    128(%rsp),%rdi
        #       i = bytes
        mov     %rdx,%rcx
        #       while (i) { *out++ = *m++; --i }
        rep     movsb
        # comment:fp stack unchanged by fallthrough
#     bytesatleast64:
._bytesatleast64:
        #     x = x_backup
        movq    120(%rsp),%rdi
        #     in8 = j8
        movq    88(%rsp),%rsi
        #     *(uint64 *) (x + 32) = in8
        movq    %rsi,32(%rdi)
        #     r11 = r11_stack
        movq    0(%rsp),%r11
        #     r12 = r12_stack
        movq    8(%rsp),%r12
        #     r13 = r13_stack
        movq    16(%rsp),%r13
        #     r14 = r14_stack
        movq    24(%rsp),%r14
        #     r15 = r15_stack
        movq    32(%rsp),%r15
        #     rbx = rbx_stack
        movq    40(%rsp),%rbx
        #     rbp = rbp_stack
        movq    48(%rsp),%rbp
        # comment:fp stack unchanged by fallthrough
#     done:
._done:
        #     leave
        add     %r11,%rsp
        mov     %rdi,%rax
        mov     %rsi,%rdx
        ret
#   bytesatleast65:
._bytesatleast65:
        #   bytes -= 64
        sub     $64,%rdx
        #   out += 64
        add     $64,%rdi
        #   m += 64
        add     $64,%rsi
        # comment:fp stack unchanged by jump
        # goto bytesatleast1
        jmp     ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)

# enter salsa20_keysetup
ENTRY(salsa20_keysetup)
        mov     %rsp,%r11
        and     $31,%r11
        add     $256,%r11
        sub     %r11,%rsp
        #   k = arg2
        mov     %rsi,%rsi
        #   kbits = arg3
        mov     %rdx,%rdx
        #   x = arg1
        mov     %rdi,%rdi
        #   in0 = *(uint64 *) (k + 0)
        movq    0(%rsi),%r8
        #   in2 = *(uint64 *) (k + 8)
        movq    8(%rsi),%r9
        #   *(uint64 *) (x + 4) = in0
        movq    %r8,4(%rdi)
        #   *(uint64 *) (x + 12) = in2
        movq    %r9,12(%rdi)
        #                    unsigned<? kbits - 256
        cmp     $256,%rdx
        # comment:fp stack unchanged by jump
        #   goto kbits128 if unsigned<
        jb      ._kbits128
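        # Annotation: the state x[] is addressed in bytes; 32-bit word n sits
        # at offset 4*n.  The key fills words 1-4 (offsets 4 and 12) and
        # words 11-14 (offsets 44 and 52), while the diagonal words 0, 5, 10
        # and 15 (offsets 0, 20, 40, 60) get the constants: the little-endian
        # words of "expand 32-byte k" (sigma) on the 256-bit path below, or
        # of "expand 16-byte k" (tau) on the ._kbits128 path.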
#   kbits256:
._kbits256:
        #     in10 = *(uint64 *) (k + 16)
        movq    16(%rsi),%rdx
        #     in12 = *(uint64 *) (k + 24)
        movq    24(%rsi),%rsi
        #     *(uint64 *) (x + 44) = in10
        movq    %rdx,44(%rdi)
        #     *(uint64 *) (x + 52) = in12
        movq    %rsi,52(%rdi)
        #     in0 = 1634760805
        mov     $1634760805,%rsi
        #     in4 = 857760878
        mov     $857760878,%rdx
        #     in10 = 2036477234
        mov     $2036477234,%rcx
        #     in14 = 1797285236
        mov     $1797285236,%r8
        #     *(uint32 *) (x + 0) = in0
        movl    %esi,0(%rdi)
        #     *(uint32 *) (x + 20) = in4
        movl    %edx,20(%rdi)
        #     *(uint32 *) (x + 40) = in10
        movl    %ecx,40(%rdi)
        #     *(uint32 *) (x + 60) = in14
        movl    %r8d,60(%rdi)
        # comment:fp stack unchanged by jump
        #   goto keysetupdone
        jmp     ._keysetupdone
#   kbits128:
._kbits128:
        #     in10 = *(uint64 *) (k + 0)
        movq    0(%rsi),%rdx
        #     in12 = *(uint64 *) (k + 8)
        movq    8(%rsi),%rsi
        #     *(uint64 *) (x + 44) = in10
        movq    %rdx,44(%rdi)
        #     *(uint64 *) (x + 52) = in12
        movq    %rsi,52(%rdi)
        #     in0 = 1634760805
        mov     $1634760805,%rsi
        #     in4 = 824206446
        mov     $824206446,%rdx
        #     in10 = 2036477238
        mov     $2036477238,%rcx
        #     in14 = 1797285236
        mov     $1797285236,%r8
        #     *(uint32 *) (x + 0) = in0
        movl    %esi,0(%rdi)
        #     *(uint32 *) (x + 20) = in4
        movl    %edx,20(%rdi)
        #     *(uint32 *) (x + 40) = in10
        movl    %ecx,40(%rdi)
        #     *(uint32 *) (x + 60) = in14
        movl    %r8d,60(%rdi)
#   keysetupdone:
._keysetupdone:
        # leave
        add     %r11,%rsp
        mov     %rdi,%rax
        mov     %rsi,%rdx
        ret
ENDPROC(salsa20_keysetup)

# enter salsa20_ivsetup
ENTRY(salsa20_ivsetup)
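        # Annotation: the 8-byte IV is stored into state words 6-7 (offset
        # 24) and the 64-bit block counter in words 8-9 (offset 32) is reset
        # to zero; salsa20_encrypt_bytes then advances that counter per
        # 64-byte block.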
        mov     %rsp,%r11
        and     $31,%r11
        add     $256,%r11
        sub     %r11,%rsp
        #   iv = arg2
        mov     %rsi,%rsi
        #   x = arg1
        mov     %rdi,%rdi
        #   in6 = *(uint64 *) (iv + 0)
        movq    0(%rsi),%rsi
        #   in8 = 0
        mov     $0,%r8
        #   *(uint64 *) (x + 24) = in6
        movq    %rsi,24(%rdi)
        #   *(uint64 *) (x + 32) = in8
        movq    %r8,32(%rdi)
        # leave
        add     %r11,%rsp
        mov     %rdi,%rax
        mov     %rsi,%rdx
        ret
ENDPROC(salsa20_ivsetup)