Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / arch / x86 / crypto / salsa20-i586-asm_32.S
1 # salsa20_pm.s version 20051229
2 # D. J. Bernstein
3 # Public domain.
4
5 #include <linux/linkage.h>
6
7 .text
8
9 # enter salsa20_encrypt_bytes
#
# Stack arguments: arg1 = x, arg2 = m, arg3 = out, arg4 = bytes.
# x is read as sixteen 32-bit words (offsets 0..60).  For every 64-byte
# block the code runs 20 rounds on a copy of those words, adds the input
# words back in, XORs the result with 64 bytes of m and stores it to out.
# Words 8 and 9 of x act as one 64-bit counter: it is bumped once per
# block and written back to x after the last block.
# Returns straight away when bytes is 0.
#
# Stack frame, offsets from the realigned %esp:
#   tmp            0..63     bounce buffer for a final block shorter than 64
#   x_backup       64
#   m_backup       68
#   out_backup     72
#   bytes_backup   76
#   eax_stack      80        frame size, added back to %esp on exit
#   ebx_stack .. ebp_stack   84, 88, 92, 96 (ebx, esi, edi, ebp)
#   x0..x15        100..160  working state
#   j0..j15        164..224  input words
#   ctarget        228       real destination while out points at tmp
10 ENTRY(salsa20_encrypt_bytes)
           # eax = (esp & 31) + 256; subtracting it leaves esp 32-byte
           # aligned with at least 256 bytes of frame
11         mov     %esp,%eax
12         and     $31,%eax
13         add     $256,%eax
14         sub     %eax,%esp
15         # eax_stack = eax (frame size, needed to undo the sub on exit)
16         movl    %eax,80(%esp)
17         # ebx_stack = ebx
18         movl    %ebx,84(%esp)
19         # esi_stack = esi
20         movl    %esi,88(%esp)
21         # edi_stack = edi
22         movl    %edi,92(%esp)
23         # ebp_stack = ebp
24         movl    %ebp,96(%esp)
25         # x = arg1 (esp + eax is the esp value at entry, so the
           # arguments sit at 4, 8, 12 and 16 from there)
26         movl    4(%esp,%eax),%edx
27         # m = arg2
28         movl    8(%esp,%eax),%esi
29         # out = arg3
30         movl    12(%esp,%eax),%edi
31         # bytes = arg4
32         movl    16(%esp,%eax),%ebx
33         # bytes -= 0 (value unchanged; done only to set the flags)
34         sub     $0,%ebx
35         # goto done if unsigned<= (taken only when bytes == 0)
36         jbe     ._done
37 ._start:
           # Copy the sixteen 32-bit words at x (edx) into the stack slots
           # j0..j15 at 164..224(%esp).  Loads and stores are interleaved
           # across eax, ecx and ebp.  Finally x itself is remembered in
           # x_backup.
38         # in0 = *(uint32 *) (x + 0)
39         movl    0(%edx),%eax
40         # in1 = *(uint32 *) (x + 4)
41         movl    4(%edx),%ecx
42         # in2 = *(uint32 *) (x + 8)
43         movl    8(%edx),%ebp
44         # j0 = in0
45         movl    %eax,164(%esp)
46         # in3 = *(uint32 *) (x + 12)
47         movl    12(%edx),%eax
48         # j1 = in1
49         movl    %ecx,168(%esp)
50         # in4 = *(uint32 *) (x + 16)
51         movl    16(%edx),%ecx
52         # j2 = in2
53         movl    %ebp,172(%esp)
54         # in5 = *(uint32 *) (x + 20)
55         movl    20(%edx),%ebp
56         # j3 = in3
57         movl    %eax,176(%esp)
58         # in6 = *(uint32 *) (x + 24)
59         movl    24(%edx),%eax
60         # j4 = in4
61         movl    %ecx,180(%esp)
62         # in7 = *(uint32 *) (x + 28)
63         movl    28(%edx),%ecx
64         # j5 = in5
65         movl    %ebp,184(%esp)
66         # in8 = *(uint32 *) (x + 32)
67         movl    32(%edx),%ebp
68         # j6 = in6
69         movl    %eax,188(%esp)
70         # in9 = *(uint32 *) (x + 36)
71         movl    36(%edx),%eax
72         # j7 = in7
73         movl    %ecx,192(%esp)
74         # in10 = *(uint32 *) (x + 40)
75         movl    40(%edx),%ecx
76         # j8 = in8
77         movl    %ebp,196(%esp)
78         # in11 = *(uint32 *) (x + 44)
79         movl    44(%edx),%ebp
80         # j9 = in9
81         movl    %eax,200(%esp)
82         # in12 = *(uint32 *) (x + 48)
83         movl    48(%edx),%eax
84         # j10 = in10
85         movl    %ecx,204(%esp)
86         # in13 = *(uint32 *) (x + 52)
87         movl    52(%edx),%ecx
88         # j11 = in11
89         movl    %ebp,208(%esp)
90         # in14 = *(uint32 *) (x + 56)
91         movl    56(%edx),%ebp
92         # j12 = in12
93         movl    %eax,212(%esp)
94         # in15 = *(uint32 *) (x + 60)
95         movl    60(%edx),%eax
96         # j13 = in13
97         movl    %ecx,216(%esp)
98         # j14 = in14
99         movl    %ebp,220(%esp)
100         # j15 = in15
101         movl    %eax,224(%esp)
102         # x_backup = x (edx is reused as scratch from here on)
103         movl    %edx,64(%esp)
104 ._bytesatleast1:
            # Head of the per-block loop.  On arrival ebx = bytes still to
            # process, esi = m, edi = out.
            # With 64 or more bytes left the block is handled in place.
            # Otherwise the remaining input is copied into the 64-byte tmp
            # area at 0(%esp) and both m and out are pointed at tmp, so the
            # code that follows can always read and write a full 64 bytes.
            # The real destination is parked in ctarget.  Only the first
            # `bytes` bytes of tmp are written here.
105         #   bytes - 64
106         cmp     $64,%ebx
107         #   goto nocopy if unsigned>=
108         jae     ._nocopy
109         #     ctarget = out
110         movl    %edi,228(%esp)
111         #     out = &tmp
112         leal    0(%esp),%edi
113         #     i = bytes
114         mov     %ebx,%ecx
115         #     while (i) { *out++ = *m++; --i }
116         rep     movsb
117         #     out = &tmp (rep movsb advanced edi, so reload it)
118         leal    0(%esp),%edi
119         #     m = &tmp
120         leal    0(%esp),%esi
121 ._nocopy:
            # Set up one block: save out, m and bytes (edi, esi and ebx are
            # used as scratch by the rounds), copy the input words j0..j15
            # into the working state x0..x15 at 100..160(%esp), set the
            # round counter i to 20 and preload p, s, t, w with
            # x0, x5, x10, x15.
122         #   out_backup = out
123         movl    %edi,72(%esp)
124         #   m_backup = m
125         movl    %esi,68(%esp)
126         #   bytes_backup = bytes
127         movl    %ebx,76(%esp)
128         #   in0 = j0
129         movl    164(%esp),%eax
130         #   in1 = j1
131         movl    168(%esp),%ecx
132         #   in2 = j2
133         movl    172(%esp),%edx
134         #   in3 = j3
135         movl    176(%esp),%ebx
136         #   x0 = in0
137         movl    %eax,100(%esp)
138         #   x1 = in1
139         movl    %ecx,104(%esp)
140         #   x2 = in2
141         movl    %edx,108(%esp)
142         #   x3 = in3
143         movl    %ebx,112(%esp)
144         #   in4 = j4
145         movl    180(%esp),%eax
146         #   in5 = j5
147         movl    184(%esp),%ecx
148         #   in6 = j6
149         movl    188(%esp),%edx
150         #   in7 = j7
151         movl    192(%esp),%ebx
152         #   x4 = in4
153         movl    %eax,116(%esp)
154         #   x5 = in5
155         movl    %ecx,120(%esp)
156         #   x6 = in6
157         movl    %edx,124(%esp)
158         #   x7 = in7
159         movl    %ebx,128(%esp)
160         #   in8 = j8
161         movl    196(%esp),%eax
162         #   in9 = j9
163         movl    200(%esp),%ecx
164         #   in10 = j10
165         movl    204(%esp),%edx
166         #   in11 = j11
167         movl    208(%esp),%ebx
168         #   x8 = in8
169         movl    %eax,132(%esp)
170         #   x9 = in9
171         movl    %ecx,136(%esp)
172         #   x10 = in10
173         movl    %edx,140(%esp)
174         #   x11 = in11
175         movl    %ebx,144(%esp)
176         #   in12 = j12
177         movl    212(%esp),%eax
178         #   in13 = j13
179         movl    216(%esp),%ecx
180         #   in14 = j14
181         movl    220(%esp),%edx
182         #   in15 = j15
183         movl    224(%esp),%ebx
184         #   x12 = in12
185         movl    %eax,148(%esp)
186         #   x13 = in13
187         movl    %ecx,152(%esp)
188         #   x14 = in14
189         movl    %edx,156(%esp)
190         #   x15 = in15
191         movl    %ebx,160(%esp)
192         #   i = 20 (round counter, decremented by 4 per loop pass)
193         mov     $20,%ebp
194         # p = x0
195         movl    100(%esp),%eax
196         # s = x5
197         movl    120(%esp),%ecx
198         # t = x10
199         movl    140(%esp),%edx
200         # w = x15
201         movl    160(%esp),%ebx
202 ._mainloop:
            # Round loop.  One pass performs four rounds (a two-round
            # pattern unrolled twice) and then subtracts 4 from i, so with
            # i starting at 20 the body runs five times.
            #
            # Registers: p = eax, s = ecx, t = edx, w = ebx, r = esi,
            # v = edi, i = ebp.  The state words x0..x15 live at
            # 100 + 4*k (%esp).  On entry to every pass p, s, t, w hold
            # x0, x5, x10, x15 and are spilled to their slots first.
            #
            # Every step has the form  xa ^= (xb + xc) <<< n  with
            # n = 7, 9, 13, 18.  Four independent chains are interleaved;
            # the indentation of the comments tells them apart
            # (p / r,s / t / v,w from left to right).
            #
            # Round 1 of 4.  Viewing x0..x15 as a row-major 4x4 matrix,
            # this round works on the columns: (x0,x4,x8,x12),
            # (x5,x9,x13,x1), (x10,x14,x2,x6), (x15,x3,x7,x11).
203         # x0 = p
204         movl    %eax,100(%esp)
205         #                               x10 = t
206         movl    %edx,140(%esp)
207         # p += x12
208         addl    148(%esp),%eax
209         #               x5 = s
210         movl    %ecx,120(%esp)
211         #                               t += x6
212         addl    124(%esp),%edx
213         #                                               x15 = w
214         movl    %ebx,160(%esp)
215         #               r = x1
216         movl    104(%esp),%esi
217         #               r += s
218         add     %ecx,%esi
219         #                                               v = x11
220         movl    144(%esp),%edi
221         #                                               v += w
222         add     %ebx,%edi
223         # p <<<= 7
224         rol     $7,%eax
225         # p ^= x4
226         xorl    116(%esp),%eax
227         #                               t <<<= 7
228         rol     $7,%edx
229         #                               t ^= x14
230         xorl    156(%esp),%edx
231         #               r <<<= 7
232         rol     $7,%esi
233         #               r ^= x9
234         xorl    136(%esp),%esi
235         #                                               v <<<= 7
236         rol     $7,%edi
237         #                                               v ^= x3
238         xorl    112(%esp),%edi
239         # x4 = p
240         movl    %eax,116(%esp)
241         #                               x14 = t
242         movl    %edx,156(%esp)
243         # p += x0
244         addl    100(%esp),%eax
245         #               x9 = r
246         movl    %esi,136(%esp)
247         #                               t += x10
248         addl    140(%esp),%edx
249         #                                               x3 = v
250         movl    %edi,112(%esp)
251         # p <<<= 9
252         rol     $9,%eax
253         # p ^= x8
254         xorl    132(%esp),%eax
255         #                               t <<<= 9
256         rol     $9,%edx
257         #                               t ^= x2
258         xorl    108(%esp),%edx
259         #               s += r
260         add     %esi,%ecx
261         #               s <<<= 9
262         rol     $9,%ecx
263         #               s ^= x13
264         xorl    152(%esp),%ecx
265         #                                               w += v
266         add     %edi,%ebx
267         #                                               w <<<= 9
268         rol     $9,%ebx
269         #                                               w ^= x7
270         xorl    128(%esp),%ebx
271         # x8 = p
272         movl    %eax,132(%esp)
273         #                               x2 = t
274         movl    %edx,108(%esp)
275         # p += x4
276         addl    116(%esp),%eax
277         #               x13 = s
278         movl    %ecx,152(%esp)
279         #                               t += x14
280         addl    156(%esp),%edx
281         #                                               x7 = w
282         movl    %ebx,128(%esp)
283         # p <<<= 13
284         rol     $13,%eax
285         # p ^= x12
286         xorl    148(%esp),%eax
287         #                               t <<<= 13
288         rol     $13,%edx
289         #                               t ^= x6
290         xorl    124(%esp),%edx
291         #               r += s
292         add     %ecx,%esi
293         #               r <<<= 13
294         rol     $13,%esi
295         #               r ^= x1
296         xorl    104(%esp),%esi
297         #                                               v += w
298         add     %ebx,%edi
299         #                                               v <<<= 13
300         rol     $13,%edi
301         #                                               v ^= x11
302         xorl    144(%esp),%edi
303         # x12 = p
304         movl    %eax,148(%esp)
305         #                               x6 = t
306         movl    %edx,124(%esp)
307         # p += x8
308         addl    132(%esp),%eax
309         #               x1 = r
310         movl    %esi,104(%esp)
311         #                               t += x2
312         addl    108(%esp),%edx
313         #                                               x11 = v
314         movl    %edi,144(%esp)
315         # p <<<= 18
316         rol     $18,%eax
317         # p ^= x0
318         xorl    100(%esp),%eax
319         #                               t <<<= 18
320         rol     $18,%edx
321         #                               t ^= x10
322         xorl    140(%esp),%edx
323         #               s += r
324         add     %esi,%ecx
325         #               s <<<= 18
326         rol     $18,%ecx
327         #               s ^= x5
328         xorl    120(%esp),%ecx
329         #                                               w += v
330         add     %edi,%ebx
331         #                                               w <<<= 18
332         rol     $18,%ebx
333         #                                               w ^= x15
334         xorl    160(%esp),%ebx
            # Round 2 of 4.  Same step pattern applied to the rows:
            # (x0,x1,x2,x3), (x5,x6,x7,x4), (x10,x11,x8,x9),
            # (x15,x12,x13,x14).
335         # x0 = p
336         movl    %eax,100(%esp)
337         #                               x10 = t
338         movl    %edx,140(%esp)
339         # p += x3
340         addl    112(%esp),%eax
341         # p <<<= 7
342         rol     $7,%eax
343         #               x5 = s
344         movl    %ecx,120(%esp)
345         #                               t += x9
346         addl    136(%esp),%edx
347         #                                               x15 = w
348         movl    %ebx,160(%esp)
349         #               r = x4
350         movl    116(%esp),%esi
351         #               r += s
352         add     %ecx,%esi
353         #                                               v = x14
354         movl    156(%esp),%edi
355         #                                               v += w
356         add     %ebx,%edi
357         # p ^= x1
358         xorl    104(%esp),%eax
359         #                               t <<<= 7
360         rol     $7,%edx
361         #                               t ^= x11
362         xorl    144(%esp),%edx
363         #               r <<<= 7
364         rol     $7,%esi
365         #               r ^= x6
366         xorl    124(%esp),%esi
367         #                                               v <<<= 7
368         rol     $7,%edi
369         #                                               v ^= x12
370         xorl    148(%esp),%edi
371         # x1 = p
372         movl    %eax,104(%esp)
373         #                               x11 = t
374         movl    %edx,144(%esp)
375         # p += x0
376         addl    100(%esp),%eax
377         #               x6 = r
378         movl    %esi,124(%esp)
379         #                               t += x10
380         addl    140(%esp),%edx
381         #                                               x12 = v
382         movl    %edi,148(%esp)
383         # p <<<= 9
384         rol     $9,%eax
385         # p ^= x2
386         xorl    108(%esp),%eax
387         #                               t <<<= 9
388         rol     $9,%edx
389         #                               t ^= x8
390         xorl    132(%esp),%edx
391         #               s += r
392         add     %esi,%ecx
393         #               s <<<= 9
394         rol     $9,%ecx
395         #               s ^= x7
396         xorl    128(%esp),%ecx
397         #                                               w += v
398         add     %edi,%ebx
399         #                                               w <<<= 9
400         rol     $9,%ebx
401         #                                               w ^= x13
402         xorl    152(%esp),%ebx
403         # x2 = p
404         movl    %eax,108(%esp)
405         #                               x8 = t
406         movl    %edx,132(%esp)
407         # p += x1
408         addl    104(%esp),%eax
409         #               x7 = s
410         movl    %ecx,128(%esp)
411         #                               t += x11
412         addl    144(%esp),%edx
413         #                                               x13 = w
414         movl    %ebx,152(%esp)
415         # p <<<= 13
416         rol     $13,%eax
417         # p ^= x3
418         xorl    112(%esp),%eax
419         #                               t <<<= 13
420         rol     $13,%edx
421         #                               t ^= x9
422         xorl    136(%esp),%edx
423         #               r += s
424         add     %ecx,%esi
425         #               r <<<= 13
426         rol     $13,%esi
427         #               r ^= x4
428         xorl    116(%esp),%esi
429         #                                               v += w
430         add     %ebx,%edi
431         #                                               v <<<= 13
432         rol     $13,%edi
433         #                                               v ^= x14
434         xorl    156(%esp),%edi
435         # x3 = p
436         movl    %eax,112(%esp)
437         #                               x9 = t
438         movl    %edx,136(%esp)
439         # p += x2
440         addl    108(%esp),%eax
441         #               x4 = r
442         movl    %esi,116(%esp)
443         #                               t += x8
444         addl    132(%esp),%edx
445         #                                               x14 = v
446         movl    %edi,156(%esp)
447         # p <<<= 18
448         rol     $18,%eax
449         # p ^= x0
450         xorl    100(%esp),%eax
451         #                               t <<<= 18
452         rol     $18,%edx
453         #                               t ^= x10
454         xorl    140(%esp),%edx
455         #               s += r
456         add     %esi,%ecx
457         #               s <<<= 18
458         rol     $18,%ecx
459         #               s ^= x5
460         xorl    120(%esp),%ecx
461         #                                               w += v
462         add     %edi,%ebx
463         #                                               w <<<= 18
464         rol     $18,%ebx
465         #                                               w ^= x15
466         xorl    160(%esp),%ebx
            # Round 3 of 4.  Columns again (same sequence as round 1).
467         # x0 = p
468         movl    %eax,100(%esp)
469         #                               x10 = t
470         movl    %edx,140(%esp)
471         # p += x12
472         addl    148(%esp),%eax
473         #               x5 = s
474         movl    %ecx,120(%esp)
475         #                               t += x6
476         addl    124(%esp),%edx
477         #                                               x15 = w
478         movl    %ebx,160(%esp)
479         #               r = x1
480         movl    104(%esp),%esi
481         #               r += s
482         add     %ecx,%esi
483         #                                               v = x11
484         movl    144(%esp),%edi
485         #                                               v += w
486         add     %ebx,%edi
487         # p <<<= 7
488         rol     $7,%eax
489         # p ^= x4
490         xorl    116(%esp),%eax
491         #                               t <<<= 7
492         rol     $7,%edx
493         #                               t ^= x14
494         xorl    156(%esp),%edx
495         #               r <<<= 7
496         rol     $7,%esi
497         #               r ^= x9
498         xorl    136(%esp),%esi
499         #                                               v <<<= 7
500         rol     $7,%edi
501         #                                               v ^= x3
502         xorl    112(%esp),%edi
503         # x4 = p
504         movl    %eax,116(%esp)
505         #                               x14 = t
506         movl    %edx,156(%esp)
507         # p += x0
508         addl    100(%esp),%eax
509         #               x9 = r
510         movl    %esi,136(%esp)
511         #                               t += x10
512         addl    140(%esp),%edx
513         #                                               x3 = v
514         movl    %edi,112(%esp)
515         # p <<<= 9
516         rol     $9,%eax
517         # p ^= x8
518         xorl    132(%esp),%eax
519         #                               t <<<= 9
520         rol     $9,%edx
521         #                               t ^= x2
522         xorl    108(%esp),%edx
523         #               s += r
524         add     %esi,%ecx
525         #               s <<<= 9
526         rol     $9,%ecx
527         #               s ^= x13
528         xorl    152(%esp),%ecx
529         #                                               w += v
530         add     %edi,%ebx
531         #                                               w <<<= 9
532         rol     $9,%ebx
533         #                                               w ^= x7
534         xorl    128(%esp),%ebx
535         # x8 = p
536         movl    %eax,132(%esp)
537         #                               x2 = t
538         movl    %edx,108(%esp)
539         # p += x4
540         addl    116(%esp),%eax
541         #               x13 = s
542         movl    %ecx,152(%esp)
543         #                               t += x14
544         addl    156(%esp),%edx
545         #                                               x7 = w
546         movl    %ebx,128(%esp)
547         # p <<<= 13
548         rol     $13,%eax
549         # p ^= x12
550         xorl    148(%esp),%eax
551         #                               t <<<= 13
552         rol     $13,%edx
553         #                               t ^= x6
554         xorl    124(%esp),%edx
555         #               r += s
556         add     %ecx,%esi
557         #               r <<<= 13
558         rol     $13,%esi
559         #               r ^= x1
560         xorl    104(%esp),%esi
561         #                                               v += w
562         add     %ebx,%edi
563         #                                               v <<<= 13
564         rol     $13,%edi
565         #                                               v ^= x11
566         xorl    144(%esp),%edi
567         # x12 = p
568         movl    %eax,148(%esp)
569         #                               x6 = t
570         movl    %edx,124(%esp)
571         # p += x8
572         addl    132(%esp),%eax
573         #               x1 = r
574         movl    %esi,104(%esp)
575         #                               t += x2
576         addl    108(%esp),%edx
577         #                                               x11 = v
578         movl    %edi,144(%esp)
579         # p <<<= 18
580         rol     $18,%eax
581         # p ^= x0
582         xorl    100(%esp),%eax
583         #                               t <<<= 18
584         rol     $18,%edx
585         #                               t ^= x10
586         xorl    140(%esp),%edx
587         #               s += r
588         add     %esi,%ecx
589         #               s <<<= 18
590         rol     $18,%ecx
591         #               s ^= x5
592         xorl    120(%esp),%ecx
593         #                                               w += v
594         add     %edi,%ebx
595         #                                               w <<<= 18
596         rol     $18,%ebx
597         #                                               w ^= x15
598         xorl    160(%esp),%ebx
            # Round 4 of 4.  Rows again (same sequence as round 2).
599         # x0 = p
600         movl    %eax,100(%esp)
601         #                               x10 = t
602         movl    %edx,140(%esp)
603         # p += x3
604         addl    112(%esp),%eax
605         # p <<<= 7
606         rol     $7,%eax
607         #               x5 = s
608         movl    %ecx,120(%esp)
609         #                               t += x9
610         addl    136(%esp),%edx
611         #                                               x15 = w
612         movl    %ebx,160(%esp)
613         #               r = x4
614         movl    116(%esp),%esi
615         #               r += s
616         add     %ecx,%esi
617         #                                               v = x14
618         movl    156(%esp),%edi
619         #                                               v += w
620         add     %ebx,%edi
621         # p ^= x1
622         xorl    104(%esp),%eax
623         #                               t <<<= 7
624         rol     $7,%edx
625         #                               t ^= x11
626         xorl    144(%esp),%edx
627         #               r <<<= 7
628         rol     $7,%esi
629         #               r ^= x6
630         xorl    124(%esp),%esi
631         #                                               v <<<= 7
632         rol     $7,%edi
633         #                                               v ^= x12
634         xorl    148(%esp),%edi
635         # x1 = p
636         movl    %eax,104(%esp)
637         #                               x11 = t
638         movl    %edx,144(%esp)
639         # p += x0
640         addl    100(%esp),%eax
641         #               x6 = r
642         movl    %esi,124(%esp)
643         #                               t += x10
644         addl    140(%esp),%edx
645         #                                               x12 = v
646         movl    %edi,148(%esp)
647         # p <<<= 9
648         rol     $9,%eax
649         # p ^= x2
650         xorl    108(%esp),%eax
651         #                               t <<<= 9
652         rol     $9,%edx
653         #                               t ^= x8
654         xorl    132(%esp),%edx
655         #               s += r
656         add     %esi,%ecx
657         #               s <<<= 9
658         rol     $9,%ecx
659         #               s ^= x7
660         xorl    128(%esp),%ecx
661         #                                               w += v
662         add     %edi,%ebx
663         #                                               w <<<= 9
664         rol     $9,%ebx
665         #                                               w ^= x13
666         xorl    152(%esp),%ebx
667         # x2 = p
668         movl    %eax,108(%esp)
669         #                               x8 = t
670         movl    %edx,132(%esp)
671         # p += x1
672         addl    104(%esp),%eax
673         #               x7 = s
674         movl    %ecx,128(%esp)
675         #                               t += x11
676         addl    144(%esp),%edx
677         #                                               x13 = w
678         movl    %ebx,152(%esp)
679         # p <<<= 13
680         rol     $13,%eax
681         # p ^= x3
682         xorl    112(%esp),%eax
683         #                               t <<<= 13
684         rol     $13,%edx
685         #                               t ^= x9
686         xorl    136(%esp),%edx
687         #               r += s
688         add     %ecx,%esi
689         #               r <<<= 13
690         rol     $13,%esi
691         #               r ^= x4
692         xorl    116(%esp),%esi
693         #                                               v += w
694         add     %ebx,%edi
695         #                                               v <<<= 13
696         rol     $13,%edi
697         #                                               v ^= x14
698         xorl    156(%esp),%edi
699         # x3 = p
700         movl    %eax,112(%esp)
701         #                               x9 = t
702         movl    %edx,136(%esp)
703         # p += x2
704         addl    108(%esp),%eax
705         #               x4 = r
706         movl    %esi,116(%esp)
707         #                               t += x8
708         addl    132(%esp),%edx
709         #                                               x14 = v
710         movl    %edi,156(%esp)
711         # p <<<= 18
712         rol     $18,%eax
713         # p ^= x0
714         xorl    100(%esp),%eax
715         #                               t <<<= 18
716         rol     $18,%edx
717         #                               t ^= x10
718         xorl    140(%esp),%edx
719         #               s += r
720         add     %esi,%ecx
721         #               s <<<= 18
722         rol     $18,%ecx
723         #               s ^= x5
724         xorl    120(%esp),%ecx
725         #                                               w += v
726         add     %edi,%ebx
727         #                                               w <<<= 18
728         rol     $18,%ebx
729         #                                               w ^= x15
730         xorl    160(%esp),%ebx
731         # i -= 4 (four rounds were done in this pass)
732         sub     $4,%ebp
733         # goto mainloop if unsigned > (i.e. while i is still nonzero)
734         ja      ._mainloop
735         # x0 = p (rounds finished: spill p, s, t, w to x0, x5, x10, x15)
736         movl    %eax,100(%esp)
737         # x5 = s
738         movl    %ecx,120(%esp)
739         # x10 = t
740         movl    %edx,140(%esp)
741         # x15 = w
742         movl    %ebx,160(%esp)
            # Produce 64 output bytes.  For each word k = 0..15, two at a
            # time through eax and ecx:
            #   out[k] = (xk + jk) ^ m[k]
            # out and m are reloaded from the backups taken before the
            # rounds; for a short final block both point at tmp.
743         #   out = out_backup
744         movl    72(%esp),%edi
745         #   m = m_backup
746         movl    68(%esp),%esi
747         #   in0 = x0
748         movl    100(%esp),%eax
749         #   in1 = x1
750         movl    104(%esp),%ecx
751         #   in0 += j0
752         addl    164(%esp),%eax
753         #   in1 += j1
754         addl    168(%esp),%ecx
755         #   in0 ^= *(uint32 *) (m + 0)
756         xorl    0(%esi),%eax
757         #   in1 ^= *(uint32 *) (m + 4)
758         xorl    4(%esi),%ecx
759         #   *(uint32 *) (out + 0) = in0
760         movl    %eax,0(%edi)
761         #   *(uint32 *) (out + 4) = in1
762         movl    %ecx,4(%edi)
763         #   in2 = x2
764         movl    108(%esp),%eax
765         #   in3 = x3
766         movl    112(%esp),%ecx
767         #   in2 += j2
768         addl    172(%esp),%eax
769         #   in3 += j3
770         addl    176(%esp),%ecx
771         #   in2 ^= *(uint32 *) (m + 8)
772         xorl    8(%esi),%eax
773         #   in3 ^= *(uint32 *) (m + 12)
774         xorl    12(%esi),%ecx
775         #   *(uint32 *) (out + 8) = in2
776         movl    %eax,8(%edi)
777         #   *(uint32 *) (out + 12) = in3
778         movl    %ecx,12(%edi)
779         #   in4 = x4
780         movl    116(%esp),%eax
781         #   in5 = x5
782         movl    120(%esp),%ecx
783         #   in4 += j4
784         addl    180(%esp),%eax
785         #   in5 += j5
786         addl    184(%esp),%ecx
787         #   in4 ^= *(uint32 *) (m + 16)
788         xorl    16(%esi),%eax
789         #   in5 ^= *(uint32 *) (m + 20)
790         xorl    20(%esi),%ecx
791         #   *(uint32 *) (out + 16) = in4
792         movl    %eax,16(%edi)
793         #   *(uint32 *) (out + 20) = in5
794         movl    %ecx,20(%edi)
795         #   in6 = x6
796         movl    124(%esp),%eax
797         #   in7 = x7
798         movl    128(%esp),%ecx
799         #   in6 += j6
800         addl    188(%esp),%eax
801         #   in7 += j7
802         addl    192(%esp),%ecx
803         #   in6 ^= *(uint32 *) (m + 24)
804         xorl    24(%esi),%eax
805         #   in7 ^= *(uint32 *) (m + 28)
806         xorl    28(%esi),%ecx
807         #   *(uint32 *) (out + 24) = in6
808         movl    %eax,24(%edi)
809         #   *(uint32 *) (out + 28) = in7
810         movl    %ecx,28(%edi)
811         #   in8 = x8
812         movl    132(%esp),%eax
813         #   in9 = x9
814         movl    136(%esp),%ecx
815         #   in8 += j8
816         addl    196(%esp),%eax
817         #   in9 += j9
818         addl    200(%esp),%ecx
819         #   in8 ^= *(uint32 *) (m + 32)
820         xorl    32(%esi),%eax
821         #   in9 ^= *(uint32 *) (m + 36)
822         xorl    36(%esi),%ecx
823         #   *(uint32 *) (out + 32) = in8
824         movl    %eax,32(%edi)
825         #   *(uint32 *) (out + 36) = in9
826         movl    %ecx,36(%edi)
827         #   in10 = x10
828         movl    140(%esp),%eax
829         #   in11 = x11
830         movl    144(%esp),%ecx
831         #   in10 += j10
832         addl    204(%esp),%eax
833         #   in11 += j11
834         addl    208(%esp),%ecx
835         #   in10 ^= *(uint32 *) (m + 40)
836         xorl    40(%esi),%eax
837         #   in11 ^= *(uint32 *) (m + 44)
838         xorl    44(%esi),%ecx
839         #   *(uint32 *) (out + 40) = in10
840         movl    %eax,40(%edi)
841         #   *(uint32 *) (out + 44) = in11
842         movl    %ecx,44(%edi)
843         #   in12 = x12
844         movl    148(%esp),%eax
845         #   in13 = x13
846         movl    152(%esp),%ecx
847         #   in12 += j12
848         addl    212(%esp),%eax
849         #   in13 += j13
850         addl    216(%esp),%ecx
851         #   in12 ^= *(uint32 *) (m + 48)
852         xorl    48(%esi),%eax
853         #   in13 ^= *(uint32 *) (m + 52)
854         xorl    52(%esi),%ecx
855         #   *(uint32 *) (out + 48) = in12
856         movl    %eax,48(%edi)
857         #   *(uint32 *) (out + 52) = in13
858         movl    %ecx,52(%edi)
859         #   in14 = x14
860         movl    156(%esp),%eax
861         #   in15 = x15
862         movl    160(%esp),%ecx
863         #   in14 += j14
864         addl    220(%esp),%eax
865         #   in15 += j15
866         addl    224(%esp),%ecx
867         #   in14 ^= *(uint32 *) (m + 56)
868         xorl    56(%esi),%eax
869         #   in15 ^= *(uint32 *) (m + 60)
870         xorl    60(%esi),%ecx
871         #   *(uint32 *) (out + 56) = in14
872         movl    %eax,56(%edi)
873         #   *(uint32 *) (out + 60) = in15
874         movl    %ecx,60(%edi)
875         #   bytes = bytes_backup
876         movl    76(%esp),%ebx
            # Advance the block counter: j9:j8 is incremented as one
            # 64-bit value (add sets the carry that adc consumes).
877         #   in8 = j8
878         movl    196(%esp),%eax
879         #   in9 = j9
880         movl    200(%esp),%ecx
881         #   in8 += 1
882         add     $1,%eax
883         #   in9 += 0 + carry
884         adc     $0,%ecx
885         #   j8 = in8
886         movl    %eax,196(%esp)
887         #   j9 = in9
888         movl    %ecx,200(%esp)
            # Three-way dispatch on the byte count this block started with.
            # Both jumps test the flags of the single cmp:
            #   above 64   -> more blocks follow
            #   equal 64   -> this was the last, full block
            #   below 64   -> short block: the result sits in tmp, so copy
            #                 its first `bytes` bytes to the real destination
889         #   bytes - 64
890         cmp     $64,%ebx
891         #   goto bytesatleast65 if unsigned>
892         ja      ._bytesatleast65
893         #     goto bytesatleast64 if unsigned>=
894         jae     ._bytesatleast64
895         #       m = out (out is &tmp here)
896         mov     %edi,%esi
897         #       out = ctarget
898         movl    228(%esp),%edi
899         #       i = bytes
900         mov     %ebx,%ecx
901         #       while (i) { *out++ = *m++; --i }
902         rep     movsb
903 ._bytesatleast64:
904         #     x = x_backup
905         movl    64(%esp),%eax
906         #     in8 = j8
907         movl    196(%esp),%ecx
908         #     in9 = j9
909         movl    200(%esp),%edx
910         #     *(uint32 *) (x + 32) = in8
911         movl    %ecx,32(%eax)
912         #     *(uint32 *) (x + 36) = in9
913         movl    %edx,36(%eax)
914 ._done:
915         #     eax = eax_stack
916         movl    80(%esp),%eax
917         #     ebx = ebx_stack
918         movl    84(%esp),%ebx
919         #     esi = esi_stack
920         movl    88(%esp),%esi
921         #     edi = edi_stack
922         movl    92(%esp),%edi
923         #     ebp = ebp_stack
924         movl    96(%esp),%ebp
925         #     leave
926         add     %eax,%esp
927         ret
928 ._bytesatleast65:
929         #   bytes -= 64
930         sub     $64,%ebx
931         #   out += 64
932         add     $64,%edi
933         #   m += 64
934         add     $64,%esi
935         # goto bytesatleast1
936         jmp     ._bytesatleast1
937 ENDPROC(salsa20_encrypt_bytes)
938
# salsa20_keysetup - copy key material and constants into the state
#
# Arguments are read from the stack above the return address:
#   arg1  x      base of the state, written as 32-bit words
#   arg2  k      key bytes, read as 32-bit words
#   arg3  kbits  compared unsigned against 256 to pick the key layout
#
# Result, as word indices into x:
#   x[1..4]    = k[0..3]
#   x[11..14]  = k[4..7] when kbits >= 256, otherwise k[0..3] again
#   x[0], x[5], x[10], x[15] = four constants; read as little-endian
#                bytes they spell expand 32-byte k (kbits >= 256) or
#                expand 16-byte k (kbits below 256)
# No other word of x is written.
#
# Registers ebx, esi, edi and ebp serve as scratch and are reloaded from
# the frame before returning; eax leaves holding the frame size.
ENTRY(salsa20_keysetup)
        # Open a frame of 256 + (esp mod 32) bytes, which puts esp on a
        # boundary that is a multiple of 32. The size stays in eax for
        # addressing the arguments and is also spilled for the epilogue.
        movl    %esp,%eax
        andl    $31,%eax
        addl    $256,%eax
        subl    %eax,%esp
        movl    %ebp,80(%esp)
        movl    %edi,76(%esp)
        movl    %esi,72(%esp)
        movl    %ebx,68(%esp)
        movl    %eax,64(%esp)
        # esp + eax is the entry esp, so the arguments sit 4, 8 and 12
        # bytes above it. Loading x overwrites the frame size in eax,
        # which is why x is fetched last.
        movl    12(%esp,%eax),%ecx
        movl    8(%esp,%eax),%edx
        movl    4(%esp,%eax),%eax
        # From here on: eax = x, edx = k, ecx = kbits.
        # x[1..4] = k[0..3], with all four loads ahead of the first store.
        movl    0(%edx),%ebp
        movl    4(%edx),%edi
        movl    8(%edx),%esi
        movl    12(%edx),%ebx
        movl    %ebp,4(%eax)
        movl    %edi,8(%eax)
        movl    %esi,12(%eax)
        movl    %ebx,16(%eax)
        # Unsigned test: kbits >= 256 selects the long-key layout.
        cmpl    $256,%ecx
        jae     .Lkeysetup_k256
        # Short-key layout: x[11..14] = k[0..3], read from k a second time.
        # The last load reuses edx, so the k pointer is gone afterwards.
        movl    0(%edx),%ecx
        movl    4(%edx),%ebx
        movl    8(%edx),%esi
        movl    12(%edx),%edx
        movl    %ecx,44(%eax)
        movl    %ebx,48(%eax)
        movl    %esi,52(%eax)
        movl    %edx,56(%eax)
        # x[5] = 824206446, x[10] = 2036477238
        movl    $0x3120646e,20(%eax)
        movl    $0x79622d36,40(%eax)
        jmp     .Lkeysetup_out
.Lkeysetup_k256:
        # Long-key layout: x[11..14] = k[4..7].
        movl    16(%edx),%ecx
        movl    20(%edx),%ebx
        movl    24(%edx),%esi
        movl    28(%edx),%edx
        movl    %ecx,44(%eax)
        movl    %ebx,48(%eax)
        movl    %esi,52(%eax)
        movl    %edx,56(%eax)
        # x[5] = 857760878, x[10] = 2036477234
        movl    $0x3320646e,20(%eax)
        movl    $0x79622d32,40(%eax)
.Lkeysetup_out:
        # x[0] = 1634760805 and x[15] = 1797285236 in both layouts.
        movl    $0x61707865,0(%eax)
        movl    $0x6b206574,60(%eax)
        # Reload the spilled registers; eax gets the frame size back so
        # the final add returns esp to its entry value.
        movl    64(%esp),%eax
        movl    68(%esp),%ebx
        movl    72(%esp),%esi
        movl    76(%esp),%edi
        movl    80(%esp),%ebp
        addl    %eax,%esp
        ret
ENDPROC(salsa20_keysetup)
1064
# salsa20_ivsetup - store the IV words and zero the two words after them
#
# Arguments are read from the stack above the return address:
#   arg1  x   base of the state, written as 32-bit words
#   arg2  iv  two 32-bit words are read from it
#
# Effect: x+24 = iv word 0, x+28 = iv word 1, x+32 = 0, x+36 = 0.
# The words at x+32 and x+36 are the pair that salsa20_encrypt_bytes
# advances with add 1 / adc 0 and stores back to x before returning.
#
# Only eax, ecx and edx are used, so nothing is spilled and esp is left
# alone. NOTE(review): eax now returns x rather than a frame size; this
# assumes callers ignore the return value - confirm against the C
# prototype.
ENTRY(salsa20_ivsetup)
        # eax = x, ecx = iv
        movl    4(%esp),%eax
        movl    8(%esp),%ecx
        # Read both iv words before the first store; the second load
        # reuses ecx, so the iv pointer is gone afterwards.
        movl    0(%ecx),%edx
        movl    4(%ecx),%ecx
        movl    %edx,24(%eax)
        movl    %ecx,28(%eax)
        # Zero the two words that follow the IV, straight from immediates.
        movl    $0,32(%eax)
        movl    $0,36(%eax)
        ret
ENDPROC(salsa20_ivsetup)