# kernel/arch/x86/crypto/salsa20-x86_64-asm_64.S - Salsa20 stream cipher, x86_64
#include <linux/linkage.h>

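# Annotation (added summary, not part of the generated code): this file
# provides the three asm entry points used by the kernel's Salsa20 glue code.
# salsa20_keysetup(x, k, kbits) writes the key and the diagonal constants into
# the 64-byte state x[], salsa20_ivsetup(x, iv) writes the 8-byte IV and
# clears the block counter, and salsa20_encrypt_bytes(x, m, out, bytes)
# generates keystream one 64-byte block at a time and XORs it over m into out.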
# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
        mov     %rsp,%r11
        and     $31,%r11
        add     $256,%r11
        sub     %r11,%rsp
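        # Annotation: the prologue carves out a scratch frame: %r11 is set to
        # (%rsp & 31) + 256, so subtracting it 32-byte aligns %rsp and leaves
        # at least 256 bytes of scratch space.  %r11 (saved as r11_stack
        # below) is added back at ._done to restore the caller's %rsp.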
        # x = arg1
        mov     %rdi,%r8
        # m = arg2
        mov     %rsi,%rsi
        # out = arg3
        mov     %rdx,%rdi
        # bytes = arg4
        mov     %rcx,%rdx
        #               unsigned>? bytes - 0
        cmp     $0,%rdx
        # comment:fp stack unchanged by jump
        # goto done if !unsigned>
        jbe     ._done
        # comment:fp stack unchanged by fallthrough
# start:
._start:
        # r11_stack = r11
        movq    %r11,0(%rsp)
        # r12_stack = r12
        movq    %r12,8(%rsp)
        # r13_stack = r13
        movq    %r13,16(%rsp)
        # r14_stack = r14
        movq    %r14,24(%rsp)
        # r15_stack = r15
        movq    %r15,32(%rsp)
        # rbx_stack = rbx
        movq    %rbx,40(%rsp)
        # rbp_stack = rbp
        movq    %rbp,48(%rsp)
        # in0 = *(uint64 *) (x + 0)
        movq    0(%r8),%rcx
        # in2 = *(uint64 *) (x + 8)
        movq    8(%r8),%r9
        # in4 = *(uint64 *) (x + 16)
        movq    16(%r8),%rax
        # in6 = *(uint64 *) (x + 24)
        movq    24(%r8),%r10
        # in8 = *(uint64 *) (x + 32)
        movq    32(%r8),%r11
        # in10 = *(uint64 *) (x + 40)
        movq    40(%r8),%r12
        # in12 = *(uint64 *) (x + 48)
        movq    48(%r8),%r13
        # in14 = *(uint64 *) (x + 56)
        movq    56(%r8),%r14
        # j0 = in0
        movq    %rcx,56(%rsp)
        # j2 = in2
        movq    %r9,64(%rsp)
        # j4 = in4
        movq    %rax,72(%rsp)
        # j6 = in6
        movq    %r10,80(%rsp)
        # j8 = in8
        movq    %r11,88(%rsp)
        # j10 = in10
        movq    %r12,96(%rsp)
        # j12 = in12
        movq    %r13,104(%rsp)
        # j14 = in14
        movq    %r14,112(%rsp)
        # x_backup = x
        movq    %r8,120(%rsp)
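        # Annotation: j0..j14 keep a copy of the eight 64-bit input words
        # (the sixteen 32-bit state words).  They are added back into the
        # working words after the rounds (the Salsa20 feed-forward), and j8
        # also serves as the 64-bit block counter updated once per block.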
# bytesatleast1:
._bytesatleast1:
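        # Annotation: each pass of this outer loop emits 64 keystream bytes.
        # When fewer than 64 message bytes remain, the input is first staged
        # in the 64-byte tmp area at 192(%rsp) and the real output pointer is
        # parked in ctarget (128(%rsp)); the partial block is copied back out
        # just before ._bytesatleast64.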
        #                   unsigned<? bytes - 64
        cmp     $64,%rdx
        # comment:fp stack unchanged by jump
        #   goto nocopy if !unsigned<
        jae     ._nocopy
        #     ctarget = out
        movq    %rdi,128(%rsp)
        #     out = &tmp
        leaq    192(%rsp),%rdi
        #     i = bytes
        mov     %rdx,%rcx
        #     while (i) { *out++ = *m++; --i }
        rep     movsb
        #     out = &tmp
        leaq    192(%rsp),%rdi
        #     m = &tmp
        leaq    192(%rsp),%rsi
        # comment:fp stack unchanged by fallthrough
#   nocopy:
._nocopy:
        #   out_backup = out
        movq    %rdi,136(%rsp)
        #   m_backup = m
        movq    %rsi,144(%rsp)
        #   bytes_backup = bytes
        movq    %rdx,152(%rsp)
        #   x1 = j0
        movq    56(%rsp),%rdi
        #   x0 = x1
        mov     %rdi,%rdx
        #   (uint64) x1 >>= 32
        shr     $32,%rdi
        #               x3 = j2
        movq    64(%rsp),%rsi
        #               x2 = x3
        mov     %rsi,%rcx
        #               (uint64) x3 >>= 32
        shr     $32,%rsi
        #   x5 = j4
        movq    72(%rsp),%r8
        #   x4 = x5
        mov     %r8,%r9
        #   (uint64) x5 >>= 32
        shr     $32,%r8
        #   x5_stack = x5
        movq    %r8,160(%rsp)
        #               x7 = j6
        movq    80(%rsp),%r8
        #               x6 = x7
        mov     %r8,%rax
        #               (uint64) x7 >>= 32
        shr     $32,%r8
        #   x9 = j8
        movq    88(%rsp),%r10
        #   x8 = x9
        mov     %r10,%r11
        #   (uint64) x9 >>= 32
        shr     $32,%r10
        #               x11 = j10
        movq    96(%rsp),%r12
        #               x10 = x11
        mov     %r12,%r13
        #               x10_stack = x10
        movq    %r13,168(%rsp)
        #               (uint64) x11 >>= 32
        shr     $32,%r12
        #   x13 = j12
        movq    104(%rsp),%r13
        #   x12 = x13
        mov     %r13,%r14
        #   (uint64) x13 >>= 32
        shr     $32,%r13
        #               x15 = j14
        movq    112(%rsp),%r15
        #               x14 = x15
        mov     %r15,%rbx
        #               (uint64) x15 >>= 32
        shr     $32,%r15
        #               x15_stack = x15
        movq    %r15,176(%rsp)
        #   i = 20
        mov     $20,%r15
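        # Annotation: each 64-bit j word has just been split into two 32-bit
        # working words x0..x15; x5, x10 and x15 overflow into stack slots
        # because the round function needs more words than there are free
        # registers.  i counts remaining rounds: the unrolled body below runs
        # two double-rounds (four rounds) per pass, so i falls from 20 to 0
        # in steps of 4.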
#   mainloop:
._mainloop:
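        # Annotation: every lea/rol/xor triple below is one step of a Salsa20
        # quarter-round.  As a C-like sketch, a quarter-round on words
        # (a, b, c, d) is:
        #       b ^= rol32(a + d, 7);
        #       c ^= rol32(b + a, 9);
        #       d ^= rol32(c + b, 13);
        #       a ^= rol32(d + c, 18);
        # The first half of the body applies this down the columns of the
        # 4x4 state, the second half across the rows (together one
        # double-round), and the body then repeats both once more.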
        #   i_backup = i
        movq    %r15,184(%rsp)
        #               x5 = x5_stack
        movq    160(%rsp),%r15
        # a = x12 + x0
        lea     (%r14,%rdx),%rbp
        # (uint32) a <<<= 7
        rol     $7,%ebp
        # x4 ^= a
        xor     %rbp,%r9
        #               b = x1 + x5
        lea     (%rdi,%r15),%rbp
        #               (uint32) b <<<= 7
        rol     $7,%ebp
        #               x9 ^= b
        xor     %rbp,%r10
        # a = x0 + x4
        lea     (%rdx,%r9),%rbp
        # (uint32) a <<<= 9
        rol     $9,%ebp
        # x8 ^= a
        xor     %rbp,%r11
        #               b = x5 + x9
        lea     (%r15,%r10),%rbp
        #               (uint32) b <<<= 9
        rol     $9,%ebp
        #               x13 ^= b
        xor     %rbp,%r13
        # a = x4 + x8
        lea     (%r9,%r11),%rbp
        # (uint32) a <<<= 13
        rol     $13,%ebp
        # x12 ^= a
        xor     %rbp,%r14
        #               b = x9 + x13
        lea     (%r10,%r13),%rbp
        #               (uint32) b <<<= 13
        rol     $13,%ebp
        #               x1 ^= b
        xor     %rbp,%rdi
        # a = x8 + x12
        lea     (%r11,%r14),%rbp
        # (uint32) a <<<= 18
        rol     $18,%ebp
        # x0 ^= a
        xor     %rbp,%rdx
        #               b = x13 + x1
        lea     (%r13,%rdi),%rbp
        #               (uint32) b <<<= 18
        rol     $18,%ebp
        #               x5 ^= b
        xor     %rbp,%r15
        #                               x10 = x10_stack
        movq    168(%rsp),%rbp
        #               x5_stack = x5
        movq    %r15,160(%rsp)
        #                               c = x6 + x10
        lea     (%rax,%rbp),%r15
        #                               (uint32) c <<<= 7
        rol     $7,%r15d
        #                               x14 ^= c
        xor     %r15,%rbx
        #                               c = x10 + x14
        lea     (%rbp,%rbx),%r15
        #                               (uint32) c <<<= 9
        rol     $9,%r15d
        #                               x2 ^= c
        xor     %r15,%rcx
        #                               c = x14 + x2
        lea     (%rbx,%rcx),%r15
        #                               (uint32) c <<<= 13
        rol     $13,%r15d
        #                               x6 ^= c
        xor     %r15,%rax
        #                               c = x2 + x6
        lea     (%rcx,%rax),%r15
        #                               (uint32) c <<<= 18
        rol     $18,%r15d
        #                               x10 ^= c
        xor     %r15,%rbp
        #                                               x15 = x15_stack
        movq    176(%rsp),%r15
        #                               x10_stack = x10
        movq    %rbp,168(%rsp)
        #                                               d = x11 + x15
        lea     (%r12,%r15),%rbp
        #                                               (uint32) d <<<= 7
        rol     $7,%ebp
        #                                               x3 ^= d
        xor     %rbp,%rsi
        #                                               d = x15 + x3
        lea     (%r15,%rsi),%rbp
        #                                               (uint32) d <<<= 9
        rol     $9,%ebp
        #                                               x7 ^= d
        xor     %rbp,%r8
        #                                               d = x3 + x7
        lea     (%rsi,%r8),%rbp
        #                                               (uint32) d <<<= 13
        rol     $13,%ebp
        #                                               x11 ^= d
        xor     %rbp,%r12
        #                                               d = x7 + x11
        lea     (%r8,%r12),%rbp
        #                                               (uint32) d <<<= 18
        rol     $18,%ebp
        #                                               x15 ^= d
        xor     %rbp,%r15
        #                                               x15_stack = x15
        movq    %r15,176(%rsp)
        #               x5 = x5_stack
        movq    160(%rsp),%r15
        # a = x3 + x0
        lea     (%rsi,%rdx),%rbp
        # (uint32) a <<<= 7
        rol     $7,%ebp
        # x1 ^= a
        xor     %rbp,%rdi
        #               b = x4 + x5
        lea     (%r9,%r15),%rbp
        #               (uint32) b <<<= 7
        rol     $7,%ebp
        #               x6 ^= b
        xor     %rbp,%rax
        # a = x0 + x1
        lea     (%rdx,%rdi),%rbp
        # (uint32) a <<<= 9
        rol     $9,%ebp
        # x2 ^= a
        xor     %rbp,%rcx
        #               b = x5 + x6
        lea     (%r15,%rax),%rbp
        #               (uint32) b <<<= 9
        rol     $9,%ebp
        #               x7 ^= b
        xor     %rbp,%r8
        # a = x1 + x2
        lea     (%rdi,%rcx),%rbp
        # (uint32) a <<<= 13
        rol     $13,%ebp
        # x3 ^= a
        xor     %rbp,%rsi
        #               b = x6 + x7
        lea     (%rax,%r8),%rbp
        #               (uint32) b <<<= 13
        rol     $13,%ebp
        #               x4 ^= b
        xor     %rbp,%r9
        # a = x2 + x3
        lea     (%rcx,%rsi),%rbp
        # (uint32) a <<<= 18
        rol     $18,%ebp
        # x0 ^= a
        xor     %rbp,%rdx
        #               b = x7 + x4
        lea     (%r8,%r9),%rbp
        #               (uint32) b <<<= 18
        rol     $18,%ebp
        #               x5 ^= b
        xor     %rbp,%r15
        #                               x10 = x10_stack
        movq    168(%rsp),%rbp
        #               x5_stack = x5
        movq    %r15,160(%rsp)
        #                               c = x9 + x10
        lea     (%r10,%rbp),%r15
        #                               (uint32) c <<<= 7
        rol     $7,%r15d
        #                               x11 ^= c
        xor     %r15,%r12
        #                               c = x10 + x11
        lea     (%rbp,%r12),%r15
        #                               (uint32) c <<<= 9
        rol     $9,%r15d
        #                               x8 ^= c
        xor     %r15,%r11
        #                               c = x11 + x8
        lea     (%r12,%r11),%r15
        #                               (uint32) c <<<= 13
        rol     $13,%r15d
        #                               x9 ^= c
        xor     %r15,%r10
        #                               c = x8 + x9
        lea     (%r11,%r10),%r15
        #                               (uint32) c <<<= 18
        rol     $18,%r15d
        #                               x10 ^= c
        xor     %r15,%rbp
        #                                               x15 = x15_stack
        movq    176(%rsp),%r15
        #                               x10_stack = x10
        movq    %rbp,168(%rsp)
        #                                               d = x14 + x15
        lea     (%rbx,%r15),%rbp
        #                                               (uint32) d <<<= 7
        rol     $7,%ebp
        #                                               x12 ^= d
        xor     %rbp,%r14
        #                                               d = x15 + x12
        lea     (%r15,%r14),%rbp
        #                                               (uint32) d <<<= 9
        rol     $9,%ebp
        #                                               x13 ^= d
        xor     %rbp,%r13
        #                                               d = x12 + x13
        lea     (%r14,%r13),%rbp
        #                                               (uint32) d <<<= 13
        rol     $13,%ebp
        #                                               x14 ^= d
        xor     %rbp,%rbx
        #                                               d = x13 + x14
        lea     (%r13,%rbx),%rbp
        #                                               (uint32) d <<<= 18
        rol     $18,%ebp
        #                                               x15 ^= d
        xor     %rbp,%r15
        #                                               x15_stack = x15
        movq    %r15,176(%rsp)
        #               x5 = x5_stack
        movq    160(%rsp),%r15
        # a = x12 + x0
        lea     (%r14,%rdx),%rbp
        # (uint32) a <<<= 7
        rol     $7,%ebp
        # x4 ^= a
        xor     %rbp,%r9
        #               b = x1 + x5
        lea     (%rdi,%r15),%rbp
        #               (uint32) b <<<= 7
        rol     $7,%ebp
        #               x9 ^= b
        xor     %rbp,%r10
        # a = x0 + x4
        lea     (%rdx,%r9),%rbp
        # (uint32) a <<<= 9
        rol     $9,%ebp
        # x8 ^= a
        xor     %rbp,%r11
        #               b = x5 + x9
        lea     (%r15,%r10),%rbp
        #               (uint32) b <<<= 9
        rol     $9,%ebp
        #               x13 ^= b
        xor     %rbp,%r13
        # a = x4 + x8
        lea     (%r9,%r11),%rbp
        # (uint32) a <<<= 13
        rol     $13,%ebp
        # x12 ^= a
        xor     %rbp,%r14
        #               b = x9 + x13
        lea     (%r10,%r13),%rbp
        #               (uint32) b <<<= 13
        rol     $13,%ebp
        #               x1 ^= b
        xor     %rbp,%rdi
        # a = x8 + x12
        lea     (%r11,%r14),%rbp
        # (uint32) a <<<= 18
        rol     $18,%ebp
        # x0 ^= a
        xor     %rbp,%rdx
        #               b = x13 + x1
        lea     (%r13,%rdi),%rbp
        #               (uint32) b <<<= 18
        rol     $18,%ebp
        #               x5 ^= b
        xor     %rbp,%r15
        #                               x10 = x10_stack
        movq    168(%rsp),%rbp
        #               x5_stack = x5
        movq    %r15,160(%rsp)
        #                               c = x6 + x10
        lea     (%rax,%rbp),%r15
        #                               (uint32) c <<<= 7
        rol     $7,%r15d
        #                               x14 ^= c
        xor     %r15,%rbx
        #                               c = x10 + x14
        lea     (%rbp,%rbx),%r15
        #                               (uint32) c <<<= 9
        rol     $9,%r15d
        #                               x2 ^= c
        xor     %r15,%rcx
        #                               c = x14 + x2
        lea     (%rbx,%rcx),%r15
        #                               (uint32) c <<<= 13
        rol     $13,%r15d
        #                               x6 ^= c
        xor     %r15,%rax
        #                               c = x2 + x6
        lea     (%rcx,%rax),%r15
        #                               (uint32) c <<<= 18
        rol     $18,%r15d
        #                               x10 ^= c
        xor     %r15,%rbp
        #                                               x15 = x15_stack
        movq    176(%rsp),%r15
        #                               x10_stack = x10
        movq    %rbp,168(%rsp)
        #                                               d = x11 + x15
        lea     (%r12,%r15),%rbp
        #                                               (uint32) d <<<= 7
        rol     $7,%ebp
        #                                               x3 ^= d
        xor     %rbp,%rsi
        #                                               d = x15 + x3
        lea     (%r15,%rsi),%rbp
        #                                               (uint32) d <<<= 9
        rol     $9,%ebp
        #                                               x7 ^= d
        xor     %rbp,%r8
        #                                               d = x3 + x7
        lea     (%rsi,%r8),%rbp
        #                                               (uint32) d <<<= 13
        rol     $13,%ebp
        #                                               x11 ^= d
        xor     %rbp,%r12
        #                                               d = x7 + x11
        lea     (%r8,%r12),%rbp
        #                                               (uint32) d <<<= 18
        rol     $18,%ebp
        #                                               x15 ^= d
        xor     %rbp,%r15
        #                                               x15_stack = x15
        movq    %r15,176(%rsp)
        #               x5 = x5_stack
        movq    160(%rsp),%r15
        # a = x3 + x0
        lea     (%rsi,%rdx),%rbp
        # (uint32) a <<<= 7
        rol     $7,%ebp
        # x1 ^= a
        xor     %rbp,%rdi
        #               b = x4 + x5
        lea     (%r9,%r15),%rbp
        #               (uint32) b <<<= 7
        rol     $7,%ebp
        #               x6 ^= b
        xor     %rbp,%rax
        # a = x0 + x1
        lea     (%rdx,%rdi),%rbp
        # (uint32) a <<<= 9
        rol     $9,%ebp
        # x2 ^= a
        xor     %rbp,%rcx
        #               b = x5 + x6
        lea     (%r15,%rax),%rbp
        #               (uint32) b <<<= 9
        rol     $9,%ebp
        #               x7 ^= b
        xor     %rbp,%r8
        # a = x1 + x2
        lea     (%rdi,%rcx),%rbp
        # (uint32) a <<<= 13
        rol     $13,%ebp
        # x3 ^= a
        xor     %rbp,%rsi
        #               b = x6 + x7
        lea     (%rax,%r8),%rbp
        #               (uint32) b <<<= 13
        rol     $13,%ebp
        #               x4 ^= b
        xor     %rbp,%r9
        # a = x2 + x3
        lea     (%rcx,%rsi),%rbp
        # (uint32) a <<<= 18
        rol     $18,%ebp
        # x0 ^= a
        xor     %rbp,%rdx
        #               b = x7 + x4
        lea     (%r8,%r9),%rbp
        #               (uint32) b <<<= 18
        rol     $18,%ebp
        #               x5 ^= b
        xor     %rbp,%r15
        #                               x10 = x10_stack
        movq    168(%rsp),%rbp
        #               x5_stack = x5
        movq    %r15,160(%rsp)
        #                               c = x9 + x10
        lea     (%r10,%rbp),%r15
        #                               (uint32) c <<<= 7
        rol     $7,%r15d
        #                               x11 ^= c
        xor     %r15,%r12
        #                               c = x10 + x11
        lea     (%rbp,%r12),%r15
        #                               (uint32) c <<<= 9
        rol     $9,%r15d
        #                               x8 ^= c
        xor     %r15,%r11
        #                               c = x11 + x8
        lea     (%r12,%r11),%r15
        #                               (uint32) c <<<= 13
        rol     $13,%r15d
        #                               x9 ^= c
        xor     %r15,%r10
        #                               c = x8 + x9
        lea     (%r11,%r10),%r15
        #                               (uint32) c <<<= 18
        rol     $18,%r15d
        #                               x10 ^= c
        xor     %r15,%rbp
        #                                               x15 = x15_stack
        movq    176(%rsp),%r15
        #                               x10_stack = x10
        movq    %rbp,168(%rsp)
        #                                               d = x14 + x15
        lea     (%rbx,%r15),%rbp
        #                                               (uint32) d <<<= 7
        rol     $7,%ebp
        #                                               x12 ^= d
        xor     %rbp,%r14
        #                                               d = x15 + x12
        lea     (%r15,%r14),%rbp
        #                                               (uint32) d <<<= 9
        rol     $9,%ebp
        #                                               x13 ^= d
        xor     %rbp,%r13
        #                                               d = x12 + x13
        lea     (%r14,%r13),%rbp
        #                                               (uint32) d <<<= 13
        rol     $13,%ebp
        #                                               x14 ^= d
        xor     %rbp,%rbx
        #                                               d = x13 + x14
        lea     (%r13,%rbx),%rbp
        #                                               (uint32) d <<<= 18
        rol     $18,%ebp
        #                                               x15 ^= d
        xor     %rbp,%r15
        #                                               x15_stack = x15
        movq    %r15,176(%rsp)
        #   i = i_backup
        movq    184(%rsp),%r15
        #                  unsigned>? i -= 4
        sub     $4,%r15
        # comment:fp stack unchanged by jump
        # goto mainloop if unsigned>
        ja      ._mainloop
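        # Annotation: rounds done; the feed-forward follows.  Each 64-bit
        # register packs two 32-bit state words, so the low word gets its j
        # value with a 32-bit addl (which also clears the upper half), while
        # the high word is added via shl/addq/shr/shl so no carry can spill
        # into the low word; a final 64-bit add recombines the two halves.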
        #   (uint32) x2 += j2
        addl    64(%rsp),%ecx
        #   x3 <<= 32
        shl     $32,%rsi
        #   x3 += j2
        addq    64(%rsp),%rsi
        #   (uint64) x3 >>= 32
        shr     $32,%rsi
        #   x3 <<= 32
        shl     $32,%rsi
        #   x2 += x3
        add     %rsi,%rcx
        #   (uint32) x6 += j6
        addl    80(%rsp),%eax
        #   x7 <<= 32
        shl     $32,%r8
        #   x7 += j6
        addq    80(%rsp),%r8
        #   (uint64) x7 >>= 32
        shr     $32,%r8
        #   x7 <<= 32
        shl     $32,%r8
        #   x6 += x7
        add     %r8,%rax
        #   (uint32) x8 += j8
        addl    88(%rsp),%r11d
        #   x9 <<= 32
        shl     $32,%r10
        #   x9 += j8
        addq    88(%rsp),%r10
        #   (uint64) x9 >>= 32
        shr     $32,%r10
        #   x9 <<= 32
        shl     $32,%r10
        #   x8 += x9
        add     %r10,%r11
        #   (uint32) x12 += j12
        addl    104(%rsp),%r14d
        #   x13 <<= 32
        shl     $32,%r13
        #   x13 += j12
        addq    104(%rsp),%r13
        #   (uint64) x13 >>= 32
        shr     $32,%r13
        #   x13 <<= 32
        shl     $32,%r13
        #   x12 += x13
        add     %r13,%r14
        #   (uint32) x0 += j0
        addl    56(%rsp),%edx
        #   x1 <<= 32
        shl     $32,%rdi
        #   x1 += j0
        addq    56(%rsp),%rdi
        #   (uint64) x1 >>= 32
        shr     $32,%rdi
        #   x1 <<= 32
        shl     $32,%rdi
        #   x0 += x1
        add     %rdi,%rdx
        #   x5 = x5_stack
        movq    160(%rsp),%rdi
        #   (uint32) x4 += j4
        addl    72(%rsp),%r9d
        #   x5 <<= 32
        shl     $32,%rdi
        #   x5 += j4
        addq    72(%rsp),%rdi
        #   (uint64) x5 >>= 32
        shr     $32,%rdi
        #   x5 <<= 32
        shl     $32,%rdi
        #   x4 += x5
        add     %rdi,%r9
        #   x10 = x10_stack
        movq    168(%rsp),%r8
        #   (uint32) x10 += j10
        addl    96(%rsp),%r8d
        #   x11 <<= 32
        shl     $32,%r12
        #   x11 += j10
        addq    96(%rsp),%r12
        #   (uint64) x11 >>= 32
        shr     $32,%r12
        #   x11 <<= 32
        shl     $32,%r12
        #   x10 += x11
        add     %r12,%r8
        #   x15 = x15_stack
        movq    176(%rsp),%rdi
        #   (uint32) x14 += j14
        addl    112(%rsp),%ebx
        #   x15 <<= 32
        shl     $32,%rdi
        #   x15 += j14
        addq    112(%rsp),%rdi
        #   (uint64) x15 >>= 32
        shr     $32,%rdi
        #   x15 <<= 32
        shl     $32,%rdi
        #   x14 += x15
        add     %rdi,%rbx
        #   out = out_backup
        movq    136(%rsp),%rdi
        #   m = m_backup
        movq    144(%rsp),%rsi
        #   x0 ^= *(uint64 *) (m + 0)
        xorq    0(%rsi),%rdx
        #   *(uint64 *) (out + 0) = x0
        movq    %rdx,0(%rdi)
        #   x2 ^= *(uint64 *) (m + 8)
        xorq    8(%rsi),%rcx
        #   *(uint64 *) (out + 8) = x2
        movq    %rcx,8(%rdi)
        #   x4 ^= *(uint64 *) (m + 16)
        xorq    16(%rsi),%r9
        #   *(uint64 *) (out + 16) = x4
        movq    %r9,16(%rdi)
        #   x6 ^= *(uint64 *) (m + 24)
        xorq    24(%rsi),%rax
        #   *(uint64 *) (out + 24) = x6
        movq    %rax,24(%rdi)
        #   x8 ^= *(uint64 *) (m + 32)
        xorq    32(%rsi),%r11
        #   *(uint64 *) (out + 32) = x8
        movq    %r11,32(%rdi)
        #   x10 ^= *(uint64 *) (m + 40)
        xorq    40(%rsi),%r8
        #   *(uint64 *) (out + 40) = x10
        movq    %r8,40(%rdi)
        #   x12 ^= *(uint64 *) (m + 48)
        xorq    48(%rsi),%r14
        #   *(uint64 *) (out + 48) = x12
        movq    %r14,48(%rdi)
        #   x14 ^= *(uint64 *) (m + 56)
        xorq    56(%rsi),%rbx
        #   *(uint64 *) (out + 56) = x14
        movq    %rbx,56(%rdi)
        #   bytes = bytes_backup
        movq    152(%rsp),%rdx
        #   in8 = j8
        movq    88(%rsp),%rcx
        #   in8 += 1
        add     $1,%rcx
        #   j8 = in8
        movq    %rcx,88(%rsp)
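        # Annotation: j8 is the 64-bit block counter that salsa20_ivsetup
        # cleared; adding 1 advances it by one 64-byte block, and it is
        # written back to the state at x + 32 before returning.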
        #                          unsigned>? unsigned<? bytes - 64
        cmp     $64,%rdx
        # comment:fp stack unchanged by jump
        #   goto bytesatleast65 if unsigned>
        ja      ._bytesatleast65
        # comment:fp stack unchanged by jump
        #     goto bytesatleast64 if !unsigned<
        jae     ._bytesatleast64
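        # Annotation: fewer than 64 bytes were requested, so the full block
        # just written to tmp is copied back, byte by byte, to the real
        # destination saved in ctarget before falling through to the common
        # exit path.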
        #       m = out
        mov     %rdi,%rsi
        #       out = ctarget
        movq    128(%rsp),%rdi
        #       i = bytes
        mov     %rdx,%rcx
        #       while (i) { *out++ = *m++; --i }
        rep     movsb
        # comment:fp stack unchanged by fallthrough
#     bytesatleast64:
._bytesatleast64:
        #     x = x_backup
        movq    120(%rsp),%rdi
        #     in8 = j8
        movq    88(%rsp),%rsi
        #     *(uint64 *) (x + 32) = in8
        movq    %rsi,32(%rdi)
        #     r11 = r11_stack
        movq    0(%rsp),%r11
        #     r12 = r12_stack
        movq    8(%rsp),%r12
        #     r13 = r13_stack
        movq    16(%rsp),%r13
        #     r14 = r14_stack
        movq    24(%rsp),%r14
        #     r15 = r15_stack
        movq    32(%rsp),%r15
        #     rbx = rbx_stack
        movq    40(%rsp),%rbx
        #     rbp = rbp_stack
        movq    48(%rsp),%rbp
        # comment:fp stack unchanged by fallthrough
#     done:
._done:
        #     leave
        add     %r11,%rsp
        mov     %rdi,%rax
        mov     %rsi,%rdx
        ret
#   bytesatleast65:
._bytesatleast65:
        #   bytes -= 64
        sub     $64,%rdx
        #   out += 64
        add     $64,%rdi
        #   m += 64
        add     $64,%rsi
        # comment:fp stack unchanged by jump
        # goto bytesatleast1
        jmp     ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)

# enter salsa20_keysetup
ENTRY(salsa20_keysetup)
        mov     %rsp,%r11
        and     $31,%r11
        add     $256,%r11
        sub     %r11,%rsp
        #   k = arg2
        mov     %rsi,%rsi
        #   kbits = arg3
        mov     %rdx,%rdx
        #   x = arg1
        mov     %rdi,%rdi
        #   in0 = *(uint64 *) (k + 0)
        movq    0(%rsi),%r8
        #   in2 = *(uint64 *) (k + 8)
        movq    8(%rsi),%r9
        #   *(uint64 *) (x + 4) = in0
        movq    %r8,4(%rdi)
        #   *(uint64 *) (x + 12) = in2
        movq    %r9,12(%rdi)
        #                    unsigned<? kbits - 256
        cmp     $256,%rdx
        # comment:fp stack unchanged by jump
        #   goto kbits128 if unsigned<
        jb      ._kbits128
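        # Annotation: the state x[] is addressed in bytes; 32-bit word n sits
        # at offset 4*n.  The key fills words 1-4 (offsets 4 and 12) and
        # words 11-14 (offsets 44 and 52), while the diagonal words 0, 5, 10
        # and 15 (offsets 0, 20, 40, 60) get the constants: the little-endian
        # words of "expand 32-byte k" (sigma) on the 256-bit path below, or
        # of "expand 16-byte k" (tau) on the ._kbits128 path.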
#   kbits256:
._kbits256:
        #     in10 = *(uint64 *) (k + 16)
        movq    16(%rsi),%rdx
        #     in12 = *(uint64 *) (k + 24)
        movq    24(%rsi),%rsi
        #     *(uint64 *) (x + 44) = in10
        movq    %rdx,44(%rdi)
        #     *(uint64 *) (x + 52) = in12
        movq    %rsi,52(%rdi)
        #     in0 = 1634760805
        mov     $1634760805,%rsi
        #     in4 = 857760878
        mov     $857760878,%rdx
        #     in10 = 2036477234
        mov     $2036477234,%rcx
        #     in14 = 1797285236
        mov     $1797285236,%r8
        #     *(uint32 *) (x + 0) = in0
        movl    %esi,0(%rdi)
        #     *(uint32 *) (x + 20) = in4
        movl    %edx,20(%rdi)
        #     *(uint32 *) (x + 40) = in10
        movl    %ecx,40(%rdi)
        #     *(uint32 *) (x + 60) = in14
        movl    %r8d,60(%rdi)
        # comment:fp stack unchanged by jump
        #   goto keysetupdone
        jmp     ._keysetupdone
#   kbits128:
._kbits128:
        #     in10 = *(uint64 *) (k + 0)
        movq    0(%rsi),%rdx
        #     in12 = *(uint64 *) (k + 8)
        movq    8(%rsi),%rsi
        #     *(uint64 *) (x + 44) = in10
        movq    %rdx,44(%rdi)
        #     *(uint64 *) (x + 52) = in12
        movq    %rsi,52(%rdi)
        #     in0 = 1634760805
        mov     $1634760805,%rsi
        #     in4 = 824206446
        mov     $824206446,%rdx
        #     in10 = 2036477238
        mov     $2036477238,%rcx
        #     in14 = 1797285236
        mov     $1797285236,%r8
        #     *(uint32 *) (x + 0) = in0
        movl    %esi,0(%rdi)
        #     *(uint32 *) (x + 20) = in4
        movl    %edx,20(%rdi)
        #     *(uint32 *) (x + 40) = in10
        movl    %ecx,40(%rdi)
        #     *(uint32 *) (x + 60) = in14
        movl    %r8d,60(%rdi)
#   keysetupdone:
._keysetupdone:
        # leave
        add     %r11,%rsp
        mov     %rdi,%rax
        mov     %rsi,%rdx
        ret
ENDPROC(salsa20_keysetup)

# enter salsa20_ivsetup
ENTRY(salsa20_ivsetup)
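        # Annotation: the 8-byte IV is stored into state words 6-7 (offset
        # 24) and the 64-bit block counter in words 8-9 (offset 32) is reset
        # to zero; salsa20_encrypt_bytes then advances that counter per
        # 64-byte block.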
        mov     %rsp,%r11
        and     $31,%r11
        add     $256,%r11
        sub     %r11,%rsp
        #   iv = arg2
        mov     %rsi,%rsi
        #   x = arg1
        mov     %rdi,%rdi
        #   in6 = *(uint64 *) (iv + 0)
        movq    0(%rsi),%rsi
        #   in8 = 0
        mov     $0,%r8
        #   *(uint64 *) (x + 24) = in6
        movq    %rsi,24(%rdi)
        #   *(uint64 *) (x + 32) = in8
        movq    %r8,32(%rdi)
        # leave
        add     %r11,%rsp
        mov     %rdi,%rax
        mov     %rsi,%rdx
        ret
ENDPROC(salsa20_ivsetup)