kernel/arch/x86/lib/csum-copy_64.S

   1 /*
   2  * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
   3  *
   4  * This file is subject to the terms and conditions of the GNU General Public
   5  * License.  See the file COPYING in the main directory of this archive
   6  * for more details. No warranty for anything given at all.
   7  */
   8 #include <linux/linkage.h>
   9 #include <asm/errno.h>
  10 #include <asm/asm.h>
  11
  12 /*
  13  * Checksum copy with exception handling.
  14  * On exceptions *src_err_ptr or *dst_err_ptr is set to -EFAULT (when the
  15  * pointer is non-NULL); zeroing the destination is left to the wrappers.
  16  *
  17  * Input
  18  * rdi  source
  19  * rsi  destination
  20  * edx  len (32bit)
  21  * ecx  sum (32bit)
  22  * r8   src_err_ptr (int)
  23  * r9   dst_err_ptr (int)
  24  *
  25  * Output
  26  * eax  sum folded to 32 bits. undefined in case of exception.
  27  *
  28  * Wrappers need to take care of valid exception sum and zeroing.
  29  * They also should align source or destination to 8 bytes.
  30  */
  31
  32         .macro source
  33         _ASM_EXTABLE(10f, .Lbad_source)
  34 10:      /* insn after the macro; a fault on it is routed to .Lbad_source */
  35         .endm
  36
  37         .macro dest
  38         _ASM_EXTABLE(20f, .Lbad_dest)
  39 20:      /* insn after the macro; a fault on it is routed to .Lbad_dest */
  40         .endm
  41
  42         .macro ignore L=.Lignore
  43         _ASM_EXTABLE(30f, \L)
  44 30:      /* insn after the macro; a fault on it is routed to label L */
  45         .endm
  46
  47
  48 ENTRY(csum_partial_copy_generic)
  49         /*
  50          * Copy edx bytes from (rdi) to (rsi) and add them, with end-around
  51          * carry, to the sum passed in ecx. Accesses tagged `source`/`dest`
  52          * divert to .Lbad_source/.Lbad_dest when they fault.
  53          */
  54 .Lignore:       /* only the default fixup target of the `ignore` macro */
  55         /*
  56          * Stack frame, 7 quadwords:
  57          *   0*8 src_err_ptr   1*8 dst_err_ptr
  58          *   2*8 rbx   3*8 r12   4*8 r14   5*8 r13   6*8 rbp
  59          */
  60         subq  $7*8, %rsp
  61         movq  %rbx, 2*8(%rsp)
  62         movq  %r12, 3*8(%rsp)
  63         movq  %r14, 4*8(%rsp)
  64         movq  %r13, 5*8(%rsp)
  65         movq  %rbp, 6*8(%rsp)
  66
  67         movq  %r8, (%rsp)
  68         movq  %r9, 1*8(%rsp)
  69
  70         movl  %ecx, %eax                /* rax = incoming sum, zero-extended */
  71         movl  %edx, %ecx                /* rcx = len, zero-extended */
  72
  73         xorl  %r9d, %r9d                /* r9 = 0, so "adc %r9" only adds CF */
  74         movq  %rcx, %r12
  75
  76         shrq  $6, %r12                  /* r12 = number of 64 byte blocks */
  77         jz      .Lhandle_tail       /* < 64 */
  78
  79         clc
  80
  81         /* main loop. copy and checksum in 64 byte blocks */
  82         /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
  83         /* r11: temp3, rdx: temp4, r12 loopcnt */
  84         /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
  85         /*
  86          * CF chains from each adcq into the next, also across iterations:
  87          * decl, movq and leaq leave CF alone, and the jnz at the bottom
  88          * tests the ZF produced by decl.
  89          */
  90         .p2align 4
  91 .Lloop:
  92         source
  93         movq  (%rdi), %rbx
  94         source
  95         movq  8(%rdi), %r8
  96         source
  97         movq  16(%rdi), %r11
  98         source
  99         movq  24(%rdi), %rdx
 100
 101         source
 102         movq  32(%rdi), %r10
 103         source
 104         movq  40(%rdi), %rbp
 105         source
 106         movq  48(%rdi), %r14
 107         source
 108         movq  56(%rdi), %r13
 109
 110         ignore 2f
 111         prefetcht0 5*64(%rdi)
 112 2:
 113         adcq  %rbx, %rax
 114         adcq  %r8, %rax
 115         adcq  %r11, %rax
 116         adcq  %rdx, %rax
 117         adcq  %r10, %rax
 118         adcq  %rbp, %rax
 119         adcq  %r14, %rax
 120         adcq  %r13, %rax
 121
 122         decl %r12d
 123
 124         dest
 125         movq %rbx, (%rsi)
 126         dest
 127         movq %r8, 8(%rsi)
 128         dest
 129         movq %r11, 16(%rsi)
 130         dest
 131         movq %rdx, 24(%rsi)
 132
 133         dest
 134         movq %r10, 32(%rsi)
 135         dest
 136         movq %rbp, 40(%rsi)
 137         dest
 138         movq %r14, 48(%rsi)
 139         dest
 140         movq %r13, 56(%rsi)
 141
 142         leaq 64(%rdi), %rdi
 143         leaq 64(%rsi), %rsi
 144
 145         jnz     .Lloop
 146
 147         adcq  %r9, %rax                 /* add in carry */
 148
 149         /* do last up to 56 bytes */
 150 .Lhandle_tail:
 151         /* ecx: count */
 152         movl %ecx, %r10d                /* r10d keeps len for the sub-8-byte tails */
 153         andl $63, %ecx
 154         shrl $3, %ecx                   /* ecx = whole quadwords left */
 155         jz      .Lfold
 156         clc
 157         .p2align 4
 158 .Lloop_8:
 159         source
 160         movq (%rdi), %rbx
 161         adcq %rbx, %rax
 162         decl %ecx
 163         dest
 164         movq %rbx, (%rsi)
 165         leaq 8(%rsi), %rsi /* preserve carry */
 166         leaq 8(%rdi), %rdi
 167         jnz     .Lloop_8
 168         adcq %r9, %rax  /* add in carry */
 169
 170 .Lfold:
 171         /* reduce checksum to 32bits */
 172         movl %eax, %ebx
 173         shrq $32, %rax
 174         addl %ebx, %eax
 175         adcl %r9d, %eax
 176
 177         /* do last up to 6 bytes */
 178 .Lhandle_7:
 179         movl %r10d, %ecx
 180         andl $7, %ecx
 181         shrl $1, %ecx                   /* ecx = whole 16 bit words left */
 182         jz   .Lhandle_1
 183         xorl %ebx, %ebx                 /* clear bits 16..31 for the movw below */
 184         clc
 185         .p2align 4
 186 .Lloop_1:
 187         source
 188         movw (%rdi), %bx
 189         adcl %ebx, %eax
 190         decl %ecx
 191         dest
 192         movw %bx, (%rsi)
 193         leaq 2(%rdi), %rdi
 194         leaq 2(%rsi), %rsi
 195         jnz .Lloop_1
 196         adcl %r9d, %eax /* add in carry */
 197
 198         /* handle last odd byte */
 199 .Lhandle_1:
 200         testb $1, %r10b
 201         jz    .Lende
 202         xorl  %ebx, %ebx
 203         source
 204         movb (%rdi), %bl
 205         dest
 206         movb %bl, (%rsi)
 207         addl %ebx, %eax
 208         adcl %r9d, %eax         /* carry */
 209
 210 .Lende:
 211         movq 2*8(%rsp), %rbx
 212         movq 3*8(%rsp), %r12
 213         movq 4*8(%rsp), %r14
 214         movq 5*8(%rsp), %r13
 215         movq 6*8(%rsp), %rbp
 216         addq $7*8, %rsp
 217         ret
 218
 219         /* Exception handlers. Very simple, zeroing is done in the wrappers */
 220         /* Both leave the error pointer, not a sum, in rax before returning. */
 221 .Lbad_source:
 222         movq (%rsp), %rax               /* rax = saved src_err_ptr */
 223         testq %rax, %rax
 224         jz   .Lende                     /* NULL: nothing to report into */
 225         movl $-EFAULT, (%rax)
 226         jmp  .Lende
 227
 228 .Lbad_dest:
 229         movq 8(%rsp), %rax              /* rax = saved dst_err_ptr */
 230         testq %rax, %rax
 231         jz   .Lende                     /* NULL: nothing to report into */
 232         movl $-EFAULT, (%rax)
 233         jmp .Lende
 234 ENDPROC(csum_partial_copy_generic)