2 * Calculate the checksum of data that is 16 byte aligned and a multiple of 16 bytes.
5 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
6 * chunks in order to mask the latency of the vpmsum instructions. If we
7 * have more than 32 kB of data to checksum we repeat this step multiple
8 * times, passing in the previous 1024 bits.
10 * The next step is to reduce the 1024 bits to 64 bits. This step adds
11 * 32 bits of 0s to the end - this matches what a CRC does. We just
12 * calculate constants that land the data in this 32 bits.
14 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
15 * for n = CRC using POWER8 instructions. We use x = 32.
17 * http://en.wikipedia.org/wiki/Barrett_reduction
19 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
20 * Copyright (C) 2017 International Business Machines Corp.
21 * All rights reserved.
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public License
25 * as published by the Free Software Foundation; either version
26 * 2 of the License, or (at your option) any later version.
29 #include "common/ppc-opcode.h"
45 /* byte reverse permute constant */
46 .octa 0x0F0E0D0C0B0A09080706050403020100
49 #include "crc32c_ppc_constants.h"
53 #if defined(__BIG_ENDIAN__) && defined(REFLECT)
55 #elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
73 #define mask_32bit v27
74 #define mask_64bit v28
78 #define VPERM(A, B, C, D) vperm A, B, C, D
80 #define VPERM(A, B, C, D)
83 /* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
84 FUNC_START(__crc32_vpmsum)
/*
 * NOTE(review): this listing is an excerpt -- the embedded original line
 * numbers jump, so loads, stores, branches, stack setup and #ifdef guards
 * are elided between many of the lines below. Comments added here describe
 * only what the visible instructions do.
 */
102 /* Enough room for saving 10 non volatile VMX registers */
/*
 * Zero a scratch vector, then build the 32-bit and 64-bit masks that the
 * final Barrett reduction steps apply with vand.
 */
119 vxor zeroes,zeroes,zeroes
122 vsldoi mask_32bit,zeroes,v0,4
123 vsldoi mask_64bit,zeroes,v0,8
125 /* Get the initial value into v8 */
129 vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
131 vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
/* Materialize the TOC-relative address of the byte-reverse permute constant. */
135 addis r3,r2,.byteswap_constant@toc@ha
136 addi r3,r3,.byteswap_constant@toc@l
147 /* Checksum in blocks of MAX_SIZE */
156 /* our main loop does 128 bytes at a time */
160 * Work out the offset into the constants table to start at. Each
161 * constant is 16 bytes, and it is used against 128 bytes of input
162 * data - 128 / 16 = 8
168 /* We reduce our final 128 bytes in a separate step */
172 addis r3,r2,.constants@toc@ha
173 addi r3,r3,.constants@toc@l
175 /* Find the start of our constants */
178 /* zero v0-v7 which will contain our checksums */
191 * If we are looping back to consume more data we use the values
192 * already in v16-v23.
197 /* First warm up pass */
/*
 * VPERM expands to nothing unless byte swapping is configured (see the
 * VPERM macro definition above); when enabled it byte-reverses each
 * freshly loaded vector.
 */
200 VPERM(v16,v16,v16,byteswap)
201 VPERM(v17,v17,v17,byteswap)
204 VPERM(v18,v18,v18,byteswap)
205 VPERM(v19,v19,v19,byteswap)
208 VPERM(v20,v20,v20,byteswap)
209 VPERM(v21,v21,v21,byteswap)
212 VPERM(v22,v22,v22,byteswap)
213 VPERM(v23,v23,v23,byteswap)
216 /* xor in initial value */
219 2: bdz .Lfirst_warm_up_done
224 /* Second warm up pass */
/*
 * Interleave the vpmsumd of the previous 128-byte block (results into
 * v8-v15) with byte swapping of the newly loaded block in v16-v23,
 * masking the vpmsumd latency as described in the file header.
 */
225 VPMSUMD(v8,v16,const1)
227 VPERM(v16,v16,v16,byteswap)
230 VPMSUMD(v9,v17,const1)
232 VPERM(v17,v17,v17,byteswap)
235 VPMSUMD(v10,v18,const1)
237 VPERM(v18,v18,v18,byteswap)
240 VPMSUMD(v11,v19,const1)
242 VPERM(v19,v19,v19,byteswap)
245 VPMSUMD(v12,v20,const1)
247 VPERM(v20,v20,v20,byteswap)
250 VPMSUMD(v13,v21,const1)
252 VPERM(v21,v21,v21,byteswap)
255 VPMSUMD(v14,v22,const1)
257 VPERM(v22,v22,v22,byteswap)
260 VPMSUMD(v15,v23,const1)
262 VPERM(v23,v23,v23,byteswap)
266 bdz .Lfirst_cool_down
269 * main loop. We modulo schedule it such that it takes three iterations
270 * to complete - first iteration load, second iteration vpmsum, third
/*
 * In the visible half of the pipelined body, v8-v11 multiply against
 * const2 and v12-v15 against const1; the accumulating xors into v0-v7
 * and the loads are elided in this excerpt.
 */
279 VPMSUMD(v8,v16,const2)
281 VPERM(v16,v16,v16,byteswap)
285 VPMSUMD(v9,v17,const2)
287 VPERM(v17,v17,v17,byteswap)
291 VPMSUMD(v10,v18,const2)
293 VPERM(v18,v18,v18,byteswap)
297 VPMSUMD(v11,v19,const2)
299 VPERM(v19,v19,v19,byteswap)
304 VPMSUMD(v12,v20,const1)
306 VPERM(v20,v20,v20,byteswap)
310 VPMSUMD(v13,v21,const1)
312 VPERM(v21,v21,v21,byteswap)
316 VPMSUMD(v14,v22,const1)
318 VPERM(v22,v22,v22,byteswap)
322 VPMSUMD(v15,v23,const1)
324 VPERM(v23,v23,v23,byteswap)
331 /* First cool down pass */
/* Drain the software pipeline: multiply the last loaded block, no new loads. */
336 VPMSUMD(v8,v16,const1)
340 VPMSUMD(v9,v17,const1)
344 VPMSUMD(v10,v18,const1)
348 VPMSUMD(v11,v19,const1)
352 VPMSUMD(v12,v20,const1)
356 VPMSUMD(v13,v21,const1)
360 VPMSUMD(v14,v22,const1)
364 VPMSUMD(v15,v23,const1)
368 /* Second cool down pass */
380 * vpmsumd produces a 96 bit result in the least significant bits
381 * of the register. Since we are bit reflected we have to shift it
382 * left 32 bits so it occupies the least significant bits in the
383 * bit reflected domain.
385 vsldoi v0,v0,zeroes,4
386 vsldoi v1,v1,zeroes,4
387 vsldoi v2,v2,zeroes,4
388 vsldoi v3,v3,zeroes,4
389 vsldoi v4,v4,zeroes,4
390 vsldoi v5,v5,zeroes,4
391 vsldoi v6,v6,zeroes,4
392 vsldoi v7,v7,zeroes,4
395 /* xor with last 1024 bits */
398 VPERM(v8,v8,v8,byteswap)
399 VPERM(v9,v9,v9,byteswap)
402 VPERM(v10,v10,v10,byteswap)
403 VPERM(v11,v11,v11,byteswap)
406 VPERM(v12,v12,v12,byteswap)
407 VPERM(v13,v13,v13,byteswap)
410 VPERM(v14,v14,v14,byteswap)
411 VPERM(v15,v15,v15,byteswap)
429 /* Work out how many bytes we have left */
432 /* Calculate where in the constant table we need to start */
436 /* How many 16 byte chunks are in the tail */
441 * Reduce the previously calculated 1024 bits to 64 bits, shifting
442 * 32 bits to include the trailing 32 bits of zeros
463 /* Now reduce the tail (0 - 112 bytes) */
/*
 * Each 16-byte tail chunk is byte swapped (when enabled) before being
 * folded in; the loads and vpmsum folds between these VPERMs are elided
 * in this excerpt.
 */
469 VPERM(v16,v16,v16,byteswap)
476 VPERM(v16,v16,v16,byteswap)
483 VPERM(v16,v16,v16,byteswap)
490 VPERM(v16,v16,v16,byteswap)
497 VPERM(v16,v16,v16,byteswap)
504 VPERM(v16,v16,v16,byteswap)
511 VPERM(v16,v16,v16,byteswap)
515 /* Now xor all the parallel chunks together */
527 /* Barrett constants */
528 addis r3,r2,.barrett_constants@toc@ha
529 addi r3,r3,.barrett_constants@toc@l
535 vxor v0,v0,v1 /* xor two 64 bit results together */
538 /* shift left one bit */
543 vand v0,v0,mask_64bit
547 * Now for the Barrett reduction algorithm. The idea is to calculate q,
548 * the multiple of our polynomial that we need to subtract. By
549 * doing the computation 2x bits higher (ie 64 bits) and shifting the
550 * result back down 2x bits, we round down to the nearest multiple.
/*
 * NOTE(review): both the straight and the bit-reflected Barrett
 * sequences appear below; the preprocessor guards that select between
 * them (REFLECT is tested at the top of the file) are elided in this
 * excerpt -- only one sequence is assembled in the real file.
 */
552 VPMSUMD(v1,v0,const1) /* ma */
553 vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
554 VPMSUMD(v1,v1,const2) /* qn */
555 vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
558 * Get the result into r3. We need to shift it left 8 bytes:
562 vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
565 * The reflected version of Barrett reduction. Instead of bit
566 * reflecting our data (which is expensive to do), we bit reflect our
567 * constants and our algorithm, which means the intermediate data in
568 * our vector registers goes from 0-63 instead of 63-0. We can reflect
569 * the algorithm because we don't carry in mod 2 arithmetic.
571 vand v1,v0,mask_32bit /* bottom 32 bits of a */
572 VPMSUMD(v1,v1,const1) /* ma */
573 vand v1,v1,mask_32bit /* bottom 32bits of ma */
574 VPMSUMD(v1,v1,const2) /* qn */
575 vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
578 * Since we are bit reflected, the result (ie the low 32 bits) is in
579 * the high 32 bits. We just need to shift it left 4 bytes
583 vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
/*
 * Short-input path: a single vpmsumd pass over the eight warm-up
 * vectors, then (via elided code) join the shared reduction.
 */
614 .Lfirst_warm_up_done:
618 VPMSUMD(v8,v16,const1)
619 VPMSUMD(v9,v17,const1)
620 VPMSUMD(v10,v18,const1)
621 VPMSUMD(v11,v19,const1)
622 VPMSUMD(v12,v20,const1)
623 VPMSUMD(v13,v21,const1)
624 VPMSUMD(v14,v22,const1)
625 VPMSUMD(v15,v23,const1)
633 addis r3,r2,.short_constants@toc@ha
634 addi r3,r3,.short_constants@toc@l
636 /* Calculate where in the constant table we need to start */
640 /* How many 16 byte chunks? */
649 VPERM(v0,v0,v16,byteswap)
650 vxor v0,v0,v8 /* xor in initial value */
656 VPERM(v1,v1,v17,byteswap)
662 VPERM(v2,v2,v16,byteswap)
668 VPERM(v3,v3,v17,byteswap)
674 VPERM(v4,v4,v16,byteswap)
680 VPERM(v5,v5,v17,byteswap)
686 VPERM(v6,v6,v16,byteswap)
692 VPERM(v7,v7,v17,byteswap)
701 VPERM(v8,v8,v16,byteswap)
707 VPERM(v9,v9,v17,byteswap)
713 VPERM(v10,v10,v16,byteswap)
719 VPERM(v11,v11,v17,byteswap)
725 VPERM(v12,v12,v16,byteswap)
731 VPERM(v13,v13,v17,byteswap)
737 VPERM(v14,v14,v16,byteswap)
743 VPERM(v15,v15,v17,byteswap)
/*
 * Fall-through xor chain: entering at .LvN folds chunks N..0 into the
 * two alternating accumulators v19/v20 (presumably reached via a
 * computed branch on the remaining chunk count -- the branch itself is
 * elided in this excerpt; confirm against the full file).
 */
746 .Lv15: vxor v19,v19,v15
747 .Lv14: vxor v20,v20,v14
748 .Lv13: vxor v19,v19,v13
749 .Lv12: vxor v20,v20,v12
750 .Lv11: vxor v19,v19,v11
751 .Lv10: vxor v20,v20,v10
752 .Lv9: vxor v19,v19,v9
753 .Lv8: vxor v20,v20,v8
754 .Lv7: vxor v19,v19,v7
755 .Lv6: vxor v20,v20,v6
756 .Lv5: vxor v19,v19,v5
757 .Lv4: vxor v20,v20,v4
758 .Lv3: vxor v19,v19,v3
759 .Lv2: vxor v20,v20,v2
760 .Lv1: vxor v19,v19,v1
761 .Lv0: vxor v20,v20,v0
765 b .Lbarrett_reduction
771 FUNC_END(__crc32_vpmsum)