2 * Calculate the checksum of data that is 16 byte aligned and a multiple of 16 bytes.
5 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
6 * chunks in order to mask the latency of the vpmsum instructions. If we
7 * have more than 32 kB of data to checksum we repeat this step multiple
8 * times, passing in the previous 1024 bits.
10 * The next step is to reduce the 1024 bits to 64 bits. This step adds
11 * 32 bits of 0s to the end - this matches what a CRC does. We just
12 * calculate constants that land the data in this 32 bits.
14 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
15 * for n = CRC using POWER8 instructions. We use x = 32.
17 * http://en.wikipedia.org/wiki/Barrett_reduction
19 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
20 * Copyright (C) 2017 International Business Machines Corp.
21 * All rights reserved.
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public License
25 * as published by the Free Software Foundation; either version
26 * 2 of the License, or (at your option) any later version.
29 #include "common/ppc-opcode.h"
45 /* byte reverse permute constant */
46 .octa 0x0F0E0D0C0B0A09080706050403020100
49 #include "crc32c_ppc_constants.h"
53 #if defined(__BIG_ENDIAN__) && defined(REFLECT)
55 #elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
73 #define mask_32bit v27
74 #define mask_64bit v28
78 #define VPERM(A, B, C, D) vperm A, B, C, D
80 #define VPERM(A, B, C, D)
83 /* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
84 FUNC_START(__crc32_vpmsum)
/*
 * NOTE(review): this listing is an excerpt -- the embedded original line
 * numbers jump, so loads, stores, branches, stack setup and #ifdef guards
 * are elided between many of the lines below. Comments added here describe
 * only what the visible instructions do.
 */
102 /* Enough room for saving 10 non volatile VMX registers */
/*
 * Zero a scratch vector, then build the 32-bit and 64-bit masks that the
 * final Barrett reduction steps apply with vand.
 */
119 vxor zeroes,zeroes,zeroes
122 vsldoi mask_32bit,zeroes,v0,4
123 vsldoi mask_64bit,zeroes,v0,8
125 /* Get the initial value into v8 */
129 vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
131 vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
/* Materialize the TOC-relative address of the byte-reverse permute constant. */
135 addis r3,r2,.byteswap_constant@toc@ha
136 addi r3,r3,.byteswap_constant@toc@l
147 /* Checksum in blocks of MAX_SIZE */
156 /* our main loop does 128 bytes at a time */
160 * Work out the offset into the constants table to start at. Each
161 * constant is 16 bytes, and it is used against 128 bytes of input
162 * data - 128 / 16 = 8
168 /* We reduce our final 128 bytes in a separate step */
172 addis r3,r2,.constants@toc@ha
173 addi r3,r3,.constants@toc@l
175 /* Find the start of our constants */
178 /* zero v0-v7 which will contain our checksums */
191 * If we are looping back to consume more data we use the values
192 * already in v16-v23.
197 /* First warm up pass */
/*
 * VPERM expands to nothing unless byte swapping is configured (see the
 * VPERM macro definition above); when enabled it byte-reverses each
 * freshly loaded vector.
 */
200 VPERM(v16,v16,v16,byteswap)
201 VPERM(v17,v17,v17,byteswap)
204 VPERM(v18,v18,v18,byteswap)
205 VPERM(v19,v19,v19,byteswap)
208 VPERM(v20,v20,v20,byteswap)
209 VPERM(v21,v21,v21,byteswap)
212 VPERM(v22,v22,v22,byteswap)
213 VPERM(v23,v23,v23,byteswap)
216 /* xor in initial value */
219 2: bdz .Lfirst_warm_up_done
224 /* Second warm up pass */
/*
 * Interleave the vpmsumd of the previous 128-byte block (results into
 * v8-v15) with byte swapping of the newly loaded block in v16-v23,
 * masking the vpmsumd latency as described in the file header.
 */
225 VPMSUMD(v8,v16,const1)
227 VPERM(v16,v16,v16,byteswap)
230 VPMSUMD(v9,v17,const1)
232 VPERM(v17,v17,v17,byteswap)
235 VPMSUMD(v10,v18,const1)
237 VPERM(v18,v18,v18,byteswap)
240 VPMSUMD(v11,v19,const1)
242 VPERM(v19,v19,v19,byteswap)
245 VPMSUMD(v12,v20,const1)
247 VPERM(v20,v20,v20,byteswap)
250 VPMSUMD(v13,v21,const1)
252 VPERM(v21,v21,v21,byteswap)
255 VPMSUMD(v14,v22,const1)
257 VPERM(v22,v22,v22,byteswap)
260 VPMSUMD(v15,v23,const1)
262 VPERM(v23,v23,v23,byteswap)
266 bdz .Lfirst_cool_down
269 * main loop. We modulo schedule it such that it takes three iterations
270 * to complete - first iteration load, second iteration vpmsum, third
/*
 * In the visible half of the pipelined body, v8-v11 multiply against
 * const2 and v12-v15 against const1; the accumulating xors into v0-v7
 * and the loads are elided in this excerpt.
 */
279 VPMSUMD(v8,v16,const2)
281 VPERM(v16,v16,v16,byteswap)
285 VPMSUMD(v9,v17,const2)
287 VPERM(v17,v17,v17,byteswap)
291 VPMSUMD(v10,v18,const2)
293 VPERM(v18,v18,v18,byteswap)
297 VPMSUMD(v11,v19,const2)
299 VPERM(v19,v19,v19,byteswap)
304 VPMSUMD(v12,v20,const1)
306 VPERM(v20,v20,v20,byteswap)
310 VPMSUMD(v13,v21,const1)
312 VPERM(v21,v21,v21,byteswap)
316 VPMSUMD(v14,v22,const1)
318 VPERM(v22,v22,v22,byteswap)
322 VPMSUMD(v15,v23,const1)
324 VPERM(v23,v23,v23,byteswap)
331 /* First cool down pass */
/* Drain the software pipeline: multiply the last loaded block, no new loads. */
336 VPMSUMD(v8,v16,const1)
340 VPMSUMD(v9,v17,const1)
344 VPMSUMD(v10,v18,const1)
348 VPMSUMD(v11,v19,const1)
352 VPMSUMD(v12,v20,const1)
356 VPMSUMD(v13,v21,const1)
360 VPMSUMD(v14,v22,const1)
364 VPMSUMD(v15,v23,const1)
368 /* Second cool down pass */
380 * vpmsumd produces a 96 bit result in the least significant bits
381 * of the register. Since we are bit reflected we have to shift it
382 * left 32 bits so it occupies the least significant bits in the
383 * bit reflected domain.
385 vsldoi v0,v0,zeroes,4
386 vsldoi v1,v1,zeroes,4
387 vsldoi v2,v2,zeroes,4
388 vsldoi v3,v3,zeroes,4
389 vsldoi v4,v4,zeroes,4
390 vsldoi v5,v5,zeroes,4
391 vsldoi v6,v6,zeroes,4
392 vsldoi v7,v7,zeroes,4
395 /* xor with last 1024 bits */
398 VPERM(v8,v8,v8,byteswap)
399 VPERM(v9,v9,v9,byteswap)
402 VPERM(v10,v10,v10,byteswap)
403 VPERM(v11,v11,v11,byteswap)
406 VPERM(v12,v12,v12,byteswap)
407 VPERM(v13,v13,v13,byteswap)
410 VPERM(v14,v14,v14,byteswap)
411 VPERM(v15,v15,v15,byteswap)
429 /* Work out how many bytes we have left */
432 /* Calculate where in the constant table we need to start */
436 /* How many 16 byte chunks are in the tail */
441 * Reduce the previously calculated 1024 bits to 64 bits, shifting
442 * 32 bits to include the trailing 32 bits of zeros
463 /* Now reduce the tail (0 - 112 bytes) */
/*
 * Each 16-byte tail chunk is byte swapped (when enabled) before being
 * folded in; the loads and vpmsum folds between these VPERMs are elided
 * in this excerpt.
 */
469 VPERM(v16,v16,v16,byteswap)
476 VPERM(v16,v16,v16,byteswap)
483 VPERM(v16,v16,v16,byteswap)
490 VPERM(v16,v16,v16,byteswap)
497 VPERM(v16,v16,v16,byteswap)
504 VPERM(v16,v16,v16,byteswap)
511 VPERM(v16,v16,v16,byteswap)
515 /* Now xor all the parallel chunks together */
527 /* Barrett constants */
528 addis r3,r2,.barrett_constants@toc@ha
529 addi r3,r3,.barrett_constants@toc@l
535 vxor v0,v0,v1 /* xor two 64 bit results together */
538 /* shift left one bit */
543 vand v0,v0,mask_64bit
547 * Now for the Barrett reduction algorithm. The idea is to calculate q,
548 * the multiple of our polynomial that we need to subtract. By
549 * doing the computation 2x bits higher (ie 64 bits) and shifting the
550 * result back down 2x bits, we round down to the nearest multiple.
/*
 * NOTE(review): both the straight and the bit-reflected Barrett
 * sequences appear below; the preprocessor guards that select between
 * them (REFLECT is tested at the top of the file) are elided in this
 * excerpt -- only one sequence is assembled in the real file.
 */
552 VPMSUMD(v1,v0,const1) /* ma */
553 vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
554 VPMSUMD(v1,v1,const2) /* qn */
555 vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
558 * Get the result into r3. We need to shift it left 8 bytes:
562 vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
565 * The reflected version of Barrett reduction. Instead of bit
566 * reflecting our data (which is expensive to do), we bit reflect our
567 * constants and our algorithm, which means the intermediate data in
568 * our vector registers goes from 0-63 instead of 63-0. We can reflect
569 * the algorithm because we don't carry in mod 2 arithmetic.
571 vand v1,v0,mask_32bit /* bottom 32 bits of a */
572 VPMSUMD(v1,v1,const1) /* ma */
573 vand v1,v1,mask_32bit /* bottom 32bits of ma */
574 VPMSUMD(v1,v1,const2) /* qn */
575 vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
578 * Since we are bit reflected, the result (ie the low 32 bits) is in
579 * the high 32 bits. We just need to shift it left 4 bytes
583 vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
/*
 * Short-input path: a single vpmsumd pass over the eight warm-up
 * vectors, then (via elided code) join the shared reduction.
 */
614 .Lfirst_warm_up_done:
618 VPMSUMD(v8,v16,const1)
619 VPMSUMD(v9,v17,const1)
620 VPMSUMD(v10,v18,const1)
621 VPMSUMD(v11,v19,const1)
622 VPMSUMD(v12,v20,const1)
623 VPMSUMD(v13,v21,const1)
624 VPMSUMD(v14,v22,const1)
625 VPMSUMD(v15,v23,const1)
633 addis r3,r2,.short_constants@toc@ha
634 addi r3,r3,.short_constants@toc@l
636 /* Calculate where in the constant table we need to start */
640 /* How many 16 byte chunks? */
649 VPERM(v0,v0,v16,byteswap)
650 vxor v0,v0,v8 /* xor in initial value */
656 VPERM(v1,v1,v17,byteswap)
662 VPERM(v2,v2,v16,byteswap)
668 VPERM(v3,v3,v17,byteswap)
674 VPERM(v4,v4,v16,byteswap)
680 VPERM(v5,v5,v17,byteswap)
686 VPERM(v6,v6,v16,byteswap)
692 VPERM(v7,v7,v17,byteswap)
701 VPERM(v8,v8,v16,byteswap)
707 VPERM(v9,v9,v17,byteswap)
713 VPERM(v10,v10,v16,byteswap)
719 VPERM(v11,v11,v17,byteswap)
725 VPERM(v12,v12,v16,byteswap)
731 VPERM(v13,v13,v17,byteswap)
737 VPERM(v14,v14,v16,byteswap)
743 VPERM(v15,v15,v17,byteswap)
/*
 * Fall-through xor chain: entering at .LvN folds chunks N..0 into the
 * two alternating accumulators v19/v20 (presumably reached via a
 * computed branch on the remaining chunk count -- the branch itself is
 * elided in this excerpt; confirm against the full file).
 */
746 .Lv15: vxor v19,v19,v15
747 .Lv14: vxor v20,v20,v14
748 .Lv13: vxor v19,v19,v13
749 .Lv12: vxor v20,v20,v12
750 .Lv11: vxor v19,v19,v11
751 .Lv10: vxor v20,v20,v10
752 .Lv9: vxor v19,v19,v9
753 .Lv8: vxor v20,v20,v8
754 .Lv7: vxor v19,v19,v7
755 .Lv6: vxor v20,v20,v6
756 .Lv5: vxor v19,v19,v5
757 .Lv4: vxor v20,v20,v4
758 .Lv3: vxor v19,v19,v3
759 .Lv2: vxor v20,v20,v2
760 .Lv1: vxor v19,v19,v1
761 .Lv0: vxor v20,v20,v0
765 b .Lbarrett_reduction
771 FUNC_END(__crc32_vpmsum)