#include "acconfig.h"
#include "include/int_types.h"
#include "common/crc32c_aarch64.h"
#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
/* Request crc extension capabilities from the assembler */
asm(".arch_extension crc");

#ifdef HAVE_ARMV8_CRYPTO
/* Request crypto extension capabilities from the assembler */
asm(".arch_extension crypto");
#endif
#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
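/*
 * CRC32C3X8(ITR): one step of the 3-way interleaved CRC.  Each 1 KB block is
 * split into three 336-byte (42 x 8-byte) streams accumulated in crc0, crc1
 * and crc2; one invocation consumes the ITR-th 64-bit word of every stream,
 * keeping three independent dependency chains in flight so the crc32cx unit
 * can pipeline.
 */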
#define CRC32C3X8(ITR) \
	__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
	__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
	__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));

#define CRC32C3X8_ZERO \
	__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

#include <arm_acle.h>
#include <arm_neon.h>
#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))

#define CRC32C3X8(ITR) \
	crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
	crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
	crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));

#define CRC32C3X8_ZERO \
	crc0 = __crc32cd(crc0, (const uint64_t)0);
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
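/*
 * CRC32C7X3X8 unrolls seven CRC32C3X8 steps; six invocations therefore cover
 * all 42 words of each of the three streams in a 1 KB block.
 */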
#define CRC32C7X3X8(ITR) do {\
	CRC32C3X8((ITR)*7+0) \
	CRC32C3X8((ITR)*7+1) \
	CRC32C3X8((ITR)*7+2) \
	CRC32C3X8((ITR)*7+3) \
	CRC32C3X8((ITR)*7+4) \
	CRC32C3X8((ITR)*7+5) \
	CRC32C3X8((ITR)*7+6) \
	} while(0)

#define CRC32C7X3X8_ZERO do {\
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	} while(0)
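/*
 * Prefetch helpers: PREF4X64L1/PREF4X64L2 each issue four PRFM instructions
 * covering four consecutive 64-byte cache lines, and PREF1KL1/PREF1KL2 chain
 * them to cover a full 1 KB ahead of the current position, targeting the L1
 * or L2 cache respectively.
 */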
#define PREF4X64L1(PREF_OFFSET, ITR) \
	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL1(PREF_OFFSET) \
	PREF4X64L1((PREF_OFFSET), 0) \
	PREF4X64L1((PREF_OFFSET), 4) \
	PREF4X64L1((PREF_OFFSET), 8) \
	PREF4X64L1((PREF_OFFSET), 12)
#define PREF4X64L2(PREF_OFFSET, ITR) \
	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL2(PREF_OFFSET) \
	PREF4X64L2((PREF_OFFSET), 0) \
	PREF4X64L2((PREF_OFFSET), 4) \
	PREF4X64L2((PREF_OFFSET), 8) \
	PREF4X64L2((PREF_OFFSET), 12)
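/*
 * Compute the CRC32C of `len` bytes starting at `buffer`, seeded with `crc`.
 * With ARMv8 crypto support, full 1 KB blocks are processed "by 3": three
 * partial CRCs run in parallel and are then folded together with carry-less
 * multiplies (PMULL) by the precomputed constants K1/K2.  A NULL buffer is
 * treated as a run of `len` zero bytes.
 */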
uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
{
	int64_t length = len;
	uint32_t crc0, crc1, crc2;

	if (buffer) {
#ifdef HAVE_ARMV8_CRYPTO
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
	/* Calculate reflected crc with PMULL Instruction */
	const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
	uint64_t t0, t1;

	/* crc done "by 3" for fixed input block size of 1024 bytes */
	while ((length -= 1024) >= 0) {
		/* Prefetch data for following block to avoid cache miss */
		PREF1KL1(1024);

		/* Do first 8 bytes here for better pipelining */
		crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
		crc1 = 0;
		crc2 = 0;
		buffer += sizeof(uint64_t);

		/* Process block inline
		   Process crc0 last to avoid dependency with above */
		CRC32C7X3X8(0);
		CRC32C7X3X8(1);
		CRC32C7X3X8(2);
		CRC32C7X3X8(3);
		CRC32C7X3X8(4);
		CRC32C7X3X8(5);

		buffer += 42*3*sizeof(uint64_t);

		/* Prefetch data for following block to avoid cache miss */
		PREF1KL2(1024*3);

		/* Merge crc0 and crc1 into crc2
		   crc1 multiply by K2
		   crc0 multiply by K1 */
		t1 = (uint64_t)vmull_p64(crc1, k2);
		t0 = (uint64_t)vmull_p64(crc0, k1);
		crc = __crc32cd(crc2, *(const uint64_t *)buffer);
		crc1 = __crc32cd(0, t1);
		crc ^= crc1;
		crc0 = __crc32cd(0, t0);
		crc ^= crc0;

		buffer += sizeof(uint64_t);
	}
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
	__asm__("mov x16, #0xf38a \n\t"
		"movk x16, #0xe417, lsl 16 \n\t"
		"mov v1.2d[0], x16 \n\t"
		"mov x16, #0x8014 \n\t"
		"movk x16, #0x8f15, lsl 16 \n\t"
		"mov v0.2d[0], x16 \n\t"
		:::"x16");
	while ((length -= 1024) >= 0) {
		PREF1KL1(1024);

		/* Do first 8 bytes here for better pipelining */
		__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
			:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
		crc1 = 0;
		crc2 = 0;
		buffer += sizeof(uint64_t);

		/* Process block inline
		   Process crc0 last to avoid dependency with above */
		CRC32C7X3X8(0);
		CRC32C7X3X8(1);
		CRC32C7X3X8(2);
		CRC32C7X3X8(3);
		CRC32C7X3X8(4);
		CRC32C7X3X8(5);

		buffer += 42*3*sizeof(uint64_t);
		PREF1KL2(1024*3);

		/* Merge crc0 and crc1 into crc2
		   crc1 multiply by K2
		   crc0 multiply by K1 */
		__asm__("mov v2.2d[0], %x[c1] \n\t"
			"pmull v2.1q, v2.1d, v0.1d \n\t"
			"mov v3.2d[0], %x[c0] \n\t"
			"pmull v3.1q, v3.1d, v1.1d \n\t"
			"crc32cx %w[c], %w[c2], %x[v] \n\t"
			"mov %x[c1], v2.2d[0] \n\t"
			"crc32cx %w[c1], wzr, %x[c1] \n\t"
			"eor %w[c], %w[c], %w[c1] \n\t"
			"mov %x[c0], v3.2d[0] \n\t"
			"crc32cx %w[c0], wzr, %x[c0] \n\t"
			"eor %w[c], %w[c], %w[c0] \n\t"
			:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
			:[v]"r"(*((const uint64_t *)buffer)));

		buffer += sizeof(uint64_t);
	}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

	if(!(length += 1024))
		return crc;
#endif /* HAVE_ARMV8_CRYPTO */
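	/* If the 1 KB loop above ran, `length` now holds only the remaining tail
	   bytes (the early return fires when there are none); without crypto
	   support it is still the full length and everything is handled here. */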
	while ((length -= sizeof(uint64_t)) >= 0) {
		CRC32CX(crc, *(uint64_t *)buffer);
		buffer += sizeof(uint64_t);
	}

	/* The following is more efficient than the straight loop */
	if (length & sizeof(uint32_t)) {
		CRC32CW(crc, *(uint32_t *)buffer);
		buffer += sizeof(uint32_t);
	}

	if (length & sizeof(uint16_t)) {
		CRC32CH(crc, *(uint16_t *)buffer);
		buffer += sizeof(uint16_t);
	}

	if (length & sizeof(uint8_t))
		CRC32CB(crc, *buffer);
	} else {
#ifdef HAVE_ARMV8_CRYPTO
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
	const poly64_t k1 = 0xe417f38a;
	uint64_t t0;
	while ((length -= 1024) >= 0) {
		crc0 = __crc32cd(crc, 0);

		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;

		/* Merge crc0 into crc: crc0 multiply by K1 */
		t0 = (uint64_t)vmull_p64(crc0, k1);
		crc = __crc32cd(0, t0);
	}
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
	__asm__("mov x16, #0xf38a \n\t"
		"movk x16, #0xe417, lsl 16 \n\t"
		"mov v1.2d[0], x16 \n\t"
		:::"x16");
	while ((length -= 1024) >= 0) {
		__asm__("crc32cx %w[c0], %w[c], xzr\n\t"
			:[c0]"=r"(crc0):[c]"r"(crc));

		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;
		CRC32C7X3X8_ZERO;

		/* Merge crc0 into crc: crc0 multiply by K1 */
		__asm__("mov v3.2d[0], %x[c0] \n\t"
			"pmull v3.1q, v3.1d, v1.1d \n\t"
			"mov %x[c0], v3.2d[0] \n\t"
			"crc32cx %w[c], wzr, %x[c0] \n\t"
			:[c]"=r"(crc)
			:[c0]"r"(crc0));
	}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

	if(!(length += 1024))
		return crc;
#endif /* HAVE_ARMV8_CRYPTO */
	while ((length -= sizeof(uint64_t)) >= 0)
		CRC32CX(crc, 0);

	/* The following is more efficient than the straight loop */
	if (length & sizeof(uint32_t))
		CRC32CW(crc, 0);

	if (length & sizeof(uint16_t))
		CRC32CH(crc, 0);

	if (length & sizeof(uint8_t))
		CRC32CB(crc, 0);
	}

	return crc;
}
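
/*
 * Usage sketch (hypothetical caller, for illustration only -- in Ceph this
 * function is normally reached through the runtime-dispatched crc32c
 * wrapper rather than called directly):
 *
 *	uint32_t c = ceph_crc32c_aarch64(-1, data, data_len);	// CRC of a data buffer
 *	c = ceph_crc32c_aarch64(c, NULL, pad_len);		// extend over zero padding
 */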