/* src/ceph/src/common/crc32c_aarch64.c */

#include "acconfig.h"
#include "include/int_types.h"
#include "common/crc32c_aarch64.h"

#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
/* Request crc extension capabilities from the assembler */
asm(".arch_extension crc");

#ifdef HAVE_ARMV8_CRYPTO
/* Request crypto extension capabilities from the assembler */
asm(".arch_extension crypto");
#endif

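/*
 * Wrappers around the ARMv8 CRC32C instructions.  In the asm templates,
 * the %w operand modifier selects the 32-bit W view of a register and
 * %x the 64-bit X view, matching each instruction's operand sizes.
 */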
#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))

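/*
 * Advance three independent CRC streams by one 64-bit word each.  The
 * streams are spaced 42 words (336 bytes) apart so three chunks of a
 * block can be processed in parallel; crc0 is updated last because it
 * is the most recently written of the three.
 */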
#define CRC32C3X8(ITR) \
        __asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
        __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
        __asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));

#define CRC32C3X8_ZERO \
        __asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));

#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

#include <arm_acle.h>
#include <arm_neon.h>

#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))

#define CRC32C3X8(ITR) \
        crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
        crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
        crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));

#define CRC32C3X8_ZERO \
        crc0 = __crc32cd(crc0, (uint64_t)0);

#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

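/*
 * One invocation consumes 7 words from each of the three streams (21
 * words, 168 bytes); six invocations cover the 42 words per stream
 * (1008 bytes) handled in the middle of each 1024-byte block.
 */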
#define CRC32C7X3X8(ITR) do {\
        CRC32C3X8((ITR)*7+0) \
        CRC32C3X8((ITR)*7+1) \
        CRC32C3X8((ITR)*7+2) \
        CRC32C3X8((ITR)*7+3) \
        CRC32C3X8((ITR)*7+4) \
        CRC32C3X8((ITR)*7+5) \
        CRC32C3X8((ITR)*7+6) \
        } while(0)

#define CRC32C7X3X8_ZERO do {\
        CRC32C3X8_ZERO \
        CRC32C3X8_ZERO \
        CRC32C3X8_ZERO \
        CRC32C3X8_ZERO \
        CRC32C3X8_ZERO \
        CRC32C3X8_ZERO \
        CRC32C3X8_ZERO \
        } while(0)

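/*
 * Software prefetch helpers: each PREF4X64L1/PREF4X64L2 touches four
 * consecutive 64-byte cache lines with PRFM PLDL1KEEP/PLDL2KEEP, so
 * PREF1KL1/PREF1KL2 pull a full 1KB block into L1/L2 ahead of use.
 */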
#define PREF4X64L1(PREF_OFFSET, ITR) \
        __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
        __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
        __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
        __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL1(PREF_OFFSET) \
        PREF4X64L1((PREF_OFFSET), 0) \
        PREF4X64L1((PREF_OFFSET), 4) \
        PREF4X64L1((PREF_OFFSET), 8) \
        PREF4X64L1((PREF_OFFSET), 12)

#define PREF4X64L2(PREF_OFFSET, ITR) \
        __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
        __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
        __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
        __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL2(PREF_OFFSET) \
        PREF4X64L2((PREF_OFFSET), 0) \
        PREF4X64L2((PREF_OFFSET), 4) \
        PREF4X64L2((PREF_OFFSET), 8) \
        PREF4X64L2((PREF_OFFSET), 12)


uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
{
        int64_t length = len;
        uint32_t crc0, crc1, crc2;

        if (buffer) {
#ifdef HAVE_ARMV8_CRYPTO
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
                /* Calculate the reflected CRC with the PMULL instruction */
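                /* k1 and k2 are precomputed folding constants (powers of x
                   modulo the Castagnoli polynomial) that carry crc0 and crc1
                   forward across the bytes covered by the other streams, so
                   the three partial CRCs can be merged with one carry-less
                   multiply each */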
                const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
                uint64_t t0, t1;

                /* crc done "by 3" for a fixed input block size of 1024 bytes */
                while ((length -= 1024) >= 0) {
                        /* Prefetch data for the following block to avoid cache misses */
                        PREF1KL2(1024*3);
                        /* Do the first 8 bytes here for better pipelining */
                        crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
                        crc1 = 0;
                        crc2 = 0;
                        buffer += sizeof(uint64_t);

                        /* Process the block inline;
                           process crc0 last to avoid a dependency on the above */
                        CRC32C7X3X8(0);
                        CRC32C7X3X8(1);
                        CRC32C7X3X8(2);
                        CRC32C7X3X8(3);
                        CRC32C7X3X8(4);
                        CRC32C7X3X8(5);

                        buffer += 42*3*sizeof(uint64_t);
                        /* Prefetch data for the following block to avoid cache misses */
                        PREF1KL1(1024);

                        /* Merge the three streams: fold crc1 forward by k2
                           and crc0 by k1, then combine them with crc2 and
                           the block's final 8 bytes into crc */

                        t1 = (uint64_t)vmull_p64(crc1, k2);
                        t0 = (uint64_t)vmull_p64(crc0, k1);
                        crc = __crc32cd(crc2, *(const uint64_t *)buffer);
                        crc1 = __crc32cd(0, t1);
                        crc ^= crc1;
                        crc0 = __crc32cd(0, t0);
                        crc ^= crc0;

                        buffer += sizeof(uint64_t);
                }
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
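                /* Load the folding constants k1 (into v1) and k2 (into v0);
                   see the intrinsics path above for their role */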
                __asm__("mov    x16,            #0xf38a         \n\t"
                        "movk   x16,            #0xe417, lsl 16 \n\t"
                        "mov    v1.2d[0],       x16             \n\t"
                        "mov    x16,            #0x8014         \n\t"
                        "movk   x16,            #0x8f15, lsl 16 \n\t"
                        "mov    v0.2d[0],       x16             \n\t"
                        :::"x16");

                while ((length -= 1024) >= 0) {
                        PREF1KL2(1024*3);
                        __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
                                :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
                        crc1 = 0;
                        crc2 = 0;
                        buffer += sizeof(uint64_t);

                        CRC32C7X3X8(0);
                        CRC32C7X3X8(1);
                        CRC32C7X3X8(2);
                        CRC32C7X3X8(3);
                        CRC32C7X3X8(4);
                        CRC32C7X3X8(5);

                        buffer += 42*3*sizeof(uint64_t);
                        PREF1KL1(1024);
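                        /* Same merge as the intrinsics path: pmull folds
                           crc1 (by k2 in v0) and crc0 (by k1 in v1), which
                           are then reduced and xor-ed into the running crc
                           along with crc2 and the block's final 8 bytes */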
                        __asm__("mov            v2.2d[0],       %x[c1]          \n\t"
                                "pmull          v2.1q,          v2.1d,  v0.1d   \n\t"
                                "mov            v3.2d[0],       %x[c0]          \n\t"
                                "pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
                                "crc32cx        %w[c],          %w[c2], %x[v]   \n\t"
                                "mov            %x[c1],         v2.2d[0]        \n\t"
                                "crc32cx        %w[c1],         wzr,    %x[c1]  \n\t"
                                "eor            %w[c],          %w[c],  %w[c1]  \n\t"
                                "mov            %x[c0],         v3.2d[0]        \n\t"
                                "crc32cx        %w[c0],         wzr,    %x[c0]  \n\t"
                                "eor            %w[c],          %w[c],  %w[c0]  \n\t"
                                :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
                                :[v]"r"(*((const uint64_t *)buffer)));
                        buffer += sizeof(uint64_t);
                }
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

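                /* The 1024-byte loop overshot; add back to get the residual
                   byte count, and return early if nothing is left */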
                if (!(length += 1024))
                        return crc;

#endif /* HAVE_ARMV8_CRYPTO */
                while ((length -= sizeof(uint64_t)) >= 0) {
                        CRC32CX(crc, *(const uint64_t *)buffer);
                        buffer += sizeof(uint64_t);
                }

                /* Handling the remaining 0-7 bytes with one conditional step
                   per power of two is more efficient than a byte-wise loop */
                if (length & sizeof(uint32_t)) {
                        CRC32CW(crc, *(const uint32_t *)buffer);
                        buffer += sizeof(uint32_t);
                }
                if (length & sizeof(uint16_t)) {
                        CRC32CH(crc, *(const uint16_t *)buffer);
                        buffer += sizeof(uint16_t);
                }
                if (length & sizeof(uint8_t))
                        CRC32CB(crc, *buffer);
        } else {
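                /* A NULL buffer requests the CRC of len zero bytes; run the
                   same "by 3" scheme with zero inputs so no memory is read */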
#ifdef HAVE_ARMV8_CRYPTO
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
                const poly64_t k1 = 0xe417f38a;
                uint64_t t0;

                while ((length -= 1024) >= 0) {
                        crc0 = __crc32cd(crc, 0);

                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;

                        /* Merge crc0 into crc: fold crc0 forward by k1.  Only
                           one stream is needed here, since a zero-seeded CRC
                           over zero bytes stays zero. */

                        t0 = (uint64_t)vmull_p64(crc0, k1);
                        crc = __crc32cd(0, t0);
                }
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
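                /* Load the folding constant k1 into v1, as in the buffered
                   path above */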
                __asm__("mov    x16,            #0xf38a         \n\t"
                        "movk   x16,            #0xe417, lsl 16 \n\t"
                        "mov    v1.2d[0],       x16             \n\t"
                        :::"x16");

                while ((length -= 1024) >= 0) {
                        __asm__("crc32cx %w[c0], %w[c], xzr\n\t"
                                :[c0]"=r"(crc0):[c]"r"(crc));

                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;
                        CRC32C7X3X8_ZERO;

                        __asm__("mov            v3.2d[0],       %x[c0]          \n\t"
                                "pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
                                "mov            %x[c0],         v3.2d[0]        \n\t"
                                "crc32cx        %w[c],          wzr,    %x[c0]  \n\t"
                                :[c]"=r"(crc)
                                :[c0]"r"(crc0));
                }
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

                if (!(length += 1024))
                        return crc;

#endif /* HAVE_ARMV8_CRYPTO */
                while ((length -= sizeof(uint64_t)) >= 0)
                        CRC32CX(crc, 0);

                /* Handling the remaining 0-7 bytes with one conditional step
                   per power of two is more efficient than a byte-wise loop */
                if (length & sizeof(uint32_t))
                        CRC32CW(crc, 0);

                if (length & sizeof(uint16_t))
                        CRC32CH(crc, 0);

                if (length & sizeof(uint8_t))
                        CRC32CB(crc, 0);
        }
        return crc;
}
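
/*
 * Usage sketch (illustrative only, not compiled as part of this file).
 * In Ceph this routine is normally reached through the ceph_crc32c()
 * dispatcher, but it can be called directly.  The seed and buffer below
 * are hypothetical; since a NULL buffer computes the CRC of len zero
 * bytes, the two calls must agree for an all-zero input.
 *
 *     #include <stdint.h>
 *     #include <stdio.h>
 *
 *     extern uint32_t ceph_crc32c_aarch64(uint32_t crc,
 *                                         unsigned char const *buffer,
 *                                         unsigned len);
 *
 *     int main(void)
 *     {
 *             unsigned char buf[4096] = { 0 };  // hypothetical all-zero input
 *             uint32_t a = ceph_crc32c_aarch64(0xffffffffU, buf, sizeof(buf));
 *             uint32_t b = ceph_crc32c_aarch64(0xffffffffU, NULL, sizeof(buf));
 *             printf("crc=%08x null=%08x\n", a, b);
 *             return a == b ? 0 : 1;  // equal by construction
 *     }
 */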