2 * Ceph - scalable distributed file system
4 * Copyright (C) 2014 CERN (Switzerland)
6 * Author: Andreas-Joachim Peters <Andreas.Joachim.Peters@cern.ch>
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
15 // -----------------------------------------------------------------------------
18 // -----------------------------------------------------------------------------
19 #include "common/debug.h"
20 #include "ErasureCodeIsa.h"
22 #include "include/assert.h"
25 // -----------------------------------------------------------------------------
27 #include "isa-l/include/erasure_code.h"
29 // -----------------------------------------------------------------------------
30 #define dout_context g_ceph_context
31 #define dout_subsys ceph_subsys_osd
33 #define dout_prefix _prefix(_dout)
34 // -----------------------------------------------------------------------------
36 // -----------------------------------------------------------------------------
39 _prefix(std::ostream* _dout)
41 return *_dout << "ErasureCodeIsa: ";
43 // -----------------------------------------------------------------------------
45 const std::string ErasureCodeIsaDefault::DEFAULT_K("7");
46 const std::string ErasureCodeIsaDefault::DEFAULT_M("3");
49 // -----------------------------------------------------------------------------
52 ErasureCodeIsa::init(ErasureCodeProfile &profile, ostream *ss)
55 err |= parse(profile, ss);
59 return ErasureCode::init(profile, ss);
62 // -----------------------------------------------------------------------------
65 ErasureCodeIsa::get_chunk_size(unsigned int object_size) const
67 unsigned alignment = get_alignment();
68 unsigned chunk_size = ( object_size + k - 1 ) / k;
69 dout(20) << "get_chunk_size: chunk_size " << chunk_size
70 << " must be modulo " << alignment << dendl;
71 unsigned modulo = chunk_size % alignment;
73 dout(10) << "get_chunk_size: " << chunk_size
74 << " padded to " << chunk_size + alignment - modulo << dendl;
75 chunk_size += alignment - modulo;
80 // -----------------------------------------------------------------------------
82 int ErasureCodeIsa::encode_chunks(const set<int> &want_to_encode,
83 map<int, bufferlist> *encoded)
86 for (int i = 0; i < k + m; i++)
87 chunks[i] = (*encoded)[i].c_str();
88 isa_encode(&chunks[0], &chunks[k], (*encoded)[0].length());
92 int ErasureCodeIsa::decode_chunks(const set<int> &want_to_read,
93 const map<int, bufferlist> &chunks,
94 map<int, bufferlist> *decoded)
96 unsigned blocksize = (*chunks.begin()).second.length();
97 int erasures[k + m + 1];
98 int erasures_count = 0;
101 for (int i = 0; i < k + m; i++) {
102 if (chunks.find(i) == chunks.end()) {
103 erasures[erasures_count] = i;
107 data[i] = (*decoded)[i].c_str();
109 coding[i - k] = (*decoded)[i].c_str();
111 erasures[erasures_count] = -1;
112 assert(erasures_count > 0);
113 return isa_decode(erasures, data, coding, blocksize);
116 // -----------------------------------------------------------------------------
119 ErasureCodeIsaDefault::isa_encode(char **data,
125 // single parity stripe
126 region_xor((unsigned char**) data, (unsigned char*) coding[0], k, blocksize);
128 ec_encode_data(blocksize, k, m, encode_tbls,
129 (unsigned char**) data, (unsigned char**) coding);
132 // -----------------------------------------------------------------------------
135 ErasureCodeIsaDefault::erasure_contains(int *erasures, int i)
137 for (int l = 0; erasures[l] != -1; l++) {
138 if (erasures[l] == i)
144 // -----------------------------------------------------------------------------
148 // -----------------------------------------------------------------------------
151 ErasureCodeIsaDefault::isa_decode(int *erasures,
160 for (int l = 0; erasures[l] != -1; l++) {
164 unsigned char *recover_source[k];
165 unsigned char *recover_target[m];
167 memset(recover_source, 0, sizeof (recover_source));
168 memset(recover_target, 0, sizeof (recover_target));
170 // ---------------------------------------------
171 // Assign source and target buffers
172 // ---------------------------------------------
173 for (i = 0, s = 0, r = 0; ((r < k) || (s < nerrs)) && (i < (k + m)); i++) {
174 if (!erasure_contains(erasures, i)) {
177 recover_source[r] = (unsigned char*) data[i];
179 recover_source[r] = (unsigned char*) coding[i - k];
186 recover_target[s] = (unsigned char*) data[i];
188 recover_target[s] = (unsigned char*) coding[i - k];
196 // single parity decoding
198 dout(20) << "isa_decode: reconstruct using region xor [" <<
199 erasures[0] << "]" << dendl;
200 region_xor(recover_source, recover_target[0], k, blocksize);
205 if ((matrixtype == kVandermonde) &&
207 (erasures[0] < (k + 1))) {
208 // use xor decoding if a data chunk is missing or the first coding chunk
209 dout(20) << "isa_decode: reconstruct using region xor [" <<
210 erasures[0] << "]" << dendl;
213 region_xor(recover_source, recover_target[0], k, blocksize);
217 unsigned char d[k * (m + k)];
218 unsigned char decode_tbls[k * (m + k)*32];
219 unsigned char *p_tbls = decode_tbls;
226 std::string erasure_signature; // describes a matrix configuration for caching
228 // ---------------------------------------------
229 // Construct b by removing error rows
230 // ---------------------------------------------
232 for (i = 0, r = 0; i < k; i++, r++) {
234 while (erasure_contains(erasures, r))
239 snprintf(id, sizeof (id), "+%d", r);
240 erasure_signature += id;
243 for (int p = 0; p < nerrs; p++) {
245 snprintf(id, sizeof (id), "-%d", erasures[p]);
246 erasure_signature += id;
249 // ---------------------------------------------
250 // Try to get an already computed matrix
251 // ---------------------------------------------
252 if (!tcache.getDecodingTableFromCache(erasure_signature, p_tbls, matrixtype, k, m)) {
254 unsigned char b[k * (m + k)];
255 unsigned char c[k * (m + k)];
257 for (i = 0; i < k; i++) {
259 for (j = 0; j < k; j++)
260 b[k * i + j] = encode_coeff[k * r + j];
262 // ---------------------------------------------
263 // Compute inverted matrix
264 // ---------------------------------------------
266 // --------------------------------------------------------
267 // Remark: this may fail for certain Vandermonde matrices !
268 // There is an advanced way trying to use different
269 // source chunks to get an invertible matrix, however
270 // there are also (k,m) combinations which cannot be
271 // inverted when m chunks are lost and this optimizations
272 // does not help. Therefor we keep the code simpler.
273 // --------------------------------------------------------
274 if (gf_invert_matrix(b, d, k) < 0) {
275 dout(0) << "isa_decode: bad matrix" << dendl;
279 for (int p = 0; p < nerrs; p++) {
280 if (erasures[p] < k) {
281 // decoding matrix elements for data chunks
282 for (j = 0; j < k; j++) {
283 c[k * p + j] = d[k * erasures[p] + j];
286 // decoding matrix element for coding chunks
287 for (i = 0; i < k; i++) {
289 for (j = 0; j < k; j++)
290 s ^= gf_mul(d[j * k + i],
291 encode_coeff[k * erasures[p] + j]);
298 // ---------------------------------------------
299 // Initialize Decoding Table
300 // ---------------------------------------------
301 ec_init_tables(k, nerrs, c, decode_tbls);
302 tcache.putDecodingTableToCache(erasure_signature, p_tbls, matrixtype, k, m);
304 // Recover data sources
305 ec_encode_data(blocksize,
306 k, nerrs, decode_tbls, recover_source, recover_target);
312 // -----------------------------------------------------------------------------
315 ErasureCodeIsaDefault::get_alignment() const
317 return EC_ISA_ADDRESS_ALIGNMENT;
320 // -----------------------------------------------------------------------------
322 int ErasureCodeIsaDefault::parse(ErasureCodeProfile &profile,
325 int err = ErasureCode::parse(profile, ss);
326 err |= to_int("k", profile, &k, DEFAULT_K, ss);
327 err |= to_int("m", profile, &m, DEFAULT_M, ss);
328 err |= sanity_check_k(k, ss);
330 if (matrixtype == kVandermonde) {
331 // these are verified safe values evaluated using the
332 // benchmarktool and 10*(combinatoric for maximum loss) random
335 *ss << "Vandermonde: m=" << m
336 << " should be less/equal than 32 : revert to k=32" << std::endl;
342 *ss << "Vandermonde: m=" << m
343 << " should be less than 5 to guarantee an MDS codec:"
344 << " revert to m=4" << std::endl;
351 *ss << "Vandermonde: k=" << k
352 << " should be less than 22 to guarantee an MDS"
353 << " codec with m=4: revert to k=21" << std::endl;
365 // -----------------------------------------------------------------------------
368 ErasureCodeIsaDefault::prepare()
370 // setup shared encoding table and coefficients
371 unsigned char** p_enc_table =
372 tcache.getEncodingTable(matrixtype, k, m);
374 unsigned char** p_enc_coeff =
375 tcache.getEncodingCoefficient(matrixtype, k, m);
378 dout(10) << "[ cache tables ] creating coeff for k=" <<
379 k << " m=" << m << dendl;
380 // build encoding coefficients which need to be computed once for each (k,m)
381 encode_coeff = (unsigned char*) malloc(k * (m + k));
383 if (matrixtype == kVandermonde)
384 gf_gen_rs_matrix(encode_coeff, k + m, k);
385 if (matrixtype == kCauchy)
386 gf_gen_cauchy1_matrix(encode_coeff, k + m, k);
388 // either our new created coefficients are stored or if they have been
389 // created in the meanwhile the locally allocated coefficients will be
390 // freed by setEncodingCoefficient
391 encode_coeff = tcache.setEncodingCoefficient(matrixtype, k, m, encode_coeff);
393 encode_coeff = *p_enc_coeff;
397 dout(10) << "[ cache tables ] creating tables for k=" <<
398 k << " m=" << m << dendl;
399 // build encoding table which needs to be computed once for each (k,m)
400 encode_tbls = (unsigned char*) malloc(k * (m + k)*32);
401 ec_init_tables(k, m, &encode_coeff[k * k], encode_tbls);
403 // either our new created table is stored or if it has been
404 // created in the meanwhile the locally allocated table will be
405 // freed by setEncodingTable
406 encode_tbls = tcache.setEncodingTable(matrixtype, k, m, encode_tbls);
408 encode_tbls = *p_enc_table;
411 unsigned memory_lru_cache =
412 k * (m + k) * 32 * tcache.decoding_tables_lru_length;
414 dout(10) << "[ cache memory ] = " << memory_lru_cache << " bytes" <<
416 ((matrixtype == kVandermonde) ? "Vandermonde" : "Cauchy") << dendl;
418 assert((matrixtype == kVandermonde) || (matrixtype == kCauchy));
421 // -----------------------------------------------------------------------------