1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 FUJITSU LIMITED
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Takanori Nakao <nakao.takanori@jp.fujitsu.com>
11 * Author: Takeshi Miyamae <miyamae.takeshi@jp.fujitsu.com>
12 * Author: Loic Dachary <loic@dachary.org>
14 * This library is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU Lesser General Public
16 * License as published by the Free Software Foundation; either
17 * version 2.1 of the License, or (at your option) any later version.
28 #include "common/debug.h"
29 #include "ErasureCodeShec.h"
31 #include "jerasure/include/jerasure.h"
32 #include "jerasure/include/galois.h"
34 extern int calc_determinant(int *matrix, int dim);
35 extern int* reed_sol_vandermonde_coding_matrix(int k, int m, int w);
38 #define dout_context g_ceph_context
39 #define dout_subsys ceph_subsys_osd
41 #define dout_prefix _prefix(_dout)
43 static ostream& _prefix(std::ostream* _dout)
45 return *_dout << "ErasureCodeShec: ";
48 int ErasureCodeShec::init(ErasureCodeProfile &profile,
52 err |= parse(profile);
56 return ErasureCode::init(profile, ss);
59 unsigned int ErasureCodeShec::get_chunk_size(unsigned int object_size) const
61 unsigned alignment = get_alignment();
62 unsigned tail = object_size % alignment;
63 unsigned padded_length = object_size + ( tail ? ( alignment - tail ) : 0 );
65 assert(padded_length % k == 0);
66 return padded_length / k;
69 int ErasureCodeShec::minimum_to_decode(const set<int> &want_to_read,
70 const set<int> &available_chunks,
71 set<int> *minimum_chunks)
73 if (!minimum_chunks) return -EINVAL;
75 for (set<int>::iterator it = available_chunks.begin(); it != available_chunks.end(); ++it){
76 if (*it < 0 || k+m <= *it) return -EINVAL;
79 for (set<int>::iterator it = want_to_read.begin(); it != want_to_read.end(); ++it){
80 if (*it < 0 || k+m <= *it) return -EINVAL;
87 memset(want, 0, sizeof(want));
88 memset(avails, 0, sizeof(avails));
89 memset(minimum, 0, sizeof(minimum));
90 (*minimum_chunks).clear();
92 for (set<int>::const_iterator i = want_to_read.begin();
93 i != want_to_read.end();
98 for (set<int>::const_iterator i = available_chunks.begin();
99 i != available_chunks.end();
105 int decoding_matrix[k*k];
108 memset(decoding_matrix, 0, sizeof(decoding_matrix));
109 memset(dm_row, 0, sizeof(dm_row));
110 memset(dm_column, 0, sizeof(dm_column));
111 if (shec_make_decoding_matrix(true, want, avails, decoding_matrix, dm_row, dm_column, minimum) < 0) {
116 for (int i = 0; i < k + m; i++) {
117 if (minimum[i] == 1) minimum_chunks->insert(i);
123 int ErasureCodeShec::minimum_to_decode_with_cost(const set<int> &want_to_read,
124 const map<int, int> &available,
125 set<int> *minimum_chunks)
127 set <int> available_chunks;
129 for (map<int, int>::const_iterator i = available.begin();
130 i != available.end();
132 available_chunks.insert(i->first);
134 return minimum_to_decode(want_to_read, available_chunks, minimum_chunks);
137 int ErasureCodeShec::encode(const set<int> &want_to_encode,
138 const bufferlist &in,
139 map<int, bufferlist> *encoded)
141 unsigned int k = get_data_chunk_count();
142 unsigned int m = get_chunk_count() - k;
145 if (!encoded || !encoded->empty()){
149 int err = encode_prepare(in, *encoded);
152 encode_chunks(want_to_encode, encoded);
153 for (unsigned int i = 0; i < k + m; i++) {
154 if (want_to_encode.count(i) == 0)
160 int ErasureCodeShec::encode_chunks(const set<int> &want_to_encode,
161 map<int, bufferlist> *encoded)
164 for (int i = 0; i < k + m; i++){
165 chunks[i] = (*encoded)[i].c_str();
167 shec_encode(&chunks[0], &chunks[k], (*encoded)[0].length());
171 int ErasureCodeShec::decode(const set<int> &want_to_read,
172 const map<int, bufferlist> &chunks,
173 map<int, bufferlist> *decoded)
177 if (!decoded || !decoded->empty()){
181 have.reserve(chunks.size());
182 for (map<int, bufferlist>::const_iterator i = chunks.begin();
185 have.push_back(i->first);
188 have.begin(), have.end(), want_to_read.begin(), want_to_read.end())) {
189 for (set<int>::iterator i = want_to_read.begin();
190 i != want_to_read.end();
192 (*decoded)[*i] = chunks.find(*i)->second;
196 unsigned int k = get_data_chunk_count();
197 unsigned int m = get_chunk_count() - k;
198 unsigned blocksize = (*chunks.begin()).second.length();
199 for (unsigned int i = 0; i < k + m; i++) {
200 if (chunks.find(i) == chunks.end()) {
201 bufferptr ptr(buffer::create_aligned(blocksize, SIMD_ALIGN));
202 (*decoded)[i].push_front(ptr);
204 (*decoded)[i] = chunks.find(i)->second;
205 (*decoded)[i].rebuild_aligned(SIMD_ALIGN);
208 return decode_chunks(want_to_read, chunks, decoded);
211 int ErasureCodeShec::decode_chunks(const set<int> &want_to_read,
212 const map<int, bufferlist> &chunks,
213 map<int, bufferlist> *decoded)
215 unsigned blocksize = (*chunks.begin()).second.length();
217 int erased_count = 0;
222 for (int i = 0; i < k + m; i++) {
224 if (chunks.find(i) == chunks.end()) {
225 if (want_to_read.count(i) > 0) {
234 data[i] = (*decoded)[i].c_str();
236 coding[i - k] = (*decoded)[i].c_str();
239 if (erased_count > 0) {
240 return shec_decode(erased, avails, data, coding, blocksize);
247 // ErasureCodeShecReedSolomonVandermonde
250 void ErasureCodeShecReedSolomonVandermonde::shec_encode(char **data,
254 jerasure_matrix_encode(k, m, w, matrix, data, coding, blocksize);
257 int ErasureCodeShecReedSolomonVandermonde::shec_decode(int *erased,
263 return shec_matrix_decode(erased, avails, data, coding, blocksize);
266 unsigned ErasureCodeShecReedSolomonVandermonde::get_alignment() const
268 return k*w*sizeof(int);
271 int ErasureCodeShecReedSolomonVandermonde::parse(const ErasureCodeProfile &profile)
275 if (profile.find("k") == profile.end() &&
276 profile.find("m") == profile.end() &&
277 profile.find("c") == profile.end()){
278 dout(10) << "(k, m, c) default to " << "(" << DEFAULT_K
279 << ", " << DEFAULT_M << ", " << DEFAULT_C << ")" << dendl;
280 k = DEFAULT_K; m = DEFAULT_M; c = DEFAULT_C;
281 } else if (profile.find("k") == profile.end() ||
282 profile.find("m") == profile.end() ||
283 profile.find("c") == profile.end()){
284 dout(10) << "(k, m, c) must be chosen" << dendl;
287 std::string err_k, err_m, err_c, value_k, value_m, value_c;
288 value_k = profile.find("k")->second;
289 value_m = profile.find("m")->second;
290 value_c = profile.find("c")->second;
291 k = strict_strtol(value_k.c_str(), 10, &err_k);
292 m = strict_strtol(value_m.c_str(), 10, &err_m);
293 c = strict_strtol(value_c.c_str(), 10, &err_c);
295 if (!err_k.empty() || !err_m.empty() || !err_c.empty()){
297 derr << "could not convert k=" << value_k << "to int" << dendl;
298 } else if (!err_m.empty()){
299 derr << "could not convert m=" << value_m << "to int" << dendl;
300 } else if (!err_c.empty()){
301 derr << "could not convert c=" << value_c << "to int" << dendl;
306 << " must be a positive number" << dendl;
310 << " must be a positive number" << dendl;
314 << " must be a positive number" << dendl;
318 << " must be less than or equal to m=" << m << dendl;
322 << " must be less than or equal to 12" << dendl;
324 } else if (k+m > 20){
325 derr << "k+m=" << k+m
326 << " must be less than or equal to 20" << dendl;
330 << " must be less than or equal to k=" << k << dendl;
336 derr << "(k, m, c)=(" << k << ", " << m << ", " << c
337 << ") is not a valid parameter." << dendl;
341 dout(10) << "(k, m, c) set to " << "(" << k << ", " << m << ", "
345 if (profile.find("w") == profile.end()){
346 dout(10) << "w default to " << DEFAULT_W << dendl;
349 std::string err_w, value_w;
350 value_w = profile.find("w")->second;
351 w = strict_strtol(value_w.c_str(), 10, &err_w);
354 derr << "could not convert w=" << value_w << "to int" << dendl;
355 dout(10) << "w default to " << DEFAULT_W << dendl;
358 } else if (w != 8 && w != 16 && w != 32) {
360 << " must be one of {8, 16, 32}" << dendl;
361 dout(10) << "w default to " << DEFAULT_W << dendl;
365 dout(10) << "w set to " << w << dendl;
371 void ErasureCodeShecReedSolomonVandermonde::prepare()
373 // setup shared encoding table
375 tcache.getEncodingTable(technique, k, m, c, w);
378 dout(10) << "[ cache tables ] creating coeff for k=" <<
379 k << " m=" << m << " c=" << c << " w=" << w << dendl;
381 matrix = shec_reedsolomon_coding_matrix(technique);
383 // either our new created table is stored or if it has been
384 // created in the meanwhile the locally allocated table will be
385 // freed by setEncodingTable
386 matrix = tcache.setEncodingTable(technique, k, m, c, w, matrix);
388 dout(10) << "matrix = " << dendl;
389 for (int i=0; i<m; i++) {
391 for (int j=0; j<k; j++) {
392 if (matrix[i*k+j] > 0) {
399 dout(10) << mat << dendl;
402 matrix = *p_enc_table;
405 dout(10) << " [ technique ] = " <<
406 ((technique == MULTIPLE) ? "multiple" : "single") << dendl;
408 assert((technique == SINGLE) || (technique == MULTIPLE));
413 // Mearged from shec.cc.
415 double ErasureCodeShec::shec_calc_recovery_efficiency1(int k, int m1, int m2, int c1, int c2){
418 int i, rr, cc, start, end;
421 if (m1 < c1 || m2 < c2) return -1;
422 if ((m1 == 0 && c1 != 0) || (m2 == 0 && c2 != 0)) return -1;
424 for (i=0; i<k; i++) r_eff_k[i] = 100000000;
427 for (rr=0; rr<m1; rr++){
428 start = ((rr*k)/m1) % k;
429 end = (((rr+c1)*k)/m1) % k;
430 for (cc=start, first_flag=1; first_flag || cc!=end; cc=(cc+1)%k){
432 r_eff_k[cc] = std::min(r_eff_k[cc], ((rr+c1)*k)/m1 - (rr*k)/m1);
434 r_e1 += ((rr+c1)*k)/m1 - (rr*k)/m1;
437 for (rr=0; rr<m2; rr++){
438 start = ((rr*k)/m2) % k;
439 end = (((rr+c2)*k)/m2) % k;
440 for (cc=start, first_flag=1; first_flag || cc!=end; cc=(cc+1)%k){
442 r_eff_k[cc] = std::min(r_eff_k[cc], ((rr+c2)*k)/m2 - (rr*k)/m2);
444 r_e1 += ((rr+c2)*k)/m2 - (rr*k)/m2;
456 int* ErasureCodeShec::shec_reedsolomon_coding_matrix(int is_single)
459 int rr, cc, start, end;
462 if (w != 8 && w != 16 && w != 32) return NULL;
465 int c1_best = -1, m1_best = -1;
466 double min_r_e1 = 100.0;
468 // create all multiple shec pattern and choose best.
470 for (c1=0; c1 <= c/2; c1++){
471 for (m1=0; m1 <= m; m1++){
475 if (m1 < c1 || m2 < c2) continue;
476 if ((m1 == 0 && c1 != 0) || (m2 == 0 && c2 != 0)) continue;
477 if ((m1 != 0 && c1 == 0) || (m2 != 0 && c2 == 0)) continue;
483 r_e1 = shec_calc_recovery_efficiency1(k, m1, m2, c1, c2);
484 if (min_r_e1 - r_e1 > std::numeric_limits<double>::epsilon() &&
505 matrix = reed_sol_vandermonde_coding_matrix(k, m, w);
507 for (rr=0; rr<m1; rr++){
508 end = ((rr*k)/m1) % k;
509 start = (((rr+c1)*k)/m1) % k;
510 for (cc=start; cc!=end; cc=(cc+1)%k){
511 matrix[cc + rr*k] = 0;
515 for (rr=0; rr<m2; rr++){
516 end = ((rr*k)/m2) % k;
517 start = (((rr+c2)*k)/m2) % k;
518 for (cc=start; cc!=end; cc=(cc+1)%k){
519 matrix[cc + (rr+m1)*k] = 0;
526 int ErasureCodeShec::shec_make_decoding_matrix(bool prepare, int *want_, int *avails,
527 int *decoding_matrix, int *dm_row, int *dm_column,
530 int mindup = k+1, minp = k+1;
533 memset(want, 0, sizeof(want));
535 for (int i = 0; i < k + m; ++i) {
539 for (int i = 0; i < m; ++i) {
540 if (want[i + k] && !avails[i + k]) {
541 for (int j=0; j < k; ++j) {
542 if (matrix[i * k + j] > 0) {
549 if (tcache.getDecodingTableFromCache(decoding_matrix,
550 dm_row, dm_column, minimum,
557 for (unsigned long long pp = 0; pp < (1ull << m); ++pp) {
559 // select parity chunks
562 for (int i=0; i < m; ++i) {
563 if (pp & (1ull << i)) {
571 // Are selected parity chunks avail?
573 for (int i = 0; i < ek && ok; i++) {
574 if (!avails[k+p[i]]) {
586 for (int i = 0; i < k + m; i++) {
589 for (int i = 0; i < k; i++) {
593 for (int i=0; i < k; i++) {
594 if (want[i] && !avails[i]) {
599 // Parity chunks which are used to recovery erased data chunks, are added to tmprow.
600 for (int i = 0; i < ek; i++) {
601 tmprow[k + p[i]] = 1;
602 for (int j = 0; j < k; j++) {
603 int element = matrix[(p[i]) * k + j];
607 if (element != 0 && avails[j] == 1) {
613 int dup_row = 0, dup_column = 0, dup = 0;
614 for (int i = 0; i < k + m; i++) {
620 for (int i = 0; i < k; i++) {
626 if (dup_row != dup_column) {
632 for (int i = 0; i < k; i++) {
635 for (int i = 0; i < k; i++) {
641 // minimum is updated.
643 int tmpmat[dup * dup];
645 for (int i = 0, row = 0; i < k + m; i++) {
647 for (int j = 0, column = 0; j < k; j++) {
650 tmpmat[row * dup + column] = (i == j ? 1 : 0);
652 tmpmat[row * dup + column] = matrix[(i - k) * k + j];
661 int det = calc_determinant(tmpmat, dup);
666 for (int i = 0; i < k; i++) {
669 for (int i = 0; i < k; i++) {
674 for (int i=0; i < k + m; i++) {
676 dm_row[row_id++] = i;
679 for (int i=0; i < k; i++) {
681 dm_column[column_id++] = i;
691 fprintf(stderr, "shec_make_decoding_matrix(): can't find recover matrix.\n");
695 for (int i = 0; i < k + m; i++) {
699 for (int i=0; i < k && dm_row[i] != -1; i++) {
700 minimum[dm_row[i]] = 1;
703 for (int i = 0; i < k; ++i) {
704 if (want[i] && avails[i]) {
709 for (int i = 0; i < m; ++i) {
710 if (want[k + i] && avails[k + i] && !minimum[k + i]) {
711 for (int j = 0; j < k; ++j) {
712 if (matrix[i * k + j] > 0 && !want[j]) {
724 int tmpmat[mindup * mindup];
725 for (int i=0; i < mindup; i++) {
726 for (int j=0; j < mindup; j++) {
728 tmpmat[i * mindup + j] = (dm_row[i] == dm_column[j] ? 1 : 0);
730 tmpmat[i * mindup + j] = matrix[(dm_row[i] - k) * k + dm_column[j]];
734 for (int j = 0; j < mindup; j++) {
735 if (dm_row[i] == dm_column[j]) {
740 dm_row[i] -= (k - mindup);
748 int ret = jerasure_invert_matrix(tmpmat, decoding_matrix, mindup, w);
750 tcache.putDecodingTableToCache(decoding_matrix, dm_row, dm_column, minimum, technique,
751 k, m, c, w, want, avails);
756 int ErasureCodeShec::shec_matrix_decode(int *want, int *avails, char **data_ptrs,
757 char **coding_ptrs, int size)
759 int decoding_matrix[k*k];
760 int dm_row[k], dm_column[k];
763 memset(decoding_matrix, 0, sizeof(decoding_matrix));
764 memset(dm_row, -1, sizeof(dm_row));
765 memset(dm_column, -1, sizeof(dm_column));
766 memset(minimum, -1, sizeof(minimum));
768 if (w != 8 && w != 16 && w != 32) return -1;
770 if (shec_make_decoding_matrix(false, want, avails, decoding_matrix,
771 dm_row, dm_column, minimum) < 0) {
775 // Get decoding matrix size
777 for (int i = 0; i < k; i++) {
778 if (dm_row[i] == -1) {
784 char *dm_data_ptrs[dm_size];
785 for (int i = 0; i < dm_size; i++) {
786 dm_data_ptrs[i] = data_ptrs[dm_column[i]];
789 // Decode the data drives
790 for (int i = 0; i < dm_size; i++) {
791 if (!avails[dm_column[i]]) {
792 jerasure_matrix_dotprod(dm_size, w, decoding_matrix + (i * dm_size),
793 dm_row, i, dm_data_ptrs, coding_ptrs, size);
797 // Re-encode any erased coding devices
798 for (int i = 0; i < m; i++) {
799 if (want[k+i] && !avails[k+i]) {
800 jerasure_matrix_dotprod(k, w, matrix + (i * k), NULL, i+k,
801 data_ptrs, coding_ptrs, size);