// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include <errno.h>

#include "common/debug.h"
#include "mdstypes.h"
#include "mds/flock.h"

#define dout_subsys ceph_subsys_mds

static multimap<ceph_filelock, ceph_lock_state_t*> global_waiting_locks;

static void remove_global_waiting(ceph_filelock &fl, ceph_lock_state_t *lock_state)
{
  for (auto p = global_waiting_locks.find(fl);
       p != global_waiting_locks.end(); ) {
    if (p->first != fl)
      break;
    if (p->second == lock_state) {
      global_waiting_locks.erase(p);
      break;
    }
    ++p;
  }
}

ceph_lock_state_t::~ceph_lock_state_t()
{
  if (type == CEPH_LOCK_FCNTL) {
    for (auto p = waiting_locks.begin(); p != waiting_locks.end(); ++p) {
      remove_global_waiting(p->second, this);
    }
  }
}

bool ceph_lock_state_t::is_waiting(const ceph_filelock &fl) const
{
  multimap<uint64_t, ceph_filelock>::const_iterator p = waiting_locks.find(fl.start);
  while (p != waiting_locks.end()) {
    if (p->second.start > fl.start)
      return false;
    if (p->second.length == fl.length &&
	ceph_filelock_owner_equal(p->second, fl))
      return true;
    ++p;
  }
  return false;
}

void ceph_lock_state_t::remove_waiting(const ceph_filelock& fl)
{
  for (auto p = waiting_locks.find(fl.start);
       p != waiting_locks.end(); ) {
    if (p->second.start > fl.start)
      break;
    if (p->second.length == fl.length &&
	ceph_filelock_owner_equal(p->second, fl)) {
      if (type == CEPH_LOCK_FCNTL) {
	remove_global_waiting(p->second, this);
      }
      waiting_locks.erase(p);
      --client_waiting_lock_counts[(client_t)fl.client];
      if (!client_waiting_lock_counts[(client_t)fl.client]) {
	client_waiting_lock_counts.erase((client_t)fl.client);
      }
      break;
    }
    ++p;
  }
}
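/*
 * is_deadlock: called before queueing a conflicting POSIX (fcntl) lock
 * request on the wait list.  Starting from the owners of the locks that
 * conflict with 'fl', it follows global_waiting_locks to see what those
 * owners are themselves waiting for, recursing up to MAX_DEADLK_DEPTH
 * levels.  If the wait-for chain leads back to the owner of the original
 * request (first_fl), queueing would close a cycle, so the request is
 * rejected as a deadlock instead of being added to waiting_locks.
 */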
bool ceph_lock_state_t::is_deadlock(const ceph_filelock& fl,
				    list<multimap<uint64_t, ceph_filelock>::iterator>&
				      overlapping_locks,
				    const ceph_filelock *first_fl, unsigned depth) const
{
  ldout(cct,15) << "is_deadlock " << fl << dendl;

  // only for posix lock
  if (type != CEPH_LOCK_FCNTL)
    return false;

  // collect the owners of the conflicting locks
  set<ceph_filelock> lock_owners;
  for (auto p = overlapping_locks.begin();
       p != overlapping_locks.end();
       ++p) {

    if (fl.type == CEPH_LOCK_SHARED &&
	(*p)->second.type == CEPH_LOCK_SHARED)
      continue;

    // cycle detected: the chain of waiters leads back to the original requester
    if (first_fl && ceph_filelock_owner_equal(*first_fl, (*p)->second)) {
      ldout(cct,15) << " detect deadlock" << dendl;
      return true;
    }

    ceph_filelock tmp = (*p)->second;
    tmp.start = 0;
    tmp.length = 0;
    tmp.type = 0;
    lock_owners.insert(tmp);
  }

  if (depth >= MAX_DEADLK_DEPTH)
    return false;

  first_fl = first_fl ? first_fl : &fl;
  for (auto p = lock_owners.begin();
       p != lock_owners.end();
       ++p) {
    ldout(cct,15) << " conflict lock owner " << *p << dendl;
    // is the conflicting lock's owner itself waiting for some other lock?
    for (auto q = global_waiting_locks.lower_bound(*p);
	 q != global_waiting_locks.end();
	 ++q) {
      if (!ceph_filelock_owner_equal(q->first, *p))
	break;

      list<multimap<uint64_t, ceph_filelock>::iterator>
	_overlapping_locks, _self_overlapping_locks;
      ceph_lock_state_t& state = *(q->second);
      if (state.get_overlapping_locks(q->first, _overlapping_locks)) {
	state.split_by_owner(q->first, _overlapping_locks, _self_overlapping_locks);
      }
      if (!_overlapping_locks.empty()) {
	if (is_deadlock(q->first, _overlapping_locks, first_fl, depth + 1))
	  return true;
      }
    }
  }
  return false;
}

void ceph_lock_state_t::add_waiting(const ceph_filelock& fl)
{
  waiting_locks.insert(pair<uint64_t, ceph_filelock>(fl.start, fl));
  ++client_waiting_lock_counts[(client_t)fl.client];
  if (type == CEPH_LOCK_FCNTL) {
    global_waiting_locks.insert(pair<ceph_filelock, ceph_lock_state_t*>(fl, this));
  }
}

bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock, bool wait_on_fail,
				 bool replay, bool *deadlock)
{
  ldout(cct,15) << "add_lock " << new_lock << dendl;
  bool ret = false;
  list<multimap<uint64_t, ceph_filelock>::iterator>
    overlapping_locks, self_overlapping_locks, neighbor_locks;

  // first, get any overlapping locks and split them into owned-by-us and not
  if (get_overlapping_locks(new_lock, overlapping_locks, &neighbor_locks)) {
    ldout(cct,15) << "got overlapping lock, splitting by owner" << dendl;
    split_by_owner(new_lock, overlapping_locks, self_overlapping_locks);
  }
  if (!overlapping_locks.empty()) { //overlapping locks owned by others :(
    if (CEPH_LOCK_EXCL == new_lock.type) {
      //can't set, we want an exclusive
      ldout(cct,15) << "overlapping lock, and this lock is exclusive, can't set"
		    << dendl;
      if (wait_on_fail && !replay) {
	if (is_deadlock(new_lock, overlapping_locks))
	  *deadlock = true;
	else
	  add_waiting(new_lock);
      }
    } else { //shared lock, check for any exclusive locks blocking us
      if (contains_exclusive_lock(overlapping_locks)) { //blocked :(
	ldout(cct,15) << " blocked by exclusive lock in overlapping_locks"
		      << dendl;
	if (wait_on_fail && !replay) {
	  if (is_deadlock(new_lock, overlapping_locks))
	    *deadlock = true;
	  else
	    add_waiting(new_lock);
	}
      } else {
	//yay, we can insert a shared lock
	ldout(cct,15) << "inserting shared lock" << dendl;
	remove_waiting(new_lock);
	adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
	held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
	ret = true;
      }
    }
  } else { //no overlapping locks except our own
    remove_waiting(new_lock);
    adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
    ldout(cct,15) << "no conflicts, inserting " << new_lock << dendl;
    held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
    ret = true;
  }
  if (ret) {
    ++client_held_lock_counts[(client_t)new_lock.client];
  }
  return ret;
}

void ceph_lock_state_t::look_for_lock(ceph_filelock& testing_lock)
{
  list<multimap<uint64_t, ceph_filelock>::iterator>
    overlapping_locks, self_overlapping_locks;
  if (get_overlapping_locks(testing_lock, overlapping_locks)) {
    split_by_owner(testing_lock, overlapping_locks, self_overlapping_locks);
  }
  if (!overlapping_locks.empty()) { //somebody else owns overlapping lock
    if (CEPH_LOCK_EXCL == testing_lock.type) { //any lock blocks it
      testing_lock = (*overlapping_locks.begin())->second;
    } else {
      ceph_filelock *blocking_lock;
      if ((blocking_lock = contains_exclusive_lock(overlapping_locks))) {
	testing_lock = *blocking_lock;
      } else { //nothing blocking!
	testing_lock.type = CEPH_LOCK_UNLOCK;
      }
    }
    return;
  }
  //if we get here, only our own locks block
  testing_lock.type = CEPH_LOCK_UNLOCK;
}
void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
				    list<ceph_filelock>& activated_locks)
{
  list<multimap<uint64_t, ceph_filelock>::iterator>
    overlapping_locks, self_overlapping_locks;
  if (get_overlapping_locks(removal_lock, overlapping_locks)) {
    ldout(cct,15) << "splitting by owner" << dendl;
    split_by_owner(removal_lock, overlapping_locks, self_overlapping_locks);
  } else
    ldout(cct,15) << "attempt to remove lock at " << removal_lock.start
		  << " but no locks there!" << dendl;
  bool remove_to_end = (0 == removal_lock.length);
  uint64_t removal_start = removal_lock.start;
  uint64_t removal_end = removal_start + removal_lock.length - 1;
  __s64 old_lock_client = 0;
  ceph_filelock *old_lock;

  ldout(cct,15) << "examining " << self_overlapping_locks.size()
		<< " self-overlapping locks for removal" << dendl;
  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
	 iter = self_overlapping_locks.begin();
       iter != self_overlapping_locks.end();
       ++iter) {
    ldout(cct,15) << "self overlapping lock " << (*iter)->second << dendl;
    old_lock = &(*iter)->second;
    bool old_lock_to_end = (0 == old_lock->length);
    uint64_t old_lock_end = old_lock->start + old_lock->length - 1;
    old_lock_client = old_lock->client;
    if (remove_to_end) {
      if (old_lock->start < removal_start) {
	old_lock->length = removal_start - old_lock->start;
      } else {
	ldout(cct,15) << "erasing " << (*iter)->second << dendl;
	held_locks.erase(*iter);
	--client_held_lock_counts[old_lock_client];
      }
    } else if (old_lock_to_end) {
      ceph_filelock append_lock = *old_lock;
      append_lock.start = removal_end + 1;
      held_locks.insert(pair<uint64_t, ceph_filelock>(append_lock.start,
						      append_lock));
      ++client_held_lock_counts[(client_t)old_lock->client];
      if (old_lock->start >= removal_start) {
	ldout(cct,15) << "erasing " << (*iter)->second << dendl;
	held_locks.erase(*iter);
	--client_held_lock_counts[old_lock_client];
      } else
	old_lock->length = removal_start - old_lock->start;
    } else {
      if (old_lock_end > removal_end) {
	ceph_filelock append_lock = *old_lock;
	append_lock.start = removal_end + 1;
	append_lock.length = old_lock_end - append_lock.start + 1;
	held_locks.insert(pair<uint64_t, ceph_filelock>(append_lock.start,
							append_lock));
	++client_held_lock_counts[(client_t)old_lock->client];
      }
      if (old_lock->start < removal_start) {
	old_lock->length = removal_start - old_lock->start;
      } else {
	ldout(cct,15) << "erasing " << (*iter)->second << dendl;
	held_locks.erase(*iter);
	--client_held_lock_counts[old_lock_client];
      }
    }
    if (!client_held_lock_counts[old_lock_client]) {
      client_held_lock_counts.erase(old_lock_client);
    }
  }
}

bool ceph_lock_state_t::remove_all_from(client_t client)
{
  bool cleared_any = false;
  if (client_held_lock_counts.count(client)) {
    multimap<uint64_t, ceph_filelock>::iterator iter = held_locks.begin();
    while (iter != held_locks.end()) {
      if ((client_t)iter->second.client == client) {
	held_locks.erase(iter++);
      } else
	++iter;
    }
    client_held_lock_counts.erase(client);
    cleared_any = true;
  }
  if (client_waiting_lock_counts.count(client)) {
    multimap<uint64_t, ceph_filelock>::iterator iter = waiting_locks.begin();
    while (iter != waiting_locks.end()) {
      if ((client_t)iter->second.client != client) {
	++iter;
	continue;
      }
      if (type == CEPH_LOCK_FCNTL) {
	remove_global_waiting(iter->second, this);
      }
      waiting_locks.erase(iter++);
    }
    client_waiting_lock_counts.erase(client);
  }
  return cleared_any;
}
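/*
 * adjust_locks: reconcile a new lock with the caller's own existing locks.
 * 'old_locks' holds same-owner locks that overlap the new one; each is
 * absorbed into new_lock (same type), shrunk, or split around it (different
 * type).  'neighbor_locks' holds same-owner locks that merely touch
 * new_lock; those of the same type are coalesced into it so adjacent ranges
 * collapse into a single record.  The caller inserts new_lock into
 * held_locks afterwards.
 */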
void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks,
				     ceph_filelock& new_lock,
				     list<multimap<uint64_t, ceph_filelock>::iterator>
				       neighbor_locks)
{
  ldout(cct,15) << "adjust_locks" << dendl;
  bool new_lock_to_end = (0 == new_lock.length);
  __s64 old_lock_client = 0;
  ceph_filelock *old_lock;
  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
	 iter = old_locks.begin();
       iter != old_locks.end();
       ++iter) {
    old_lock = &(*iter)->second;
    ldout(cct,15) << "adjusting lock: " << *old_lock << dendl;
    bool old_lock_to_end = (0 == old_lock->length);
    uint64_t old_lock_start = old_lock->start;
    uint64_t old_lock_end = old_lock->start + old_lock->length - 1;
    uint64_t new_lock_start = new_lock.start;
    uint64_t new_lock_end = new_lock.start + new_lock.length - 1;
    old_lock_client = old_lock->client;
    if (new_lock_to_end || old_lock_to_end) {
      //special code path to deal with a length set at 0
      ldout(cct,15) << "one lock extends forever" << dendl;
      if (old_lock->type == new_lock.type) {
	//just unify them in new lock, remove old lock
	ldout(cct,15) << "same lock type, unifying" << dendl;
	new_lock.start = (new_lock_start < old_lock_start) ? new_lock_start :
	  old_lock_start;
	new_lock.length = 0;
	held_locks.erase(*iter);
	--client_held_lock_counts[old_lock_client];
      } else { //not same type, have to keep any remains of old lock around
	ldout(cct,15) << "shrinking old lock" << dendl;
	if (new_lock_to_end) {
	  if (old_lock_start < new_lock_start) {
	    old_lock->length = new_lock_start - old_lock_start;
	  } else {
	    held_locks.erase(*iter);
	    --client_held_lock_counts[old_lock_client];
	  }
	} else { //old lock extends past end of new lock
	  ceph_filelock appended_lock = *old_lock;
	  appended_lock.start = new_lock_end + 1;
	  held_locks.insert(pair<uint64_t, ceph_filelock>(appended_lock.start,
							  appended_lock));
	  ++client_held_lock_counts[(client_t)old_lock->client];
	  if (old_lock_start < new_lock_start) {
	    old_lock->length = new_lock_start - old_lock_start;
	  } else {
	    held_locks.erase(*iter);
	    --client_held_lock_counts[old_lock_client];
	  }
	}
      }
    } else {
      if (old_lock->type == new_lock.type) { //just merge them!
	ldout(cct,15) << "merging locks, they're the same type" << dendl;
	new_lock.start = (old_lock_start < new_lock_start) ? old_lock_start :
	  new_lock_start;
	// use a 64-bit endpoint; an int here would truncate large offsets
	uint64_t new_end = (new_lock_end > old_lock_end) ? new_lock_end :
	  old_lock_end;
	new_lock.length = new_end - new_lock.start + 1;
	ldout(cct,15) << "erasing lock " << (*iter)->second << dendl;
	held_locks.erase(*iter);
	--client_held_lock_counts[old_lock_client];
      } else { //we'll have to update sizes and maybe make new locks
	ldout(cct,15) << "locks aren't same type, changing sizes" << dendl;
	if (old_lock_end > new_lock_end) { //add extra lock after new_lock
	  ceph_filelock appended_lock = *old_lock;
	  appended_lock.start = new_lock_end + 1;
	  appended_lock.length = old_lock_end - appended_lock.start + 1;
	  held_locks.insert(pair<uint64_t, ceph_filelock>(appended_lock.start,
							  appended_lock));
	  ++client_held_lock_counts[(client_t)old_lock->client];
	}
	if (old_lock_start < new_lock_start) {
	  old_lock->length = new_lock_start - old_lock_start;
	} else { //old_lock starts inside new_lock, so remove it
	  //if it extended past new_lock_end it's been replaced
	  held_locks.erase(*iter);
	  --client_held_lock_counts[old_lock_client];
	}
      }
    }
    if (!client_held_lock_counts[old_lock_client]) {
      client_held_lock_counts.erase(old_lock_client);
    }
  }
  //make sure to coalesce neighboring locks
  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
	 iter = neighbor_locks.begin();
       iter != neighbor_locks.end();
       ++iter) {
    old_lock = &(*iter)->second;
    old_lock_client = old_lock->client;
    ldout(cct,15) << "lock to coalesce: " << *old_lock << dendl;
    /* because if it's a neighboring lock there can't be any self-overlapping
       locks that covered it */
    if (old_lock->type == new_lock.type) { //merge them
      if (0 == new_lock.length) {
	if (old_lock->start + old_lock->length == new_lock.start) {
	  new_lock.start = old_lock->start;
	} else ceph_abort(); /* if there's no end to new_lock, the neighbor
				HAS TO be on the left side */
      } else if (0 == old_lock->length) {
	if (new_lock.start + new_lock.length == old_lock->start) {
	  new_lock.length = 0;
	} else ceph_abort(); //same as before, but reversed
      } else {
	if (old_lock->start + old_lock->length == new_lock.start) {
	  new_lock.start = old_lock->start;
	  new_lock.length = old_lock->length + new_lock.length;
	} else if (new_lock.start + new_lock.length == old_lock->start) {
	  new_lock.length = old_lock->length + new_lock.length;
	}
      }
      held_locks.erase(*iter);
      --client_held_lock_counts[old_lock_client];
    }
    if (!client_held_lock_counts[old_lock_client]) {
      client_held_lock_counts.erase(old_lock_client);
    }
  }
}

multimap<uint64_t, ceph_filelock>::iterator
ceph_lock_state_t::get_lower_bound(uint64_t start,
				   multimap<uint64_t, ceph_filelock>& lock_map)
{
  multimap<uint64_t, ceph_filelock>::iterator lower_bound =
    lock_map.lower_bound(start);
  if ((lower_bound->first != start)
      && (start != 0)
      && (lower_bound != lock_map.begin())) --lower_bound;
  if (lock_map.end() == lower_bound)
    ldout(cct,15) << "get_lower_bound returning end()" << dendl;
  else
    ldout(cct,15) << "get_lower_bound returning iterator pointing to "
		  << lower_bound->second << dendl;
  return lower_bound;
}

multimap<uint64_t, ceph_filelock>::iterator
ceph_lock_state_t::get_last_before(uint64_t end,
				   multimap<uint64_t, ceph_filelock>& lock_map)
{
  multimap<uint64_t, ceph_filelock>::iterator last = lock_map.upper_bound(end);
  if (last != lock_map.begin()) --last;
  if (lock_map.end() == last)
    ldout(cct,15) << "get_last_before returning end()" << dendl;
  else
    ldout(cct,15) << "get_last_before returning iterator pointing to "
		  << last->second << dendl;
  return last;
}

bool ceph_lock_state_t::share_space(
    multimap<uint64_t, ceph_filelock>::iterator& iter,
    uint64_t start, uint64_t end)
{
  bool ret = ((iter->first >= start && iter->first <= end) ||
	      ((iter->first < start) &&
	       (((iter->first + iter->second.length - 1) >= start) ||
		(0 == iter->second.length))));
  ldout(cct,15) << "share_space got start: " << start << ", end: " << end
		<< ", lock: " << iter->second << ", returning " << ret << dendl;
  return ret;
}
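/*
 * get_overlapping_locks: collect iterators to every held lock that shares
 * byte range with 'lock', walking backwards from the last held lock that
 * starts at or before the end of the requested range.  If 'self_neighbors'
 * is non-NULL, same-owner locks that merely touch the requested range
 * (at start-1 or end+1) are reported separately so adjust_locks() can
 * coalesce them.
 */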
returning " << ret << dendl; return ret; } bool ceph_lock_state_t::get_overlapping_locks(const ceph_filelock& lock, list::iterator> & overlaps, list::iterator> *self_neighbors) { ldout(cct,15) << "get_overlapping_locks" << dendl; // create a lock starting one earlier and ending one later // to check for neighbors ceph_filelock neighbor_check_lock = lock; if (neighbor_check_lock.start != 0) { neighbor_check_lock.start = neighbor_check_lock.start - 1; if (neighbor_check_lock.length) neighbor_check_lock.length = neighbor_check_lock.length + 2; } else { if (neighbor_check_lock.length) neighbor_check_lock.length = neighbor_check_lock.length + 1; } //find the last held lock starting at the point after lock uint64_t endpoint = lock.start; if (lock.length) { endpoint += lock.length; } else { endpoint = uint64_t(-1); // max offset } multimap::iterator iter = get_last_before(endpoint, held_locks); bool cont = iter != held_locks.end(); while(cont) { if (share_space(iter, lock)) { overlaps.push_front(iter); } else if (self_neighbors && ceph_filelock_owner_equal(neighbor_check_lock, iter->second) && share_space(iter, neighbor_check_lock)) { self_neighbors->push_front(iter); } if ((iter->first < lock.start) && (CEPH_LOCK_EXCL == iter->second.type)) { //can't be any more overlapping locks or they'd interfere with this one cont = false; } else if (held_locks.begin() == iter) cont = false; else --iter; } return !overlaps.empty(); } bool ceph_lock_state_t::get_waiting_overlaps(const ceph_filelock& lock, list::iterator>& overlaps) { ldout(cct,15) << "get_waiting_overlaps" << dendl; multimap::iterator iter = get_last_before(lock.start + lock.length - 1, waiting_locks); bool cont = iter != waiting_locks.end(); while(cont) { if (share_space(iter, lock)) overlaps.push_front(iter); if (waiting_locks.begin() == iter) cont = false; --iter; } return !overlaps.empty(); } void ceph_lock_state_t::split_by_owner(const ceph_filelock& owner, list::iterator>& locks, list::iterator>& owned_locks) { list::iterator>::iterator iter = locks.begin(); ldout(cct,15) << "owner lock: " << owner << dendl; while (iter != locks.end()) { ldout(cct,15) << "comparing to " << (*iter)->second << dendl; if (ceph_filelock_owner_equal((*iter)->second, owner)) { ldout(cct,15) << "success, pushing to owned_locks" << dendl; owned_locks.push_back(*iter); iter = locks.erase(iter); } else { ldout(cct,15) << "failure, something not equal in this group " << (*iter)->second.client << ":" << owner.client << "," << (*iter)->second.owner << ":" << owner.owner << "," << (*iter)->second.pid << ":" << owner.pid << dendl; ++iter; } } } ceph_filelock * ceph_lock_state_t::contains_exclusive_lock(list::iterator>& locks) { for (list::iterator>::iterator iter = locks.begin(); iter != locks.end(); ++iter) { if (CEPH_LOCK_EXCL == (*iter)->second.type) return &(*iter)->second; } return NULL; }