qemu/util/rcu.c

   1 /*
   2  * urcu-mb.c
   3  *
   4  * Userspace RCU library with explicit memory barriers
   5  *
   6  * Copyright (c) 2009 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   7  * Copyright (c) 2009 Paul E. McKenney, IBM Corporation.
   8  * Copyright 2015 Red Hat, Inc.
   9  *
  10  * Ported to QEMU by Paolo Bonzini  <pbonzini@redhat.com>
  11  *
  12  * This library is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * This library is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with this library; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  *
  26  * IBM's contributions to this file may be relicensed under LGPLv2 or later.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu-common.h"
  31 #include "qemu/rcu.h"
  32 #include "qemu/atomic.h"
  33 #include "qemu/thread.h"
  34 #include "qemu/main-loop.h"
  35
  36 /*
  37  * Global grace period counter.  Bit 0 is always one in rcu_gp_ctr.
  38  * Bits 1 and above are defined in synchronize_rcu.
  39  */
  40 #define RCU_GP_LOCKED           (1UL << 0)
  41 #define RCU_GP_CTR              (1UL << 1)
  42
  43 unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
  44
  45 QemuEvent rcu_gp_event;
  46 static QemuMutex rcu_registry_lock;
  47 static QemuMutex rcu_sync_lock;
  48
  49 /*
  50  * Check whether a quiescent state was crossed between the beginning of
  51  * update_counter_and_wait and now.
  52  */
  53 static inline int rcu_gp_ongoing(unsigned long *ctr)
  54 {
  55     unsigned long v;
  56
  57     v = atomic_read(ctr);
  58     return v && (v != rcu_gp_ctr);
  59 }
  60
  61 /* Written to only by each individual reader. Read by both the reader and the
  62  * writers.
  63  */
  64 __thread struct rcu_reader_data rcu_reader;
  65
  66 /* Protected by rcu_registry_lock.  */
  67 typedef QLIST_HEAD(, rcu_reader_data) ThreadList;
  68 static ThreadList registry = QLIST_HEAD_INITIALIZER(registry);
  69
  70 /* Wait for previous parity/grace period to be empty of readers.  */
  71 static void wait_for_readers(void)
  72 {
  73     ThreadList qsreaders = QLIST_HEAD_INITIALIZER(qsreaders);
  74     struct rcu_reader_data *index, *tmp;
  75
  76     for (;;) {
  77         /* We want to be notified of changes made to rcu_gp_ongoing
  78          * while we walk the list.
  79          */
  80         qemu_event_reset(&rcu_gp_event);
  81
  82         /* Instead of using atomic_mb_set for index->waiting, and
  83          * atomic_mb_read for index->ctr, memory barriers are placed
  84          * manually since writes to different threads are independent.
  85          * atomic_mb_set has a smp_wmb before...
  86          */
  87         smp_wmb();
  88         QLIST_FOREACH(index, &registry, node) {
  89             atomic_set(&index->waiting, true);
  90         }
  91
  92         /* ... and a smp_mb after.  */
  93         smp_mb();
  94
  95         QLIST_FOREACH_SAFE(index, &registry, node, tmp) {
  96             if (!rcu_gp_ongoing(&index->ctr)) {
  97                 QLIST_REMOVE(index, node);
  98                 QLIST_INSERT_HEAD(&qsreaders, index, node);
  99
 100                 /* No need for mb_set here, worst of all we
 101                  * get some extra futex wakeups.
 102                  */
 103                 atomic_set(&index->waiting, false);
 104             }
 105         }
 106
 107         /* atomic_mb_read has smp_rmb after.  */
 108         smp_rmb();
 109
 110         if (QLIST_EMPTY(&registry)) {
 111             break;
 112         }
 113
 114         /* Wait for one thread to report a quiescent state and try again.
 115          * Release rcu_registry_lock, so rcu_(un)register_thread() doesn't
 116          * wait too much time.
 117          *
 118          * rcu_register_thread() may add nodes to &registry; it will not
 119          * wake up synchronize_rcu, but that is okay because at least another
 120          * thread must exit its RCU read-side critical section before
 121          * synchronize_rcu is done.  The next iteration of the loop will
 122          * move the new thread's rcu_reader from &registry to &qsreaders,
 123          * because rcu_gp_ongoing() will return false.
 124          *
 125          * rcu_unregister_thread() may remove nodes from &qsreaders instead
 126          * of &registry if it runs during qemu_event_wait.  That's okay;
 127          * the node then will not be added back to &registry by QLIST_SWAP
 128          * below.  The invariant is that the node is part of one list when
 129          * rcu_registry_lock is released.
 130          */
 131         qemu_mutex_unlock(&rcu_registry_lock);
 132         qemu_event_wait(&rcu_gp_event);
 133         qemu_mutex_lock(&rcu_registry_lock);
 134     }
 135
 136     /* put back the reader list in the registry */
 137     QLIST_SWAP(&registry, &qsreaders, node);
 138 }
 139
 140 void synchronize_rcu(void)
 141 {
 142     qemu_mutex_lock(&rcu_sync_lock);
 143     qemu_mutex_lock(&rcu_registry_lock);
 144
 145     if (!QLIST_EMPTY(&registry)) {
 146         /* In either case, the atomic_mb_set below blocks stores that free
 147          * old RCU-protected pointers.
 148          */
 149         if (sizeof(rcu_gp_ctr) < 8) {
 150             /* For architectures with 32-bit longs, a two-subphases algorithm
 151              * ensures we do not encounter overflow bugs.
 152              *
 153              * Switch parity: 0 -> 1, 1 -> 0.
 154              */
 155             atomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
 156             wait_for_readers();
 157             atomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
 158         } else {
 159             /* Increment current grace period.  */
 160             atomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr + RCU_GP_CTR);
 161         }
 162
 163         wait_for_readers();
 164     }
 165
 166     qemu_mutex_unlock(&rcu_registry_lock);
 167     qemu_mutex_unlock(&rcu_sync_lock);
 168 }
 169
 170
 171 #define RCU_CALL_MIN_SIZE        30
 172
 173 /* Multi-producer, single-consumer queue based on urcu/static/wfqueue.h
 174  * from liburcu.  Note that head is only used by the consumer.
 175  */
 176 static struct rcu_head dummy;
 177 static struct rcu_head *head = &dummy, **tail = &dummy.next;
 178 static int rcu_call_count;
 179 static QemuEvent rcu_call_ready_event;
 180
 181 static void enqueue(struct rcu_head *node)
 182 {
 183     struct rcu_head **old_tail;
 184
 185     node->next = NULL;
 186     old_tail = atomic_xchg(&tail, &node->next);
 187     atomic_mb_set(old_tail, node);
 188 }
 189
 190 static struct rcu_head *try_dequeue(void)
 191 {
 192     struct rcu_head *node, *next;
 193
 194 retry:
 195     /* Test for an empty list, which we do not expect.  Note that for
 196      * the consumer head and tail are always consistent.  The head
 197      * is consistent because only the consumer reads/writes it.
 198      * The tail, because it is the first step in the enqueuing.
 199      * It is only the next pointers that might be inconsistent.
 200      */
 201     if (head == &dummy && atomic_mb_read(&tail) == &dummy.next) {
 202         abort();
 203     }
 204
 205     /* If the head node has NULL in its next pointer, the value is
 206      * wrong and we need to wait until its enqueuer finishes the update.
 207      */
 208     node = head;
 209     next = atomic_mb_read(&head->next);
 210     if (!next) {
 211         return NULL;
 212     }
 213
 214     /* Since we are the sole consumer, and we excluded the empty case
 215      * above, the queue will always have at least two nodes: the
 216      * dummy node, and the one being removed.  So we do not need to update
 217      * the tail pointer.
 218      */
 219     head = next;
 220
 221     /* If we dequeued the dummy node, add it back at the end and retry.  */
 222     if (node == &dummy) {
 223         enqueue(node);
 224         goto retry;
 225     }
 226
 227     return node;
 228 }
 229
 230 static void *call_rcu_thread(void *opaque)
 231 {
 232     struct rcu_head *node;
 233
 234     rcu_register_thread();
 235
 236     for (;;) {
 237         int tries = 0;
 238         int n = atomic_read(&rcu_call_count);
 239
 240         /* Heuristically wait for a decent number of callbacks to pile up.
 241          * Fetch rcu_call_count now, we only must process elements that were
 242          * added before synchronize_rcu() starts.
 243          */
 244         while (n == 0 || (n < RCU_CALL_MIN_SIZE && ++tries <= 5)) {
 245             g_usleep(10000);
 246             if (n == 0) {
 247                 qemu_event_reset(&rcu_call_ready_event);
 248                 n = atomic_read(&rcu_call_count);
 249                 if (n == 0) {
 250                     qemu_event_wait(&rcu_call_ready_event);
 251                 }
 252             }
 253             n = atomic_read(&rcu_call_count);
 254         }
 255
 256         atomic_sub(&rcu_call_count, n);
 257         synchronize_rcu();
 258         qemu_mutex_lock_iothread();
 259         while (n > 0) {
 260             node = try_dequeue();
 261             while (!node) {
 262                 qemu_mutex_unlock_iothread();
 263                 qemu_event_reset(&rcu_call_ready_event);
 264                 node = try_dequeue();
 265                 if (!node) {
 266                     qemu_event_wait(&rcu_call_ready_event);
 267                     node = try_dequeue();
 268                 }
 269                 qemu_mutex_lock_iothread();
 270             }
 271
 272             n--;
 273             node->func(node);
 274         }
 275         qemu_mutex_unlock_iothread();
 276     }
 277     abort();
 278 }
 279
 280 void call_rcu1(struct rcu_head *node, void (*func)(struct rcu_head *node))
 281 {
 282     node->func = func;
 283     enqueue(node);
 284     atomic_inc(&rcu_call_count);
 285     qemu_event_set(&rcu_call_ready_event);
 286 }
 287
 288 void rcu_register_thread(void)
 289 {
 290     assert(rcu_reader.ctr == 0);
 291     qemu_mutex_lock(&rcu_registry_lock);
 292     QLIST_INSERT_HEAD(&registry, &rcu_reader, node);
 293     qemu_mutex_unlock(&rcu_registry_lock);
 294 }
 295
 296 void rcu_unregister_thread(void)
 297 {
 298     qemu_mutex_lock(&rcu_registry_lock);
 299     QLIST_REMOVE(&rcu_reader, node);
 300     qemu_mutex_unlock(&rcu_registry_lock);
 301 }
 302
 303 static void rcu_init_complete(void)
 304 {
 305     QemuThread thread;
 306
 307     qemu_mutex_init(&rcu_registry_lock);
 308     qemu_mutex_init(&rcu_sync_lock);
 309     qemu_event_init(&rcu_gp_event, true);
 310
 311     qemu_event_init(&rcu_call_ready_event, false);
 312
 313     /* The caller is assumed to have iothread lock, so the call_rcu thread
 314      * must have been quiescent even after forking, just recreate it.
 315      */
 316     qemu_thread_create(&thread, "call_rcu", call_rcu_thread,
 317                        NULL, QEMU_THREAD_DETACHED);
 318
 319     rcu_register_thread();
 320 }
 321
 322 #ifdef CONFIG_POSIX
 323 static void rcu_init_lock(void)
 324 {
 325     qemu_mutex_lock(&rcu_sync_lock);
 326     qemu_mutex_lock(&rcu_registry_lock);
 327 }
 328
 329 static void rcu_init_unlock(void)
 330 {
 331     qemu_mutex_unlock(&rcu_registry_lock);
 332     qemu_mutex_unlock(&rcu_sync_lock);
 333 }
 334 #endif
 335
 336 void rcu_after_fork(void)
 337 {
 338     memset(&registry, 0, sizeof(registry));
 339     rcu_init_complete();
 340 }
 341
 342 static void __attribute__((__constructor__)) rcu_init(void)
 343 {
 344 #ifdef CONFIG_POSIX
 345     pthread_atfork(rcu_init_lock, rcu_init_unlock, rcu_init_unlock);
 346 #endif
 347     rcu_init_complete();
 348 }