
src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp

rev 57716 : [mq]: remove_cbl_mon

@@ -1,7 +1,7 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.

@@ -24,10 +24,11 @@
 
 #include "precompiled.hpp"
 #include "gc/g1/g1BufferNodeList.hpp"
 #include "gc/g1/g1CardTableEntryClosure.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
+#include "gc/g1/g1ConcurrentRefineThread.hpp"
 #include "gc/g1/g1DirtyCardQueue.hpp"
 #include "gc/g1/g1FreeIdSet.hpp"
 #include "gc/g1/g1RedirtyCardsQueue.hpp"
 #include "gc/g1/g1RemSet.hpp"
 #include "gc/g1/g1ThreadLocalData.hpp"

@@ -40,11 +41,14 @@
 #include "runtime/orderAccess.hpp"
 #include "runtime/os.hpp"
 #include "runtime/safepoint.hpp"
 #include "runtime/thread.inline.hpp"
 #include "runtime/threadSMR.hpp"
+#include "utilities/globalCounter.inline.hpp"
+#include "utilities/macros.hpp"
 #include "utilities/quickSort.hpp"
+#include <new>
 
 G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
   // Dirty card queues are always active, so we create them with their
   // active field set to true.
   PtrQueue(qset, true /* active */)

@@ -52,10 +56,12 @@
 
 G1DirtyCardQueue::~G1DirtyCardQueue() {
   flush();
 }
 
+BufferNode* const NULL_buffer = NULL;
+
 void G1DirtyCardQueue::handle_completed_buffer() {
   assert(_buf != NULL, "precondition");
   BufferNode* node = BufferNode::make_node_from_buffer(_buf, index());
   G1DirtyCardQueueSet* dcqs = dirty_card_qset();
   if (dcqs->process_or_enqueue_completed_buffer(node)) {

@@ -66,19 +72,19 @@
 }
 
 // Assumed to be zero by concurrent threads.
 static uint par_ids_start() { return 0; }
 
-G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon,
-                                         BufferNode::Allocator* allocator) :
+G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) :
   PtrQueueSet(allocator),
-  _cbl_mon(cbl_mon),
-  _completed_buffers_head(NULL),
-  _completed_buffers_tail(NULL),
+  _primary_refinement_thread(NULL),
+  _completed_buffers_head(NULL_buffer),
+  _completed_buffers_tail(NULL_buffer),
   _num_cards(0),
+  DEBUG_ONLY(_concurrency(0) COMMA)
+  _paused(),
   _process_cards_threshold(ProcessCardsThresholdNever),
-  _process_completed_buffers(false),
   _max_cards(MaxCardsUnlimited),
   _max_cards_padding(0),
   _free_ids(par_ids_start(), num_par_ids()),
   _mutator_refined_cards_counters(NEW_C_HEAP_ARRAY(size_t, num_par_ids(), mtGC))
 {

@@ -106,127 +112,328 @@
 
 void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) {
   G1ThreadLocalData::dirty_card_queue(t).handle_zero_index();
 }
 
-void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
-  MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  cbn->set_next(NULL);
-  if (_completed_buffers_tail == NULL) {
-    assert(_completed_buffers_head == NULL, "Well-formedness");
-    _completed_buffers_head = cbn;
-    _completed_buffers_tail = cbn;
+// _concurrency is an int that is used in debug-only context to verify
+// we're not overlapping queue operations that support concurrency with
+// those which don't.  The value is initially zero, meaning there are no
+// relevant operations in progress.  A "no concurrency" context is entered
+// by atomically changing the value from 0 to -1, with an assert on failure.
+// It is similarly exited by reverting the value back to 0.  A "concurrent"
+// context is entered by atomically incrementing the value and verifying the
+// result is greater than zero (so we weren't in a "no concurrency" context).
+// It is similarly exited by atomically decrementing the value and verifying
+// the result is at least zero (so no mismatches).
+//
+// ConcurrentVerifier and NonconcurrentVerifier are helper classes to
+// establish and remove such contexts.
+
+class G1DirtyCardQueueSet::ConcurrentVerifier : public StackObj {
+#ifdef ASSERT
+  const G1DirtyCardQueueSet* _dcqs;
+
+public:
+  ~ConcurrentVerifier() {
+    assert(Atomic::sub(&_dcqs->_concurrency, 1) >= 0, "invariant");
+  }
+#endif // ASSERT
+
+public:
+  ConcurrentVerifier(const G1DirtyCardQueueSet* dcqs) DEBUG_ONLY(: _dcqs(dcqs)) {
+    assert(Atomic::add(&_dcqs->_concurrency, 1) > 0, "invariant");
+  }
+};
+
+class G1DirtyCardQueueSet::NonconcurrentVerifier : public StackObj {
+#ifdef ASSERT
+  const G1DirtyCardQueueSet* _dcqs;
+
+public:
+  ~NonconcurrentVerifier() {
+    assert(Atomic::cmpxchg(&_dcqs->_concurrency, -1, 0) == -1, "invariant");
+  }
+#endif // ASSERT
+
+public:
+  NonconcurrentVerifier(const G1DirtyCardQueueSet* dcqs) DEBUG_ONLY(: _dcqs(dcqs)) {
+    assert(Atomic::cmpxchg(&_dcqs->_concurrency, 0, -1) == 0, "invariant");
+  }
+};
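
For illustration only, the protocol these two verifiers implement can be written as a self-contained sketch using std::atomic in place of HotSpot's Atomic wrappers. The names (concurrency_state, ConcurrentScope, ExclusiveScope) are hypothetical and not part of this change, and the checks are kept unconditional rather than debug-only for brevity:

    #include <atomic>
    #include <cassert>

    // 0 = idle, N > 0 = N concurrent operations, -1 = one exclusive operation.
    static std::atomic<int> concurrency_state{0};

    struct ConcurrentScope {                    // plays the role of ConcurrentVerifier
      ConcurrentScope() {
        int prev = concurrency_state.fetch_add(1);
        assert(prev >= 0 && "exclusive operation in progress"); (void)prev;
      }
      ~ConcurrentScope() {
        int prev = concurrency_state.fetch_sub(1);
        assert(prev > 0 && "unbalanced concurrent scope"); (void)prev;
      }
    };

    struct ExclusiveScope {                     // plays the role of NonconcurrentVerifier
      ExclusiveScope() {
        int expected = 0;
        bool ok = concurrency_state.compare_exchange_strong(expected, -1);
        assert(ok && "other operations in progress"); (void)ok;
      }
      ~ExclusiveScope() {
        int expected = -1;
        bool ok = concurrency_state.compare_exchange_strong(expected, 0);
        assert(ok && "unbalanced exclusive scope"); (void)ok;
      }
    };
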
+
+// _completed_buffers_{head,tail} and _num_cards provide a lock-free FIFO
+// of buffers, linked through their next() fields.
+//
+// The key idea to make this work is that pop (get_completed_buffer) never
+// returns an element of the queue if it is the only accessible element,
+// i.e. its "next" value is NULL.  It is expected that there will be a
+// later push/append that will make that element available to a future pop,
+// or there will eventually be a complete transfer (take_all_completed_buffers).
+//
+// An append operation atomically exchanges the new tail with the queue tail.
+// It then sets the "next" value of the old tail to the head of the list being
+// appended.  (It is an invariant that the old tail's "next" value is NULL.)
+// But if the old tail is NULL then the queue was empty.  In this case the
+// head of the list being appended is instead stored in the queue head (which
+// must be NULL).
+//
+// A push operation is just a degenerate append, where the buffer being pushed
+// is both the head and the tail of the list being appended.
+//
+// This means there is a period between the exchange and the old tail update
+// where the queue sequence is split into two parts, the list from the queue
+// head to the old tail, and the list being appended.  If there are concurrent
+// push/append operations, each may introduce another such segment.  But they
+// all eventually get resolved by their respective updates of their old tail's
+// "next" value.
+//
+// pop gets the queue head as the candidate result (returning NULL if the
+// queue head was NULL), and then gets that result node's "next" value.  If
+// that "next" value is NULL and the queue head hasn't changed, then there
+// is only one element in the (accessible) list.  We can't return that
+// element, because it may be the old tail of a concurrent push/append.  So
+// return NULL in this case.  Otherwise, attempt to cmpxchg that "next"
+// value into the queue head, retrying the whole operation if that fails.
+// This is the "usual" lock-free pop from head of slist, with the additional
+// restriction on taking the last element.
+//
+// In order to address the ABA problem for pop, a pop operation protects its
+// access to the head of the list with a GlobalCounter critical section. This
+// works with the buffer allocator's use of GlobalCounter synchronization to
+// prevent ABA from arising in the normal buffer usage cycle.  The paused
+// buffer handling prevents another ABA source (see record_paused_buffer and
+// enqueue_previous_paused_buffers).
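
Stripped of the HotSpot specifics (card counting, the debug verifiers, and the GlobalCounter-based ABA protection), the append/pop discipline described above looks roughly like the following self-contained sketch; Node, SketchQueue, and the member names are illustrative only:

    #include <atomic>

    struct Node {
      std::atomic<Node*> next{nullptr};
    };

    struct SketchQueue {
      std::atomic<Node*> head{nullptr};
      std::atomic<Node*> tail{nullptr};

      // Append the pre-linked list [first, last]; last->next must already be null.
      void append(Node* first, Node* last) {
        Node* old_tail = tail.exchange(last);
        if (old_tail == nullptr) {
          head.store(first);               // queue was empty; publish the new head
        } else {
          old_tail->next.store(first);     // splice the list after the old tail
        }
      }

      // Pop the head, but never the last accessible element, since it may be the
      // not-yet-linked old tail of an in-progress append.  Returns null if nothing
      // was taken.  (The real code additionally wraps each attempt in a
      // GlobalCounter critical section to guard against ABA on the head.)
      Node* try_pop() {
        while (true) {
          Node* result = head.load();
          if (result == nullptr) return nullptr;       // empty queue
          Node* next = result->next.load();
          if (next != nullptr) {
            if (head.compare_exchange_strong(result, next)) {
              result->next.store(nullptr);
              return result;                           // took a non-last element
            }
            // CAS failed: head changed underneath us; retry.
          } else if (result == head.load()) {
            return nullptr;                            // only one element reachable; leave it
          }
          // next was null but head has moved on; retry.
        }
      }
    };
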
+
+size_t G1DirtyCardQueueSet::append_buffers(BufferNode* first,
+                                           BufferNode* last,
+                                           size_t card_count) {
+  assert(last->next() == NULL_buffer, "precondition");
+  ConcurrentVerifier cv(this);
+  // Increment _num_cards before adding to queue, so queue removal doesn't
+  // need to deal with _num_cards possibly going negative.
+  size_t new_num_cards = Atomic::add(&_num_cards, card_count);
+  BufferNode* old_tail = Atomic::xchg(&_completed_buffers_tail, last);
+  if (old_tail == NULL_buffer) { // Empty list.
+    assert(Atomic::load(&_completed_buffers_head) == NULL_buffer, "invariant");
+    Atomic::store(&_completed_buffers_head, first);
   } else {
-    _completed_buffers_tail->set_next(cbn);
-    _completed_buffers_tail = cbn;
+    assert(old_tail->next() == NULL_buffer, "invariant");
+    old_tail->set_next(first);
   }
-  _num_cards += buffer_size() - cbn->index();
+  return new_num_cards;
+}
 
-  if (!process_completed_buffers() &&
-      (num_cards() > process_cards_threshold())) {
-    set_process_completed_buffers(true);
-    ml.notify_all();
+void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
+  assert(cbn != NULL_buffer, "precondition");
+  size_t new_num_cards = append_buffers(cbn, cbn, buffer_size() - cbn->index());
+  if ((_primary_refinement_thread != NULL) &&
+      (new_num_cards > process_cards_threshold())) {
+    _primary_refinement_thread->activate();
   }
-  verify_num_cards();
 }
 
 BufferNode* G1DirtyCardQueueSet::get_completed_buffer(size_t stop_at) {
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+  enqueue_previous_paused_buffers();
 
-  if (num_cards() <= stop_at) {
-    return NULL;
-  }
+  ConcurrentVerifier cv(this);
 
-  assert(num_cards() > 0, "invariant");
-  assert(_completed_buffers_head != NULL, "invariant");
-  assert(_completed_buffers_tail != NULL, "invariant");
-
-  BufferNode* bn = _completed_buffers_head;
-  _num_cards -= buffer_size() - bn->index();
-  _completed_buffers_head = bn->next();
-  if (_completed_buffers_head == NULL) {
-    assert(num_cards() == 0, "invariant");
-    _completed_buffers_tail = NULL;
-    set_process_completed_buffers(false);
+  // Check for insufficient cards to satisfy request.  We only do this once,
+  // up front, rather than on each iteration below, since the test is racy
+  // regardless of when we do it.
+  if (Atomic::load_acquire(&_num_cards) <= stop_at) {
+    return NULL_buffer;
+  }
+
+  Thread* current_thread = Thread::current();
+
+  while (true) {
+    // Use a critical section per iteration, rather than over the whole
+    // operation.  We're not guaranteed to make progress, because of possible
+    // contention on the queue head.  Lingering in one CS the whole time could
+    // lead to excessive allocation of buffers, because the CS blocks return
+    // of released buffers to the free list for reuse.
+    GlobalCounter::CriticalSection cs(current_thread);
+
+    BufferNode* result = Atomic::load_acquire(&_completed_buffers_head);
+    // Check for empty queue.  Only needs to be done on first iteration,
+    // since we never take the last element, but it's messy to make use
+    // of that and we expect one iteration to be the common case.
+    if (result == NULL_buffer) return result;
+
+    BufferNode* next = Atomic::load_acquire(BufferNode::next_ptr(*result));
+    if (next != NULL_buffer) {
+      next = Atomic::cmpxchg(&_completed_buffers_head, result, next);
+      if (next == result) {
+        // Former head successfully taken; it is not the last.
+        assert(Atomic::load(&_completed_buffers_tail) != result, "invariant");
+        assert(result->next() != NULL_buffer, "invariant");
+        result->set_next(NULL_buffer);
+        Atomic::sub(&_num_cards, buffer_size() - result->index());
+        return result;
   }
-  verify_num_cards();
-  bn->set_next(NULL);
-  return bn;
+      // cmpxchg failed; try again.
+    } else if (result == Atomic::load_acquire(&_completed_buffers_head)) {
+      // If follower of head is NULL and head hasn't changed, then only
+      // the one element is currently accessible.  We don't take the last
+      // accessible element, because there may be a concurrent add using it.
+      // The check for unchanged head isn't needed for correctness, but the
+      // retry on change may sometimes let us get a buffer after all.
+      return NULL_buffer;
+    }
+    // Head changed; try again.
+  }
+  // Unreachable
 }
 
 #ifdef ASSERT
 void G1DirtyCardQueueSet::verify_num_cards() const {
+  NonconcurrentVerifier ncv(this);
   size_t actual = 0;
-  BufferNode* cur = _completed_buffers_head;
-  while (cur != NULL) {
+  BufferNode* cur = Atomic::load(&_completed_buffers_head);
+  for ( ; cur != NULL_buffer; cur = cur->next()) {
     actual += buffer_size() - cur->index();
-    cur = cur->next();
   }
-  assert(actual == _num_cards,
+  assert(actual == Atomic::load(&_num_cards),
          "Num entries in completed buffers should be " SIZE_FORMAT " but are " SIZE_FORMAT,
-         _num_cards, actual);
+         Atomic::load(&_num_cards), actual);
 }
-#endif
+#endif // ASSERT
 
-void G1DirtyCardQueueSet::abandon_completed_buffers() {
-  BufferNode* buffers_to_delete = NULL;
-  {
-    MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-    buffers_to_delete = _completed_buffers_head;
-    _completed_buffers_head = NULL;
-    _completed_buffers_tail = NULL;
-    _num_cards = 0;
-    set_process_completed_buffers(false);
+// Refinement processing stops early if there is a pending safepoint, to
+// avoid long delays to safepoint.  We need to record the partially
+// processed buffer for later continued processing.  However, we can't
+// simply add it back to the completed buffer queue, as that would introduce
+// a new source of ABA for the queue.  Instead, we have a pair of buffer
+// lists (with each list represented by head and tail), one for each of the
+// previous and next safepoints (*).  When pausing the processing of a
+// buffer for a safepoint, we add the buffer (lock free) to the list for the
+// next safepoint.  Before attempting to obtain a buffer from the queue we
+// first transfer any buffers in the previous safepoint list to the queue.
+// This is safe (doesn't introduce ABA) because threads cannot be in the
+// midst of a queue pop across a safepoint.
+//
+// These paused buffer lists are conceptually an extension of the queue, and
+// operations which need to deal with all of the queued buffers (such as
+// concatenate_logs) also need to deal with any paused buffers.  In general,
+// if the safepoint performs a GC then the paused buffers will be processed
+// as part of it and both lists will be empty afterward.
+//
+// An alternative would be to directly reenqueue a paused buffer, but only
+// after first calling GlobalCounter::write_synchronize.  However, that
+// might noticeably delay the pending safepoint.
+//
+// A single paused list and a safepoint cleanup action to perform the transfer
+// doesn't work because cleanup actions are not invoked for every safepoint.
+//
+// (*) If the safepoint does not perform a GC, the next list becomes the
+// previous list after the safepoint.  Since buffers are only added to the
+// next list if there were threads performing refinement work, there will
+// likely be refinement work done after the safepoint, which will transfer
+// those buffers to the queue.  However, multiple non-GC safepoints in
+// succession, without intervening refinement work to perform a transfer
+// (possibly through lack of time), can result in old buffers being present
+// and inaccessible in the next list.  This doesn't affect correctness, but
+// might affect performance.  The alternatives discussed above don't have
+// this problem, but have problems of their own.
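
As a rough sketch of the two-slot scheme (reusing Node and SketchQueue from the sketch above; PausedLists, record, and drain_previous are illustrative names, not this change's API, and safepoint_id stands in for SafepointSynchronize::safepoint_id()):

    #include <atomic>
    #include <cstdint>

    struct PausedLists {
      struct Slot {
        std::atomic<Node*> head{nullptr};
        Node*              tail = nullptr;
      };
      Slot slots[2];   // indexed by safepoint-id parity

      // Called outside a safepoint: park a partially processed node on the
      // slot belonging to the next safepoint.
      void record(Node* node, uint64_t safepoint_id) {
        Slot& s = slots[safepoint_id & 1];
        Node* old_head = s.head.exchange(node);
        if (old_head == nullptr) {
          s.tail = node;                  // first node in this slot becomes the tail
        } else {
          node->next.store(old_head);     // lock-free push onto the slot's head
        }
      }

      // Called before popping from the queue: move everything parked before the
      // previous safepoint back onto the queue.  This is ABA-safe because no
      // pop operation can span a safepoint.
      void drain_previous(SketchQueue& queue, uint64_t safepoint_id) {
        Slot& s = slots[(safepoint_id & 1) ^ 1];
        Node* paused = s.head.exchange(nullptr);
        if (paused != nullptr) {
          Node* tail = s.tail;
          s.tail = nullptr;
          queue.append(paused, tail);
        }
      }
    };
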
+
+static size_t next_paused_buffer_list_index() {
+  return SafepointSynchronize::safepoint_id() & 1;
+}
+
+static size_t previous_paused_buffer_list_index() {
+  return next_paused_buffer_list_index() ^ 1;
+}
+
+void G1DirtyCardQueueSet::record_paused_buffer(BufferNode* node) {
+  assert_not_at_safepoint();
+  assert(node->next() == NULL_buffer, "precondition");
+  size_t next_index = next_paused_buffer_list_index();
+  // Cards for paused buffers are included in count, to contribute to
+  // notification checking after the coming safepoint if it doesn't GC.
+  Atomic::add(&_num_cards, buffer_size() - node->index());
+  BufferNode* old_head = Atomic::xchg(&_paused[next_index]._head, node);
+  if (old_head == NULL_buffer) {
+    assert(_paused[next_index]._tail == NULL_buffer, "invariant");
+    _paused[next_index]._tail = node;
+  } else {
+    node->set_next(old_head);
+  }
+}
+
+void G1DirtyCardQueueSet::enqueue_paused_buffers_aux(size_t index) {
+  if (Atomic::load(&_paused[index]._head) != NULL_buffer) {
+    BufferNode* paused = Atomic::xchg(&_paused[index]._head, NULL_buffer);
+    if (paused != NULL_buffer) {
+      BufferNode* tail = _paused[index]._tail;
+      assert(tail != NULL_buffer, "invariant");
+      _paused[index]._tail = NULL_buffer;
+      append_buffers(paused, tail, 0); // Cards already counted when recorded.
   }
-  while (buffers_to_delete != NULL) {
+  }
+}
+
+void G1DirtyCardQueueSet::enqueue_previous_paused_buffers() {
+  size_t previous_index = previous_paused_buffer_list_index();
+  enqueue_paused_buffers_aux(previous_index);
+}
+
+void G1DirtyCardQueueSet::enqueue_all_paused_buffers() {
+  assert_at_safepoint();
+  for (size_t i = 0; i < ARRAY_SIZE(_paused); ++i) {
+    enqueue_paused_buffers_aux(i);
+  }
+}
+
+void G1DirtyCardQueueSet::clear_completed_buffers() {
+  Atomic::store(&_completed_buffers_head, NULL_buffer);
+  Atomic::store(&_completed_buffers_tail, NULL_buffer);
+  Atomic::store(&_num_cards, size_t(0));
+}
+
+void G1DirtyCardQueueSet::abandon_completed_buffers() {
+  enqueue_all_paused_buffers();
+  verify_num_cards();
+  NonconcurrentVerifier ncv(this);
+  BufferNode* buffers_to_delete = Atomic::load(&_completed_buffers_head);
+  clear_completed_buffers();
+  while (buffers_to_delete != NULL_buffer) {
     BufferNode* bn = buffers_to_delete;
     buffers_to_delete = bn->next();
-    bn->set_next(NULL);
+    bn->set_next(NULL_buffer);
     deallocate_buffer(bn);
   }
 }
 
 void G1DirtyCardQueueSet::notify_if_necessary() {
-  MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  if (num_cards() > process_cards_threshold()) {
-    set_process_completed_buffers(true);
-    ml.notify_all();
+  if ((_primary_refinement_thread != NULL) &&
+      (num_cards() > process_cards_threshold())) {
+    _primary_refinement_thread->activate();
   }
 }
 
-// Merge lists of buffers. Notify the processing threads.
-// The source queue is emptied as a result. The queues
-// must share the monitor.
+// Merge lists of buffers. The source queue set is emptied as a
+// result. The queue sets must share the same allocator.
 void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) {
   assert(allocator() == src->allocator(), "precondition");
   const G1BufferNodeList from = src->take_all_completed_buffers();
-  if (from._head == NULL) return;
-
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  if (_completed_buffers_tail == NULL) {
-    assert(_completed_buffers_head == NULL, "Well-formedness");
-    _completed_buffers_head = from._head;
-    _completed_buffers_tail = from._tail;
-  } else {
-    assert(_completed_buffers_head != NULL, "Well formedness");
-    _completed_buffers_tail->set_next(from._head);
-    _completed_buffers_tail = from._tail;
-  }
-  _num_cards += from._entry_count;
-
-  assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL ||
-         _completed_buffers_head != NULL && _completed_buffers_tail != NULL,
-         "Sanity");
-  verify_num_cards();
+  if (from._head == NULL_buffer) return;
+  append_buffers(from._head, from._tail, from._entry_count);
 }
 
 G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() {
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  G1BufferNodeList result(_completed_buffers_head, _completed_buffers_tail, _num_cards);
-  _completed_buffers_head = NULL;
-  _completed_buffers_tail = NULL;
-  _num_cards = 0;
+#ifdef ASSERT
+  for (size_t i = 0; i < ARRAY_SIZE(_paused); ++i) {
+    assert(Atomic::load(&_paused[i]._head) == NULL_buffer, "precondition");
+    assert(Atomic::load(&_paused[i]._tail) == NULL_buffer, "precondition");
+  }
+#endif // ASSERT
+  verify_num_cards();
+  NonconcurrentVerifier ncv(this);
+  G1BufferNodeList result(Atomic::load(&_completed_buffers_head),
+                          Atomic::load(&_completed_buffers_tail),
+                          Atomic::load(&_num_cards));
+  clear_completed_buffers();
   return result;
 }
 
 class G1RefineBufferedCards : public StackObj {
   BufferNode* const _node;

@@ -397,26 +604,27 @@
 
 bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
                                                                size_t stop_at,
                                                                size_t* total_refined_cards) {
   BufferNode* node = get_completed_buffer(stop_at);
-  if (node == NULL) {
+  if (node == NULL_buffer) {
     return false;
   } else if (refine_buffer(node, worker_id, total_refined_cards)) {
     assert_fully_consumed(node, buffer_size());
     // Done with fully processed buffer.
     deallocate_buffer(node);
     return true;
   } else {
-    // Return partially processed buffer to the queue.
-    enqueue_completed_buffer(node);
+    // Buffer incompletely processed because there is a pending safepoint.
+    // Record partially processed buffer, to be finished later.
+    record_paused_buffer(node);
     return true;
   }
 }
 
 void G1DirtyCardQueueSet::abandon_logs() {
-  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
+  assert_at_safepoint();
   abandon_completed_buffers();
 
   // Since abandon is done only at safepoints, we can safely manipulate
   // these queues.
   struct AbandonThreadLogClosure : public ThreadClosure {

@@ -431,11 +639,11 @@
 
 void G1DirtyCardQueueSet::concatenate_logs() {
   // Iterate over all the threads, if we find a partial log add it to
   // the global list of logs.  Temporarily turn off the limit on the number
   // of outstanding buffers.
-  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
+  assert_at_safepoint();
   size_t old_limit = max_cards();
   set_max_cards(MaxCardsUnlimited);
 
   struct ConcatenateThreadLogClosure : public ThreadClosure {
     virtual void do_thread(Thread* t) {

@@ -446,7 +654,9 @@
     }
   } closure;
   Threads::threads_do(&closure);
 
   G1BarrierSet::shared_dirty_card_queue().flush();
+  enqueue_all_paused_buffers();
+  verify_num_cards();
   set_max_cards(old_limit);
 }