# HG changeset patch
# User redestad
# Date 1421937492 -3600
#      Thu Jan 22 15:38:12 2015 +0100
# Node ID fe6f92712028159cb5cd54f0b3be57fa4e895ec3
# Parent  e6a0cfbfdc9abbff0b47af93996335a7a9ae72f4
8069273: Reduce Hot Card Cache Lock contention
Reviewed-by: tschatzl

diff --git a/src/share/vm/gc_implementation/g1/g1HotCardCache.cpp b/src/share/vm/gc_implementation/g1/g1HotCardCache.cpp
--- a/src/share/vm/gc_implementation/g1/g1HotCardCache.cpp
+++ b/src/share/vm/gc_implementation/g1/g1HotCardCache.cpp
@@ -36,10 +36,12 @@
   if (default_use_cache()) {
     _use_cache = true;
 
-    _hot_cache_size = (1 << G1ConcRSLogCacheSize);
+    _hot_cache_size = (intptr_t)1 << G1ConcRSLogCacheSize;
     _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size, mtGC);
+    for (intptr_t i = 0; i < _hot_cache_size; i++) {
+      _hot_cache[i] = NULL;
+    }
 
-    _n_hot = 0;
     _hot_cache_idx = 0;
 
     // For refining the cards in the hot cache in parallel
@@ -64,26 +66,29 @@
     // return it for immediate refining.
     return card_ptr;
   }
+  // Otherwise, the card is hot.
+  jbyte* current_ptr;
+  intptr_t masked_index;
+  intptr_t index;
+  do {
+    index = _hot_cache_idx;
+    masked_index = index & (_hot_cache_size - 1);
+    current_ptr = _hot_cache[masked_index];
+    if (current_ptr == card_ptr) {
+        return NULL;
+    }
+  } while (index != Atomic::cmpxchg_ptr(index + 1, &_hot_cache_idx, index));
 
-  // Otherwise, the card is hot.
-  jbyte* res = NULL;
-  MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
-  if (_n_hot == _hot_cache_size) {
-    res = _hot_cache[_hot_cache_idx];
-    _n_hot--;
-  }
-
-  // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
-  _hot_cache[_hot_cache_idx] = card_ptr;
-  _hot_cache_idx++;
-
-  if (_hot_cache_idx == _hot_cache_size) {
-    // Wrap around
-    _hot_cache_idx = 0;
-  }
-  _n_hot++;
-
-  return res;
+  // Try to store the new card pointer into the cache. Compare-and-swap to guard
+  // against the unlikely event of a race resulting in another card pointer to
+  // have already been written to the cache. In this case we will return
+  // card_ptr in favor of the other option, which would be starting over. This
+  // should be OK since card_ptr will likely be the older card already when/if
+  // this ever happens.
+  jbyte* previous_ptr = (jbyte*)Atomic::cmpxchg_ptr(card_ptr,
+                                            &_hot_cache[masked_index],
+                                            current_ptr);
+  return (previous_ptr == current_ptr) ? previous_ptr : card_ptr;
 }
 
 void G1HotCardCache::drain(uint worker_i,
@@ -96,16 +101,16 @@
 
   assert(_hot_cache != NULL, "Logic");
   assert(!use_cache(), "cache should be disabled");
-  int start_idx;
+  intptr_t start_idx;
 
-  while ((start_idx = _hot_cache_par_claimed_idx) < _n_hot) { // read once
-    int end_idx = start_idx + _hot_cache_par_chunk_size;
+  while ((start_idx = _hot_cache_par_claimed_idx) < _hot_cache_size) { // read once
+    intptr_t end_idx = start_idx + _hot_cache_par_chunk_size;
 
     if (start_idx ==
-        Atomic::cmpxchg(end_idx, &_hot_cache_par_claimed_idx, start_idx)) {
+        Atomic::cmpxchg_ptr(end_idx, &_hot_cache_par_claimed_idx, start_idx)) {
       // The current worker has successfully claimed the chunk [start_idx..end_idx)
-      end_idx = MIN2(end_idx, _n_hot);
-      for (int i = start_idx; i < end_idx; i++) {
+      end_idx = MIN2(end_idx, _hot_cache_size);
+      for (intptr_t i = start_idx; i < end_idx; i++) {
         jbyte* card_ptr = _hot_cache[i];
         if (card_ptr != NULL) {
           if (g1rs->refine_card(card_ptr, worker_i, true)) {
@@ -124,6 +129,8 @@
 
             into_cset_dcq->enqueue(card_ptr);
           }
+        } else {
+          break;
         }
       }
     }
diff --git a/src/share/vm/gc_implementation/g1/g1HotCardCache.hpp b/src/share/vm/gc_implementation/g1/g1HotCardCache.hpp
--- a/src/share/vm/gc_implementation/g1/g1HotCardCache.hpp
+++ b/src/share/vm/gc_implementation/g1/g1HotCardCache.hpp
@@ -54,21 +54,30 @@
 // code, increasing throughput.
 
 class G1HotCardCache: public CHeapObj<mtGC> {
-  G1CollectedHeap*   _g1h;
+
+  G1CollectedHeap*  _g1h;
+
+  bool              _use_cache;
+
+  G1CardCounts      _card_counts;
 
   // The card cache table
-  jbyte**      _hot_cache;
+  jbyte**           _hot_cache;
 
-  int          _hot_cache_size;
-  int          _n_hot;
-  int          _hot_cache_idx;
+  intptr_t          _hot_cache_size;
 
-  int          _hot_cache_par_chunk_size;
-  volatile int _hot_cache_par_claimed_idx;
+  int               _hot_cache_par_chunk_size;
 
-  bool         _use_cache;
+  // Avoids false sharing when concurrently updating _hot_cache_idx or
+  // _hot_cache_par_claimed_idx. These are never updated at the same time
+  // thus it's not necessary to separate them as well
+  char _pad_before[DEFAULT_CACHE_LINE_SIZE];
 
-  G1CardCounts _card_counts;
+  volatile intptr_t _hot_cache_idx;
+  
+  volatile intptr_t _hot_cache_par_claimed_idx;
+
+  char _pad_after[DEFAULT_CACHE_LINE_SIZE];
 
   // The number of cached cards a thread claims when flushing the cache
   static const int ClaimChunkSize = 32;
@@ -113,10 +122,13 @@
   void reset_hot_cache() {
     assert(SafepointSynchronize::is_at_safepoint(), "Should be at a safepoint");
     assert(Thread::current()->is_VM_thread(), "Current thread should be the VMthread");
-    _hot_cache_idx = 0; _n_hot = 0;
+    _hot_cache_idx = 0;
+    for (intptr_t i = 0; i < _hot_cache_size; i++) {
+        _hot_cache[i] = NULL;
+    }
   }
 
-  bool hot_cache_is_empty() { return _n_hot == 0; }
+  bool hot_cache_is_empty() { return _hot_cache[0] == NULL; }
 
   // Zeros the values in the card counts table for entire committed heap
   void reset_card_counts();
diff --git a/src/share/vm/runtime/mutexLocker.cpp b/src/share/vm/runtime/mutexLocker.cpp
--- a/src/share/vm/runtime/mutexLocker.cpp
+++ b/src/share/vm/runtime/mutexLocker.cpp
@@ -120,7 +120,6 @@
 Mutex*   OldSets_lock                 = NULL;
 Monitor* RootRegionScan_lock          = NULL;
 Mutex*   MMUTracker_lock              = NULL;
-Mutex*   HotCardCache_lock            = NULL;
 
 Monitor* GCTaskManager_lock           = NULL;
 
@@ -199,7 +198,6 @@
     def(OldSets_lock               , Mutex  , leaf     ,   true,  Monitor::_safepoint_check_never);
     def(RootRegionScan_lock        , Monitor, leaf     ,   true,  Monitor::_safepoint_check_never);
     def(MMUTracker_lock            , Mutex  , leaf     ,   true,  Monitor::_safepoint_check_never);
-    def(HotCardCache_lock          , Mutex  , special  ,   true,  Monitor::_safepoint_check_never);
     def(EvacFailureStack_lock      , Mutex  , nonleaf  ,   true,  Monitor::_safepoint_check_never);
 
     def(StringDedupQueue_lock      , Monitor, leaf,        true,  Monitor::_safepoint_check_never);
diff --git a/src/share/vm/runtime/mutexLocker.hpp b/src/share/vm/runtime/mutexLocker.hpp
--- a/src/share/vm/runtime/mutexLocker.hpp
+++ b/src/share/vm/runtime/mutexLocker.hpp
@@ -122,7 +122,6 @@
 extern Monitor* RootRegionScan_lock;             // used to notify that the CM threads have finished scanning the IM snapshot regions
 extern Mutex*   MMUTracker_lock;                 // protects the MMU
                                                  // tracker data structures
-extern Mutex*   HotCardCache_lock;               // protects the hot card cache
 
 extern Mutex*   Management_lock;                 // a lock used to serialize JVM management
 extern Monitor* Service_lock;                    // a lock used for service thread operation
# HG changeset patch
# User redestad
# Date 1422024712 -3600
#      Fri Jan 23 15:51:52 2015 +0100
# Node ID 768089606ee00cd079958f9e6179bac0ea08c945
# Parent  fe6f92712028159cb5cd54f0b3be57fa4e895ec3
[mq]: atomicadd

diff --git a/src/share/vm/gc_implementation/g1/g1HotCardCache.cpp b/src/share/vm/gc_implementation/g1/g1HotCardCache.cpp
--- a/src/share/vm/gc_implementation/g1/g1HotCardCache.cpp
+++ b/src/share/vm/gc_implementation/g1/g1HotCardCache.cpp
@@ -36,9 +36,9 @@
   if (default_use_cache()) {
     _use_cache = true;
 
-    _hot_cache_size = (intptr_t)1 << G1ConcRSLogCacheSize;
+    _hot_cache_size = (size_t)1 << G1ConcRSLogCacheSize;
     _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size, mtGC);
-    for (intptr_t i = 0; i < _hot_cache_size; i++) {
+    for (size_t i = 0; i < _hot_cache_size; i++) {
       _hot_cache[i] = NULL;
     }
 
@@ -66,19 +66,11 @@
     // return it for immediate refining.
     return card_ptr;
   }
-  // Otherwise, the card is hot.
-  jbyte* current_ptr;
-  intptr_t masked_index;
-  intptr_t index;
-  do {
-    index = _hot_cache_idx;
-    masked_index = index & (_hot_cache_size - 1);
-    current_ptr = _hot_cache[masked_index];
-    if (current_ptr == card_ptr) {
-        return NULL;
-    }
-  } while (index != Atomic::cmpxchg_ptr(index + 1, &_hot_cache_idx, index));
-
+  // Otherwise, the card is hot. 
+  size_t index = Atomic::add(1, &_hot_cache_idx) - 1;
+  size_t masked_index = index & (_hot_cache_size - 1);
+  jbyte* current_ptr = _hot_cache[masked_index];
+  
   // Try to store the new card pointer into the cache. Compare-and-swap to guard
   // against the unlikely event of a race resulting in another card pointer to
   // have already been written to the cache. In this case we will return
@@ -86,8 +78,8 @@
   // should be OK since card_ptr will likely be the older card already when/if
   // this ever happens.
   jbyte* previous_ptr = (jbyte*)Atomic::cmpxchg_ptr(card_ptr,
-                                            &_hot_cache[masked_index],
-                                            current_ptr);
+                                                    &_hot_cache[masked_index],
+                                                    current_ptr);
   return (previous_ptr == current_ptr) ? previous_ptr : card_ptr;
 }
 
@@ -101,40 +93,37 @@
 
   assert(_hot_cache != NULL, "Logic");
   assert(!use_cache(), "cache should be disabled");
-  intptr_t start_idx;
 
-  while ((start_idx = _hot_cache_par_claimed_idx) < _hot_cache_size) { // read once
-    intptr_t end_idx = start_idx + _hot_cache_par_chunk_size;
+  while (_hot_cache_par_claimed_idx < _hot_cache_size) { // read once
+    size_t end_idx = Atomic::add(_hot_cache_par_chunk_size, &_hot_cache_par_claimed_idx);
+    size_t start_idx = end_idx - _hot_cache_par_chunk_size;
+    // The current worker has successfully claimed the chunk [start_idx..end_idx)
+    end_idx = MIN2(end_idx, _hot_cache_size);
+    for (size_t i = start_idx; i < end_idx; i++) {
+      jbyte* card_ptr = _hot_cache[i];
+      if (card_ptr != NULL) {
+        if (g1rs->refine_card(card_ptr, worker_i, true)) {
+          // The part of the heap spanned by the card contains references
+          // that point into the current collection set.
+          // We need to record the card pointer in the DirtyCardQueueSet
+          // that we use for such cards.
+          //
+          // The only time we care about recording cards that contain
+          // references that point into the collection set is during
+          // RSet updating while within an evacuation pause.
+          // In this case worker_i should be the id of a GC worker thread
+          assert(SafepointSynchronize::is_at_safepoint(), "Should be at a safepoint");
+          assert(worker_i < ParallelGCThreads,
+                 err_msg("incorrect worker id: %u", worker_i));
 
-    if (start_idx ==
-        Atomic::cmpxchg_ptr(end_idx, &_hot_cache_par_claimed_idx, start_idx)) {
-      // The current worker has successfully claimed the chunk [start_idx..end_idx)
-      end_idx = MIN2(end_idx, _hot_cache_size);
-      for (intptr_t i = start_idx; i < end_idx; i++) {
-        jbyte* card_ptr = _hot_cache[i];
-        if (card_ptr != NULL) {
-          if (g1rs->refine_card(card_ptr, worker_i, true)) {
-            // The part of the heap spanned by the card contains references
-            // that point into the current collection set.
-            // We need to record the card pointer in the DirtyCardQueueSet
-            // that we use for such cards.
-            //
-            // The only time we care about recording cards that contain
-            // references that point into the collection set is during
-            // RSet updating while within an evacuation pause.
-            // In this case worker_i should be the id of a GC worker thread
-            assert(SafepointSynchronize::is_at_safepoint(), "Should be at a safepoint");
-            assert(worker_i < ParallelGCThreads,
-                   err_msg("incorrect worker id: %u", worker_i));
-
-            into_cset_dcq->enqueue(card_ptr);
-          }
-        } else {
-          break;
+          into_cset_dcq->enqueue(card_ptr);
         }
+      } else {
+        break;
       }
     }
   }
+  
   // The existing entries in the hot card cache, which were just refined
   // above, are discarded prior to re-enabling the cache near the end of the GC.
 }
diff --git a/src/share/vm/gc_implementation/g1/g1HotCardCache.hpp b/src/share/vm/gc_implementation/g1/g1HotCardCache.hpp
--- a/src/share/vm/gc_implementation/g1/g1HotCardCache.hpp
+++ b/src/share/vm/gc_implementation/g1/g1HotCardCache.hpp
@@ -64,7 +64,7 @@
   // The card cache table
   jbyte**           _hot_cache;
 
-  intptr_t          _hot_cache_size;
+  size_t            _hot_cache_size;
 
   int               _hot_cache_par_chunk_size;
 
@@ -73,9 +73,9 @@
   // thus it's not necessary to separate them as well
   char _pad_before[DEFAULT_CACHE_LINE_SIZE];
 
-  volatile intptr_t _hot_cache_idx;
+  volatile size_t _hot_cache_idx;
   
-  volatile intptr_t _hot_cache_par_claimed_idx;
+  volatile size_t _hot_cache_par_claimed_idx;
 
   char _pad_after[DEFAULT_CACHE_LINE_SIZE];
 
@@ -123,7 +123,7 @@
     assert(SafepointSynchronize::is_at_safepoint(), "Should be at a safepoint");
     assert(Thread::current()->is_VM_thread(), "Current thread should be the VMthread");
     _hot_cache_idx = 0;
-    for (intptr_t i = 0; i < _hot_cache_size; i++) {
+    for (size_t i = 0; i < _hot_cache_size; i++) {
         _hot_cache[i] = NULL;
     }
   }