Print this page
rev 2896 : 6484965: G1: piggy-back liveness accounting phase on marking
Summary: Remove the separate counting phase of concurrent marking by tracking the amount of marked bytes and the cards spanned by marked objects in marking task/worker thread local data structures, which are updated as individual objects are marked.
Reviewed-by: brutisso

Split Close
Expand all
Collapse all
          --- old/src/share/vm/gc_implementation/g1/concurrentMark.cpp
          +++ new/src/share/vm/gc_implementation/g1/concurrentMark.cpp
↓ open down ↓ 470 lines elided ↑ open up ↑
 471  471    _max_parallel_marking_threads(0),
 472  472    _sleep_factor(0.0),
 473  473    _marking_task_overhead(1.0),
 474  474    _cleanup_sleep_factor(0.0),
 475  475    _cleanup_task_overhead(1.0),
 476  476    _cleanup_list("Cleanup List"),
 477  477    _region_bm(max_regions, false /* in_resource_area*/),
 478  478    _card_bm((rs.size() + CardTableModRefBS::card_size - 1) >>
 479  479             CardTableModRefBS::card_shift,
 480  480             false /* in_resource_area*/),
      481 +
 481  482    _prevMarkBitMap(&_markBitMap1),
 482  483    _nextMarkBitMap(&_markBitMap2),
 483  484    _at_least_one_mark_complete(false),
 484  485  
 485  486    _markStack(this),
 486  487    _regionStack(),
 487  488    // _finger set in set_non_marking_state
 488  489  
 489  490    _max_task_num(MAX2(ParallelGCThreads, (size_t)1)),
 490  491    // _active_tasks set in set_non_marking_state
↓ open down ↓ 9 lines elided ↑ open up ↑
 500  501    _should_gray_objects(false),
 501  502  
 502  503    // _verbose_level set below
 503  504  
 504  505    _init_times(),
 505  506    _remark_times(), _remark_mark_times(), _remark_weak_ref_times(),
 506  507    _cleanup_times(),
 507  508    _total_counting_time(0.0),
 508  509    _total_rs_scrub_time(0.0),
 509  510  
 510      -  _parallel_workers(NULL) {
      511 +  _parallel_workers(NULL),
      512 +
      513 +  _count_card_bitmaps(NULL),
      514 +  _count_marked_bytes(NULL)
      515 +{
 511  516    CMVerboseLevel verbose_level = (CMVerboseLevel) G1MarkingVerboseLevel;
 512  517    if (verbose_level < no_verbose) {
 513  518      verbose_level = no_verbose;
 514  519    }
 515  520    if (verbose_level > high_verbose) {
 516  521      verbose_level = high_verbose;
 517  522    }
 518  523    _verbose_level = verbose_level;
 519  524  
 520  525    if (verbose_low()) {
↓ open down ↓ 13 lines elided ↑ open up ↑
 534  539    assert(CGC_lock != NULL, "Where's the CGC_lock?");
 535  540    assert(_markBitMap1.covers(rs), "_markBitMap1 inconsistency");
 536  541    assert(_markBitMap2.covers(rs), "_markBitMap2 inconsistency");
 537  542  
 538  543    SATBMarkQueueSet& satb_qs = JavaThread::satb_mark_queue_set();
 539  544    satb_qs.set_buffer_size(G1SATBBufferSize);
 540  545  
 541  546    _tasks = NEW_C_HEAP_ARRAY(CMTask*, _max_task_num);
 542  547    _accum_task_vtime = NEW_C_HEAP_ARRAY(double, _max_task_num);
 543  548  
      549 +  _count_card_bitmaps = NEW_C_HEAP_ARRAY(BitMap,  _max_task_num);
      550 +  _count_marked_bytes = NEW_C_HEAP_ARRAY(size_t*, _max_task_num);
      551 +
      552 +  BitMap::idx_t card_bm_size = _card_bm.size();
      553 +
 544  554    // so that the assertion in MarkingTaskQueue::task_queue doesn't fail
 545  555    _active_tasks = _max_task_num;
 546  556    for (int i = 0; i < (int) _max_task_num; ++i) {
 547  557      CMTaskQueue* task_queue = new CMTaskQueue();
 548  558      task_queue->initialize();
 549  559      _task_queues->register_queue(i, task_queue);
 550  560  
 551      -    _tasks[i] = new CMTask(i, this, task_queue, _task_queues);
      561 +    _count_card_bitmaps[i] = BitMap(card_bm_size, false);
      562 +    _count_marked_bytes[i] = NEW_C_HEAP_ARRAY(size_t, max_regions);
      563 +
      564 +    _tasks[i] = new CMTask(i, this, 
      565 +                           _count_marked_bytes[i],
      566 +                           &_count_card_bitmaps[i],
      567 +                           task_queue, _task_queues);
      568 +
 552  569      _accum_task_vtime[i] = 0.0;
 553  570    }
 554  571  
      572 +  // Calculate the card number for the bottom of the heap. Used
      573 +  // in biasing indexes into the accounting card bitmaps.
      574 +  _heap_bottom_card_num =
      575 +    intptr_t(uintptr_t(_g1h->reserved_region().start()) >> 
      576 +                                CardTableModRefBS::card_shift);
      577 +
      578 +
 555  579    if (ConcGCThreads > ParallelGCThreads) {
 556  580      vm_exit_during_initialization("Can't have more ConcGCThreads "
 557  581                                    "than ParallelGCThreads.");
 558  582    }
 559  583    if (ParallelGCThreads == 0) {
 560  584      // if we are not running with any parallel GC threads we will not
 561  585      // spawn any marking threads either
 562  586      _parallel_marking_threads =       0;
 563  587      _max_parallel_marking_threads =   0;
 564  588      _sleep_factor             =     0.0;
↓ open down ↓ 103 lines elided ↑ open up ↑
 668  692    _heap_end   = committed.end();
 669  693  
 670  694    // Separated the asserts so that we know which one fires.
 671  695    assert(_heap_start != NULL, "heap bounds should look ok");
 672  696    assert(_heap_end != NULL, "heap bounds should look ok");
 673  697    assert(_heap_start < _heap_end, "heap bounds should look ok");
 674  698  
 675  699    // reset all the marking data structures and any necessary flags
 676  700    clear_marking_state();
 677  701  
      702 +  clear_all_count_data();
      703 +
 678  704    if (verbose_low()) {
 679  705      gclog_or_tty->print_cr("[global] resetting");
 680  706    }
 681  707  
 682  708    // We do reset all of them, since different phases will use
 683  709    // different number of active threads. So, it's easiest to have all
 684  710    // of them ready.
 685  711    for (int i = 0; i < (int) _max_task_num; ++i) {
 686  712      _tasks[i]->reset(_nextMarkBitMap);
 687  713    }
↓ open down ↓ 31 lines elided ↑ open up ↑
 719  745  }
 720  746  
 721  747  void ConcurrentMark::set_non_marking_state() {
 722  748    // We set the global marking state to some default values when we're
 723  749    // not doing marking.
 724  750    clear_marking_state();
 725  751    _active_tasks = 0;
 726  752    clear_concurrent_marking_in_progress();
 727  753  }
 728  754  
 729      -ConcurrentMark::~ConcurrentMark() {
 730      -  for (int i = 0; i < (int) _max_task_num; ++i) {
 731      -    delete _task_queues->queue(i);
 732      -    delete _tasks[i];
 733      -  }
 734      -  delete _task_queues;
 735      -  FREE_C_HEAP_ARRAY(CMTask*, _max_task_num);
 736      -}
 737      -
 738  755  // This closure is used to mark refs into the g1 generation
 739  756  // from external roots in the CMS bit map.
 740  757  // Called at the first checkpoint.
 741  758  //
 742  759  
 743  760  void ConcurrentMark::clearNextBitmap() {
 744  761    G1CollectedHeap* g1h = G1CollectedHeap::heap();
 745  762    G1CollectorPolicy* g1p = g1h->g1_policy();
 746  763  
 747  764    // Make sure that the concurrent mark thread looks to still be in
↓ open down ↓ 196 lines elided ↑ open up ↑
 944  961  bool ForceOverflowSettings::should_force() {
 945  962    if (_force) {
 946  963      _force = false;
 947  964      return true;
 948  965    } else {
 949  966      return false;
 950  967    }
 951  968  }
 952  969  #endif // !PRODUCT
 953  970  
 954      -void ConcurrentMark::grayRoot(oop p) {
      971 +void ConcurrentMark::grayRoot(oop p, int worker_i) {
 955  972    HeapWord* addr = (HeapWord*) p;
 956  973    // We can't really check against _heap_start and _heap_end, since it
 957  974    // is possible during an evacuation pause with piggy-backed
 958  975    // initial-mark that the committed space is expanded during the
 959  976    // pause without CM observing this change. So the assertions below
 960  977    // is a bit conservative; but better than nothing.
 961  978    assert(_g1h->g1_committed().contains(addr),
 962  979           "address should be within the heap bounds");
 963  980  
 964  981    if (!_nextMarkBitMap->isMarked(addr)) {
 965      -    _nextMarkBitMap->parMark(addr);
      982 +    par_mark_and_count(p, worker_i);
 966  983    }
 967  984  }
 968  985  
 969  986  void ConcurrentMark::grayRegionIfNecessary(MemRegion mr) {
 970  987    // The objects on the region have already been marked "in bulk" by
 971  988    // the caller. We only need to decide whether to push the region on
 972  989    // the region stack or not.
 973  990  
 974  991    if (!concurrent_marking_in_progress() || !_should_gray_objects) {
 975  992      // We're done with marking and waiting for remark. We do not need to
↓ open down ↓ 28 lines elided ↑ open up ↑
1004 1021      }
1005 1022  
1006 1023      if (!region_stack_push_lock_free(mr)) {
1007 1024        if (verbose_low()) {
1008 1025          gclog_or_tty->print_cr("[global] region stack has overflown.");
1009 1026        }
1010 1027      }
1011 1028    }
1012 1029  }
1013 1030  
1014      -void ConcurrentMark::markAndGrayObjectIfNecessary(oop p) {
     1031 +void ConcurrentMark::markAndGrayObjectIfNecessary(oop p, int worker_i) {
1015 1032    // The object is not marked by the caller. We need to at least mark
1016 1033    // it and maybe push in on the stack.
1017 1034  
1018 1035    HeapWord* addr = (HeapWord*)p;
1019 1036    if (!_nextMarkBitMap->isMarked(addr)) {
1020 1037      // We definitely need to mark it, irrespective whether we bail out
1021 1038      // because we're done with marking.
1022      -    if (_nextMarkBitMap->parMark(addr)) {
     1039 +
     1040 +    if (par_mark_and_count(p, worker_i)) {
1023 1041        if (!concurrent_marking_in_progress() || !_should_gray_objects) {
1024 1042          // If we're done with concurrent marking and we're waiting for
1025 1043          // remark, then we're not pushing anything on the stack.
1026 1044          return;
1027 1045        }
1028 1046  
1029 1047        // No OrderAccess:store_load() is needed. It is implicit in the
1030 1048        // CAS done in parMark(addr) above
1031 1049        HeapWord* finger = _finger;
1032 1050  
↓ open down ↓ 181 lines elided ↑ open up ↑
1214 1232  
1215 1233    if (has_overflown()) {
1216 1234      // Oops.  We overflowed.  Restart concurrent marking.
1217 1235      _restart_for_overflow = true;
1218 1236      // Clear the flag. We do not need it any more.
1219 1237      clear_has_overflown();
1220 1238      if (G1TraceMarkStackOverflow) {
1221 1239        gclog_or_tty->print_cr("\nRemark led to restart for overflow.");
1222 1240      }
1223 1241    } else {
     1242 +    // Aggregate the per-task counting data that we have accumulated
     1243 +    // while marking.
     1244 +    aggregate_and_clear_count_data();
     1245 +
1224 1246      SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
1225 1247      // We're done with marking.
1226 1248      // This is the end of  the marking cycle, we're expected all
1227 1249      // threads to have SATB queues with active set to true.
1228 1250      satb_mq_set.set_active_all_threads(false, /* new active value */
1229 1251                                         true /* expected_active */);
1230 1252  
1231 1253      if (VerifyDuringGC) {
1232 1254  
1233 1255        HandleMark hm;  // handle scope
↓ open down ↓ 20 lines elided ↑ open up ↑
1254 1276    double now = os::elapsedTime();
1255 1277    _remark_mark_times.add((mark_work_end - start) * 1000.0);
1256 1278    _remark_weak_ref_times.add((now - mark_work_end) * 1000.0);
1257 1279    _remark_times.add((now - start) * 1000.0);
1258 1280  
1259 1281    g1p->record_concurrent_mark_remark_end();
1260 1282  }
1261 1283  
1262 1284  #define CARD_BM_TEST_MODE 0
1263 1285  
     1286 +// Used to calculate the # live objects per region
     1287 +// for verification purposes
1264 1288  class CalcLiveObjectsClosure: public HeapRegionClosure {
1265 1289  
1266 1290    CMBitMapRO* _bm;
1267 1291    ConcurrentMark* _cm;
1268      -  bool _changed;
1269      -  bool _yield;
1270      -  size_t _words_done;
     1292 +  BitMap* _region_bm;
     1293 +  BitMap* _card_bm;
     1294 +
     1295 +  size_t _tot_words_done;
1271 1296    size_t _tot_live;
1272 1297    size_t _tot_used;
1273      -  size_t _regions_done;
1274      -  double _start_vtime_sec;
1275 1298  
1276      -  BitMap* _region_bm;
1277      -  BitMap* _card_bm;
     1299 +  size_t _region_marked_bytes;
     1300 +
1278 1301    intptr_t _bottom_card_num;
1279      -  bool _final;
1280 1302  
1281 1303    void mark_card_num_range(intptr_t start_card_num, intptr_t last_card_num) {
1282      -    for (intptr_t i = start_card_num; i <= last_card_num; i++) {
     1304 +    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
     1305 +    BitMap::idx_t last_idx = last_card_num - _bottom_card_num;
     1306 +    
     1307 +    for (BitMap::idx_t i = start_idx; i <= last_idx; i += 1) {
1283 1308  #if CARD_BM_TEST_MODE
1284      -      guarantee(_card_bm->at(i - _bottom_card_num), "Should already be set.");
     1309 +      guarantee(_card_bm->at(i), "Should already be set.");
1285 1310  #else
1286      -      _card_bm->par_at_put(i - _bottom_card_num, 1);
     1311 +      _card_bm->par_at_put(i, 1);
1287 1312  #endif
1288 1313      }
1289 1314    }
1290 1315  
1291 1316  public:
1292      -  CalcLiveObjectsClosure(bool final,
1293      -                         CMBitMapRO *bm, ConcurrentMark *cm,
     1317 +  CalcLiveObjectsClosure(CMBitMapRO *bm, ConcurrentMark *cm,
1294 1318                           BitMap* region_bm, BitMap* card_bm) :
1295      -    _bm(bm), _cm(cm), _changed(false), _yield(true),
1296      -    _words_done(0), _tot_live(0), _tot_used(0),
1297      -    _region_bm(region_bm), _card_bm(card_bm),_final(final),
1298      -    _regions_done(0), _start_vtime_sec(0.0)
     1319 +    _bm(bm), _cm(cm), _region_bm(region_bm), _card_bm(card_bm),
     1320 +    _region_marked_bytes(0), _tot_words_done(0),
     1321 +    _tot_live(0), _tot_used(0)
1299 1322    {
1300      -    _bottom_card_num =
1301      -      intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
1302      -               CardTableModRefBS::card_shift);
     1323 +    _bottom_card_num = cm->heap_bottom_card_num();
1303 1324    }
1304 1325  
1305 1326    // It takes a region that's not empty (i.e., it has at least one
1306 1327    // live object in it and sets its corresponding bit on the region
1307 1328    // bitmap to 1. If the region is "starts humongous" it will also set
1308 1329    // to 1 the bits on the region bitmap that correspond to its
1309 1330    // associated "continues humongous" regions.
1310 1331    void set_bit_for_region(HeapRegion* hr) {
1311 1332      assert(!hr->continuesHumongous(), "should have filtered those out");
1312 1333  
1313 1334      size_t index = hr->hrs_index();
1314 1335      if (!hr->startsHumongous()) {
1315 1336        // Normal (non-humongous) case: just set the bit.
1316 1337        _region_bm->par_at_put((BitMap::idx_t) index, true);
1317 1338      } else {
1318 1339        // Starts humongous case: calculate how many regions are part of
1319      -      // this humongous region and then set the bit range. It might
1320      -      // have been a bit more efficient to look at the object that
1321      -      // spans these humongous regions to calculate their number from
1322      -      // the object's size. However, it's a good idea to calculate
1323      -      // this based on the metadata itself, and not the region
1324      -      // contents, so that this code is not aware of what goes into
1325      -      // the humongous regions (in case this changes in the future).
     1340 +      // this humongous region and then set the bit range.
1326 1341        G1CollectedHeap* g1h = G1CollectedHeap::heap();
1327      -      size_t end_index = index + 1;
1328      -      while (end_index < g1h->n_regions()) {
1329      -        HeapRegion* chr = g1h->region_at(end_index);
1330      -        if (!chr->continuesHumongous()) break;
1331      -        end_index += 1;
1332      -      }
     1342 +      HeapRegion *last_hr = g1h->heap_region_containing_raw(hr->end() - 1);
     1343 +      size_t end_index = last_hr->hrs_index() + 1;
1333 1344        _region_bm->par_at_put_range((BitMap::idx_t) index,
1334 1345                                     (BitMap::idx_t) end_index, true);
1335 1346      }
1336 1347    }
1337 1348  
1338 1349    bool doHeapRegion(HeapRegion* hr) {
1339      -    if (!_final && _regions_done == 0) {
1340      -      _start_vtime_sec = os::elapsedVTime();
1341      -    }
1342 1350  
1343 1351      if (hr->continuesHumongous()) {
1344 1352        // We will ignore these here and process them when their
1345 1353        // associated "starts humongous" region is processed (see
1346 1354        // set_bit_for_heap_region()). Note that we cannot rely on their
1347 1355        // associated "starts humongous" region to have their bit set to
1348 1356        // 1 since, due to the region chunking in the parallel region
1349 1357        // iteration, a "continues humongous" region might be visited
1350 1358        // before its associated "starts humongous".
1351 1359        return false;
1352 1360      }
1353 1361  
1354 1362      HeapWord* nextTop = hr->next_top_at_mark_start();
1355      -    HeapWord* start   = hr->top_at_conc_mark_count();
1356      -    assert(hr->bottom() <= start && start <= hr->end() &&
1357      -           hr->bottom() <= nextTop && nextTop <= hr->end() &&
1358      -           start <= nextTop,
1359      -           "Preconditions.");
1360      -    // Otherwise, record the number of word's we'll examine.
     1363 +    HeapWord* start   = hr->bottom();
     1364 +
     1365 +    assert(start <= hr->end() && start <= nextTop && nextTop <= hr->end(),
     1366 +                "Preconditions.");
     1367 +
     1368 +    // Record the number of words we'll examine.
1361 1369      size_t words_done = (nextTop - start);
     1370 +
1362 1371      // Find the first marked object at or after "start".
1363 1372      start = _bm->getNextMarkedWordAddress(start, nextTop);
     1373 +
1364 1374      size_t marked_bytes = 0;
     1375 +    _region_marked_bytes = 0;
1365 1376  
1366 1377      // Below, the term "card num" means the result of shifting an address
1367 1378      // by the card shift -- address 0 corresponds to card number 0.  One
1368 1379      // must subtract the card num of the bottom of the heap to obtain a
1369 1380      // card table index.
     1381 +
1370 1382      // The first card num of the sequence of live cards currently being
1371 1383      // constructed.  -1 ==> no sequence.
1372 1384      intptr_t start_card_num = -1;
     1385 +
1373 1386      // The last card num of the sequence of live cards currently being
1374 1387      // constructed.  -1 ==> no sequence.
1375 1388      intptr_t last_card_num = -1;
1376 1389  
1377 1390      while (start < nextTop) {
1378      -      if (_yield && _cm->do_yield_check()) {
1379      -        // We yielded.  It might be for a full collection, in which case
1380      -        // all bets are off; terminate the traversal.
1381      -        if (_cm->has_aborted()) {
1382      -          _changed = false;
1383      -          return true;
1384      -        } else {
1385      -          // Otherwise, it might be a collection pause, and the region
1386      -          // we're looking at might be in the collection set.  We'll
1387      -          // abandon this region.
1388      -          return false;
1389      -        }
1390      -      }
1391 1391        oop obj = oop(start);
1392 1392        int obj_sz = obj->size();
     1393 +
1393 1394        // The card num of the start of the current object.
1394 1395        intptr_t obj_card_num =
1395 1396          intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
1396      -
1397 1397        HeapWord* obj_last = start + obj_sz - 1;
1398 1398        intptr_t obj_last_card_num =
1399 1399          intptr_t(uintptr_t(obj_last) >> CardTableModRefBS::card_shift);
1400 1400  
1401 1401        if (obj_card_num != last_card_num) {
1402 1402          if (start_card_num == -1) {
1403 1403            assert(last_card_num == -1, "Both or neither.");
1404 1404            start_card_num = obj_card_num;
1405 1405          } else {
1406 1406            assert(last_card_num != -1, "Both or neither.");
1407 1407            assert(obj_card_num >= last_card_num, "Inv");
1408 1408            if ((obj_card_num - last_card_num) > 1) {
1409 1409              // Mark the last run, and start a new one.
1410 1410              mark_card_num_range(start_card_num, last_card_num);
1411 1411              start_card_num = obj_card_num;
1412 1412            }
1413 1413          }
1414 1414  #if CARD_BM_TEST_MODE
1415      -        /*
1416      -        gclog_or_tty->print_cr("Setting bits from %d/%d.",
1417      -                               obj_card_num - _bottom_card_num,
1418      -                               obj_last_card_num - _bottom_card_num);
1419      -        */
1420 1415          for (intptr_t j = obj_card_num; j <= obj_last_card_num; j++) {
1421 1416            _card_bm->par_at_put(j - _bottom_card_num, 1);
1422 1417          }
1423      -#endif
     1418 +#endif // CARD_BM_TEST_MODE
1424 1419        }
1425 1420        // In any case, we set the last card num.
1426 1421        last_card_num = obj_last_card_num;
1427 1422  
1428 1423        marked_bytes += (size_t)obj_sz * HeapWordSize;
     1424 +
1429 1425        // Find the next marked object after this one.
1430 1426        start = _bm->getNextMarkedWordAddress(start + 1, nextTop);
1431      -      _changed = true;
1432 1427      }
     1428 +
1433 1429      // Handle the last range, if any.
1434 1430      if (start_card_num != -1) {
1435 1431        mark_card_num_range(start_card_num, last_card_num);
1436 1432      }
1437      -    if (_final) {
1438      -      // Mark the allocated-since-marking portion...
1439      -      HeapWord* tp = hr->top();
1440      -      if (nextTop < tp) {
1441      -        start_card_num =
1442      -          intptr_t(uintptr_t(nextTop) >> CardTableModRefBS::card_shift);
1443      -        last_card_num =
1444      -          intptr_t(uintptr_t(tp) >> CardTableModRefBS::card_shift);
1445      -        mark_card_num_range(start_card_num, last_card_num);
1446      -        // This definitely means the region has live objects.
1447      -        set_bit_for_region(hr);
1448      -      }
     1433 +
     1434 +    // Mark the allocated-since-marking portion...
     1435 +    HeapWord* top = hr->top();
     1436 +    if (nextTop < top) {
     1437 +      start_card_num = intptr_t(uintptr_t(nextTop) >> CardTableModRefBS::card_shift);
     1438 +      last_card_num = intptr_t(uintptr_t(top) >> CardTableModRefBS::card_shift);
     1439 +
     1440 +      mark_card_num_range(start_card_num, last_card_num);
     1441 +      
     1442 +      // This definitely means the region has live objects.
     1443 +      set_bit_for_region(hr);
1449 1444      }
1450 1445  
1451      -    hr->add_to_marked_bytes(marked_bytes);
1452 1446      // Update the live region bitmap.
1453 1447      if (marked_bytes > 0) {
1454 1448        set_bit_for_region(hr);
1455 1449      }
1456      -    hr->set_top_at_conc_mark_count(nextTop);
     1450 +
     1451 +    // Set the marked bytes for the current region so that
     1452 +    // it can be queried by a calling verification routine
     1453 +    _region_marked_bytes = marked_bytes;
     1454 +
1457 1455      _tot_live += hr->next_live_bytes();
1458 1456      _tot_used += hr->used();
1459      -    _words_done = words_done;
     1457 +    _tot_words_done = words_done;
1460 1458  
1461      -    if (!_final) {
1462      -      ++_regions_done;
1463      -      if (_regions_done % 10 == 0) {
1464      -        double end_vtime_sec = os::elapsedVTime();
1465      -        double elapsed_vtime_sec = end_vtime_sec - _start_vtime_sec;
1466      -        if (elapsed_vtime_sec > (10.0 / 1000.0)) {
1467      -          jlong sleep_time_ms =
1468      -            (jlong) (elapsed_vtime_sec * _cm->cleanup_sleep_factor() * 1000.0);
1469      -          os::sleep(Thread::current(), sleep_time_ms, false);
1470      -          _start_vtime_sec = end_vtime_sec;
     1459 +    return false;
     1460 +  }
     1461 +
     1462 +  size_t region_marked_bytes() const { return _region_marked_bytes; }
     1463 +  size_t tot_words_done() const      { return _tot_words_done; }
     1464 +  size_t tot_live() const            { return _tot_live; }
     1465 +  size_t tot_used() const            { return _tot_used; }
     1466 +};
     1467 +
     1468 +// Heap region closure used for verifying the counting data
     1469 +// that was accumulated concurrently and aggregated during
     1470 +// the remark pause. This closure is applied to the heap
     1471 +// regions during the STW cleanup pause.
     1472 +
     1473 +class VerifyLiveObjectDataHRClosure: public HeapRegionClosure {
     1474 +  ConcurrentMark* _cm;
     1475 +  CalcLiveObjectsClosure _calc_cl;
     1476 +  BitMap* _region_bm;   // Region BM to be verified
     1477 +  BitMap* _card_bm;     // Card BM to be verified
     1478 +  bool _verbose;        // verbose output?
     1479 +
     1480 +  BitMap* _exp_region_bm; // Expected Region BM values
     1481 +  BitMap* _exp_card_bm;   // Expected card BM values
     1482 +
     1483 +  intptr_t _bottom_card_num; // Used for calculating bitmap indices
     1484 +
     1485 +  int _failures;
     1486 +
     1487 +public:
     1488 +  VerifyLiveObjectDataHRClosure(ConcurrentMark* cm,
     1489 +                                BitMap* region_bm,
     1490 +                                BitMap* card_bm,
     1491 +                                BitMap* exp_region_bm,
     1492 +                                BitMap* exp_card_bm,
     1493 +                                bool verbose) :
     1494 +    _cm(cm),
     1495 +    _calc_cl(_cm->nextMarkBitMap(), _cm, exp_region_bm, exp_card_bm),
     1496 +    _region_bm(region_bm), _card_bm(card_bm), _verbose(verbose),
     1497 +    _exp_region_bm(exp_region_bm), _exp_card_bm(exp_card_bm),
     1498 +    _failures(0)
     1499 +  { 
     1500 +    _bottom_card_num = cm->heap_bottom_card_num();
     1501 +  }
     1502 +
     1503 +  int failures() const { return _failures; }
     1504 +
     1505 +  bool doHeapRegion(HeapRegion* hr) {
     1506 +    if (hr->continuesHumongous()) {
     1507 +      // We will ignore these here and process them when their
     1508 +      // associated "starts humongous" region is processed (see
     1509 +      // set_bit_for_region()). Note that we cannot rely on their
     1510 +      // associated "starts humongous" region to have their bit set to
     1511 +      // 1 since, due to the region chunking in the parallel region
     1512 +      // iteration, a "continues humongous" region might be visited
     1513 +      // before its associated "starts humongous".
     1514 +      return false;
     1515 +    }
     1516 +
     1517 +    int failures = 0;
     1518 +
     1519 +    // Call the CalcLiveObjectsClosure to walk the marking bitmap for
     1520 +    // this region and set the corresponding bits in the expected region
     1521 +    // and card bitmaps.
     1522 +    bool res = _calc_cl.doHeapRegion(hr);
     1523 +    assert(res == false, "should be continuing");
     1524 +
     1525 +    // Note that the calculated count data could be a subset of the
     1526 +    // count data that was accumulated during marking. See the comment
     1527 +    // in G1ParCopyHelper::copy_to_survivor space for an explanation
     1528 +    // why.
     1529 +
     1530 +    // Verify that _top_at_conc_count == ntams
     1531 +    if (hr->top_at_conc_mark_count() != hr->next_top_at_mark_start()) {
     1532 +      if (_verbose) {
     1533 +        gclog_or_tty->print_cr("Region %d: top at conc count incorrect: expected "
     1534 +                               PTR_FORMAT", actual: "PTR_FORMAT,
     1535 +                               hr->hrs_index(), hr->next_top_at_mark_start(),
     1536 +                               hr->top_at_conc_mark_count());
     1537 +      }
     1538 +      failures += 1;
     1539 +    }
     1540 +
     1541 +    // Verify the marked bytes for this region. 
     1542 +    size_t exp_marked_bytes = _calc_cl.region_marked_bytes();
     1543 +    size_t act_marked_bytes = hr->next_marked_bytes();
     1544 +    
     1545 +    // We're not OK if expected marked bytes > actual marked bytes. It means
     1546 +    // we have missed accounting some objects during the actual marking.
     1547 +    if (exp_marked_bytes > act_marked_bytes) {
     1548 +      if (_verbose) {
     1549 +        gclog_or_tty->print_cr("Region %d: marked bytes mismatch: expected: "
     1550 +                               SIZE_FORMAT", actual: "SIZE_FORMAT,
     1551 +                               hr->hrs_index(), exp_marked_bytes, act_marked_bytes);
     1552 +      }
     1553 +      failures += 1;
     1554 +    }
     1555 +
     1556 +    // Verify the bit, for this region, in the actual and expected
     1557 +    // (which was just calculated) region bit maps.
     1558 +    // We're not OK if the expected bit is set and the actual is not set.
     1559 +    BitMap::idx_t index = (BitMap::idx_t)hr->hrs_index();
     1560 +    
     1561 +    bool expected = _exp_region_bm->at(index);
     1562 +    bool actual = _region_bm->at(index);
     1563 +    if (expected && !actual) {
     1564 +      if (_verbose) {
     1565 +        gclog_or_tty->print_cr("Region %d: region bitmap mismatch: expected: %d, actual: %d",
     1566 +                               hr->hrs_index(), expected, actual);
     1567 +      }
     1568 +      failures += 1;
     1569 +    }
     1570 +
     1571 +    // Verify that the card bit maps for the cards spanned by the current
     1572 +    // region match. The set of offsets that have set bits in the expected
     1573 +    // bitmap should be a subset of the offsets with set bits from the actual
     1574 +    // calculated card bitmap.
     1575 +    // Again it's more important that if the expected bit is set then the
     1576 +    // actual bit be set.
     1577 +    intptr_t start_card_num =
     1578 +        intptr_t(uintptr_t(hr->bottom()) >> CardTableModRefBS::card_shift);
     1579 +    intptr_t top_card_num =
     1580 +        intptr_t(uintptr_t(hr->top()) >> CardTableModRefBS::card_shift);
     1581 +
     1582 +    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
     1583 +    BitMap::idx_t end_idx = top_card_num - _bottom_card_num;
     1584 +
     1585 +    for (BitMap::idx_t i = start_idx; i < end_idx; i+=1) {
     1586 +      expected = _exp_card_bm->at(i);
     1587 +      actual = _card_bm->at(i);
     1588 +      
     1589 +      if (expected && !actual) {
     1590 +        if (_verbose) {
     1591 +          gclog_or_tty->print_cr("Region %d: card bitmap mismatch at idx %d: expected: %d, actual: %d",
     1592 +                                 hr->hrs_index(), i, expected, actual);
1471 1593          }
     1594 +        failures += 1;
1472 1595        }
1473 1596      }
1474 1597  
     1598 +    if (failures > 0 && _verbose)  {
     1599 +      gclog_or_tty->print("Region %d: bottom: "PTR_FORMAT", ntams: "
     1600 +                          PTR_FORMAT", top: "PTR_FORMAT", end: "PTR_FORMAT,
     1601 +                          hr->hrs_index(), hr->bottom(), hr->next_top_at_mark_start(),
     1602 +                          hr->top(), hr->end());
     1603 +      gclog_or_tty->print_cr(", marked_bytes: calc/actual "SIZE_FORMAT"/"SIZE_FORMAT,
     1604 +                             _calc_cl.region_marked_bytes(),
     1605 +                             hr->next_marked_bytes());
     1606 +    }
     1607 +
     1608 +    _failures += failures;
     1609 +
     1610 +    // We could stop iteration over the heap when we
     1611 +    // find the first violating region by returning true.
1475 1612      return false;
1476 1613    }
     1614 +};
     1615 +
     1616 +
     1617 +class G1ParVerifyFinalCountTask: public AbstractGangTask {
     1618 +protected:
     1619 +  G1CollectedHeap* _g1h;
     1620 +  ConcurrentMark* _cm;
     1621 +  BitMap* _actual_region_bm;
     1622 +  BitMap* _actual_card_bm;
     1623 +
     1624 +  size_t _n_workers;
     1625 +
     1626 +  BitMap* _expected_region_bm;
     1627 +  BitMap* _expected_card_bm;
     1628 +
     1629 +  int  _failures;
     1630 +  bool _verbose;
     1631 +
     1632 +public:
     1633 +  G1ParVerifyFinalCountTask(G1CollectedHeap* g1h,
     1634 +                            BitMap* region_bm, BitMap* card_bm,
     1635 +                            BitMap* expected_region_bm, BitMap* expected_card_bm)
     1636 +    : AbstractGangTask("G1 verify final counting"),
     1637 +      _g1h(g1h), _cm(_g1h->concurrent_mark()),
     1638 +      _actual_region_bm(region_bm), _actual_card_bm(card_bm),
     1639 +      _expected_region_bm(expected_region_bm), _expected_card_bm(expected_card_bm),
     1640 +      _failures(0), _verbose(false),
     1641 +      _n_workers(0)
     1642 +  {
     1643 +    assert(VerifyDuringGC, "don't call this otherwise");
     1644 +
     1645 +    // Use the value already set as the number of active threads
     1646 +    // in the call to run_task().
     1647 +    if (G1CollectedHeap::use_parallel_gc_threads()) {
     1648 +      assert( _g1h->workers()->active_workers() > 0,
     1649 +        "Should have been previously set");
     1650 +      _n_workers = _g1h->workers()->active_workers();
     1651 +    } else {
     1652 +      _n_workers = 1;
     1653 +    }
     1654 +
     1655 +    assert(_expected_card_bm->size() == _actual_card_bm->size(), "sanity");
     1656 +    assert(_expected_region_bm->size() == _actual_region_bm->size(), "sanity");
     1657 +
     1658 +    _verbose = _cm->verbose_medium();
     1659 +  }
     1660 +
     1661 +  void work(int worker_i) {
     1662 +    assert((size_t) worker_i < _n_workers, "invariant");
     1663 +
     1664 +    VerifyLiveObjectDataHRClosure verify_cl(_cm,
     1665 +                                            _actual_region_bm, _actual_card_bm,
     1666 +                                            _expected_region_bm,
     1667 +                                            _expected_card_bm,
     1668 +                                            _verbose);
1477 1669  
1478      -  bool changed() { return _changed;  }
1479      -  void reset()   { _changed = false; _words_done = 0; }
1480      -  void no_yield() { _yield = false; }
1481      -  size_t words_done() { return _words_done; }
1482      -  size_t tot_live() { return _tot_live; }
1483      -  size_t tot_used() { return _tot_used; }
     1670 +    if (G1CollectedHeap::use_parallel_gc_threads()) {
     1671 +      _g1h->heap_region_par_iterate_chunked(&verify_cl,
     1672 +                                            worker_i,
     1673 +                                            (int) _n_workers,
     1674 +                                            HeapRegion::VerifyCountClaimValue);
     1675 +    } else {
     1676 +      _g1h->heap_region_iterate(&verify_cl);
     1677 +    }
     1678 +
     1679 +    Atomic::add(verify_cl.failures(), &_failures);
     1680 +  }
     1681 +
     1682 +  int failures() const { return _failures; }
1484 1683  };
1485 1684  
     1685 +// Final update of count data (during cleanup).
     1686 +// Adds [top_at_count, NTAMS) to the marked bytes for each
     1687 +// region. Sets the bits in the card bitmap corresponding
     1688 +// to the interval [top_at_count, top], and sets the
     1689 +// liveness bit for each region containing live data
     1690 +// in the region bitmap.
1486 1691  
1487      -void ConcurrentMark::calcDesiredRegions() {
1488      -  _region_bm.clear();
1489      -  _card_bm.clear();
1490      -  CalcLiveObjectsClosure calccl(false /*final*/,
1491      -                                nextMarkBitMap(), this,
1492      -                                &_region_bm, &_card_bm);
1493      -  G1CollectedHeap *g1h = G1CollectedHeap::heap();
1494      -  g1h->heap_region_iterate(&calccl);
     1692 +class FinalCountDataUpdateClosure: public HeapRegionClosure {
     1693 +  ConcurrentMark* _cm;
     1694 +  BitMap* _region_bm;
     1695 +  BitMap* _card_bm;
     1696 +  intptr_t _bottom_card_num;
1495 1697  
1496      -  do {
1497      -    calccl.reset();
1498      -    g1h->heap_region_iterate(&calccl);
1499      -  } while (calccl.changed());
1500      -}
     1698 +  size_t _total_live_bytes;
     1699 +  size_t _total_used_bytes;
     1700 +  size_t _total_words_done;
     1701 +
     1702 +  void mark_card_num_range(intptr_t start_card_num, intptr_t last_card_num) {
     1703 +    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
     1704 +    BitMap::idx_t last_idx = last_card_num - _bottom_card_num;
     1705 +    
     1706 +    // Inclusive bit range [start_idx, last_idx]. par_at_put_range
     1707 +    // is exclusive so we have to also set the bit for last_idx.
     1708 +    // Passing last_idx+1 to par_at_put_range would work in
     1709 +    // most cases but could trip an OOB assertion.
     1710 +
     1711 +    if ((last_idx - start_idx) > 0) {
     1712 +      _card_bm->par_at_put_range(start_idx, last_idx, true);
     1713 +    }
     1714 +    _card_bm->par_set_bit(last_idx);
     1715 +  }
     1716 +
     1717 +  // It takes a region that's not empty (i.e., it has at least one
     1718 +  // live object in it) and sets its corresponding bit on the region
     1719 +  // bitmap to 1. If the region is "starts humongous" it will also set
     1720 +  // to 1 the bits on the region bitmap that correspond to its
     1721 +  // associated "continues humongous" regions.
     1722 +  void set_bit_for_region(HeapRegion* hr) {
     1723 +    assert(!hr->continuesHumongous(), "should have filtered those out");
     1724 +
     1725 +    size_t index = hr->hrs_index();
     1726 +    if (!hr->startsHumongous()) {
     1727 +      // Normal (non-humongous) case: just set the bit.
     1728 +      _region_bm->par_set_bit((BitMap::idx_t) index);
     1729 +    } else {
     1730 +      // Starts humongous case: calculate how many regions are part of
     1731 +      // this humongous region and then set the bit range.
     1732 +      G1CollectedHeap* g1h = G1CollectedHeap::heap();
     1733 +      HeapRegion *last_hr = g1h->heap_region_containing_raw(hr->end() - 1);
     1734 +      size_t end_index = last_hr->hrs_index() + 1;
     1735 +      _region_bm->par_at_put_range((BitMap::idx_t) index,
     1736 +                                   (BitMap::idx_t) end_index, true);
     1737 +    }
     1738 +  }
     1739 +
     1740 + public:
     1741 +  FinalCountDataUpdateClosure(ConcurrentMark* cm,
     1742 +                              BitMap* region_bm,
     1743 +                              BitMap* card_bm) :
     1744 +    _cm(cm), _region_bm(region_bm), _card_bm(card_bm),
     1745 +    _total_words_done(0), _total_live_bytes(0), _total_used_bytes(0)
     1746 +  {
     1747 +    _bottom_card_num = cm->heap_bottom_card_num();
     1748 +  }
     1749 +
     1750 +  bool doHeapRegion(HeapRegion* hr) {
     1751 +
     1752 +    if (hr->continuesHumongous()) {
     1753 +      // We will ignore these here and process them when their
     1754 +      // associated "starts humongous" region is processed (see
     1755 +      // set_bit_for_region()). Note that we cannot rely on their
     1756 +      // associated "starts humongous" region to have their bit set to
     1757 +      // 1 since, due to the region chunking in the parallel region
     1758 +      // iteration, a "continues humongous" region might be visited
     1759 +      // before its associated "starts humongous".
     1760 +      return false;
     1761 +    }
     1762 +
     1763 +    HeapWord* start = hr->top_at_conc_mark_count();
     1764 +    HeapWord* ntams = hr->next_top_at_mark_start();
     1765 +    HeapWord* top   = hr->top();
     1766 +    
     1767 +    assert(hr->bottom() <= start && start <= hr->end() &&
     1768 +           hr->bottom() <= ntams && ntams <= hr->end(), "Preconditions.");
     1769 +    
     1770 +    size_t words_done = ntams - hr->bottom();
     1771 +
     1772 +    intptr_t start_card_num = intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
     1773 +    intptr_t last_card_num = intptr_t(uintptr_t(top) >> CardTableModRefBS::card_shift);
     1774 +
     1775 +
     1776 +    if (start < ntams) {
     1777 +      // Region was changed between remark and cleanup pauses
     1778 +      // We need to add (ntams - start) to the marked bytes
     1779 +      // for this region, and set bits for the range
     1780 +      // [ card_num(start), card_num(ntams) ) in the
     1781 +      // card bitmap.
     1782 +      size_t live_bytes = (ntams - start) * HeapWordSize;
     1783 +      hr->add_to_marked_bytes(live_bytes);
     1784 +      
     1785 +      // Record the new top at conc count
     1786 +      hr->set_top_at_conc_mark_count(ntams);
     1787 +
     1788 +      // The setting of the bits in the card bitmap takes place below
     1789 +    }
     1790 +
     1791 +    // Mark the allocated-since-marking portion...
     1792 +    if (ntams < top) {
     1793 +      // This definitely means the region has live objects.
     1794 +      set_bit_for_region(hr);
     1795 +    }
     1796 +
     1797 +    // Now set the bits for [start, top]
     1798 +    mark_card_num_range(start_card_num, last_card_num);
     1799 +
     1800 +    // Set the bit for the region if it contains live data
     1801 +    if (hr->next_marked_bytes() > 0) {
     1802 +      set_bit_for_region(hr);
     1803 +    }
     1804 +
     1805 +    _total_words_done += words_done;
     1806 +    _total_used_bytes += hr->used();
     1807 +    _total_live_bytes += hr->next_marked_bytes();
     1808 +
     1809 +    return false;
     1810 +  }
     1811 +
     1812 +  size_t total_words_done() const { return _total_words_done; }
     1813 +  size_t total_live_bytes() const { return _total_live_bytes; }
     1814 +  size_t total_used_bytes() const { return _total_used_bytes; }
     1815 +};
1501 1816  
1502 1817  class G1ParFinalCountTask: public AbstractGangTask {
1503 1818  protected:
1504 1819    G1CollectedHeap* _g1h;
1505      -  CMBitMap* _bm;
     1820 +  ConcurrentMark* _cm;
     1821 +  BitMap* _actual_region_bm;
     1822 +  BitMap* _actual_card_bm;
     1823 +  
1506 1824    size_t _n_workers;
     1825 +
1507 1826    size_t *_live_bytes;
1508 1827    size_t *_used_bytes;
1509      -  BitMap* _region_bm;
1510      -  BitMap* _card_bm;
     1828 +
1511 1829  public:
1512      -  G1ParFinalCountTask(G1CollectedHeap* g1h, CMBitMap* bm,
1513      -                      BitMap* region_bm, BitMap* card_bm)
1514      -    : AbstractGangTask("G1 final counting"), _g1h(g1h),
1515      -    _bm(bm), _region_bm(region_bm), _card_bm(card_bm),
1516      -    _n_workers(0)
     1830 +  G1ParFinalCountTask(G1CollectedHeap* g1h, BitMap* region_bm, BitMap* card_bm)
     1831 +    : AbstractGangTask("G1 final counting"),
     1832 +      _g1h(g1h), _cm(_g1h->concurrent_mark()),
     1833 +      _actual_region_bm(region_bm), _actual_card_bm(card_bm),
     1834 +      _n_workers(0)
1517 1835    {
1518 1836      // Use the value already set as the number of active threads
1519 1837      // in the call to run_task().  Needed for the allocation of
1520 1838      // _live_bytes and _used_bytes.
1521 1839      if (G1CollectedHeap::use_parallel_gc_threads()) {
1522 1840        assert( _g1h->workers()->active_workers() > 0,
1523 1841          "Should have been previously set");
1524 1842        _n_workers = _g1h->workers()->active_workers();
1525 1843      } else {
1526 1844        _n_workers = 1;
↓ open down ↓ 1 lines elided ↑ open up ↑
1528 1846  
1529 1847      _live_bytes = NEW_C_HEAP_ARRAY(size_t, _n_workers);
1530 1848      _used_bytes = NEW_C_HEAP_ARRAY(size_t, _n_workers);
1531 1849    }
1532 1850  
1533 1851    ~G1ParFinalCountTask() {
1534 1852      FREE_C_HEAP_ARRAY(size_t, _live_bytes);
1535 1853      FREE_C_HEAP_ARRAY(size_t, _used_bytes);
1536 1854    }
1537 1855  
1538      -  void work(int i) {
1539      -    CalcLiveObjectsClosure calccl(true /*final*/,
1540      -                                  _bm, _g1h->concurrent_mark(),
1541      -                                  _region_bm, _card_bm);
1542      -    calccl.no_yield();
     1856 +  void work(int worker_i) {
     1857 +    assert((size_t) worker_i < _n_workers, "invariant");
     1858 +
     1859 +    FinalCountDataUpdateClosure final_update_cl(_cm,
     1860 +                                                _actual_region_bm,
     1861 +                                                _actual_card_bm);
     1862 +
1543 1863      if (G1CollectedHeap::use_parallel_gc_threads()) {
1544      -      _g1h->heap_region_par_iterate_chunked(&calccl, i,
     1864 +      _g1h->heap_region_par_iterate_chunked(&final_update_cl,
     1865 +                                            worker_i,
1545 1866                                              (int) _n_workers,
1546 1867                                              HeapRegion::FinalCountClaimValue);
1547 1868      } else {
1548      -      _g1h->heap_region_iterate(&calccl);
     1869 +      _g1h->heap_region_iterate(&final_update_cl);
1549 1870      }
1550      -    assert(calccl.complete(), "Shouldn't have yielded!");
1551 1871  
1552      -    assert((size_t) i < _n_workers, "invariant");
1553      -    _live_bytes[i] = calccl.tot_live();
1554      -    _used_bytes[i] = calccl.tot_used();
     1872 +    _live_bytes[worker_i] = final_update_cl.total_live_bytes();
     1873 +    _used_bytes[worker_i] = final_update_cl.total_used_bytes();
1555 1874    }
     1875 +
1556 1876    size_t live_bytes()  {
1557 1877      size_t live_bytes = 0;
1558 1878      for (size_t i = 0; i < _n_workers; ++i)
1559 1879        live_bytes += _live_bytes[i];
1560 1880      return live_bytes;
1561 1881    }
     1882 +
1562 1883    size_t used_bytes()  {
1563 1884      size_t used_bytes = 0;
1564 1885      for (size_t i = 0; i < _n_workers; ++i)
1565 1886        used_bytes += _used_bytes[i];
1566 1887      return used_bytes;
1567 1888    }
1568 1889  };
1569 1890  
1570 1891  class G1ParNoteEndTask;
1571 1892  
↓ open down ↓ 187 lines elided ↑ open up ↑
1759 2080                       /* option      */ VerifyOption_G1UsePrevMarking);
1760 2081    }
1761 2082  
1762 2083    G1CollectorPolicy* g1p = G1CollectedHeap::heap()->g1_policy();
1763 2084    g1p->record_concurrent_mark_cleanup_start();
1764 2085  
1765 2086    double start = os::elapsedTime();
1766 2087  
1767 2088    HeapRegionRemSet::reset_for_cleanup_tasks();
1768 2089  
     2090 +  // Clear the global region bitmap - it will be filled as part
     2091 +  // of the final counting task.
     2092 +  _region_bm.clear();
     2093 +
1769 2094    size_t n_workers;
1770 2095  
1771 2096    // Do counting once more with the world stopped for good measure.
1772      -  G1ParFinalCountTask g1_par_count_task(g1h, nextMarkBitMap(),
1773      -                                        &_region_bm, &_card_bm);
     2097 +  G1ParFinalCountTask g1_par_count_task(g1h, &_region_bm, &_card_bm);
     2098 +
1774 2099    if (G1CollectedHeap::use_parallel_gc_threads()) {
1775      -    assert(g1h->check_heap_region_claim_values(
1776      -                                               HeapRegion::InitialClaimValue),
     2100 +   assert(g1h->check_heap_region_claim_values(HeapRegion::InitialClaimValue),
1777 2101             "sanity check");
1778 2102  
1779 2103      g1h->set_par_threads();
1780 2104      n_workers = g1h->n_par_threads();
1781 2105      assert(g1h->n_par_threads() == (int) n_workers,
1782 2106             "Should not have been reset");
1783 2107      g1h->workers()->run_task(&g1_par_count_task);
1784 2108      // Done with the parallel phase so reset to 0.
1785 2109      g1h->set_par_threads(0);
1786 2110  
1787      -    assert(g1h->check_heap_region_claim_values(
1788      -                                             HeapRegion::FinalCountClaimValue),
     2111 +    assert(g1h->check_heap_region_claim_values(HeapRegion::FinalCountClaimValue),
1789 2112             "sanity check");
1790 2113    } else {
1791 2114      n_workers = 1;
1792 2115      g1_par_count_task.work(0);
1793 2116    }
1794 2117  
     2118 +  if (VerifyDuringGC) {
     2119 +    // Verify that the counting data accumulated during marking matches
     2120 +    // that calculated by walking the marking bitmap.
     2121 +    
     2122 +    // Bitmaps to hold expected values
     2123 +    BitMap expected_region_bm(_region_bm.size(), false);
     2124 +    BitMap expected_card_bm(_card_bm.size(), false);
     2125 +
     2126 +    G1ParVerifyFinalCountTask g1_par_verify_task(g1h,
     2127 +                                                 &_region_bm,
     2128 +                                                 &_card_bm,
     2129 +                                                 &expected_region_bm,
     2130 +                                                 &expected_card_bm);
     2131 +    
     2132 +    if (G1CollectedHeap::use_parallel_gc_threads()) {
     2133 +      g1h->set_par_threads((int)n_workers);
     2134 +      g1h->workers()->run_task(&g1_par_verify_task);
     2135 +      // Done with the parallel phase so reset to 0.
     2136 +      g1h->set_par_threads(0);
     2137 +      
     2138 +      assert(g1h->check_heap_region_claim_values(HeapRegion::VerifyCountClaimValue),
     2139 +             "sanity check");
     2140 +    } else {
     2141 +      g1_par_verify_task.work(0);
     2142 +    }
     2143 +
     2144 +    guarantee(g1_par_verify_task.failures() == 0, "Unexpected accounting failures");
     2145 +  }
     2146 +
1795 2147    size_t known_garbage_bytes =
1796 2148      g1_par_count_task.used_bytes() - g1_par_count_task.live_bytes();
1797 2149    g1p->set_known_garbage_bytes(known_garbage_bytes);
1798 2150  
1799 2151    size_t start_used_bytes = g1h->used();
1800 2152    _at_least_one_mark_complete = true;
1801 2153    g1h->set_marking_complete();
1802 2154  
1803 2155    ergo_verbose4(ErgoConcCycles,
1804 2156             "finish cleanup",
↓ open down ↓ 172 lines elided ↑ open up ↑
1977 2329  
1978 2330  bool G1CMIsAliveClosure::do_object_b(oop obj) {
1979 2331    HeapWord* addr = (HeapWord*)obj;
1980 2332    return addr != NULL &&
1981 2333           (!_g1->is_in_g1_reserved(addr) || !_g1->is_obj_ill(obj));
1982 2334  }
1983 2335  
1984 2336  class G1CMKeepAliveClosure: public OopClosure {
1985 2337    G1CollectedHeap* _g1;
1986 2338    ConcurrentMark*  _cm;
1987      -  CMBitMap*        _bitMap;
1988 2339   public:
1989      -  G1CMKeepAliveClosure(G1CollectedHeap* g1, ConcurrentMark* cm,
1990      -                       CMBitMap* bitMap) :
1991      -    _g1(g1), _cm(cm),
1992      -    _bitMap(bitMap) {}
     2340 +  G1CMKeepAliveClosure(G1CollectedHeap* g1, ConcurrentMark* cm) :
     2341 +    _g1(g1), _cm(cm)
     2342 +  {
     2343 +    assert(Thread::current()->is_VM_thread(), "otherwise fix worker id");
     2344 +  }
1993 2345  
1994 2346    virtual void do_oop(narrowOop* p) { do_oop_work(p); }
1995 2347    virtual void do_oop(      oop* p) { do_oop_work(p); }
1996 2348  
1997 2349    template <class T> void do_oop_work(T* p) {
1998 2350      oop obj = oopDesc::load_decode_heap_oop(p);
1999 2351      HeapWord* addr = (HeapWord*)obj;
2000 2352  
2001 2353      if (_cm->verbose_high()) {
2002 2354        gclog_or_tty->print_cr("\t[0] we're looking at location "
2003 2355                               "*"PTR_FORMAT" = "PTR_FORMAT,
2004 2356                               p, (void*) obj);
2005 2357      }
2006 2358  
2007 2359      if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(obj)) {
2008      -      _bitMap->mark(addr);
     2360 +      _cm->mark_and_count(obj);
2009 2361        _cm->mark_stack_push(obj);
2010 2362      }
2011 2363    }
2012 2364  };
2013 2365  
2014 2366  class G1CMDrainMarkingStackClosure: public VoidClosure {
     2367 +  ConcurrentMark*               _cm;
2015 2368    CMMarkStack*                  _markStack;
2016      -  CMBitMap*                     _bitMap;
2017 2369    G1CMKeepAliveClosure*         _oopClosure;
2018 2370   public:
2019      -  G1CMDrainMarkingStackClosure(CMBitMap* bitMap, CMMarkStack* markStack,
     2371 +  G1CMDrainMarkingStackClosure(ConcurrentMark* cm, CMMarkStack* markStack,
2020 2372                                 G1CMKeepAliveClosure* oopClosure) :
2021      -    _bitMap(bitMap),
     2373 +    _cm(cm),
2022 2374      _markStack(markStack),
2023 2375      _oopClosure(oopClosure)
2024 2376    {}
2025 2377  
2026 2378    void do_void() {
2027      -    _markStack->drain((OopClosure*)_oopClosure, _bitMap, false);
     2379 +    _markStack->drain((OopClosure*)_oopClosure, _cm->nextMarkBitMap(), false);
2028 2380    }
2029 2381  };
2030 2382  
2031 2383  // 'Keep Alive' closure used by parallel reference processing.
2032 2384  // An instance of this closure is used in the parallel reference processing
2033 2385  // code rather than an instance of G1CMKeepAliveClosure. We could have used
2034 2386  // the G1CMKeepAliveClosure as it is MT-safe. Also reference objects are
2035 2387  // placed on to discovered ref lists once so we can mark and push with no
2036 2388  // need to check whether the object has already been marked. Using the
2037 2389  // G1CMKeepAliveClosure would mean, however, having all the worker threads
↓ open down ↓ 198 lines elided ↑ open up ↑
2236 2588  
2237 2589      ReferenceProcessor* rp = g1h->ref_processor_cm();
2238 2590  
2239 2591      // See the comment in G1CollectedHeap::ref_processing_init()
2240 2592      // about how reference processing currently works in G1.
2241 2593  
2242 2594      // Process weak references.
2243 2595      rp->setup_policy(clear_all_soft_refs);
2244 2596      assert(_markStack.isEmpty(), "mark stack should be empty");
2245 2597  
2246      -    G1CMKeepAliveClosure g1_keep_alive(g1h, this, nextMarkBitMap());
     2598 +    G1CMKeepAliveClosure g1_keep_alive(g1h, this);
2247 2599      G1CMDrainMarkingStackClosure
2248      -      g1_drain_mark_stack(nextMarkBitMap(), &_markStack, &g1_keep_alive);
     2600 +      g1_drain_mark_stack(this, &_markStack, &g1_keep_alive);
2249 2601  
2250 2602      // We use the work gang from the G1CollectedHeap and we utilize all
2251 2603      // the worker threads.
2252 2604      int active_workers = g1h->workers() ? g1h->workers()->active_workers() : 1;
2253 2605      active_workers = MAX2(MIN2(active_workers, (int)_max_task_num), 1);
2254 2606  
2255 2607      G1CMRefProcTaskExecutor par_task_executor(g1h, this,
2256 2608                                                g1h->workers(), active_workers);
2257 2609  
2258 2610      if (rp->processing_is_mt()) {
↓ open down ↓ 357 lines elided ↑ open up ↑
2616 2968  // evacuation pause (since now tasks are not active and can be claimed
2617 2969  // during an evacuation pause). This was a late change to the code and
2618 2970  // is currently not being taken advantage of.
2619 2971  
2620 2972  class CMGlobalObjectClosure : public ObjectClosure {
2621 2973  private:
2622 2974    ConcurrentMark* _cm;
2623 2975  
2624 2976  public:
2625 2977    void do_object(oop obj) {
2626      -    _cm->deal_with_reference(obj);
     2978 +    _cm->deal_with_reference(obj, 0);
2627 2979    }
2628 2980  
2629 2981    CMGlobalObjectClosure(ConcurrentMark* cm) : _cm(cm) { }
2630 2982  };
2631 2983  
2632      -void ConcurrentMark::deal_with_reference(oop obj) {
     2984 +void ConcurrentMark::deal_with_reference(oop obj, int worker_i) {
2633 2985    if (verbose_high()) {
2634 2986      gclog_or_tty->print_cr("[global] we're dealing with reference "PTR_FORMAT,
2635 2987                             (void*) obj);
2636 2988    }
2637 2989  
2638 2990    HeapWord* objAddr = (HeapWord*) obj;
2639 2991    assert(obj->is_oop_or_null(true /* ignore mark word */), "Error");
2640 2992    if (_g1h->is_in_g1_reserved(objAddr)) {
2641 2993      assert(obj != NULL, "null check is implicit");
2642 2994      if (!_nextMarkBitMap->isMarked(objAddr)) {
↓ open down ↓ 1 lines elided ↑ open up ↑
2644 2996        // bitmap (otherwise, it's a waste of time since we won't do
2645 2997        // anything with it).
2646 2998        HeapRegion* hr = _g1h->heap_region_containing_raw(obj);
2647 2999        if (!hr->obj_allocated_since_next_marking(obj)) {
2648 3000          if (verbose_high()) {
2649 3001            gclog_or_tty->print_cr("[global] "PTR_FORMAT" is not considered "
2650 3002                                   "marked", (void*) obj);
2651 3003          }
2652 3004  
2653 3005          // we need to mark it first
2654      -        if (_nextMarkBitMap->parMark(objAddr)) {
     3006 +        if (par_mark_and_count(obj, hr, worker_i)) {
2655 3007            // No OrderAccess:store_load() is needed. It is implicit in the
2656      -          // CAS done in parMark(objAddr) above
     3008 +          // CAS done in the call to CMBitMap::parMark() in the above
     3009 +          // routine.
2657 3010            HeapWord* finger = _finger;
2658 3011            if (objAddr < finger) {
2659 3012              if (verbose_high()) {
2660 3013                gclog_or_tty->print_cr("[global] below the global finger "
2661 3014                                       "("PTR_FORMAT"), pushing it", finger);
2662 3015              }
2663 3016              if (!mark_stack_push(obj)) {
2664 3017                if (verbose_low()) {
2665 3018                  gclog_or_tty->print_cr("[global] global stack overflow during "
2666 3019                                         "deal_with_reference");
↓ open down ↓ 24 lines elided ↑ open up ↑
2691 3044    satb_mq_set.set_closure(NULL);
2692 3045    assert(satb_mq_set.completed_buffers_num() == 0, "invariant");
2693 3046  }
2694 3047  
2695 3048  void ConcurrentMark::markPrev(oop p) {
2696 3049    // Note we are overriding the read-only view of the prev map here, via
2697 3050    // the cast.
2698 3051    ((CMBitMap*)_prevMarkBitMap)->mark((HeapWord*)p);
2699 3052  }
2700 3053  
2701      -void ConcurrentMark::clear(oop p) {
     3054 +void ConcurrentMark::clear_mark(oop p) {
2702 3055    assert(p != NULL && p->is_oop(), "expected an oop");
2703 3056    HeapWord* addr = (HeapWord*)p;
2704 3057    assert(addr >= _nextMarkBitMap->startWord() ||
2705 3058           addr < _nextMarkBitMap->endWord(), "in a region");
2706 3059  
2707 3060    _nextMarkBitMap->clear(addr);
2708 3061  }
2709 3062  
2710 3063  void ConcurrentMark::clearRangeBothMaps(MemRegion mr) {
2711 3064    // Note we are overriding the read-only view of the prev map here, via
↓ open down ↓ 179 lines elided ↑ open up ↑
2891 3244    _finger = _heap_start;
2892 3245  
2893 3246    for (int i = 0; i < (int)_max_task_num; ++i) {
2894 3247      OopTaskQueue* queue = _task_queues->queue(i);
2895 3248      queue->set_empty();
2896 3249      // Clear any partial regions from the CMTasks
2897 3250      _tasks[i]->clear_aborted_region();
2898 3251    }
2899 3252  }
2900 3253  
     3254 +// Aggregate the counting data that was constructed concurrently
     3255 +// with marking.
     3256 +class AggregateCountDataHRClosure: public HeapRegionClosure {
     3257 +  ConcurrentMark* _cm;
     3258 +  BitMap* _cm_card_bm;
     3259 +  intptr_t _bottom_card_num;
     3260 +  size_t _max_task_num;
     3261 +
     3262 + public:
     3263 +  AggregateCountDataHRClosure(ConcurrentMark *cm,
     3264 +                          BitMap* cm_card_bm,
     3265 +                          intptr_t bottom_card_num,
     3266 +                          size_t max_task_num) :
     3267 +    _cm(cm),
     3268 +    _cm_card_bm(cm_card_bm),
     3269 +    _bottom_card_num(bottom_card_num),
     3270 +    _max_task_num(max_task_num)
     3271 +  { }
     3272 +
     3273 +  bool is_card_aligned(HeapWord* p) {
     3274 +    return ((uintptr_t(p) & (CardTableModRefBS::card_size - 1)) == 0);
     3275 +  }
     3276 +
     3277 +  bool doHeapRegion(HeapRegion* hr) {
     3278 +    if (hr->continuesHumongous()) {
     3279 +      // We will ignore these here and process them when their
     3280 +      // associated "starts humongous" region is processed.
     3281 +      // Note that we cannot rely on their associated
     3282 +      // "starts humongous" region to have their bit set to 1
     3283 +      // since, due to the region chunking in the parallel region
     3284 +      // iteration, a "continues humongous" region might be visited
     3285 +      // before its associated "starts humongous".
     3286 +      return false;
     3287 +    }
     3288 +
     3289 +    HeapWord* start = hr->bottom();
     3290 +    HeapWord* limit = hr->next_top_at_mark_start();
     3291 +    HeapWord* end = hr->end();
     3292 +    
     3293 +    assert(start <= limit && limit <= hr->top() && 
     3294 +           hr->top() <= hr->end(), "Preconditions");
     3295 +
     3296 +    assert(hr->next_marked_bytes() == 0, "Precondition");
     3297 +
     3298 +    if (start == limit) {
     3299 +      // NTAMS of this region has not been set so nothing to do.
     3300 +      return false;
     3301 +    }
     3302 +
     3303 +    intptr_t start_card_num = intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
     3304 +    intptr_t limit_card_num = intptr_t(uintptr_t(limit) >> CardTableModRefBS::card_shift);
     3305 +    intptr_t end_card_num   = intptr_t(uintptr_t(end) >> CardTableModRefBS::card_shift);
     3306 +
     3307 +    assert(is_card_aligned(start), "sanity");
     3308 +    assert(is_card_aligned(end), "sanity");
     3309 +   
     3310 +    // If ntams is not card aligned then we bump the index for
     3311 +    // limit so that we get the card spanning ntams.
     3312 +    if (!is_card_aligned(limit)) {
     3313 +      limit_card_num += 1;
     3314 +    }
     3315 +
     3316 +    assert(limit_card_num <= end_card_num, "or else use atomics");
     3317 +
     3318 +    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
     3319 +    BitMap::idx_t limit_idx = limit_card_num - _bottom_card_num;
     3320 +
     3321 +    // Aggregate the "stripe" in the count data associated with hr.
     3322 +    size_t hrs_index = hr->hrs_index();
     3323 +    size_t marked_bytes = 0;
     3324 +
     3325 +    for (int i = 0; (size_t)i < _max_task_num; i += 1) {
     3326 +      size_t* marked_bytes_array = _cm->count_marked_bytes_array_for(i);
     3327 +      BitMap* task_card_bm = _cm->count_card_bitmap_for(i);
     3328 +  
     3329 +      // Fetch the marked_bytes in this region for task i and
     3330 +      // add it to the running total for this region.
     3331 +      marked_bytes += marked_bytes_array[hrs_index];
     3332 +
     3333 +      // Now clear the value in the task's marked bytes array
     3334 +      // for this region.
     3335 +      marked_bytes_array[hrs_index] = 0;
     3336 +  
     3337 +      // Now union the bitmaps[0,max_task_num)[start_idx..limit_idx)
     3338 +      // into the global card bitmap.
     3339 +      BitMap::idx_t scan_idx = task_card_bm->get_next_one_offset(start_idx, limit_idx);
     3340 +
     3341 +      while (scan_idx < limit_idx) {
     3342 +        assert(task_card_bm->at(scan_idx) == true, "should be");
     3343 +        _cm_card_bm->set_bit(scan_idx);
     3344 +        task_card_bm->clear_bit(scan_idx);
     3345 +        assert(_cm_card_bm->at(scan_idx) == true, "should be");
     3346 +        scan_idx = task_card_bm->get_next_one_offset(start_idx + 1, limit_idx);
     3347 +      }
     3348 +    }
     3349 +
     3350 +    // Update the marked bytes for this region.
     3351 +    hr->add_to_marked_bytes(marked_bytes);
     3352 +  
     3353 +    // Now set the top at count to NTAMS.
     3354 +    hr->set_top_at_conc_mark_count(limit);
     3355 +
     3356 +    // Next heap region
     3357 +    return false;
     3358 +  }
     3359 +};
     3360 +// Gang task that applies AggregateCountDataHRClosure to the heap regions.
     3361 +class G1AggregateCountDataTask: public AbstractGangTask {
     3362 +protected:
     3363 +  G1CollectedHeap* _g1h;            // heap whose regions are iterated in work()
     3364 +  ConcurrentMark* _cm;              // forwarded to the per-region closure
     3365 +  BitMap* _cm_card_bm;              // card bitmap forwarded to the closure
     3366 +  intptr_t _heap_bottom_card_num;   // bottom card number forwarded to the closure
     3367 +  size_t _max_task_num;             // task count forwarded to the closure
     3368 +  int _active_workers;              // worker count given to the chunked iteration
     3369 +
     3370 +public:
     3371 +  G1AggregateCountDataTask(G1CollectedHeap* g1h,
     3372 +                           ConcurrentMark* cm,
     3373 +                           BitMap* cm_card_bm,
     3374 +                           intptr_t bottom_card_num,
     3375 +                           size_t max_task_num,
     3376 +                           int n_workers) :
     3377 +    AbstractGangTask("Count Aggregation"),
     3378 +    _g1h(g1h), _cm(cm), _cm_card_bm(cm_card_bm),
     3379 +    _heap_bottom_card_num(bottom_card_num),
     3380 +    _max_task_num(max_task_num),
     3381 +    _active_workers(n_workers)
     3382 +  { }
     3383 +  // Entry point for gang worker worker_i (also called directly with 0 when serial).
     3384 +  void work(int worker_i) {
     3385 +    AggregateCountDataHRClosure cl(_cm, _cm_card_bm,
     3386 +                                 _heap_bottom_card_num, _max_task_num);
     3387 +    // Parallel: chunked iteration using AggregateCountClaimValue; serial: plain iteration.
     3388 +    if (G1CollectedHeap::use_parallel_gc_threads()) {
     3389 +      _g1h->heap_region_par_iterate_chunked(&cl, worker_i,
     3390 +                                            _active_workers,
     3391 +                                            HeapRegion::AggregateCountClaimValue);
     3392 +    } else {
     3393 +      _g1h->heap_region_iterate(&cl);
     3394 +    }
     3395 +  }
     3396 +};
     3397 +
     3398 +// Runs the "Count Aggregation" task, on the work gang when parallel GC threads are in use.
     3399 +void ConcurrentMark::aggregate_and_clear_count_data() {
     3400 +  // Clear the global card bitmap
     3401 +  _card_bm.clear();
     3402 +  // Worker count: the gang's active workers in parallel mode, otherwise 1.
     3403 +  int n_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
     3404 +                        _g1h->workers()->active_workers() :
     3405 +                        1);
     3406 +  // The task only captures its arguments here; the per-region work happens in work().
     3407 +  G1AggregateCountDataTask g1_par_agg_task(_g1h, this, &_card_bm,
     3408 +                                           _heap_bottom_card_num, _max_task_num,
     3409 +                                           n_workers);
     3410 +
     3411 +  if (G1CollectedHeap::use_parallel_gc_threads()) {
     3412 +    assert(_g1h->check_heap_region_claim_values(HeapRegion::InitialClaimValue),
     3413 +           "sanity check");
     3414 +    _g1h->set_par_threads(n_workers);
     3415 +    _g1h->workers()->run_task(&g1_par_agg_task);
     3416 +    _g1h->set_par_threads(0);
     3417 +    // After the task, every region is expected to carry AggregateCountClaimValue.
     3418 +    assert(_g1h->check_heap_region_claim_values(HeapRegion::AggregateCountClaimValue),
     3419 +           "sanity check");
     3420 +    _g1h->reset_heap_region_claim_values();  // presumably back to InitialClaimValue -- confirm
     3421 +  } else {
     3422 +    g1_par_agg_task.work(0);  // serial: run on the calling thread as worker 0
     3423 +  }
     3424 +}
     3425 +// Resets all per-task count data: zeroes every task's per-region
     3426 +// marked-bytes array and clears every task's card bitmap.
     3427 +void ConcurrentMark::clear_all_count_data() {
     3428 +  assert(SafepointSynchronize::is_at_safepoint() ||
     3429 +         !Universe::is_fully_initialized(), "must be");
     3430 +
     3431 +  size_t max_regions = _g1h->max_regions();
     3432 +
     3433 +  assert(_max_task_num != 0, "uninitialized");
     3434 +  assert(_count_card_bitmaps != NULL, "uninitialized");
     3435 +  assert(_count_marked_bytes != NULL, "uninitialized");
     3436 +
     3437 +  for (int i = 0; (size_t) i < _max_task_num; i += 1) {
     3438 +    BitMap* task_card_bm = count_card_bitmap_for(i);
     3439 +    size_t* marked_bytes_array = count_marked_bytes_array_for(i);
     3440 +
     3441 +    assert(task_card_bm->size() == _card_bm.size(), "size mismatch");
     3442 +    assert(marked_bytes_array != NULL, "uninitialized");
     3443 +    // One slot per region; index with size_t to match max_regions.
     3444 +    for (size_t j = 0; j < max_regions; j++) {
     3445 +      marked_bytes_array[j] = 0;
     3446 +    }
     3447 +    task_card_bm->clear();
     3448 +  }
     3449 +}
     3450 +
     3451 +void ConcurrentMark::clear_count_data_for_heap_region(HeapRegion* hr) {
     3452 +  // Clears the count data for the given region from _all_ of
     3453 +  // the per-task counting data structures.
     3454 +
     3455 +  MemRegion used_region = hr->used_region();
     3456 +  HeapWord* start = used_region.start();
     3457 +  HeapWord* last = used_region.last();
     3458 +  size_t hr_index = hr->hrs_index();
     3459 +
     3460 +  intptr_t start_card_num =
     3461 +    intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
     3462 +  intptr_t last_card_num =
     3463 +    intptr_t(uintptr_t(last) >> CardTableModRefBS::card_shift);
     3464 +
     3465 +  BitMap::idx_t start_idx = start_card_num - heap_bottom_card_num();
     3466 +  BitMap::idx_t last_idx = last_card_num - heap_bottom_card_num();
     3467 +
     3468 +  // An empty used region spans no cards. Its last() presumably lies
     3469 +  // below start(), so last_idx must not be used as a card of this region.
     3470 +  bool has_cards = used_region.byte_size() > 0;
     3471 +
     3472 +  for (int i = 0; (size_t)i < _max_task_num; i += 1) {
     3473 +    BitMap* task_card_bm = count_card_bitmap_for(i);
     3474 +    size_t* marked_bytes_array = count_marked_bytes_array_for(i);
     3475 +
     3476 +    // Zero this task's marked-bytes count for this region.
     3477 +    marked_bytes_array[hr_index] = 0;
     3478 +
     3479 +    // Clear the inclusive range [start_idx, last_idx] from the
     3480 +    // card bitmap. The clear_range routine is exclusive so we
     3481 +    // need to also explicitly clear the bit at last_idx.
     3482 +    // Passing last_idx+1 to the clear_range would work in
     3483 +    // most cases but could trip an OOB assertion.
     3484 +    if (has_cards) {
     3485 +      if (last_idx > start_idx) {
     3486 +        task_card_bm->clear_range(start_idx, last_idx);
     3487 +      }
     3488 +      task_card_bm->clear_bit(last_idx);
     3489 +    }
     3490 +  }
     3491 +}
     3492 +// When verbose_stats() is on, prints each active task's statistics between separator rules.
2901 3493  void ConcurrentMark::print_stats() {
2902 3494    if (verbose_stats()) {
2903 3495      gclog_or_tty->print_cr("---------------------------------------------------------------------");
2904 3496      for (size_t i = 0; i < _active_tasks; ++i) {
2905 3497        _tasks[i]->print_stats();  // only the first _active_tasks entries of _tasks are visited
2906 3498        gclog_or_tty->print_cr("---------------------------------------------------------------------");
2907 3499      }
2908 3500    }
2909 3501  }
2910 3502  
2911 3503  // Closures used by ConcurrentMark::complete_marking_in_collection_set().
2912 3504  
2913 3505  class CSetMarkOopClosure: public OopClosure {
2914 3506    friend class CSetMarkBitMapClosure;
2915 3507  
2916 3508    G1CollectedHeap* _g1h;
2917      -  CMBitMap*        _bm;
2918 3509    ConcurrentMark*  _cm;
2919 3510    oop*             _ms;
2920 3511    jint*            _array_ind_stack;
2921 3512    int              _ms_size;
2922 3513    int              _ms_ind;
2923 3514    int              _array_increment;
2924 3515    int              _worker_i;
2925 3516  
2926 3517    bool push(oop obj, int arr_ind = 0) {
2927 3518      if (_ms_ind == _ms_size) {
↓ open down ↓ 39 lines elided ↑ open up ↑
2967 3558        }
2968 3559        if (abort()) return false;
2969 3560      }
2970 3561      return true;
2971 3562    }
2972 3563  
2973 3564  public:
2974 3565    CSetMarkOopClosure(ConcurrentMark* cm, int ms_size, int worker_i) :
2975 3566      _g1h(G1CollectedHeap::heap()),
2976 3567      _cm(cm),
2977      -    _bm(cm->nextMarkBitMap()),
2978 3568      _ms_size(ms_size), _ms_ind(0),
2979 3569      _ms(NEW_C_HEAP_ARRAY(oop, ms_size)),
2980 3570      _array_ind_stack(NEW_C_HEAP_ARRAY(jint, ms_size)),
2981 3571      _array_increment(MAX2(ms_size/8, 16)),
2982 3572      _worker_i(worker_i) { }
2983 3573  
2984 3574    ~CSetMarkOopClosure() {
2985 3575      FREE_C_HEAP_ARRAY(oop, _ms);
2986 3576      FREE_C_HEAP_ARRAY(jint, _array_ind_stack);
2987 3577    }
↓ open down ↓ 9 lines elided ↑ open up ↑
2997 3587        // If the object has already been forwarded, we have to make sure
2998 3588        // that it's marked.  So follow the forwarding pointer.  Note that
2999 3589        // this does the right thing for self-forwarding pointers in the
3000 3590        // evacuation failure case.
3001 3591        obj = obj->forwardee();
3002 3592      }
3003 3593      HeapRegion* hr = _g1h->heap_region_containing(obj);
3004 3594      if (hr != NULL) {
3005 3595        if (hr->in_collection_set()) {
3006 3596          if (_g1h->is_obj_ill(obj)) {
3007      -          if (_bm->parMark((HeapWord*)obj)) {
     3597 +          if (_cm->par_mark_and_count(obj, hr, _worker_i)) {
3008 3598              if (!push(obj)) {
3009 3599                gclog_or_tty->print_cr("Setting abort in CSetMarkOopClosure because push failed.");
3010 3600                set_abort();
3011 3601              }
3012 3602            }
3013 3603          }
3014 3604        } else {
3015 3605          // Outside the collection set; we need to gray it
3016      -        _cm->deal_with_reference(obj);
     3606 +        _cm->deal_with_reference(obj, _worker_i);
3017 3607        }
3018 3608      }
3019 3609    }
3020 3610  };
3021 3611  
3022 3612  class CSetMarkBitMapClosure: public BitMapClosure {
3023 3613    G1CollectedHeap*   _g1h;
3024 3614    CMBitMap*          _bitMap;
3025 3615    ConcurrentMark*    _cm;
3026 3616    CSetMarkOopClosure _oop_cl;
↓ open down ↓ 255 lines elided ↑ open up ↑
3282 3872      gclog_or_tty->print_cr("    RS scrub total time = %8.2f s (avg = %8.2f ms).",
3283 3873                             _total_rs_scrub_time,
3284 3874                             (_cleanup_times.num() > 0 ? _total_rs_scrub_time * 1000.0 /
3285 3875                              (double)_cleanup_times.num()
3286 3876                             : 0.0));
3287 3877    }
3288 3878    gclog_or_tty->print_cr("  Total stop_world time = %8.2f s.",
3289 3879                           (_init_times.sum() + _remark_times.sum() +
3290 3880                            _cleanup_times.sum())/1000.0);
3291 3881    gclog_or_tty->print_cr("  Total concurrent time = %8.2f s "
3292      -                "(%8.2f s marking, %8.2f s counting).",
     3882 +                "(%8.2f s marking).",
3293 3883                  cmThread()->vtime_accum(),
3294      -                cmThread()->vtime_mark_accum(),
3295      -                cmThread()->vtime_count_accum());
     3884 +                cmThread()->vtime_mark_accum());
3296 3885  }
3297 3886  // Prints the marking worker threads on st. _parallel_workers starts out NULL (see constructor), hence the guard.
3298 3887  void ConcurrentMark::print_worker_threads_on(outputStream* st) const {
3299 3888    if (_parallel_workers != NULL) { _parallel_workers->print_worker_threads_on(st); }
3300 3889  }
3301 3890  
3302 3891  // Closures
3303 3892  // XXX: there seems to be a lot of code  duplication here;
3304 3893  // should refactor and consolidate the shared code.
3305 3894  
↓ open down ↓ 1274 lines elided ↑ open up ↑
4580 5169                               "elapsed = %1.2lfms <<<<<<<<<<",
4581 5170                               _task_id, _time_target_ms, elapsed_time_ms);
4582 5171      }
4583 5172    }
4584 5173  
4585 5174    _claimed = false;
4586 5175  }
4587 5176  // Builds a marking task; the new marked_bytes and card_bm arguments are stored as the task's count data.
4588 5177  CMTask::CMTask(int task_id,
4589 5178                 ConcurrentMark* cm,
     5179 +               size_t* marked_bytes,  // stored in _marked_bytes_array
     5180 +               BitMap* card_bm,       // stored in _card_bm
4590 5181                 CMTaskQueue* task_queue,
4591 5182                 CMTaskQueueSet* task_queues)
4592 5183    : _g1h(G1CollectedHeap::heap()),
4593 5184      _task_id(task_id), _cm(cm),
4594 5185      _claimed(false),
4595 5186      _nextMarkBitMap(NULL), _hash_seed(17),
4596 5187      _task_queue(task_queue),
4597 5188      _task_queues(task_queues),
4598 5189      _cm_oop_closure(NULL),
4599      -    _aborted_region(MemRegion()) {
     5190 +    _aborted_region(MemRegion()),
     5191 +    _marked_bytes_array(marked_bytes),
     5192 +    _card_bm(card_bm) {
4600 5193    guarantee(task_queue != NULL, "invariant");
4601 5194    guarantee(task_queues != NULL, "invariant");
4602 5195  // Zero the clock counters; statsOnly presumably drops this in non-stats builds -- confirm.
4603 5196    statsOnly( _clock_due_to_scanning = 0;
4604 5197               _clock_due_to_marking  = 0 );
4605 5198  // Seed the step-diff sequence with an initial sample of 0.5 (presumably milliseconds).
4606 5199    _marking_step_diffs_ms.add(0.5);
4607 5200  }
4608 5201  
4609 5202  // These are formatting macros that are used below to ensure
↓ open down ↓ 179 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX